Linux Audio

Check our new training course

Loading...
v3.1
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136
 137#include "net-sysfs.h"
 138
 139/* Instead of increasing this, you should create a hash table. */
 140#define MAX_GRO_SKBS 8
 141
 142/* This should be increased if a protocol with a bigger head is added. */
 143#define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145/*
 146 *	The list of packet types we will receive (as opposed to discard)
 147 *	and the routines to invoke.
 148 *
 149 *	Why 16. Because with 16 the only overlap we get on a hash of the
 150 *	low nibble of the protocol value is RARP/SNAP/X.25.
 151 *
 152 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153 *             sure which should go first, but I bet it won't make much
 154 *             difference if we are running VLANs.  The good news is that
 155 *             this protocol won't be in the list unless compiled in, so
 156 *             the average user (w/out VLANs) will not be adversely affected.
 157 *             --BLG
 158 *
 159 *		0800	IP
 160 *		8100    802.1Q VLAN
 161 *		0001	802.3
 162 *		0002	AX.25
 163 *		0004	802.2
 164 *		8035	RARP
 165 *		0005	SNAP
 166 *		0805	X.25
 167 *		0806	ARP
 168 *		8137	IPX
 169 *		0009	Localtalk
 170 *		86DD	IPv6
 171 */
 172
/* Number of buckets in ptype_base; must stay a power of two for the mask. */
#define PTYPE_HASH_SIZE	(16)
/* Applied to the host-order protocol value to select a bucket. */
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

/* Taken around additions to and removals from the ptype lists below. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for one specific protocol, bucketed by protocol value. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;	/* Taps */
 179
 180/*
 181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182 * semaphore.
 183 *
 184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185 *
 186 * Writers must hold the rtnl semaphore while they loop through the
 187 * dev_base_head list, and hold dev_base_lock for writing when they do the
 188 * actual updates.  This allows pure readers to access the list even
 189 * while a writer is preparing to update it.
 190 *
 191 * To put it another way, dev_base_lock is held for writing only to
 192 * protect against pure readers; the rtnl semaphore provides the
 193 * protection against other writers.
 194 *
 195 * See, for example usages, register_netdevice() and
 196 * unregister_netdevice(), which must be called with the rtnl
 197 * semaphore held.
 198 */
 199DEFINE_RWLOCK(dev_base_lock);
 200EXPORT_SYMBOL(dev_base_lock);
 201
 202static inline void dev_base_seq_inc(struct net *net)
 203{
 204	while (++net->dev_base_seq == 0);
 205}
 206
 207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208{
 209	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 210	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 211}
 212
 213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214{
 215	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 216}
 217
 218static inline void rps_lock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221	spin_lock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225static inline void rps_unlock(struct softnet_data *sd)
 226{
 227#ifdef CONFIG_RPS
 228	spin_unlock(&sd->input_pkt_queue.lock);
 229#endif
 230}
 231
 232/* Device list insertion */
 233static int list_netdevice(struct net_device *dev)
 234{
 235	struct net *net = dev_net(dev);
 236
 237	ASSERT_RTNL();
 238
 239	write_lock_bh(&dev_base_lock);
 240	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 241	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 242	hlist_add_head_rcu(&dev->index_hlist,
 243			   dev_index_hash(net, dev->ifindex));
 244	write_unlock_bh(&dev_base_lock);
 245
 246	dev_base_seq_inc(net);
 247
 248	return 0;
 249}
 250
 251/* Device list removal
 252 * caller must respect a RCU grace period before freeing/reusing dev
 253 */
 254static void unlist_netdevice(struct net_device *dev)
 255{
 256	ASSERT_RTNL();
 257
 258	/* Unlink dev from the device chain */
 259	write_lock_bh(&dev_base_lock);
 260	list_del_rcu(&dev->dev_list);
 261	hlist_del_rcu(&dev->name_hlist);
 262	hlist_del_rcu(&dev->index_hlist);
 263	write_unlock_bh(&dev_base_lock);
 264
 265	dev_base_seq_inc(dev_net(dev));
 266}
 267
 268/*
 269 *	Our notifier list
 270 */
 271
 272static RAW_NOTIFIER_HEAD(netdev_chain);
 273
 274/*
 275 *	Device drivers call our routines to queue packets here. We empty the
 276 *	queue in the local softnet handler.
 277 */
 278
 279DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 280EXPORT_PER_CPU_SYMBOL(softnet_data);
 281
 282#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are indexed together: the
 * position of a device type in netdev_lock_type[] (see netdev_lock_pos())
 * is also the index of its name in netdev_lock_name[] and of its keys in
 * the two lock_class_key arrays below.  Keep the tables in the same order
 * and of the same length when adding a type.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

/* Lock names, in the same order as netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

/* One key per entry of netdev_lock_type[], for the xmit and addr-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 325
 326static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 327{
 328	int i;
 329
 330	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 331		if (netdev_lock_type[i] == dev_type)
 332			return i;
 333	/* the last key is used by default */
 334	return ARRAY_SIZE(netdev_lock_type) - 1;
 335}
 336
 337static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338						 unsigned short dev_type)
 339{
 340	int i;
 341
 342	i = netdev_lock_pos(dev_type);
 343	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 344				   netdev_lock_name[i]);
 345}
 346
 347static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 348{
 349	int i;
 350
 351	i = netdev_lock_pos(dev->type);
 352	lockdep_set_class_and_name(&dev->addr_list_lock,
 353				   &netdev_addr_lock_key[i],
 354				   netdev_lock_name[i]);
 355}
 356#else
/* CONFIG_LOCKDEP is off: the lockdep class setters are empty stubs. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 364#endif
 365
 366/*******************************************************************************
 367
 368		Protocol management and registration routines
 369
 370*******************************************************************************/
 371
 372/*
 373 *	Add a protocol ID to the list. Now that the input handler is
 374 *	smarter we can dispense with all the messy stuff that used to be
 375 *	here.
 376 *
 377 *	BEWARE!!! Protocol handlers, mangling input packets,
 378 *	MUST BE last in hash buckets and checking protocol handlers
 379 *	MUST start from promiscuous ptype_all chain in net_bh.
 380 *	It is true now, do not change it.
 381 *	Explanation follows: if protocol handler, mangling packet, will
 382 *	be the first on list, it is not able to sense, that packet
 383 *	is cloned and should be copied-on-write, so that it will
 384 *	change it and subsequent readers will get broken packet.
 385 *							--ANK (980803)
 386 */
 387
 388static inline struct list_head *ptype_head(const struct packet_type *pt)
 389{
 390	if (pt->type == htons(ETH_P_ALL))
 391		return &ptype_all;
 392	else
 393		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 394}
 395
 396/**
 397 *	dev_add_pack - add packet handler
 398 *	@pt: packet type declaration
 399 *
 400 *	Add a protocol handler to the networking stack. The passed &packet_type
 401 *	is linked into kernel lists and may not be freed until it has been
 402 *	removed from the kernel lists.
 403 *
 404 *	This call does not sleep therefore it can not
 405 *	guarantee all CPU's that are in middle of receiving packets
 406 *	will see the new packet type (until the next received packet).
 407 */
 408
 409void dev_add_pack(struct packet_type *pt)
 410{
 411	struct list_head *head = ptype_head(pt);
 412
 413	spin_lock(&ptype_lock);
 414	list_add_rcu(&pt->list, head);
 415	spin_unlock(&ptype_lock);
 416}
 417EXPORT_SYMBOL(dev_add_pack);
 418
 419/**
 420 *	__dev_remove_pack	 - remove packet handler
 421 *	@pt: packet type declaration
 422 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists, but this function does not wait for readers
 *	before it returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 431 */
 432void __dev_remove_pack(struct packet_type *pt)
 433{
 434	struct list_head *head = ptype_head(pt);
 435	struct packet_type *pt1;
 436
 437	spin_lock(&ptype_lock);
 438
 439	list_for_each_entry(pt1, head, list) {
 440		if (pt == pt1) {
 441			list_del_rcu(&pt->list);
 442			goto out;
 443		}
 444	}
 445
 446	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 447out:
 448	spin_unlock(&ptype_lock);
 449}
 450EXPORT_SYMBOL(__dev_remove_pack);
 451
 452/**
 453 *	dev_remove_pack	 - remove packet handler
 454 *	@pt: packet type declaration
 455 *
 456 *	Remove a protocol handler that was previously added to the kernel
 457 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 458 *	from the kernel lists and can be freed or reused once this function
 459 *	returns.
 460 *
 461 *	This call sleeps to guarantee that no CPU is looking at the packet
 462 *	type after return.
 463 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink the handler; this step on its own does not wait for readers. */
	__dev_remove_pack(pt);

	/* Wait here so that @pt may be freed or reused once we return. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
 471
 472/******************************************************************************
 473
 474		      Device Boot-time Settings Routines
 475
 476*******************************************************************************/
 477
 478/* Boot time configuration table */
 479static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 480
 481/**
 482 *	netdev_boot_setup_add	- add new setup entry
 483 *	@name: name of the device
 484 *	@map: configured settings for the device
 485 *
 486 *	Adds new setup entry to the dev_boot_setup list.  The function
 487 *	returns 0 on error and 1 on success.  This is a generic routine to
 488 *	all netdevices.
 489 */
 490static int netdev_boot_setup_add(char *name, struct ifmap *map)
 491{
 492	struct netdev_boot_setup *s;
 493	int i;
 494
 495	s = dev_boot_setup;
 496	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 497		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 498			memset(s[i].name, 0, sizeof(s[i].name));
 499			strlcpy(s[i].name, name, IFNAMSIZ);
 500			memcpy(&s[i].map, map, sizeof(s[i].map));
 501			break;
 502		}
 503	}
 504
 505	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 506}
 507
 508/**
 509 *	netdev_boot_setup_check	- check boot time settings
 510 *	@dev: the netdevice
 511 *
 512 * 	Check boot time settings for the device.
 513 *	The found settings are set for the device to be used
 514 *	later in the device probing.
 515 *	Returns 0 if no settings found, 1 if they are.
 516 */
 517int netdev_boot_setup_check(struct net_device *dev)
 518{
 519	struct netdev_boot_setup *s = dev_boot_setup;
 520	int i;
 521
 522	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 523		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 524		    !strcmp(dev->name, s[i].name)) {
 525			dev->irq 	= s[i].map.irq;
 526			dev->base_addr 	= s[i].map.base_addr;
 527			dev->mem_start 	= s[i].map.mem_start;
 528			dev->mem_end 	= s[i].map.mem_end;
 529			return 1;
 530		}
 531	}
 532	return 0;
 533}
 534EXPORT_SYMBOL(netdev_boot_setup_check);
 535
 536
 537/**
 538 *	netdev_boot_base	- get address from boot time settings
 539 *	@prefix: prefix for network device
 540 *	@unit: id for network device
 541 *
 542 * 	Check boot time settings for the base address of device.
 543 *	The found settings are set for the device to be used
 544 *	later in the device probing.
 545 *	Returns 0 if no settings found.
 546 */
 547unsigned long netdev_boot_base(const char *prefix, int unit)
 548{
 549	const struct netdev_boot_setup *s = dev_boot_setup;
 550	char name[IFNAMSIZ];
 551	int i;
 552
 553	sprintf(name, "%s%d", prefix, unit);
 554
 555	/*
 556	 * If device already registered then return base of 1
 557	 * to indicate not to probe for this interface
 558	 */
 559	if (__dev_get_by_name(&init_net, name))
 560		return 1;
 561
 562	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 563		if (!strcmp(name, s[i].name))
 564			return s[i].map.base_addr;
 565	return 0;
 566}
 567
 568/*
 569 * Saves at boot time configured settings for any netdevice.
 570 */
 571int __init netdev_boot_setup(char *str)
 572{
 573	int ints[5];
 574	struct ifmap map;
 575
 576	str = get_options(str, ARRAY_SIZE(ints), ints);
 577	if (!str || !*str)
 578		return 0;
 579
 580	/* Save settings */
 581	memset(&map, 0, sizeof(map));
 582	if (ints[0] > 0)
 583		map.irq = ints[1];
 584	if (ints[0] > 1)
 585		map.base_addr = ints[2];
 586	if (ints[0] > 2)
 587		map.mem_start = ints[3];
 588	if (ints[0] > 3)
 589		map.mem_end = ints[4];
 590
 591	/* Add new entry to the list */
 592	return netdev_boot_setup_add(str, &map);
 593}
 594
 595__setup("netdev=", netdev_boot_setup);
 596
 597/*******************************************************************************
 598
 599			    Device Interface Subroutines
 600
 601*******************************************************************************/
 602
 603/**
 604 *	__dev_get_by_name	- find a device by its name
 605 *	@net: the applicable net namespace
 606 *	@name: name to find
 607 *
 608 *	Find an interface by name. Must be called under RTNL semaphore
 609 *	or @dev_base_lock. If the name is found a pointer to the device
 610 *	is returned. If the name is not found then %NULL is returned. The
 611 *	reference counters are not incremented so the caller must be
 612 *	careful with locks.
 613 */
 614
 615struct net_device *__dev_get_by_name(struct net *net, const char *name)
 616{
 617	struct hlist_node *p;
 618	struct net_device *dev;
 619	struct hlist_head *head = dev_name_hash(net, name);
 620
 621	hlist_for_each_entry(dev, p, head, name_hlist)
 622		if (!strncmp(dev->name, name, IFNAMSIZ))
 623			return dev;
 624
 625	return NULL;
 626}
 627EXPORT_SYMBOL(__dev_get_by_name);
 628
 629/**
 630 *	dev_get_by_name_rcu	- find a device by its name
 631 *	@net: the applicable net namespace
 632 *	@name: name to find
 633 *
 634 *	Find an interface by name.
 635 *	If the name is found a pointer to the device is returned.
 636 * 	If the name is not found then %NULL is returned.
 637 *	The reference counters are not incremented so the caller must be
 638 *	careful with locks. The caller must hold RCU lock.
 639 */
 640
 641struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 642{
 643	struct hlist_node *p;
 644	struct net_device *dev;
 645	struct hlist_head *head = dev_name_hash(net, name);
 646
 647	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 648		if (!strncmp(dev->name, name, IFNAMSIZ))
 649			return dev;
 650
 651	return NULL;
 652}
 653EXPORT_SYMBOL(dev_get_by_name_rcu);
 654
 655/**
 656 *	dev_get_by_name		- find a device by its name
 657 *	@net: the applicable net namespace
 658 *	@name: name to find
 659 *
 660 *	Find an interface by name. This can be called from any
 661 *	context and does its own locking. The returned handle has
 662 *	the usage count incremented and the caller must use dev_put() to
 663 *	release it when it is no longer needed. %NULL is returned if no
 664 *	matching device is found.
 665 */
 666
 667struct net_device *dev_get_by_name(struct net *net, const char *name)
 668{
 669	struct net_device *dev;
 670
 671	rcu_read_lock();
 672	dev = dev_get_by_name_rcu(net, name);
 673	if (dev)
 674		dev_hold(dev);
 675	rcu_read_unlock();
 676	return dev;
 677}
 678EXPORT_SYMBOL(dev_get_by_name);
 679
 680/**
 681 *	__dev_get_by_index - find a device by its ifindex
 682 *	@net: the applicable net namespace
 683 *	@ifindex: index of device
 684 *
 685 *	Search for an interface by index. Returns %NULL if the device
 686 *	is not found or a pointer to the device. The device has not
 687 *	had its reference counter increased so the caller must be careful
 688 *	about locking. The caller must hold either the RTNL semaphore
 689 *	or @dev_base_lock.
 690 */
 691
 692struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 693{
 694	struct hlist_node *p;
 695	struct net_device *dev;
 696	struct hlist_head *head = dev_index_hash(net, ifindex);
 697
 698	hlist_for_each_entry(dev, p, head, index_hlist)
 699		if (dev->ifindex == ifindex)
 700			return dev;
 701
 702	return NULL;
 703}
 704EXPORT_SYMBOL(__dev_get_by_index);
 705
 706/**
 707 *	dev_get_by_index_rcu - find a device by its ifindex
 708 *	@net: the applicable net namespace
 709 *	@ifindex: index of device
 710 *
 711 *	Search for an interface by index. Returns %NULL if the device
 712 *	is not found or a pointer to the device. The device has not
 713 *	had its reference counter increased so the caller must be careful
 714 *	about locking. The caller must hold RCU lock.
 715 */
 716
 717struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 718{
 719	struct hlist_node *p;
 720	struct net_device *dev;
 721	struct hlist_head *head = dev_index_hash(net, ifindex);
 722
 723	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 724		if (dev->ifindex == ifindex)
 725			return dev;
 726
 727	return NULL;
 728}
 729EXPORT_SYMBOL(dev_get_by_index_rcu);
 730
 731
 732/**
 733 *	dev_get_by_index - find a device by its ifindex
 734 *	@net: the applicable net namespace
 735 *	@ifindex: index of device
 736 *
 737 *	Search for an interface by index. Returns NULL if the device
 738 *	is not found or a pointer to the device. The device returned has
 739 *	had a reference added and the pointer is safe until the user calls
 740 *	dev_put to indicate they have finished with it.
 741 */
 742
 743struct net_device *dev_get_by_index(struct net *net, int ifindex)
 744{
 745	struct net_device *dev;
 746
 747	rcu_read_lock();
 748	dev = dev_get_by_index_rcu(net, ifindex);
 749	if (dev)
 750		dev_hold(dev);
 751	rcu_read_unlock();
 752	return dev;
 753}
 754EXPORT_SYMBOL(dev_get_by_index);
 755
 756/**
 757 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 758 *	@net: the applicable net namespace
 759 *	@type: media type of device
 760 *	@ha: hardware address
 761 *
 762 *	Search for an interface by MAC address. Returns NULL if the device
 763 *	is not found or a pointer to the device.
 764 *	The caller must hold RCU or RTNL.
 765 *	The returned device has not had its ref count increased
 766 *	and the caller must therefore be careful about locking
 767 *
 768 */
 769
 770struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 771				       const char *ha)
 772{
 773	struct net_device *dev;
 774
 775	for_each_netdev_rcu(net, dev)
 776		if (dev->type == type &&
 777		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 778			return dev;
 779
 780	return NULL;
 781}
 782EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 783
 784struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 785{
 786	struct net_device *dev;
 787
 788	ASSERT_RTNL();
 789	for_each_netdev(net, dev)
 790		if (dev->type == type)
 791			return dev;
 792
 793	return NULL;
 794}
 795EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 796
 797struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 798{
 799	struct net_device *dev, *ret = NULL;
 800
 801	rcu_read_lock();
 802	for_each_netdev_rcu(net, dev)
 803		if (dev->type == type) {
 804			dev_hold(dev);
 805			ret = dev;
 806			break;
 807		}
 808	rcu_read_unlock();
 809	return ret;
 810}
 811EXPORT_SYMBOL(dev_getfirstbyhwtype);
 812
 813/**
 814 *	dev_get_by_flags_rcu - find any device with given flags
 815 *	@net: the applicable net namespace
 816 *	@if_flags: IFF_* values
 817 *	@mask: bitmask of bits in if_flags to check
 818 *
 819 *	Search for any interface with the given flags. Returns NULL if a device
 820 *	is not found or a pointer to the device. Must be called inside
 821 *	rcu_read_lock(), and result refcount is unchanged.
 822 */
 823
 824struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 825				    unsigned short mask)
 826{
 827	struct net_device *dev, *ret;
 828
 829	ret = NULL;
 830	for_each_netdev_rcu(net, dev) {
 831		if (((dev->flags ^ if_flags) & mask) == 0) {
 832			ret = dev;
 833			break;
 834		}
 835	}
 836	return ret;
 837}
 838EXPORT_SYMBOL(dev_get_by_flags_rcu);
 839
 840/**
 841 *	dev_valid_name - check if name is okay for network device
 842 *	@name: name string
 843 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 847 */
 848int dev_valid_name(const char *name)
 849{
 850	if (*name == '\0')
 851		return 0;
 852	if (strlen(name) >= IFNAMSIZ)
 853		return 0;
 854	if (!strcmp(name, ".") || !strcmp(name, ".."))
 855		return 0;
 856
 857	while (*name) {
 858		if (*name == '/' || isspace(*name))
 859			return 0;
 860		name++;
 861	}
 862	return 1;
 863}
 864EXPORT_SYMBOL(dev_valid_name);
 865
 866/**
 867 *	__dev_alloc_name - allocate a name for a device
 868 *	@net: network namespace to allocate the device name in
 869 *	@name: name format string
 870 *	@buf:  scratch buffer and result name string
 871 *
 872 *	Passed a format string - eg "lt%d" it will try and find a suitable
 873 *	id. It scans list of devices to build up a free map, then chooses
 874 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 875 *	while allocating the name and adding the device in order to avoid
 876 *	duplicates.
 877 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 878 *	Returns the number of the unit assigned or a negative errno code.
 879 */
 880
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	/* Unit number; stays 0 when @name contains no '%'. */
	int i = 0;
	const char *p;
	/* One page used as a bitmap: one bit per candidate unit number. */
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		/* Mark every unit number already taken by an existing device. */
		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* The lowest unit number that no device uses yet. */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* Build the candidate name, then make sure it really is unused. */
	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
 931
 932/**
 933 *	dev_alloc_name - allocate a name for a device
 934 *	@dev: device
 935 *	@name: name format string
 936 *
 937 *	Passed a format string - eg "lt%d" it will try and find a suitable
 938 *	id. It scans list of devices to build up a free map, then chooses
 939 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 940 *	while allocating the name and adding the device in order to avoid
 941 *	duplicates.
 942 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 943 *	Returns the number of the unit assigned or a negative errno code.
 944 */
 945
 946int dev_alloc_name(struct net_device *dev, const char *name)
 947{
 948	char buf[IFNAMSIZ];
 949	struct net *net;
 950	int ret;
 951
 952	BUG_ON(!dev_net(dev));
 953	net = dev_net(dev);
 954	ret = __dev_alloc_name(net, name, buf);
 955	if (ret >= 0)
 956		strlcpy(dev->name, buf, IFNAMSIZ);
 957	return ret;
 958}
 959EXPORT_SYMBOL(dev_alloc_name);
 960
 961static int dev_get_valid_name(struct net_device *dev, const char *name)
 962{
 963	struct net *net;
 964
 965	BUG_ON(!dev_net(dev));
 966	net = dev_net(dev);
 967
 968	if (!dev_valid_name(name))
 969		return -EINVAL;
 970
 971	if (strchr(name, '%'))
 972		return dev_alloc_name(dev, name);
 973	else if (__dev_get_by_name(net, name))
 974		return -EEXIST;
 975	else if (dev->name != name)
 976		strlcpy(dev->name, name, IFNAMSIZ);
 977
 978	return 0;
 979}
 980
 981/**
 982 *	dev_change_name - change name of a device
 983 *	@dev: device
 984 *	@newname: name (or format string) must be at least IFNAMSIZ
 985 *
 986 *	Change name of a device, can pass format strings "eth%d".
 987 *	for wildcarding.
 988 */
 989int dev_change_name(struct net_device *dev, const char *newname)
 990{
 991	char oldname[IFNAMSIZ];
 992	int err = 0;
 993	int ret;
 994	struct net *net;
 995
 996	ASSERT_RTNL();
 997	BUG_ON(!dev_net(dev));
 998
 999	net = dev_net(dev);
1000	if (dev->flags & IFF_UP)
1001		return -EBUSY;
1002
1003	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1004		return 0;
1005
1006	memcpy(oldname, dev->name, IFNAMSIZ);
1007
1008	err = dev_get_valid_name(dev, newname);
1009	if (err < 0)
1010		return err;
1011
1012rollback:
1013	ret = device_rename(&dev->dev, dev->name);
1014	if (ret) {
1015		memcpy(dev->name, oldname, IFNAMSIZ);
1016		return ret;
1017	}
1018
1019	write_lock_bh(&dev_base_lock);
1020	hlist_del_rcu(&dev->name_hlist);
1021	write_unlock_bh(&dev_base_lock);
1022
1023	synchronize_rcu();
1024
1025	write_lock_bh(&dev_base_lock);
1026	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1027	write_unlock_bh(&dev_base_lock);
1028
1029	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1030	ret = notifier_to_errno(ret);
1031
1032	if (ret) {
1033		/* err >= 0 after dev_alloc_name() or stores the first errno */
1034		if (err >= 0) {
1035			err = ret;
1036			memcpy(dev->name, oldname, IFNAMSIZ);
1037			goto rollback;
1038		} else {
1039			printk(KERN_ERR
1040			       "%s: name change rollback failed: %d.\n",
1041			       dev->name, ret);
1042		}
1043	}
1044
1045	return err;
1046}
1047
1048/**
1049 *	dev_set_alias - change ifalias of a device
1050 *	@dev: device
1051 *	@alias: name up to IFALIASZ
1052 *	@len: limit of bytes to copy from info
1053 *
1054 *	Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058	ASSERT_RTNL();
1059
1060	if (len >= IFALIASZ)
1061		return -EINVAL;
1062
1063	if (!len) {
1064		if (dev->ifalias) {
1065			kfree(dev->ifalias);
1066			dev->ifalias = NULL;
1067		}
1068		return 0;
1069	}
1070
1071	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072	if (!dev->ifalias)
1073		return -ENOMEM;
1074
1075	strlcpy(dev->ifalias, alias, len+1);
1076	return len;
1077}
1078
1079
1080/**
1081 *	netdev_features_change - device changes features
1082 *	@dev: device to cause notification
1083 *
1084 *	Called to indicate a device has changed features.
1085 */
1086void netdev_features_change(struct net_device *dev)
1087{
1088	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1089}
1090EXPORT_SYMBOL(netdev_features_change);
1091
1092/**
1093 *	netdev_state_change - device changes state
1094 *	@dev: device to cause notification
1095 *
1096 *	Called to indicate a device has changed state. This function calls
1097 *	the notifier chains for netdev_chain and sends a NEWLINK message
1098 *	to the routing socket.
1099 */
1100void netdev_state_change(struct net_device *dev)
1101{
1102	if (dev->flags & IFF_UP) {
1103		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105	}
1106}
1107EXPORT_SYMBOL(netdev_state_change);
1108
/*
 * Forward @event for @dev to the netdevice notifiers and hand back the
 * raw notifier chain result.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int verdict = call_netdevice_notifiers(event, dev);

	return verdict;
}
EXPORT_SYMBOL(netdev_bonding_change);
1114
1115/**
1116 *	dev_load 	- load a network module
1117 *	@net: the applicable net namespace
1118 *	@name: name of interface
1119 *
1120 *	If a network interface is not present and the process has suitable
1121 *	privileges this function loads the module. If module loading is not
1122 *	available in this kernel then it becomes a nop.
1123 */
1124
1125void dev_load(struct net *net, const char *name)
1126{
1127	struct net_device *dev;
1128	int no_module;
1129
1130	rcu_read_lock();
1131	dev = dev_get_by_name_rcu(net, name);
1132	rcu_read_unlock();
1133
1134	no_module = !dev;
1135	if (no_module && capable(CAP_NET_ADMIN))
1136		no_module = request_module("netdev-%s", name);
1137	if (no_module && capable(CAP_SYS_MODULE)) {
1138		if (!request_module("%s", name))
1139			pr_err("Loading kernel module for a network device "
1140"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1141"instead\n", name);
1142	}
1143}
1144EXPORT_SYMBOL(dev_load);
1145
1146static int __dev_open(struct net_device *dev)
1147{
1148	const struct net_device_ops *ops = dev->netdev_ops;
1149	int ret;
1150
1151	ASSERT_RTNL();
1152
1153	if (!netif_device_present(dev))
1154		return -ENODEV;
1155
1156	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157	ret = notifier_to_errno(ret);
1158	if (ret)
1159		return ret;
1160
1161	set_bit(__LINK_STATE_START, &dev->state);
1162
1163	if (ops->ndo_validate_addr)
1164		ret = ops->ndo_validate_addr(dev);
1165
1166	if (!ret && ops->ndo_open)
1167		ret = ops->ndo_open(dev);
1168
1169	if (ret)
1170		clear_bit(__LINK_STATE_START, &dev->state);
1171	else {
1172		dev->flags |= IFF_UP;
1173		net_dmaengine_get();
1174		dev_set_rx_mode(dev);
1175		dev_activate(dev);
1176	}
1177
1178	return ret;
1179}
1180
1181/**
1182 *	dev_open	- prepare an interface for use.
1183 *	@dev:	device to open
1184 *
1185 *	Takes a device from down to up state. The device's private open
1186 *	function is invoked and then the multicast lists are loaded. Finally
1187 *	the device is moved into the up state and a %NETDEV_UP message is
1188 *	sent to the netdev notifier chain.
1189 *
1190 *	Calling this function on an active interface is a nop. On a failure
1191 *	a negative errno code is returned.
1192 */
1193int dev_open(struct net_device *dev)
1194{
1195	int ret;
1196
1197	if (dev->flags & IFF_UP)
1198		return 0;
1199
1200	ret = __dev_open(dev);
1201	if (ret < 0)
1202		return ret;
1203
1204	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205	call_netdevice_notifiers(NETDEV_UP, dev);
1206
1207	return ret;
1208}
1209EXPORT_SYMBOL(dev_open);
1210
1211static int __dev_close_many(struct list_head *head)
1212{
1213	struct net_device *dev;
1214
1215	ASSERT_RTNL();
1216	might_sleep();
1217
1218	list_for_each_entry(dev, head, unreg_list) {
1219		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1220
1221		clear_bit(__LINK_STATE_START, &dev->state);
1222
1223		/* Synchronize to scheduled poll. We cannot touch poll list, it
1224		 * can be even on different cpu. So just clear netif_running().
1225		 *
1226		 * dev->stop() will invoke napi_disable() on all of it's
1227		 * napi_struct instances on this device.
1228		 */
1229		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230	}
1231
1232	dev_deactivate_many(head);
1233
1234	list_for_each_entry(dev, head, unreg_list) {
1235		const struct net_device_ops *ops = dev->netdev_ops;
1236
1237		/*
1238		 *	Call the device specific close. This cannot fail.
1239		 *	Only if device is UP
1240		 *
1241		 *	We allow it to be called even after a DETACH hot-plug
1242		 *	event.
1243		 */
1244		if (ops->ndo_stop)
1245			ops->ndo_stop(dev);
1246
1247		dev->flags &= ~IFF_UP;
1248		net_dmaengine_put();
1249	}
1250
1251	return 0;
1252}
1253
1254static int __dev_close(struct net_device *dev)
1255{
1256	int retval;
1257	LIST_HEAD(single);
1258
1259	list_add(&dev->unreg_list, &single);
1260	retval = __dev_close_many(&single);
1261	list_del(&single);
1262	return retval;
1263}
1264
1265static int dev_close_many(struct list_head *head)
1266{
1267	struct net_device *dev, *tmp;
1268	LIST_HEAD(tmp_list);
1269
1270	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271		if (!(dev->flags & IFF_UP))
1272			list_move(&dev->unreg_list, &tmp_list);
1273
1274	__dev_close_many(head);
1275
1276	list_for_each_entry(dev, head, unreg_list) {
1277		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278		call_netdevice_notifiers(NETDEV_DOWN, dev);
1279	}
1280
1281	/* rollback_registered_many needs the complete original list */
1282	list_splice(&tmp_list, head);
1283	return 0;
1284}
1285
1286/**
1287 *	dev_close - shutdown an interface.
1288 *	@dev: device to shutdown
1289 *
1290 *	This function moves an active device into down state. A
1291 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293 *	chain.
1294 */
1295int dev_close(struct net_device *dev)
1296{
1297	if (dev->flags & IFF_UP) {
1298		LIST_HEAD(single);
1299
1300		list_add(&dev->unreg_list, &single);
1301		dev_close_many(&single);
1302		list_del(&single);
1303	}
1304	return 0;
1305}
1306EXPORT_SYMBOL(dev_close);
1307
1308
1309/**
1310 *	dev_disable_lro - disable Large Receive Offload on a device
1311 *	@dev: device
1312 *
1313 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1314 *	called under RTNL.  This is needed if received packets may be
1315 *	forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319	u32 flags;
1320
1321	/*
1322	 * If we're trying to disable lro on a vlan device
1323	 * use the underlying physical device instead
1324	 */
1325	if (is_vlan_dev(dev))
1326		dev = vlan_dev_real_dev(dev);
1327
1328	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1329		flags = dev->ethtool_ops->get_flags(dev);
1330	else
1331		flags = ethtool_op_get_flags(dev);
1332
1333	if (!(flags & ETH_FLAG_LRO))
1334		return;
1335
1336	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1337	if (unlikely(dev->features & NETIF_F_LRO))
1338		netdev_WARN(dev, "failed to disable LRO!\n");
1339}
1340EXPORT_SYMBOL(dev_disable_lro);
1341
1342
1343static int dev_boot_phase = 1;
1344
1345/**
1346 *	register_netdevice_notifier - register a network notifier block
1347 *	@nb: notifier
1348 *
1349 *	Register a notifier to be called when network device events occur.
1350 *	The notifier passed is linked into the kernel structures and must
1351 *	not be reused until it has been unregistered. A negative errno code
1352 *	is returned on a failure.
1353 *
1354 * 	When registered all registration and up events are replayed
1355 *	to the new notifier to allow device to have a race free
1356 *	view of the network device list.
1357 */
1358
1359int register_netdevice_notifier(struct notifier_block *nb)
1360{
1361	struct net_device *dev;
1362	struct net_device *last;
1363	struct net *net;
1364	int err;
1365
1366	rtnl_lock();
1367	err = raw_notifier_chain_register(&netdev_chain, nb);
1368	if (err)
1369		goto unlock;
1370	if (dev_boot_phase)
1371		goto unlock;
1372	for_each_net(net) {
1373		for_each_netdev(net, dev) {
1374			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1375			err = notifier_to_errno(err);
1376			if (err)
1377				goto rollback;
1378
1379			if (!(dev->flags & IFF_UP))
1380				continue;
1381
1382			nb->notifier_call(nb, NETDEV_UP, dev);
1383		}
1384	}
1385
1386unlock:
1387	rtnl_unlock();
1388	return err;
1389
1390rollback:
1391	last = dev;
1392	for_each_net(net) {
1393		for_each_netdev(net, dev) {
1394			if (dev == last)
1395				break;
1396
1397			if (dev->flags & IFF_UP) {
1398				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1399				nb->notifier_call(nb, NETDEV_DOWN, dev);
1400			}
1401			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1402			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1403		}
1404	}
1405
1406	raw_notifier_chain_unregister(&netdev_chain, nb);
1407	goto unlock;
1408}
1409EXPORT_SYMBOL(register_netdevice_notifier);
1410
1411/**
1412 *	unregister_netdevice_notifier - unregister a network notifier block
1413 *	@nb: notifier
1414 *
1415 *	Unregister a notifier previously registered by
1416 *	register_netdevice_notifier(). The notifier is unlinked into the
1417 *	kernel structures and may then be reused. A negative errno code
1418 *	is returned on a failure.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423	int err;
1424
1425	rtnl_lock();
1426	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1427	rtnl_unlock();
1428	return err;
1429}
1430EXPORT_SYMBOL(unregister_netdevice_notifier);
1431
1432/**
1433 *	call_netdevice_notifiers - call all network notifier blocks
1434 *      @val: value passed unmodified to notifier function
1435 *      @dev: net_device pointer passed unmodified to notifier function
1436 *
1437 *	Call all network notifier blocks.  Parameters and return value
1438 *	are as for raw_notifier_call_chain().
1439 */
1440
1441int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1442{
1443	ASSERT_RTNL();
1444	return raw_notifier_call_chain(&netdev_chain, val, dev);
1445}
1446EXPORT_SYMBOL(call_netdevice_notifiers);
1447
1448/* When > 0 there are consumers of rx skb time stamps */
1449static atomic_t netstamp_needed = ATOMIC_INIT(0);
1450
1451void net_enable_timestamp(void)
1452{
1453	atomic_inc(&netstamp_needed);
1454}
1455EXPORT_SYMBOL(net_enable_timestamp);
1456
1457void net_disable_timestamp(void)
1458{
1459	atomic_dec(&netstamp_needed);
1460}
1461EXPORT_SYMBOL(net_disable_timestamp);
1462
1463static inline void net_timestamp_set(struct sk_buff *skb)
1464{
1465	if (atomic_read(&netstamp_needed))
1466		__net_timestamp(skb);
1467	else
1468		skb->tstamp.tv64 = 0;
1469}
1470
1471static inline void net_timestamp_check(struct sk_buff *skb)
1472{
1473	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1474		__net_timestamp(skb);
1475}
1476
1477static inline bool is_skb_forwardable(struct net_device *dev,
1478				      struct sk_buff *skb)
1479{
1480	unsigned int len;
1481
1482	if (!(dev->flags & IFF_UP))
1483		return false;
1484
1485	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1486	if (skb->len <= len)
1487		return true;
1488
1489	/* if TSO is enabled, we don't care about the length as the packet
1490	 * could be forwarded without being segmented before
1491	 */
1492	if (skb_is_gso(skb))
1493		return true;
1494
1495	return false;
1496}
1497
1498/**
1499 * dev_forward_skb - loopback an skb to another netif
1500 *
1501 * @dev: destination network device
1502 * @skb: buffer to forward
1503 *
1504 * return values:
1505 *	NET_RX_SUCCESS	(no congestion)
1506 *	NET_RX_DROP     (packet was dropped, but freed)
1507 *
1508 * dev_forward_skb can be used for injecting an skb from the
1509 * start_xmit function of one device into the receive queue
1510 * of another device.
1511 *
1512 * The receiving device may be in another namespace, so
1513 * we have to clear all information in the skb that could
1514 * impact namespace isolation.
1515 */
1516int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1517{
1518	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1519		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1520			atomic_long_inc(&dev->rx_dropped);
1521			kfree_skb(skb);
1522			return NET_RX_DROP;
1523		}
1524	}
1525
1526	skb_orphan(skb);
1527	nf_reset(skb);
1528
1529	if (unlikely(!is_skb_forwardable(dev, skb))) {
1530		atomic_long_inc(&dev->rx_dropped);
1531		kfree_skb(skb);
1532		return NET_RX_DROP;
1533	}
1534	skb_set_dev(skb, dev);
1535	skb->tstamp.tv64 = 0;
1536	skb->pkt_type = PACKET_HOST;
1537	skb->protocol = eth_type_trans(skb, dev);
1538	return netif_rx(skb);
1539}
1540EXPORT_SYMBOL_GPL(dev_forward_skb);
1541
1542static inline int deliver_skb(struct sk_buff *skb,
1543			      struct packet_type *pt_prev,
1544			      struct net_device *orig_dev)
1545{
1546	atomic_inc(&skb->users);
1547	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1548}
1549
1550/*
1551 *	Support routine. Sends outgoing frames to any network
1552 *	taps currently in use.
1553 */
1554
1555static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1556{
1557	struct packet_type *ptype;
1558	struct sk_buff *skb2 = NULL;
1559	struct packet_type *pt_prev = NULL;
1560
1561	rcu_read_lock();
1562	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1563		/* Never send packets back to the socket
1564		 * they originated from - MvS (miquels@drinkel.ow.org)
1565		 */
1566		if ((ptype->dev == dev || !ptype->dev) &&
1567		    (ptype->af_packet_priv == NULL ||
1568		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1569			if (pt_prev) {
1570				deliver_skb(skb2, pt_prev, skb->dev);
1571				pt_prev = ptype;
1572				continue;
1573			}
1574
1575			skb2 = skb_clone(skb, GFP_ATOMIC);
1576			if (!skb2)
1577				break;
1578
1579			net_timestamp_set(skb2);
1580
1581			/* skb->nh should be correctly
1582			   set by sender, so that the second statement is
1583			   just protection against buggy protocols.
1584			 */
1585			skb_reset_mac_header(skb2);
1586
1587			if (skb_network_header(skb2) < skb2->data ||
1588			    skb2->network_header > skb2->tail) {
1589				if (net_ratelimit())
1590					printk(KERN_CRIT "protocol %04x is "
1591					       "buggy, dev %s\n",
1592					       ntohs(skb2->protocol),
1593					       dev->name);
1594				skb_reset_network_header(skb2);
1595			}
1596
1597			skb2->transport_header = skb2->network_header;
1598			skb2->pkt_type = PACKET_OUTGOING;
1599			pt_prev = ptype;
1600		}
1601	}
1602	if (pt_prev)
1603		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1604	rcu_read_unlock();
1605}
1606
1607/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1608 * @dev: Network device
1609 * @txq: number of queues available
1610 *
1611 * If real_num_tx_queues is changed the tc mappings may no longer be
1612 * valid. To resolve this verify the tc mapping remains valid and if
1613 * not NULL the mapping. With no priorities mapping to this
1614 * offset/count pair it will no longer be used. In the worst case TC0
1615 * is invalid nothing can be done so disable priority mappings. If is
1616 * expected that drivers will fix this mapping if they can before
1617 * calling netif_set_real_num_tx_queues.
1618 */
1619static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1620{
1621	int i;
1622	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1623
1624	/* If TC0 is invalidated disable TC mapping */
1625	if (tc->offset + tc->count > txq) {
1626		pr_warning("Number of in use tx queues changed "
1627			   "invalidating tc mappings. Priority "
1628			   "traffic classification disabled!\n");
1629		dev->num_tc = 0;
1630		return;
1631	}
1632
1633	/* Invalidated prio to tc mappings set to TC0 */
1634	for (i = 1; i < TC_BITMASK + 1; i++) {
1635		int q = netdev_get_prio_tc_map(dev, i);
1636
1637		tc = &dev->tc_to_txq[q];
1638		if (tc->offset + tc->count > txq) {
1639			pr_warning("Number of in use tx queues "
1640				   "changed. Priority %i to tc "
1641				   "mapping %i is no longer valid "
1642				   "setting map to 0\n",
1643				   i, q);
1644			netdev_set_prio_tc_map(dev, i, 0);
1645		}
1646	}
1647}
1648
1649/*
1650 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1651 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1652 */
1653int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1654{
1655	int rc;
1656
1657	if (txq < 1 || txq > dev->num_tx_queues)
1658		return -EINVAL;
1659
1660	if (dev->reg_state == NETREG_REGISTERED ||
1661	    dev->reg_state == NETREG_UNREGISTERING) {
1662		ASSERT_RTNL();
1663
1664		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1665						  txq);
1666		if (rc)
1667			return rc;
1668
1669		if (dev->num_tc)
1670			netif_setup_tc(dev, txq);
1671
1672		if (txq < dev->real_num_tx_queues)
1673			qdisc_reset_all_tx_gt(dev, txq);
1674	}
1675
1676	dev->real_num_tx_queues = txq;
1677	return 0;
1678}
1679EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1680
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code (-EINVAL when @rxq is 0 or larger than
 *	dev->num_rx_queues).  For a device that is not registered no
 *	kobject update is attempted.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int rc;

		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1713
1714static inline void __netif_reschedule(struct Qdisc *q)
1715{
1716	struct softnet_data *sd;
1717	unsigned long flags;
1718
1719	local_irq_save(flags);
1720	sd = &__get_cpu_var(softnet_data);
1721	q->next_sched = NULL;
1722	*sd->output_queue_tailp = q;
1723	sd->output_queue_tailp = &q->next_sched;
1724	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1725	local_irq_restore(flags);
1726}
1727
1728void __netif_schedule(struct Qdisc *q)
1729{
1730	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1731		__netif_reschedule(q);
1732}
1733EXPORT_SYMBOL(__netif_schedule);
1734
1735void dev_kfree_skb_irq(struct sk_buff *skb)
1736{
1737	if (atomic_dec_and_test(&skb->users)) {
1738		struct softnet_data *sd;
1739		unsigned long flags;
1740
1741		local_irq_save(flags);
1742		sd = &__get_cpu_var(softnet_data);
1743		skb->next = sd->completion_queue;
1744		sd->completion_queue = skb;
1745		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1746		local_irq_restore(flags);
1747	}
1748}
1749EXPORT_SYMBOL(dev_kfree_skb_irq);
1750
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, and
 * dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1759
1760
1761/**
1762 * netif_device_detach - mark device as removed
1763 * @dev: network device
1764 *
1765 * Mark device as removed from system and therefore no longer available.
1766 */
1767void netif_device_detach(struct net_device *dev)
1768{
1769	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1770	    netif_running(dev)) {
1771		netif_tx_stop_all_queues(dev);
1772	}
1773}
1774EXPORT_SYMBOL(netif_device_detach);
1775
1776/**
1777 * netif_device_attach - mark device as attached
1778 * @dev: network device
1779 *
1780 * Mark device as attached from system and restart if needed.
1781 */
1782void netif_device_attach(struct net_device *dev)
1783{
1784	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1785	    netif_running(dev)) {
1786		netif_tx_wake_all_queues(dev);
1787		__netdev_watchdog_up(dev);
1788	}
1789}
1790EXPORT_SYMBOL(netif_device_attach);
1791
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device. The dst reference is always
 * dropped; the remaining state is only cleared when the old and the
 * new device live in different namespaces.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);

	/* No previous owner, or same namespace: keep the skb state. */
	if (!skb->dev || net_eq(dev_net(skb->dev), dev_net(dev)))
		goto assign;

	secpath_reset(skb);
	nf_reset(skb);
	skb_init_secmark(skb);
	skb->mark = 0;
	skb->priority = 0;
	skb->nf_trace = 0;
	skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#endif

assign:
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1821
1822/*
1823 * Invalidate hardware checksum when packet is to be mangled, and
1824 * complete checksum manually on outgoing path.
1825 */
1826int skb_checksum_help(struct sk_buff *skb)
1827{
1828	__wsum csum;
1829	int ret = 0, offset;
1830
1831	if (skb->ip_summed == CHECKSUM_COMPLETE)
1832		goto out_set_summed;
1833
1834	if (unlikely(skb_shinfo(skb)->gso_size)) {
1835		/* Let GSO fix up the checksum. */
1836		goto out_set_summed;
1837	}
1838
1839	offset = skb_checksum_start_offset(skb);
1840	BUG_ON(offset >= skb_headlen(skb));
1841	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1842
1843	offset += skb->csum_offset;
1844	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1845
1846	if (skb_cloned(skb) &&
1847	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1848		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1849		if (ret)
1850			goto out;
1851	}
1852
1853	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1854out_set_summed:
1855	skb->ip_summed = CHECKSUM_NONE;
1856out:
1857	return ret;
1858}
1859EXPORT_SYMBOL(skb_checksum_help);
1860
1861/**
1862 *	skb_gso_segment - Perform segmentation on skb.
1863 *	@skb: buffer to segment
1864 *	@features: features for the output path (see dev->features)
1865 *
1866 *	This function segments the given skb and returns a list of segments.
1867 *
1868 *	It may return NULL if the skb requires no segmentation.  This is
1869 *	only possible when GSO is used for verifying header integrity.
1870 */
1871struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1872{
1873	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1874	struct packet_type *ptype;
1875	__be16 type = skb->protocol;
1876	int vlan_depth = ETH_HLEN;
1877	int err;
1878
1879	while (type == htons(ETH_P_8021Q)) {
1880		struct vlan_hdr *vh;
1881
1882		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1883			return ERR_PTR(-EINVAL);
1884
1885		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1886		type = vh->h_vlan_encapsulated_proto;
1887		vlan_depth += VLAN_HLEN;
1888	}
1889
1890	skb_reset_mac_header(skb);
1891	skb->mac_len = skb->network_header - skb->mac_header;
1892	__skb_pull(skb, skb->mac_len);
1893
1894	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1895		struct net_device *dev = skb->dev;
1896		struct ethtool_drvinfo info = {};
1897
1898		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1899			dev->ethtool_ops->get_drvinfo(dev, &info);
1900
1901		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1902		     info.driver, dev ? dev->features : 0L,
1903		     skb->sk ? skb->sk->sk_route_caps : 0L,
1904		     skb->len, skb->data_len, skb->ip_summed);
1905
1906		if (skb_header_cloned(skb) &&
1907		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1908			return ERR_PTR(err);
1909	}
1910
1911	rcu_read_lock();
1912	list_for_each_entry_rcu(ptype,
1913			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1914		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1915			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1916				err = ptype->gso_send_check(skb);
1917				segs = ERR_PTR(err);
1918				if (err || skb_gso_ok(skb, features))
1919					break;
1920				__skb_push(skb, (skb->data -
1921						 skb_network_header(skb)));
1922			}
1923			segs = ptype->gso_segment(skb, features);
1924			break;
1925		}
1926	}
1927	rcu_read_unlock();
1928
1929	__skb_push(skb, skb->data - skb_mac_header(skb));
1930
1931	return segs;
1932}
1933EXPORT_SYMBOL(skb_gso_segment);
1934
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1947
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a page fragment of @skb cannot be DMA'd by @dev:
 * either it sits in high memory and the device lacks NETIF_F_HIGHDMA,
 * or (when PCI_DMA_BUS_IS_PHYS) it lies beyond the parent device's DMA
 * mask. Returns 0 otherwise, and always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int frag;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
			if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
		dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[frag].page);

		/* The last byte of the page must be within the DMA mask. */
		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
1977
1978struct dev_gso_cb {
1979	void (*destructor)(struct sk_buff *skb);
1980};
1981
1982#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1983
1984static void dev_gso_skb_destructor(struct sk_buff *skb)
1985{
1986	struct dev_gso_cb *cb;
1987
1988	do {
1989		struct sk_buff *nskb = skb->next;
1990
1991		skb->next = nskb->next;
1992		nskb->next = NULL;
1993		kfree_skb(nskb);
1994	} while (skb->next);
1995
1996	cb = DEV_GSO_CB(skb);
1997	if (cb->destructor)
1998		cb->destructor(skb);
1999}
2000
2001/**
2002 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2003 *	@skb: buffer to segment
2004 *	@features: device features as applicable to this skb
2005 *
2006 *	This function segments the given skb and stores the list of segments
2007 *	in skb->next.
2008 */
2009static int dev_gso_segment(struct sk_buff *skb, int features)
2010{
2011	struct sk_buff *segs;
2012
2013	segs = skb_gso_segment(skb, features);
2014
2015	/* Verifying header integrity only. */
2016	if (!segs)
2017		return 0;
2018
2019	if (IS_ERR(segs))
2020		return PTR_ERR(segs);
2021
2022	skb->next = segs;
2023	DEV_GSO_CB(skb)->destructor = skb->destructor;
2024	skb->destructor = dev_gso_skb_destructor;
2025
2026	return 0;
2027}
2028
2029/*
2030 * Try to orphan skb early, right before transmission by the device.
2031 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2032 * is needed on driver level for other reasons, e.g. see net/can/raw.c
2033 */
2034static inline void skb_orphan_try(struct sk_buff *skb)
2035{
2036	struct sock *sk = skb->sk;
2037
2038	if (sk && !skb_shinfo(skb)->tx_flags) {
2039		/* skb_tx_hash() wont be able to get sk.
2040		 * We copy sk_hash into skb->rxhash
2041		 */
2042		if (!skb->rxhash)
2043			skb->rxhash = sk->sk_hash;
2044		skb_orphan(skb);
2045	}
2046}
2047
2048static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2049{
2050	return ((features & NETIF_F_GEN_CSUM) ||
2051		((features & NETIF_F_V4_CSUM) &&
2052		 protocol == htons(ETH_P_IP)) ||
2053		((features & NETIF_F_V6_CSUM) &&
2054		 protocol == htons(ETH_P_IPV6)) ||
2055		((features & NETIF_F_FCOE_CRC) &&
2056		 protocol == htons(ETH_P_FCOE)));
2057}
2058
2059static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2060{
2061	if (!can_checksum_protocol(features, protocol)) {
2062		features &= ~NETIF_F_ALL_CSUM;
2063		features &= ~NETIF_F_SG;
2064	} else if (illegal_highdma(skb->dev, skb)) {
2065		features &= ~NETIF_F_SG;
2066	}
2067
2068	return features;
2069}
2070
2071u32 netif_skb_features(struct sk_buff *skb)
2072{
2073	__be16 protocol = skb->protocol;
2074	u32 features = skb->dev->features;
2075
2076	if (protocol == htons(ETH_P_8021Q)) {
2077		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2078		protocol = veh->h_vlan_encapsulated_proto;
2079	} else if (!vlan_tx_tag_present(skb)) {
2080		return harmonize_features(skb, protocol, features);
2081	}
2082
2083	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2084
2085	if (protocol != htons(ETH_P_8021Q)) {
2086		return harmonize_features(skb, protocol, features);
2087	} else {
2088		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2089				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2090		return harmonize_features(skb, protocol, features);
2091	}
2092}
2093EXPORT_SYMBOL(netif_skb_features);
2094
2095/*
2096 * Returns true if either:
2097 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2098 *	2. skb is fragmented and the device does not support SG, or if
2099 *	   at least one of fragments is in highmem and device does not
2100 *	   support DMA from it.
2101 */
2102static inline int skb_needs_linearize(struct sk_buff *skb,
2103				      int features)
2104{
2105	return skb_is_nonlinear(skb) &&
2106			((skb_has_frag_list(skb) &&
2107				!(features & NETIF_F_FRAGLIST)) ||
2108			(skb_shinfo(skb)->nr_frags &&
2109				!(features & NETIF_F_SG)));
2110}
2111
2112int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2113			struct netdev_queue *txq)
2114{
2115	const struct net_device_ops *ops = dev->netdev_ops;
2116	int rc = NETDEV_TX_OK;
2117	unsigned int skb_len;
2118
2119	if (likely(!skb->next)) {
2120		u32 features;
2121
2122		/*
2123		 * If device doesn't need skb->dst, release it right now while
2124		 * its hot in this cpu cache
2125		 */
2126		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2127			skb_dst_drop(skb);
2128
2129		if (!list_empty(&ptype_all))
2130			dev_queue_xmit_nit(skb, dev);
2131
2132		skb_orphan_try(skb);
2133
2134		features = netif_skb_features(skb);
2135
2136		if (vlan_tx_tag_present(skb) &&
2137		    !(features & NETIF_F_HW_VLAN_TX)) {
2138			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2139			if (unlikely(!skb))
2140				goto out;
2141
2142			skb->vlan_tci = 0;
2143		}
2144
2145		if (netif_needs_gso(skb, features)) {
2146			if (unlikely(dev_gso_segment(skb, features)))
2147				goto out_kfree_skb;
2148			if (skb->next)
2149				goto gso;
2150		} else {
2151			if (skb_needs_linearize(skb, features) &&
2152			    __skb_linearize(skb))
2153				goto out_kfree_skb;
2154
2155			/* If packet is not checksummed and device does not
2156			 * support checksumming for this protocol, complete
2157			 * checksumming here.
2158			 */
2159			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2160				skb_set_transport_header(skb,
2161					skb_checksum_start_offset(skb));
2162				if (!(features & NETIF_F_ALL_CSUM) &&
2163				     skb_checksum_help(skb))
2164					goto out_kfree_skb;
2165			}
2166		}
2167
2168		skb_len = skb->len;
2169		rc = ops->ndo_start_xmit(skb, dev);
2170		trace_net_dev_xmit(skb, rc, dev, skb_len);
2171		if (rc == NETDEV_TX_OK)
2172			txq_trans_update(txq);
2173		return rc;
2174	}
2175
2176gso:
2177	do {
2178		struct sk_buff *nskb = skb->next;
2179
2180		skb->next = nskb->next;
2181		nskb->next = NULL;
2182
2183		/*
2184		 * If device doesn't need nskb->dst, release it right now while
2185		 * its hot in this cpu cache
2186		 */
2187		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2188			skb_dst_drop(nskb);
2189
2190		skb_len = nskb->len;
2191		rc = ops->ndo_start_xmit(nskb, dev);
2192		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2193		if (unlikely(rc != NETDEV_TX_OK)) {
2194			if (rc & ~NETDEV_TX_MASK)
2195				goto out_kfree_gso_skb;
2196			nskb->next = skb->next;
2197			skb->next = nskb;
2198			return rc;
2199		}
2200		txq_trans_update(txq);
2201		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2202			return NETDEV_TX_BUSY;
2203	} while (skb->next);
2204
2205out_kfree_gso_skb:
2206	if (likely(skb->next == NULL))
2207		skb->destructor = DEV_GSO_CB(skb)->destructor;
2208out_kfree_skb:
2209	kfree_skb(skb);
2210out:
2211	return rc;
2212}
2213
2214static u32 hashrnd __read_mostly;
2215
2216/*
2217 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2218 * to be used as a distribution range.
2219 */
2220u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2221		  unsigned int num_tx_queues)
2222{
2223	u32 hash;
2224	u16 qoffset = 0;
2225	u16 qcount = num_tx_queues;
2226
2227	if (skb_rx_queue_recorded(skb)) {
2228		hash = skb_get_rx_queue(skb);
2229		while (unlikely(hash >= num_tx_queues))
2230			hash -= num_tx_queues;
2231		return hash;
2232	}
2233
2234	if (dev->num_tc) {
2235		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2236		qoffset = dev->tc_to_txq[tc].offset;
2237		qcount = dev->tc_to_txq[tc].count;
2238	}
2239
2240	if (skb->sk && skb->sk->sk_hash)
2241		hash = skb->sk->sk_hash;
2242	else
2243		hash = (__force u16) skb->protocol ^ skb->rxhash;
2244	hash = jhash_1word(hash, hashrnd);
2245
2246	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2247}
2248EXPORT_SYMBOL(__skb_tx_hash);
2249
2250static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2251{
2252	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2253		if (net_ratelimit()) {
2254			pr_warning("%s selects TX queue %d, but "
2255				"real number of TX queues is %d\n",
2256				dev->name, queue_index, dev->real_num_tx_queues);
2257		}
2258		return 0;
2259	}
2260	return queue_index;
2261}
2262
/*
 * Look up a Tx queue for @skb in the device's XPS map for the current CPU.
 *
 * Returns the selected queue index, or -1 when XPS is compiled out, no map
 * exists for this CPU, or the mapped queue is not below
 * real_num_tx_queues.  When the map lists several queues, one is picked by
 * hashing the socket hash (or protocol ^ rxhash).
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int txq = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		txq = cpu_map->queues[0];
	} else {
		u32 hash;

		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		txq = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
	}

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();

	return txq;
#else
	return -1;
#endif
}
2300
2301static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2302					struct sk_buff *skb)
2303{
2304	int queue_index;
2305	const struct net_device_ops *ops = dev->netdev_ops;
2306
2307	if (dev->real_num_tx_queues == 1)
2308		queue_index = 0;
2309	else if (ops->ndo_select_queue) {
2310		queue_index = ops->ndo_select_queue(dev, skb);
2311		queue_index = dev_cap_txqueue(dev, queue_index);
2312	} else {
2313		struct sock *sk = skb->sk;
2314		queue_index = sk_tx_queue_get(sk);
2315
2316		if (queue_index < 0 || skb->ooo_okay ||
2317		    queue_index >= dev->real_num_tx_queues) {
2318			int old_index = queue_index;
2319
2320			queue_index = get_xps_queue(dev, skb);
2321			if (queue_index < 0)
2322				queue_index = skb_tx_hash(dev, skb);
2323
2324			if (queue_index != old_index && sk) {
2325				struct dst_entry *dst =
2326				    rcu_dereference_check(sk->sk_dst_cache, 1);
2327
2328				if (dst && skb_dst(skb) == dst)
2329					sk_tx_queue_set(sk, queue_index);
2330			}
2331		}
2332	}
2333
2334	skb_set_queue_mapping(skb, queue_index);
2335	return netdev_get_tx_queue(dev, queue_index);
2336}
2337
2338static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2339				 struct net_device *dev,
2340				 struct netdev_queue *txq)
2341{
2342	spinlock_t *root_lock = qdisc_lock(q);
2343	bool contended;
2344	int rc;
2345
2346	qdisc_skb_cb(skb)->pkt_len = skb->len;
2347	qdisc_calculate_pkt_len(skb, q);
2348	/*
2349	 * Heuristic to force contended enqueues to serialize on a
2350	 * separate lock before trying to get qdisc main lock.
2351	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2352	 * and dequeue packets faster.
2353	 */
2354	contended = qdisc_is_running(q);
2355	if (unlikely(contended))
2356		spin_lock(&q->busylock);
2357
2358	spin_lock(root_lock);
2359	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2360		kfree_skb(skb);
2361		rc = NET_XMIT_DROP;
2362	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2363		   qdisc_run_begin(q)) {
2364		/*
2365		 * This is a work-conserving queue; there are no old skbs
2366		 * waiting to be sent out; and the qdisc is not running -
2367		 * xmit the skb directly.
2368		 */
2369		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2370			skb_dst_force(skb);
2371
2372		qdisc_bstats_update(q, skb);
2373
2374		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2375			if (unlikely(contended)) {
2376				spin_unlock(&q->busylock);
2377				contended = false;
2378			}
2379			__qdisc_run(q);
2380		} else
2381			qdisc_run_end(q);
2382
2383		rc = NET_XMIT_SUCCESS;
2384	} else {
2385		skb_dst_force(skb);
2386		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2387		if (qdisc_run_begin(q)) {
2388			if (unlikely(contended)) {
2389				spin_unlock(&q->busylock);
2390				contended = false;
2391			}
2392			__qdisc_run(q);
2393		}
2394	}
2395	spin_unlock(root_lock);
2396	if (unlikely(contended))
2397		spin_unlock(&q->busylock);
2398	return rc;
2399}
2400
2401static DEFINE_PER_CPU(int, xmit_recursion);
2402#define RECURSION_LIMIT 10
2403
2404/**
2405 *	dev_queue_xmit - transmit a buffer
2406 *	@skb: buffer to transmit
2407 *
2408 *	Queue a buffer for transmission to a network device. The caller must
2409 *	have set the device and priority and built the buffer before calling
2410 *	this function. The function can be called from an interrupt.
2411 *
2412 *	A negative errno code is returned on a failure. A success does not
2413 *	guarantee the frame will be transmitted as it may be dropped due
2414 *	to congestion or traffic shaping.
2415 *
2416 * -----------------------------------------------------------------------------------
2417 *      I notice this method can also return errors from the queue disciplines,
2418 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2419 *      be positive.
2420 *
2421 *      Regardless of the return value, the skb is consumed, so it is currently
2422 *      difficult to retry a send to this method.  (You can bump the ref count
2423 *      before sending to hold a reference for retry if you are careful.)
2424 *
2425 *      When calling this method, interrupts MUST be enabled.  This is because
2426 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2427 *          --BLG
2428 */
2429int dev_queue_xmit(struct sk_buff *skb)
2430{
2431	struct net_device *dev = skb->dev;
2432	struct netdev_queue *txq;
2433	struct Qdisc *q;
2434	int rc = -ENOMEM;
2435
2436	/* Disable soft irqs for various locks below. Also
2437	 * stops preemption for RCU.
2438	 */
2439	rcu_read_lock_bh();
2440
2441	txq = dev_pick_tx(dev, skb);
2442	q = rcu_dereference_bh(txq->qdisc);
2443
2444#ifdef CONFIG_NET_CLS_ACT
2445	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2446#endif
2447	trace_net_dev_queue(skb);
2448	if (q->enqueue) {
2449		rc = __dev_xmit_skb(skb, q, dev, txq);
2450		goto out;
2451	}
2452
2453	/* The device has no queue. Common case for software devices:
2454	   loopback, all the sorts of tunnels...
2455
2456	   Really, it is unlikely that netif_tx_lock protection is necessary
2457	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2458	   counters.)
2459	   However, it is possible, that they rely on protection
2460	   made by us here.
2461
2462	   Check this and shot the lock. It is not prone from deadlocks.
2463	   Either shot noqueue qdisc, it is even simpler 8)
2464	 */
2465	if (dev->flags & IFF_UP) {
2466		int cpu = smp_processor_id(); /* ok because BHs are off */
2467
2468		if (txq->xmit_lock_owner != cpu) {
2469
2470			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2471				goto recursion_alert;
2472
2473			HARD_TX_LOCK(dev, txq, cpu);
2474
2475			if (!netif_tx_queue_stopped(txq)) {
2476				__this_cpu_inc(xmit_recursion);
2477				rc = dev_hard_start_xmit(skb, dev, txq);
2478				__this_cpu_dec(xmit_recursion);
2479				if (dev_xmit_complete(rc)) {
2480					HARD_TX_UNLOCK(dev, txq);
2481					goto out;
2482				}
2483			}
2484			HARD_TX_UNLOCK(dev, txq);
2485			if (net_ratelimit())
2486				printk(KERN_CRIT "Virtual device %s asks to "
2487				       "queue packet!\n", dev->name);
2488		} else {
2489			/* Recursion is detected! It is possible,
2490			 * unfortunately
2491			 */
2492recursion_alert:
2493			if (net_ratelimit())
2494				printk(KERN_CRIT "Dead loop on virtual device "
2495				       "%s, fix it urgently!\n", dev->name);
2496		}
2497	}
2498
2499	rc = -ENETDOWN;
2500	rcu_read_unlock_bh();
2501
2502	kfree_skb(skb);
2503	return rc;
2504out:
2505	rcu_read_unlock_bh();
2506	return rc;
2507}
2508EXPORT_SYMBOL(dev_queue_xmit);
2509
2510
2511/*=======================================================================
2512			Receiver routines
2513  =======================================================================*/
2514
/*
 * Receive-side tunables.
 *
 * netdev_max_backlog:     enqueue_to_backlog() drops a packet once the
 *                         target per-CPU input_pkt_queue already holds more
 *                         than this many skbs.
 * netdev_tstamp_prequeue: when non-zero, netif_rx()/netif_receive_skb()
 *                         call net_timestamp_check() before queueing; when
 *                         zero, __netif_receive_skb() does it instead.
 * netdev_budget:          NOTE(review): not referenced in this part of the
 *                         file; presumably the per-softirq packet budget --
 *                         confirm against net_rx_action().
 */
int netdev_max_backlog __read_mostly = 1000;
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2519
2520/* Called with irq disabled */
2521static inline void ____napi_schedule(struct softnet_data *sd,
2522				     struct napi_struct *napi)
2523{
2524	list_add_tail(&napi->poll_list, &sd->poll_list);
2525	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2526}
2527
2528/*
2529 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2530 * and src/dst port numbers. Returns a non-zero hash number on success
2531 * and 0 on failure.
2532 */
2533__u32 __skb_get_rxhash(struct sk_buff *skb)
2534{
2535	int nhoff, hash = 0, poff;
2536	const struct ipv6hdr *ip6;
2537	const struct iphdr *ip;
2538	u8 ip_proto;
2539	u32 addr1, addr2, ihl;
2540	union {
2541		u32 v32;
2542		u16 v16[2];
2543	} ports;
2544
2545	nhoff = skb_network_offset(skb);
2546
2547	switch (skb->protocol) {
2548	case __constant_htons(ETH_P_IP):
2549		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2550			goto done;
2551
2552		ip = (const struct iphdr *) (skb->data + nhoff);
2553		if (ip_is_fragment(ip))
2554			ip_proto = 0;
2555		else
2556			ip_proto = ip->protocol;
2557		addr1 = (__force u32) ip->saddr;
2558		addr2 = (__force u32) ip->daddr;
2559		ihl = ip->ihl;
2560		break;
2561	case __constant_htons(ETH_P_IPV6):
2562		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2563			goto done;
2564
2565		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2566		ip_proto = ip6->nexthdr;
2567		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2568		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2569		ihl = (40 >> 2);
2570		break;
2571	default:
2572		goto done;
2573	}
2574
2575	ports.v32 = 0;
2576	poff = proto_ports_offset(ip_proto);
2577	if (poff >= 0) {
2578		nhoff += ihl * 4 + poff;
2579		if (pskb_may_pull(skb, nhoff + 4)) {
2580			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2581			if (ports.v16[1] < ports.v16[0])
2582				swap(ports.v16[0], ports.v16[1]);
2583		}
2584	}
2585
2586	/* get a consistent hash (same value on both flow directions) */
2587	if (addr2 < addr1)
2588		swap(addr1, addr2);
2589
2590	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2591	if (!hash)
2592		hash = 1;
2593
2594done:
2595	return hash;
2596}
2597EXPORT_SYMBOL(__skb_get_rxhash);
2598
2599#ifdef CONFIG_RPS
2600
/*
 * One global table that all flow-based protocols share.
 * The pointer is RCU-protected: get_rps_cpu() reads it with
 * rcu_dereference() and indexes ents[] with (rxhash & mask) to find the
 * CPU wanted for a flow.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
2604
2605static struct rps_dev_flow *
2606set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2607	    struct rps_dev_flow *rflow, u16 next_cpu)
2608{
2609	u16 tcpu;
2610
2611	tcpu = rflow->cpu = next_cpu;
2612	if (tcpu != RPS_NO_CPU) {
2613#ifdef CONFIG_RFS_ACCEL
2614		struct netdev_rx_queue *rxqueue;
2615		struct rps_dev_flow_table *flow_table;
2616		struct rps_dev_flow *old_rflow;
2617		u32 flow_id;
2618		u16 rxq_index;
2619		int rc;
2620
2621		/* Should we steer this flow to a different hardware queue? */
2622		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2623		    !(dev->features & NETIF_F_NTUPLE))
2624			goto out;
2625		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2626		if (rxq_index == skb_get_rx_queue(skb))
2627			goto out;
2628
2629		rxqueue = dev->_rx + rxq_index;
2630		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2631		if (!flow_table)
2632			goto out;
2633		flow_id = skb->rxhash & flow_table->mask;
2634		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2635							rxq_index, flow_id);
2636		if (rc < 0)
2637			goto out;
2638		old_rflow = rflow;
2639		rflow = &flow_table->flows[flow_id];
2640		rflow->cpu = next_cpu;
2641		rflow->filter = rc;
2642		if (old_rflow->filter == rflow->filter)
2643			old_rflow->filter = RPS_NO_FILTER;
2644	out:
2645#endif
2646		rflow->last_qtail =
2647			per_cpu(softnet_data, tcpu).input_queue_head;
2648	}
2649
2650	return rflow;
2651}
2652
2653/*
2654 * get_rps_cpu is called from netif_receive_skb and returns the target
2655 * CPU from the RPS map of the receiving queue for a given skb.
2656 * rcu_read_lock must be held on entry.
2657 */
2658static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2659		       struct rps_dev_flow **rflowp)
2660{
2661	struct netdev_rx_queue *rxqueue;
2662	struct rps_map *map;
2663	struct rps_dev_flow_table *flow_table;
2664	struct rps_sock_flow_table *sock_flow_table;
2665	int cpu = -1;
2666	u16 tcpu;
2667
2668	if (skb_rx_queue_recorded(skb)) {
2669		u16 index = skb_get_rx_queue(skb);
2670		if (unlikely(index >= dev->real_num_rx_queues)) {
2671			WARN_ONCE(dev->real_num_rx_queues > 1,
2672				  "%s received packet on queue %u, but number "
2673				  "of RX queues is %u\n",
2674				  dev->name, index, dev->real_num_rx_queues);
2675			goto done;
2676		}
2677		rxqueue = dev->_rx + index;
2678	} else
2679		rxqueue = dev->_rx;
2680
2681	map = rcu_dereference(rxqueue->rps_map);
2682	if (map) {
2683		if (map->len == 1 &&
2684		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2685			tcpu = map->cpus[0];
2686			if (cpu_online(tcpu))
2687				cpu = tcpu;
2688			goto done;
2689		}
2690	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2691		goto done;
2692	}
2693
2694	skb_reset_network_header(skb);
2695	if (!skb_get_rxhash(skb))
2696		goto done;
2697
2698	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2699	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2700	if (flow_table && sock_flow_table) {
2701		u16 next_cpu;
2702		struct rps_dev_flow *rflow;
2703
2704		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2705		tcpu = rflow->cpu;
2706
2707		next_cpu = sock_flow_table->ents[skb->rxhash &
2708		    sock_flow_table->mask];
2709
2710		/*
2711		 * If the desired CPU (where last recvmsg was done) is
2712		 * different from current CPU (one in the rx-queue flow
2713		 * table entry), switch if one of the following holds:
2714		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2715		 *   - Current CPU is offline.
2716		 *   - The current CPU's queue tail has advanced beyond the
2717		 *     last packet that was enqueued using this table entry.
2718		 *     This guarantees that all previous packets for the flow
2719		 *     have been dequeued, thus preserving in order delivery.
2720		 */
2721		if (unlikely(tcpu != next_cpu) &&
2722		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2723		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2724		      rflow->last_qtail)) >= 0))
2725			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2726
2727		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2728			*rflowp = rflow;
2729			cpu = tcpu;
2730			goto done;
2731		}
2732	}
2733
2734	if (map) {
2735		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2736
2737		if (cpu_online(tcpu)) {
2738			cpu = tcpu;
2739			goto done;
2740		}
2741	}
2742
2743done:
2744	return cpu;
2745}
2746
2747#ifdef CONFIG_RFS_ACCEL
2748
2749/**
2750 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2751 * @dev: Device on which the filter was set
2752 * @rxq_index: RX queue index
2753 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2754 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2755 *
2756 * Drivers that implement ndo_rx_flow_steer() should periodically call
2757 * this function for each installed filter and remove the filters for
2758 * which it returns %true.
2759 */
2760bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2761			 u32 flow_id, u16 filter_id)
2762{
2763	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2764	struct rps_dev_flow_table *flow_table;
2765	struct rps_dev_flow *rflow;
2766	bool expire = true;
2767	int cpu;
2768
2769	rcu_read_lock();
2770	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2771	if (flow_table && flow_id <= flow_table->mask) {
2772		rflow = &flow_table->flows[flow_id];
2773		cpu = ACCESS_ONCE(rflow->cpu);
2774		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2775		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2776			   rflow->last_qtail) <
2777		     (int)(10 * flow_table->mask)))
2778			expire = false;
2779	}
2780	rcu_read_unlock();
2781	return expire;
2782}
2783EXPORT_SYMBOL(rps_may_expire_flow);
2784
2785#endif /* CONFIG_RFS_ACCEL */
2786
2787/* Called from hardirq (IPI) context */
2788static void rps_trigger_softirq(void *data)
2789{
2790	struct softnet_data *sd = data;
2791
2792	____napi_schedule(sd, &sd->backlog);
2793	sd->received_rps++;
2794}
2795
2796#endif /* CONFIG_RPS */
2797
/*
 * Check if this softnet_data structure belongs to another cpu.
 * If yes, chain it onto this cpu's IPI list, raise the Rx softirq and
 * return 1.
 * If no (or RPS is not configured), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* Push sd onto the head of the local rps_ipi_list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2818
2819/*
2820 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2821 * queue (may be a remote CPU queue).
2822 */
2823static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2824			      unsigned int *qtail)
2825{
2826	struct softnet_data *sd;
2827	unsigned long flags;
2828
2829	sd = &per_cpu(softnet_data, cpu);
2830
2831	local_irq_save(flags);
2832
2833	rps_lock(sd);
2834	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2835		if (skb_queue_len(&sd->input_pkt_queue)) {
2836enqueue:
2837			__skb_queue_tail(&sd->input_pkt_queue, skb);
2838			input_queue_tail_incr_save(sd, qtail);
2839			rps_unlock(sd);
2840			local_irq_restore(flags);
2841			return NET_RX_SUCCESS;
2842		}
2843
2844		/* Schedule NAPI for backlog device
2845		 * We can use non atomic operation since we own the queue lock
2846		 */
2847		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2848			if (!rps_ipi_queued(sd))
2849				____napi_schedule(sd, &sd->backlog);
2850		}
2851		goto enqueue;
2852	}
2853
2854	sd->dropped++;
2855	rps_unlock(sd);
2856
2857	local_irq_restore(flags);
2858
2859	atomic_long_inc(&skb->dev->rx_dropped);
2860	kfree_skb(skb);
2861	return NET_RX_DROP;
2862}
2863
2864/**
2865 *	netif_rx	-	post buffer to the network code
2866 *	@skb: buffer to post
2867 *
2868 *	This function receives a packet from a device driver and queues it for
2869 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2870 *	may be dropped during processing for congestion control or by the
2871 *	protocol layers.
2872 *
2873 *	return values:
2874 *	NET_RX_SUCCESS	(no congestion)
2875 *	NET_RX_DROP     (packet was dropped)
2876 *
2877 */
2878
2879int netif_rx(struct sk_buff *skb)
2880{
2881	int ret;
2882
2883	/* if netpoll wants it, pretend we never saw it */
2884	if (netpoll_rx(skb))
2885		return NET_RX_DROP;
2886
2887	if (netdev_tstamp_prequeue)
2888		net_timestamp_check(skb);
2889
2890	trace_netif_rx(skb);
2891#ifdef CONFIG_RPS
2892	{
2893		struct rps_dev_flow voidflow, *rflow = &voidflow;
2894		int cpu;
2895
2896		preempt_disable();
2897		rcu_read_lock();
2898
2899		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2900		if (cpu < 0)
2901			cpu = smp_processor_id();
2902
2903		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2904
2905		rcu_read_unlock();
2906		preempt_enable();
2907	}
2908#else
2909	{
2910		unsigned int qtail;
2911		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2912		put_cpu();
2913	}
2914#endif
2915	return ret;
2916}
2917EXPORT_SYMBOL(netif_rx);
2918
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirq left pending after the packet has been queued.  Returns the
 * netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2932
2933static void net_tx_action(struct softirq_action *h)
2934{
2935	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2936
2937	if (sd->completion_queue) {
2938		struct sk_buff *clist;
2939
2940		local_irq_disable();
2941		clist = sd->completion_queue;
2942		sd->completion_queue = NULL;
2943		local_irq_enable();
2944
2945		while (clist) {
2946			struct sk_buff *skb = clist;
2947			clist = clist->next;
2948
2949			WARN_ON(atomic_read(&skb->users));
2950			trace_kfree_skb(skb, net_tx_action);
2951			__kfree_skb(skb);
2952		}
2953	}
2954
2955	if (sd->output_queue) {
2956		struct Qdisc *head;
2957
2958		local_irq_disable();
2959		head = sd->output_queue;
2960		sd->output_queue = NULL;
2961		sd->output_queue_tailp = &sd->output_queue;
2962		local_irq_enable();
2963
2964		while (head) {
2965			struct Qdisc *q = head;
2966			spinlock_t *root_lock;
2967
2968			head = head->next_sched;
2969
2970			root_lock = qdisc_lock(q);
2971			if (spin_trylock(root_lock)) {
2972				smp_mb__before_clear_bit();
2973				clear_bit(__QDISC_STATE_SCHED,
2974					  &q->state);
2975				qdisc_run(q);
2976				spin_unlock(root_lock);
2977			} else {
2978				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2979					      &q->state)) {
2980					__netif_reschedule(q);
2981				} else {
2982					smp_mb__before_clear_bit();
2983					clear_bit(__QDISC_STATE_SCHED,
2984						  &q->state);
2985				}
2986			}
2987		}
2988	}
2989}
2990
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 *
 * It exists only when both the bridge and ATM LANE are enabled (built in
 * or modular) and is NULL until something assigns it.
 * NOTE(review): presumably the bridge code installs the implementation and
 * LANE calls through it -- confirm against the users of this symbol.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
2998
2999#ifdef CONFIG_NET_CLS_ACT
3000/* TODO: Maybe we should just force sch_ingress to be compiled in
3001 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3002 * a compare and 2 stores extra right now if we dont have it on
3003 * but have CONFIG_NET_CLS_ACT
3004 * NOTE: This doesn't stop any functionality; if you dont have
3005 * the ingress scheduler, you just can't add policies on ingress.
3006 *
3007 */
3008static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3009{
3010	struct net_device *dev = skb->dev;
3011	u32 ttl = G_TC_RTTL(skb->tc_verd);
3012	int result = TC_ACT_OK;
3013	struct Qdisc *q;
3014
3015	if (unlikely(MAX_RED_LOOP < ttl++)) {
3016		if (net_ratelimit())
3017			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3018			       skb->skb_iif, dev->ifindex);
3019		return TC_ACT_SHOT;
3020	}
3021
3022	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3023	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3024
3025	q = rxq->qdisc;
3026	if (q != &noop_qdisc) {
3027		spin_lock(qdisc_lock(q));
3028		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3029			result = qdisc_enqueue_root(skb, q);
3030		spin_unlock(qdisc_lock(q));
3031	}
3032
3033	return result;
3034}
3035
3036static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3037					 struct packet_type **pt_prev,
3038					 int *ret, struct net_device *orig_dev)
3039{
3040	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3041
3042	if (!rxq || rxq->qdisc == &noop_qdisc)
3043		goto out;
3044
3045	if (*pt_prev) {
3046		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3047		*pt_prev = NULL;
3048	}
3049
3050	switch (ing_filter(skb, rxq)) {
3051	case TC_ACT_SHOT:
3052	case TC_ACT_STOLEN:
3053		kfree_skb(skb);
3054		return NULL;
3055	}
3056
3057out:
3058	skb->tc_verd = 0;
3059	return skb;
3060}
3061#endif
3062
3063/**
3064 *	netdev_rx_handler_register - register receive handler
3065 *	@dev: device to register a handler for
3066 *	@rx_handler: receive handler to register
3067 *	@rx_handler_data: data pointer that is used by rx handler
3068 *
3069 *	Register a receive hander for a device. This handler will then be
3070 *	called from __netif_receive_skb. A negative errno code is returned
3071 *	on a failure.
3072 *
3073 *	The caller must hold the rtnl_mutex.
3074 *
3075 *	For a general description of rx_handler, see enum rx_handler_result.
3076 */
3077int netdev_rx_handler_register(struct net_device *dev,
3078			       rx_handler_func_t *rx_handler,
3079			       void *rx_handler_data)
3080{
3081	ASSERT_RTNL();
3082
3083	if (dev->rx_handler)
3084		return -EBUSY;
3085
3086	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3087	rcu_assign_pointer(dev->rx_handler, rx_handler);
3088
3089	return 0;
3090}
3091EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3092
3093/**
3094 *	netdev_rx_handler_unregister - unregister receive handler
3095 *	@dev: device to unregister a handler from
3096 *
3097 *	Unregister a receive hander from a device.
3098 *
3099 *	The caller must hold the rtnl_mutex.
3100 */
3101void netdev_rx_handler_unregister(struct net_device *dev)
3102{
3103
3104	ASSERT_RTNL();
3105	rcu_assign_pointer(dev->rx_handler, NULL);
3106	rcu_assign_pointer(dev->rx_handler_data, NULL);
3107}
3108EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3109
3110static int __netif_receive_skb(struct sk_buff *skb)
3111{
3112	struct packet_type *ptype, *pt_prev;
3113	rx_handler_func_t *rx_handler;
3114	struct net_device *orig_dev;
3115	struct net_device *null_or_dev;
3116	bool deliver_exact = false;
3117	int ret = NET_RX_DROP;
3118	__be16 type;
3119
3120	if (!netdev_tstamp_prequeue)
3121		net_timestamp_check(skb);
3122
3123	trace_netif_receive_skb(skb);
3124
3125	/* if we've gotten here through NAPI, check netpoll */
3126	if (netpoll_receive_skb(skb))
3127		return NET_RX_DROP;
3128
3129	if (!skb->skb_iif)
3130		skb->skb_iif = skb->dev->ifindex;
3131	orig_dev = skb->dev;
3132
3133	skb_reset_network_header(skb);
3134	skb_reset_transport_header(skb);
3135	skb_reset_mac_len(skb);
3136
3137	pt_prev = NULL;
3138
3139	rcu_read_lock();
3140
3141another_round:
3142
3143	__this_cpu_inc(softnet_data.processed);
3144
3145	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3146		skb = vlan_untag(skb);
3147		if (unlikely(!skb))
3148			goto out;
3149	}
3150
3151#ifdef CONFIG_NET_CLS_ACT
3152	if (skb->tc_verd & TC_NCLS) {
3153		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3154		goto ncls;
3155	}
3156#endif
3157
3158	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3159		if (!ptype->dev || ptype->dev == skb->dev) {
3160			if (pt_prev)
3161				ret = deliver_skb(skb, pt_prev, orig_dev);
3162			pt_prev = ptype;
3163		}
3164	}
3165
3166#ifdef CONFIG_NET_CLS_ACT
3167	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3168	if (!skb)
3169		goto out;
3170ncls:
3171#endif
3172
3173	rx_handler = rcu_dereference(skb->dev->rx_handler);
3174	if (rx_handler) {
3175		if (pt_prev) {
3176			ret = deliver_skb(skb, pt_prev, orig_dev);
3177			pt_prev = NULL;
3178		}
3179		switch (rx_handler(&skb)) {
3180		case RX_HANDLER_CONSUMED:
3181			goto out;
3182		case RX_HANDLER_ANOTHER:
3183			goto another_round;
3184		case RX_HANDLER_EXACT:
3185			deliver_exact = true;
3186		case RX_HANDLER_PASS:
3187			break;
3188		default:
3189			BUG();
3190		}
3191	}
3192
3193	if (vlan_tx_tag_present(skb)) {
3194		if (pt_prev) {
3195			ret = deliver_skb(skb, pt_prev, orig_dev);
3196			pt_prev = NULL;
3197		}
3198		if (vlan_do_receive(&skb)) {
3199			ret = __netif_receive_skb(skb);
3200			goto out;
3201		} else if (unlikely(!skb))
3202			goto out;
3203	}
3204
3205	/* deliver only exact match when indicated */
3206	null_or_dev = deliver_exact ? skb->dev : NULL;
3207
3208	type = skb->protocol;
3209	list_for_each_entry_rcu(ptype,
3210			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3211		if (ptype->type == type &&
3212		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3213		     ptype->dev == orig_dev)) {
3214			if (pt_prev)
3215				ret = deliver_skb(skb, pt_prev, orig_dev);
3216			pt_prev = ptype;
3217		}
3218	}
3219
3220	if (pt_prev) {
3221		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3222	} else {
3223		atomic_long_inc(&skb->dev->rx_dropped);
3224		kfree_skb(skb);
3225		/* Jamal, now you will not able to escape explaining
3226		 * me how you were going to use this. :-)
3227		 */
3228		ret = NET_RX_DROP;
3229	}
3230
3231out:
3232	rcu_read_unlock();
3233	return ret;
3234}
3235
3236/**
3237 *	netif_receive_skb - process receive buffer from network
3238 *	@skb: buffer to process
3239 *
3240 *	netif_receive_skb() is the main receive data processing function.
3241 *	It always succeeds. The buffer may be dropped during processing
3242 *	for congestion control or by the protocol layers.
3243 *
3244 *	This function may only be called from softirq context and interrupts
3245 *	should be enabled.
3246 *
3247 *	Return values (usually ignored):
3248 *	NET_RX_SUCCESS: no congestion
3249 *	NET_RX_DROP: packet was dropped
3250 */
3251int netif_receive_skb(struct sk_buff *skb)
3252{
3253	if (netdev_tstamp_prequeue)
3254		net_timestamp_check(skb);
3255
3256	if (skb_defer_rx_timestamp(skb))
3257		return NET_RX_SUCCESS;
3258
3259#ifdef CONFIG_RPS
3260	{
3261		struct rps_dev_flow voidflow, *rflow = &voidflow;
3262		int cpu, ret;
3263
3264		rcu_read_lock();
3265
3266		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3267
3268		if (cpu >= 0) {
3269			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3270			rcu_read_unlock();
3271		} else {
3272			rcu_read_unlock();
3273			ret = __netif_receive_skb(skb);
3274		}
3275
3276		return ret;
3277	}
3278#else
3279	return __netif_receive_skb(skb);
3280#endif
3281}
3282EXPORT_SYMBOL(netif_receive_skb);
3283
3284/* Network device is going away, flush any packets still pending
3285 * Called with irqs disabled.
3286 */
3287static void flush_backlog(void *arg)
3288{
3289	struct net_device *dev = arg;
3290	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3291	struct sk_buff *skb, *tmp;
3292
3293	rps_lock(sd);
3294	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3295		if (skb->dev == dev) {
3296			__skb_unlink(skb, &sd->input_pkt_queue);
3297			kfree_skb(skb);
3298			input_queue_head_incr(sd);
3299		}
3300	}
3301	rps_unlock(sd);
3302
3303	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3304		if (skb->dev == dev) {
3305			__skb_unlink(skb, &sd->process_queue);
3306			kfree_skb(skb);
3307			input_queue_head_incr(sd);
3308		}
3309	}
3310}
3311
3312static int napi_gro_complete(struct sk_buff *skb)
3313{
3314	struct packet_type *ptype;
3315	__be16 type = skb->protocol;
3316	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3317	int err = -ENOENT;
3318
3319	if (NAPI_GRO_CB(skb)->count == 1) {
3320		skb_shinfo(skb)->gso_size = 0;
3321		goto out;
3322	}
3323
3324	rcu_read_lock();
3325	list_for_each_entry_rcu(ptype, head, list) {
3326		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3327			continue;
3328
3329		err = ptype->gro_complete(skb);
3330		break;
3331	}
3332	rcu_read_unlock();
3333
3334	if (err) {
3335		WARN_ON(&ptype->list == head);
3336		kfree_skb(skb);
3337		return NET_RX_SUCCESS;
3338	}
3339
3340out:
3341	return netif_receive_skb(skb);
3342}
3343
3344inline void napi_gro_flush(struct napi_struct *napi)
3345{
3346	struct sk_buff *skb, *next;
3347
3348	for (skb = napi->gro_list; skb; skb = next) {
3349		next = skb->next;
3350		skb->next = NULL;
3351		napi_gro_complete(skb);
3352	}
3353
3354	napi->gro_count = 0;
3355	napi->gro_list = NULL;
3356}
3357EXPORT_SYMBOL(napi_gro_flush);
3358
/*
 * dev_gro_receive - offer @skb to the GRO handler of its protocol
 * @napi: NAPI context whose gro_list holds the packets being merged
 * @skb: incoming packet
 *
 * Returns GRO_NORMAL when GRO is not applied here: NETIF_F_GRO is not set
 * on the device, netpoll_rx_on() is true, the skb is GSO or has a frag
 * list, no device-independent handler with gro_receive() matches
 * skb->protocol, the handler requested a flush, or gro_list already holds
 * MAX_GRO_SKBS packets.  Returns GRO_HELD when @skb was put on gro_list,
 * and GRO_MERGED or GRO_MERGED_FREE (depending on the "free" flag in the
 * GRO control block) when the handler flagged it as same_flow.
 */
3359enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3360{
3361	struct sk_buff **pp = NULL;
3362	struct packet_type *ptype;
3363	__be16 type = skb->protocol;
3364	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3365	int same_flow;
3366	int mac_len;
3367	enum gro_result ret;
3368
3369	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3370		goto normal;
3371
3372	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3373		goto normal;
3374
3375	rcu_read_lock();
3376	list_for_each_entry_rcu(ptype, head, list) {
3377		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3378			continue;
3379
		/* Reset per-packet GRO state before calling the handler. */
3380		skb_set_network_header(skb, skb_gro_offset(skb));
3381		mac_len = skb->network_header - skb->mac_header;
3382		skb->mac_len = mac_len;
3383		NAPI_GRO_CB(skb)->same_flow = 0;
3384		NAPI_GRO_CB(skb)->flush = 0;
3385		NAPI_GRO_CB(skb)->free = 0;
3386
3387		pp = ptype->gro_receive(&napi->gro_list, skb);
3388		break;
3389	}
3390	rcu_read_unlock();
3391
	/* The walk reached the list head again: no handler matched. */
3392	if (&ptype->list == head)
3393		goto normal;
3394
3395	same_flow = NAPI_GRO_CB(skb)->same_flow;
3396	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3397
	/*
	 * The handler pointed at a held packet: unlink it from gro_list
	 * and complete it now.
	 */
3398	if (pp) {
3399		struct sk_buff *nskb = *pp;
3400
3401		*pp = nskb->next;
3402		nskb->next = NULL;
3403		napi_gro_complete(nskb);
3404		napi->gro_count--;
3405	}
3406
3407	if (same_flow)
3408		goto ok;
3409
3410	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3411		goto normal;
3412
	/* Start holding this packet at the head of gro_list. */
3413	napi->gro_count++;
3414	NAPI_GRO_CB(skb)->count = 1;
3415	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3416	skb->next = napi->gro_list;
3417	napi->gro_list = skb;
3418	ret = GRO_HELD;
3419
3420pull:
	/*
	 * The GRO offset lies beyond the linear area: copy the missing
	 * bytes from frag0 to the tail of the linear area and remove them
	 * from the first page fragment.
	 */
3421	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3422		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3423
3424		BUG_ON(skb->end - skb->tail < grow);
3425
3426		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3427
3428		skb->tail += grow;
3429		skb->data_len -= grow;
3430
3431		skb_shinfo(skb)->frags[0].page_offset += grow;
3432		skb_shinfo(skb)->frags[0].size -= grow;
3433
		/* First fragment fully consumed: drop it and shift the rest down. */
3434		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3435			put_page(skb_shinfo(skb)->frags[0].page);
3436			memmove(skb_shinfo(skb)->frags,
3437				skb_shinfo(skb)->frags + 1,
3438				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3439		}
3440	}
3441
3442ok:
3443	return ret;
3444
3445normal:
	/* GRO not applied; still run the header pull above before returning. */
3446	ret = GRO_NORMAL;
3447	goto pull;
3448}
3449EXPORT_SYMBOL(dev_gro_receive);
3450
3451static inline gro_result_t
3452__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3453{
3454	struct sk_buff *p;
3455
3456	for (p = napi->gro_list; p; p = p->next) {
3457		unsigned long diffs;
3458
3459		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3460		diffs |= p->vlan_tci ^ skb->vlan_tci;
3461		diffs |= compare_ether_header(skb_mac_header(p),
3462					      skb_gro_mac_header(skb));
3463		NAPI_GRO_CB(p)->same_flow = !diffs;
3464		NAPI_GRO_CB(p)->flush = 0;
3465	}
3466
3467	return dev_gro_receive(napi, skb);
3468}
3469
3470gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3471{
3472	switch (ret) {
3473	case GRO_NORMAL:
3474		if (netif_receive_skb(skb))
3475			ret = GRO_DROP;
3476		break;
3477
3478	case GRO_DROP:
3479	case GRO_MERGED_FREE:
3480		kfree_skb(skb);
3481		break;
3482
3483	case GRO_HELD:
3484	case GRO_MERGED:
3485		break;
3486	}
3487
3488	return ret;
3489}
3490EXPORT_SYMBOL(napi_skb_finish);
3491
3492void skb_gro_reset_offset(struct sk_buff *skb)
3493{
3494	NAPI_GRO_CB(skb)->data_offset = 0;
3495	NAPI_GRO_CB(skb)->frag0 = NULL;
3496	NAPI_GRO_CB(skb)->frag0_len = 0;
3497
3498	if (skb->mac_header == skb->tail &&
3499	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3500		NAPI_GRO_CB(skb)->frag0 =
3501			page_address(skb_shinfo(skb)->frags[0].page) +
3502			skb_shinfo(skb)->frags[0].page_offset;
3503		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3504	}
3505}
3506EXPORT_SYMBOL(skb_gro_reset_offset);
3507
3508gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3509{
3510	skb_gro_reset_offset(skb);
3511
3512	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3513}
3514EXPORT_SYMBOL(napi_gro_receive);
3515
3516static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3517{
3518	__skb_pull(skb, skb_headlen(skb));
3519	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3520	skb->vlan_tci = 0;
3521	skb->dev = napi->dev;
3522	skb->skb_iif = 0;
3523
3524	napi->skb = skb;
3525}
3526
3527struct sk_buff *napi_get_frags(struct napi_struct *napi)
3528{
3529	struct sk_buff *skb = napi->skb;
3530
3531	if (!skb) {
3532		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3533		if (skb)
3534			napi->skb = skb;
3535	}
3536	return skb;
3537}
3538EXPORT_SYMBOL(napi_get_frags);
3539
3540gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3541			       gro_result_t ret)
3542{
3543	switch (ret) {
3544	case GRO_NORMAL:
3545	case GRO_HELD:
3546		skb->protocol = eth_type_trans(skb, skb->dev);
3547
3548		if (ret == GRO_HELD)
3549			skb_gro_pull(skb, -ETH_HLEN);
3550		else if (netif_receive_skb(skb))
3551			ret = GRO_DROP;
3552		break;
3553
3554	case GRO_DROP:
3555	case GRO_MERGED_FREE:
3556		napi_reuse_skb(napi, skb);
3557		break;
3558
3559	case GRO_MERGED:
3560		break;
3561	}
3562
3563	return ret;
3564}
3565EXPORT_SYMBOL(napi_frags_finish);
3566
3567struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3568{
3569	struct sk_buff *skb = napi->skb;
3570	struct ethhdr *eth;
3571	unsigned int hlen;
3572	unsigned int off;
3573
3574	napi->skb = NULL;
3575
3576	skb_reset_mac_header(skb);
3577	skb_gro_reset_offset(skb);
3578
3579	off = skb_gro_offset(skb);
3580	hlen = off + sizeof(*eth);
3581	eth = skb_gro_header_fast(skb, off);
3582	if (skb_gro_header_hard(skb, hlen)) {
3583		eth = skb_gro_header_slow(skb, hlen, off);
3584		if (unlikely(!eth)) {
3585			napi_reuse_skb(napi, skb);
3586			skb = NULL;
3587			goto out;
3588		}
3589	}
3590
3591	skb_gro_pull(skb, sizeof(*eth));
3592
3593	/*
3594	 * This works because the only protocols we care about don't require
3595	 * special handling.  We'll fix it up properly at the end.
3596	 */
3597	skb->protocol = eth->h_proto;
3598
3599out:
3600	return skb;
3601}
3602EXPORT_SYMBOL(napi_frags_skb);
3603
3604gro_result_t napi_gro_frags(struct napi_struct *napi)
3605{
3606	struct sk_buff *skb = napi_frags_skb(napi);
3607
3608	if (!skb)
3609		return GRO_DROP;
3610
3611	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3612}
3613EXPORT_SYMBOL(napi_gro_frags);
3614
/*
 * Send any IPIs pending for RPS.
 * Note: must be entered with local irqs disabled; returns with local
 * irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *target = sd->rps_ipi_list;
	struct softnet_data *following;

	if (target) {
		/* Detach the whole list before irqs are enabled again. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		for (; target; target = following) {
			following = target->rps_ipi_next;

			if (cpu_online(target->cpu))
				__smp_call_function_single(target->cpu,
							   &target->csd, 0);
		}
		return;
	}
#endif
	local_irq_enable();
}
3642
/*
 * process_backlog - poll routine of the per-cpu backlog NAPI instance
 * @napi: the "backlog" member embedded in a struct softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Feeds packets from sd->process_queue to __netif_receive_skb(), refilling
 * that queue from sd->input_pkt_queue under rps_lock().  Returns the number
 * of packets processed.  Local irqs are enabled on return.
 */
3643static int process_backlog(struct napi_struct *napi, int quota)
3644{
3645	int work = 0;
3646	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3647
3648#ifdef CONFIG_RPS
3649	/* Check if we have pending ipi, it's better to send them now,
3650	 * not waiting net_rx_action() end.
3651	 */
3652	if (sd->rps_ipi_list) {
3653		local_irq_disable();
3654		net_rps_action_and_irq_enable(sd);
3655	}
3656#endif
3657	napi->weight = weight_p;
3658	local_irq_disable();
3659	while (work < quota) {
3660		struct sk_buff *skb;
3661		unsigned int qlen;
3662
		/* Irqs are enabled only while a packet is being processed. */
3663		while ((skb = __skb_dequeue(&sd->process_queue))) {
3664			local_irq_enable();
3665			__netif_receive_skb(skb);
3666			local_irq_disable();
3667			input_queue_head_incr(sd);
3668			if (++work >= quota) {
3669				local_irq_enable();
3670				return work;
3671			}
3672		}
3673
		/* process_queue is empty: move everything queued as input over. */
3674		rps_lock(sd);
3675		qlen = skb_queue_len(&sd->input_pkt_queue);
3676		if (qlen)
3677			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3678						   &sd->process_queue);
3679
3680		if (qlen < quota - work) {
3681			/*
3682			 * Inline a custom version of __napi_complete().
3683			 * only current cpu owns and manipulates this napi,
3684			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3685			 * we can use a plain write instead of clear_bit(),
3686			 * and we don't need an smp_mb() memory barrier.
3687			 */
3688			list_del(&napi->poll_list);
3689			napi->state = 0;
3690
			/* Stop the outer loop once the spliced packets are done. */
3691			quota = work + qlen;
3692		}
3693		rps_unlock(sd);
3694	}
3695	local_irq_enable();
3696
3697	return work;
3698}
3699
3700/**
3701 * __napi_schedule - schedule for receive
3702 * @n: entry to schedule
3703 *
3704 * The entry's receive function will be scheduled to run
3705 */
3706void __napi_schedule(struct napi_struct *n)
3707{
3708	unsigned long flags;
3709
3710	local_irq_save(flags);
3711	____napi_schedule(&__get_cpu_var(softnet_data), n);
3712	local_irq_restore(flags);
3713}
3714EXPORT_SYMBOL(__napi_schedule);
3715
3716void __napi_complete(struct napi_struct *n)
3717{
3718	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3719	BUG_ON(n->gro_list);
3720
3721	list_del(&n->poll_list);
3722	smp_mb__before_clear_bit();
3723	clear_bit(NAPI_STATE_SCHED, &n->state);
3724}
3725EXPORT_SYMBOL(__napi_complete);
3726
3727void napi_complete(struct napi_struct *n)
3728{
3729	unsigned long flags;
3730
3731	/*
3732	 * don't let napi dequeue from the cpu poll list
3733	 * just in case its running on a different cpu
3734	 */
3735	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3736		return;
3737
3738	napi_gro_flush(n);
3739	local_irq_save(flags);
3740	__napi_complete(n);
3741	local_irq_restore(flags);
3742}
3743EXPORT_SYMBOL(napi_complete);
3744
3745void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3746		    int (*poll)(struct napi_struct *, int), int weight)
3747{
3748	INIT_LIST_HEAD(&napi->poll_list);
3749	napi->gro_count = 0;
3750	napi->gro_list = NULL;
3751	napi->skb = NULL;
3752	napi->poll = poll;
3753	napi->weight = weight;
3754	list_add(&napi->dev_list, &dev->napi_list);
3755	napi->dev = dev;
3756#ifdef CONFIG_NETPOLL
3757	spin_lock_init(&napi->poll_lock);
3758	napi->poll_owner = -1;
3759#endif
3760	set_bit(NAPI_STATE_SCHED, &napi->state);
3761}
3762EXPORT_SYMBOL(netif_napi_add);
3763
3764void netif_napi_del(struct napi_struct *napi)
3765{
3766	struct sk_buff *skb, *next;
3767
3768	list_del_init(&napi->dev_list);
3769	napi_free_frags(napi);
3770
3771	for (skb = napi->gro_list; skb; skb = next) {
3772		next = skb->next;
3773		skb->next = NULL;
3774		kfree_skb(skb);
3775	}
3776
3777	napi->gro_list = NULL;
3778	napi->gro_count = 0;
3779}
3780EXPORT_SYMBOL(netif_napi_del);
3781
/*
 * net_rx_action - poll the NAPI instances queued on this cpu's poll_list
 * @h: softirq action (not used in the body)
 *
 * Calls ->poll() on the head of sd->poll_list until the list is empty, the
 * budget (netdev_budget) is used up, or more than 2 jiffies have passed.
 * In the last two cases sd->time_squeeze is incremented and
 * NET_RX_SOFTIRQ is raised again.  Always finishes through
 * net_rps_action_and_irq_enable(), i.e. with local irqs enabled.
 */
3782static void net_rx_action(struct softirq_action *h)
3783{
3784	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3785	unsigned long time_limit = jiffies + 2;
3786	int budget = netdev_budget;
3787	void *have;
3788
3789	local_irq_disable();
3790
3791	while (!list_empty(&sd->poll_list)) {
3792		struct napi_struct *n;
3793		int work, weight;
3794
3795		/* If softirq window is exhausted then punt.
3796		 * Allow this to run for 2 jiffies since which will allow
3797		 * an average latency of 1.5/HZ.
3798		 */
3799		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3800			goto softnet_break;
3801
3802		local_irq_enable();
3803
3804		/* Even though interrupts have been re-enabled, this
3805		 * access is safe because interrupts can only add new
3806		 * entries to the tail of this list, and only ->poll()
3807		 * calls can remove this head entry from the list.
3808		 */
3809		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3810
3811		have = netpoll_poll_lock(n);
3812
3813		weight = n->weight;
3814
3815		/* This NAPI_STATE_SCHED test is for avoiding a race
3816		 * with netpoll's poll_napi().  Only the entity which
3817		 * obtains the lock and sees NAPI_STATE_SCHED set will
3818		 * actually make the ->poll() call.  Therefore we avoid
3819		 * accidentally calling ->poll() when NAPI is not scheduled.
3820		 */
3821		work = 0;
3822		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3823			work = n->poll(n, weight);
3824			trace_napi_poll(n);
3825		}
3826
		/* A poll routine must not report more work than its weight. */
3827		WARN_ON_ONCE(work > weight);
3828
3829		budget -= work;
3830
3831		local_irq_disable();
3832
3833		/* Drivers must not modify the NAPI state if they
3834		 * consume the entire weight.  In such cases this code
3835		 * still "owns" the NAPI instance and therefore can
3836		 * move the instance around on the list at-will.
3837		 */
3838		if (unlikely(work == weight)) {
3839			if (unlikely(napi_disable_pending(n))) {
3840				local_irq_enable();
3841				napi_complete(n);
3842				local_irq_disable();
3843			} else
3844				list_move_tail(&n->poll_list, &sd->poll_list);
3845		}
3846
3847		netpoll_poll_unlock(have);
3848	}
3849out:
3850	net_rps_action_and_irq_enable(sd);
3851
3852#ifdef CONFIG_NET_DMA
3853	/*
3854	 * There may not be any more sk_buffs coming right now, so push
3855	 * any pending DMA copies to hardware
3856	 */
3857	dma_issue_pending_all();
3858#endif
3859
3860	return;
3861
3862softnet_break:
	/* Out of budget or time: count the squeeze and ask to be run again. */
3863	sd->time_squeeze++;
3864	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3865	goto out;
3866}
3867
3868static gifconf_func_t *gifconf_list[NPROTO];
3869
3870/**
3871 *	register_gifconf	-	register a SIOCGIF handler
3872 *	@family: Address family
3873 *	@gifconf: Function handler
3874 *
3875 *	Register protocol dependent address dumping routines. The handler
3876 *	that is passed must not be freed or reused until it has been replaced
3877 *	by another handler.
3878 */
3879int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3880{
3881	if (family >= NPROTO)
3882		return -EINVAL;
3883	gifconf_list[family] = gifconf;
3884	return 0;
3885}
3886EXPORT_SYMBOL(register_gifconf);
3887
3888
3889/*
3890 *	Map an interface index to its name (SIOCGIFNAME)
3891 */
3892
3893/*
3894 *	We need this ioctl for efficient implementation of the
3895 *	if_indextoname() function required by the IPv6 API.  Without
3896 *	it, we would have to search all the interfaces to find a
3897 *	match.  --pb
3898 */
3899
3900static int dev_ifname(struct net *net, struct ifreq __user *arg)
3901{
3902	struct net_device *dev;
3903	struct ifreq ifr;
3904
3905	/*
3906	 *	Fetch the caller's info block.
3907	 */
3908
3909	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3910		return -EFAULT;
3911
3912	rcu_read_lock();
3913	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3914	if (!dev) {
3915		rcu_read_unlock();
3916		return -ENODEV;
3917	}
3918
3919	strcpy(ifr.ifr_name, dev->name);
3920	rcu_read_unlock();
3921
3922	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3923		return -EFAULT;
3924	return 0;
3925}
3926
3927/*
3928 *	Perform a SIOCGIFCONF call. This structure will change
3929 *	size eventually, and there is nothing I can do about it.
3930 *	Thus we will need a 'compatibility mode'.
3931 */
3932
3933static int dev_ifconf(struct net *net, char __user *arg)
3934{
3935	struct ifconf ifc;
3936	struct net_device *dev;
3937	char __user *pos;
3938	int len;
3939	int total;
3940	int i;
3941
3942	/*
3943	 *	Fetch the caller's info block.
3944	 */
3945
3946	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3947		return -EFAULT;
3948
3949	pos = ifc.ifc_buf;
3950	len = ifc.ifc_len;
3951
3952	/*
3953	 *	Loop over the interfaces, and write an info block for each.
3954	 */
3955
3956	total = 0;
3957	for_each_netdev(net, dev) {
3958		for (i = 0; i < NPROTO; i++) {
3959			if (gifconf_list[i]) {
3960				int done;
3961				if (!pos)
3962					done = gifconf_list[i](dev, NULL, 0);
3963				else
3964					done = gifconf_list[i](dev, pos + total,
3965							       len - total);
3966				if (done < 0)
3967					return -EFAULT;
3968				total += done;
3969			}
3970		}
3971	}
3972
3973	/*
3974	 *	All done.  Write the updated control block back to the caller.
3975	 */
3976	ifc.ifc_len = total;
3977
3978	/*
3979	 * 	Both BSD and Solaris return 0 here, so we do too.
3980	 */
3981	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3982}
3983
3984#ifdef CONFIG_PROC_FS
3985/*
3986 *	This is invoked by the /proc filesystem handler to display a device
3987 *	in detail.
3988 */
3989void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3990	__acquires(RCU)
3991{
3992	struct net *net = seq_file_net(seq);
3993	loff_t off;
3994	struct net_device *dev;
3995
3996	rcu_read_lock();
3997	if (!*pos)
3998		return SEQ_START_TOKEN;
3999
4000	off = 1;
4001	for_each_netdev_rcu(net, dev)
4002		if (off++ == *pos)
4003			return dev;
4004
4005	return NULL;
4006}
4007
4008void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4009{
4010	struct net_device *dev = v;
4011
4012	if (v == SEQ_START_TOKEN)
4013		dev = first_net_device_rcu(seq_file_net(seq));
4014	else
4015		dev = next_net_device_rcu(dev);
4016
4017	++*pos;
4018	return dev;
4019}
4020
4021void dev_seq_stop(struct seq_file *seq, void *v)
4022	__releases(RCU)
4023{
4024	rcu_read_unlock();
4025}
4026
4027static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4028{
4029	struct rtnl_link_stats64 temp;
4030	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4031
4032	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4033		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4034		   dev->name, stats->rx_bytes, stats->rx_packets,
4035		   stats->rx_errors,
4036		   stats->rx_dropped + stats->rx_missed_errors,
4037		   stats->rx_fifo_errors,
4038		   stats->rx_length_errors + stats->rx_over_errors +
4039		    stats->rx_crc_errors + stats->rx_frame_errors,
4040		   stats->rx_compressed, stats->multicast,
4041		   stats->tx_bytes, stats->tx_packets,
4042		   stats->tx_errors, stats->tx_dropped,
4043		   stats->tx_fifo_errors, stats->collisions,
4044		   stats->tx_carrier_errors +
4045		    stats->tx_aborted_errors +
4046		    stats->tx_window_errors +
4047		    stats->tx_heartbeat_errors,
4048		   stats->tx_compressed);
4049}
4050
4051/*
4052 *	Called from the PROCfs module. This now uses the new arbitrary sized
4053 *	/proc/net interface to create /proc/net/dev
4054 */
4055static int dev_seq_show(struct seq_file *seq, void *v)
4056{
4057	if (v == SEQ_START_TOKEN)
4058		seq_puts(seq, "Inter-|   Receive                            "
4059			      "                    |  Transmit\n"
4060			      " face |bytes    packets errs drop fifo frame "
4061			      "compressed multicast|bytes    packets errs "
4062			      "drop fifo colls carrier compressed\n");
4063	else
4064		dev_seq_printf_stats(seq, v);
4065	return 0;
4066}
4067
4068static struct softnet_data *softnet_get_online(loff_t *pos)
4069{
4070	struct softnet_data *sd = NULL;
4071
4072	while (*pos < nr_cpu_ids)
4073		if (cpu_online(*pos)) {
4074			sd = &per_cpu(softnet_data, *pos);
4075			break;
4076		} else
4077			++*pos;
4078	return sd;
4079}
4080
4081static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4082{
4083	return softnet_get_online(pos);
4084}
4085
4086static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4087{
4088	++*pos;
4089	return softnet_get_online(pos);
4090}
4091
/* seq_file ->stop handler: nothing to release for the softnet iteration. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4095
4096static int softnet_seq_show(struct seq_file *seq, void *v)
4097{
4098	struct softnet_data *sd = v;
4099
4100	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4101		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4102		   0, 0, 0, 0, /* was fastroute */
4103		   sd->cpu_collision, sd->received_rps);
4104	return 0;
4105}
4106
4107static const struct seq_operations dev_seq_ops = {
4108	.start = dev_seq_start,
4109	.next  = dev_seq_next,
4110	.stop  = dev_seq_stop,
4111	.show  = dev_seq_show,
4112};
4113
4114static int dev_seq_open(struct inode *inode, struct file *file)
4115{
4116	return seq_open_net(inode, file, &dev_seq_ops,
4117			    sizeof(struct seq_net_private));
4118}
4119
4120static const struct file_operations dev_seq_fops = {
4121	.owner	 = THIS_MODULE,
4122	.open    = dev_seq_open,
4123	.read    = seq_read,
4124	.llseek  = seq_lseek,
4125	.release = seq_release_net,
4126};
4127
4128static const struct seq_operations softnet_seq_ops = {
4129	.start = softnet_seq_start,
4130	.next  = softnet_seq_next,
4131	.stop  = softnet_seq_stop,
4132	.show  = softnet_seq_show,
4133};
4134
4135static int softnet_seq_open(struct inode *inode, struct file *file)
4136{
4137	return seq_open(file, &softnet_seq_ops);
4138}
4139
4140static const struct file_operations softnet_seq_fops = {
4141	.owner	 = THIS_MODULE,
4142	.open    = softnet_seq_open,
4143	.read    = seq_read,
4144	.llseek  = seq_lseek,
4145	.release = seq_release,
4146};
4147
4148static void *ptype_get_idx(loff_t pos)
4149{
4150	struct packet_type *pt = NULL;
4151	loff_t i = 0;
4152	int t;
4153
4154	list_for_each_entry_rcu(pt, &ptype_all, list) {
4155		if (i == pos)
4156			return pt;
4157		++i;
4158	}
4159
4160	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4161		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4162			if (i == pos)
4163				return pt;
4164			++i;
4165		}
4166	}
4167	return NULL;
4168}
4169
4170static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4171	__acquires(RCU)
4172{
4173	rcu_read_lock();
4174	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4175}
4176
4177static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4178{
4179	struct packet_type *pt;
4180	struct list_head *nxt;
4181	int hash;
4182
4183	++*pos;
4184	if (v == SEQ_START_TOKEN)
4185		return ptype_get_idx(0);
4186
4187	pt = v;
4188	nxt = pt->list.next;
4189	if (pt->type == htons(ETH_P_ALL)) {
4190		if (nxt != &ptype_all)
4191			goto found;
4192		hash = 0;
4193		nxt = ptype_base[0].next;
4194	} else
4195		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4196
4197	while (nxt == &ptype_base[hash]) {
4198		if (++hash >= PTYPE_HASH_SIZE)
4199			return NULL;
4200		nxt = ptype_base[hash].next;
4201	}
4202found:
4203	return list_entry(nxt, struct packet_type, list);
4204}
4205
4206static void ptype_seq_stop(struct seq_file *seq, void *v)
4207	__releases(RCU)
4208{
4209	rcu_read_unlock();
4210}
4211
4212static int ptype_seq_show(struct seq_file *seq, void *v)
4213{
4214	struct packet_type *pt = v;
4215
4216	if (v == SEQ_START_TOKEN)
4217		seq_puts(seq, "Type Device      Function\n");
4218	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4219		if (pt->type == htons(ETH_P_ALL))
4220			seq_puts(seq, "ALL ");
4221		else
4222			seq_printf(seq, "%04x", ntohs(pt->type));
4223
4224		seq_printf(seq, " %-8s %pF\n",
4225			   pt->dev ? pt->dev->name : "", pt->func);
4226	}
4227
4228	return 0;
4229}
4230
4231static const struct seq_operations ptype_seq_ops = {
4232	.start = ptype_seq_start,
4233	.next  = ptype_seq_next,
4234	.stop  = ptype_seq_stop,
4235	.show  = ptype_seq_show,
4236};
4237
4238static int ptype_seq_open(struct inode *inode, struct file *file)
4239{
4240	return seq_open_net(inode, file, &ptype_seq_ops,
4241			sizeof(struct seq_net_private));
4242}
4243
4244static const struct file_operations ptype_seq_fops = {
4245	.owner	 = THIS_MODULE,
4246	.open    = ptype_seq_open,
4247	.read    = seq_read,
4248	.llseek  = seq_lseek,
4249	.release = seq_release_net,
4250};
4251
4252
4253static int __net_init dev_proc_net_init(struct net *net)
4254{
4255	int rc = -ENOMEM;
4256
4257	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4258		goto out;
4259	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4260		goto out_dev;
4261	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4262		goto out_softnet;
4263
4264	if (wext_proc_init(net))
4265		goto out_ptype;
4266	rc = 0;
4267out:
4268	return rc;
4269out_ptype:
4270	proc_net_remove(net, "ptype");
4271out_softnet:
4272	proc_net_remove(net, "softnet_stat");
4273out_dev:
4274	proc_net_remove(net, "dev");
4275	goto out;
4276}
4277
4278static void __net_exit dev_proc_net_exit(struct net *net)
4279{
4280	wext_proc_exit(net);
4281
4282	proc_net_remove(net, "ptype");
4283	proc_net_remove(net, "softnet_stat");
4284	proc_net_remove(net, "dev");
4285}
4286
4287static struct pernet_operations __net_initdata dev_proc_ops = {
4288	.init = dev_proc_net_init,
4289	.exit = dev_proc_net_exit,
4290};
4291
4292static int __init dev_proc_init(void)
4293{
4294	return register_pernet_subsys(&dev_proc_ops);
4295}
4296#else
4297#define dev_proc_init() 0
4298#endif	/* CONFIG_PROC_FS */
4299
4300
4301/**
4302 *	netdev_set_master	-	set up master pointer
4303 *	@slave: slave device
4304 *	@master: new master device
4305 *
4306 *	Changes the master device of the slave. Pass %NULL to break the
4307 *	bonding. The caller must hold the RTNL semaphore. On a failure
4308 *	a negative errno code is returned. On success the reference counts
4309 *	are adjusted and the function returns zero.
4310 */
4311int netdev_set_master(struct net_device *slave, struct net_device *master)
4312{
4313	struct net_device *old = slave->master;
4314
4315	ASSERT_RTNL();
4316
4317	if (master) {
4318		if (old)
4319			return -EBUSY;
4320		dev_hold(master);
4321	}
4322
4323	slave->master = master;
4324
4325	if (old)
4326		dev_put(old);
4327	return 0;
4328}
4329EXPORT_SYMBOL(netdev_set_master);
4330
4331/**
4332 *	netdev_set_bond_master	-	set up bonding master/slave pair
4333 *	@slave: slave device
4334 *	@master: new master device
4335 *
4336 *	Changes the master device of the slave. Pass %NULL to break the
4337 *	bonding. The caller must hold the RTNL semaphore. On a failure
4338 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4339 *	to the routing socket and the function returns zero.
4340 */
4341int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4342{
4343	int err;
4344
4345	ASSERT_RTNL();
4346
4347	err = netdev_set_master(slave, master);
4348	if (err)
4349		return err;
4350	if (master)
4351		slave->flags |= IFF_SLAVE;
4352	else
4353		slave->flags &= ~IFF_SLAVE;
4354
4355	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4356	return 0;
4357}
4358EXPORT_SYMBOL(netdev_set_bond_master);
4359
4360static void dev_change_rx_flags(struct net_device *dev, int flags)
4361{
4362	const struct net_device_ops *ops = dev->netdev_ops;
4363
4364	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4365		ops->ndo_change_rx_flags(dev, flags);
4366}
4367
4368static int __dev_set_promiscuity(struct net_device *dev, int inc)
4369{
4370	unsigned short old_flags = dev->flags;
4371	uid_t uid;
4372	gid_t gid;
4373
4374	ASSERT_RTNL();
4375
4376	dev->flags |= IFF_PROMISC;
4377	dev->promiscuity += inc;
4378	if (dev->promiscuity == 0) {
4379		/*
4380		 * Avoid overflow.
4381		 * If inc causes overflow, untouch promisc and return error.
4382		 */
4383		if (inc < 0)
4384			dev->flags &= ~IFF_PROMISC;
4385		else {
4386			dev->promiscuity -= inc;
4387			printk(KERN_WARNING "%s: promiscuity touches roof, "
4388				"set promiscuity failed, promiscuity feature "
4389				"of device might be broken.\n", dev->name);
4390			return -EOVERFLOW;
4391		}
4392	}
4393	if (dev->flags != old_flags) {
4394		printk(KERN_INFO "device %s %s promiscuous mode\n",
4395		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4396							       "left");
4397		if (audit_enabled) {
4398			current_uid_gid(&uid, &gid);
4399			audit_log(current->audit_context, GFP_ATOMIC,
4400				AUDIT_ANOM_PROMISCUOUS,
4401				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4402				dev->name, (dev->flags & IFF_PROMISC),
4403				(old_flags & IFF_PROMISC),
4404				audit_get_loginuid(current),
4405				uid, gid,
4406				audit_get_sessionid(current));
4407		}
4408
4409		dev_change_rx_flags(dev, IFF_PROMISC);
4410	}
4411	return 0;
4412}
4413
4414/**
4415 *	dev_set_promiscuity	- update promiscuity count on a device
4416 *	@dev: device
4417 *	@inc: modifier
4418 *
4419 *	Add or remove promiscuity from a device. While the count in the device
4420 *	remains above zero the interface remains promiscuous. Once it hits zero
4421 *	the device reverts back to normal filtering operation. A negative inc
4422 *	value is used to drop promiscuity on the device.
4423 *	Return 0 if successful or a negative errno code on error.
4424 */
4425int dev_set_promiscuity(struct net_device *dev, int inc)
4426{
4427	unsigned short old_flags = dev->flags;
4428	int err;
4429
4430	err = __dev_set_promiscuity(dev, inc);
4431	if (err < 0)
4432		return err;
4433	if (dev->flags != old_flags)
4434		dev_set_rx_mode(dev);
4435	return err;
4436}
4437EXPORT_SYMBOL(dev_set_promiscuity);
4438
4439/**
4440 *	dev_set_allmulti	- update allmulti count on a device
4441 *	@dev: device
4442 *	@inc: modifier
4443 *
4444 *	Add or remove reception of all multicast frames to a device. While the
4445 *	count in the device remains above zero the interface remains listening
4446 *	to all interfaces. Once it hits zero the device reverts back to normal
4447 *	filtering operation. A negative @inc value is used to drop the counter
4448 *	when releasing a resource needing all multicasts.
4449 *	Return 0 if successful or a negative errno code on error.
4450 */
4451
4452int dev_set_allmulti(struct net_device *dev, int inc)
4453{
4454	unsigned short old_flags = dev->flags;
4455
4456	ASSERT_RTNL();
4457
4458	dev->flags |= IFF_ALLMULTI;
4459	dev->allmulti += inc;
4460	if (dev->allmulti == 0) {
4461		/*
4462		 * Avoid overflow.
4463		 * If inc causes overflow, untouch allmulti and return error.
4464		 */
4465		if (inc < 0)
4466			dev->flags &= ~IFF_ALLMULTI;
4467		else {
4468			dev->allmulti -= inc;
4469			printk(KERN_WARNING "%s: allmulti touches roof, "
4470				"set allmulti failed, allmulti feature of "
4471				"device might be broken.\n", dev->name);
4472			return -EOVERFLOW;
4473		}
4474	}
4475	if (dev->flags ^ old_flags) {
4476		dev_change_rx_flags(dev, IFF_ALLMULTI);
4477		dev_set_rx_mode(dev);
4478	}
4479	return 0;
4480}
4481EXPORT_SYMBOL(dev_set_allmulti);
4482
4483/*
4484 *	Upload unicast and multicast address lists to device and
4485 *	configure RX filtering. When the device doesn't support unicast
4486 *	filtering it is put in promiscuous mode while unicast addresses
4487 *	are present.
4488 */
4489void __dev_set_rx_mode(struct net_device *dev)
4490{
4491	const struct net_device_ops *ops = dev->netdev_ops;
4492
4493	/* dev_open will call this function so the list will stay sane. */
4494	if (!(dev->flags&IFF_UP))
4495		return;
4496
4497	if (!netif_device_present(dev))
4498		return;
4499
4500	if (ops->ndo_set_rx_mode)
4501		ops->ndo_set_rx_mode(dev);
4502	else {
4503		/* Unicast addresses changes may only happen under the rtnl,
4504		 * therefore calling __dev_set_promiscuity here is safe.
4505		 */
4506		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4507			__dev_set_promiscuity(dev, 1);
4508			dev->uc_promisc = true;
4509		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4510			__dev_set_promiscuity(dev, -1);
4511			dev->uc_promisc = false;
4512		}
4513
4514		if (ops->ndo_set_multicast_list)
4515			ops->ndo_set_multicast_list(dev);
4516	}
4517}
4518
/*
 *	Run __dev_set_rx_mode() on @dev bracketed by
 *	netif_addr_lock_bh() / netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4525
4526/**
4527 *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4528 *	@dev: device
4529 *	@cmd: memory area for ethtool_ops::get_settings() result
4530 *
4531 *      The cmd arg is initialized properly (cleared and
4532 *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
4533 *
4534 *	Return device's ethtool_ops::get_settings() result value or
4535 *	-EOPNOTSUPP when device doesn't expose
4536 *	ethtool_ops::get_settings() operation.
4537 */
4538int dev_ethtool_get_settings(struct net_device *dev,
4539			     struct ethtool_cmd *cmd)
4540{
4541	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4542		return -EOPNOTSUPP;
4543
4544	memset(cmd, 0, sizeof(struct ethtool_cmd));
4545	cmd->cmd = ETHTOOL_GSET;
4546	return dev->ethtool_ops->get_settings(dev, cmd);
4547}
4548EXPORT_SYMBOL(dev_ethtool_get_settings);
4549
4550/**
4551 *	dev_get_flags - get flags reported to userspace
4552 *	@dev: device
4553 *
4554 *	Get the combination of flag bits exported through APIs to userspace.
4555 */
4556unsigned dev_get_flags(const struct net_device *dev)
4557{
4558	unsigned flags;
4559
4560	flags = (dev->flags & ~(IFF_PROMISC |
4561				IFF_ALLMULTI |
4562				IFF_RUNNING |
4563				IFF_LOWER_UP |
4564				IFF_DORMANT)) |
4565		(dev->gflags & (IFF_PROMISC |
4566				IFF_ALLMULTI));
4567
4568	if (netif_running(dev)) {
4569		if (netif_oper_up(dev))
4570			flags |= IFF_RUNNING;
4571		if (netif_carrier_ok(dev))
4572			flags |= IFF_LOWER_UP;
4573		if (netif_dormant(dev))
4574			flags |= IFF_DORMANT;
4575	}
4576
4577	return flags;
4578}
4579EXPORT_SYMBOL(dev_get_flags);
4580
4581int __dev_change_flags(struct net_device *dev, unsigned int flags)
4582{
4583	int old_flags = dev->flags;
4584	int ret;
4585
4586	ASSERT_RTNL();
4587
4588	/*
4589	 *	Set the flags on our device.
4590	 */
4591
4592	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4593			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4594			       IFF_AUTOMEDIA)) |
4595		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4596				    IFF_ALLMULTI));
4597
4598	/*
4599	 *	Load in the correct multicast list now the flags have changed.
4600	 */
4601
4602	if ((old_flags ^ flags) & IFF_MULTICAST)
4603		dev_change_rx_flags(dev, IFF_MULTICAST);
4604
4605	dev_set_rx_mode(dev);
4606
4607	/*
4608	 *	Have we downed the interface. We handle IFF_UP ourselves
4609	 *	according to user attempts to set it, rather than blindly
4610	 *	setting it.
4611	 */
4612
4613	ret = 0;
4614	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4615		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4616
4617		if (!ret)
4618			dev_set_rx_mode(dev);
4619	}
4620
4621	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4622		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4623
4624		dev->gflags ^= IFF_PROMISC;
4625		dev_set_promiscuity(dev, inc);
4626	}
4627
4628	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4629	   is important. Some (broken) drivers set IFF_PROMISC, when
4630	   IFF_ALLMULTI is requested not asking us and not reporting.
4631	 */
4632	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4633		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4634
4635		dev->gflags ^= IFF_ALLMULTI;
4636		dev_set_allmulti(dev, inc);
4637	}
4638
4639	return ret;
4640}
4641
4642void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4643{
4644	unsigned int changes = dev->flags ^ old_flags;
4645
4646	if (changes & IFF_UP) {
4647		if (dev->flags & IFF_UP)
4648			call_netdevice_notifiers(NETDEV_UP, dev);
4649		else
4650			call_netdevice_notifiers(NETDEV_DOWN, dev);
4651	}
4652
4653	if (dev->flags & IFF_UP &&
4654	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4655		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4656}
4657
4658/**
4659 *	dev_change_flags - change device settings
4660 *	@dev: device
4661 *	@flags: device state flags
4662 *
4663 *	Change settings on device based state flags. The flags are
4664 *	in the userspace exported format.
4665 */
4666int dev_change_flags(struct net_device *dev, unsigned flags)
4667{
4668	int ret, changes;
4669	int old_flags = dev->flags;
4670
4671	ret = __dev_change_flags(dev, flags);
4672	if (ret < 0)
4673		return ret;
4674
4675	changes = old_flags ^ dev->flags;
4676	if (changes)
4677		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4678
4679	__dev_notify_flags(dev, old_flags);
4680	return ret;
4681}
4682EXPORT_SYMBOL(dev_change_flags);
4683
4684/**
4685 *	dev_set_mtu - Change maximum transfer unit
4686 *	@dev: device
4687 *	@new_mtu: new transfer unit
4688 *
4689 *	Change the maximum transfer size of the network device.
4690 */
4691int dev_set_mtu(struct net_device *dev, int new_mtu)
4692{
4693	const struct net_device_ops *ops = dev->netdev_ops;
4694	int err;
4695
4696	if (new_mtu == dev->mtu)
4697		return 0;
4698
4699	/*	MTU must be positive.	 */
4700	if (new_mtu < 0)
4701		return -EINVAL;
4702
4703	if (!netif_device_present(dev))
4704		return -ENODEV;
4705
4706	err = 0;
4707	if (ops->ndo_change_mtu)
4708		err = ops->ndo_change_mtu(dev, new_mtu);
4709	else
4710		dev->mtu = new_mtu;
4711
4712	if (!err && dev->flags & IFF_UP)
4713		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4714	return err;
4715}
4716EXPORT_SYMBOL(dev_set_mtu);
4717
4718/**
4719 *	dev_set_group - Change group this device belongs to
4720 *	@dev: device
4721 *	@new_group: group this device should belong to
4722 */
4723void dev_set_group(struct net_device *dev, int new_group)
4724{
4725	dev->group = new_group;
4726}
4727EXPORT_SYMBOL(dev_set_group);
4728
4729/**
4730 *	dev_set_mac_address - Change Media Access Control Address
4731 *	@dev: device
4732 *	@sa: new address
4733 *
4734 *	Change the hardware (MAC) address of the device
4735 */
4736int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4737{
4738	const struct net_device_ops *ops = dev->netdev_ops;
4739	int err;
4740
4741	if (!ops->ndo_set_mac_address)
4742		return -EOPNOTSUPP;
4743	if (sa->sa_family != dev->type)
4744		return -EINVAL;
4745	if (!netif_device_present(dev))
4746		return -ENODEV;
4747	err = ops->ndo_set_mac_address(dev, sa);
4748	if (!err)
4749		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4750	return err;
4751}
4752EXPORT_SYMBOL(dev_set_mac_address);
4753
4754/*
4755 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4756 */
4757static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4758{
4759	int err;
4760	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4761
4762	if (!dev)
4763		return -ENODEV;
4764
4765	switch (cmd) {
4766	case SIOCGIFFLAGS:	/* Get interface flags */
4767		ifr->ifr_flags = (short) dev_get_flags(dev);
4768		return 0;
4769
4770	case SIOCGIFMETRIC:	/* Get the metric on the interface
4771				   (currently unused) */
4772		ifr->ifr_metric = 0;
4773		return 0;
4774
4775	case SIOCGIFMTU:	/* Get the MTU of a device */
4776		ifr->ifr_mtu = dev->mtu;
4777		return 0;
4778
4779	case SIOCGIFHWADDR:
4780		if (!dev->addr_len)
4781			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4782		else
4783			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4784			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4785		ifr->ifr_hwaddr.sa_family = dev->type;
4786		return 0;
4787
4788	case SIOCGIFSLAVE:
4789		err = -EINVAL;
4790		break;
4791
4792	case SIOCGIFMAP:
4793		ifr->ifr_map.mem_start = dev->mem_start;
4794		ifr->ifr_map.mem_end   = dev->mem_end;
4795		ifr->ifr_map.base_addr = dev->base_addr;
4796		ifr->ifr_map.irq       = dev->irq;
4797		ifr->ifr_map.dma       = dev->dma;
4798		ifr->ifr_map.port      = dev->if_port;
4799		return 0;
4800
4801	case SIOCGIFINDEX:
4802		ifr->ifr_ifindex = dev->ifindex;
4803		return 0;
4804
4805	case SIOCGIFTXQLEN:
4806		ifr->ifr_qlen = dev->tx_queue_len;
4807		return 0;
4808
4809	default:
4810		/* dev_ioctl() should ensure this case
4811		 * is never reached
4812		 */
4813		WARN_ON(1);
4814		err = -ENOTTY;
4815		break;
4816
4817	}
4818	return err;
4819}
4820
4821/*
4822 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4823 */
4824static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4825{
4826	int err;
4827	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4828	const struct net_device_ops *ops;
4829
4830	if (!dev)
4831		return -ENODEV;
4832
4833	ops = dev->netdev_ops;
4834
4835	switch (cmd) {
4836	case SIOCSIFFLAGS:	/* Set interface flags */
4837		return dev_change_flags(dev, ifr->ifr_flags);
4838
4839	case SIOCSIFMETRIC:	/* Set the metric on the interface
4840				   (currently unused) */
4841		return -EOPNOTSUPP;
4842
4843	case SIOCSIFMTU:	/* Set the MTU of a device */
4844		return dev_set_mtu(dev, ifr->ifr_mtu);
4845
4846	case SIOCSIFHWADDR:
4847		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4848
4849	case SIOCSIFHWBROADCAST:
4850		if (ifr->ifr_hwaddr.sa_family != dev->type)
4851			return -EINVAL;
4852		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4853		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4854		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4855		return 0;
4856
4857	case SIOCSIFMAP:
4858		if (ops->ndo_set_config) {
4859			if (!netif_device_present(dev))
4860				return -ENODEV;
4861			return ops->ndo_set_config(dev, &ifr->ifr_map);
4862		}
4863		return -EOPNOTSUPP;
4864
4865	case SIOCADDMULTI:
4866		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4867		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4868			return -EINVAL;
4869		if (!netif_device_present(dev))
4870			return -ENODEV;
4871		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4872
4873	case SIOCDELMULTI:
4874		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4875		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4876			return -EINVAL;
4877		if (!netif_device_present(dev))
4878			return -ENODEV;
4879		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4880
4881	case SIOCSIFTXQLEN:
4882		if (ifr->ifr_qlen < 0)
4883			return -EINVAL;
4884		dev->tx_queue_len = ifr->ifr_qlen;
4885		return 0;
4886
4887	case SIOCSIFNAME:
4888		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4889		return dev_change_name(dev, ifr->ifr_newname);
4890
4891	/*
4892	 *	Unknown or private ioctl
4893	 */
4894	default:
4895		if ((cmd >= SIOCDEVPRIVATE &&
4896		    cmd <= SIOCDEVPRIVATE + 15) ||
4897		    cmd == SIOCBONDENSLAVE ||
4898		    cmd == SIOCBONDRELEASE ||
4899		    cmd == SIOCBONDSETHWADDR ||
4900		    cmd == SIOCBONDSLAVEINFOQUERY ||
4901		    cmd == SIOCBONDINFOQUERY ||
4902		    cmd == SIOCBONDCHANGEACTIVE ||
4903		    cmd == SIOCGMIIPHY ||
4904		    cmd == SIOCGMIIREG ||
4905		    cmd == SIOCSMIIREG ||
4906		    cmd == SIOCBRADDIF ||
4907		    cmd == SIOCBRDELIF ||
4908		    cmd == SIOCSHWTSTAMP ||
4909		    cmd == SIOCWANDEV) {
4910			err = -EOPNOTSUPP;
4911			if (ops->ndo_do_ioctl) {
4912				if (netif_device_present(dev))
4913					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4914				else
4915					err = -ENODEV;
4916			}
4917		} else
4918			err = -EINVAL;
4919
4920	}
4921	return err;
4922}
4923
4924/*
4925 *	This function handles all "interface"-type I/O control requests. The actual
4926 *	'doing' part of this is dev_ifsioc above.
4927 */
4928
4929/**
4930 *	dev_ioctl	-	network device ioctl
4931 *	@net: the applicable net namespace
4932 *	@cmd: command to issue
4933 *	@arg: pointer to a struct ifreq in user space
4934 *
4935 *	Issue ioctl functions to devices. This is normally called by the
4936 *	user space syscall interfaces but can sometimes be useful for
4937 *	other purposes. The return value is the return from the syscall if
4938 *	positive or a negative errno code on error.
4939 */
4940
4941int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4942{
4943	struct ifreq ifr;
4944	int ret;
4945	char *colon;
4946
4947	/* One special case: SIOCGIFCONF takes ifconf argument
4948	   and requires shared lock, because it sleeps writing
4949	   to user space.
4950	 */
4951
4952	if (cmd == SIOCGIFCONF) {
4953		rtnl_lock();
4954		ret = dev_ifconf(net, (char __user *) arg);
4955		rtnl_unlock();
4956		return ret;
4957	}
4958	if (cmd == SIOCGIFNAME)
4959		return dev_ifname(net, (struct ifreq __user *)arg);
4960
4961	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4962		return -EFAULT;
4963
4964	ifr.ifr_name[IFNAMSIZ-1] = 0;
4965
4966	colon = strchr(ifr.ifr_name, ':');
4967	if (colon)
4968		*colon = 0;
4969
4970	/*
4971	 *	See which interface the caller is talking about.
4972	 */
4973
4974	switch (cmd) {
4975	/*
4976	 *	These ioctl calls:
4977	 *	- can be done by all.
4978	 *	- atomic and do not require locking.
4979	 *	- return a value
4980	 */
4981	case SIOCGIFFLAGS:
4982	case SIOCGIFMETRIC:
4983	case SIOCGIFMTU:
4984	case SIOCGIFHWADDR:
4985	case SIOCGIFSLAVE:
4986	case SIOCGIFMAP:
4987	case SIOCGIFINDEX:
4988	case SIOCGIFTXQLEN:
4989		dev_load(net, ifr.ifr_name);
4990		rcu_read_lock();
4991		ret = dev_ifsioc_locked(net, &ifr, cmd);
4992		rcu_read_unlock();
4993		if (!ret) {
4994			if (colon)
4995				*colon = ':';
4996			if (copy_to_user(arg, &ifr,
4997					 sizeof(struct ifreq)))
4998				ret = -EFAULT;
4999		}
5000		return ret;
5001
5002	case SIOCETHTOOL:
5003		dev_load(net, ifr.ifr_name);
5004		rtnl_lock();
5005		ret = dev_ethtool(net, &ifr);
5006		rtnl_unlock();
5007		if (!ret) {
5008			if (colon)
5009				*colon = ':';
5010			if (copy_to_user(arg, &ifr,
5011					 sizeof(struct ifreq)))
5012				ret = -EFAULT;
5013		}
5014		return ret;
5015
5016	/*
5017	 *	These ioctl calls:
5018	 *	- require superuser power.
5019	 *	- require strict serialization.
5020	 *	- return a value
5021	 */
5022	case SIOCGMIIPHY:
5023	case SIOCGMIIREG:
5024	case SIOCSIFNAME:
5025		if (!capable(CAP_NET_ADMIN))
5026			return -EPERM;
5027		dev_load(net, ifr.ifr_name);
5028		rtnl_lock();
5029		ret = dev_ifsioc(net, &ifr, cmd);
5030		rtnl_unlock();
5031		if (!ret) {
5032			if (colon)
5033				*colon = ':';
5034			if (copy_to_user(arg, &ifr,
5035					 sizeof(struct ifreq)))
5036				ret = -EFAULT;
5037		}
5038		return ret;
5039
5040	/*
5041	 *	These ioctl calls:
5042	 *	- require superuser power.
5043	 *	- require strict serialization.
5044	 *	- do not return a value
5045	 */
5046	case SIOCSIFFLAGS:
5047	case SIOCSIFMETRIC:
5048	case SIOCSIFMTU:
5049	case SIOCSIFMAP:
5050	case SIOCSIFHWADDR:
5051	case SIOCSIFSLAVE:
5052	case SIOCADDMULTI:
5053	case SIOCDELMULTI:
5054	case SIOCSIFHWBROADCAST:
5055	case SIOCSIFTXQLEN:
5056	case SIOCSMIIREG:
5057	case SIOCBONDENSLAVE:
5058	case SIOCBONDRELEASE:
5059	case SIOCBONDSETHWADDR:
5060	case SIOCBONDCHANGEACTIVE:
5061	case SIOCBRADDIF:
5062	case SIOCBRDELIF:
5063	case SIOCSHWTSTAMP:
5064		if (!capable(CAP_NET_ADMIN))
5065			return -EPERM;
5066		/* fall through */
5067	case SIOCBONDSLAVEINFOQUERY:
5068	case SIOCBONDINFOQUERY:
5069		dev_load(net, ifr.ifr_name);
5070		rtnl_lock();
5071		ret = dev_ifsioc(net, &ifr, cmd);
5072		rtnl_unlock();
5073		return ret;
5074
5075	case SIOCGIFMEM:
5076		/* Get the per device memory space. We can add this but
5077		 * currently do not support it */
5078	case SIOCSIFMEM:
5079		/* Set the per device memory buffer space.
5080		 * Not applicable in our case */
5081	case SIOCSIFLINK:
5082		return -ENOTTY;
5083
5084	/*
5085	 *	Unknown or private ioctl.
5086	 */
5087	default:
5088		if (cmd == SIOCWANDEV ||
5089		    (cmd >= SIOCDEVPRIVATE &&
5090		     cmd <= SIOCDEVPRIVATE + 15)) {
5091			dev_load(net, ifr.ifr_name);
5092			rtnl_lock();
5093			ret = dev_ifsioc(net, &ifr, cmd);
5094			rtnl_unlock();
5095			if (!ret && copy_to_user(arg, &ifr,
5096						 sizeof(struct ifreq)))
5097				ret = -EFAULT;
5098			return ret;
5099		}
5100		/* Take care of Wireless Extensions */
5101		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5102			return wext_handle_ioctl(net, &ifr, cmd, arg);
5103		return -ENOTTY;
5104	}
5105}
5106
5107
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a function-static counter that restarts at 1
 *	whenever it would become zero or negative; a candidate is accepted
 *	once no device in @net already uses it.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5126
5127/* Delayed registration/unregisteration */
5128static LIST_HEAD(net_todo_list);
5129
5130static void net_set_todo(struct net_device *dev)
5131{
5132	list_add_tail(&dev->todo_list, &net_todo_list);
5133}
5134
5135static void rollback_registered_many(struct list_head *head)
5136{
5137	struct net_device *dev, *tmp;
5138
5139	BUG_ON(dev_boot_phase);
5140	ASSERT_RTNL();
5141
5142	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5143		/* Some devices call without registering
5144		 * for initialization unwind. Remove those
5145		 * devices and proceed with the remaining.
5146		 */
5147		if (dev->reg_state == NETREG_UNINITIALIZED) {
5148			pr_debug("unregister_netdevice: device %s/%p never "
5149				 "was registered\n", dev->name, dev);
5150
5151			WARN_ON(1);
5152			list_del(&dev->unreg_list);
5153			continue;
5154		}
5155		dev->dismantle = true;
5156		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5157	}
5158
5159	/* If device is running, close it first. */
5160	dev_close_many(head);
5161
5162	list_for_each_entry(dev, head, unreg_list) {
5163		/* And unlink it from device chain. */
5164		unlist_netdevice(dev);
5165
5166		dev->reg_state = NETREG_UNREGISTERING;
5167	}
5168
5169	synchronize_net();
5170
5171	list_for_each_entry(dev, head, unreg_list) {
5172		/* Shutdown queueing discipline. */
5173		dev_shutdown(dev);
5174
5175
5176		/* Notify protocols, that we are about to destroy
5177		   this device. They should clean all the things.
5178		*/
5179		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5180
5181		if (!dev->rtnl_link_ops ||
5182		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5183			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5184
5185		/*
5186		 *	Flush the unicast and multicast chains
5187		 */
5188		dev_uc_flush(dev);
5189		dev_mc_flush(dev);
5190
5191		if (dev->netdev_ops->ndo_uninit)
5192			dev->netdev_ops->ndo_uninit(dev);
5193
5194		/* Notifier chain MUST detach us from master device. */
5195		WARN_ON(dev->master);
5196
5197		/* Remove entries from kobject tree */
5198		netdev_unregister_kobject(dev);
5199	}
5200
5201	/* Process any work delayed until the end of the batch */
5202	dev = list_first_entry(head, struct net_device, unreg_list);
5203	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5204
5205	rcu_barrier();
5206
5207	list_for_each_entry(dev, head, unreg_list)
5208		dev_put(dev);
5209}
5210
5211static void rollback_registered(struct net_device *dev)
5212{
5213	LIST_HEAD(single);
5214
5215	list_add(&dev->unreg_list, &single);
5216	rollback_registered_many(&single);
5217	list_del(&single);
5218}
5219
5220static u32 netdev_fix_features(struct net_device *dev, u32 features)
5221{
5222	/* Fix illegal checksum combinations */
5223	if ((features & NETIF_F_HW_CSUM) &&
5224	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5225		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5226		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5227	}
5228
5229	if ((features & NETIF_F_NO_CSUM) &&
5230	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5231		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5232		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5233	}
5234
5235	/* Fix illegal SG+CSUM combinations. */
5236	if ((features & NETIF_F_SG) &&
5237	    !(features & NETIF_F_ALL_CSUM)) {
5238		netdev_dbg(dev,
5239			"Dropping NETIF_F_SG since no checksum feature.\n");
5240		features &= ~NETIF_F_SG;
5241	}
5242
5243	/* TSO requires that SG is present as well. */
5244	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5245		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5246		features &= ~NETIF_F_ALL_TSO;
5247	}
5248
5249	/* TSO ECN requires that TSO is present as well. */
5250	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5251		features &= ~NETIF_F_TSO_ECN;
5252
5253	/* Software GSO depends on SG. */
5254	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5255		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5256		features &= ~NETIF_F_GSO;
5257	}
5258
5259	/* UFO needs SG and checksumming */
5260	if (features & NETIF_F_UFO) {
5261		/* maybe split UFO into V4 and V6? */
5262		if (!((features & NETIF_F_GEN_CSUM) ||
5263		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5264			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5265			netdev_dbg(dev,
5266				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5267			features &= ~NETIF_F_UFO;
5268		}
5269
5270		if (!(features & NETIF_F_SG)) {
5271			netdev_dbg(dev,
5272				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5273			features &= ~NETIF_F_UFO;
5274		}
5275	}
5276
5277	return features;
5278}
5279
5280int __netdev_update_features(struct net_device *dev)
5281{
5282	u32 features;
5283	int err = 0;
5284
5285	ASSERT_RTNL();
5286
5287	features = netdev_get_wanted_features(dev);
5288
5289	if (dev->netdev_ops->ndo_fix_features)
5290		features = dev->netdev_ops->ndo_fix_features(dev, features);
5291
5292	/* driver might be less strict about feature dependencies */
5293	features = netdev_fix_features(dev, features);
5294
5295	if (dev->features == features)
5296		return 0;
5297
5298	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5299		dev->features, features);
5300
5301	if (dev->netdev_ops->ndo_set_features)
5302		err = dev->netdev_ops->ndo_set_features(dev, features);
5303
5304	if (unlikely(err < 0)) {
5305		netdev_err(dev,
5306			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5307			err, features, dev->features);
5308		return -1;
5309	}
5310
5311	if (!err)
5312		dev->features = features;
5313
5314	return 1;
5315}
5316
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	The notification is sent for any non-zero result of
 *	__netdev_update_features().
 */
void netdev_update_features(struct net_device *dev)
{
	int result = __netdev_update_features(dev);

	if (result != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5331
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: always notify. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5348
5349/**
5350 *	netif_stacked_transfer_operstate -	transfer operstate
5351 *	@rootdev: the root or lower level device to transfer state from
5352 *	@dev: the device to transfer operstate to
5353 *
5354 *	Transfer operational state from root to device. This is normally
5355 *	called when a stacking relationship exists between the root
5356 *	device and the device(a leaf device).
5357 */
5358void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5359					struct net_device *dev)
5360{
5361	if (rootdev->operstate == IF_OPER_DORMANT)
5362		netif_dormant_on(dev);
5363	else
5364		netif_dormant_off(dev);
5365
5366	if (netif_carrier_ok(rootdev)) {
5367		if (!netif_carrier_ok(dev))
5368			netif_carrier_on(dev);
5369	} else {
5370		if (netif_carrier_ok(dev))
5371			netif_carrier_off(dev);
5372	}
5373}
5374EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5375
#ifdef CONFIG_RPS
/*
 *	Allocate the zeroed array of dev->num_rx_queues receive queues,
 *	store it in dev->_rx and point every entry back at @dev.
 *	Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int count = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(count < 1);

	queues = kcalloc(count, sizeof(*queues), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + count; q++)
		q->dev = dev;

	return 0;
}
#endif
5396
5397static void netdev_init_one_queue(struct net_device *dev,
5398				  struct netdev_queue *queue, void *_unused)
5399{
5400	/* Initialize queue lock */
5401	spin_lock_init(&queue->_xmit_lock);
5402	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5403	queue->xmit_lock_owner = -1;
5404	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5405	queue->dev = dev;
5406}
5407
5408static int netif_alloc_netdev_queues(struct net_device *dev)
5409{
5410	unsigned int count = dev->num_tx_queues;
5411	struct netdev_queue *tx;
5412
5413	BUG_ON(count < 1);
5414
5415	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5416	if (!tx) {
5417		pr_err("netdev: Unable to allocate %u tx queues.\n",
5418		       count);
5419		return -ENOMEM;
5420	}
5421	dev->_tx = tx;
5422
5423	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5424	spin_lock_init(&dev->tx_global_lock);
5425
5426	return 0;
5427}
5428
5429/**
5430 *	register_netdevice	- register a network device
5431 *	@dev: device to register
5432 *
5433 *	Take a completed network device structure and add it to the kernel
5434 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5435 *	chain. 0 is returned on success. A negative errno code is returned
5436 *	on a failure to set up the device, or if the name is a duplicate.
5437 *
5438 *	Callers must hold the rtnl semaphore. You may want
5439 *	register_netdev() instead of this.
5440 *
5441 *	BUGS:
5442 *	The locking appears insufficient to guarantee two parallel registers
5443 *	will not get the same name.
5444 */
5445
5446int register_netdevice(struct net_device *dev)
5447{
5448	int ret;
5449	struct net *net = dev_net(dev);
5450
5451	BUG_ON(dev_boot_phase);
5452	ASSERT_RTNL();
5453
5454	might_sleep();
5455
5456	/* When net_device's are persistent, this will be fatal. */
5457	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5458	BUG_ON(!net);
5459
5460	spin_lock_init(&dev->addr_list_lock);
5461	netdev_set_addr_lockdep_class(dev);
5462
5463	dev->iflink = -1;
5464
5465	ret = dev_get_valid_name(dev, dev->name);
5466	if (ret < 0)
5467		goto out;
5468
5469	/* Init, if this function is available */
5470	if (dev->netdev_ops->ndo_init) {
5471		ret = dev->netdev_ops->ndo_init(dev);
5472		if (ret) {
5473			if (ret > 0)
5474				ret = -EIO;
5475			goto out;
5476		}
5477	}
5478
5479	dev->ifindex = dev_new_index(net);
5480	if (dev->iflink == -1)
5481		dev->iflink = dev->ifindex;
5482
5483	/* Transfer changeable features to wanted_features and enable
5484	 * software offloads (GSO and GRO).
5485	 */
5486	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5487	dev->features |= NETIF_F_SOFT_FEATURES;
5488	dev->wanted_features = dev->features & dev->hw_features;
5489
5490	/* Turn on no cache copy if HW is doing checksum */
5491	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5492	if ((dev->features & NETIF_F_ALL_CSUM) &&
5493	    !(dev->features & NETIF_F_NO_CSUM)) {
5494		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5495		dev->features |= NETIF_F_NOCACHE_COPY;
5496	}
5497
5498	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5499	 */
5500	dev->vlan_features |= NETIF_F_HIGHDMA;
5501
5502	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5503	ret = notifier_to_errno(ret);
5504	if (ret)
5505		goto err_uninit;
5506
5507	ret = netdev_register_kobject(dev);
5508	if (ret)
5509		goto err_uninit;
5510	dev->reg_state = NETREG_REGISTERED;
5511
5512	__netdev_update_features(dev);
5513
5514	/*
5515	 *	Default initial state at registry is that the
5516	 *	device is present.
5517	 */
5518
5519	set_bit(__LINK_STATE_PRESENT, &dev->state);
5520
5521	dev_init_scheduler(dev);
5522	dev_hold(dev);
5523	list_netdevice(dev);
5524
5525	/* Notify protocols, that a new device appeared. */
5526	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5527	ret = notifier_to_errno(ret);
5528	if (ret) {
5529		rollback_registered(dev);
5530		dev->reg_state = NETREG_UNREGISTERED;
5531	}
5532	/*
5533	 *	Prevent userspace races by waiting until the network
5534	 *	device is fully setup before sending notifications.
5535	 */
5536	if (!dev->rtnl_link_ops ||
5537	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5538		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5539
5540out:
5541	return ret;
5542
5543err_uninit:
5544	if (dev->netdev_ops->ndo_uninit)
5545		dev->netdev_ops->ndo_uninit(dev);
5546	goto out;
5547}
5548EXPORT_SYMBOL(register_netdevice);
5549
5550/**
5551 *	init_dummy_netdev	- init a dummy network device for NAPI
5552 *	@dev: device to init
5553 *
5554 *	This takes a network device structure and initialize the minimum
5555 *	amount of fields so it can be used to schedule NAPI polls without
5556 *	registering a full blown interface. This is to be used by drivers
5557 *	that need to tie several hardware interfaces to a single NAPI
5558 *	poll scheduler due to HW limitations.
5559 */
5560int init_dummy_netdev(struct net_device *dev)
5561{
5562	/* Clear everything. Note we don't initialize spinlocks
5563	 * are they aren't supposed to be taken by any of the
5564	 * NAPI code and this dummy netdev is supposed to be
5565	 * only ever used for NAPI polls
5566	 */
5567	memset(dev, 0, sizeof(struct net_device));
5568
5569	/* make sure we BUG if trying to hit standard
5570	 * register/unregister code path
5571	 */
5572	dev->reg_state = NETREG_DUMMY;
5573
5574	/* NAPI wants this */
5575	INIT_LIST_HEAD(&dev->napi_list);
5576
5577	/* a dummy interface is started by default */
5578	set_bit(__LINK_STATE_PRESENT, &dev->state);
5579	set_bit(__LINK_STATE_START, &dev->state);
5580
5581	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5582	 * because users of this 'device' dont need to change
5583	 * its refcount.
5584	 */
5585
5586	return 0;
5587}
5588EXPORT_SYMBOL_GPL(init_dummy_netdev);
5589
5590
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Adds a completed network device structure to the kernel interfaces.
 *	A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 *	Returns 0 on success, or a negative errno code on a failure to set
 *	up the device or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore and expands the device name if you passed a format string
 *	to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5614
5615int netdev_refcnt_read(const struct net_device *dev)
5616{
5617	int i, refcnt = 0;
5618
5619	for_each_possible_cpu(i)
5620		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5621	return refcnt;
5622}
5623EXPORT_SYMBOL(netdev_refcnt_read);
5624
5625/*
5626 * netdev_wait_allrefs - wait until all references are gone.
5627 *
5628 * This is called when unregistering network devices.
5629 *
5630 * Any protocol or device that holds a reference should register
5631 * for netdevice notification, and cleanup and put back the
5632 * reference if they receive an UNREGISTER event.
5633 * We can get stuck here if buggy protocols don't correctly
5634 * call dev_put.
5635 */
5636static void netdev_wait_allrefs(struct net_device *dev)
5637{
5638	unsigned long rebroadcast_time, warning_time;
5639	int refcnt;
5640
5641	linkwatch_forget_dev(dev);
5642
5643	rebroadcast_time = warning_time = jiffies;
5644	refcnt = netdev_refcnt_read(dev);
5645
5646	while (refcnt != 0) {
5647		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5648			rtnl_lock();
5649
5650			/* Rebroadcast unregister notification */
5651			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5652			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5653			 * should have already handle it the first time */
5654
5655			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5656				     &dev->state)) {
5657				/* We must not have linkwatch events
5658				 * pending on unregister. If this
5659				 * happens, we simply run the queue
5660				 * unscheduled, resulting in a noop
5661				 * for this device.
5662				 */
5663				linkwatch_run_queue();
5664			}
5665
5666			__rtnl_unlock();
5667
5668			rebroadcast_time = jiffies;
5669		}
5670
5671		msleep(250);
5672
5673		refcnt = netdev_refcnt_read(dev);
5674
5675		if (time_after(jiffies, warning_time + 10 * HZ)) {
5676			printk(KERN_EMERG "unregister_netdevice: "
5677			       "waiting for %s to become free. Usage "
5678			       "count = %d\n",
5679			       dev->name, refcnt);
5680			warning_time = jiffies;
5681		}
5682	}
5683}
5684
5685/* The sequence is:
5686 *
5687 *	rtnl_lock();
5688 *	...
5689 *	register_netdevice(x1);
5690 *	register_netdevice(x2);
5691 *	...
5692 *	unregister_netdevice(y1);
5693 *	unregister_netdevice(y2);
5694 *      ...
5695 *	rtnl_unlock();
5696 *	free_netdev(y1);
5697 *	free_netdev(y2);
5698 *
5699 * We are invoked by rtnl_unlock().
5700 * This allows us to deal with problems:
5701 * 1) We can delete sysfs objects which invoke hotplug
5702 *    without deadlocking with linkwatch via keventd.
5703 * 2) Since we run with the RTNL semaphore not held, we can sleep
5704 *    safely in order to wait for the netdev refcnt to drop to zero.
5705 *
5706 * We must not return until all unregister events added during
5707 * the interval the lock was held have been completed.
5708 */
5709void netdev_run_todo(void)
5710{
5711	struct list_head list;
5712
5713	/* Snapshot list, allow later requests */
5714	list_replace_init(&net_todo_list, &list);
5715
5716	__rtnl_unlock();
5717
5718	while (!list_empty(&list)) {
5719		struct net_device *dev
5720			= list_first_entry(&list, struct net_device, todo_list);
5721		list_del(&dev->todo_list);
5722
5723		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5724			printk(KERN_ERR "network todo '%s' but state %d\n",
5725			       dev->name, dev->reg_state);
5726			dump_stack();
5727			continue;
5728		}
5729
5730		dev->reg_state = NETREG_UNREGISTERED;
5731
5732		on_each_cpu(flush_backlog, dev, 1);
5733
5734		netdev_wait_allrefs(dev);
5735
5736		/* paranoia */
5737		BUG_ON(netdev_refcnt_read(dev));
5738		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5739		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5740		WARN_ON(dev->dn_ptr);
5741
5742		if (dev->destructor)
5743			dev->destructor(dev);
5744
5745		/* Free network device */
5746		kobject_put(&dev->dev.kobj);
5747	}
5748}
5749
5750/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5751 * fields in the same order, with only the type differing.
5752 */
5753static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5754				    const struct net_device_stats *netdev_stats)
5755{
5756#if BITS_PER_LONG == 64
5757        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5758        memcpy(stats64, netdev_stats, sizeof(*stats64));
5759#else
5760	size_t i, n = sizeof(*stats64) / sizeof(u64);
5761	const unsigned long *src = (const unsigned long *)netdev_stats;
5762	u64 *dst = (u64 *)stats64;
5763
5764	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5765		     sizeof(*stats64) / sizeof(u64));
5766	for (i = 0; i < n; i++)
5767		dst[i] = src[i];
5768#endif
5769}
5770
5771/**
5772 *	dev_get_stats	- get network device statistics
5773 *	@dev: device to get statistics from
5774 *	@storage: place to store stats
5775 *
5776 *	Get network statistics from device. Return @storage.
5777 *	The device driver may provide its own method by setting
5778 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5779 *	otherwise the internal statistics structure is used.
5780 */
5781struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5782					struct rtnl_link_stats64 *storage)
5783{
5784	const struct net_device_ops *ops = dev->netdev_ops;
5785
5786	if (ops->ndo_get_stats64) {
5787		memset(storage, 0, sizeof(*storage));
5788		ops->ndo_get_stats64(dev, storage);
5789	} else if (ops->ndo_get_stats) {
5790		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5791	} else {
5792		netdev_stats_to_stats64(storage, &dev->stats);
5793	}
5794	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5795	return storage;
5796}
5797EXPORT_SYMBOL(dev_get_stats);
5798
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, attached to noop_qdisc and published on the device;
 * NULL is returned if that allocation fails.  Without CONFIG_NET_CLS_ACT
 * the result of dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5816
5817/**
5818 *	alloc_netdev_mqs - allocate network device
5819 *	@sizeof_priv:	size of private data to allocate space for
5820 *	@name:		device name format string
5821 *	@setup:		callback to initialize device
5822 *	@txqs:		the number of TX subqueues to allocate
5823 *	@rxqs:		the number of RX subqueues to allocate
5824 *
5825 *	Allocates a struct net_device with private data area for driver use
5826 *	and performs basic initialization.  Also allocates subquue structs
5827 *	for each queue on the device.
5828 */
5829struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5830		void (*setup)(struct net_device *),
5831		unsigned int txqs, unsigned int rxqs)
5832{
5833	struct net_device *dev;
5834	size_t alloc_size;
5835	struct net_device *p;
5836
5837	BUG_ON(strlen(name) >= sizeof(dev->name));
5838
5839	if (txqs < 1) {
5840		pr_err("alloc_netdev: Unable to allocate device "
5841		       "with zero queues.\n");
5842		return NULL;
5843	}
5844
5845#ifdef CONFIG_RPS
5846	if (rxqs < 1) {
5847		pr_err("alloc_netdev: Unable to allocate device "
5848		       "with zero RX queues.\n");
5849		return NULL;
5850	}
5851#endif
5852
5853	alloc_size = sizeof(struct net_device);
5854	if (sizeof_priv) {
5855		/* ensure 32-byte alignment of private area */
5856		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5857		alloc_size += sizeof_priv;
5858	}
5859	/* ensure 32-byte alignment of whole construct */
5860	alloc_size += NETDEV_ALIGN - 1;
5861
5862	p = kzalloc(alloc_size, GFP_KERNEL);
5863	if (!p) {
5864		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5865		return NULL;
5866	}
5867
5868	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5869	dev->padded = (char *)dev - (char *)p;
5870
5871	dev->pcpu_refcnt = alloc_percpu(int);
5872	if (!dev->pcpu_refcnt)
5873		goto free_p;
5874
5875	if (dev_addr_init(dev))
5876		goto free_pcpu;
5877
5878	dev_mc_init(dev);
5879	dev_uc_init(dev);
5880
5881	dev_net_set(dev, &init_net);
5882
5883	dev->gso_max_size = GSO_MAX_SIZE;
5884
5885	INIT_LIST_HEAD(&dev->napi_list);
5886	INIT_LIST_HEAD(&dev->unreg_list);
5887	INIT_LIST_HEAD(&dev->link_watch_list);
5888	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5889	setup(dev);
5890
5891	dev->num_tx_queues = txqs;
5892	dev->real_num_tx_queues = txqs;
5893	if (netif_alloc_netdev_queues(dev))
5894		goto free_all;
5895
5896#ifdef CONFIG_RPS
5897	dev->num_rx_queues = rxqs;
5898	dev->real_num_rx_queues = rxqs;
5899	if (netif_alloc_rx_queues(dev))
5900		goto free_all;
5901#endif
5902
5903	strcpy(dev->name, name);
5904	dev->group = INIT_NETDEV_GROUP;
5905	return dev;
5906
5907free_all:
5908	free_netdev(dev);
5909	return NULL;
5910
5911free_pcpu:
5912	free_percpu(dev->pcpu_refcnt);
5913	kfree(dev->_tx);
5914#ifdef CONFIG_RPS
5915	kfree(dev->_rx);
5916#endif
5917
5918free_p:
5919	kfree(p);
5920	return NULL;
5921}
5922EXPORT_SYMBOL(alloc_netdev_mqs);
5923
5924/**
5925 *	free_netdev - free network device
5926 *	@dev: device
5927 *
5928 *	This function does the last stage of destroying an allocated device
5929 * 	interface. The reference to the device object is released.
5930 *	If this is the last reference then it will be freed.
5931 */
5932void free_netdev(struct net_device *dev)
5933{
5934	struct napi_struct *p, *n;
5935
5936	release_net(dev_net(dev));
5937
5938	kfree(dev->_tx);
5939#ifdef CONFIG_RPS
5940	kfree(dev->_rx);
5941#endif
5942
5943	kfree(rcu_dereference_raw(dev->ingress_queue));
5944
5945	/* Flush device addresses */
5946	dev_addr_flush(dev);
5947
5948	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5949		netif_napi_del(p);
5950
5951	free_percpu(dev->pcpu_refcnt);
5952	dev->pcpu_refcnt = NULL;
5953
5954	/*  Compatibility with error handling in drivers */
5955	if (dev->reg_state == NETREG_UNINITIALIZED) {
5956		kfree((char *)dev - dev->padded);
5957		return;
5958	}
5959
5960	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5961	dev->reg_state = NETREG_RELEASED;
5962
5963	/* will free via device release */
5964	put_device(&dev->dev);
5965}
5966EXPORT_SYMBOL(free_netdev);
5967
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used when the rtnl lock is held,
 *	the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
5983
5984/**
5985 *	unregister_netdevice_queue - remove device from the kernel
5986 *	@dev: device
5987 *	@head: list
5988 *
5989 *	This function shuts down a device interface and removes it
5990 *	from the kernel tables.
5991 *	If head not NULL, device is queued to be unregistered later.
5992 *
5993 *	Callers must hold the rtnl semaphore.  You may want
5994 *	unregister_netdev() instead of this.
5995 */
5996
5997void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5998{
5999	ASSERT_RTNL();
6000
6001	if (head) {
6002		list_move_tail(&dev->unreg_list, head);
6003	} else {
6004		rollback_registered(dev);
6005		/* Finish processing unregister after unlock */
6006		net_set_todo(dev);
6007	}
6008}
6009EXPORT_SYMBOL(unregister_netdevice_queue);
6010
6011/**
6012 *	unregister_netdevice_many - unregister many devices
6013 *	@head: list of devices
6014 */
6015void unregister_netdevice_many(struct list_head *head)
6016{
6017	struct net_device *dev;
6018
6019	if (!list_empty(head)) {
6020		rollback_registered_many(head);
6021		list_for_each_entry(dev, head, unreg_list)
6022			net_set_todo(dev);
6023	}
6024}
6025EXPORT_SYMBOL(unregister_netdevice_many);
6026
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This is just unregister_netdevice() wrapped in the rtnl semaphore.
 *	In general you want to use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6045
6046/**
6047 *	dev_change_net_namespace - move device to different nethost namespace
6048 *	@dev: device
6049 *	@net: network namespace
6050 *	@pat: If not NULL name pattern to try if the current device name
6051 *	      is already taken in the destination network namespace.
6052 *
6053 *	This function shuts down a device interface and moves it
6054 *	to a new network namespace. On success 0 is returned, on
6055 *	a failure a netagive errno code is returned.
6056 *
6057 *	Callers must hold the rtnl semaphore.
6058 */
6059
6060int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6061{
6062	int err;
6063
6064	ASSERT_RTNL();
6065
6066	/* Don't allow namespace local devices to be moved. */
6067	err = -EINVAL;
6068	if (dev->features & NETIF_F_NETNS_LOCAL)
6069		goto out;
6070
6071	/* Ensure the device has been registrered */
6072	err = -EINVAL;
6073	if (dev->reg_state != NETREG_REGISTERED)
6074		goto out;
6075
6076	/* Get out if there is nothing todo */
6077	err = 0;
6078	if (net_eq(dev_net(dev), net))
6079		goto out;
6080
6081	/* Pick the destination device name, and ensure
6082	 * we can use it in the destination network namespace.
6083	 */
6084	err = -EEXIST;
6085	if (__dev_get_by_name(net, dev->name)) {
6086		/* We get here if we can't use the current device name */
6087		if (!pat)
6088			goto out;
6089		if (dev_get_valid_name(dev, pat) < 0)
6090			goto out;
6091	}
6092
6093	/*
6094	 * And now a mini version of register_netdevice unregister_netdevice.
6095	 */
6096
6097	/* If device is running close it first. */
6098	dev_close(dev);
6099
6100	/* And unlink it from device chain */
6101	err = -ENODEV;
6102	unlist_netdevice(dev);
6103
6104	synchronize_net();
6105
6106	/* Shutdown queueing discipline. */
6107	dev_shutdown(dev);
6108
6109	/* Notify protocols, that we are about to destroy
6110	   this device. They should clean all the things.
6111
6112	   Note that dev->reg_state stays at NETREG_REGISTERED.
6113	   This is wanted because this way 8021q and macvlan know
6114	   the device is just moving and can keep their slaves up.
6115	*/
6116	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6117	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6118
6119	/*
6120	 *	Flush the unicast and multicast chains
6121	 */
6122	dev_uc_flush(dev);
6123	dev_mc_flush(dev);
6124
6125	/* Actually switch the network namespace */
6126	dev_net_set(dev, net);
6127
6128	/* If there is an ifindex conflict assign a new one */
6129	if (__dev_get_by_index(net, dev->ifindex)) {
6130		int iflink = (dev->iflink == dev->ifindex);
6131		dev->ifindex = dev_new_index(net);
6132		if (iflink)
6133			dev->iflink = dev->ifindex;
6134	}
6135
6136	/* Fixup kobjects */
6137	err = device_rename(&dev->dev, dev->name);
6138	WARN_ON(err);
6139
6140	/* Add the device back in the hashes */
6141	list_netdevice(dev);
6142
6143	/* Notify protocols, that a new device appeared. */
6144	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6145
6146	/*
6147	 *	Prevent userspace races by waiting until the network
6148	 *	device is fully setup before sending notifications.
6149	 */
6150	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6151
6152	synchronize_net();
6153	err = 0;
6154out:
6155	return err;
6156}
6157EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6158
6159static int dev_cpu_callback(struct notifier_block *nfb,
6160			    unsigned long action,
6161			    void *ocpu)
6162{
6163	struct sk_buff **list_skb;
6164	struct sk_buff *skb;
6165	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6166	struct softnet_data *sd, *oldsd;
6167
6168	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6169		return NOTIFY_OK;
6170
6171	local_irq_disable();
6172	cpu = smp_processor_id();
6173	sd = &per_cpu(softnet_data, cpu);
6174	oldsd = &per_cpu(softnet_data, oldcpu);
6175
6176	/* Find end of our completion_queue. */
6177	list_skb = &sd->completion_queue;
6178	while (*list_skb)
6179		list_skb = &(*list_skb)->next;
6180	/* Append completion queue from offline CPU. */
6181	*list_skb = oldsd->completion_queue;
6182	oldsd->completion_queue = NULL;
6183
6184	/* Append output queue from offline CPU. */
6185	if (oldsd->output_queue) {
6186		*sd->output_queue_tailp = oldsd->output_queue;
6187		sd->output_queue_tailp = oldsd->output_queue_tailp;
6188		oldsd->output_queue = NULL;
6189		oldsd->output_queue_tailp = &oldsd->output_queue;
6190	}
6191	/* Append NAPI poll list from offline CPU. */
6192	if (!list_empty(&oldsd->poll_list)) {
6193		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6194		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6195	}
6196
6197	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6198	local_irq_enable();
6199
6200	/* Process offline CPU's input_pkt_queue */
6201	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6202		netif_rx(skb);
6203		input_queue_head_incr(oldsd);
6204	}
6205	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6206		netif_rx(skb);
6207		input_queue_head_incr(oldsd);
6208	}
6209
6210	return NOTIFY_OK;
6211}
6212
6213
6214/**
6215 *	netdev_increment_features - increment feature set by one
6216 *	@all: current feature set
6217 *	@one: new feature set
6218 *	@mask: mask feature set
6219 *
6220 *	Computes a new feature set after adding a device with feature set
6221 *	@one to the master device with current feature set @all.  Will not
6222 *	enable anything that is off in @mask. Returns the new feature set.
6223 */
6224u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6225{
6226	if (mask & NETIF_F_GEN_CSUM)
6227		mask |= NETIF_F_ALL_CSUM;
6228	mask |= NETIF_F_VLAN_CHALLENGED;
6229
6230	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6231	all &= one | ~NETIF_F_ALL_FOR_ALL;
6232
6233	/* If device needs checksumming, downgrade to it. */
6234	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6235		all &= ~NETIF_F_NO_CSUM;
6236
6237	/* If one device supports hw checksumming, set for all. */
6238	if (all & NETIF_F_GEN_CSUM)
6239		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6240
6241	return all;
6242}
6243EXPORT_SYMBOL(netdev_increment_features);
6244
6245static struct hlist_head *netdev_create_hash(void)
6246{
6247	int i;
6248	struct hlist_head *hash;
6249
6250	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6251	if (hash != NULL)
6252		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6253			INIT_HLIST_HEAD(&hash[i]);
6254
6255	return hash;
6256}
6257
6258/* Initialize per network namespace state */
6259static int __net_init netdev_init(struct net *net)
6260{
6261	INIT_LIST_HEAD(&net->dev_base_head);
6262
6263	net->dev_name_head = netdev_create_hash();
6264	if (net->dev_name_head == NULL)
6265		goto err_name;
6266
6267	net->dev_index_head = netdev_create_hash();
6268	if (net->dev_index_head == NULL)
6269		goto err_idx;
6270
6271	return 0;
6272
6273err_idx:
6274	kfree(net->dev_name_head);
6275err_name:
6276	return -ENOMEM;
6277}
6278
6279/**
6280 *	netdev_drivername - network driver for the device
6281 *	@dev: network device
6282 *
6283 *	Determine network driver for device.
6284 */
6285const char *netdev_drivername(const struct net_device *dev)
6286{
6287	const struct device_driver *driver;
6288	const struct device *parent;
6289	const char *empty = "";
6290
6291	parent = dev->dev.parent;
6292	if (!parent)
6293		return empty;
6294
6295	driver = parent->driver;
6296	if (driver && driver->name)
6297		return driver->name;
6298	return empty;
6299}
6300
6301static int __netdev_printk(const char *level, const struct net_device *dev,
6302			   struct va_format *vaf)
6303{
6304	int r;
6305
6306	if (dev && dev->dev.parent)
6307		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6308			       netdev_name(dev), vaf);
6309	else if (dev)
6310		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6311	else
6312		r = printk("%s(NULL net_device): %pV", level, vaf);
6313
6314	return r;
6315}
6316
6317int netdev_printk(const char *level, const struct net_device *dev,
6318		  const char *format, ...)
6319{
6320	struct va_format vaf;
6321	va_list args;
6322	int r;
6323
6324	va_start(args, format);
6325
6326	vaf.fmt = format;
6327	vaf.va = &args;
6328
6329	r = __netdev_printk(level, dev, &vaf);
6330	va_end(args);
6331
6332	return r;
6333}
6334EXPORT_SYMBOL(netdev_printk);
6335
6336#define define_netdev_printk_level(func, level)			\
6337int func(const struct net_device *dev, const char *fmt, ...)	\
6338{								\
6339	int r;							\
6340	struct va_format vaf;					\
6341	va_list args;						\
6342								\
6343	va_start(args, fmt);					\
6344								\
6345	vaf.fmt = fmt;						\
6346	vaf.va = &args;						\
6347								\
6348	r = __netdev_printk(level, dev, &vaf);			\
6349	va_end(args);						\
6350								\
6351	return r;						\
6352}								\
6353EXPORT_SYMBOL(func);
6354
6355define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6356define_netdev_printk_level(netdev_alert, KERN_ALERT);
6357define_netdev_printk_level(netdev_crit, KERN_CRIT);
6358define_netdev_printk_level(netdev_err, KERN_ERR);
6359define_netdev_printk_level(netdev_warn, KERN_WARNING);
6360define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6361define_netdev_printk_level(netdev_info, KERN_INFO);
6362
6363static void __net_exit netdev_exit(struct net *net)
6364{
6365	kfree(net->dev_name_head);
6366	kfree(net->dev_index_head);
6367}
6368
6369static struct pernet_operations __net_initdata netdev_net_ops = {
6370	.init = netdev_init,
6371	.exit = netdev_exit,
6372};
6373
6374static void __net_exit default_device_exit(struct net *net)
6375{
6376	struct net_device *dev, *aux;
6377	/*
6378	 * Push all migratable network devices back to the
6379	 * initial network namespace
6380	 */
6381	rtnl_lock();
6382	for_each_netdev_safe(net, dev, aux) {
6383		int err;
6384		char fb_name[IFNAMSIZ];
6385
6386		/* Ignore unmoveable devices (i.e. loopback) */
6387		if (dev->features & NETIF_F_NETNS_LOCAL)
6388			continue;
6389
6390		/* Leave virtual devices for the generic cleanup */
6391		if (dev->rtnl_link_ops)
6392			continue;
6393
6394		/* Push remaining network devices to init_net */
6395		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6396		err = dev_change_net_namespace(dev, &init_net, fb_name);
6397		if (err) {
6398			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6399				__func__, dev->name, err);
6400			BUG();
6401		}
6402	}
6403	rtnl_unlock();
6404}
6405
6406static void __net_exit default_device_exit_batch(struct list_head *net_list)
6407{
6408	/* At exit all network devices most be removed from a network
6409	 * namespace.  Do this in the reverse order of registration.
6410	 * Do this across as many network namespaces as possible to
6411	 * improve batching efficiency.
6412	 */
6413	struct net_device *dev;
6414	struct net *net;
6415	LIST_HEAD(dev_kill_list);
6416
6417	rtnl_lock();
6418	list_for_each_entry(net, net_list, exit_list) {
6419		for_each_netdev_reverse(net, dev) {
6420			if (dev->rtnl_link_ops)
6421				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6422			else
6423				unregister_netdevice_queue(dev, &dev_kill_list);
6424		}
6425	}
6426	unregister_netdevice_many(&dev_kill_list);
6427	list_del(&dev_kill_list);
6428	rtnl_unlock();
6429}
6430
6431static struct pernet_operations __net_initdata default_device_ops = {
6432	.exit = default_device_exit,
6433	.exit_batch = default_device_exit_batch,
6434};
6435
6436/*
6437 *	Initialize the DEV module. At boot time this walks the device list and
6438 *	unhooks any devices that fail to initialise (normally hardware not
6439 *	present) and leaves us with a valid list of present and active devices.
6440 *
6441 */
6442
6443/*
6444 *       This is called single threaded during boot, so no need
6445 *       to take the rtnl semaphore.
6446 */
6447static int __init net_dev_init(void)
6448{
6449	int i, rc = -ENOMEM;
6450
6451	BUG_ON(!dev_boot_phase);
6452
6453	if (dev_proc_init())
6454		goto out;
6455
6456	if (netdev_kobject_init())
6457		goto out;
6458
6459	INIT_LIST_HEAD(&ptype_all);
6460	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6461		INIT_LIST_HEAD(&ptype_base[i]);
6462
6463	if (register_pernet_subsys(&netdev_net_ops))
6464		goto out;
6465
6466	/*
6467	 *	Initialise the packet receive queues.
6468	 */
6469
6470	for_each_possible_cpu(i) {
6471		struct softnet_data *sd = &per_cpu(softnet_data, i);
6472
6473		memset(sd, 0, sizeof(*sd));
6474		skb_queue_head_init(&sd->input_pkt_queue);
6475		skb_queue_head_init(&sd->process_queue);
6476		sd->completion_queue = NULL;
6477		INIT_LIST_HEAD(&sd->poll_list);
6478		sd->output_queue = NULL;
6479		sd->output_queue_tailp = &sd->output_queue;
6480#ifdef CONFIG_RPS
6481		sd->csd.func = rps_trigger_softirq;
6482		sd->csd.info = sd;
6483		sd->csd.flags = 0;
6484		sd->cpu = i;
6485#endif
6486
6487		sd->backlog.poll = process_backlog;
6488		sd->backlog.weight = weight_p;
6489		sd->backlog.gro_list = NULL;
6490		sd->backlog.gro_count = 0;
6491	}
6492
6493	dev_boot_phase = 0;
6494
6495	/* The loopback device is special if any other network devices
6496	 * is present in a network namespace the loopback device must
6497	 * be present. Since we now dynamically allocate and free the
6498	 * loopback device ensure this invariant is maintained by
6499	 * keeping the loopback device as the first device on the
6500	 * list of network devices.  Ensuring the loopback devices
6501	 * is the first device that appears and the last network device
6502	 * that disappears.
6503	 */
6504	if (register_pernet_device(&loopback_net_ops))
6505		goto out;
6506
6507	if (register_pernet_device(&default_device_ops))
6508		goto out;
6509
6510	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6511	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6512
6513	hotcpu_notifier(dev_cpu_callback, 0);
6514	dst_init();
6515	dev_mcast_init();
6516	rc = 0;
6517out:
6518	return rc;
6519}
6520
6521subsys_initcall(net_dev_init);
6522
6523static int __init initialize_hashrnd(void)
6524{
6525	get_random_bytes(&hashrnd, sizeof(hashrnd));
6526	return 0;
6527}
6528
6529late_initcall_sync(initialize_hashrnd);
6530
v3.1
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136
 137#include "net-sysfs.h"
 138
 139/* Instead of increasing this, you should create a hash table. */
 140#define MAX_GRO_SKBS 8
 141
 142/* This should be increased if a protocol with a bigger head is added. */
 143#define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145/*
 146 *	The list of packet types we will receive (as opposed to discard)
 147 *	and the routines to invoke.
 148 *
 149 *	Why 16. Because with 16 the only overlap we get on a hash of the
 150 *	low nibble of the protocol value is RARP/SNAP/X.25.
 151 *
 152 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153 *             sure which should go first, but I bet it won't make much
 154 *             difference if we are running VLANs.  The good news is that
 155 *             this protocol won't be in the list unless compiled in, so
 156 *             the average user (w/out VLANs) will not be adversely affected.
 157 *             --BLG
 158 *
 159 *		0800	IP
 160 *		8100    802.1Q VLAN
 161 *		0001	802.3
 162 *		0002	AX.25
 163 *		0004	802.2
 164 *		8035	RARP
 165 *		0005	SNAP
 166 *		0805	X.25
 167 *		0806	ARP
 168 *		8137	IPX
 169 *		0009	Localtalk
 170 *		86DD	IPv6
 171 */
 172
 173#define PTYPE_HASH_SIZE	(16)
 174#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 175
 176static DEFINE_SPINLOCK(ptype_lock);
 177static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 178static struct list_head ptype_all __read_mostly;	/* Taps */
 179
 180/*
 181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182 * semaphore.
 183 *
 184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185 *
 186 * Writers must hold the rtnl semaphore while they loop through the
 187 * dev_base_head list, and hold dev_base_lock for writing when they do the
 188 * actual updates.  This allows pure readers to access the list even
 189 * while a writer is preparing to update it.
 190 *
 191 * To put it another way, dev_base_lock is held for writing only to
 192 * protect against pure readers; the rtnl semaphore provides the
 193 * protection against other writers.
 194 *
 195 * See, for example usages, register_netdevice() and
 196 * unregister_netdevice(), which must be called with the rtnl
 197 * semaphore held.
 198 */
 199DEFINE_RWLOCK(dev_base_lock);
 200EXPORT_SYMBOL(dev_base_lock);
 201
 202static inline void dev_base_seq_inc(struct net *net)
 203{
 204	while (++net->dev_base_seq == 0);
 205}
 206
 207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208{
 209	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 210	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 211}
 212
 213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214{
 215	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 216}
 217
 218static inline void rps_lock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221	spin_lock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225static inline void rps_unlock(struct softnet_data *sd)
 226{
 227#ifdef CONFIG_RPS
 228	spin_unlock(&sd->input_pkt_queue.lock);
 229#endif
 230}
 231
 232/* Device list insertion */
 233static int list_netdevice(struct net_device *dev)
 234{
 235	struct net *net = dev_net(dev);
 236
 237	ASSERT_RTNL();
 238
 239	write_lock_bh(&dev_base_lock);
 240	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 241	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 242	hlist_add_head_rcu(&dev->index_hlist,
 243			   dev_index_hash(net, dev->ifindex));
 244	write_unlock_bh(&dev_base_lock);
 245
 246	dev_base_seq_inc(net);
 247
 248	return 0;
 249}
 250
 251/* Device list removal
 252 * caller must respect a RCU grace period before freeing/reusing dev
 253 */
 254static void unlist_netdevice(struct net_device *dev)
 255{
 256	ASSERT_RTNL();
 257
 258	/* Unlink dev from the device chain */
 259	write_lock_bh(&dev_base_lock);
 260	list_del_rcu(&dev->dev_list);
 261	hlist_del_rcu(&dev->name_hlist);
 262	hlist_del_rcu(&dev->index_hlist);
 263	write_unlock_bh(&dev_base_lock);
 264
 265	dev_base_seq_inc(dev_net(dev));
 266}
 267
 268/*
 269 *	Our notifier list
 270 */
 271
 272static RAW_NOTIFIER_HEAD(netdev_chain);
 273
 274/*
 275 *	Device drivers call our routines to queue packets here. We empty the
 276 *	queue in the local softnet handler.
 277 */
 278
 279DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 280EXPORT_PER_CPU_SYMBOL(softnet_data);
 281
 282#ifdef CONFIG_LOCKDEP
 283/*
 284 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 285 * according to dev->type
 286 */
 287static const unsigned short netdev_lock_type[] =
 288	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 289	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 290	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 291	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 292	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 293	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 294	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 295	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 296	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 297	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 298	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 299	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 300	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 301	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 302	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 303	 ARPHRD_VOID, ARPHRD_NONE};
 304
 305static const char *const netdev_lock_name[] =
 306	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 307	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 308	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 309	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 310	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 311	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 312	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 313	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 314	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 315	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 316	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 317	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 318	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 319	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 320	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 321	 "_xmit_VOID", "_xmit_NONE"};
 322
 323static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 324static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 325
 326static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 327{
 328	int i;
 329
 330	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 331		if (netdev_lock_type[i] == dev_type)
 332			return i;
 333	/* the last key is used by default */
 334	return ARRAY_SIZE(netdev_lock_type) - 1;
 335}
 336
 337static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338						 unsigned short dev_type)
 339{
 340	int i;
 341
 342	i = netdev_lock_pos(dev_type);
 343	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 344				   netdev_lock_name[i]);
 345}
 346
 347static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 348{
 349	int i;
 350
 351	i = netdev_lock_pos(dev->type);
 352	lockdep_set_class_and_name(&dev->addr_list_lock,
 353				   &netdev_addr_lock_key[i],
 354				   netdev_lock_name[i]);
 355}
 356#else
 357static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 358						 unsigned short dev_type)
 359{
 360}
 361static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 362{
 363}
 364#endif
 365
 366/*******************************************************************************
 367
 368		Protocol management and registration routines
 369
 370*******************************************************************************/
 371
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the walk over protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets were
 *	first on the list, it could not tell that the packet
 *	is cloned and should be copied-on-write, so it would
 *	modify it and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
 387
 388static inline struct list_head *ptype_head(const struct packet_type *pt)
 389{
 390	if (pt->type == htons(ETH_P_ALL))
 391		return &ptype_all;
 392	else
 393		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 394}
 395
 396/**
 397 *	dev_add_pack - add packet handler
 398 *	@pt: packet type declaration
 399 *
 400 *	Add a protocol handler to the networking stack. The passed &packet_type
 401 *	is linked into kernel lists and may not be freed until it has been
 402 *	removed from the kernel lists.
 403 *
 404 *	This call does not sleep therefore it can not
 405 *	guarantee all CPU's that are in middle of receiving packets
 406 *	will see the new packet type (until the next received packet).
 407 */
 408
 409void dev_add_pack(struct packet_type *pt)
 410{
 411	struct list_head *head = ptype_head(pt);
 412
 413	spin_lock(&ptype_lock);
 414	list_add_rcu(&pt->list, head);
 415	spin_unlock(&ptype_lock);
 416}
 417EXPORT_SYMBOL(dev_add_pack);
 418
 419/**
 420 *	__dev_remove_pack	 - remove packet handler
 421 *	@pt: packet type declaration
 422 *
 423 *	Remove a protocol handler that was previously added to the kernel
 424 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 425 *	from the kernel lists and can be freed or reused once this function
 426 *	returns.
 427 *
 428 *      The packet type might still be in use by receivers
 429 *	and must not be freed until after all the CPU's have gone
 430 *	through a quiescent state.
 431 */
 432void __dev_remove_pack(struct packet_type *pt)
 433{
 434	struct list_head *head = ptype_head(pt);
 435	struct packet_type *pt1;
 436
 437	spin_lock(&ptype_lock);
 438
 439	list_for_each_entry(pt1, head, list) {
 440		if (pt == pt1) {
 441			list_del_rcu(&pt->list);
 442			goto out;
 443		}
 444	}
 445
 446	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 447out:
 448	spin_unlock(&ptype_lock);
 449}
 450EXPORT_SYMBOL(__dev_remove_pack);
 451
 452/**
 453 *	dev_remove_pack	 - remove packet handler
 454 *	@pt: packet type declaration
 455 *
 456 *	Remove a protocol handler that was previously added to the kernel
 457 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 458 *	from the kernel lists and can be freed or reused once this function
 459 *	returns.
 460 *
 461 *	This call sleeps to guarantee that no CPU is looking at the packet
 462 *	type after return.
 463 */
 464void dev_remove_pack(struct packet_type *pt)
 465{
 466	__dev_remove_pack(pt);
 467
 468	synchronize_net();
 469}
 470EXPORT_SYMBOL(dev_remove_pack);
 471
 472/******************************************************************************
 473
 474		      Device Boot-time Settings Routines
 475
 476*******************************************************************************/
 477
 478/* Boot time configuration table */
 479static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 480
 481/**
 482 *	netdev_boot_setup_add	- add new setup entry
 483 *	@name: name of the device
 484 *	@map: configured settings for the device
 485 *
 486 *	Adds new setup entry to the dev_boot_setup list.  The function
 487 *	returns 0 on error and 1 on success.  This is a generic routine to
 488 *	all netdevices.
 489 */
 490static int netdev_boot_setup_add(char *name, struct ifmap *map)
 491{
 492	struct netdev_boot_setup *s;
 493	int i;
 494
 495	s = dev_boot_setup;
 496	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 497		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 498			memset(s[i].name, 0, sizeof(s[i].name));
 499			strlcpy(s[i].name, name, IFNAMSIZ);
 500			memcpy(&s[i].map, map, sizeof(s[i].map));
 501			break;
 502		}
 503	}
 504
 505	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 506}
 507
 508/**
 509 *	netdev_boot_setup_check	- check boot time settings
 510 *	@dev: the netdevice
 511 *
 512 * 	Check boot time settings for the device.
 513 *	The found settings are set for the device to be used
 514 *	later in the device probing.
 515 *	Returns 0 if no settings found, 1 if they are.
 516 */
 517int netdev_boot_setup_check(struct net_device *dev)
 518{
 519	struct netdev_boot_setup *s = dev_boot_setup;
 520	int i;
 521
 522	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 523		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 524		    !strcmp(dev->name, s[i].name)) {
 525			dev->irq 	= s[i].map.irq;
 526			dev->base_addr 	= s[i].map.base_addr;
 527			dev->mem_start 	= s[i].map.mem_start;
 528			dev->mem_end 	= s[i].map.mem_end;
 529			return 1;
 530		}
 531	}
 532	return 0;
 533}
 534EXPORT_SYMBOL(netdev_boot_setup_check);
 535
 536
 537/**
 538 *	netdev_boot_base	- get address from boot time settings
 539 *	@prefix: prefix for network device
 540 *	@unit: id for network device
 541 *
 542 * 	Check boot time settings for the base address of device.
 543 *	The found settings are set for the device to be used
 544 *	later in the device probing.
 545 *	Returns 0 if no settings found.
 546 */
 547unsigned long netdev_boot_base(const char *prefix, int unit)
 548{
 549	const struct netdev_boot_setup *s = dev_boot_setup;
 550	char name[IFNAMSIZ];
 551	int i;
 552
 553	sprintf(name, "%s%d", prefix, unit);
 554
 555	/*
 556	 * If device already registered then return base of 1
 557	 * to indicate not to probe for this interface
 558	 */
 559	if (__dev_get_by_name(&init_net, name))
 560		return 1;
 561
 562	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 563		if (!strcmp(name, s[i].name))
 564			return s[i].map.base_addr;
 565	return 0;
 566}
 567
 568/*
 569 * Saves at boot time configured settings for any netdevice.
 570 */
 571int __init netdev_boot_setup(char *str)
 572{
 573	int ints[5];
 574	struct ifmap map;
 575
 576	str = get_options(str, ARRAY_SIZE(ints), ints);
 577	if (!str || !*str)
 578		return 0;
 579
 580	/* Save settings */
 581	memset(&map, 0, sizeof(map));
 582	if (ints[0] > 0)
 583		map.irq = ints[1];
 584	if (ints[0] > 1)
 585		map.base_addr = ints[2];
 586	if (ints[0] > 2)
 587		map.mem_start = ints[3];
 588	if (ints[0] > 3)
 589		map.mem_end = ints[4];
 590
 591	/* Add new entry to the list */
 592	return netdev_boot_setup_add(str, &map);
 593}
 594
 595__setup("netdev=", netdev_boot_setup);
 596
 597/*******************************************************************************
 598
 599			    Device Interface Subroutines
 600
 601*******************************************************************************/
 602
 603/**
 604 *	__dev_get_by_name	- find a device by its name
 605 *	@net: the applicable net namespace
 606 *	@name: name to find
 607 *
 608 *	Find an interface by name. Must be called under RTNL semaphore
 609 *	or @dev_base_lock. If the name is found a pointer to the device
 610 *	is returned. If the name is not found then %NULL is returned. The
 611 *	reference counters are not incremented so the caller must be
 612 *	careful with locks.
 613 */
 614
 615struct net_device *__dev_get_by_name(struct net *net, const char *name)
 616{
 617	struct hlist_node *p;
 618	struct net_device *dev;
 619	struct hlist_head *head = dev_name_hash(net, name);
 620
 621	hlist_for_each_entry(dev, p, head, name_hlist)
 622		if (!strncmp(dev->name, name, IFNAMSIZ))
 623			return dev;
 624
 625	return NULL;
 626}
 627EXPORT_SYMBOL(__dev_get_by_name);
 628
 629/**
 630 *	dev_get_by_name_rcu	- find a device by its name
 631 *	@net: the applicable net namespace
 632 *	@name: name to find
 633 *
 634 *	Find an interface by name.
 635 *	If the name is found a pointer to the device is returned.
 636 * 	If the name is not found then %NULL is returned.
 637 *	The reference counters are not incremented so the caller must be
 638 *	careful with locks. The caller must hold RCU lock.
 639 */
 640
 641struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 642{
 643	struct hlist_node *p;
 644	struct net_device *dev;
 645	struct hlist_head *head = dev_name_hash(net, name);
 646
 647	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 648		if (!strncmp(dev->name, name, IFNAMSIZ))
 649			return dev;
 650
 651	return NULL;
 652}
 653EXPORT_SYMBOL(dev_get_by_name_rcu);
 654
 655/**
 656 *	dev_get_by_name		- find a device by its name
 657 *	@net: the applicable net namespace
 658 *	@name: name to find
 659 *
 660 *	Find an interface by name. This can be called from any
 661 *	context and does its own locking. The returned handle has
 662 *	the usage count incremented and the caller must use dev_put() to
 663 *	release it when it is no longer needed. %NULL is returned if no
 664 *	matching device is found.
 665 */
 666
 667struct net_device *dev_get_by_name(struct net *net, const char *name)
 668{
 669	struct net_device *dev;
 670
 671	rcu_read_lock();
 672	dev = dev_get_by_name_rcu(net, name);
 673	if (dev)
 674		dev_hold(dev);
 675	rcu_read_unlock();
 676	return dev;
 677}
 678EXPORT_SYMBOL(dev_get_by_name);
 679
 680/**
 681 *	__dev_get_by_index - find a device by its ifindex
 682 *	@net: the applicable net namespace
 683 *	@ifindex: index of device
 684 *
 685 *	Search for an interface by index. Returns %NULL if the device
 686 *	is not found or a pointer to the device. The device has not
 687 *	had its reference counter increased so the caller must be careful
 688 *	about locking. The caller must hold either the RTNL semaphore
 689 *	or @dev_base_lock.
 690 */
 691
 692struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 693{
 694	struct hlist_node *p;
 695	struct net_device *dev;
 696	struct hlist_head *head = dev_index_hash(net, ifindex);
 697
 698	hlist_for_each_entry(dev, p, head, index_hlist)
 699		if (dev->ifindex == ifindex)
 700			return dev;
 701
 702	return NULL;
 703}
 704EXPORT_SYMBOL(__dev_get_by_index);
 705
 706/**
 707 *	dev_get_by_index_rcu - find a device by its ifindex
 708 *	@net: the applicable net namespace
 709 *	@ifindex: index of device
 710 *
 711 *	Search for an interface by index. Returns %NULL if the device
 712 *	is not found or a pointer to the device. The device has not
 713 *	had its reference counter increased so the caller must be careful
 714 *	about locking. The caller must hold RCU lock.
 715 */
 716
 717struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 718{
 719	struct hlist_node *p;
 720	struct net_device *dev;
 721	struct hlist_head *head = dev_index_hash(net, ifindex);
 722
 723	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 724		if (dev->ifindex == ifindex)
 725			return dev;
 726
 727	return NULL;
 728}
 729EXPORT_SYMBOL(dev_get_by_index_rcu);
 730
 731
 732/**
 733 *	dev_get_by_index - find a device by its ifindex
 734 *	@net: the applicable net namespace
 735 *	@ifindex: index of device
 736 *
 737 *	Search for an interface by index. Returns NULL if the device
 738 *	is not found or a pointer to the device. The device returned has
 739 *	had a reference added and the pointer is safe until the user calls
 740 *	dev_put to indicate they have finished with it.
 741 */
 742
 743struct net_device *dev_get_by_index(struct net *net, int ifindex)
 744{
 745	struct net_device *dev;
 746
 747	rcu_read_lock();
 748	dev = dev_get_by_index_rcu(net, ifindex);
 749	if (dev)
 750		dev_hold(dev);
 751	rcu_read_unlock();
 752	return dev;
 753}
 754EXPORT_SYMBOL(dev_get_by_index);
 755
 756/**
 757 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 758 *	@net: the applicable net namespace
 759 *	@type: media type of device
 760 *	@ha: hardware address
 761 *
 762 *	Search for an interface by MAC address. Returns NULL if the device
 763 *	is not found or a pointer to the device.
 764 *	The caller must hold RCU or RTNL.
 765 *	The returned device has not had its ref count increased
 766 *	and the caller must therefore be careful about locking
 767 *
 768 */
 769
 770struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 771				       const char *ha)
 772{
 773	struct net_device *dev;
 774
 775	for_each_netdev_rcu(net, dev)
 776		if (dev->type == type &&
 777		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 778			return dev;
 779
 780	return NULL;
 781}
 782EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 783
 784struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 785{
 786	struct net_device *dev;
 787
 788	ASSERT_RTNL();
 789	for_each_netdev(net, dev)
 790		if (dev->type == type)
 791			return dev;
 792
 793	return NULL;
 794}
 795EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 796
 797struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 798{
 799	struct net_device *dev, *ret = NULL;
 800
 801	rcu_read_lock();
 802	for_each_netdev_rcu(net, dev)
 803		if (dev->type == type) {
 804			dev_hold(dev);
 805			ret = dev;
 806			break;
 807		}
 808	rcu_read_unlock();
 809	return ret;
 810}
 811EXPORT_SYMBOL(dev_getfirstbyhwtype);
 812
 813/**
 814 *	dev_get_by_flags_rcu - find any device with given flags
 815 *	@net: the applicable net namespace
 816 *	@if_flags: IFF_* values
 817 *	@mask: bitmask of bits in if_flags to check
 818 *
 819 *	Search for any interface with the given flags. Returns NULL if a device
 820 *	is not found or a pointer to the device. Must be called inside
 821 *	rcu_read_lock(), and result refcount is unchanged.
 822 */
 823
 824struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 825				    unsigned short mask)
 826{
 827	struct net_device *dev, *ret;
 828
 829	ret = NULL;
 830	for_each_netdev_rcu(net, dev) {
 831		if (((dev->flags ^ if_flags) & mask) == 0) {
 832			ret = dev;
 833			break;
 834		}
 835	}
 836	return ret;
 837}
 838EXPORT_SYMBOL(dev_get_by_flags_rcu);
 839
 840/**
 841 *	dev_valid_name - check if name is okay for network device
 842 *	@name: name string
 843 *
 844 *	Network device names need to be valid file names to
 845 *	to allow sysfs to work.  We also disallow any kind of
 846 *	whitespace.
 847 */
 848int dev_valid_name(const char *name)
 849{
 850	if (*name == '\0')
 851		return 0;
 852	if (strlen(name) >= IFNAMSIZ)
 853		return 0;
 854	if (!strcmp(name, ".") || !strcmp(name, ".."))
 855		return 0;
 856
 857	while (*name) {
 858		if (*name == '/' || isspace(*name))
 859			return 0;
 860		name++;
 861	}
 862	return 1;
 863}
 864EXPORT_SYMBOL(dev_valid_name);
 865
 866/**
 867 *	__dev_alloc_name - allocate a name for a device
 868 *	@net: network namespace to allocate the device name in
 869 *	@name: name format string
 870 *	@buf:  scratch buffer and result name string
 871 *
 872 *	Passed a format string - eg "lt%d" it will try and find a suitable
 873 *	id. It scans list of devices to build up a free map, then chooses
 874 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 875 *	while allocating the name and adding the device in order to avoid
 876 *	duplicates.
 877 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 878 *	Returns the number of the unit assigned or a negative errno code.
 879 */
 880
 881static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 882{
 883	int i = 0;
 884	const char *p;
 885	const int max_netdevices = 8*PAGE_SIZE;
 886	unsigned long *inuse;
 887	struct net_device *d;
 888
 889	p = strnchr(name, IFNAMSIZ-1, '%');
 890	if (p) {
 891		/*
 892		 * Verify the string as this thing may have come from
 893		 * the user.  There must be either one "%d" and no other "%"
 894		 * characters.
 895		 */
 896		if (p[1] != 'd' || strchr(p + 2, '%'))
 897			return -EINVAL;
 898
 899		/* Use one page as a bit array of possible slots */
 900		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 901		if (!inuse)
 902			return -ENOMEM;
 903
 904		for_each_netdev(net, d) {
 905			if (!sscanf(d->name, name, &i))
 906				continue;
 907			if (i < 0 || i >= max_netdevices)
 908				continue;
 909
 910			/*  avoid cases where sscanf is not exact inverse of printf */
 911			snprintf(buf, IFNAMSIZ, name, i);
 912			if (!strncmp(buf, d->name, IFNAMSIZ))
 913				set_bit(i, inuse);
 914		}
 915
 916		i = find_first_zero_bit(inuse, max_netdevices);
 917		free_page((unsigned long) inuse);
 918	}
 919
 920	if (buf != name)
 921		snprintf(buf, IFNAMSIZ, name, i);
 922	if (!__dev_get_by_name(net, buf))
 923		return i;
 924
 925	/* It is possible to run out of possible slots
 926	 * when the name is long and there isn't enough space left
 927	 * for the digits, or if all bits are used.
 928	 */
 929	return -ENFILE;
 930}
 931
 932/**
 933 *	dev_alloc_name - allocate a name for a device
 934 *	@dev: device
 935 *	@name: name format string
 936 *
 937 *	Passed a format string - eg "lt%d" it will try and find a suitable
 938 *	id. It scans list of devices to build up a free map, then chooses
 939 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 940 *	while allocating the name and adding the device in order to avoid
 941 *	duplicates.
 942 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 943 *	Returns the number of the unit assigned or a negative errno code.
 944 */
 945
 946int dev_alloc_name(struct net_device *dev, const char *name)
 947{
 948	char buf[IFNAMSIZ];
 949	struct net *net;
 950	int ret;
 951
 952	BUG_ON(!dev_net(dev));
 953	net = dev_net(dev);
 954	ret = __dev_alloc_name(net, name, buf);
 955	if (ret >= 0)
 956		strlcpy(dev->name, buf, IFNAMSIZ);
 957	return ret;
 958}
 959EXPORT_SYMBOL(dev_alloc_name);
 960
 961static int dev_get_valid_name(struct net_device *dev, const char *name)
 962{
 963	struct net *net;
 964
 965	BUG_ON(!dev_net(dev));
 966	net = dev_net(dev);
 967
 968	if (!dev_valid_name(name))
 969		return -EINVAL;
 970
 971	if (strchr(name, '%'))
 972		return dev_alloc_name(dev, name);
 973	else if (__dev_get_by_name(net, name))
 974		return -EEXIST;
 975	else if (dev->name != name)
 976		strlcpy(dev->name, name, IFNAMSIZ);
 977
 978	return 0;
 979}
 980
 981/**
 982 *	dev_change_name - change name of a device
 983 *	@dev: device
 984 *	@newname: name (or format string) must be at least IFNAMSIZ
 985 *
 986 *	Change name of a device, can pass format strings "eth%d".
 987 *	for wildcarding.
 988 */
 989int dev_change_name(struct net_device *dev, const char *newname)
 990{
 991	char oldname[IFNAMSIZ];
 992	int err = 0;
 993	int ret;
 994	struct net *net;
 995
 996	ASSERT_RTNL();
 997	BUG_ON(!dev_net(dev));
 998
 999	net = dev_net(dev);
1000	if (dev->flags & IFF_UP)
1001		return -EBUSY;
1002
1003	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1004		return 0;
1005
1006	memcpy(oldname, dev->name, IFNAMSIZ);
1007
1008	err = dev_get_valid_name(dev, newname);
1009	if (err < 0)
1010		return err;
1011
1012rollback:
1013	ret = device_rename(&dev->dev, dev->name);
1014	if (ret) {
1015		memcpy(dev->name, oldname, IFNAMSIZ);
1016		return ret;
1017	}
1018
1019	write_lock_bh(&dev_base_lock);
1020	hlist_del_rcu(&dev->name_hlist);
1021	write_unlock_bh(&dev_base_lock);
1022
1023	synchronize_rcu();
1024
1025	write_lock_bh(&dev_base_lock);
1026	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1027	write_unlock_bh(&dev_base_lock);
1028
1029	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1030	ret = notifier_to_errno(ret);
1031
1032	if (ret) {
1033		/* err >= 0 after dev_alloc_name() or stores the first errno */
1034		if (err >= 0) {
1035			err = ret;
1036			memcpy(dev->name, oldname, IFNAMSIZ);
1037			goto rollback;
1038		} else {
1039			printk(KERN_ERR
1040			       "%s: name change rollback failed: %d.\n",
1041			       dev->name, ret);
1042		}
1043	}
1044
1045	return err;
1046}
1047
1048/**
1049 *	dev_set_alias - change ifalias of a device
1050 *	@dev: device
1051 *	@alias: name up to IFALIASZ
1052 *	@len: limit of bytes to copy from info
1053 *
1054 *	Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058	ASSERT_RTNL();
1059
1060	if (len >= IFALIASZ)
1061		return -EINVAL;
1062
1063	if (!len) {
1064		if (dev->ifalias) {
1065			kfree(dev->ifalias);
1066			dev->ifalias = NULL;
1067		}
1068		return 0;
1069	}
1070
1071	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072	if (!dev->ifalias)
1073		return -ENOMEM;
1074
1075	strlcpy(dev->ifalias, alias, len+1);
1076	return len;
1077}
1078
1079
1080/**
1081 *	netdev_features_change - device changes features
1082 *	@dev: device to cause notification
1083 *
1084 *	Called to indicate a device has changed features.
1085 */
1086void netdev_features_change(struct net_device *dev)
1087{
1088	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1089}
1090EXPORT_SYMBOL(netdev_features_change);
1091
1092/**
1093 *	netdev_state_change - device changes state
1094 *	@dev: device to cause notification
1095 *
1096 *	Called to indicate a device has changed state. This function calls
1097 *	the notifier chains for netdev_chain and sends a NEWLINK message
1098 *	to the routing socket.
1099 */
1100void netdev_state_change(struct net_device *dev)
1101{
1102	if (dev->flags & IFF_UP) {
1103		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105	}
1106}
1107EXPORT_SYMBOL(netdev_state_change);
1108
/*
 * Raise @event for @dev through the netdevice notifiers and hand the
 * notifier result back to the caller unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int result = call_netdevice_notifiers(event, dev);

	return result;
}
EXPORT_SYMBOL(netdev_bonding_change);
1114
1115/**
1116 *	dev_load 	- load a network module
1117 *	@net: the applicable net namespace
1118 *	@name: name of interface
1119 *
1120 *	If a network interface is not present and the process has suitable
1121 *	privileges this function loads the module. If module loading is not
1122 *	available in this kernel then it becomes a nop.
1123 */
1124
1125void dev_load(struct net *net, const char *name)
1126{
1127	struct net_device *dev;
1128	int no_module;
1129
1130	rcu_read_lock();
1131	dev = dev_get_by_name_rcu(net, name);
1132	rcu_read_unlock();
1133
1134	no_module = !dev;
1135	if (no_module && capable(CAP_NET_ADMIN))
1136		no_module = request_module("netdev-%s", name);
1137	if (no_module && capable(CAP_SYS_MODULE)) {
1138		if (!request_module("%s", name))
1139			pr_err("Loading kernel module for a network device "
1140"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1141"instead\n", name);
1142	}
1143}
1144EXPORT_SYMBOL(dev_load);
1145
1146static int __dev_open(struct net_device *dev)
1147{
1148	const struct net_device_ops *ops = dev->netdev_ops;
1149	int ret;
1150
1151	ASSERT_RTNL();
1152
1153	if (!netif_device_present(dev))
1154		return -ENODEV;
1155
1156	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157	ret = notifier_to_errno(ret);
1158	if (ret)
1159		return ret;
1160
1161	set_bit(__LINK_STATE_START, &dev->state);
1162
1163	if (ops->ndo_validate_addr)
1164		ret = ops->ndo_validate_addr(dev);
1165
1166	if (!ret && ops->ndo_open)
1167		ret = ops->ndo_open(dev);
1168
1169	if (ret)
1170		clear_bit(__LINK_STATE_START, &dev->state);
1171	else {
1172		dev->flags |= IFF_UP;
1173		net_dmaengine_get();
1174		dev_set_rx_mode(dev);
1175		dev_activate(dev);
1176	}
1177
1178	return ret;
1179}
1180
1181/**
1182 *	dev_open	- prepare an interface for use.
1183 *	@dev:	device to open
1184 *
1185 *	Takes a device from down to up state. The device's private open
1186 *	function is invoked and then the multicast lists are loaded. Finally
1187 *	the device is moved into the up state and a %NETDEV_UP message is
1188 *	sent to the netdev notifier chain.
1189 *
1190 *	Calling this function on an active interface is a nop. On a failure
1191 *	a negative errno code is returned.
1192 */
1193int dev_open(struct net_device *dev)
1194{
1195	int ret;
1196
1197	if (dev->flags & IFF_UP)
1198		return 0;
1199
1200	ret = __dev_open(dev);
1201	if (ret < 0)
1202		return ret;
1203
1204	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205	call_netdevice_notifiers(NETDEV_UP, dev);
1206
1207	return ret;
1208}
1209EXPORT_SYMBOL(dev_open);
1210
1211static int __dev_close_many(struct list_head *head)
1212{
1213	struct net_device *dev;
1214
1215	ASSERT_RTNL();
1216	might_sleep();
1217
1218	list_for_each_entry(dev, head, unreg_list) {
1219		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1220
1221		clear_bit(__LINK_STATE_START, &dev->state);
1222
1223		/* Synchronize to scheduled poll. We cannot touch poll list, it
1224		 * can be even on different cpu. So just clear netif_running().
1225		 *
1226		 * dev->stop() will invoke napi_disable() on all of it's
1227		 * napi_struct instances on this device.
1228		 */
1229		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230	}
1231
1232	dev_deactivate_many(head);
1233
1234	list_for_each_entry(dev, head, unreg_list) {
1235		const struct net_device_ops *ops = dev->netdev_ops;
1236
1237		/*
1238		 *	Call the device specific close. This cannot fail.
1239		 *	Only if device is UP
1240		 *
1241		 *	We allow it to be called even after a DETACH hot-plug
1242		 *	event.
1243		 */
1244		if (ops->ndo_stop)
1245			ops->ndo_stop(dev);
1246
1247		dev->flags &= ~IFF_UP;
1248		net_dmaengine_put();
1249	}
1250
1251	return 0;
1252}
1253
1254static int __dev_close(struct net_device *dev)
1255{
1256	int retval;
1257	LIST_HEAD(single);
1258
1259	list_add(&dev->unreg_list, &single);
1260	retval = __dev_close_many(&single);
1261	list_del(&single);
1262	return retval;
1263}
1264
1265static int dev_close_many(struct list_head *head)
1266{
1267	struct net_device *dev, *tmp;
1268	LIST_HEAD(tmp_list);
1269
1270	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271		if (!(dev->flags & IFF_UP))
1272			list_move(&dev->unreg_list, &tmp_list);
1273
1274	__dev_close_many(head);
1275
1276	list_for_each_entry(dev, head, unreg_list) {
1277		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278		call_netdevice_notifiers(NETDEV_DOWN, dev);
1279	}
1280
1281	/* rollback_registered_many needs the complete original list */
1282	list_splice(&tmp_list, head);
1283	return 0;
1284}
1285
1286/**
1287 *	dev_close - shutdown an interface.
1288 *	@dev: device to shutdown
1289 *
1290 *	This function moves an active device into down state. A
1291 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293 *	chain.
1294 */
1295int dev_close(struct net_device *dev)
1296{
1297	if (dev->flags & IFF_UP) {
1298		LIST_HEAD(single);
1299
1300		list_add(&dev->unreg_list, &single);
1301		dev_close_many(&single);
1302		list_del(&single);
1303	}
1304	return 0;
1305}
1306EXPORT_SYMBOL(dev_close);
1307
1308
1309/**
1310 *	dev_disable_lro - disable Large Receive Offload on a device
1311 *	@dev: device
1312 *
1313 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1314 *	called under RTNL.  This is needed if received packets may be
1315 *	forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319	u32 flags;
1320
1321	/*
1322	 * If we're trying to disable lro on a vlan device
1323	 * use the underlying physical device instead
1324	 */
1325	if (is_vlan_dev(dev))
1326		dev = vlan_dev_real_dev(dev);
1327
1328	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1329		flags = dev->ethtool_ops->get_flags(dev);
1330	else
1331		flags = ethtool_op_get_flags(dev);
1332
1333	if (!(flags & ETH_FLAG_LRO))
1334		return;
1335
1336	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1337	if (unlikely(dev->features & NETIF_F_LRO))
1338		netdev_WARN(dev, "failed to disable LRO!\n");
1339}
1340EXPORT_SYMBOL(dev_disable_lro);
1341
1342
1343static int dev_boot_phase = 1;
1344
1345/**
1346 *	register_netdevice_notifier - register a network notifier block
1347 *	@nb: notifier
1348 *
1349 *	Register a notifier to be called when network device events occur.
1350 *	The notifier passed is linked into the kernel structures and must
1351 *	not be reused until it has been unregistered. A negative errno code
1352 *	is returned on a failure.
1353 *
1354 * 	When registered all registration and up events are replayed
1355 *	to the new notifier to allow device to have a race free
1356 *	view of the network device list.
1357 */
1358
1359int register_netdevice_notifier(struct notifier_block *nb)
1360{
1361	struct net_device *dev;
1362	struct net_device *last;
1363	struct net *net;
1364	int err;
1365
1366	rtnl_lock();
1367	err = raw_notifier_chain_register(&netdev_chain, nb);
1368	if (err)
1369		goto unlock;
1370	if (dev_boot_phase)
1371		goto unlock;
1372	for_each_net(net) {
1373		for_each_netdev(net, dev) {
1374			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1375			err = notifier_to_errno(err);
1376			if (err)
1377				goto rollback;
1378
1379			if (!(dev->flags & IFF_UP))
1380				continue;
1381
1382			nb->notifier_call(nb, NETDEV_UP, dev);
1383		}
1384	}
1385
1386unlock:
1387	rtnl_unlock();
1388	return err;
1389
1390rollback:
1391	last = dev;
1392	for_each_net(net) {
1393		for_each_netdev(net, dev) {
1394			if (dev == last)
1395				break;
1396
1397			if (dev->flags & IFF_UP) {
1398				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1399				nb->notifier_call(nb, NETDEV_DOWN, dev);
1400			}
1401			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1402			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1403		}
1404	}
1405
1406	raw_notifier_chain_unregister(&netdev_chain, nb);
1407	goto unlock;
1408}
1409EXPORT_SYMBOL(register_netdevice_notifier);
1410
1411/**
1412 *	unregister_netdevice_notifier - unregister a network notifier block
1413 *	@nb: notifier
1414 *
1415 *	Unregister a notifier previously registered by
1416 *	register_netdevice_notifier(). The notifier is unlinked into the
1417 *	kernel structures and may then be reused. A negative errno code
1418 *	is returned on a failure.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423	int err;
1424
1425	rtnl_lock();
1426	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1427	rtnl_unlock();
1428	return err;
1429}
1430EXPORT_SYMBOL(unregister_netdevice_notifier);
1431
1432/**
1433 *	call_netdevice_notifiers - call all network notifier blocks
1434 *      @val: value passed unmodified to notifier function
1435 *      @dev: net_device pointer passed unmodified to notifier function
1436 *
1437 *	Call all network notifier blocks.  Parameters and return value
1438 *	are as for raw_notifier_call_chain().
1439 */
1440
1441int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1442{
1443	ASSERT_RTNL();
1444	return raw_notifier_call_chain(&netdev_chain, val, dev);
1445}
1446EXPORT_SYMBOL(call_netdevice_notifiers);
1447
1448/* When > 0 there are consumers of rx skb time stamps */
1449static atomic_t netstamp_needed = ATOMIC_INIT(0);
1450
1451void net_enable_timestamp(void)
1452{
1453	atomic_inc(&netstamp_needed);
1454}
1455EXPORT_SYMBOL(net_enable_timestamp);
1456
1457void net_disable_timestamp(void)
1458{
1459	atomic_dec(&netstamp_needed);
1460}
1461EXPORT_SYMBOL(net_disable_timestamp);
1462
1463static inline void net_timestamp_set(struct sk_buff *skb)
1464{
1465	if (atomic_read(&netstamp_needed))
1466		__net_timestamp(skb);
1467	else
1468		skb->tstamp.tv64 = 0;
1469}
1470
1471static inline void net_timestamp_check(struct sk_buff *skb)
1472{
1473	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1474		__net_timestamp(skb);
1475}
1476
1477static inline bool is_skb_forwardable(struct net_device *dev,
1478				      struct sk_buff *skb)
1479{
1480	unsigned int len;
1481
1482	if (!(dev->flags & IFF_UP))
1483		return false;
1484
1485	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1486	if (skb->len <= len)
1487		return true;
1488
1489	/* if TSO is enabled, we don't care about the length as the packet
1490	 * could be forwarded without being segmented before
1491	 */
1492	if (skb_is_gso(skb))
1493		return true;
1494
1495	return false;
1496}
1497
1498/**
1499 * dev_forward_skb - loopback an skb to another netif
1500 *
1501 * @dev: destination network device
1502 * @skb: buffer to forward
1503 *
1504 * return values:
1505 *	NET_RX_SUCCESS	(no congestion)
1506 *	NET_RX_DROP     (packet was dropped, but freed)
1507 *
1508 * dev_forward_skb can be used for injecting an skb from the
1509 * start_xmit function of one device into the receive queue
1510 * of another device.
1511 *
1512 * The receiving device may be in another namespace, so
1513 * we have to clear all information in the skb that could
1514 * impact namespace isolation.
1515 */
1516int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1517{
1518	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1519		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1520			atomic_long_inc(&dev->rx_dropped);
1521			kfree_skb(skb);
1522			return NET_RX_DROP;
1523		}
1524	}
1525
1526	skb_orphan(skb);
1527	nf_reset(skb);
1528
1529	if (unlikely(!is_skb_forwardable(dev, skb))) {
1530		atomic_long_inc(&dev->rx_dropped);
1531		kfree_skb(skb);
1532		return NET_RX_DROP;
1533	}
1534	skb_set_dev(skb, dev);
1535	skb->tstamp.tv64 = 0;
1536	skb->pkt_type = PACKET_HOST;
1537	skb->protocol = eth_type_trans(skb, dev);
1538	return netif_rx(skb);
1539}
1540EXPORT_SYMBOL_GPL(dev_forward_skb);
1541
1542static inline int deliver_skb(struct sk_buff *skb,
1543			      struct packet_type *pt_prev,
1544			      struct net_device *orig_dev)
1545{
1546	atomic_inc(&skb->users);
1547	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1548}
1549
1550/*
1551 *	Support routine. Sends outgoing frames to any network
1552 *	taps currently in use.
1553 */
1554
1555static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1556{
1557	struct packet_type *ptype;
1558	struct sk_buff *skb2 = NULL;
1559	struct packet_type *pt_prev = NULL;
1560
1561	rcu_read_lock();
1562	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1563		/* Never send packets back to the socket
1564		 * they originated from - MvS (miquels@drinkel.ow.org)
1565		 */
1566		if ((ptype->dev == dev || !ptype->dev) &&
1567		    (ptype->af_packet_priv == NULL ||
1568		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1569			if (pt_prev) {
1570				deliver_skb(skb2, pt_prev, skb->dev);
1571				pt_prev = ptype;
1572				continue;
1573			}
1574
1575			skb2 = skb_clone(skb, GFP_ATOMIC);
1576			if (!skb2)
1577				break;
1578
1579			net_timestamp_set(skb2);
1580
1581			/* skb->nh should be correctly
1582			   set by sender, so that the second statement is
1583			   just protection against buggy protocols.
1584			 */
1585			skb_reset_mac_header(skb2);
1586
1587			if (skb_network_header(skb2) < skb2->data ||
1588			    skb2->network_header > skb2->tail) {
1589				if (net_ratelimit())
1590					printk(KERN_CRIT "protocol %04x is "
1591					       "buggy, dev %s\n",
1592					       ntohs(skb2->protocol),
1593					       dev->name);
1594				skb_reset_network_header(skb2);
1595			}
1596
1597			skb2->transport_header = skb2->network_header;
1598			skb2->pkt_type = PACKET_OUTGOING;
1599			pt_prev = ptype;
1600		}
1601	}
1602	if (pt_prev)
1603		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1604	rcu_read_unlock();
1605}
1606
1607/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1608 * @dev: Network device
1609 * @txq: number of queues available
1610 *
1611 * If real_num_tx_queues is changed the tc mappings may no longer be
1612 * valid. To resolve this verify the tc mapping remains valid and if
1613 * not NULL the mapping. With no priorities mapping to this
1614 * offset/count pair it will no longer be used. In the worst case TC0
1615 * is invalid nothing can be done so disable priority mappings. If is
1616 * expected that drivers will fix this mapping if they can before
1617 * calling netif_set_real_num_tx_queues.
1618 */
1619static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1620{
1621	int i;
1622	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1623
1624	/* If TC0 is invalidated disable TC mapping */
1625	if (tc->offset + tc->count > txq) {
1626		pr_warning("Number of in use tx queues changed "
1627			   "invalidating tc mappings. Priority "
1628			   "traffic classification disabled!\n");
1629		dev->num_tc = 0;
1630		return;
1631	}
1632
1633	/* Invalidated prio to tc mappings set to TC0 */
1634	for (i = 1; i < TC_BITMASK + 1; i++) {
1635		int q = netdev_get_prio_tc_map(dev, i);
1636
1637		tc = &dev->tc_to_txq[q];
1638		if (tc->offset + tc->count > txq) {
1639			pr_warning("Number of in use tx queues "
1640				   "changed. Priority %i to tc "
1641				   "mapping %i is no longer valid "
1642				   "setting map to 0\n",
1643				   i, q);
1644			netdev_set_prio_tc_map(dev, i, 0);
1645		}
1646	}
1647}
1648
1649/*
1650 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1651 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1652 */
1653int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1654{
1655	int rc;
1656
1657	if (txq < 1 || txq > dev->num_tx_queues)
1658		return -EINVAL;
1659
1660	if (dev->reg_state == NETREG_REGISTERED ||
1661	    dev->reg_state == NETREG_UNREGISTERING) {
1662		ASSERT_RTNL();
1663
1664		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1665						  txq);
1666		if (rc)
1667			return rc;
1668
1669		if (dev->num_tc)
1670			netif_setup_tc(dev, txq);
1671
1672		if (txq < dev->real_num_tx_queues)
1673			qdisc_reset_all_tx_gt(dev, txq);
1674	}
1675
1676	dev->real_num_tx_queues = txq;
1677	return 0;
1678}
1679EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1680
1681#ifdef CONFIG_RPS
1682/**
1683 *	netif_set_real_num_rx_queues - set actual number of RX queues used
1684 *	@dev: Network device
1685 *	@rxq: Actual number of RX queues
1686 *
1687 *	This must be called either with the rtnl_lock held or before
1688 *	registration of the net device.  Returns 0 on success, or a
1689 *	negative error code.  If called before registration, it always
1690 *	succeeds.
1691 */
1692int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1693{
1694	int rc;
1695
1696	if (rxq < 1 || rxq > dev->num_rx_queues)
1697		return -EINVAL;
1698
1699	if (dev->reg_state == NETREG_REGISTERED) {
1700		ASSERT_RTNL();
1701
1702		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1703						  rxq);
1704		if (rc)
1705			return rc;
1706	}
1707
1708	dev->real_num_rx_queues = rxq;
1709	return 0;
1710}
1711EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1712#endif
1713
1714static inline void __netif_reschedule(struct Qdisc *q)
1715{
1716	struct softnet_data *sd;
1717	unsigned long flags;
1718
1719	local_irq_save(flags);
1720	sd = &__get_cpu_var(softnet_data);
1721	q->next_sched = NULL;
1722	*sd->output_queue_tailp = q;
1723	sd->output_queue_tailp = &q->next_sched;
1724	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1725	local_irq_restore(flags);
1726}
1727
1728void __netif_schedule(struct Qdisc *q)
1729{
1730	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1731		__netif_reschedule(q);
1732}
1733EXPORT_SYMBOL(__netif_schedule);
1734
1735void dev_kfree_skb_irq(struct sk_buff *skb)
1736{
1737	if (atomic_dec_and_test(&skb->users)) {
1738		struct softnet_data *sd;
1739		unsigned long flags;
1740
1741		local_irq_save(flags);
1742		sd = &__get_cpu_var(softnet_data);
1743		skb->next = sd->completion_queue;
1744		sd->completion_queue = skb;
1745		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1746		local_irq_restore(flags);
1747	}
1748}
1749EXPORT_SYMBOL(dev_kfree_skb_irq);
1750
/*
 * Free @skb from any context: the deferred dev_kfree_skb_irq() path is
 * used in hard irq context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1759
1760
1761/**
1762 * netif_device_detach - mark device as removed
1763 * @dev: network device
1764 *
1765 * Mark device as removed from system and therefore no longer available.
1766 */
1767void netif_device_detach(struct net_device *dev)
1768{
1769	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1770	    netif_running(dev)) {
1771		netif_tx_stop_all_queues(dev);
1772	}
1773}
1774EXPORT_SYMBOL(netif_device_detach);
1775
1776/**
1777 * netif_device_attach - mark device as attached
1778 * @dev: network device
1779 *
1780 * Mark device as attached from system and restart if needed.
1781 */
1782void netif_device_attach(struct net_device *dev)
1783{
1784	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1785	    netif_running(dev)) {
1786		netif_tx_wake_all_queues(dev);
1787		__netdev_watchdog_up(dev);
1788	}
1789}
1790EXPORT_SYMBOL(netif_device_attach);
1791
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The skb's dst is always dropped.  If the skb is owned by a device
 * already and that device lives in a different namespace than @dev,
 * all data private to the old namespace is reset before the new
 * device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	bool crosses_ns;

	skb_dst_drop(skb);

	crosses_ns = skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev));
	if (crosses_ns) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1821
1822/*
1823 * Invalidate hardware checksum when packet is to be mangled, and
1824 * complete checksum manually on outgoing path.
1825 */
1826int skb_checksum_help(struct sk_buff *skb)
1827{
1828	__wsum csum;
1829	int ret = 0, offset;
1830
1831	if (skb->ip_summed == CHECKSUM_COMPLETE)
1832		goto out_set_summed;
1833
1834	if (unlikely(skb_shinfo(skb)->gso_size)) {
1835		/* Let GSO fix up the checksum. */
1836		goto out_set_summed;
1837	}
1838
1839	offset = skb_checksum_start_offset(skb);
1840	BUG_ON(offset >= skb_headlen(skb));
1841	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1842
1843	offset += skb->csum_offset;
1844	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1845
1846	if (skb_cloned(skb) &&
1847	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1848		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1849		if (ret)
1850			goto out;
1851	}
1852
1853	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1854out_set_summed:
1855	skb->ip_summed = CHECKSUM_NONE;
1856out:
1857	return ret;
1858}
1859EXPORT_SYMBOL(skb_checksum_help);
1860
1861/**
1862 *	skb_gso_segment - Perform segmentation on skb.
1863 *	@skb: buffer to segment
1864 *	@features: features for the output path (see dev->features)
1865 *
1866 *	This function segments the given skb and returns a list of segments.
1867 *
1868 *	It may return NULL if the skb requires no segmentation.  This is
1869 *	only possible when GSO is used for verifying header integrity.
1870 */
1871struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1872{
1873	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1874	struct packet_type *ptype;
1875	__be16 type = skb->protocol;
1876	int vlan_depth = ETH_HLEN;
1877	int err;
1878
1879	while (type == htons(ETH_P_8021Q)) {
1880		struct vlan_hdr *vh;
1881
1882		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1883			return ERR_PTR(-EINVAL);
1884
1885		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1886		type = vh->h_vlan_encapsulated_proto;
1887		vlan_depth += VLAN_HLEN;
1888	}
1889
1890	skb_reset_mac_header(skb);
1891	skb->mac_len = skb->network_header - skb->mac_header;
1892	__skb_pull(skb, skb->mac_len);
1893
1894	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1895		struct net_device *dev = skb->dev;
1896		struct ethtool_drvinfo info = {};
1897
1898		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1899			dev->ethtool_ops->get_drvinfo(dev, &info);
1900
1901		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1902		     info.driver, dev ? dev->features : 0L,
1903		     skb->sk ? skb->sk->sk_route_caps : 0L,
1904		     skb->len, skb->data_len, skb->ip_summed);
1905
1906		if (skb_header_cloned(skb) &&
1907		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1908			return ERR_PTR(err);
1909	}
1910
1911	rcu_read_lock();
1912	list_for_each_entry_rcu(ptype,
1913			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1914		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1915			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1916				err = ptype->gso_send_check(skb);
1917				segs = ERR_PTR(err);
1918				if (err || skb_gso_ok(skb, features))
1919					break;
1920				__skb_push(skb, (skb->data -
1921						 skb_network_header(skb)));
1922			}
1923			segs = ptype->gso_segment(skb, features);
1924			break;
1925		}
1926	}
1927	rcu_read_unlock();
1928
1929	__skb_push(skb, skb->data - skb_mac_header(skb));
1930
1931	return segs;
1932}
1933EXPORT_SYMBOL(skb_gso_segment);
1934
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate limited error naming the device (if any) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1947
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb has a page fragment that @dev cannot DMA from
 * (highmem page without NETIF_F_HIGHDMA, or a page beyond the parent
 * device's dma mask), 0 otherwise.  Without CONFIG_HIGHMEM the answer
 * is always 0.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			if (PageHighMem(skb_shinfo(skb)->frags[idx].page))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		parent = dev->dev.parent;
		if (!parent)
			return 0;

		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			dma_addr_t addr =
				page_to_phys(skb_shinfo(skb)->frags[idx].page);

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
1977
1978struct dev_gso_cb {
1979	void (*destructor)(struct sk_buff *skb);
1980};
1981
1982#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1983
1984static void dev_gso_skb_destructor(struct sk_buff *skb)
1985{
1986	struct dev_gso_cb *cb;
1987
1988	do {
1989		struct sk_buff *nskb = skb->next;
1990
1991		skb->next = nskb->next;
1992		nskb->next = NULL;
1993		kfree_skb(nskb);
1994	} while (skb->next);
1995
1996	cb = DEV_GSO_CB(skb);
1997	if (cb->destructor)
1998		cb->destructor(skb);
1999}
2000
2001/**
2002 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2003 *	@skb: buffer to segment
2004 *	@features: device features as applicable to this skb
2005 *
2006 *	This function segments the given skb and stores the list of segments
2007 *	in skb->next.
2008 */
2009static int dev_gso_segment(struct sk_buff *skb, int features)
2010{
2011	struct sk_buff *segs;
2012
2013	segs = skb_gso_segment(skb, features);
2014
2015	/* Verifying header integrity only. */
2016	if (!segs)
2017		return 0;
2018
2019	if (IS_ERR(segs))
2020		return PTR_ERR(segs);
2021
2022	skb->next = segs;
2023	DEV_GSO_CB(skb)->destructor = skb->destructor;
2024	skb->destructor = dev_gso_skb_destructor;
2025
2026	return 0;
2027}
2028
2029/*
2030 * Try to orphan skb early, right before transmission by the device.
2031 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2032 * is needed on driver level for other reasons, e.g. see net/can/raw.c
2033 */
2034static inline void skb_orphan_try(struct sk_buff *skb)
2035{
2036	struct sock *sk = skb->sk;
2037
2038	if (sk && !skb_shinfo(skb)->tx_flags) {
2039		/* skb_tx_hash() wont be able to get sk.
2040		 * We copy sk_hash into skb->rxhash
2041		 */
2042		if (!skb->rxhash)
2043			skb->rxhash = sk->sk_hash;
2044		skb_orphan(skb);
2045	}
2046}
2047
2048static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2049{
2050	return ((features & NETIF_F_GEN_CSUM) ||
2051		((features & NETIF_F_V4_CSUM) &&
2052		 protocol == htons(ETH_P_IP)) ||
2053		((features & NETIF_F_V6_CSUM) &&
2054		 protocol == htons(ETH_P_IPV6)) ||
2055		((features & NETIF_F_FCOE_CRC) &&
2056		 protocol == htons(ETH_P_FCOE)));
2057}
2058
2059static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2060{
2061	if (!can_checksum_protocol(features, protocol)) {
2062		features &= ~NETIF_F_ALL_CSUM;
2063		features &= ~NETIF_F_SG;
2064	} else if (illegal_highdma(skb->dev, skb)) {
2065		features &= ~NETIF_F_SG;
2066	}
2067
2068	return features;
2069}
2070
2071u32 netif_skb_features(struct sk_buff *skb)
2072{
2073	__be16 protocol = skb->protocol;
2074	u32 features = skb->dev->features;
2075
2076	if (protocol == htons(ETH_P_8021Q)) {
2077		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2078		protocol = veh->h_vlan_encapsulated_proto;
2079	} else if (!vlan_tx_tag_present(skb)) {
2080		return harmonize_features(skb, protocol, features);
2081	}
2082
2083	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2084
2085	if (protocol != htons(ETH_P_8021Q)) {
2086		return harmonize_features(skb, protocol, features);
2087	} else {
2088		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2089				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2090		return harmonize_features(skb, protocol, features);
2091	}
2092}
2093EXPORT_SYMBOL(netif_skb_features);
2094
2095/*
2096 * Returns true if either:
2097 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2098 *	2. skb is fragmented and the device does not support SG, or if
2099 *	   at least one of fragments is in highmem and device does not
2100 *	   support DMA from it.
2101 */
2102static inline int skb_needs_linearize(struct sk_buff *skb,
2103				      int features)
2104{
2105	return skb_is_nonlinear(skb) &&
2106			((skb_has_frag_list(skb) &&
2107				!(features & NETIF_F_FRAGLIST)) ||
2108			(skb_shinfo(skb)->nr_frags &&
2109				!(features & NETIF_F_SG)));
2110}
2111
2112int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2113			struct netdev_queue *txq)
2114{
2115	const struct net_device_ops *ops = dev->netdev_ops;
2116	int rc = NETDEV_TX_OK;
2117	unsigned int skb_len;
2118
2119	if (likely(!skb->next)) {
2120		u32 features;
2121
2122		/*
2123		 * If device doesn't need skb->dst, release it right now while
2124		 * its hot in this cpu cache
2125		 */
2126		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2127			skb_dst_drop(skb);
2128
2129		if (!list_empty(&ptype_all))
2130			dev_queue_xmit_nit(skb, dev);
2131
2132		skb_orphan_try(skb);
2133
2134		features = netif_skb_features(skb);
2135
2136		if (vlan_tx_tag_present(skb) &&
2137		    !(features & NETIF_F_HW_VLAN_TX)) {
2138			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2139			if (unlikely(!skb))
2140				goto out;
2141
2142			skb->vlan_tci = 0;
2143		}
2144
2145		if (netif_needs_gso(skb, features)) {
2146			if (unlikely(dev_gso_segment(skb, features)))
2147				goto out_kfree_skb;
2148			if (skb->next)
2149				goto gso;
2150		} else {
2151			if (skb_needs_linearize(skb, features) &&
2152			    __skb_linearize(skb))
2153				goto out_kfree_skb;
2154
2155			/* If packet is not checksummed and device does not
2156			 * support checksumming for this protocol, complete
2157			 * checksumming here.
2158			 */
2159			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2160				skb_set_transport_header(skb,
2161					skb_checksum_start_offset(skb));
2162				if (!(features & NETIF_F_ALL_CSUM) &&
2163				     skb_checksum_help(skb))
2164					goto out_kfree_skb;
2165			}
2166		}
2167
2168		skb_len = skb->len;
2169		rc = ops->ndo_start_xmit(skb, dev);
2170		trace_net_dev_xmit(skb, rc, dev, skb_len);
2171		if (rc == NETDEV_TX_OK)
2172			txq_trans_update(txq);
2173		return rc;
2174	}
2175
2176gso:
2177	do {
2178		struct sk_buff *nskb = skb->next;
2179
2180		skb->next = nskb->next;
2181		nskb->next = NULL;
2182
2183		/*
2184		 * If device doesn't need nskb->dst, release it right now while
2185		 * its hot in this cpu cache
2186		 */
2187		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2188			skb_dst_drop(nskb);
2189
2190		skb_len = nskb->len;
2191		rc = ops->ndo_start_xmit(nskb, dev);
2192		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2193		if (unlikely(rc != NETDEV_TX_OK)) {
2194			if (rc & ~NETDEV_TX_MASK)
2195				goto out_kfree_gso_skb;
2196			nskb->next = skb->next;
2197			skb->next = nskb;
2198			return rc;
2199		}
2200		txq_trans_update(txq);
2201		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2202			return NETDEV_TX_BUSY;
2203	} while (skb->next);
2204
2205out_kfree_gso_skb:
2206	if (likely(skb->next == NULL))
2207		skb->destructor = DEV_GSO_CB(skb)->destructor;
2208out_kfree_skb:
2209	kfree_skb(skb);
2210out:
2211	return rc;
2212}
2213
2214static u32 hashrnd __read_mostly;
2215
2216/*
2217 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2218 * to be used as a distribution range.
2219 */
2220u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2221		  unsigned int num_tx_queues)
2222{
2223	u32 hash;
2224	u16 qoffset = 0;
2225	u16 qcount = num_tx_queues;
2226
2227	if (skb_rx_queue_recorded(skb)) {
2228		hash = skb_get_rx_queue(skb);
2229		while (unlikely(hash >= num_tx_queues))
2230			hash -= num_tx_queues;
2231		return hash;
2232	}
2233
2234	if (dev->num_tc) {
2235		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2236		qoffset = dev->tc_to_txq[tc].offset;
2237		qcount = dev->tc_to_txq[tc].count;
2238	}
2239
2240	if (skb->sk && skb->sk->sk_hash)
2241		hash = skb->sk->sk_hash;
2242	else
2243		hash = (__force u16) skb->protocol ^ skb->rxhash;
2244	hash = jhash_1word(hash, hashrnd);
2245
2246	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2247}
2248EXPORT_SYMBOL(__skb_tx_hash);
2249
2250static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2251{
2252	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2253		if (net_ratelimit()) {
2254			pr_warning("%s selects TX queue %d, but "
2255				"real number of TX queues is %d\n",
2256				dev->name, queue_index, dev->real_num_tx_queues);
2257		}
2258		return 0;
2259	}
2260	return queue_index;
2261}
2262
/*
 * get_xps_queue - look up a Tx queue for @skb in the device's XPS maps.
 *
 * Uses the map of the current CPU (raw_smp_processor_id()).  A map with a
 * single queue yields that queue; otherwise one entry is picked by hashing
 * the socket hash (or protocol ^ rxhash).  Returns -1 when CONFIG_XPS is
 * off, when no map applies, or when the chosen queue is not below
 * dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int txq = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		txq = cpu_map->queues[0];
	} else {
		u32 key;

		if (skb->sk && skb->sk->sk_hash)
			key = skb->sk->sk_hash;
		else
			key = (__force u16) skb->protocol ^ skb->rxhash;
		key = jhash_1word(key, hashrnd);

		/* Scale the hash into an index of the per-CPU queue list. */
		txq = cpu_map->queues[((u64)key * cpu_map->len) >> 32];
	}

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();
	return txq;
#else
	return -1;
#endif
}
2300
2301static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2302					struct sk_buff *skb)
2303{
2304	int queue_index;
2305	const struct net_device_ops *ops = dev->netdev_ops;
2306
2307	if (dev->real_num_tx_queues == 1)
2308		queue_index = 0;
2309	else if (ops->ndo_select_queue) {
2310		queue_index = ops->ndo_select_queue(dev, skb);
2311		queue_index = dev_cap_txqueue(dev, queue_index);
2312	} else {
2313		struct sock *sk = skb->sk;
2314		queue_index = sk_tx_queue_get(sk);
2315
2316		if (queue_index < 0 || skb->ooo_okay ||
2317		    queue_index >= dev->real_num_tx_queues) {
2318			int old_index = queue_index;
2319
2320			queue_index = get_xps_queue(dev, skb);
2321			if (queue_index < 0)
2322				queue_index = skb_tx_hash(dev, skb);
2323
2324			if (queue_index != old_index && sk) {
2325				struct dst_entry *dst =
2326				    rcu_dereference_check(sk->sk_dst_cache, 1);
2327
2328				if (dst && skb_dst(skb) == dst)
2329					sk_tx_queue_set(sk, queue_index);
2330			}
2331		}
2332	}
2333
2334	skb_set_queue_mapping(skb, queue_index);
2335	return netdev_get_tx_queue(dev, queue_index);
2336}
2337
/*
 * __dev_xmit_skb - hand @skb to qdisc @q with the qdisc root lock held.
 * @skb: buffer to send
 * @q: qdisc attached to @txq
 * @dev: device the buffer is sent on
 * @txq: Tx queue chosen for @skb
 *
 * Three outcomes, decided under the root lock:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP is returned;
 *  - the qdisc allows bypass (TCQ_F_CAN_BYPASS), holds no packets and
 *    qdisc_run_begin() succeeds: the skb goes straight to
 *    sch_direct_xmit() and NET_XMIT_SUCCESS is returned;
 *  - otherwise the skb is passed to q->enqueue() and the result, masked
 *    with NET_XMIT_MASK, is returned.
 */
2338static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2339				 struct net_device *dev,
2340				 struct netdev_queue *txq)
2341{
2342	spinlock_t *root_lock = qdisc_lock(q);
2343	bool contended;
2344	int rc;
2345
2346	qdisc_skb_cb(skb)->pkt_len = skb->len;
2347	qdisc_calculate_pkt_len(skb, q);
2348	/*
2349	 * Heuristic to force contended enqueues to serialize on a
2350	 * separate lock before trying to get qdisc main lock.
2351	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2352	 * and dequeue packets faster.
2353	 */
2354	contended = qdisc_is_running(q);
2355	if (unlikely(contended))
2356		spin_lock(&q->busylock);
2357
2358	spin_lock(root_lock);
2359	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2360		kfree_skb(skb);
2361		rc = NET_XMIT_DROP;
2362	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2363		   qdisc_run_begin(q)) {
2364		/*
2365		 * This is a work-conserving queue; there are no old skbs
2366		 * waiting to be sent out; and the qdisc is not running -
2367		 * xmit the skb directly.
2368		 */
2369		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2370			skb_dst_force(skb);
2371
2372		qdisc_bstats_update(q, skb);
2373
2374		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* busylock is released before __qdisc_run() is entered */
2375			if (unlikely(contended)) {
2376				spin_unlock(&q->busylock);
2377				contended = false;
2378			}
2379			__qdisc_run(q);
2380		} else
2381			qdisc_run_end(q);
2382
2383		rc = NET_XMIT_SUCCESS;
2384	} else {
2385		skb_dst_force(skb);
2386		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2387		if (qdisc_run_begin(q)) {
			/* same as above: drop busylock before running the qdisc */
2388			if (unlikely(contended)) {
2389				spin_unlock(&q->busylock);
2390				contended = false;
2391			}
2392			__qdisc_run(q);
2393		}
2394	}
2395	spin_unlock(root_lock);
	/* still set only if busylock was taken and not released above */
2396	if (unlikely(contended))
2397		spin_unlock(&q->busylock);
2398	return rc;
2399}
2400
/*
 * Per-CPU nesting depth of dev_hard_start_xmit() calls made from the
 * queue-less path of dev_queue_xmit(); once it exceeds RECURSION_LIMIT
 * the packet is dropped and a "Dead loop" message is logged.
 */
2401static DEFINE_PER_CPU(int, xmit_recursion);
2402#define RECURSION_LIMIT 10
2403
2404/**
2405 *	dev_queue_xmit - transmit a buffer
2406 *	@skb: buffer to transmit
2407 *
2408 *	Queue a buffer for transmission to a network device. The caller must
2409 *	have set the device and priority and built the buffer before calling
2410 *	this function. The function can be called from an interrupt.
2411 *
2412 *	A negative errno code is returned on a failure. A success does not
2413 *	guarantee the frame will be transmitted as it may be dropped due
2414 *	to congestion or traffic shaping.
2415 *
2416 * -----------------------------------------------------------------------------------
2417 *      I notice this method can also return errors from the queue disciplines,
2418 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2419 *      be positive.
2420 *
2421 *      Regardless of the return value, the skb is consumed, so it is currently
2422 *      difficult to retry a send to this method.  (You can bump the ref count
2423 *      before sending to hold a reference for retry if you are careful.)
2424 *
2425 *      When calling this method, interrupts MUST be enabled.  This is because
2426 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2427 *          --BLG
2428 */
2429int dev_queue_xmit(struct sk_buff *skb)
2430{
2431	struct net_device *dev = skb->dev;
2432	struct netdev_queue *txq;
2433	struct Qdisc *q;
	/* every path below assigns rc before it is returned */
2434	int rc = -ENOMEM;
2435
2436	/* Disable soft irqs for various locks below. Also
2437	 * stops preemption for RCU.
2438	 */
2439	rcu_read_lock_bh();
2440
2441	txq = dev_pick_tx(dev, skb);
2442	q = rcu_dereference_bh(txq->qdisc);
2443
2444#ifdef CONFIG_NET_CLS_ACT
2445	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2446#endif
2447	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method takes the packet from here. */
2448	if (q->enqueue) {
2449		rc = __dev_xmit_skb(skb, q, dev, txq);
2450		goto out;
2451	}
2452
2453	/* The device has no queue. Common case for software devices:
2454	   loopback, all the sorts of tunnels...
2455
2456	   Really, it is unlikely that netif_tx_lock protection is necessary
2457	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2458	   counters.)
2459	   However, it is possible, that they rely on protection
2460	   made by us here.
2461
2462	   Check this and shot the lock. It is not prone from deadlocks.
2463	   Either shot noqueue qdisc, it is even simpler 8)
2464	 */
2465	if (dev->flags & IFF_UP) {
2466		int cpu = smp_processor_id(); /* ok because BHs are off */
2467
		/* owner == this CPU means we re-entered while holding the Tx lock */
2468		if (txq->xmit_lock_owner != cpu) {
2469
2470			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2471				goto recursion_alert;
2472
2473			HARD_TX_LOCK(dev, txq, cpu);
2474
2475			if (!netif_tx_queue_stopped(txq)) {
2476				__this_cpu_inc(xmit_recursion);
2477				rc = dev_hard_start_xmit(skb, dev, txq);
2478				__this_cpu_dec(xmit_recursion);
2479				if (dev_xmit_complete(rc)) {
2480					HARD_TX_UNLOCK(dev, txq);
2481					goto out;
2482				}
2483			}
2484			HARD_TX_UNLOCK(dev, txq);
2485			if (net_ratelimit())
2486				printk(KERN_CRIT "Virtual device %s asks to "
2487				       "queue packet!\n", dev->name);
2488		} else {
2489			/* Recursion is detected! It is possible,
2490			 * unfortunately
2491			 */
2492recursion_alert:
2493			if (net_ratelimit())
2494				printk(KERN_CRIT "Dead loop on virtual device "
2495				       "%s, fix it urgently!\n", dev->name);
2496		}
2497	}
2498
	/* Failure path: the skb was not handed off, so free it here. */
2499	rc = -ENETDOWN;
2500	rcu_read_unlock_bh();
2501
2502	kfree_skb(skb);
2503	return rc;
2504out:
2505	rcu_read_unlock_bh();
2506	return rc;
2507}
2508EXPORT_SYMBOL(dev_queue_xmit);
2509
2510
2511/*=======================================================================
2512			Receiver routines
2513  =======================================================================*/
2514
/* Length limit that enqueue_to_backlog() applies to a per-CPU input_pkt_queue. */
2515int netdev_max_backlog __read_mostly = 1000;
/*
 * Non-zero: netif_rx()/netif_receive_skb() call net_timestamp_check()
 * before the skb is queued; zero: __netif_receive_skb() does it instead.
 */
2516int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the softirq poll budget - confirm */
2517int netdev_budget __read_mostly = 300;
2518int weight_p __read_mostly = 64;            /* old backlog weight */
2519
2520/* Called with irq disabled */
2521static inline void ____napi_schedule(struct softnet_data *sd,
2522				     struct napi_struct *napi)
2523{
2524	list_add_tail(&napi->poll_list, &sd->poll_list);
2525	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2526}
2527
2528/*
2529 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2530 * and src/dst port numbers. Returns a non-zero hash number on success
2531 * and 0 on failure.
2532 */
2533__u32 __skb_get_rxhash(struct sk_buff *skb)
2534{
2535	int nhoff, hash = 0, poff;
2536	const struct ipv6hdr *ip6;
2537	const struct iphdr *ip;
2538	u8 ip_proto;
2539	u32 addr1, addr2, ihl;
2540	union {
2541		u32 v32;
2542		u16 v16[2];
2543	} ports;
2544
2545	nhoff = skb_network_offset(skb);
2546
2547	switch (skb->protocol) {
2548	case __constant_htons(ETH_P_IP):
2549		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2550			goto done;
2551
2552		ip = (const struct iphdr *) (skb->data + nhoff);
2553		if (ip_is_fragment(ip))
2554			ip_proto = 0;
2555		else
2556			ip_proto = ip->protocol;
2557		addr1 = (__force u32) ip->saddr;
2558		addr2 = (__force u32) ip->daddr;
2559		ihl = ip->ihl;
2560		break;
2561	case __constant_htons(ETH_P_IPV6):
2562		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2563			goto done;
2564
2565		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2566		ip_proto = ip6->nexthdr;
2567		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2568		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2569		ihl = (40 >> 2);
2570		break;
2571	default:
2572		goto done;
2573	}
2574
2575	ports.v32 = 0;
2576	poff = proto_ports_offset(ip_proto);
2577	if (poff >= 0) {
2578		nhoff += ihl * 4 + poff;
2579		if (pskb_may_pull(skb, nhoff + 4)) {
2580			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2581			if (ports.v16[1] < ports.v16[0])
2582				swap(ports.v16[0], ports.v16[1]);
2583		}
2584	}
2585
2586	/* get a consistent hash (same value on both flow directions) */
2587	if (addr2 < addr1)
2588		swap(addr1, addr2);
2589
2590	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2591	if (!hash)
2592		hash = 1;
2593
2594done:
2595	return hash;
2596}
2597EXPORT_SYMBOL(__skb_get_rxhash);
2598
2599#ifdef CONFIG_RPS
2600
2601/* One global table that all flow-based protocols share. */
2602struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2603EXPORT_SYMBOL(rps_sock_flow_table);
2604
/*
 * set_rps_cpu - point flow entry @rflow at CPU @next_cpu.
 * @dev: receiving device
 * @skb: packet of the flow
 * @rflow: flow table entry currently used for the flow
 * @next_cpu: CPU to record (may be RPS_NO_CPU)
 *
 * With CONFIG_RFS_ACCEL, and when the device offers a CPU reverse map and
 * NETIF_F_NTUPLE, the driver is asked through ndo_rx_flow_steer() to steer
 * the flow to the Rx queue looked up for @next_cpu; on success the entry
 * in that queue's flow table replaces @rflow.  Unless @next_cpu is
 * RPS_NO_CPU, last_qtail of the resulting entry is set to the target
 * CPU's input_queue_head.
 *
 * Returns the flow entry now in use (either @rflow or the new one).
 */
2605static struct rps_dev_flow *
2606set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2607	    struct rps_dev_flow *rflow, u16 next_cpu)
2608{
2609	u16 tcpu;
2610
2611	tcpu = rflow->cpu = next_cpu;
2612	if (tcpu != RPS_NO_CPU) {
2613#ifdef CONFIG_RFS_ACCEL
2614		struct netdev_rx_queue *rxqueue;
2615		struct rps_dev_flow_table *flow_table;
2616		struct rps_dev_flow *old_rflow;
2617		u32 flow_id;
2618		u16 rxq_index;
2619		int rc;
2620
2621		/* Should we steer this flow to a different hardware queue? */
2622		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2623		    !(dev->features & NETIF_F_NTUPLE))
2624			goto out;
2625		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* already arriving on the queue looked up for next_cpu */
2626		if (rxq_index == skb_get_rx_queue(skb))
2627			goto out;
2628
2629		rxqueue = dev->_rx + rxq_index;
2630		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2631		if (!flow_table)
2632			goto out;
2633		flow_id = skb->rxhash & flow_table->mask;
		/* a non-negative return value is stored as the filter id below */
2634		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2635							rxq_index, flow_id);
2636		if (rc < 0)
2637			goto out;
2638		old_rflow = rflow;
2639		rflow = &flow_table->flows[flow_id];
2640		rflow->cpu = next_cpu;
2641		rflow->filter = rc;
		/* the old entry no longer owns a filter with this id */
2642		if (old_rflow->filter == rflow->filter)
2643			old_rflow->filter = RPS_NO_FILTER;
2644	out:
2645#endif
2646		rflow->last_qtail =
2647			per_cpu(softnet_data, tcpu).input_queue_head;
2648	}
2649
2650	return rflow;
2651}
2652
2653/*
2654 * get_rps_cpu is called from netif_receive_skb and returns the target
2655 * CPU from the RPS map of the receiving queue for a given skb.
2656 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no (online) target CPU is selected.  *rflowp is
 * written only when the CPU comes from the flow table.
2657 */
2658static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2659		       struct rps_dev_flow **rflowp)
2660{
2661	struct netdev_rx_queue *rxqueue;
2662	struct rps_map *map;
2663	struct rps_dev_flow_table *flow_table;
2664	struct rps_sock_flow_table *sock_flow_table;
2665	int cpu = -1;
2666	u16 tcpu;
2667
	/* Find the Rx queue the packet arrived on; fall back to queue 0. */
2668	if (skb_rx_queue_recorded(skb)) {
2669		u16 index = skb_get_rx_queue(skb);
2670		if (unlikely(index >= dev->real_num_rx_queues)) {
2671			WARN_ONCE(dev->real_num_rx_queues > 1,
2672				  "%s received packet on queue %u, but number "
2673				  "of RX queues is %u\n",
2674				  dev->name, index, dev->real_num_rx_queues);
2675			goto done;
2676		}
2677		rxqueue = dev->_rx + index;
2678	} else
2679		rxqueue = dev->_rx;
2680
2681	map = rcu_dereference(rxqueue->rps_map);
2682	if (map) {
		/* single-CPU map and no flow table: no hash is needed */
2683		if (map->len == 1 &&
2684		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2685			tcpu = map->cpus[0];
2686			if (cpu_online(tcpu))
2687				cpu = tcpu;
2688			goto done;
2689		}
2690	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
		/* neither a map nor a flow table is configured for this queue */
2691		goto done;
2692	}
2693
2694	skb_reset_network_header(skb);
	/* a zero rxhash means no hash is available */
2695	if (!skb_get_rxhash(skb))
2696		goto done;
2697
2698	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2699	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2700	if (flow_table && sock_flow_table) {
2701		u16 next_cpu;
2702		struct rps_dev_flow *rflow;
2703
2704		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2705		tcpu = rflow->cpu;
2706
2707		next_cpu = sock_flow_table->ents[skb->rxhash &
2708		    sock_flow_table->mask];
2709
2710		/*
2711		 * If the desired CPU (where last recvmsg was done) is
2712		 * different from current CPU (one in the rx-queue flow
2713		 * table entry), switch if one of the following holds:
2714		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2715		 *   - Current CPU is offline.
2716		 *   - The current CPU's queue tail has advanced beyond the
2717		 *     last packet that was enqueued using this table entry.
2718		 *     This guarantees that all previous packets for the flow
2719		 *     have been dequeued, thus preserving in order delivery.
2720		 */
2721		if (unlikely(tcpu != next_cpu) &&
2722		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2723		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2724		      rflow->last_qtail)) >= 0))
2725			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2726
		/* note: tcpu still holds the value read before set_rps_cpu() */
2727		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2728			*rflowp = rflow;
2729			cpu = tcpu;
2730			goto done;
2731		}
2732	}
2733
	/* Fall back to the RPS map: scale the hash into the CPU list. */
2734	if (map) {
2735		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2736
2737		if (cpu_online(tcpu)) {
2738			cpu = tcpu;
2739			goto done;
2740		}
2741	}
2742
2743done:
2744	return cpu;
2745}
2746
2747#ifdef CONFIG_RFS_ACCEL
2748
2749/**
2750 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2751 * @dev: Device on which the filter was set
2752 * @rxq_index: RX queue index
2753 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2754 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2755 *
2756 * Drivers that implement ndo_rx_flow_steer() should periodically call
2757 * this function for each installed filter and remove the filters for
2758 * which it returns %true.
2759 */
2760bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2761			 u32 flow_id, u16 filter_id)
2762{
2763	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2764	struct rps_dev_flow_table *flow_table;
2765	struct rps_dev_flow *rflow;
2766	bool expire = true;
2767	int cpu;
2768
2769	rcu_read_lock();
2770	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2771	if (flow_table && flow_id <= flow_table->mask) {
2772		rflow = &flow_table->flows[flow_id];
2773		cpu = ACCESS_ONCE(rflow->cpu);
2774		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2775		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2776			   rflow->last_qtail) <
2777		     (int)(10 * flow_table->mask)))
2778			expire = false;
2779	}
2780	rcu_read_unlock();
2781	return expire;
2782}
2783EXPORT_SYMBOL(rps_may_expire_flow);
2784
2785#endif /* CONFIG_RFS_ACCEL */
2786
2787/* Called from hardirq (IPI) context */
2788static void rps_trigger_softirq(void *data)
2789{
2790	struct softnet_data *sd = data;
2791
2792	____napi_schedule(sd, &sd->backlog);
2793	sd->received_rps++;
2794}
2795
2796#endif /* CONFIG_RPS */
2797
/*
 * rps_ipi_queued - queue a remote softnet_data on this CPU's IPI list.
 *
 * If @sd belongs to another CPU, link it at the head of the local
 * rps_ipi_list, raise NET_RX_SOFTIRQ and return 1.  If @sd is the local
 * softnet_data (or CONFIG_RPS is off), do nothing and return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2818
2819/*
2820 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2821 * queue (may be a remote CPU queue).
2822 */
2823static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2824			      unsigned int *qtail)
2825{
2826	struct softnet_data *sd;
2827	unsigned long flags;
2828
2829	sd = &per_cpu(softnet_data, cpu);
2830
2831	local_irq_save(flags);
2832
2833	rps_lock(sd);
2834	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2835		if (skb_queue_len(&sd->input_pkt_queue)) {
2836enqueue:
2837			__skb_queue_tail(&sd->input_pkt_queue, skb);
2838			input_queue_tail_incr_save(sd, qtail);
2839			rps_unlock(sd);
2840			local_irq_restore(flags);
2841			return NET_RX_SUCCESS;
2842		}
2843
2844		/* Schedule NAPI for backlog device
2845		 * We can use non atomic operation since we own the queue lock
2846		 */
2847		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2848			if (!rps_ipi_queued(sd))
2849				____napi_schedule(sd, &sd->backlog);
2850		}
2851		goto enqueue;
2852	}
2853
2854	sd->dropped++;
2855	rps_unlock(sd);
2856
2857	local_irq_restore(flags);
2858
2859	atomic_long_inc(&skb->dev->rx_dropped);
2860	kfree_skb(skb);
2861	return NET_RX_DROP;
2862}
2863
2864/**
2865 *	netif_rx	-	post buffer to the network code
2866 *	@skb: buffer to post
2867 *
2868 *	This function receives a packet from a device driver and queues it for
2869 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2870 *	may be dropped during processing for congestion control or by the
2871 *	protocol layers.
2872 *
2873 *	return values:
2874 *	NET_RX_SUCCESS	(no congestion)
2875 *	NET_RX_DROP     (packet was dropped)
2876 *
2877 */
2878
2879int netif_rx(struct sk_buff *skb)
2880{
2881	int ret;
2882
2883	/* if netpoll wants it, pretend we never saw it */
2884	if (netpoll_rx(skb))
2885		return NET_RX_DROP;
2886
2887	if (netdev_tstamp_prequeue)
2888		net_timestamp_check(skb);
2889
2890	trace_netif_rx(skb);
2891#ifdef CONFIG_RPS
2892	{
2893		struct rps_dev_flow voidflow, *rflow = &voidflow;
2894		int cpu;
2895
2896		preempt_disable();
2897		rcu_read_lock();
2898
2899		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2900		if (cpu < 0)
2901			cpu = smp_processor_id();
2902
2903		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2904
2905		rcu_read_unlock();
2906		preempt_enable();
2907	}
2908#else
2909	{
2910		unsigned int qtail;
2911		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2912		put_cpu();
2913	}
2914#endif
2915	return ret;
2916}
2917EXPORT_SYMBOL(netif_rx);
2918
/*
 * netif_rx_ni - netif_rx() followed by running any pending softirqs.
 *
 * Preemption is disabled around the netif_rx() call and the do_softirq()
 * that follows when local_softirq_pending() reports work.  Returns the
 * netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();
	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2932
2933static void net_tx_action(struct softirq_action *h)
2934{
2935	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2936
2937	if (sd->completion_queue) {
2938		struct sk_buff *clist;
2939
2940		local_irq_disable();
2941		clist = sd->completion_queue;
2942		sd->completion_queue = NULL;
2943		local_irq_enable();
2944
2945		while (clist) {
2946			struct sk_buff *skb = clist;
2947			clist = clist->next;
2948
2949			WARN_ON(atomic_read(&skb->users));
2950			trace_kfree_skb(skb, net_tx_action);
2951			__kfree_skb(skb);
2952		}
2953	}
2954
2955	if (sd->output_queue) {
2956		struct Qdisc *head;
2957
2958		local_irq_disable();
2959		head = sd->output_queue;
2960		sd->output_queue = NULL;
2961		sd->output_queue_tailp = &sd->output_queue;
2962		local_irq_enable();
2963
2964		while (head) {
2965			struct Qdisc *q = head;
2966			spinlock_t *root_lock;
2967
2968			head = head->next_sched;
2969
2970			root_lock = qdisc_lock(q);
2971			if (spin_trylock(root_lock)) {
2972				smp_mb__before_clear_bit();
2973				clear_bit(__QDISC_STATE_SCHED,
2974					  &q->state);
2975				qdisc_run(q);
2976				spin_unlock(root_lock);
2977			} else {
2978				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2979					      &q->state)) {
2980					__netif_reschedule(q);
2981				} else {
2982					smp_mb__before_clear_bit();
2983					clear_bit(__QDISC_STATE_SCHED,
2984						  &q->state);
2985				}
2986			}
2987		}
2988	}
2989}
2990
2991#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2992    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2993/* This hook is defined here for ATM LANE */
/*
 * Function pointer taking a device and an address; only built when both
 * bridge and ATM LANE support are configured.  It is not assigned in this
 * part of the file - NOTE(review): presumably set by the bridge code and
 * called by ATM LANE; confirm against those users.
 */
2994int (*br_fdb_test_addr_hook)(struct net_device *dev,
2995			     unsigned char *addr) __read_mostly;
2996EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2997#endif
2998
2999#ifdef CONFIG_NET_CLS_ACT
3000/* TODO: Maybe we should just force sch_ingress to be compiled in
3001 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3002 * a compare and 2 stores extra right now if we dont have it on
3003 * but have CONFIG_NET_CLS_ACT
3004 * NOTE: This doesn't stop any functionality; if you dont have
3005 * the ingress scheduler, you just can't add policies on ingress.
3006 *
3007 */
3008static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3009{
3010	struct net_device *dev = skb->dev;
3011	u32 ttl = G_TC_RTTL(skb->tc_verd);
3012	int result = TC_ACT_OK;
3013	struct Qdisc *q;
3014
3015	if (unlikely(MAX_RED_LOOP < ttl++)) {
3016		if (net_ratelimit())
3017			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3018			       skb->skb_iif, dev->ifindex);
3019		return TC_ACT_SHOT;
3020	}
3021
3022	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3023	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3024
3025	q = rxq->qdisc;
3026	if (q != &noop_qdisc) {
3027		spin_lock(qdisc_lock(q));
3028		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3029			result = qdisc_enqueue_root(skb, q);
3030		spin_unlock(qdisc_lock(q));
3031	}
3032
3033	return result;
3034}
3035
3036static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3037					 struct packet_type **pt_prev,
3038					 int *ret, struct net_device *orig_dev)
3039{
3040	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3041
3042	if (!rxq || rxq->qdisc == &noop_qdisc)
3043		goto out;
3044
3045	if (*pt_prev) {
3046		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3047		*pt_prev = NULL;
3048	}
3049
3050	switch (ing_filter(skb, rxq)) {
3051	case TC_ACT_SHOT:
3052	case TC_ACT_STOLEN:
3053		kfree_skb(skb);
3054		return NULL;
3055	}
3056
3057out:
3058	skb->tc_verd = 0;
3059	return skb;
3060}
3061#endif
3062
3063/**
3064 *	netdev_rx_handler_register - register receive handler
3065 *	@dev: device to register a handler for
3066 *	@rx_handler: receive handler to register
3067 *	@rx_handler_data: data pointer that is used by rx handler
3068 *
3069 *	Register a receive hander for a device. This handler will then be
3070 *	called from __netif_receive_skb. A negative errno code is returned
3071 *	on a failure.
3072 *
3073 *	The caller must hold the rtnl_mutex.
3074 *
3075 *	For a general description of rx_handler, see enum rx_handler_result.
3076 */
3077int netdev_rx_handler_register(struct net_device *dev,
3078			       rx_handler_func_t *rx_handler,
3079			       void *rx_handler_data)
3080{
3081	ASSERT_RTNL();
3082
3083	if (dev->rx_handler)
3084		return -EBUSY;
3085
3086	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3087	rcu_assign_pointer(dev->rx_handler, rx_handler);
3088
3089	return 0;
3090}
3091EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3092
3093/**
3094 *	netdev_rx_handler_unregister - unregister receive handler
3095 *	@dev: device to unregister a handler from
3096 *
3097 *	Unregister a receive hander from a device.
3098 *
3099 *	The caller must hold the rtnl_mutex.
3100 */
3101void netdev_rx_handler_unregister(struct net_device *dev)
3102{
3103
3104	ASSERT_RTNL();
3105	rcu_assign_pointer(dev->rx_handler, NULL);
3106	rcu_assign_pointer(dev->rx_handler_data, NULL);
3107}
3108EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3109
/*
 * __netif_receive_skb - run one received skb through the delivery chain.
 * @skb: buffer to deliver
 *
 * Under rcu_read_lock() the skb is offered, in this order, to: the taps
 * on ptype_all, ingress classification (CONFIG_NET_CLS_ACT), the device's
 * rx_handler, VLAN receive processing, and the protocol handlers hashed
 * in ptype_base.  Delivery is deferred by one step through pt_prev:
 * earlier handlers are called via deliver_skb(), the last one is called
 * directly through pt_prev->func().
 *
 * Returns NET_RX_DROP when netpoll claims the skb or when no handler is
 * found (the skb is then freed); otherwise the result of the most recent
 * delivery.
 */
3110static int __netif_receive_skb(struct sk_buff *skb)
3111{
3112	struct packet_type *ptype, *pt_prev;
3113	rx_handler_func_t *rx_handler;
3114	struct net_device *orig_dev;
3115	struct net_device *null_or_dev;
3116	bool deliver_exact = false;
3117	int ret = NET_RX_DROP;
3118	__be16 type;
3119
3120	if (!netdev_tstamp_prequeue)
3121		net_timestamp_check(skb);
3122
3123	trace_netif_receive_skb(skb);
3124
3125	/* if we've gotten here through NAPI, check netpoll */
3126	if (netpoll_receive_skb(skb))
3127		return NET_RX_DROP;
3128
3129	if (!skb->skb_iif)
3130		skb->skb_iif = skb->dev->ifindex;
	/* remember the device the skb arrived with; skb->dev is re-read below */
3131	orig_dev = skb->dev;
3132
3133	skb_reset_network_header(skb);
3134	skb_reset_transport_header(skb);
3135	skb_reset_mac_len(skb);
3136
3137	pt_prev = NULL;
3138
3139	rcu_read_lock();
3140
	/* re-entered when the rx_handler returns RX_HANDLER_ANOTHER */
3141another_round:
3142
3143	__this_cpu_inc(softnet_data.processed);
3144
3145	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3146		skb = vlan_untag(skb);
3147		if (unlikely(!skb))
3148			goto out;
3149	}
3150
3151#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and ingress classification */
3152	if (skb->tc_verd & TC_NCLS) {
3153		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3154		goto ncls;
3155	}
3156#endif
3157
	/* taps registered for all protocols, any device or this device */
3158	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3159		if (!ptype->dev || ptype->dev == skb->dev) {
3160			if (pt_prev)
3161				ret = deliver_skb(skb, pt_prev, orig_dev);
3162			pt_prev = ptype;
3163		}
3164	}
3165
3166#ifdef CONFIG_NET_CLS_ACT
3167	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3168	if (!skb)
3169		goto out;
3170ncls:
3171#endif
3172
3173	rx_handler = rcu_dereference(skb->dev->rx_handler);
3174	if (rx_handler) {
3175		if (pt_prev) {
3176			ret = deliver_skb(skb, pt_prev, orig_dev);
3177			pt_prev = NULL;
3178		}
		/* the handler receives &skb and may replace the skb pointer */
3179		switch (rx_handler(&skb)) {
3180		case RX_HANDLER_CONSUMED:
3181			goto out;
3182		case RX_HANDLER_ANOTHER:
3183			goto another_round;
3184		case RX_HANDLER_EXACT:
3185			deliver_exact = true;
			/* fall through */
3186		case RX_HANDLER_PASS:
3187			break;
3188		default:
3189			BUG();
3190		}
3191	}
3192
3193	if (vlan_tx_tag_present(skb)) {
3194		if (pt_prev) {
3195			ret = deliver_skb(skb, pt_prev, orig_dev);
3196			pt_prev = NULL;
3197		}
		/* non-zero return: restart reception of the (updated) skb */
3198		if (vlan_do_receive(&skb)) {
3199			ret = __netif_receive_skb(skb);
3200			goto out;
3201		} else if (unlikely(!skb))
3202			goto out;
3203	}
3204
3205	/* deliver only exact match when indicated */
3206	null_or_dev = deliver_exact ? skb->dev : NULL;
3207
3208	type = skb->protocol;
3209	list_for_each_entry_rcu(ptype,
3210			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3211		if (ptype->type == type &&
3212		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3213		     ptype->dev == orig_dev)) {
3214			if (pt_prev)
3215				ret = deliver_skb(skb, pt_prev, orig_dev);
3216			pt_prev = ptype;
3217		}
3218	}
3219
	/* the last matching handler is called directly, not via deliver_skb() */
3220	if (pt_prev) {
3221		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3222	} else {
3223		atomic_long_inc(&skb->dev->rx_dropped);
3224		kfree_skb(skb);
3225		/* Jamal, now you will not able to escape explaining
3226		 * me how you were going to use this. :-)
3227		 */
3228		ret = NET_RX_DROP;
3229	}
3230
3231out:
3232	rcu_read_unlock();
3233	return ret;
3234}
3235
3236/**
3237 *	netif_receive_skb - process receive buffer from network
3238 *	@skb: buffer to process
3239 *
3240 *	netif_receive_skb() is the main receive data processing function.
3241 *	It always succeeds. The buffer may be dropped during processing
3242 *	for congestion control or by the protocol layers.
3243 *
3244 *	This function may only be called from softirq context and interrupts
3245 *	should be enabled.
3246 *
3247 *	Return values (usually ignored):
3248 *	NET_RX_SUCCESS: no congestion
3249 *	NET_RX_DROP: packet was dropped
3250 */
3251int netif_receive_skb(struct sk_buff *skb)
3252{
3253	if (netdev_tstamp_prequeue)
3254		net_timestamp_check(skb);
3255
3256	if (skb_defer_rx_timestamp(skb))
3257		return NET_RX_SUCCESS;
3258
3259#ifdef CONFIG_RPS
3260	{
3261		struct rps_dev_flow voidflow, *rflow = &voidflow;
3262		int cpu, ret;
3263
3264		rcu_read_lock();
3265
3266		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3267
3268		if (cpu >= 0) {
3269			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3270			rcu_read_unlock();
3271		} else {
3272			rcu_read_unlock();
3273			ret = __netif_receive_skb(skb);
3274		}
3275
3276		return ret;
3277	}
3278#else
3279	return __netif_receive_skb(skb);
3280#endif
3281}
3282EXPORT_SYMBOL(netif_receive_skb);
3283
3284/* Network device is going away, flush any packets still pending
3285 * Called with irqs disabled.
3286 */
3287static void flush_backlog(void *arg)
3288{
3289	struct net_device *dev = arg;
3290	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3291	struct sk_buff *skb, *tmp;
3292
3293	rps_lock(sd);
3294	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3295		if (skb->dev == dev) {
3296			__skb_unlink(skb, &sd->input_pkt_queue);
3297			kfree_skb(skb);
3298			input_queue_head_incr(sd);
3299		}
3300	}
3301	rps_unlock(sd);
3302
3303	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3304		if (skb->dev == dev) {
3305			__skb_unlink(skb, &sd->process_queue);
3306			kfree_skb(skb);
3307			input_queue_head_incr(sd);
3308		}
3309	}
3310}
3311
3312static int napi_gro_complete(struct sk_buff *skb)
3313{
3314	struct packet_type *ptype;
3315	__be16 type = skb->protocol;
3316	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3317	int err = -ENOENT;
3318
3319	if (NAPI_GRO_CB(skb)->count == 1) {
3320		skb_shinfo(skb)->gso_size = 0;
3321		goto out;
3322	}
3323
3324	rcu_read_lock();
3325	list_for_each_entry_rcu(ptype, head, list) {
3326		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3327			continue;
3328
3329		err = ptype->gro_complete(skb);
3330		break;
3331	}
3332	rcu_read_unlock();
3333
3334	if (err) {
3335		WARN_ON(&ptype->list == head);
3336		kfree_skb(skb);
3337		return NET_RX_SUCCESS;
3338	}
3339
3340out:
3341	return netif_receive_skb(skb);
3342}
3343
3344inline void napi_gro_flush(struct napi_struct *napi)
3345{
3346	struct sk_buff *skb, *next;
3347
3348	for (skb = napi->gro_list; skb; skb = next) {
3349		next = skb->next;
3350		skb->next = NULL;
3351		napi_gro_complete(skb);
3352	}
3353
3354	napi->gro_count = 0;
3355	napi->gro_list = NULL;
3356}
3357EXPORT_SYMBOL(napi_gro_flush);
3358
/*
 * dev_gro_receive - offer @skb to the GRO handler for its protocol.
 * @napi: NAPI context whose gro_list holds the flows being aggregated
 * @skb: newly received buffer
 *
 * Returns:
 *  GRO_NORMAL      - GRO is not applicable (device feature off, netpoll
 *                    active, skb already GSO or with a frag list, no
 *                    gro_receive handler for the protocol, the handler
 *                    asked for a flush, or the list already holds
 *                    MAX_GRO_SKBS entries);
 *  GRO_MERGED /
 *  GRO_MERGED_FREE - the handler marked the skb as same_flow; the _FREE
 *                    variant is used when the handler set ->free;
 *  GRO_HELD        - the skb was put at the head of napi->gro_list.
 *
 * For GRO_NORMAL and GRO_HELD, header bytes up to skb_gro_offset() that
 * are not yet in the linear area are copied there from frag0.
 */
3359enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3360{
3361	struct sk_buff **pp = NULL;
3362	struct packet_type *ptype;
3363	__be16 type = skb->protocol;
3364	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3365	int same_flow;
3366	int mac_len;
3367	enum gro_result ret;
3368
3369	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3370		goto normal;
3371
3372	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3373		goto normal;
3374
3375	rcu_read_lock();
3376	list_for_each_entry_rcu(ptype, head, list) {
		/* only device-independent handlers that implement gro_receive */
3377		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3378			continue;
3379
3380		skb_set_network_header(skb, skb_gro_offset(skb));
3381		mac_len = skb->network_header - skb->mac_header;
3382		skb->mac_len = mac_len;
3383		NAPI_GRO_CB(skb)->same_flow = 0;
3384		NAPI_GRO_CB(skb)->flush = 0;
3385		NAPI_GRO_CB(skb)->free = 0;
3386
3387		pp = ptype->gro_receive(&napi->gro_list, skb);
3388		break;
3389	}
3390	rcu_read_unlock();
3391
	/* the loop ran to the end of the list: no handler was found */
3392	if (&ptype->list == head)
3393		goto normal;
3394
3395	same_flow = NAPI_GRO_CB(skb)->same_flow;
3396	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3397
	/* a non-NULL pp points at a held skb to unlink and complete now */
3398	if (pp) {
3399		struct sk_buff *nskb = *pp;
3400
3401		*pp = nskb->next;
3402		nskb->next = NULL;
3403		napi_gro_complete(nskb);
3404		napi->gro_count--;
3405	}
3406
3407	if (same_flow)
3408		goto ok;
3409
3410	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3411		goto normal;
3412
	/* start holding this skb at the head of the GRO list */
3413	napi->gro_count++;
3414	NAPI_GRO_CB(skb)->count = 1;
3415	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3416	skb->next = napi->gro_list;
3417	napi->gro_list = skb;
3418	ret = GRO_HELD;
3419
3420pull:
	/* copy the part of the headers still in frag0 into the linear area */
3421	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3422		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3423
3424		BUG_ON(skb->end - skb->tail < grow);
3425
3426		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3427
3428		skb->tail += grow;
3429		skb->data_len -= grow;
3430
3431		skb_shinfo(skb)->frags[0].page_offset += grow;
3432		skb_shinfo(skb)->frags[0].size -= grow;
3433
		/* first fragment fully consumed: release it and shift the rest down */
3434		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3435			put_page(skb_shinfo(skb)->frags[0].page);
3436			memmove(skb_shinfo(skb)->frags,
3437				skb_shinfo(skb)->frags + 1,
3438				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3439		}
3440	}
3441
3442ok:
3443	return ret;
3444
3445normal:
3446	ret = GRO_NORMAL;
3447	goto pull;
3448}
3449EXPORT_SYMBOL(dev_gro_receive);
3450
3451static inline gro_result_t
3452__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3453{
3454	struct sk_buff *p;
3455
3456	for (p = napi->gro_list; p; p = p->next) {
3457		unsigned long diffs;
3458
3459		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3460		diffs |= p->vlan_tci ^ skb->vlan_tci;
3461		diffs |= compare_ether_header(skb_mac_header(p),
3462					      skb_gro_mac_header(skb));
3463		NAPI_GRO_CB(p)->same_flow = !diffs;
3464		NAPI_GRO_CB(p)->flush = 0;
3465	}
3466
3467	return dev_gro_receive(napi, skb);
3468}
3469
3470gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3471{
3472	switch (ret) {
3473	case GRO_NORMAL:
3474		if (netif_receive_skb(skb))
3475			ret = GRO_DROP;
3476		break;
3477
3478	case GRO_DROP:
3479	case GRO_MERGED_FREE:
3480		kfree_skb(skb);
3481		break;
3482
3483	case GRO_HELD:
3484	case GRO_MERGED:
3485		break;
3486	}
3487
3488	return ret;
3489}
3490EXPORT_SYMBOL(napi_skb_finish);
3491
3492void skb_gro_reset_offset(struct sk_buff *skb)
3493{
3494	NAPI_GRO_CB(skb)->data_offset = 0;
3495	NAPI_GRO_CB(skb)->frag0 = NULL;
3496	NAPI_GRO_CB(skb)->frag0_len = 0;
3497
3498	if (skb->mac_header == skb->tail &&
3499	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3500		NAPI_GRO_CB(skb)->frag0 =
3501			page_address(skb_shinfo(skb)->frags[0].page) +
3502			skb_shinfo(skb)->frags[0].page_offset;
3503		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3504	}
3505}
3506EXPORT_SYMBOL(skb_gro_reset_offset);
3507
3508gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3509{
3510	skb_gro_reset_offset(skb);
3511
3512	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3513}
3514EXPORT_SYMBOL(napi_gro_receive);
3515
3516static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3517{
3518	__skb_pull(skb, skb_headlen(skb));
3519	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3520	skb->vlan_tci = 0;
3521	skb->dev = napi->dev;
3522	skb->skb_iif = 0;
3523
3524	napi->skb = skb;
3525}
3526
3527struct sk_buff *napi_get_frags(struct napi_struct *napi)
3528{
3529	struct sk_buff *skb = napi->skb;
3530
3531	if (!skb) {
3532		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3533		if (skb)
3534			napi->skb = skb;
3535	}
3536	return skb;
3537}
3538EXPORT_SYMBOL(napi_get_frags);
3539
3540gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3541			       gro_result_t ret)
3542{
3543	switch (ret) {
3544	case GRO_NORMAL:
3545	case GRO_HELD:
3546		skb->protocol = eth_type_trans(skb, skb->dev);
3547
3548		if (ret == GRO_HELD)
3549			skb_gro_pull(skb, -ETH_HLEN);
3550		else if (netif_receive_skb(skb))
3551			ret = GRO_DROP;
3552		break;
3553
3554	case GRO_DROP:
3555	case GRO_MERGED_FREE:
3556		napi_reuse_skb(napi, skb);
3557		break;
3558
3559	case GRO_MERGED:
3560		break;
3561	}
3562
3563	return ret;
3564}
3565EXPORT_SYMBOL(napi_frags_finish);
3566
3567struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3568{
3569	struct sk_buff *skb = napi->skb;
3570	struct ethhdr *eth;
3571	unsigned int hlen;
3572	unsigned int off;
3573
3574	napi->skb = NULL;
3575
3576	skb_reset_mac_header(skb);
3577	skb_gro_reset_offset(skb);
3578
3579	off = skb_gro_offset(skb);
3580	hlen = off + sizeof(*eth);
3581	eth = skb_gro_header_fast(skb, off);
3582	if (skb_gro_header_hard(skb, hlen)) {
3583		eth = skb_gro_header_slow(skb, hlen, off);
3584		if (unlikely(!eth)) {
3585			napi_reuse_skb(napi, skb);
3586			skb = NULL;
3587			goto out;
3588		}
3589	}
3590
3591	skb_gro_pull(skb, sizeof(*eth));
3592
3593	/*
3594	 * This works because the only protocols we care about don't require
3595	 * special handling.  We'll fix it up properly at the end.
3596	 */
3597	skb->protocol = eth->h_proto;
3598
3599out:
3600	return skb;
3601}
3602EXPORT_SYMBOL(napi_frags_skb);
3603
3604gro_result_t napi_gro_frags(struct napi_struct *napi)
3605{
3606	struct sk_buff *skb = napi_frags_skb(napi);
3607
3608	if (!skb)
3609		return GRO_DROP;
3610
3611	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3612}
3613EXPORT_SYMBOL(napi_gro_frags);
3614
/*
 * net_rps_action_and_irq_enable - send any pending RPS IPIs.
 *
 * Must be entered with local interrupts disabled; always returns with
 * them enabled.  With CONFIG_RPS, the softnet_data entries chained on
 * sd->rps_ipi_list are detached and each online target CPU is sent its
 * call-function IPI.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (!pending) {
		local_irq_enable();
		return;
	}

	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Kick RPS processing on the remote CPUs. */
	do {
		struct softnet_data *following = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			__smp_call_function_single(pending->cpu,
						   &pending->csd, 0);
		pending = following;
	} while (pending);
#else
	local_irq_enable();
#endif
}
3642
/*
 * process_backlog - drain the backlog queues of the softnet_data that
 * embeds @napi (as its ->backlog member).
 *
 * Delivers up to @quota skbs from sd->process_queue through
 * __netif_receive_skb(), refilling process_queue from sd->input_pkt_queue
 * under rps_lock() whenever it runs dry.  Interrupts are disabled while the
 * queues are manipulated and enabled around each delivery.  Once fewer
 * packets remain than the quota allows, the napi is removed from the poll
 * list and its state cleared.
 *
 * Returns the number of skbs processed.
 */
3643static int process_backlog(struct napi_struct *napi, int quota)
3644{
3645	int work = 0;
3646	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3647
3648#ifdef CONFIG_RPS
3649	/* Check if we have pending ipi, its better to send them now,
3650	 * not waiting net_rx_action() end.
3651	 */
3652	if (sd->rps_ipi_list) {
3653		local_irq_disable();
3654		net_rps_action_and_irq_enable(sd);
3655	}
3656#endif
3657	napi->weight = weight_p;
3658	local_irq_disable();
3659	while (work < quota) {
3660		struct sk_buff *skb;
3661		unsigned int qlen;
3662
3663		while ((skb = __skb_dequeue(&sd->process_queue))) {
3664			local_irq_enable();
3665			__netif_receive_skb(skb);
3666			local_irq_disable();
3667			input_queue_head_incr(sd);
			/* Quota used up: return with interrupts enabled again. */
3668			if (++work >= quota) {
3669				local_irq_enable();
3670				return work;
3671			}
3672		}
3673
		/* process_queue is empty: move over everything queued as input. */
3674		rps_lock(sd);
3675		qlen = skb_queue_len(&sd->input_pkt_queue);
3676		if (qlen)
3677			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3678						   &sd->process_queue);
3679
3680		if (qlen < quota - work) {
3681			/*
3682			 * Inline a custom version of __napi_complete().
3683			 * only current cpu owns and manipulates this napi,
3684			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3685			 * we can use a plain write instead of clear_bit(),
3686			 * and we dont need an smp_mb() memory barrier.
3687			 */
3688			list_del(&napi->poll_list);
3689			napi->state = 0;
3690
			/* Shrink the quota so the loop ends after these qlen packets. */
3691			quota = work + qlen;
3692		}
3693		rps_unlock(sd);
3694	}
3695	local_irq_enable();
3696
3697	return work;
3698}
3699
3700/**
3701 * __napi_schedule - schedule for receive
3702 * @n: entry to schedule
3703 *
3704 * The entry's receive function will be scheduled to run
3705 */
3706void __napi_schedule(struct napi_struct *n)
3707{
3708	unsigned long flags;
3709
3710	local_irq_save(flags);
3711	____napi_schedule(&__get_cpu_var(softnet_data), n);
3712	local_irq_restore(flags);
3713}
3714EXPORT_SYMBOL(__napi_schedule);
3715
/*
 * __napi_complete - take @n off its poll list and clear NAPI_STATE_SCHED.
 *
 * BUGs if @n is not currently scheduled or still holds skbs on its GRO
 * list.  napi_complete() calls this with local interrupts disabled.
 */
3716void __napi_complete(struct napi_struct *n)
3717{
3718	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3719	BUG_ON(n->gro_list);
3720
3721	list_del(&n->poll_list);
	/* Barrier so the list removal is ordered before the bit is cleared. */
3722	smp_mb__before_clear_bit();
3723	clear_bit(NAPI_STATE_SCHED, &n->state);
3724}
3725EXPORT_SYMBOL(__napi_complete);
3726
3727void napi_complete(struct napi_struct *n)
3728{
3729	unsigned long flags;
3730
3731	/*
3732	 * don't let napi dequeue from the cpu poll list
3733	 * just in case its running on a different cpu
3734	 */
3735	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3736		return;
3737
3738	napi_gro_flush(n);
3739	local_irq_save(flags);
3740	__napi_complete(n);
3741	local_irq_restore(flags);
3742}
3743EXPORT_SYMBOL(napi_complete);
3744
3745void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3746		    int (*poll)(struct napi_struct *, int), int weight)
3747{
3748	INIT_LIST_HEAD(&napi->poll_list);
3749	napi->gro_count = 0;
3750	napi->gro_list = NULL;
3751	napi->skb = NULL;
3752	napi->poll = poll;
3753	napi->weight = weight;
3754	list_add(&napi->dev_list, &dev->napi_list);
3755	napi->dev = dev;
3756#ifdef CONFIG_NETPOLL
3757	spin_lock_init(&napi->poll_lock);
3758	napi->poll_owner = -1;
3759#endif
3760	set_bit(NAPI_STATE_SCHED, &napi->state);
3761}
3762EXPORT_SYMBOL(netif_napi_add);
3763
3764void netif_napi_del(struct napi_struct *napi)
3765{
3766	struct sk_buff *skb, *next;
3767
3768	list_del_init(&napi->dev_list);
3769	napi_free_frags(napi);
3770
3771	for (skb = napi->gro_list; skb; skb = next) {
3772		next = skb->next;
3773		skb->next = NULL;
3774		kfree_skb(skb);
3775	}
3776
3777	napi->gro_list = NULL;
3778	napi->gro_count = 0;
3779}
3780EXPORT_SYMBOL(netif_napi_del);
3781
/*
 * net_rx_action - poll the napi instances queued on this CPU's
 * softnet_data poll_list.
 *
 * Each iteration polls the head entry with its weight and subtracts the
 * work done from a budget initialised from netdev_budget.  When the budget
 * is exhausted or more than two jiffies have elapsed, time_squeeze is
 * bumped, NET_RX_SOFTIRQ is raised again and the loop is left.  On exit,
 * pending RPS IPIs are sent and interrupts are re-enabled.
 */
3782static void net_rx_action(struct softirq_action *h)
3783{
3784	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3785	unsigned long time_limit = jiffies + 2;
3786	int budget = netdev_budget;
3787	void *have;
3788
3789	local_irq_disable();
3790
3791	while (!list_empty(&sd->poll_list)) {
3792		struct napi_struct *n;
3793		int work, weight;
3794
3795		/* If softirq window is exhausted then punt.
3796		 * Allow this to run for 2 jiffies since which will allow
3797		 * an average latency of 1.5/HZ.
3798		 */
3799		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3800			goto softnet_break;
3801
3802		local_irq_enable();
3803
3804		/* Even though interrupts have been re-enabled, this
3805		 * access is safe because interrupts can only add new
3806		 * entries to the tail of this list, and only ->poll()
3807		 * calls can remove this head entry from the list.
3808		 */
3809		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3810
3811		have = netpoll_poll_lock(n);
3812
3813		weight = n->weight;
3814
3815		/* This NAPI_STATE_SCHED test is for avoiding a race
3816		 * with netpoll's poll_napi().  Only the entity which
3817		 * obtains the lock and sees NAPI_STATE_SCHED set will
3818		 * actually make the ->poll() call.  Therefore we avoid
3819		 * accidentally calling ->poll() when NAPI is not scheduled.
3820		 */
3821		work = 0;
3822		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3823			work = n->poll(n, weight);
3824			trace_napi_poll(n);
3825		}
3826
		/* A poll routine must never report more work than its weight. */
3827		WARN_ON_ONCE(work > weight);
3828
3829		budget -= work;
3830
3831		local_irq_disable();
3832
3833		/* Drivers must not modify the NAPI state if they
3834		 * consume the entire weight.  In such cases this code
3835		 * still "owns" the NAPI instance and therefore can
3836		 * move the instance around on the list at-will.
3837		 */
3838		if (unlikely(work == weight)) {
3839			if (unlikely(napi_disable_pending(n))) {
3840				local_irq_enable();
3841				napi_complete(n);
3842				local_irq_disable();
3843			} else
3844				list_move_tail(&n->poll_list, &sd->poll_list);
3845		}
3846
3847		netpoll_poll_unlock(have);
3848	}
3849out:
	/* Sends pending RPS IPIs, if any, and re-enables interrupts. */
3850	net_rps_action_and_irq_enable(sd);
3851
3852#ifdef CONFIG_NET_DMA
3853	/*
3854	 * There may not be any more sk_buffs coming right now, so push
3855	 * any pending DMA copies to hardware
3856	 */
3857	dma_issue_pending_all();
3858#endif
3859
3860	return;
3861
3862softnet_break:
	/* Out of budget or time: account for it and re-raise the softirq. */
3863	sd->time_squeeze++;
3864	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3865	goto out;
3866}
3867
3868static gifconf_func_t *gifconf_list[NPROTO];
3869
3870/**
3871 *	register_gifconf	-	register a SIOCGIF handler
3872 *	@family: Address family
3873 *	@gifconf: Function handler
3874 *
3875 *	Register protocol dependent address dumping routines. The handler
3876 *	that is passed must not be freed or reused until it has been replaced
3877 *	by another handler.
3878 */
3879int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3880{
3881	if (family >= NPROTO)
3882		return -EINVAL;
3883	gifconf_list[family] = gifconf;
3884	return 0;
3885}
3886EXPORT_SYMBOL(register_gifconf);
3887
3888
3889/*
3890 *	Map an interface index to its name (SIOCGIFNAME)
3891 */
3892
3893/*
3894 *	We need this ioctl for efficient implementation of the
3895 *	if_indextoname() function required by the IPv6 API.  Without
3896 *	it, we would have to search all the interfaces to find a
3897 *	match.  --pb
3898 */
3899
3900static int dev_ifname(struct net *net, struct ifreq __user *arg)
3901{
3902	struct net_device *dev;
3903	struct ifreq ifr;
3904
3905	/*
3906	 *	Fetch the caller's info block.
3907	 */
3908
3909	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3910		return -EFAULT;
3911
3912	rcu_read_lock();
3913	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3914	if (!dev) {
3915		rcu_read_unlock();
3916		return -ENODEV;
3917	}
3918
3919	strcpy(ifr.ifr_name, dev->name);
3920	rcu_read_unlock();
3921
3922	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3923		return -EFAULT;
3924	return 0;
3925}
3926
3927/*
3928 *	Perform a SIOCGIFCONF call. This structure will change
3929 *	size eventually, and there is nothing I can do about it.
3930 *	Thus we will need a 'compatibility mode'.
3931 */
3932
3933static int dev_ifconf(struct net *net, char __user *arg)
3934{
3935	struct ifconf ifc;
3936	struct net_device *dev;
3937	char __user *pos;
3938	int len;
3939	int total;
3940	int i;
3941
3942	/*
3943	 *	Fetch the caller's info block.
3944	 */
3945
3946	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3947		return -EFAULT;
3948
3949	pos = ifc.ifc_buf;
3950	len = ifc.ifc_len;
3951
3952	/*
3953	 *	Loop over the interfaces, and write an info block for each.
3954	 */
3955
3956	total = 0;
3957	for_each_netdev(net, dev) {
3958		for (i = 0; i < NPROTO; i++) {
3959			if (gifconf_list[i]) {
3960				int done;
3961				if (!pos)
3962					done = gifconf_list[i](dev, NULL, 0);
3963				else
3964					done = gifconf_list[i](dev, pos + total,
3965							       len - total);
3966				if (done < 0)
3967					return -EFAULT;
3968				total += done;
3969			}
3970		}
3971	}
3972
3973	/*
3974	 *	All done.  Write the updated control block back to the caller.
3975	 */
3976	ifc.ifc_len = total;
3977
3978	/*
3979	 * 	Both BSD and Solaris return 0 here, so we do too.
3980	 */
3981	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3982}
3983
3984#ifdef CONFIG_PROC_FS
3985/*
3986 *	This is invoked by the /proc filesystem handler to display a device
3987 *	in detail.
3988 */
3989void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3990	__acquires(RCU)
3991{
3992	struct net *net = seq_file_net(seq);
3993	loff_t off;
3994	struct net_device *dev;
3995
3996	rcu_read_lock();
3997	if (!*pos)
3998		return SEQ_START_TOKEN;
3999
4000	off = 1;
4001	for_each_netdev_rcu(net, dev)
4002		if (off++ == *pos)
4003			return dev;
4004
4005	return NULL;
4006}
4007
4008void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4009{
4010	struct net_device *dev = v;
4011
4012	if (v == SEQ_START_TOKEN)
4013		dev = first_net_device_rcu(seq_file_net(seq));
4014	else
4015		dev = next_net_device_rcu(dev);
4016
4017	++*pos;
4018	return dev;
4019}
4020
4021void dev_seq_stop(struct seq_file *seq, void *v)
4022	__releases(RCU)
4023{
4024	rcu_read_unlock();
4025}
4026
4027static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4028{
4029	struct rtnl_link_stats64 temp;
4030	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4031
4032	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4033		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4034		   dev->name, stats->rx_bytes, stats->rx_packets,
4035		   stats->rx_errors,
4036		   stats->rx_dropped + stats->rx_missed_errors,
4037		   stats->rx_fifo_errors,
4038		   stats->rx_length_errors + stats->rx_over_errors +
4039		    stats->rx_crc_errors + stats->rx_frame_errors,
4040		   stats->rx_compressed, stats->multicast,
4041		   stats->tx_bytes, stats->tx_packets,
4042		   stats->tx_errors, stats->tx_dropped,
4043		   stats->tx_fifo_errors, stats->collisions,
4044		   stats->tx_carrier_errors +
4045		    stats->tx_aborted_errors +
4046		    stats->tx_window_errors +
4047		    stats->tx_heartbeat_errors,
4048		   stats->tx_compressed);
4049}
4050
4051/*
4052 *	Called from the PROCfs module. This now uses the new arbitrary sized
4053 *	/proc/net interface to create /proc/net/dev
4054 */
4055static int dev_seq_show(struct seq_file *seq, void *v)
4056{
4057	if (v == SEQ_START_TOKEN)
4058		seq_puts(seq, "Inter-|   Receive                            "
4059			      "                    |  Transmit\n"
4060			      " face |bytes    packets errs drop fifo frame "
4061			      "compressed multicast|bytes    packets errs "
4062			      "drop fifo colls carrier compressed\n");
4063	else
4064		dev_seq_printf_stats(seq, v);
4065	return 0;
4066}
4067
4068static struct softnet_data *softnet_get_online(loff_t *pos)
4069{
4070	struct softnet_data *sd = NULL;
4071
4072	while (*pos < nr_cpu_ids)
4073		if (cpu_online(*pos)) {
4074			sd = &per_cpu(softnet_data, *pos);
4075			break;
4076		} else
4077			++*pos;
4078	return sd;
4079}
4080
4081static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4082{
4083	return softnet_get_online(pos);
4084}
4085
4086static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4087{
4088	++*pos;
4089	return softnet_get_online(pos);
4090}
4091
4092static void softnet_seq_stop(struct seq_file *seq, void *v)
4093{
4094}
4095
4096static int softnet_seq_show(struct seq_file *seq, void *v)
4097{
4098	struct softnet_data *sd = v;
4099
4100	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4101		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4102		   0, 0, 0, 0, /* was fastroute */
4103		   sd->cpu_collision, sd->received_rps);
4104	return 0;
4105}
4106
4107static const struct seq_operations dev_seq_ops = {
4108	.start = dev_seq_start,
4109	.next  = dev_seq_next,
4110	.stop  = dev_seq_stop,
4111	.show  = dev_seq_show,
4112};
4113
4114static int dev_seq_open(struct inode *inode, struct file *file)
4115{
4116	return seq_open_net(inode, file, &dev_seq_ops,
4117			    sizeof(struct seq_net_private));
4118}
4119
4120static const struct file_operations dev_seq_fops = {
4121	.owner	 = THIS_MODULE,
4122	.open    = dev_seq_open,
4123	.read    = seq_read,
4124	.llseek  = seq_lseek,
4125	.release = seq_release_net,
4126};
4127
4128static const struct seq_operations softnet_seq_ops = {
4129	.start = softnet_seq_start,
4130	.next  = softnet_seq_next,
4131	.stop  = softnet_seq_stop,
4132	.show  = softnet_seq_show,
4133};
4134
4135static int softnet_seq_open(struct inode *inode, struct file *file)
4136{
4137	return seq_open(file, &softnet_seq_ops);
4138}
4139
4140static const struct file_operations softnet_seq_fops = {
4141	.owner	 = THIS_MODULE,
4142	.open    = softnet_seq_open,
4143	.read    = seq_read,
4144	.llseek  = seq_lseek,
4145	.release = seq_release,
4146};
4147
4148static void *ptype_get_idx(loff_t pos)
4149{
4150	struct packet_type *pt = NULL;
4151	loff_t i = 0;
4152	int t;
4153
4154	list_for_each_entry_rcu(pt, &ptype_all, list) {
4155		if (i == pos)
4156			return pt;
4157		++i;
4158	}
4159
4160	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4161		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4162			if (i == pos)
4163				return pt;
4164			++i;
4165		}
4166	}
4167	return NULL;
4168}
4169
4170static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4171	__acquires(RCU)
4172{
4173	rcu_read_lock();
4174	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4175}
4176
4177static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4178{
4179	struct packet_type *pt;
4180	struct list_head *nxt;
4181	int hash;
4182
4183	++*pos;
4184	if (v == SEQ_START_TOKEN)
4185		return ptype_get_idx(0);
4186
4187	pt = v;
4188	nxt = pt->list.next;
4189	if (pt->type == htons(ETH_P_ALL)) {
4190		if (nxt != &ptype_all)
4191			goto found;
4192		hash = 0;
4193		nxt = ptype_base[0].next;
4194	} else
4195		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4196
4197	while (nxt == &ptype_base[hash]) {
4198		if (++hash >= PTYPE_HASH_SIZE)
4199			return NULL;
4200		nxt = ptype_base[hash].next;
4201	}
4202found:
4203	return list_entry(nxt, struct packet_type, list);
4204}
4205
4206static void ptype_seq_stop(struct seq_file *seq, void *v)
4207	__releases(RCU)
4208{
4209	rcu_read_unlock();
4210}
4211
4212static int ptype_seq_show(struct seq_file *seq, void *v)
4213{
4214	struct packet_type *pt = v;
4215
4216	if (v == SEQ_START_TOKEN)
4217		seq_puts(seq, "Type Device      Function\n");
4218	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4219		if (pt->type == htons(ETH_P_ALL))
4220			seq_puts(seq, "ALL ");
4221		else
4222			seq_printf(seq, "%04x", ntohs(pt->type));
4223
4224		seq_printf(seq, " %-8s %pF\n",
4225			   pt->dev ? pt->dev->name : "", pt->func);
4226	}
4227
4228	return 0;
4229}
4230
4231static const struct seq_operations ptype_seq_ops = {
4232	.start = ptype_seq_start,
4233	.next  = ptype_seq_next,
4234	.stop  = ptype_seq_stop,
4235	.show  = ptype_seq_show,
4236};
4237
4238static int ptype_seq_open(struct inode *inode, struct file *file)
4239{
4240	return seq_open_net(inode, file, &ptype_seq_ops,
4241			sizeof(struct seq_net_private));
4242}
4243
4244static const struct file_operations ptype_seq_fops = {
4245	.owner	 = THIS_MODULE,
4246	.open    = ptype_seq_open,
4247	.read    = seq_read,
4248	.llseek  = seq_lseek,
4249	.release = seq_release_net,
4250};
4251
4252
4253static int __net_init dev_proc_net_init(struct net *net)
4254{
4255	int rc = -ENOMEM;
4256
4257	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4258		goto out;
4259	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4260		goto out_dev;
4261	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4262		goto out_softnet;
4263
4264	if (wext_proc_init(net))
4265		goto out_ptype;
4266	rc = 0;
4267out:
4268	return rc;
4269out_ptype:
4270	proc_net_remove(net, "ptype");
4271out_softnet:
4272	proc_net_remove(net, "softnet_stat");
4273out_dev:
4274	proc_net_remove(net, "dev");
4275	goto out;
4276}
4277
4278static void __net_exit dev_proc_net_exit(struct net *net)
4279{
4280	wext_proc_exit(net);
4281
4282	proc_net_remove(net, "ptype");
4283	proc_net_remove(net, "softnet_stat");
4284	proc_net_remove(net, "dev");
4285}
4286
4287static struct pernet_operations __net_initdata dev_proc_ops = {
4288	.init = dev_proc_net_init,
4289	.exit = dev_proc_net_exit,
4290};
4291
4292static int __init dev_proc_init(void)
4293{
4294	return register_pernet_subsys(&dev_proc_ops);
4295}
4296#else
4297#define dev_proc_init() 0
4298#endif	/* CONFIG_PROC_FS */
4299
4300
4301/**
4302 *	netdev_set_master	-	set up master pointer
4303 *	@slave: slave device
4304 *	@master: new master device
4305 *
4306 *	Changes the master device of the slave. Pass %NULL to break the
4307 *	bonding. The caller must hold the RTNL semaphore. On a failure
4308 *	a negative errno code is returned. On success the reference counts
4309 *	are adjusted and the function returns zero.
4310 */
4311int netdev_set_master(struct net_device *slave, struct net_device *master)
4312{
4313	struct net_device *old = slave->master;
4314
4315	ASSERT_RTNL();
4316
4317	if (master) {
4318		if (old)
4319			return -EBUSY;
4320		dev_hold(master);
4321	}
4322
4323	slave->master = master;
4324
4325	if (old)
4326		dev_put(old);
4327	return 0;
4328}
4329EXPORT_SYMBOL(netdev_set_master);
4330
4331/**
4332 *	netdev_set_bond_master	-	set up bonding master/slave pair
4333 *	@slave: slave device
4334 *	@master: new master device
4335 *
4336 *	Changes the master device of the slave. Pass %NULL to break the
4337 *	bonding. The caller must hold the RTNL semaphore. On a failure
4338 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4339 *	to the routing socket and the function returns zero.
4340 */
4341int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4342{
4343	int err;
4344
4345	ASSERT_RTNL();
4346
4347	err = netdev_set_master(slave, master);
4348	if (err)
4349		return err;
4350	if (master)
4351		slave->flags |= IFF_SLAVE;
4352	else
4353		slave->flags &= ~IFF_SLAVE;
4354
4355	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4356	return 0;
4357}
4358EXPORT_SYMBOL(netdev_set_bond_master);
4359
4360static void dev_change_rx_flags(struct net_device *dev, int flags)
4361{
4362	const struct net_device_ops *ops = dev->netdev_ops;
4363
4364	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4365		ops->ndo_change_rx_flags(dev, flags);
4366}
4367
4368static int __dev_set_promiscuity(struct net_device *dev, int inc)
4369{
4370	unsigned short old_flags = dev->flags;
4371	uid_t uid;
4372	gid_t gid;
4373
4374	ASSERT_RTNL();
4375
4376	dev->flags |= IFF_PROMISC;
4377	dev->promiscuity += inc;
4378	if (dev->promiscuity == 0) {
4379		/*
4380		 * Avoid overflow.
4381		 * If inc causes overflow, untouch promisc and return error.
4382		 */
4383		if (inc < 0)
4384			dev->flags &= ~IFF_PROMISC;
4385		else {
4386			dev->promiscuity -= inc;
4387			printk(KERN_WARNING "%s: promiscuity touches roof, "
4388				"set promiscuity failed, promiscuity feature "
4389				"of device might be broken.\n", dev->name);
4390			return -EOVERFLOW;
4391		}
4392	}
4393	if (dev->flags != old_flags) {
4394		printk(KERN_INFO "device %s %s promiscuous mode\n",
4395		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4396							       "left");
4397		if (audit_enabled) {
4398			current_uid_gid(&uid, &gid);
4399			audit_log(current->audit_context, GFP_ATOMIC,
4400				AUDIT_ANOM_PROMISCUOUS,
4401				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4402				dev->name, (dev->flags & IFF_PROMISC),
4403				(old_flags & IFF_PROMISC),
4404				audit_get_loginuid(current),
4405				uid, gid,
4406				audit_get_sessionid(current));
4407		}
4408
4409		dev_change_rx_flags(dev, IFF_PROMISC);
4410	}
4411	return 0;
4412}
4413
4414/**
4415 *	dev_set_promiscuity	- update promiscuity count on a device
4416 *	@dev: device
4417 *	@inc: modifier
4418 *
4419 *	Add or remove promiscuity from a device. While the count in the device
4420 *	remains above zero the interface remains promiscuous. Once it hits zero
4421 *	the device reverts back to normal filtering operation. A negative inc
4422 *	value is used to drop promiscuity on the device.
4423 *	Return 0 if successful or a negative errno code on error.
4424 */
4425int dev_set_promiscuity(struct net_device *dev, int inc)
4426{
4427	unsigned short old_flags = dev->flags;
4428	int err;
4429
4430	err = __dev_set_promiscuity(dev, inc);
4431	if (err < 0)
4432		return err;
4433	if (dev->flags != old_flags)
4434		dev_set_rx_mode(dev);
4435	return err;
4436}
4437EXPORT_SYMBOL(dev_set_promiscuity);
4438
4439/**
4440 *	dev_set_allmulti	- update allmulti count on a device
4441 *	@dev: device
4442 *	@inc: modifier
4443 *
4444 *	Add or remove reception of all multicast frames to a device. While the
4445 *	count in the device remains above zero the interface remains listening
4446 *	to all interfaces. Once it hits zero the device reverts back to normal
4447 *	filtering operation. A negative @inc value is used to drop the counter
4448 *	when releasing a resource needing all multicasts.
4449 *	Return 0 if successful or a negative errno code on error.
4450 */
4451
4452int dev_set_allmulti(struct net_device *dev, int inc)
4453{
4454	unsigned short old_flags = dev->flags;
4455
4456	ASSERT_RTNL();
4457
4458	dev->flags |= IFF_ALLMULTI;
4459	dev->allmulti += inc;
4460	if (dev->allmulti == 0) {
4461		/*
4462		 * Avoid overflow.
4463		 * If inc causes overflow, untouch allmulti and return error.
4464		 */
4465		if (inc < 0)
4466			dev->flags &= ~IFF_ALLMULTI;
4467		else {
4468			dev->allmulti -= inc;
4469			printk(KERN_WARNING "%s: allmulti touches roof, "
4470				"set allmulti failed, allmulti feature of "
4471				"device might be broken.\n", dev->name);
4472			return -EOVERFLOW;
4473		}
4474	}
4475	if (dev->flags ^ old_flags) {
4476		dev_change_rx_flags(dev, IFF_ALLMULTI);
4477		dev_set_rx_mode(dev);
4478	}
4479	return 0;
4480}
4481EXPORT_SYMBOL(dev_set_allmulti);
4482
4483/*
4484 *	Upload unicast and multicast address lists to device and
4485 *	configure RX filtering. When the device doesn't support unicast
4486 *	filtering it is put in promiscuous mode while unicast addresses
4487 *	are present.
4488 */
4489void __dev_set_rx_mode(struct net_device *dev)
4490{
4491	const struct net_device_ops *ops = dev->netdev_ops;
4492
4493	/* dev_open will call this function so the list will stay sane. */
4494	if (!(dev->flags&IFF_UP))
4495		return;
4496
4497	if (!netif_device_present(dev))
4498		return;
4499
4500	if (ops->ndo_set_rx_mode)
4501		ops->ndo_set_rx_mode(dev);
4502	else {
4503		/* Unicast addresses changes may only happen under the rtnl,
4504		 * therefore calling __dev_set_promiscuity here is safe.
4505		 */
4506		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4507			__dev_set_promiscuity(dev, 1);
4508			dev->uc_promisc = true;
4509		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4510			__dev_set_promiscuity(dev, -1);
4511			dev->uc_promisc = false;
4512		}
4513
4514		if (ops->ndo_set_multicast_list)
4515			ops->ndo_set_multicast_list(dev);
4516	}
4517}
4518
/*
 * Locked wrapper: run __dev_set_rx_mode() while holding the device's
 * address list lock (BH-disabling variant).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4525
4526/**
4527 *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4528 *	@dev: device
4529 *	@cmd: memory area for ethtool_ops::get_settings() result
4530 *
4531 *      The cmd arg is initialized properly (cleared and
4532 *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
4533 *
4534 *	Return device's ethtool_ops::get_settings() result value or
4535 *	-EOPNOTSUPP when device doesn't expose
4536 *	ethtool_ops::get_settings() operation.
4537 */
4538int dev_ethtool_get_settings(struct net_device *dev,
4539			     struct ethtool_cmd *cmd)
4540{
4541	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4542		return -EOPNOTSUPP;
4543
4544	memset(cmd, 0, sizeof(struct ethtool_cmd));
4545	cmd->cmd = ETHTOOL_GSET;
4546	return dev->ethtool_ops->get_settings(dev, cmd);
4547}
4548EXPORT_SYMBOL(dev_ethtool_get_settings);
4549
4550/**
4551 *	dev_get_flags - get flags reported to userspace
4552 *	@dev: device
4553 *
4554 *	Get the combination of flag bits exported through APIs to userspace.
4555 */
4556unsigned dev_get_flags(const struct net_device *dev)
4557{
4558	unsigned flags;
4559
4560	flags = (dev->flags & ~(IFF_PROMISC |
4561				IFF_ALLMULTI |
4562				IFF_RUNNING |
4563				IFF_LOWER_UP |
4564				IFF_DORMANT)) |
4565		(dev->gflags & (IFF_PROMISC |
4566				IFF_ALLMULTI));
4567
4568	if (netif_running(dev)) {
4569		if (netif_oper_up(dev))
4570			flags |= IFF_RUNNING;
4571		if (netif_carrier_ok(dev))
4572			flags |= IFF_LOWER_UP;
4573		if (netif_dormant(dev))
4574			flags |= IFF_DORMANT;
4575	}
4576
4577	return flags;
4578}
4579EXPORT_SYMBOL(dev_get_flags);
4580
4581int __dev_change_flags(struct net_device *dev, unsigned int flags)
4582{
4583	int old_flags = dev->flags;
4584	int ret;
4585
4586	ASSERT_RTNL();
4587
4588	/*
4589	 *	Set the flags on our device.
4590	 */
4591
4592	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4593			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4594			       IFF_AUTOMEDIA)) |
4595		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4596				    IFF_ALLMULTI));
4597
4598	/*
4599	 *	Load in the correct multicast list now the flags have changed.
4600	 */
4601
4602	if ((old_flags ^ flags) & IFF_MULTICAST)
4603		dev_change_rx_flags(dev, IFF_MULTICAST);
4604
4605	dev_set_rx_mode(dev);
4606
4607	/*
4608	 *	Have we downed the interface. We handle IFF_UP ourselves
4609	 *	according to user attempts to set it, rather than blindly
4610	 *	setting it.
4611	 */
4612
4613	ret = 0;
4614	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4615		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4616
4617		if (!ret)
4618			dev_set_rx_mode(dev);
4619	}
4620
4621	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4622		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4623
4624		dev->gflags ^= IFF_PROMISC;
4625		dev_set_promiscuity(dev, inc);
4626	}
4627
4628	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4629	   is important. Some (broken) drivers set IFF_PROMISC, when
4630	   IFF_ALLMULTI is requested not asking us and not reporting.
4631	 */
4632	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4633		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4634
4635		dev->gflags ^= IFF_ALLMULTI;
4636		dev_set_allmulti(dev, inc);
4637	}
4638
4639	return ret;
4640}
4641
4642void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4643{
4644	unsigned int changes = dev->flags ^ old_flags;
4645
4646	if (changes & IFF_UP) {
4647		if (dev->flags & IFF_UP)
4648			call_netdevice_notifiers(NETDEV_UP, dev);
4649		else
4650			call_netdevice_notifiers(NETDEV_DOWN, dev);
4651	}
4652
4653	if (dev->flags & IFF_UP &&
4654	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4655		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4656}
4657
4658/**
4659 *	dev_change_flags - change device settings
4660 *	@dev: device
4661 *	@flags: device state flags
4662 *
4663 *	Change settings on device based state flags. The flags are
4664 *	in the userspace exported format.
4665 */
4666int dev_change_flags(struct net_device *dev, unsigned flags)
4667{
4668	int ret, changes;
4669	int old_flags = dev->flags;
4670
4671	ret = __dev_change_flags(dev, flags);
4672	if (ret < 0)
4673		return ret;
4674
4675	changes = old_flags ^ dev->flags;
4676	if (changes)
4677		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4678
4679	__dev_notify_flags(dev, old_flags);
4680	return ret;
4681}
4682EXPORT_SYMBOL(dev_change_flags);
4683
4684/**
4685 *	dev_set_mtu - Change maximum transfer unit
4686 *	@dev: device
4687 *	@new_mtu: new transfer unit
4688 *
4689 *	Change the maximum transfer size of the network device.
4690 */
4691int dev_set_mtu(struct net_device *dev, int new_mtu)
4692{
4693	const struct net_device_ops *ops = dev->netdev_ops;
4694	int err;
4695
4696	if (new_mtu == dev->mtu)
4697		return 0;
4698
4699	/*	MTU must be positive.	 */
4700	if (new_mtu < 0)
4701		return -EINVAL;
4702
4703	if (!netif_device_present(dev))
4704		return -ENODEV;
4705
4706	err = 0;
4707	if (ops->ndo_change_mtu)
4708		err = ops->ndo_change_mtu(dev, new_mtu);
4709	else
4710		dev->mtu = new_mtu;
4711
4712	if (!err && dev->flags & IFF_UP)
4713		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4714	return err;
4715}
4716EXPORT_SYMBOL(dev_set_mtu);
4717
4718/**
4719 *	dev_set_group - Change group this device belongs to
4720 *	@dev: device
4721 *	@new_group: group this device should belong to
4722 */
4723void dev_set_group(struct net_device *dev, int new_group)
4724{
4725	dev->group = new_group;
4726}
4727EXPORT_SYMBOL(dev_set_group);
4728
4729/**
4730 *	dev_set_mac_address - Change Media Access Control Address
4731 *	@dev: device
4732 *	@sa: new address
4733 *
4734 *	Change the hardware (MAC) address of the device
4735 */
4736int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4737{
4738	const struct net_device_ops *ops = dev->netdev_ops;
4739	int err;
4740
4741	if (!ops->ndo_set_mac_address)
4742		return -EOPNOTSUPP;
4743	if (sa->sa_family != dev->type)
4744		return -EINVAL;
4745	if (!netif_device_present(dev))
4746		return -ENODEV;
4747	err = ops->ndo_set_mac_address(dev, sa);
4748	if (!err)
4749		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4750	return err;
4751}
4752EXPORT_SYMBOL(dev_set_mac_address);
4753
4754/*
4755 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4756 */
4757static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4758{
4759	int err;
4760	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4761
4762	if (!dev)
4763		return -ENODEV;
4764
4765	switch (cmd) {
4766	case SIOCGIFFLAGS:	/* Get interface flags */
4767		ifr->ifr_flags = (short) dev_get_flags(dev);
4768		return 0;
4769
4770	case SIOCGIFMETRIC:	/* Get the metric on the interface
4771				   (currently unused) */
4772		ifr->ifr_metric = 0;
4773		return 0;
4774
4775	case SIOCGIFMTU:	/* Get the MTU of a device */
4776		ifr->ifr_mtu = dev->mtu;
4777		return 0;
4778
4779	case SIOCGIFHWADDR:
4780		if (!dev->addr_len)
4781			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4782		else
4783			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4784			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4785		ifr->ifr_hwaddr.sa_family = dev->type;
4786		return 0;
4787
4788	case SIOCGIFSLAVE:
4789		err = -EINVAL;
4790		break;
4791
4792	case SIOCGIFMAP:
4793		ifr->ifr_map.mem_start = dev->mem_start;
4794		ifr->ifr_map.mem_end   = dev->mem_end;
4795		ifr->ifr_map.base_addr = dev->base_addr;
4796		ifr->ifr_map.irq       = dev->irq;
4797		ifr->ifr_map.dma       = dev->dma;
4798		ifr->ifr_map.port      = dev->if_port;
4799		return 0;
4800
4801	case SIOCGIFINDEX:
4802		ifr->ifr_ifindex = dev->ifindex;
4803		return 0;
4804
4805	case SIOCGIFTXQLEN:
4806		ifr->ifr_qlen = dev->tx_queue_len;
4807		return 0;
4808
4809	default:
4810		/* dev_ioctl() should ensure this case
4811		 * is never reached
4812		 */
4813		WARN_ON(1);
4814		err = -ENOTTY;
4815		break;
4816
4817	}
4818	return err;
4819}
4820
4821/*
4822 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4823 */
4824static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4825{
4826	int err;
4827	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4828	const struct net_device_ops *ops;
4829
4830	if (!dev)
4831		return -ENODEV;
4832
4833	ops = dev->netdev_ops;
4834
4835	switch (cmd) {
4836	case SIOCSIFFLAGS:	/* Set interface flags */
4837		return dev_change_flags(dev, ifr->ifr_flags);
4838
4839	case SIOCSIFMETRIC:	/* Set the metric on the interface
4840				   (currently unused) */
4841		return -EOPNOTSUPP;
4842
4843	case SIOCSIFMTU:	/* Set the MTU of a device */
4844		return dev_set_mtu(dev, ifr->ifr_mtu);
4845
4846	case SIOCSIFHWADDR:
4847		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4848
4849	case SIOCSIFHWBROADCAST:
4850		if (ifr->ifr_hwaddr.sa_family != dev->type)
4851			return -EINVAL;
4852		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4853		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4854		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4855		return 0;
4856
4857	case SIOCSIFMAP:
4858		if (ops->ndo_set_config) {
4859			if (!netif_device_present(dev))
4860				return -ENODEV;
4861			return ops->ndo_set_config(dev, &ifr->ifr_map);
4862		}
4863		return -EOPNOTSUPP;
4864
4865	case SIOCADDMULTI:
4866		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4867		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4868			return -EINVAL;
4869		if (!netif_device_present(dev))
4870			return -ENODEV;
4871		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4872
4873	case SIOCDELMULTI:
4874		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4875		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4876			return -EINVAL;
4877		if (!netif_device_present(dev))
4878			return -ENODEV;
4879		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4880
4881	case SIOCSIFTXQLEN:
4882		if (ifr->ifr_qlen < 0)
4883			return -EINVAL;
4884		dev->tx_queue_len = ifr->ifr_qlen;
4885		return 0;
4886
4887	case SIOCSIFNAME:
4888		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4889		return dev_change_name(dev, ifr->ifr_newname);
4890
4891	/*
4892	 *	Unknown or private ioctl
4893	 */
4894	default:
4895		if ((cmd >= SIOCDEVPRIVATE &&
4896		    cmd <= SIOCDEVPRIVATE + 15) ||
4897		    cmd == SIOCBONDENSLAVE ||
4898		    cmd == SIOCBONDRELEASE ||
4899		    cmd == SIOCBONDSETHWADDR ||
4900		    cmd == SIOCBONDSLAVEINFOQUERY ||
4901		    cmd == SIOCBONDINFOQUERY ||
4902		    cmd == SIOCBONDCHANGEACTIVE ||
4903		    cmd == SIOCGMIIPHY ||
4904		    cmd == SIOCGMIIREG ||
4905		    cmd == SIOCSMIIREG ||
4906		    cmd == SIOCBRADDIF ||
4907		    cmd == SIOCBRDELIF ||
4908		    cmd == SIOCSHWTSTAMP ||
4909		    cmd == SIOCWANDEV) {
4910			err = -EOPNOTSUPP;
4911			if (ops->ndo_do_ioctl) {
4912				if (netif_device_present(dev))
4913					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4914				else
4915					err = -ENODEV;
4916			}
4917		} else
4918			err = -EINVAL;
4919
4920	}
4921	return err;
4922}
4923
4924/*
4925 *	This function handles all "interface"-type I/O control requests. The actual
4926 *	'doing' part of this is dev_ifsioc above.
4927 */
4928
4929/**
4930 *	dev_ioctl	-	network device ioctl
4931 *	@net: the applicable net namespace
4932 *	@cmd: command to issue
4933 *	@arg: pointer to a struct ifreq in user space
4934 *
4935 *	Issue ioctl functions to devices. This is normally called by the
4936 *	user space syscall interfaces but can sometimes be useful for
4937 *	other purposes. The return value is the return from the syscall if
4938 *	positive or a negative errno code on error.
4939 */
4940
4941int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4942{
4943	struct ifreq ifr;
4944	int ret;
4945	char *colon;
4946
4947	/* One special case: SIOCGIFCONF takes ifconf argument
4948	   and requires shared lock, because it sleeps writing
4949	   to user space.
4950	 */
4951
4952	if (cmd == SIOCGIFCONF) {
4953		rtnl_lock();
4954		ret = dev_ifconf(net, (char __user *) arg);
4955		rtnl_unlock();
4956		return ret;
4957	}
4958	if (cmd == SIOCGIFNAME)
4959		return dev_ifname(net, (struct ifreq __user *)arg);
4960
4961	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4962		return -EFAULT;
4963
4964	ifr.ifr_name[IFNAMSIZ-1] = 0;
4965
4966	colon = strchr(ifr.ifr_name, ':');
4967	if (colon)
4968		*colon = 0;
4969
4970	/*
4971	 *	See which interface the caller is talking about.
4972	 */
4973
4974	switch (cmd) {
4975	/*
4976	 *	These ioctl calls:
4977	 *	- can be done by all.
4978	 *	- atomic and do not require locking.
4979	 *	- return a value
4980	 */
4981	case SIOCGIFFLAGS:
4982	case SIOCGIFMETRIC:
4983	case SIOCGIFMTU:
4984	case SIOCGIFHWADDR:
4985	case SIOCGIFSLAVE:
4986	case SIOCGIFMAP:
4987	case SIOCGIFINDEX:
4988	case SIOCGIFTXQLEN:
4989		dev_load(net, ifr.ifr_name);
4990		rcu_read_lock();
4991		ret = dev_ifsioc_locked(net, &ifr, cmd);
4992		rcu_read_unlock();
4993		if (!ret) {
4994			if (colon)
4995				*colon = ':';
4996			if (copy_to_user(arg, &ifr,
4997					 sizeof(struct ifreq)))
4998				ret = -EFAULT;
4999		}
5000		return ret;
5001
5002	case SIOCETHTOOL:
5003		dev_load(net, ifr.ifr_name);
5004		rtnl_lock();
5005		ret = dev_ethtool(net, &ifr);
5006		rtnl_unlock();
5007		if (!ret) {
5008			if (colon)
5009				*colon = ':';
5010			if (copy_to_user(arg, &ifr,
5011					 sizeof(struct ifreq)))
5012				ret = -EFAULT;
5013		}
5014		return ret;
5015
5016	/*
5017	 *	These ioctl calls:
5018	 *	- require superuser power.
5019	 *	- require strict serialization.
5020	 *	- return a value
5021	 */
5022	case SIOCGMIIPHY:
5023	case SIOCGMIIREG:
5024	case SIOCSIFNAME:
5025		if (!capable(CAP_NET_ADMIN))
5026			return -EPERM;
5027		dev_load(net, ifr.ifr_name);
5028		rtnl_lock();
5029		ret = dev_ifsioc(net, &ifr, cmd);
5030		rtnl_unlock();
5031		if (!ret) {
5032			if (colon)
5033				*colon = ':';
5034			if (copy_to_user(arg, &ifr,
5035					 sizeof(struct ifreq)))
5036				ret = -EFAULT;
5037		}
5038		return ret;
5039
5040	/*
5041	 *	These ioctl calls:
5042	 *	- require superuser power.
5043	 *	- require strict serialization.
5044	 *	- do not return a value
5045	 */
5046	case SIOCSIFFLAGS:
5047	case SIOCSIFMETRIC:
5048	case SIOCSIFMTU:
5049	case SIOCSIFMAP:
5050	case SIOCSIFHWADDR:
5051	case SIOCSIFSLAVE:
5052	case SIOCADDMULTI:
5053	case SIOCDELMULTI:
5054	case SIOCSIFHWBROADCAST:
5055	case SIOCSIFTXQLEN:
5056	case SIOCSMIIREG:
5057	case SIOCBONDENSLAVE:
5058	case SIOCBONDRELEASE:
5059	case SIOCBONDSETHWADDR:
5060	case SIOCBONDCHANGEACTIVE:
5061	case SIOCBRADDIF:
5062	case SIOCBRDELIF:
5063	case SIOCSHWTSTAMP:
5064		if (!capable(CAP_NET_ADMIN))
5065			return -EPERM;
5066		/* fall through */
5067	case SIOCBONDSLAVEINFOQUERY:
5068	case SIOCBONDINFOQUERY:
5069		dev_load(net, ifr.ifr_name);
5070		rtnl_lock();
5071		ret = dev_ifsioc(net, &ifr, cmd);
5072		rtnl_unlock();
5073		return ret;
5074
5075	case SIOCGIFMEM:
5076		/* Get the per device memory space. We can add this but
5077		 * currently do not support it */
5078	case SIOCSIFMEM:
5079		/* Set the per device memory buffer space.
5080		 * Not applicable in our case */
5081	case SIOCSIFLINK:
5082		return -ENOTTY;
5083
5084	/*
5085	 *	Unknown or private ioctl.
5086	 */
5087	default:
5088		if (cmd == SIOCWANDEV ||
5089		    (cmd >= SIOCDEVPRIVATE &&
5090		     cmd <= SIOCDEVPRIVATE + 15)) {
5091			dev_load(net, ifr.ifr_name);
5092			rtnl_lock();
5093			ret = dev_ifsioc(net, &ifr, cmd);
5094			rtnl_unlock();
5095			if (!ret && copy_to_user(arg, &ifr,
5096						 sizeof(struct ifreq)))
5097				ret = -EFAULT;
5098			return ret;
5099		}
5100		/* Take care of Wireless Extensions */
5101		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5102			return wext_handle_ioctl(net, &ifr, cmd, arg);
5103		return -ENOTTY;
5104	}
5105}
5106
5107
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a function-static counter that is advanced
 *	on every probe and reset to 1 whenever it is no longer positive;
 *	the first candidate not found by __dev_get_by_index() in @net
 *	is returned.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5126
5127/* Delayed registration/unregisteration */
5128static LIST_HEAD(net_todo_list);
5129
5130static void net_set_todo(struct net_device *dev)
5131{
5132	list_add_tail(&dev->todo_list, &net_todo_list);
5133}
5134
5135static void rollback_registered_many(struct list_head *head)
5136{
5137	struct net_device *dev, *tmp;
5138
5139	BUG_ON(dev_boot_phase);
5140	ASSERT_RTNL();
5141
5142	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5143		/* Some devices call without registering
5144		 * for initialization unwind. Remove those
5145		 * devices and proceed with the remaining.
5146		 */
5147		if (dev->reg_state == NETREG_UNINITIALIZED) {
5148			pr_debug("unregister_netdevice: device %s/%p never "
5149				 "was registered\n", dev->name, dev);
5150
5151			WARN_ON(1);
5152			list_del(&dev->unreg_list);
5153			continue;
5154		}
5155		dev->dismantle = true;
5156		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5157	}
5158
5159	/* If device is running, close it first. */
5160	dev_close_many(head);
5161
5162	list_for_each_entry(dev, head, unreg_list) {
5163		/* And unlink it from device chain. */
5164		unlist_netdevice(dev);
5165
5166		dev->reg_state = NETREG_UNREGISTERING;
5167	}
5168
5169	synchronize_net();
5170
5171	list_for_each_entry(dev, head, unreg_list) {
5172		/* Shutdown queueing discipline. */
5173		dev_shutdown(dev);
5174
5175
5176		/* Notify protocols, that we are about to destroy
5177		   this device. They should clean all the things.
5178		*/
5179		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5180
5181		if (!dev->rtnl_link_ops ||
5182		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5183			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5184
5185		/*
5186		 *	Flush the unicast and multicast chains
5187		 */
5188		dev_uc_flush(dev);
5189		dev_mc_flush(dev);
5190
5191		if (dev->netdev_ops->ndo_uninit)
5192			dev->netdev_ops->ndo_uninit(dev);
5193
5194		/* Notifier chain MUST detach us from master device. */
5195		WARN_ON(dev->master);
5196
5197		/* Remove entries from kobject tree */
5198		netdev_unregister_kobject(dev);
5199	}
5200
5201	/* Process any work delayed until the end of the batch */
5202	dev = list_first_entry(head, struct net_device, unreg_list);
5203	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5204
5205	rcu_barrier();
5206
5207	list_for_each_entry(dev, head, unreg_list)
5208		dev_put(dev);
5209}
5210
5211static void rollback_registered(struct net_device *dev)
5212{
5213	LIST_HEAD(single);
5214
5215	list_add(&dev->unreg_list, &single);
5216	rollback_registered_many(&single);
5217	list_del(&single);
5218}
5219
5220static u32 netdev_fix_features(struct net_device *dev, u32 features)
5221{
5222	/* Fix illegal checksum combinations */
5223	if ((features & NETIF_F_HW_CSUM) &&
5224	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5225		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5226		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5227	}
5228
5229	if ((features & NETIF_F_NO_CSUM) &&
5230	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5231		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5232		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5233	}
5234
5235	/* Fix illegal SG+CSUM combinations. */
5236	if ((features & NETIF_F_SG) &&
5237	    !(features & NETIF_F_ALL_CSUM)) {
5238		netdev_dbg(dev,
5239			"Dropping NETIF_F_SG since no checksum feature.\n");
5240		features &= ~NETIF_F_SG;
5241	}
5242
5243	/* TSO requires that SG is present as well. */
5244	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5245		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5246		features &= ~NETIF_F_ALL_TSO;
5247	}
5248
5249	/* TSO ECN requires that TSO is present as well. */
5250	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5251		features &= ~NETIF_F_TSO_ECN;
5252
5253	/* Software GSO depends on SG. */
5254	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5255		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5256		features &= ~NETIF_F_GSO;
5257	}
5258
5259	/* UFO needs SG and checksumming */
5260	if (features & NETIF_F_UFO) {
5261		/* maybe split UFO into V4 and V6? */
5262		if (!((features & NETIF_F_GEN_CSUM) ||
5263		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5264			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5265			netdev_dbg(dev,
5266				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5267			features &= ~NETIF_F_UFO;
5268		}
5269
5270		if (!(features & NETIF_F_SG)) {
5271			netdev_dbg(dev,
5272				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5273			features &= ~NETIF_F_UFO;
5274		}
5275	}
5276
5277	return features;
5278}
5279
5280int __netdev_update_features(struct net_device *dev)
5281{
5282	u32 features;
5283	int err = 0;
5284
5285	ASSERT_RTNL();
5286
5287	features = netdev_get_wanted_features(dev);
5288
5289	if (dev->netdev_ops->ndo_fix_features)
5290		features = dev->netdev_ops->ndo_fix_features(dev, features);
5291
5292	/* driver might be less strict about feature dependencies */
5293	features = netdev_fix_features(dev, features);
5294
5295	if (dev->features == features)
5296		return 0;
5297
5298	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5299		dev->features, features);
5300
5301	if (dev->netdev_ops->ndo_set_features)
5302		err = dev->netdev_ops->ndo_set_features(dev, features);
5303
5304	if (unlikely(err < 0)) {
5305		netdev_err(dev,
5306			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5307			err, features, dev->features);
5308		return -1;
5309	}
5310
5311	if (!err)
5312		dev->features = features;
5313
5314	return 1;
5315}
5316
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	The notification is sent for any non-zero result of
 *	__netdev_update_features().
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5331
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* the result is deliberately ignored: we always notify */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5348
5349/**
5350 *	netif_stacked_transfer_operstate -	transfer operstate
5351 *	@rootdev: the root or lower level device to transfer state from
5352 *	@dev: the device to transfer operstate to
5353 *
5354 *	Transfer operational state from root to device. This is normally
5355 *	called when a stacking relationship exists between the root
5356 *	device and the device(a leaf device).
5357 */
5358void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5359					struct net_device *dev)
5360{
5361	if (rootdev->operstate == IF_OPER_DORMANT)
5362		netif_dormant_on(dev);
5363	else
5364		netif_dormant_off(dev);
5365
5366	if (netif_carrier_ok(rootdev)) {
5367		if (!netif_carrier_ok(dev))
5368			netif_carrier_on(dev);
5369	} else {
5370		if (netif_carrier_ok(dev))
5371			netif_carrier_off(dev);
5372	}
5373}
5374EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5375
#ifdef CONFIG_RPS
/*
 * Allocate the zero-initialised array of dev->num_rx_queues receive
 * queues, store it in dev->_rx and point every queue back at @dev.
 * Returns 0, or -ENOMEM if the allocation fails. A queue count of
 * zero is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", nqueues);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + nqueues; q++)
		q->dev = dev;

	return 0;
}
#endif
5396
5397static void netdev_init_one_queue(struct net_device *dev,
5398				  struct netdev_queue *queue, void *_unused)
5399{
5400	/* Initialize queue lock */
5401	spin_lock_init(&queue->_xmit_lock);
5402	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5403	queue->xmit_lock_owner = -1;
5404	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5405	queue->dev = dev;
5406}
5407
5408static int netif_alloc_netdev_queues(struct net_device *dev)
5409{
5410	unsigned int count = dev->num_tx_queues;
5411	struct netdev_queue *tx;
5412
5413	BUG_ON(count < 1);
5414
5415	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5416	if (!tx) {
5417		pr_err("netdev: Unable to allocate %u tx queues.\n",
5418		       count);
5419		return -ENOMEM;
5420	}
5421	dev->_tx = tx;
5422
5423	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5424	spin_lock_init(&dev->tx_global_lock);
5425
5426	return 0;
5427}
5428
5429/**
5430 *	register_netdevice	- register a network device
5431 *	@dev: device to register
5432 *
5433 *	Take a completed network device structure and add it to the kernel
5434 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5435 *	chain. 0 is returned on success. A negative errno code is returned
5436 *	on a failure to set up the device, or if the name is a duplicate.
5437 *
5438 *	Callers must hold the rtnl semaphore. You may want
5439 *	register_netdev() instead of this.
5440 *
5441 *	BUGS:
5442 *	The locking appears insufficient to guarantee two parallel registers
5443 *	will not get the same name.
5444 */
5445
5446int register_netdevice(struct net_device *dev)
5447{
5448	int ret;
5449	struct net *net = dev_net(dev);
5450
5451	BUG_ON(dev_boot_phase);
5452	ASSERT_RTNL();
5453
5454	might_sleep();
5455
5456	/* When net_device's are persistent, this will be fatal. */
5457	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5458	BUG_ON(!net);
5459
5460	spin_lock_init(&dev->addr_list_lock);
5461	netdev_set_addr_lockdep_class(dev);
5462
5463	dev->iflink = -1;
5464
5465	ret = dev_get_valid_name(dev, dev->name);
5466	if (ret < 0)
5467		goto out;
5468
5469	/* Init, if this function is available */
5470	if (dev->netdev_ops->ndo_init) {
5471		ret = dev->netdev_ops->ndo_init(dev);
5472		if (ret) {
5473			if (ret > 0)
5474				ret = -EIO;
5475			goto out;
5476		}
5477	}
5478
5479	dev->ifindex = dev_new_index(net);
5480	if (dev->iflink == -1)
5481		dev->iflink = dev->ifindex;
5482
5483	/* Transfer changeable features to wanted_features and enable
5484	 * software offloads (GSO and GRO).
5485	 */
5486	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5487	dev->features |= NETIF_F_SOFT_FEATURES;
5488	dev->wanted_features = dev->features & dev->hw_features;
5489
5490	/* Turn on no cache copy if HW is doing checksum */
5491	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5492	if ((dev->features & NETIF_F_ALL_CSUM) &&
5493	    !(dev->features & NETIF_F_NO_CSUM)) {
5494		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5495		dev->features |= NETIF_F_NOCACHE_COPY;
5496	}
5497
5498	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5499	 */
5500	dev->vlan_features |= NETIF_F_HIGHDMA;
5501
5502	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5503	ret = notifier_to_errno(ret);
5504	if (ret)
5505		goto err_uninit;
5506
5507	ret = netdev_register_kobject(dev);
5508	if (ret)
5509		goto err_uninit;
5510	dev->reg_state = NETREG_REGISTERED;
5511
5512	__netdev_update_features(dev);
5513
5514	/*
5515	 *	Default initial state at registry is that the
5516	 *	device is present.
5517	 */
5518
5519	set_bit(__LINK_STATE_PRESENT, &dev->state);
5520
5521	dev_init_scheduler(dev);
5522	dev_hold(dev);
5523	list_netdevice(dev);
5524
5525	/* Notify protocols, that a new device appeared. */
5526	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5527	ret = notifier_to_errno(ret);
5528	if (ret) {
5529		rollback_registered(dev);
5530		dev->reg_state = NETREG_UNREGISTERED;
5531	}
5532	/*
5533	 *	Prevent userspace races by waiting until the network
5534	 *	device is fully setup before sending notifications.
5535	 */
5536	if (!dev->rtnl_link_ops ||
5537	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5538		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5539
5540out:
5541	return ret;
5542
5543err_uninit:
5544	if (dev->netdev_ops->ndo_uninit)
5545		dev->netdev_ops->ndo_uninit(dev);
5546	goto out;
5547}
5548EXPORT_SYMBOL(register_netdevice);
5549
5550/**
5551 *	init_dummy_netdev	- init a dummy network device for NAPI
5552 *	@dev: device to init
5553 *
5554 *	This takes a network device structure and initialize the minimum
5555 *	amount of fields so it can be used to schedule NAPI polls without
5556 *	registering a full blown interface. This is to be used by drivers
5557 *	that need to tie several hardware interfaces to a single NAPI
5558 *	poll scheduler due to HW limitations.
5559 */
5560int init_dummy_netdev(struct net_device *dev)
5561{
5562	/* Clear everything. Note we don't initialize spinlocks
5563	 * are they aren't supposed to be taken by any of the
5564	 * NAPI code and this dummy netdev is supposed to be
5565	 * only ever used for NAPI polls
5566	 */
5567	memset(dev, 0, sizeof(struct net_device));
5568
5569	/* make sure we BUG if trying to hit standard
5570	 * register/unregister code path
5571	 */
5572	dev->reg_state = NETREG_DUMMY;
5573
5574	/* NAPI wants this */
5575	INIT_LIST_HEAD(&dev->napi_list);
5576
5577	/* a dummy interface is started by default */
5578	set_bit(__LINK_STATE_PRESENT, &dev->state);
5579	set_bit(__LINK_STATE_START, &dev->state);
5580
5581	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5582	 * because users of this 'device' dont need to change
5583	 * its refcount.
5584	 */
5585
5586	return 0;
5587}
5588EXPORT_SYMBOL_GPL(init_dummy_netdev);
5589
5590
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
5614
5615int netdev_refcnt_read(const struct net_device *dev)
5616{
5617	int i, refcnt = 0;
5618
5619	for_each_possible_cpu(i)
5620		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5621	return refcnt;
5622}
5623EXPORT_SYMBOL(netdev_refcnt_read);
5624
5625/*
5626 * netdev_wait_allrefs - wait until all references are gone.
5627 *
5628 * This is called when unregistering network devices.
5629 *
5630 * Any protocol or device that holds a reference should register
5631 * for netdevice notification, and cleanup and put back the
5632 * reference if they receive an UNREGISTER event.
5633 * We can get stuck here if buggy protocols don't correctly
5634 * call dev_put.
5635 */
5636static void netdev_wait_allrefs(struct net_device *dev)
5637{
5638	unsigned long rebroadcast_time, warning_time;
5639	int refcnt;
5640
5641	linkwatch_forget_dev(dev);
5642
5643	rebroadcast_time = warning_time = jiffies;
5644	refcnt = netdev_refcnt_read(dev);
5645
5646	while (refcnt != 0) {
5647		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5648			rtnl_lock();
5649
5650			/* Rebroadcast unregister notification */
5651			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5652			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5653			 * should have already handle it the first time */
5654
5655			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5656				     &dev->state)) {
5657				/* We must not have linkwatch events
5658				 * pending on unregister. If this
5659				 * happens, we simply run the queue
5660				 * unscheduled, resulting in a noop
5661				 * for this device.
5662				 */
5663				linkwatch_run_queue();
5664			}
5665
5666			__rtnl_unlock();
5667
5668			rebroadcast_time = jiffies;
5669		}
5670
5671		msleep(250);
5672
5673		refcnt = netdev_refcnt_read(dev);
5674
5675		if (time_after(jiffies, warning_time + 10 * HZ)) {
5676			printk(KERN_EMERG "unregister_netdevice: "
5677			       "waiting for %s to become free. Usage "
5678			       "count = %d\n",
5679			       dev->name, refcnt);
5680			warning_time = jiffies;
5681		}
5682	}
5683}
5684
5685/* The sequence is:
5686 *
5687 *	rtnl_lock();
5688 *	...
5689 *	register_netdevice(x1);
5690 *	register_netdevice(x2);
5691 *	...
5692 *	unregister_netdevice(y1);
5693 *	unregister_netdevice(y2);
5694 *      ...
5695 *	rtnl_unlock();
5696 *	free_netdev(y1);
5697 *	free_netdev(y2);
5698 *
5699 * We are invoked by rtnl_unlock().
5700 * This allows us to deal with problems:
5701 * 1) We can delete sysfs objects which invoke hotplug
5702 *    without deadlocking with linkwatch via keventd.
5703 * 2) Since we run with the RTNL semaphore not held, we can sleep
5704 *    safely in order to wait for the netdev refcnt to drop to zero.
5705 *
5706 * We must not return until all unregister events added during
5707 * the interval the lock was held have been completed.
5708 */
5709void netdev_run_todo(void)
5710{
5711	struct list_head list;
5712
5713	/* Snapshot list, allow later requests */
5714	list_replace_init(&net_todo_list, &list);
5715
5716	__rtnl_unlock();
5717
5718	while (!list_empty(&list)) {
5719		struct net_device *dev
5720			= list_first_entry(&list, struct net_device, todo_list);
5721		list_del(&dev->todo_list);
5722
5723		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5724			printk(KERN_ERR "network todo '%s' but state %d\n",
5725			       dev->name, dev->reg_state);
5726			dump_stack();
5727			continue;
5728		}
5729
5730		dev->reg_state = NETREG_UNREGISTERED;
5731
5732		on_each_cpu(flush_backlog, dev, 1);
5733
5734		netdev_wait_allrefs(dev);
5735
5736		/* paranoia */
5737		BUG_ON(netdev_refcnt_read(dev));
5738		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5739		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5740		WARN_ON(dev->dn_ptr);
5741
5742		if (dev->destructor)
5743			dev->destructor(dev);
5744
5745		/* Free network device */
5746		kobject_put(&dev->dev.kobj);
5747	}
5748}
5749
5750/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5751 * fields in the same order, with only the type differing.
5752 */
5753static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5754				    const struct net_device_stats *netdev_stats)
5755{
5756#if BITS_PER_LONG == 64
5757        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5758        memcpy(stats64, netdev_stats, sizeof(*stats64));
5759#else
5760	size_t i, n = sizeof(*stats64) / sizeof(u64);
5761	const unsigned long *src = (const unsigned long *)netdev_stats;
5762	u64 *dst = (u64 *)stats64;
5763
5764	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5765		     sizeof(*stats64) / sizeof(u64));
5766	for (i = 0; i < n; i++)
5767		dst[i] = src[i];
5768#endif
5769}
5770
5771/**
5772 *	dev_get_stats	- get network device statistics
5773 *	@dev: device to get statistics from
5774 *	@storage: place to store stats
5775 *
5776 *	Get network statistics from device. Return @storage.
5777 *	The device driver may provide its own method by setting
5778 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5779 *	otherwise the internal statistics structure is used.
5780 */
5781struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5782					struct rtnl_link_stats64 *storage)
5783{
5784	const struct net_device_ops *ops = dev->netdev_ops;
5785
5786	if (ops->ndo_get_stats64) {
5787		memset(storage, 0, sizeof(*storage));
5788		ops->ndo_get_stats64(dev, storage);
5789	} else if (ops->ndo_get_stats) {
5790		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5791	} else {
5792		netdev_stats_to_stats64(storage, &dev->stats);
5793	}
5794	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5795	return storage;
5796}
5797EXPORT_SYMBOL(dev_get_stats);
5798
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5816
5817/**
5818 *	alloc_netdev_mqs - allocate network device
5819 *	@sizeof_priv:	size of private data to allocate space for
5820 *	@name:		device name format string
5821 *	@setup:		callback to initialize device
5822 *	@txqs:		the number of TX subqueues to allocate
5823 *	@rxqs:		the number of RX subqueues to allocate
5824 *
5825 *	Allocates a struct net_device with private data area for driver use
5826 *	and performs basic initialization.  Also allocates subquue structs
5827 *	for each queue on the device.
5828 */
5829struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5830		void (*setup)(struct net_device *),
5831		unsigned int txqs, unsigned int rxqs)
5832{
5833	struct net_device *dev;
5834	size_t alloc_size;
5835	struct net_device *p;
5836
5837	BUG_ON(strlen(name) >= sizeof(dev->name));
5838
5839	if (txqs < 1) {
5840		pr_err("alloc_netdev: Unable to allocate device "
5841		       "with zero queues.\n");
5842		return NULL;
5843	}
5844
5845#ifdef CONFIG_RPS
5846	if (rxqs < 1) {
5847		pr_err("alloc_netdev: Unable to allocate device "
5848		       "with zero RX queues.\n");
5849		return NULL;
5850	}
5851#endif
5852
5853	alloc_size = sizeof(struct net_device);
5854	if (sizeof_priv) {
5855		/* ensure 32-byte alignment of private area */
5856		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5857		alloc_size += sizeof_priv;
5858	}
5859	/* ensure 32-byte alignment of whole construct */
5860	alloc_size += NETDEV_ALIGN - 1;
5861
5862	p = kzalloc(alloc_size, GFP_KERNEL);
5863	if (!p) {
5864		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5865		return NULL;
5866	}
5867
5868	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5869	dev->padded = (char *)dev - (char *)p;
5870
5871	dev->pcpu_refcnt = alloc_percpu(int);
5872	if (!dev->pcpu_refcnt)
5873		goto free_p;
5874
5875	if (dev_addr_init(dev))
5876		goto free_pcpu;
5877
5878	dev_mc_init(dev);
5879	dev_uc_init(dev);
5880
5881	dev_net_set(dev, &init_net);
5882
5883	dev->gso_max_size = GSO_MAX_SIZE;
5884
5885	INIT_LIST_HEAD(&dev->napi_list);
5886	INIT_LIST_HEAD(&dev->unreg_list);
5887	INIT_LIST_HEAD(&dev->link_watch_list);
5888	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5889	setup(dev);
5890
5891	dev->num_tx_queues = txqs;
5892	dev->real_num_tx_queues = txqs;
5893	if (netif_alloc_netdev_queues(dev))
5894		goto free_all;
5895
5896#ifdef CONFIG_RPS
5897	dev->num_rx_queues = rxqs;
5898	dev->real_num_rx_queues = rxqs;
5899	if (netif_alloc_rx_queues(dev))
5900		goto free_all;
5901#endif
5902
5903	strcpy(dev->name, name);
5904	dev->group = INIT_NETDEV_GROUP;
5905	return dev;
5906
5907free_all:
5908	free_netdev(dev);
5909	return NULL;
5910
5911free_pcpu:
5912	free_percpu(dev->pcpu_refcnt);
5913	kfree(dev->_tx);
5914#ifdef CONFIG_RPS
5915	kfree(dev->_rx);
5916#endif
5917
5918free_p:
5919	kfree(p);
5920	return NULL;
5921}
5922EXPORT_SYMBOL(alloc_netdev_mqs);
5923
5924/**
5925 *	free_netdev - free network device
5926 *	@dev: device
5927 *
5928 *	This function does the last stage of destroying an allocated device
5929 * 	interface. The reference to the device object is released.
5930 *	If this is the last reference then it will be freed.
5931 */
5932void free_netdev(struct net_device *dev)
5933{
5934	struct napi_struct *p, *n;
5935
5936	release_net(dev_net(dev));
5937
5938	kfree(dev->_tx);
5939#ifdef CONFIG_RPS
5940	kfree(dev->_rx);
5941#endif
5942
5943	kfree(rcu_dereference_raw(dev->ingress_queue));
5944
5945	/* Flush device addresses */
5946	dev_addr_flush(dev);
5947
5948	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5949		netif_napi_del(p);
5950
5951	free_percpu(dev->pcpu_refcnt);
5952	dev->pcpu_refcnt = NULL;
5953
5954	/*  Compatibility with error handling in drivers */
5955	if (dev->reg_state == NETREG_UNINITIALIZED) {
5956		kfree((char *)dev - dev->padded);
5957		return;
5958	}
5959
5960	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5961	dev->reg_state = NETREG_RELEASED;
5962
5963	/* will free via device release */
5964	put_device(&dev->dev);
5965}
5966EXPORT_SYMBOL(free_netdev);
5967
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait until packets that are currently being received have been
 *	processed.  Packets arriving later are not held back.  May sleep.
 *	The expedited RCU grace period is used when the RTNL lock is held,
 *	the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
5983
5984/**
5985 *	unregister_netdevice_queue - remove device from the kernel
5986 *	@dev: device
5987 *	@head: list
5988 *
5989 *	This function shuts down a device interface and removes it
5990 *	from the kernel tables.
5991 *	If head not NULL, device is queued to be unregistered later.
5992 *
5993 *	Callers must hold the rtnl semaphore.  You may want
5994 *	unregister_netdev() instead of this.
5995 */
5996
5997void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5998{
5999	ASSERT_RTNL();
6000
6001	if (head) {
6002		list_move_tail(&dev->unreg_list, head);
6003	} else {
6004		rollback_registered(dev);
6005		/* Finish processing unregister after unlock */
6006		net_set_todo(dev);
6007	}
6008}
6009EXPORT_SYMBOL(unregister_netdevice_queue);
6010
6011/**
6012 *	unregister_netdevice_many - unregister many devices
6013 *	@head: list of devices
6014 */
6015void unregister_netdevice_many(struct list_head *head)
6016{
6017	struct net_device *dev;
6018
6019	if (!list_empty(head)) {
6020		rollback_registered_many(head);
6021		list_for_each_entry(dev, head, unreg_list)
6022			net_set_todo(dev);
6023	}
6024}
6025EXPORT_SYMBOL(unregister_netdevice_many);
6026
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	Thin wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general this is the function to use,
 *	rather than unregister_netdevice() itself.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6045
6046/**
6047 *	dev_change_net_namespace - move device to different nethost namespace
6048 *	@dev: device
6049 *	@net: network namespace
6050 *	@pat: If not NULL name pattern to try if the current device name
6051 *	      is already taken in the destination network namespace.
6052 *
6053 *	This function shuts down a device interface and moves it
6054 *	to a new network namespace. On success 0 is returned, on
6055 *	a failure a netagive errno code is returned.
6056 *
6057 *	Callers must hold the rtnl semaphore.
6058 */
6059
6060int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6061{
6062	int err;
6063
6064	ASSERT_RTNL();
6065
6066	/* Don't allow namespace local devices to be moved. */
6067	err = -EINVAL;
6068	if (dev->features & NETIF_F_NETNS_LOCAL)
6069		goto out;
6070
6071	/* Ensure the device has been registrered */
6072	err = -EINVAL;
6073	if (dev->reg_state != NETREG_REGISTERED)
6074		goto out;
6075
6076	/* Get out if there is nothing todo */
6077	err = 0;
6078	if (net_eq(dev_net(dev), net))
6079		goto out;
6080
6081	/* Pick the destination device name, and ensure
6082	 * we can use it in the destination network namespace.
6083	 */
6084	err = -EEXIST;
6085	if (__dev_get_by_name(net, dev->name)) {
6086		/* We get here if we can't use the current device name */
6087		if (!pat)
6088			goto out;
6089		if (dev_get_valid_name(dev, pat) < 0)
6090			goto out;
6091	}
6092
6093	/*
6094	 * And now a mini version of register_netdevice unregister_netdevice.
6095	 */
6096
6097	/* If device is running close it first. */
6098	dev_close(dev);
6099
6100	/* And unlink it from device chain */
6101	err = -ENODEV;
6102	unlist_netdevice(dev);
6103
6104	synchronize_net();
6105
6106	/* Shutdown queueing discipline. */
6107	dev_shutdown(dev);
6108
6109	/* Notify protocols, that we are about to destroy
6110	   this device. They should clean all the things.
6111
6112	   Note that dev->reg_state stays at NETREG_REGISTERED.
6113	   This is wanted because this way 8021q and macvlan know
6114	   the device is just moving and can keep their slaves up.
6115	*/
6116	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6117	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6118
6119	/*
6120	 *	Flush the unicast and multicast chains
6121	 */
6122	dev_uc_flush(dev);
6123	dev_mc_flush(dev);
6124
6125	/* Actually switch the network namespace */
6126	dev_net_set(dev, net);
6127
6128	/* If there is an ifindex conflict assign a new one */
6129	if (__dev_get_by_index(net, dev->ifindex)) {
6130		int iflink = (dev->iflink == dev->ifindex);
6131		dev->ifindex = dev_new_index(net);
6132		if (iflink)
6133			dev->iflink = dev->ifindex;
6134	}
6135
6136	/* Fixup kobjects */
6137	err = device_rename(&dev->dev, dev->name);
6138	WARN_ON(err);
6139
6140	/* Add the device back in the hashes */
6141	list_netdevice(dev);
6142
6143	/* Notify protocols, that a new device appeared. */
6144	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6145
6146	/*
6147	 *	Prevent userspace races by waiting until the network
6148	 *	device is fully setup before sending notifications.
6149	 */
6150	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6151
6152	synchronize_net();
6153	err = 0;
6154out:
6155	return err;
6156}
6157EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6158
6159static int dev_cpu_callback(struct notifier_block *nfb,
6160			    unsigned long action,
6161			    void *ocpu)
6162{
6163	struct sk_buff **list_skb;
6164	struct sk_buff *skb;
6165	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6166	struct softnet_data *sd, *oldsd;
6167
6168	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6169		return NOTIFY_OK;
6170
6171	local_irq_disable();
6172	cpu = smp_processor_id();
6173	sd = &per_cpu(softnet_data, cpu);
6174	oldsd = &per_cpu(softnet_data, oldcpu);
6175
6176	/* Find end of our completion_queue. */
6177	list_skb = &sd->completion_queue;
6178	while (*list_skb)
6179		list_skb = &(*list_skb)->next;
6180	/* Append completion queue from offline CPU. */
6181	*list_skb = oldsd->completion_queue;
6182	oldsd->completion_queue = NULL;
6183
6184	/* Append output queue from offline CPU. */
6185	if (oldsd->output_queue) {
6186		*sd->output_queue_tailp = oldsd->output_queue;
6187		sd->output_queue_tailp = oldsd->output_queue_tailp;
6188		oldsd->output_queue = NULL;
6189		oldsd->output_queue_tailp = &oldsd->output_queue;
6190	}
6191	/* Append NAPI poll list from offline CPU. */
6192	if (!list_empty(&oldsd->poll_list)) {
6193		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6194		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6195	}
6196
6197	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6198	local_irq_enable();
6199
6200	/* Process offline CPU's input_pkt_queue */
6201	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6202		netif_rx(skb);
6203		input_queue_head_incr(oldsd);
6204	}
6205	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6206		netif_rx(skb);
6207		input_queue_head_incr(oldsd);
6208	}
6209
6210	return NOTIFY_OK;
6211}
6212
6213
6214/**
6215 *	netdev_increment_features - increment feature set by one
6216 *	@all: current feature set
6217 *	@one: new feature set
6218 *	@mask: mask feature set
6219 *
6220 *	Computes a new feature set after adding a device with feature set
6221 *	@one to the master device with current feature set @all.  Will not
6222 *	enable anything that is off in @mask. Returns the new feature set.
6223 */
6224u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6225{
6226	if (mask & NETIF_F_GEN_CSUM)
6227		mask |= NETIF_F_ALL_CSUM;
6228	mask |= NETIF_F_VLAN_CHALLENGED;
6229
6230	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6231	all &= one | ~NETIF_F_ALL_FOR_ALL;
6232
6233	/* If device needs checksumming, downgrade to it. */
6234	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6235		all &= ~NETIF_F_NO_CSUM;
6236
6237	/* If one device supports hw checksumming, set for all. */
6238	if (all & NETIF_F_GEN_CSUM)
6239		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6240
6241	return all;
6242}
6243EXPORT_SYMBOL(netdev_increment_features);
6244
6245static struct hlist_head *netdev_create_hash(void)
6246{
6247	int i;
6248	struct hlist_head *hash;
6249
6250	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6251	if (hash != NULL)
6252		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6253			INIT_HLIST_HEAD(&hash[i]);
6254
6255	return hash;
6256}
6257
6258/* Initialize per network namespace state */
6259static int __net_init netdev_init(struct net *net)
6260{
6261	INIT_LIST_HEAD(&net->dev_base_head);
6262
6263	net->dev_name_head = netdev_create_hash();
6264	if (net->dev_name_head == NULL)
6265		goto err_name;
6266
6267	net->dev_index_head = netdev_create_hash();
6268	if (net->dev_index_head == NULL)
6269		goto err_idx;
6270
6271	return 0;
6272
6273err_idx:
6274	kfree(net->dev_name_head);
6275err_name:
6276	return -ENOMEM;
6277}
6278
6279/**
6280 *	netdev_drivername - network driver for the device
6281 *	@dev: network device
6282 *
6283 *	Determine network driver for device.
6284 */
6285const char *netdev_drivername(const struct net_device *dev)
6286{
6287	const struct device_driver *driver;
6288	const struct device *parent;
6289	const char *empty = "";
6290
6291	parent = dev->dev.parent;
6292	if (!parent)
6293		return empty;
6294
6295	driver = parent->driver;
6296	if (driver && driver->name)
6297		return driver->name;
6298	return empty;
6299}
6300
6301static int __netdev_printk(const char *level, const struct net_device *dev,
6302			   struct va_format *vaf)
6303{
6304	int r;
6305
6306	if (dev && dev->dev.parent)
6307		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6308			       netdev_name(dev), vaf);
6309	else if (dev)
6310		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6311	else
6312		r = printk("%s(NULL net_device): %pV", level, vaf);
6313
6314	return r;
6315}
6316
6317int netdev_printk(const char *level, const struct net_device *dev,
6318		  const char *format, ...)
6319{
6320	struct va_format vaf;
6321	va_list args;
6322	int r;
6323
6324	va_start(args, format);
6325
6326	vaf.fmt = format;
6327	vaf.va = &args;
6328
6329	r = __netdev_printk(level, dev, &vaf);
6330	va_end(args);
6331
6332	return r;
6333}
6334EXPORT_SYMBOL(netdev_printk);
6335
6336#define define_netdev_printk_level(func, level)			\
6337int func(const struct net_device *dev, const char *fmt, ...)	\
6338{								\
6339	int r;							\
6340	struct va_format vaf;					\
6341	va_list args;						\
6342								\
6343	va_start(args, fmt);					\
6344								\
6345	vaf.fmt = fmt;						\
6346	vaf.va = &args;						\
6347								\
6348	r = __netdev_printk(level, dev, &vaf);			\
6349	va_end(args);						\
6350								\
6351	return r;						\
6352}								\
6353EXPORT_SYMBOL(func);
6354
6355define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6356define_netdev_printk_level(netdev_alert, KERN_ALERT);
6357define_netdev_printk_level(netdev_crit, KERN_CRIT);
6358define_netdev_printk_level(netdev_err, KERN_ERR);
6359define_netdev_printk_level(netdev_warn, KERN_WARNING);
6360define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6361define_netdev_printk_level(netdev_info, KERN_INFO);
6362
6363static void __net_exit netdev_exit(struct net *net)
6364{
6365	kfree(net->dev_name_head);
6366	kfree(net->dev_index_head);
6367}
6368
6369static struct pernet_operations __net_initdata netdev_net_ops = {
6370	.init = netdev_init,
6371	.exit = netdev_exit,
6372};
6373
6374static void __net_exit default_device_exit(struct net *net)
6375{
6376	struct net_device *dev, *aux;
6377	/*
6378	 * Push all migratable network devices back to the
6379	 * initial network namespace
6380	 */
6381	rtnl_lock();
6382	for_each_netdev_safe(net, dev, aux) {
6383		int err;
6384		char fb_name[IFNAMSIZ];
6385
6386		/* Ignore unmoveable devices (i.e. loopback) */
6387		if (dev->features & NETIF_F_NETNS_LOCAL)
6388			continue;
6389
6390		/* Leave virtual devices for the generic cleanup */
6391		if (dev->rtnl_link_ops)
6392			continue;
6393
6394		/* Push remaining network devices to init_net */
6395		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6396		err = dev_change_net_namespace(dev, &init_net, fb_name);
6397		if (err) {
6398			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6399				__func__, dev->name, err);
6400			BUG();
6401		}
6402	}
6403	rtnl_unlock();
6404}
6405
6406static void __net_exit default_device_exit_batch(struct list_head *net_list)
6407{
6408	/* At exit all network devices most be removed from a network
6409	 * namespace.  Do this in the reverse order of registration.
6410	 * Do this across as many network namespaces as possible to
6411	 * improve batching efficiency.
6412	 */
6413	struct net_device *dev;
6414	struct net *net;
6415	LIST_HEAD(dev_kill_list);
6416
6417	rtnl_lock();
6418	list_for_each_entry(net, net_list, exit_list) {
6419		for_each_netdev_reverse(net, dev) {
6420			if (dev->rtnl_link_ops)
6421				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6422			else
6423				unregister_netdevice_queue(dev, &dev_kill_list);
6424		}
6425	}
6426	unregister_netdevice_many(&dev_kill_list);
6427	list_del(&dev_kill_list);
6428	rtnl_unlock();
6429}
6430
6431static struct pernet_operations __net_initdata default_device_ops = {
6432	.exit = default_device_exit,
6433	.exit_batch = default_device_exit_batch,
6434};
6435
6436/*
6437 *	Initialize the DEV module. At boot time this walks the device list and
6438 *	unhooks any devices that fail to initialise (normally hardware not
6439 *	present) and leaves us with a valid list of present and active devices.
6440 *
6441 */
6442
6443/*
6444 *       This is called single threaded during boot, so no need
6445 *       to take the rtnl semaphore.
6446 */
6447static int __init net_dev_init(void)
6448{
6449	int i, rc = -ENOMEM;
6450
6451	BUG_ON(!dev_boot_phase);
6452
6453	if (dev_proc_init())
6454		goto out;
6455
6456	if (netdev_kobject_init())
6457		goto out;
6458
6459	INIT_LIST_HEAD(&ptype_all);
6460	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6461		INIT_LIST_HEAD(&ptype_base[i]);
6462
6463	if (register_pernet_subsys(&netdev_net_ops))
6464		goto out;
6465
6466	/*
6467	 *	Initialise the packet receive queues.
6468	 */
6469
6470	for_each_possible_cpu(i) {
6471		struct softnet_data *sd = &per_cpu(softnet_data, i);
6472
6473		memset(sd, 0, sizeof(*sd));
6474		skb_queue_head_init(&sd->input_pkt_queue);
6475		skb_queue_head_init(&sd->process_queue);
6476		sd->completion_queue = NULL;
6477		INIT_LIST_HEAD(&sd->poll_list);
6478		sd->output_queue = NULL;
6479		sd->output_queue_tailp = &sd->output_queue;
6480#ifdef CONFIG_RPS
6481		sd->csd.func = rps_trigger_softirq;
6482		sd->csd.info = sd;
6483		sd->csd.flags = 0;
6484		sd->cpu = i;
6485#endif
6486
6487		sd->backlog.poll = process_backlog;
6488		sd->backlog.weight = weight_p;
6489		sd->backlog.gro_list = NULL;
6490		sd->backlog.gro_count = 0;
6491	}
6492
6493	dev_boot_phase = 0;
6494
6495	/* The loopback device is special if any other network devices
6496	 * is present in a network namespace the loopback device must
6497	 * be present. Since we now dynamically allocate and free the
6498	 * loopback device ensure this invariant is maintained by
6499	 * keeping the loopback device as the first device on the
6500	 * list of network devices.  Ensuring the loopback devices
6501	 * is the first device that appears and the last network device
6502	 * that disappears.
6503	 */
6504	if (register_pernet_device(&loopback_net_ops))
6505		goto out;
6506
6507	if (register_pernet_device(&default_device_ops))
6508		goto out;
6509
6510	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6511	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6512
6513	hotcpu_notifier(dev_cpu_callback, 0);
6514	dst_init();
6515	dev_mcast_init();
6516	rc = 0;
6517out:
6518	return rc;
6519}
6520
6521subsys_initcall(net_dev_init);
6522
6523static int __init initialize_hashrnd(void)
6524{
6525	get_random_bytes(&hashrnd, sizeof(hashrnd));
6526	return 0;
6527}
6528
6529late_initcall_sync(initialize_hashrnd);
6530