Linux Audio

Check our new training course

Loading...
v3.15
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 
 
 100#include <linux/stat.h>
 101#include <net/dst.h>
 102#include <net/pkt_sched.h>
 103#include <net/checksum.h>
 104#include <net/xfrm.h>
 105#include <linux/highmem.h>
 106#include <linux/init.h>
 
 107#include <linux/module.h>
 108#include <linux/netpoll.h>
 109#include <linux/rcupdate.h>
 110#include <linux/delay.h>
 
 111#include <net/iw_handler.h>
 112#include <asm/current.h>
 113#include <linux/audit.h>
 114#include <linux/dmaengine.h>
 115#include <linux/err.h>
 116#include <linux/ctype.h>
 117#include <linux/if_arp.h>
 118#include <linux/if_vlan.h>
 119#include <linux/ip.h>
 120#include <net/ip.h>
 121#include <linux/ipv6.h>
 122#include <linux/in.h>
 123#include <linux/jhash.h>
 124#include <linux/random.h>
 125#include <trace/events/napi.h>
 126#include <trace/events/net.h>
 127#include <trace/events/skb.h>
 128#include <linux/pci.h>
 129#include <linux/inetdevice.h>
 130#include <linux/cpu_rmap.h>
 
 131#include <linux/static_key.h>
 132#include <linux/hashtable.h>
 133#include <linux/vmalloc.h>
 134#include <linux/if_macvlan.h>
 135
 136#include "net-sysfs.h"
 137
 138/* Instead of increasing this, you should create a hash table. */
 139#define MAX_GRO_SKBS 8
 140
 141/* This should be increased if a protocol with a bigger head is added. */
 142#define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144static DEFINE_SPINLOCK(ptype_lock);
 145static DEFINE_SPINLOCK(offload_lock);
 146struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 147struct list_head ptype_all __read_mostly;	/* Taps */
 148static struct list_head offload_base __read_mostly;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 149
 150static int netif_rx_internal(struct sk_buff *skb);
 
 
 
 
 
 151
 152/*
 153 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 154 * semaphore.
 155 *
 156 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 157 *
 158 * Writers must hold the rtnl semaphore while they loop through the
 159 * dev_base_head list, and hold dev_base_lock for writing when they do the
 160 * actual updates.  This allows pure readers to access the list even
 161 * while a writer is preparing to update it.
 162 *
 163 * To put it another way, dev_base_lock is held for writing only to
 164 * protect against pure readers; the rtnl semaphore provides the
 165 * protection against other writers.
 166 *
 167 * See, for example usages, register_netdevice() and
 168 * unregister_netdevice(), which must be called with the rtnl
 169 * semaphore held.
 170 */
 171DEFINE_RWLOCK(dev_base_lock);
 172EXPORT_SYMBOL(dev_base_lock);
 173
 174/* protects napi_hash addition/deletion and napi_gen_id */
 175static DEFINE_SPINLOCK(napi_hash_lock);
 176
 177static unsigned int napi_gen_id;
 178static DEFINE_HASHTABLE(napi_hash, 8);
 179
 180static seqcount_t devnet_rename_seq;
 181
 182static inline void dev_base_seq_inc(struct net *net)
 183{
 184	while (++net->dev_base_seq == 0);
 185}
 186
 187static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 188{
 189	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 190
 191	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 192}
 193
 194static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 195{
 196	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 197}
 198
 199static inline void rps_lock(struct softnet_data *sd)
 200{
 201#ifdef CONFIG_RPS
 202	spin_lock(&sd->input_pkt_queue.lock);
 203#endif
 204}
 205
 206static inline void rps_unlock(struct softnet_data *sd)
 207{
 208#ifdef CONFIG_RPS
 209	spin_unlock(&sd->input_pkt_queue.lock);
 210#endif
 211}
 212
 213/* Device list insertion */
 214static void list_netdevice(struct net_device *dev)
 215{
 216	struct net *net = dev_net(dev);
 217
 218	ASSERT_RTNL();
 219
 220	write_lock_bh(&dev_base_lock);
 221	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 222	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 223	hlist_add_head_rcu(&dev->index_hlist,
 224			   dev_index_hash(net, dev->ifindex));
 225	write_unlock_bh(&dev_base_lock);
 226
 227	dev_base_seq_inc(net);
 
 
 228}
 229
 230/* Device list removal
 231 * caller must respect a RCU grace period before freeing/reusing dev
 232 */
 233static void unlist_netdevice(struct net_device *dev)
 234{
 235	ASSERT_RTNL();
 236
 237	/* Unlink dev from the device chain */
 238	write_lock_bh(&dev_base_lock);
 239	list_del_rcu(&dev->dev_list);
 240	hlist_del_rcu(&dev->name_hlist);
 241	hlist_del_rcu(&dev->index_hlist);
 242	write_unlock_bh(&dev_base_lock);
 243
 244	dev_base_seq_inc(dev_net(dev));
 245}
 246
 247/*
 248 *	Our notifier list
 249 */
 250
 251static RAW_NOTIFIER_HEAD(netdev_chain);
 252
 253/*
 254 *	Device drivers call our routines to queue packets here. We empty the
 255 *	queue in the local softnet handler.
 256 */
 257
 258DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 259EXPORT_PER_CPU_SYMBOL(softnet_data);
 260
 261#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] lists the device (ARPHRD_*) types that get their own
 * lockdep class.  netdev_lock_pos() searches this table and falls back to
 * the last entry (ARPHRD_NONE) for types that are not listed, so the last
 * entry must stay last.  The index of an entry here is also the index of
 * its name in netdev_lock_name[] and of its keys in the lock key arrays,
 * so all of these tables must be kept in the same order.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 282
/*
 * Lockdep class names, one per entry of netdev_lock_type[] and in the same
 * order.  The name at index i is handed to lockdep_set_class_and_name()
 * for devices whose type resolves to index i.
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 299
/*
 * One lockdep class key per netdev_lock_type[] entry: the first array is
 * used for transmit locks (netdev_set_xmit_lockdep_class()), the second for
 * dev->addr_list_lock (netdev_set_addr_lockdep_class()).
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 302
 303static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 304{
 305	int i;
 306
 307	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 308		if (netdev_lock_type[i] == dev_type)
 309			return i;
 310	/* the last key is used by default */
 311	return ARRAY_SIZE(netdev_lock_type) - 1;
 312}
 313
 314static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 315						 unsigned short dev_type)
 316{
 317	int i;
 318
 319	i = netdev_lock_pos(dev_type);
 320	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 321				   netdev_lock_name[i]);
 322}
 323
 324static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 325{
 326	int i;
 327
 328	i = netdev_lock_pos(dev->type);
 329	lockdep_set_class_and_name(&dev->addr_list_lock,
 330				   &netdev_addr_lock_key[i],
 331				   netdev_lock_name[i]);
 332}
 333#else
/* Built when CONFIG_LOCKDEP is not defined: intentionally does nothing. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
/* Built when CONFIG_LOCKDEP is not defined: intentionally does nothing. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 341#endif
 342
 343/*******************************************************************************
 344
 345		Protocol management and registration routines
 346
 347*******************************************************************************/
 348
 349/*
 350 *	Add a protocol ID to the list. Now that the input handler is
 351 *	smarter we can dispense with all the messy stuff that used to be
 352 *	here.
 353 *
 354 *	BEWARE!!! Protocol handlers, mangling input packets,
 355 *	MUST BE last in hash buckets and checking protocol handlers
 356 *	MUST start from promiscuous ptype_all chain in net_bh.
 357 *	It is true now, do not change it.
 358 *	Explanation follows: if protocol handler, mangling packet, will
 359 *	be the first on list, it is not able to sense, that packet
 360 *	is cloned and should be copied-on-write, so that it will
 361 *	change it and subsequent readers will get broken packet.
 362 *							--ANK (980803)
 363 */
 364
 365static inline struct list_head *ptype_head(const struct packet_type *pt)
 366{
 367	if (pt->type == htons(ETH_P_ALL))
 368		return &ptype_all;
 369	else
 370		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 371}
 372
 373/**
 374 *	dev_add_pack - add packet handler
 375 *	@pt: packet type declaration
 376 *
 377 *	Add a protocol handler to the networking stack. The passed &packet_type
 378 *	is linked into kernel lists and may not be freed until it has been
 379 *	removed from the kernel lists.
 380 *
 381 *	This call does not sleep therefore it can not
 382 *	guarantee all CPU's that are in middle of receiving packets
 383 *	will see the new packet type (until the next received packet).
 384 */
 385
 386void dev_add_pack(struct packet_type *pt)
 387{
 388	struct list_head *head = ptype_head(pt);
 389
 390	spin_lock(&ptype_lock);
 391	list_add_rcu(&pt->list, head);
 392	spin_unlock(&ptype_lock);
 393}
 394EXPORT_SYMBOL(dev_add_pack);
 395
 396/**
 397 *	__dev_remove_pack	 - remove packet handler
 398 *	@pt: packet type declaration
 399 *
 400 *	Remove a protocol handler that was previously added to the kernel
 401 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 402 *	from the kernel lists and can be freed or reused once this function
 403 *	returns.
 404 *
 405 *      The packet type might still be in use by receivers
 406 *	and must not be freed until after all the CPU's have gone
 407 *	through a quiescent state.
 408 */
 409void __dev_remove_pack(struct packet_type *pt)
 410{
 411	struct list_head *head = ptype_head(pt);
 412	struct packet_type *pt1;
 413
 414	spin_lock(&ptype_lock);
 415
 416	list_for_each_entry(pt1, head, list) {
 417		if (pt == pt1) {
 418			list_del_rcu(&pt->list);
 419			goto out;
 420		}
 421	}
 422
 423	pr_warn("dev_remove_pack: %p not found\n", pt);
 424out:
 425	spin_unlock(&ptype_lock);
 426}
 427EXPORT_SYMBOL(__dev_remove_pack);
 428
 429/**
 430 *	dev_remove_pack	 - remove packet handler
 431 *	@pt: packet type declaration
 432 *
 433 *	Remove a protocol handler that was previously added to the kernel
 434 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 435 *	from the kernel lists and can be freed or reused once this function
 436 *	returns.
 437 *
 438 *	This call sleeps to guarantee that no CPU is looking at the packet
 439 *	type after return.
 440 */
 441void dev_remove_pack(struct packet_type *pt)
 442{
 443	__dev_remove_pack(pt);
 444
 445	synchronize_net();
 446}
 447EXPORT_SYMBOL(dev_remove_pack);
 448
 449
 450/**
 451 *	dev_add_offload - register offload handlers
 452 *	@po: protocol offload declaration
 453 *
 454 *	Add protocol offload handlers to the networking stack. The passed
 455 *	&proto_offload is linked into kernel lists and may not be freed until
 456 *	it has been removed from the kernel lists.
 457 *
 458 *	This call does not sleep therefore it can not
 459 *	guarantee all CPU's that are in middle of receiving packets
 460 *	will see the new offload handlers (until the next received packet).
 461 */
 462void dev_add_offload(struct packet_offload *po)
 463{
 464	struct list_head *head = &offload_base;
 465
 466	spin_lock(&offload_lock);
 467	list_add_rcu(&po->list, head);
 468	spin_unlock(&offload_lock);
 469}
 470EXPORT_SYMBOL(dev_add_offload);
 471
 472/**
 473 *	__dev_remove_offload	 - remove offload handler
 474 *	@po: packet offload declaration
 475 *
 476 *	Remove a protocol offload handler that was previously added to the
 477 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 478 *	is removed from the kernel lists and can be freed or reused once this
 479 *	function returns.
 480 *
 481 *      The packet type might still be in use by receivers
 482 *	and must not be freed until after all the CPU's have gone
 483 *	through a quiescent state.
 484 */
 485static void __dev_remove_offload(struct packet_offload *po)
 486{
 487	struct list_head *head = &offload_base;
 488	struct packet_offload *po1;
 489
 490	spin_lock(&offload_lock);
 491
 492	list_for_each_entry(po1, head, list) {
 493		if (po == po1) {
 494			list_del_rcu(&po->list);
 495			goto out;
 496		}
 497	}
 498
 499	pr_warn("dev_remove_offload: %p not found\n", po);
 500out:
 501	spin_unlock(&offload_lock);
 502}
 503
 504/**
 505 *	dev_remove_offload	 - remove packet offload handler
 506 *	@po: packet offload declaration
 507 *
 508 *	Remove a packet offload handler that was previously added to the kernel
 509 *	offload handlers by dev_add_offload(). The passed &offload_type is
 510 *	removed from the kernel lists and can be freed or reused once this
 511 *	function returns.
 512 *
 513 *	This call sleeps to guarantee that no CPU is looking at the packet
 514 *	type after return.
 515 */
 516void dev_remove_offload(struct packet_offload *po)
 517{
 518	__dev_remove_offload(po);
 519
 520	synchronize_net();
 521}
 522EXPORT_SYMBOL(dev_remove_offload);
 523
 524/******************************************************************************
 525
 526		      Device Boot-time Settings Routines
 527
 528*******************************************************************************/
 529
 530/* Boot time configuration table */
 531static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 532
 533/**
 534 *	netdev_boot_setup_add	- add new setup entry
 535 *	@name: name of the device
 536 *	@map: configured settings for the device
 537 *
 538 *	Adds new setup entry to the dev_boot_setup list.  The function
 539 *	returns 0 on error and 1 on success.  This is a generic routine to
 540 *	all netdevices.
 541 */
 542static int netdev_boot_setup_add(char *name, struct ifmap *map)
 543{
 544	struct netdev_boot_setup *s;
 545	int i;
 546
 547	s = dev_boot_setup;
 548	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 549		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 550			memset(s[i].name, 0, sizeof(s[i].name));
 551			strlcpy(s[i].name, name, IFNAMSIZ);
 552			memcpy(&s[i].map, map, sizeof(s[i].map));
 553			break;
 554		}
 555	}
 556
 557	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 558}
 559
 560/**
 561 *	netdev_boot_setup_check	- check boot time settings
 562 *	@dev: the netdevice
 563 *
 564 * 	Check boot time settings for the device.
 565 *	The found settings are set for the device to be used
 566 *	later in the device probing.
 567 *	Returns 0 if no settings found, 1 if they are.
 568 */
 569int netdev_boot_setup_check(struct net_device *dev)
 570{
 571	struct netdev_boot_setup *s = dev_boot_setup;
 572	int i;
 573
 574	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 575		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 576		    !strcmp(dev->name, s[i].name)) {
 577			dev->irq 	= s[i].map.irq;
 578			dev->base_addr 	= s[i].map.base_addr;
 579			dev->mem_start 	= s[i].map.mem_start;
 580			dev->mem_end 	= s[i].map.mem_end;
 581			return 1;
 582		}
 583	}
 584	return 0;
 585}
 586EXPORT_SYMBOL(netdev_boot_setup_check);
 587
 588
 589/**
 590 *	netdev_boot_base	- get address from boot time settings
 591 *	@prefix: prefix for network device
 592 *	@unit: id for network device
 593 *
 594 * 	Check boot time settings for the base address of device.
 595 *	The found settings are set for the device to be used
 596 *	later in the device probing.
 597 *	Returns 0 if no settings found.
 598 */
 599unsigned long netdev_boot_base(const char *prefix, int unit)
 600{
 601	const struct netdev_boot_setup *s = dev_boot_setup;
 602	char name[IFNAMSIZ];
 603	int i;
 604
 605	sprintf(name, "%s%d", prefix, unit);
 606
 607	/*
 608	 * If device already registered then return base of 1
 609	 * to indicate not to probe for this interface
 610	 */
 611	if (__dev_get_by_name(&init_net, name))
 612		return 1;
 613
 614	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 615		if (!strcmp(name, s[i].name))
 616			return s[i].map.base_addr;
 617	return 0;
 618}
 619
 620/*
 621 * Saves at boot time configured settings for any netdevice.
 622 */
 623int __init netdev_boot_setup(char *str)
 624{
 625	int ints[5];
 626	struct ifmap map;
 627
 628	str = get_options(str, ARRAY_SIZE(ints), ints);
 629	if (!str || !*str)
 630		return 0;
 631
 632	/* Save settings */
 633	memset(&map, 0, sizeof(map));
 634	if (ints[0] > 0)
 635		map.irq = ints[1];
 636	if (ints[0] > 1)
 637		map.base_addr = ints[2];
 638	if (ints[0] > 2)
 639		map.mem_start = ints[3];
 640	if (ints[0] > 3)
 641		map.mem_end = ints[4];
 642
 643	/* Add new entry to the list */
 644	return netdev_boot_setup_add(str, &map);
 645}
 646
 647__setup("netdev=", netdev_boot_setup);
 648
 649/*******************************************************************************
 650
 651			    Device Interface Subroutines
 652
 653*******************************************************************************/
 654
 655/**
 656 *	__dev_get_by_name	- find a device by its name
 657 *	@net: the applicable net namespace
 658 *	@name: name to find
 659 *
 660 *	Find an interface by name. Must be called under RTNL semaphore
 661 *	or @dev_base_lock. If the name is found a pointer to the device
 662 *	is returned. If the name is not found then %NULL is returned. The
 663 *	reference counters are not incremented so the caller must be
 664 *	careful with locks.
 665 */
 666
 667struct net_device *__dev_get_by_name(struct net *net, const char *name)
 668{
 
 669	struct net_device *dev;
 670	struct hlist_head *head = dev_name_hash(net, name);
 671
 672	hlist_for_each_entry(dev, head, name_hlist)
 673		if (!strncmp(dev->name, name, IFNAMSIZ))
 674			return dev;
 675
 676	return NULL;
 677}
 678EXPORT_SYMBOL(__dev_get_by_name);
 679
 680/**
 681 *	dev_get_by_name_rcu	- find a device by its name
 682 *	@net: the applicable net namespace
 683 *	@name: name to find
 684 *
 685 *	Find an interface by name.
 686 *	If the name is found a pointer to the device is returned.
 687 * 	If the name is not found then %NULL is returned.
 688 *	The reference counters are not incremented so the caller must be
 689 *	careful with locks. The caller must hold RCU lock.
 690 */
 691
 692struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 693{
 
 694	struct net_device *dev;
 695	struct hlist_head *head = dev_name_hash(net, name);
 696
 697	hlist_for_each_entry_rcu(dev, head, name_hlist)
 698		if (!strncmp(dev->name, name, IFNAMSIZ))
 699			return dev;
 700
 701	return NULL;
 702}
 703EXPORT_SYMBOL(dev_get_by_name_rcu);
 704
 705/**
 706 *	dev_get_by_name		- find a device by its name
 707 *	@net: the applicable net namespace
 708 *	@name: name to find
 709 *
 710 *	Find an interface by name. This can be called from any
 711 *	context and does its own locking. The returned handle has
 712 *	the usage count incremented and the caller must use dev_put() to
 713 *	release it when it is no longer needed. %NULL is returned if no
 714 *	matching device is found.
 715 */
 716
 717struct net_device *dev_get_by_name(struct net *net, const char *name)
 718{
 719	struct net_device *dev;
 720
 721	rcu_read_lock();
 722	dev = dev_get_by_name_rcu(net, name);
 723	if (dev)
 724		dev_hold(dev);
 725	rcu_read_unlock();
 726	return dev;
 727}
 728EXPORT_SYMBOL(dev_get_by_name);
 729
 730/**
 731 *	__dev_get_by_index - find a device by its ifindex
 732 *	@net: the applicable net namespace
 733 *	@ifindex: index of device
 734 *
 735 *	Search for an interface by index. Returns %NULL if the device
 736 *	is not found or a pointer to the device. The device has not
 737 *	had its reference counter increased so the caller must be careful
 738 *	about locking. The caller must hold either the RTNL semaphore
 739 *	or @dev_base_lock.
 740 */
 741
 742struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 743{
 
 744	struct net_device *dev;
 745	struct hlist_head *head = dev_index_hash(net, ifindex);
 746
 747	hlist_for_each_entry(dev, head, index_hlist)
 748		if (dev->ifindex == ifindex)
 749			return dev;
 750
 751	return NULL;
 752}
 753EXPORT_SYMBOL(__dev_get_by_index);
 754
 755/**
 756 *	dev_get_by_index_rcu - find a device by its ifindex
 757 *	@net: the applicable net namespace
 758 *	@ifindex: index of device
 759 *
 760 *	Search for an interface by index. Returns %NULL if the device
 761 *	is not found or a pointer to the device. The device has not
 762 *	had its reference counter increased so the caller must be careful
 763 *	about locking. The caller must hold RCU lock.
 764 */
 765
 766struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 767{
 
 768	struct net_device *dev;
 769	struct hlist_head *head = dev_index_hash(net, ifindex);
 770
 771	hlist_for_each_entry_rcu(dev, head, index_hlist)
 772		if (dev->ifindex == ifindex)
 773			return dev;
 774
 775	return NULL;
 776}
 777EXPORT_SYMBOL(dev_get_by_index_rcu);
 778
 779
 780/**
 781 *	dev_get_by_index - find a device by its ifindex
 782 *	@net: the applicable net namespace
 783 *	@ifindex: index of device
 784 *
 785 *	Search for an interface by index. Returns NULL if the device
 786 *	is not found or a pointer to the device. The device returned has
 787 *	had a reference added and the pointer is safe until the user calls
 788 *	dev_put to indicate they have finished with it.
 789 */
 790
 791struct net_device *dev_get_by_index(struct net *net, int ifindex)
 792{
 793	struct net_device *dev;
 794
 795	rcu_read_lock();
 796	dev = dev_get_by_index_rcu(net, ifindex);
 797	if (dev)
 798		dev_hold(dev);
 799	rcu_read_unlock();
 800	return dev;
 801}
 802EXPORT_SYMBOL(dev_get_by_index);
 803
 804/**
 805 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 806 *	@net: network namespace
 807 *	@name: a pointer to the buffer where the name will be stored.
 808 *	@ifindex: the ifindex of the interface to get the name from.
 809 *
 810 *	The use of raw_seqcount_begin() and cond_resched() before
 811 *	retrying is required as we want to give the writers a chance
 812 *	to complete when CONFIG_PREEMPT is not set.
 813 */
 814int netdev_get_name(struct net *net, char *name, int ifindex)
 815{
 816	struct net_device *dev;
 817	unsigned int seq;
 818
 819retry:
 820	seq = raw_seqcount_begin(&devnet_rename_seq);
 821	rcu_read_lock();
 822	dev = dev_get_by_index_rcu(net, ifindex);
 823	if (!dev) {
 824		rcu_read_unlock();
 825		return -ENODEV;
 826	}
 827
 828	strcpy(name, dev->name);
 829	rcu_read_unlock();
 830	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 831		cond_resched();
 832		goto retry;
 833	}
 834
 835	return 0;
 836}
 837
 838/**
 839 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 840 *	@net: the applicable net namespace
 841 *	@type: media type of device
 842 *	@ha: hardware address
 843 *
 844 *	Search for an interface by MAC address. Returns NULL if the device
 845 *	is not found or a pointer to the device.
 846 *	The caller must hold RCU or RTNL.
 847 *	The returned device has not had its ref count increased
 848 *	and the caller must therefore be careful about locking
 849 *
 850 */
 851
 852struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 853				       const char *ha)
 854{
 855	struct net_device *dev;
 856
 857	for_each_netdev_rcu(net, dev)
 858		if (dev->type == type &&
 859		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 860			return dev;
 861
 862	return NULL;
 863}
 864EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 865
 866struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 867{
 868	struct net_device *dev;
 869
 870	ASSERT_RTNL();
 871	for_each_netdev(net, dev)
 872		if (dev->type == type)
 873			return dev;
 874
 875	return NULL;
 876}
 877EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 878
 879struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 880{
 881	struct net_device *dev, *ret = NULL;
 882
 883	rcu_read_lock();
 884	for_each_netdev_rcu(net, dev)
 885		if (dev->type == type) {
 886			dev_hold(dev);
 887			ret = dev;
 888			break;
 889		}
 890	rcu_read_unlock();
 891	return ret;
 892}
 893EXPORT_SYMBOL(dev_getfirstbyhwtype);
 894
 895/**
 896 *	dev_get_by_flags_rcu - find any device with given flags
 897 *	@net: the applicable net namespace
 898 *	@if_flags: IFF_* values
 899 *	@mask: bitmask of bits in if_flags to check
 900 *
 901 *	Search for any interface with the given flags. Returns NULL if a device
 902 *	is not found or a pointer to the device. Must be called inside
 903 *	rcu_read_lock(), and result refcount is unchanged.
 904 */
 905
 906struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 907				    unsigned short mask)
 908{
 909	struct net_device *dev, *ret;
 910
 911	ret = NULL;
 912	for_each_netdev_rcu(net, dev) {
 913		if (((dev->flags ^ if_flags) & mask) == 0) {
 914			ret = dev;
 915			break;
 916		}
 917	}
 918	return ret;
 919}
 920EXPORT_SYMBOL(dev_get_by_flags_rcu);
 921
 922/**
 923 *	dev_valid_name - check if name is okay for network device
 924 *	@name: name string
 925 *
 926 *	Network device names need to be valid file names to
 927 *	to allow sysfs to work.  We also disallow any kind of
 928 *	whitespace.
 929 */
 930bool dev_valid_name(const char *name)
 931{
 932	if (*name == '\0')
 933		return false;
 934	if (strlen(name) >= IFNAMSIZ)
 935		return false;
 936	if (!strcmp(name, ".") || !strcmp(name, ".."))
 937		return false;
 938
 939	while (*name) {
 940		if (*name == '/' || isspace(*name))
 941			return false;
 942		name++;
 943	}
 944	return true;
 945}
 946EXPORT_SYMBOL(dev_valid_name);
 947
 948/**
 949 *	__dev_alloc_name - allocate a name for a device
 950 *	@net: network namespace to allocate the device name in
 951 *	@name: name format string
 952 *	@buf:  scratch buffer and result name string
 953 *
 954 *	Passed a format string - eg "lt%d" it will try and find a suitable
 955 *	id. It scans list of devices to build up a free map, then chooses
 956 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 957 *	while allocating the name and adding the device in order to avoid
 958 *	duplicates.
 959 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 960 *	Returns the number of the unit assigned or a negative errno code.
 *
 *	If @name contains no '%' within its first IFNAMSIZ-1 bytes, no scan
 *	is done: the unit number stays 0 and @name is used as given.
 *
 *	Errors: -EINVAL if the '%' is not followed by 'd' or a second '%'
 *	appears, -ENOMEM if the bitmap page cannot be allocated, -ENFILE if
 *	the resulting name is already in use in @net.
 961 */
 962
 963static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 964{
 965	int i = 0;
 966	const char *p;
 967	const int max_netdevices = 8*PAGE_SIZE;
 968	unsigned long *inuse;
 969	struct net_device *d;
 970
 971	p = strnchr(name, IFNAMSIZ-1, '%');
 972	if (p) {
 973		/*
 974		 * Verify the string as this thing may have come from
 975		 * the user.  There must be exactly one "%d" and no other "%"
 976		 * characters.
 977		 */
 978		if (p[1] != 'd' || strchr(p + 2, '%'))
 979			return -EINVAL;
 980
 981		/* Use one page as a bit array of possible slots */
 982		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 983		if (!inuse)
 984			return -ENOMEM;
 985
 986		for_each_netdev(net, d) {
			/* Skip devices whose name does not parse against the format. */
 987			if (!sscanf(d->name, name, &i))
 988				continue;
			/* Unit numbers outside the bitmap cannot be recorded. */
 989			if (i < 0 || i >= max_netdevices)
 990				continue;
 991
 992			/*  avoid cases where sscanf is not exact inverse of printf */
 993			snprintf(buf, IFNAMSIZ, name, i);
 994			if (!strncmp(buf, d->name, IFNAMSIZ))
 995				set_bit(i, inuse);
 996		}
 997
		/* Lowest unit number not claimed by any existing device. */
 998		i = find_first_zero_bit(inuse, max_netdevices);
 999		free_page((unsigned long) inuse);
1000	}
1001
	/* Do not format a string into itself when the caller passed buf == name. */
1002	if (buf != name)
1003		snprintf(buf, IFNAMSIZ, name, i);
	/* Final check: the name actually produced must not exist yet. */
1004	if (!__dev_get_by_name(net, buf))
1005		return i;
1006
1007	/* It is possible to run out of possible slots
1008	 * when the name is long and there isn't enough space left
1009	 * for the digits, or if all bits are used.
1010	 */
1011	return -ENFILE;
1012}
1013
1014/**
1015 *	dev_alloc_name - allocate a name for a device
1016 *	@dev: device
1017 *	@name: name format string
1018 *
1019 *	Passed a format string - eg "lt%d" it will try and find a suitable
1020 *	id. It scans list of devices to build up a free map, then chooses
1021 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1022 *	while allocating the name and adding the device in order to avoid
1023 *	duplicates.
1024 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1025 *	Returns the number of the unit assigned or a negative errno code.
1026 */
1027
1028int dev_alloc_name(struct net_device *dev, const char *name)
1029{
1030	char buf[IFNAMSIZ];
1031	struct net *net;
1032	int ret;
1033
1034	BUG_ON(!dev_net(dev));
1035	net = dev_net(dev);
1036	ret = __dev_alloc_name(net, name, buf);
1037	if (ret >= 0)
1038		strlcpy(dev->name, buf, IFNAMSIZ);
1039	return ret;
1040}
1041EXPORT_SYMBOL(dev_alloc_name);
1042
1043static int dev_alloc_name_ns(struct net *net,
1044			     struct net_device *dev,
1045			     const char *name)
1046{
1047	char buf[IFNAMSIZ];
1048	int ret;
1049
1050	ret = __dev_alloc_name(net, name, buf);
1051	if (ret >= 0)
1052		strlcpy(dev->name, buf, IFNAMSIZ);
1053	return ret;
1054}
1055
1056static int dev_get_valid_name(struct net *net,
1057			      struct net_device *dev,
1058			      const char *name)
1059{
1060	BUG_ON(!net);
1061
1062	if (!dev_valid_name(name))
1063		return -EINVAL;
1064
1065	if (strchr(name, '%'))
1066		return dev_alloc_name_ns(net, dev, name);
1067	else if (__dev_get_by_name(net, name))
1068		return -EEXIST;
1069	else if (dev->name != name)
1070		strlcpy(dev->name, name, IFNAMSIZ);
1071
1072	return 0;
1073}
1074
1075/**
1076 *	dev_change_name - change name of a device
1077 *	@dev: device
1078 *	@newname: name (or format string); the buffer must be at least
 *		IFNAMSIZ bytes long because the rollback path copies
 *		IFNAMSIZ bytes out of it
1079 *
1080 *	Change name of a device, can pass format strings "eth%d"
1081 *	for wildcarding.
 *
 *	Must be called with RTNL held.  Returns -EBUSY if the device is up,
 *	0 if @newname already equals the current name, and otherwise the
 *	error from name validation or device_rename().  If a
 *	NETDEV_CHANGENAME notifier rejects the change, the old name is
 *	restored and the notifier's errno is returned.
1082 */
1083int dev_change_name(struct net_device *dev, const char *newname)
1084{
1085	char oldname[IFNAMSIZ];
1086	int err = 0;
1087	int ret;
1088	struct net *net;
1089
1090	ASSERT_RTNL();
1091	BUG_ON(!dev_net(dev));
1092
1093	net = dev_net(dev);
1094	if (dev->flags & IFF_UP)
1095		return -EBUSY;
1096
1097	write_seqcount_begin(&devnet_rename_seq);
1098
1099	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1100		write_seqcount_end(&devnet_rename_seq);
1101		return 0;
1102	}
1103
	/* Keep the current name so a failed rename can be undone. */
1104	memcpy(oldname, dev->name, IFNAMSIZ);
1105
	/* On success this has already written the new name into dev->name. */
1106	err = dev_get_valid_name(net, dev, newname);
1107	if (err < 0) {
1108		write_seqcount_end(&devnet_rename_seq);
1109		return err;
1110	}
1111
	/*
	 * Entered a second time (via goto) when a notifier vetoes the
	 * change; at that point dev->name holds the old name again and
	 * oldname holds the rejected one.
	 */
1112rollback:
1113	ret = device_rename(&dev->dev, dev->name);
1114	if (ret) {
1115		memcpy(dev->name, oldname, IFNAMSIZ);
1116		write_seqcount_end(&devnet_rename_seq);
1117		return ret;
1118	}
1119
1120	write_seqcount_end(&devnet_rename_seq);
1121
1122	netdev_adjacent_rename_links(dev, oldname);
1123
	/* Move the device to the name-hash bucket of its new name. */
1124	write_lock_bh(&dev_base_lock);
1125	hlist_del_rcu(&dev->name_hlist);
1126	write_unlock_bh(&dev_base_lock);
1127
1128	synchronize_rcu();
1129
1130	write_lock_bh(&dev_base_lock);
1131	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1132	write_unlock_bh(&dev_base_lock);
1133
1134	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1135	ret = notifier_to_errno(ret);
1136
1137	if (ret) {
1138		/* err >= 0 after dev_alloc_name() or stores the first errno */
1139		if (err >= 0) {
			/*
			 * First veto: remember the errno, swap the names
			 * back and run the rename sequence once more.
			 */
1140			err = ret;
1141			write_seqcount_begin(&devnet_rename_seq);
1142			memcpy(dev->name, oldname, IFNAMSIZ);
1143			memcpy(oldname, newname, IFNAMSIZ);
1144			goto rollback;
1145		} else {
			/* The rollback itself was vetoed; only report it. */
1146			pr_err("%s: name change rollback failed: %d\n",
1147			       dev->name, ret);
1148		}
1149	}
1150
1151	return err;
1152}
1153
1154/**
1155 *	dev_set_alias - change ifalias of a device
1156 *	@dev: device
1157 *	@alias: name up to IFALIASZ
1158 *	@len: limit of bytes to copy from info
1159 *
1160 *	Set ifalias for a device,
1161 */
1162int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1163{
1164	char *new_ifalias;
1165
1166	ASSERT_RTNL();
1167
1168	if (len >= IFALIASZ)
1169		return -EINVAL;
1170
1171	if (!len) {
1172		kfree(dev->ifalias);
1173		dev->ifalias = NULL;
 
 
1174		return 0;
1175	}
1176
1177	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1178	if (!new_ifalias)
1179		return -ENOMEM;
1180	dev->ifalias = new_ifalias;
1181
1182	strlcpy(dev->ifalias, alias, len+1);
1183	return len;
1184}
1185
1186
1187/**
1188 *	netdev_features_change - device changes features
1189 *	@dev: device to cause notification
1190 *
1191 *	Called to indicate a device has changed features.
1192 */
1193void netdev_features_change(struct net_device *dev)
1194{
1195	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1196}
1197EXPORT_SYMBOL(netdev_features_change);
1198
1199/**
1200 *	netdev_state_change - device changes state
1201 *	@dev: device to cause notification
1202 *
1203 *	Called to indicate a device has changed state. This function calls
1204 *	the notifier chains for netdev_chain and sends a NEWLINK message
1205 *	to the routing socket.
1206 */
1207void netdev_state_change(struct net_device *dev)
1208{
1209	if (dev->flags & IFF_UP) {
1210		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1211		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1212	}
1213}
1214EXPORT_SYMBOL(netdev_state_change);
1215
 
 
 
 
 
 
1216/**
1217 * 	netdev_notify_peers - notify network peers about existence of @dev
1218 * 	@dev: network device
 
1219 *
1220 * Generate traffic such that interested network peers are aware of
1221 * @dev, such as by generating a gratuitous ARP. This may be used when
1222 * a device wants to inform the rest of the network about some sort of
1223 * reconfiguration such as a failover event or virtual machine
1224 * migration.
1225 */
1226void netdev_notify_peers(struct net_device *dev)
 
1227{
1228	rtnl_lock();
1229	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1230	rtnl_unlock();
 
 
 
 
 
 
 
 
 
 
 
 
1231}
1232EXPORT_SYMBOL(netdev_notify_peers);
1233
1234static int __dev_open(struct net_device *dev)
1235{
1236	const struct net_device_ops *ops = dev->netdev_ops;
1237	int ret;
1238
1239	ASSERT_RTNL();
1240
1241	if (!netif_device_present(dev))
1242		return -ENODEV;
1243
1244	/* Block netpoll from trying to do any rx path servicing.
1245	 * If we don't do this there is a chance ndo_poll_controller
1246	 * or ndo_poll may be running while we open the device
1247	 */
1248	netpoll_poll_disable(dev);
1249
1250	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1251	ret = notifier_to_errno(ret);
1252	if (ret)
1253		return ret;
1254
1255	set_bit(__LINK_STATE_START, &dev->state);
1256
1257	if (ops->ndo_validate_addr)
1258		ret = ops->ndo_validate_addr(dev);
1259
1260	if (!ret && ops->ndo_open)
1261		ret = ops->ndo_open(dev);
1262
1263	netpoll_poll_enable(dev);
1264
1265	if (ret)
1266		clear_bit(__LINK_STATE_START, &dev->state);
1267	else {
1268		dev->flags |= IFF_UP;
1269		net_dmaengine_get();
1270		dev_set_rx_mode(dev);
1271		dev_activate(dev);
1272		add_device_randomness(dev->dev_addr, dev->addr_len);
1273	}
1274
1275	return ret;
1276}
1277
1278/**
1279 *	dev_open	- prepare an interface for use.
1280 *	@dev:	device to open
1281 *
1282 *	Takes a device from down to up state. The device's private open
1283 *	function is invoked and then the multicast lists are loaded. Finally
1284 *	the device is moved into the up state and a %NETDEV_UP message is
1285 *	sent to the netdev notifier chain.
1286 *
1287 *	Calling this function on an active interface is a nop. On a failure
1288 *	a negative errno code is returned.
1289 */
1290int dev_open(struct net_device *dev)
1291{
1292	int ret;
1293
1294	if (dev->flags & IFF_UP)
1295		return 0;
1296
1297	ret = __dev_open(dev);
1298	if (ret < 0)
1299		return ret;
1300
1301	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1302	call_netdevice_notifiers(NETDEV_UP, dev);
1303
1304	return ret;
1305}
1306EXPORT_SYMBOL(dev_open);
1307
/*
 * __dev_close_many - take down every device linked on @head
 * @head: list of devices, chained through their close_list member
 *
 * Works in three passes: first each device gets NETDEV_GOING_DOWN and has
 * __LINK_STATE_START cleared, then all of them are deactivated together,
 * and finally each driver's ndo_stop is called and IFF_UP is cleared.
 * Netpoll is disabled for a device from the first pass until the end of
 * the last one.  Must be called with RTNL held; may sleep.
 *
 * Always returns 0.
 */
1308static int __dev_close_many(struct list_head *head)
1309{
1310	struct net_device *dev;
1311
1312	ASSERT_RTNL();
1313	might_sleep();
1314
1315	list_for_each_entry(dev, head, close_list) {
1316		/* Temporarily disable netpoll until the interface is down */
1317		netpoll_poll_disable(dev);
1318
1319		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1320
1321		clear_bit(__LINK_STATE_START, &dev->state);
1322
1323		/* Synchronize to scheduled poll. We cannot touch poll list, it
1324		 * can be even on different cpu. So just clear netif_running().
1325		 *
1326		 * dev->stop() will invoke napi_disable() on all of it's
1327		 * napi_struct instances on this device.
1328		 */
1329		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1330	}
1331
	/* Deactivate the whole batch in one call rather than per device. */
1332	dev_deactivate_many(head);
1333
1334	list_for_each_entry(dev, head, close_list) {
1335		const struct net_device_ops *ops = dev->netdev_ops;
1336
1337		/*
1338		 *	Call the device specific close. This cannot fail.
1339		 *	Only if device is UP
1340		 *
1341		 *	We allow it to be called even after a DETACH hot-plug
1342		 *	event.
1343		 */
1344		if (ops->ndo_stop)
1345			ops->ndo_stop(dev);
1346
1347		dev->flags &= ~IFF_UP;
1348		net_dmaengine_put();
		/* Pairs with the netpoll_poll_disable() in the first pass. */
1349		netpoll_poll_enable(dev);
1350	}
1351
1352	return 0;
1353}
1354
1355static int __dev_close(struct net_device *dev)
1356{
1357	int retval;
1358	LIST_HEAD(single);
1359
1360	list_add(&dev->close_list, &single);
1361	retval = __dev_close_many(&single);
1362	list_del(&single);
1363
1364	return retval;
1365}
1366
1367static int dev_close_many(struct list_head *head)
1368{
1369	struct net_device *dev, *tmp;
 
1370
1371	/* Remove the devices that don't need to be closed */
1372	list_for_each_entry_safe(dev, tmp, head, close_list)
1373		if (!(dev->flags & IFF_UP))
1374			list_del_init(&dev->close_list);
1375
1376	__dev_close_many(head);
1377
1378	list_for_each_entry_safe(dev, tmp, head, close_list) {
1379		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1380		call_netdevice_notifiers(NETDEV_DOWN, dev);
1381		list_del_init(&dev->close_list);
1382	}
1383
 
 
1384	return 0;
1385}
1386
1387/**
1388 *	dev_close - shutdown an interface.
1389 *	@dev: device to shutdown
1390 *
1391 *	This function moves an active device into down state. A
1392 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1393 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1394 *	chain.
1395 */
1396int dev_close(struct net_device *dev)
1397{
1398	if (dev->flags & IFF_UP) {
1399		LIST_HEAD(single);
1400
1401		list_add(&dev->close_list, &single);
1402		dev_close_many(&single);
1403		list_del(&single);
1404	}
1405	return 0;
1406}
1407EXPORT_SYMBOL(dev_close);
1408
1409
1410/**
1411 *	dev_disable_lro - disable Large Receive Offload on a device
1412 *	@dev: device
1413 *
1414 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1415 *	called under RTNL.  This is needed if received packets may be
1416 *	forwarded to another interface.
1417 */
1418void dev_disable_lro(struct net_device *dev)
1419{
1420	/*
1421	 * If we're trying to disable lro on a vlan device
1422	 * use the underlying physical device instead
1423	 */
1424	if (is_vlan_dev(dev))
1425		dev = vlan_dev_real_dev(dev);
1426
1427	/* the same for macvlan devices */
1428	if (netif_is_macvlan(dev))
1429		dev = macvlan_dev_real_dev(dev);
1430
1431	dev->wanted_features &= ~NETIF_F_LRO;
1432	netdev_update_features(dev);
1433
1434	if (unlikely(dev->features & NETIF_F_LRO))
1435		netdev_WARN(dev, "failed to disable LRO!\n");
1436}
1437EXPORT_SYMBOL(dev_disable_lro);
1438
1439static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1440				   struct net_device *dev)
1441{
1442	struct netdev_notifier_info info;
1443
1444	netdev_notifier_info_init(&info, dev);
1445	return nb->notifier_call(nb, val, &info);
1446}
1447
/*
 * While non-zero, register_netdevice_notifier() skips the replay of
 * existing devices.  Initialised to 1 here; where it is cleared is not
 * visible in this part of the file.
 */
1448static int dev_boot_phase = 1;
1449
1450/**
1451 *	register_netdevice_notifier - register a network notifier block
1452 *	@nb: notifier
1453 *
1454 *	Register a notifier to be called when network device events occur.
1455 *	The notifier passed is linked into the kernel structures and must
1456 *	not be reused until it has been unregistered. A negative errno code
1457 *	is returned on a failure.
1458 *
1459 * 	When registered all registration and up events are replayed
1460 *	to the new notifier to allow device to have a race free
1461 *	view of the network device list.
 *
 *	If the notifier rejects a replayed NETDEV_REGISTER, the devices
 *	already replayed are sent the matching down/unregister events, the
 *	notifier is removed from the chain again and the error is returned.
1462 */
1463
1464int register_netdevice_notifier(struct notifier_block *nb)
1465{
1466	struct net_device *dev;
1467	struct net_device *last;
1468	struct net *net;
1469	int err;
1470
1471	rtnl_lock();
1472	err = raw_notifier_chain_register(&netdev_chain, nb);
1473	if (err)
1474		goto unlock;
	/* No replay during the boot phase; err is 0 here. */
1475	if (dev_boot_phase)
1476		goto unlock;
1477	for_each_net(net) {
1478		for_each_netdev(net, dev) {
1479			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1480			err = notifier_to_errno(err);
1481			if (err)
1482				goto rollback;
1483
1484			if (!(dev->flags & IFF_UP))
1485				continue;
1486
			/* The result of the replayed NETDEV_UP is not checked. */
1487			call_netdevice_notifier(nb, NETDEV_UP, dev);
1488		}
1489	}
1490
1491unlock:
1492	rtnl_unlock();
1493	return err;
1494
1495rollback:
	/* dev is the device whose NETDEV_REGISTER replay was rejected. */
1496	last = dev;
1497	for_each_net(net) {
1498		for_each_netdev(net, dev) {
			/* Stop at the rejected device: undo only what came before it. */
1499			if (dev == last)
1500				goto outroll;
1501
1502			if (dev->flags & IFF_UP) {
1503				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1504							dev);
1505				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1506			}
1507			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);

1508		}
1509	}
1510
1511outroll:
1512	raw_notifier_chain_unregister(&netdev_chain, nb);
1513	goto unlock;
1514}
1515EXPORT_SYMBOL(register_netdevice_notifier);
1516
1517/**
1518 *	unregister_netdevice_notifier - unregister a network notifier block
1519 *	@nb: notifier
1520 *
1521 *	Unregister a notifier previously registered by
1522 *	register_netdevice_notifier(). The notifier is unlinked into the
1523 *	kernel structures and may then be reused. A negative errno code
1524 *	is returned on a failure.
1525 *
1526 * 	After unregistering unregister and down device events are synthesized
1527 *	for all devices on the device list to the removed notifier to remove
1528 *	the need for special case cleanup code.
1529 */
1530
1531int unregister_netdevice_notifier(struct notifier_block *nb)
1532{
1533	struct net_device *dev;
1534	struct net *net;
1535	int err;
1536
1537	rtnl_lock();
1538	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1539	if (err)
1540		goto unlock;
1541
1542	for_each_net(net) {
1543		for_each_netdev(net, dev) {
1544			if (dev->flags & IFF_UP) {
1545				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1546							dev);
1547				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1548			}
1549			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 
1550		}
1551	}
1552unlock:
1553	rtnl_unlock();
1554	return err;
1555}
1556EXPORT_SYMBOL(unregister_netdevice_notifier);
1557
1558/**
1559 *	call_netdevice_notifiers_info - call all network notifier blocks
1560 *	@val: value passed unmodified to notifier function
1561 *	@dev: net_device pointer passed unmodified to notifier function
1562 *	@info: notifier information data
1563 *
1564 *	Call all network notifier blocks.  Parameters and return value
1565 *	are as for raw_notifier_call_chain().
1566 */
1567
1568static int call_netdevice_notifiers_info(unsigned long val,
1569					 struct net_device *dev,
1570					 struct netdev_notifier_info *info)
1571{
1572	ASSERT_RTNL();
1573	netdev_notifier_info_init(info, dev);
1574	return raw_notifier_call_chain(&netdev_chain, val, info);
1575}
1576
1577/**
1578 *	call_netdevice_notifiers - call all network notifier blocks
1579 *      @val: value passed unmodified to notifier function
1580 *      @dev: net_device pointer passed unmodified to notifier function
1581 *
1582 *	Call all network notifier blocks.  Parameters and return value
1583 *	are as for raw_notifier_call_chain().
1584 */
1585
1586int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1587{
1588	struct netdev_notifier_info info;
1589
1590	return call_netdevice_notifiers_info(val, dev, &info);
1591}
1592EXPORT_SYMBOL(call_netdevice_notifiers);
1593
/* Static key tested by the timestamp helpers below; non-zero means stamp skbs. */
1594static struct static_key netstamp_needed __read_mostly;
1595#ifdef HAVE_JUMP_LABEL
1596/* We are not allowed to call static_key_slow_dec() from irq context
1597 * If net_disable_timestamp() is called from irq context, defer the
1598 * static_key_slow_dec() calls.
1599 */
1600static atomic_t netstamp_needed_deferred;
1601#endif
1602
/*
 * net_enable_timestamp - take one reference on packet timestamping.
 *
 * With jump labels, any disables that were deferred by
 * net_disable_timestamp() are settled first: one of them cancels out
 * against this enable (so the key is not incremented) and the remaining
 * ones are applied as decrements.
 */
1603void net_enable_timestamp(void)
1604{
1605#ifdef HAVE_JUMP_LABEL
	/* Claim all pending deferred disables atomically. */
1606	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1607
1608	if (deferred) {
		/* Pre-decrement: runs deferred - 1 times, see above. */
1609		while (--deferred)
1610			static_key_slow_dec(&netstamp_needed);
1611		return;
1612	}
1613#endif

1614	static_key_slow_inc(&netstamp_needed);
1615}
1616EXPORT_SYMBOL(net_enable_timestamp);
1617
1618void net_disable_timestamp(void)
1619{
1620#ifdef HAVE_JUMP_LABEL
1621	if (in_interrupt()) {
1622		atomic_inc(&netstamp_needed_deferred);
1623		return;
1624	}
1625#endif
1626	static_key_slow_dec(&netstamp_needed);
1627}
1628EXPORT_SYMBOL(net_disable_timestamp);
1629
1630static inline void net_timestamp_set(struct sk_buff *skb)
1631{
1632	skb->tstamp.tv64 = 0;
1633	if (static_key_false(&netstamp_needed))
1634		__net_timestamp(skb);
1635}
1636
/*
 * net_timestamp_check - stamp SKB if timestamping is enabled, COND is
 * true and SKB does not carry a timestamp yet.
 *
 * NOTE(review): this expands to a bare "if" statement, not a
 * do { } while (0) block, so it must not be used as the unbraced body
 * of an if/else.  The last line also ends in a continuation backslash,
 * so the line following the macro has to stay empty.
 */
1637#define net_timestamp_check(COND, SKB)			\
1638	if (static_key_false(&netstamp_needed)) {		\
1639		if ((COND) && !(SKB)->tstamp.tv64)	\
1640			__net_timestamp(SKB);		\
1641	}						\
1642
1643bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1644{
1645	unsigned int len;
1646
1647	if (!(dev->flags & IFF_UP))
1648		return false;
1649
1650	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1651	if (skb->len <= len)
1652		return true;
1653
1654	/* if TSO is enabled, we don't care about the length as the packet
1655	 * could be forwarded without being segmented before
1656	 */
1657	if (skb_is_gso(skb))
1658		return true;
1659
1660	return false;
1661}
1662EXPORT_SYMBOL_GPL(is_skb_forwardable);
1663
1664/**
1665 * dev_forward_skb - loopback an skb to another netif
1666 *
1667 * @dev: destination network device
1668 * @skb: buffer to forward
1669 *
1670 * return values:
1671 *	NET_RX_SUCCESS	(no congestion)
1672 *	NET_RX_DROP     (packet was dropped, but freed)
1673 *
1674 * dev_forward_skb can be used for injecting an skb from the
1675 * start_xmit function of one device into the receive queue
1676 * of another device.
1677 *
1678 * The receiving device may be in another namespace, so
1679 * we have to clear all information in the skb that could
1680 * impact namespace isolation.
1681 */
1682int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1683{
1684	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1685		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1686			atomic_long_inc(&dev->rx_dropped);
1687			kfree_skb(skb);
1688			return NET_RX_DROP;
1689		}
1690	}
1691
 
 
 
1692	if (unlikely(!is_skb_forwardable(dev, skb))) {
1693		atomic_long_inc(&dev->rx_dropped);
1694		kfree_skb(skb);
1695		return NET_RX_DROP;
1696	}
1697
1698	skb_scrub_packet(skb, true);
 
 
 
1699	skb->protocol = eth_type_trans(skb, dev);
1700
1701	return netif_rx_internal(skb);
 
 
1702}
1703EXPORT_SYMBOL_GPL(dev_forward_skb);
1704
1705static inline int deliver_skb(struct sk_buff *skb,
1706			      struct packet_type *pt_prev,
1707			      struct net_device *orig_dev)
1708{
1709	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1710		return -ENOMEM;
1711	atomic_inc(&skb->users);
1712	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1713}
1714
1715static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1716{
1717	if (!ptype->af_packet_priv || !skb->sk)
1718		return false;
1719
1720	if (ptype->id_match)
1721		return ptype->id_match(ptype, skb->sk);
1722	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1723		return true;
1724
1725	return false;
1726}
1727
1728/*
1729 *	Support routine. Sends outgoing frames to any network
1730 *	taps currently in use.
 *
 *	Every tap on ptype_all that is bound to @dev (or to no device) and
 *	is not the originating socket gets the same clone of @skb, marked
 *	PACKET_OUTGOING.  The clone is made when the first such tap is
 *	found; if cloning fails nothing is delivered.
1731 */
1732
1733static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1734{
1735	struct packet_type *ptype;
1736	struct sk_buff *skb2 = NULL;
1737	struct packet_type *pt_prev = NULL;
1738
1739	rcu_read_lock();
1740	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1741		/* Never send packets back to the socket
1742		 * they originated from - MvS (miquels@drinkel.ow.org)
1743		 */
1744		if ((ptype->dev == dev || !ptype->dev) &&
1745		    (!skb_loop_sk(ptype, skb))) {
			/*
			 * Delivery lags one tap behind: the previous tap is
			 * served now (with an extra reference taken by
			 * deliver_skb) and this one is remembered.
			 */
1746			if (pt_prev) {
1747				deliver_skb(skb2, pt_prev, skb->dev);
1748				pt_prev = ptype;
1749				continue;
1750			}
1751
			/* First matching tap: create the shared clone. */
1752			skb2 = skb_clone(skb, GFP_ATOMIC);
1753			if (!skb2)
1754				break;
1755
1756			net_timestamp_set(skb2);
1757
1758			/* skb->nh should be correctly
1759			   set by sender, so that the second statement is
1760			   just protection against buggy protocols.
1761			 */
1762			skb_reset_mac_header(skb2);
1763
1764			if (skb_network_header(skb2) < skb2->data ||
1765			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1766				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1767						     ntohs(skb2->protocol),
1768						     dev->name);
1769				skb_reset_network_header(skb2);
1770			}
1771
1772			skb2->transport_header = skb2->network_header;
1773			skb2->pkt_type = PACKET_OUTGOING;
1774			pt_prev = ptype;
1775		}
1776	}
	/*
	 * The last tap is called directly, without the extra reference that
	 * deliver_skb() takes - presumably so that it consumes the clone's
	 * own reference (TODO confirm against the handlers).
	 */
1777	if (pt_prev)
1778		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1779	rcu_read_unlock();
1780}
1781
1782/**
1783 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1784 * @dev: Network device
1785 * @txq: number of queues available
1786 *
1787 * If real_num_tx_queues is changed the tc mappings may no longer be
1788 * valid. To resolve this verify the tc mapping remains valid and if
1789 * not NULL the mapping. With no priorities mapping to this
1790 * offset/count pair it will no longer be used. In the worst case TC0
1791 * is invalid nothing can be done so disable priority mappings. If is
1792 * expected that drivers will fix this mapping if they can before
1793 * calling netif_set_real_num_tx_queues.
1794 */
1795static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1796{
1797	int i;
1798	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1799
1800	/* If TC0 is invalidated disable TC mapping */
1801	if (tc->offset + tc->count > txq) {
1802		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1803		dev->num_tc = 0;
1804		return;
1805	}
1806
1807	/* Invalidated prio to tc mappings set to TC0 */
1808	for (i = 1; i < TC_BITMASK + 1; i++) {
1809		int q = netdev_get_prio_tc_map(dev, i);
1810
1811		tc = &dev->tc_to_txq[q];
1812		if (tc->offset + tc->count > txq) {
1813			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1814				i, q);
1815			netdev_set_prio_tc_map(dev, i, 0);
1816		}
1817	}
1818}
1819
1820#ifdef CONFIG_XPS
/* Held by the XPS functions below while they read or modify XPS maps. */
1821static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an RCU-protected XPS pointer with xps_map_mutex held. */
1822#define xmap_dereference(P)		\
1823	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1824
/*
 * remove_xps_queue - drop queue @index from @cpu's map in @dev_maps
 *
 * If the queue is found it is removed by overwriting it with the last
 * entry; when it was the only entry the whole map is unpublished and
 * freed after an RCU grace period.
 *
 * Returns the map still installed for @cpu, or NULL if there is none
 * (no @dev_maps, no map for @cpu, or the map was just freed).
 */
1825static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1826					int cpu, u16 index)
1827{
1828	struct xps_map *map = NULL;
1829	int pos;
1830
1831	if (dev_maps)
1832		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1833
1834	for (pos = 0; map && pos < map->len; pos++) {
1835		if (map->queues[pos] == index) {
1836			if (map->len > 1) {
				/* Order is not preserved: fill the hole with the tail. */
1837				map->queues[pos] = map->queues[--map->len];
1838			} else {
1839				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1840				kfree_rcu(map, rcu);
1841				map = NULL;
1842			}
1843			break;
1844		}
1845	}
1846
1847	return map;
1848}
1849
/*
 * netif_reset_xps_queues_gt - remove tx queues @index and above from XPS
 *
 * Strips queues @index .. num_tx_queues - 1 from every possible CPU's
 * map, frees the device's map table if no CPU is found to still have a
 * map, and resets the NUMA node of those queues to NUMA_NO_NODE.
 * Does nothing if the device has no XPS maps.
 */
1850static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1851{
1852	struct xps_dev_maps *dev_maps;
1853	int cpu, i;
1854	bool active = false;
1855
1856	mutex_lock(&xps_map_mutex);
1857	dev_maps = xmap_dereference(dev->xps_maps);
1858
1859	if (!dev_maps)
1860		goto out_no_maps;
1861
1862	for_each_possible_cpu(cpu) {
1863		for (i = index; i < dev->num_tx_queues; i++) {
			/* NULL means this CPU has no map (any more): stop early. */
1864			if (!remove_xps_queue(dev_maps, cpu, i))
1865				break;
1866		}
		/* The loop ran to completion, so this CPU still has a map. */
1867		if (i == dev->num_tx_queues)
1868			active = true;
1869	}
1870
1871	if (!active) {
1872		RCU_INIT_POINTER(dev->xps_maps, NULL);
1873		kfree_rcu(dev_maps, rcu);
1874	}
1875
1876	for (i = index; i < dev->num_tx_queues; i++)
1877		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1878					     NUMA_NO_NODE);
1879
1880out_no_maps:
1881	mutex_unlock(&xps_map_mutex);
1882}
1883
/*
 * expand_xps_map - make sure a map has room for queue @index
 * @map: current map of @cpu, may be NULL
 * @cpu: CPU the map belongs to; selects the NUMA node of a new allocation
 * @index: queue that is about to be added
 *
 * Returns @map itself if it already contains @index or still has a free
 * slot.  Otherwise a new map is allocated (XPS_MIN_MAP_ALLOC entries if
 * there was no map, twice the old capacity if there was) and the
 * existing entries are copied over.  @index is not added and the old
 * map is not freed here.  Returns NULL if the allocation fails.
 */
1884static struct xps_map *expand_xps_map(struct xps_map *map,
1885				      int cpu, u16 index)
1886{
1887	struct xps_map *new_map;
1888	int alloc_len = XPS_MIN_MAP_ALLOC;
1889	int i, pos;
1890
	/* After this loop pos equals map->len (or 0 when map is NULL). */
1891	for (pos = 0; map && pos < map->len; pos++) {
1892		if (map->queues[pos] != index)
1893			continue;
1894		return map;
1895	}
1896
1897	/* Need to add queue to this CPU's existing map */
1898	if (map) {
1899		if (pos < map->alloc_len)
1900			return map;
1901
1902		alloc_len = map->alloc_len * 2;
1903	}
1904
1905	/* Need to allocate new map to store queue on this CPU's map */
1906	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1907			       cpu_to_node(cpu));
1908	if (!new_map)
1909		return NULL;
1910
1911	for (i = 0; i < pos; i++)
1912		new_map->queues[i] = map->queues[i];
1913	new_map->alloc_len = alloc_len;
1914	new_map->len = pos;
1915
1916	return new_map;
1917}
1918
/*
 * netif_set_xps_queue - set the CPUs that transmit on queue @index
 * @dev: network device
 * @mask: CPUs that should use the queue
 * @index: tx queue
 *
 * Adds @index to the XPS map of every online CPU in @mask and removes it
 * from the maps of all other CPUs.  A new per-device map table is built
 * and published in place of the old one; the old table and any per-CPU
 * maps that were replaced are freed after an RCU grace period.  The
 * queue's NUMA node is set to the node shared by all selected CPUs, or
 * to NUMA_NO_NODE if there is no single such node.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails, in which case
 * the maps that were installed before the call are left in place.
 */
1919int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1920			u16 index)
1921{
1922	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1923	struct xps_map *map, *new_map;
1924	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	/* numa_node_id: -2 = no CPU seen yet, -1 = CPUs on different nodes. */
1925	int cpu, numa_node_id = -2;
1926	bool active = false;
1927
1928	mutex_lock(&xps_map_mutex);
1929
1930	dev_maps = xmap_dereference(dev->xps_maps);
1931
1932	/* allocate memory for queue storage */
1933	for_each_online_cpu(cpu) {
1934		if (!cpumask_test_cpu(cpu, mask))
1935			continue;
1936
		/* The table is allocated lazily, on the first selected CPU. */
1937		if (!new_dev_maps)
1938			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1939		if (!new_dev_maps) {
1940			mutex_unlock(&xps_map_mutex);
1941			return -ENOMEM;
1942		}
1943
1944		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1945				 NULL;
1946
		/* May return the old map unchanged if it already has room. */
1947		map = expand_xps_map(map, cpu, index);
1948		if (!map)
1949			goto error;
1950
1951		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1952	}
1953
	/* No online CPU was selected by @mask: only removal is left to do. */
1954	if (!new_dev_maps)
1955		goto out_no_new_maps;
1956
1957	for_each_possible_cpu(cpu) {
1958		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1959			/* add queue to CPU maps */
1960			int pos = 0;
1961
1962			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1963			while ((pos < map->len) && (map->queues[pos] != index))
1964				pos++;
1965
1966			if (pos == map->len)
1967				map->queues[map->len++] = index;
1968#ifdef CONFIG_NUMA
1969			if (numa_node_id == -2)
1970				numa_node_id = cpu_to_node(cpu);
1971			else if (numa_node_id != cpu_to_node(cpu))
1972				numa_node_id = -1;
1973#endif
1974		} else if (dev_maps) {
1975			/* fill in the new device map from the old device map */
1976			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1977			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1978		}
1979
1980	}
1981
	/* Publish the fully populated table. */
1982	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1983
1984	/* Cleanup old maps */
1985	if (dev_maps) {
1986		for_each_possible_cpu(cpu) {
1987			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1988			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			/* Free only maps that were replaced, not carried over. */
1989			if (map && map != new_map)
1990				kfree_rcu(map, rcu);
1991		}
1992
1993		kfree_rcu(dev_maps, rcu);
1994	}
1995
1996	dev_maps = new_dev_maps;
1997	active = true;
1998
1999out_no_new_maps:
2000	/* update Tx queue numa node */
2001	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2002				     (numa_node_id >= 0) ? numa_node_id :
2003				     NUMA_NO_NODE);
2004
2005	if (!dev_maps)
2006		goto out_no_maps;
2007
2008	/* removes queue from unused CPUs */
2009	for_each_possible_cpu(cpu) {
2010		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2011			continue;
2012
2013		if (remove_xps_queue(dev_maps, cpu, index))
2014			active = true;
2015	}
2016
2017	/* free map if not active */
2018	if (!active) {
2019		RCU_INIT_POINTER(dev->xps_maps, NULL);
2020		kfree_rcu(dev_maps, rcu);
2021	}
2022
2023out_no_maps:
2024	mutex_unlock(&xps_map_mutex);
2025
2026	return 0;
2027error:
2028	/* remove any maps that we added */
2029	for_each_possible_cpu(cpu) {
2030		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2031		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2032				 NULL;
		/* Maps shared with the old table still belong to it. */
2033		if (new_map && new_map != map)
2034			kfree(new_map);
2035	}
2036
2037	mutex_unlock(&xps_map_mutex);
2038
2039	kfree(new_dev_maps);
2040	return -ENOMEM;
2041}
2042EXPORT_SYMBOL(netif_set_xps_queue);
2043
2044#endif
2045/*
2046 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2047 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2048 */
2049int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2050{
2051	int rc;
2052
2053	if (txq < 1 || txq > dev->num_tx_queues)
2054		return -EINVAL;
2055
2056	if (dev->reg_state == NETREG_REGISTERED ||
2057	    dev->reg_state == NETREG_UNREGISTERING) {
2058		ASSERT_RTNL();
2059
2060		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2061						  txq);
2062		if (rc)
2063			return rc;
2064
2065		if (dev->num_tc)
2066			netif_setup_tc(dev, txq);
2067
2068		if (txq < dev->real_num_tx_queues) {
2069			qdisc_reset_all_tx_gt(dev, txq);
2070#ifdef CONFIG_XPS
2071			netif_reset_xps_queues_gt(dev, txq);
2072#endif
2073		}
2074	}
2075
2076	dev->real_num_tx_queues = txq;
2077	return 0;
2078}
2079EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2080
#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code: -EINVAL if @rxq is 0 or larger than
 *	num_rx_queues, or the error from updating the queue kobjects of
 *	a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	/* Only a registered device has rx queue kobjects to adjust. */
	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
2113
2114/**
2115 * netif_get_num_default_rss_queues - default number of RSS queues
2116 *
2117 * This routine should set an upper limit on the number of RSS queues
2118 * used by default by multiqueue devices.
2119 */
2120int netif_get_num_default_rss_queues(void)
2121{
2122	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2123}
2124EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2125
2126static inline void __netif_reschedule(struct Qdisc *q)
2127{
2128	struct softnet_data *sd;
2129	unsigned long flags;
2130
2131	local_irq_save(flags);
2132	sd = &__get_cpu_var(softnet_data);
2133	q->next_sched = NULL;
2134	*sd->output_queue_tailp = q;
2135	sd->output_queue_tailp = &q->next_sched;
2136	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2137	local_irq_restore(flags);
2138}
2139
2140void __netif_schedule(struct Qdisc *q)
2141{
2142	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2143		__netif_reschedule(q);
2144}
2145EXPORT_SYMBOL(__netif_schedule);
2146
2147struct dev_kfree_skb_cb {
2148	enum skb_free_reason reason;
2149};
2150
2151static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2152{
2153	return (struct dev_kfree_skb_cb *)skb->cb;
2154}
2155
2156void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2157{
2158	unsigned long flags;
2159
2160	if (likely(atomic_read(&skb->users) == 1)) {
2161		smp_rmb();
2162		atomic_set(&skb->users, 0);
2163	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2164		return;
 
2165	}
2166	get_kfree_skb_cb(skb)->reason = reason;
2167	local_irq_save(flags);
2168	skb->next = __this_cpu_read(softnet_data.completion_queue);
2169	__this_cpu_write(softnet_data.completion_queue, skb);
2170	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2171	local_irq_restore(flags);
2172}
2173EXPORT_SYMBOL(__dev_kfree_skb_irq);
2174
2175void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2176{
2177	if (in_irq() || irqs_disabled())
2178		__dev_kfree_skb_irq(skb, reason);
2179	else
2180		dev_kfree_skb(skb);
2181}
2182EXPORT_SYMBOL(__dev_kfree_skb_any);
2183
2184
2185/**
2186 * netif_device_detach - mark device as removed
2187 * @dev: network device
2188 *
2189 * Mark device as removed from system and therefore no longer available.
2190 */
2191void netif_device_detach(struct net_device *dev)
2192{
2193	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2194	    netif_running(dev)) {
2195		netif_tx_stop_all_queues(dev);
2196	}
2197}
2198EXPORT_SYMBOL(netif_device_detach);
2199
2200/**
2201 * netif_device_attach - mark device as attached
2202 * @dev: network device
2203 *
2204 * Mark device as attached from system and restart if needed.
2205 */
2206void netif_device_attach(struct net_device *dev)
2207{
2208	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2209	    netif_running(dev)) {
2210		netif_tx_wake_all_queues(dev);
2211		__netdev_watchdog_up(dev);
2212	}
2213}
2214EXPORT_SYMBOL(netif_device_attach);
2215
2216static void skb_warn_bad_offload(const struct sk_buff *skb)
2217{
2218	static const netdev_features_t null_features = 0;
2219	struct net_device *dev = skb->dev;
2220	const char *driver = "";
2221
2222	if (!net_ratelimit())
2223		return;
2224
2225	if (dev && dev->dev.parent)
2226		driver = dev_driver_string(dev->dev.parent);
2227
2228	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2229	     "gso_type=%d ip_summed=%d\n",
2230	     driver, dev ? &dev->features : &null_features,
2231	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2232	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2233	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2234}
2235
2236/*
2237 * Invalidate hardware checksum when packet is to be mangled, and
2238 * complete checksum manually on outgoing path.
2239 */
2240int skb_checksum_help(struct sk_buff *skb)
2241{
2242	__wsum csum;
2243	int ret = 0, offset;
2244
2245	if (skb->ip_summed == CHECKSUM_COMPLETE)
2246		goto out_set_summed;
2247
2248	if (unlikely(skb_shinfo(skb)->gso_size)) {
2249		skb_warn_bad_offload(skb);
2250		return -EINVAL;
2251	}
2252
2253	/* Before computing a checksum, we should make sure no frag could
2254	 * be modified by an external entity : checksum could be wrong.
2255	 */
2256	if (skb_has_shared_frag(skb)) {
2257		ret = __skb_linearize(skb);
2258		if (ret)
2259			goto out;
2260	}
2261
2262	offset = skb_checksum_start_offset(skb);
2263	BUG_ON(offset >= skb_headlen(skb));
2264	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2265
2266	offset += skb->csum_offset;
2267	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2268
2269	if (skb_cloned(skb) &&
2270	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2271		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2272		if (ret)
2273			goto out;
2274	}
2275
2276	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2277out_set_summed:
2278	skb->ip_summed = CHECKSUM_NONE;
2279out:
2280	return ret;
2281}
2282EXPORT_SYMBOL(skb_checksum_help);
2283
2284__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 
 
 
 
 
 
 
 
 
 
 
2285{
2286	unsigned int vlan_depth = skb->mac_len;
 
2287	__be16 type = skb->protocol;
 
 
2288
2289	/* Tunnel gso handlers can set protocol to ethernet. */
2290	if (type == htons(ETH_P_TEB)) {
2291		struct ethhdr *eth;
2292
2293		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2294			return 0;
2295
2296		eth = (struct ethhdr *)skb_mac_header(skb);
2297		type = eth->h_proto;
 
2298	}
2299
2300	/* if skb->protocol is 802.1Q/AD then the header should already be
2301	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2302	 * ETH_HLEN otherwise
2303	 */
2304	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2305		if (vlan_depth) {
2306			if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
2307				return 0;
2308			vlan_depth -= VLAN_HLEN;
2309		} else {
2310			vlan_depth = ETH_HLEN;
2311		}
2312		do {
2313			struct vlan_hdr *vh;
2314
2315			if (unlikely(!pskb_may_pull(skb,
2316						    vlan_depth + VLAN_HLEN)))
2317				return 0;
2318
2319			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2320			type = vh->h_vlan_encapsulated_proto;
2321			vlan_depth += VLAN_HLEN;
2322		} while (type == htons(ETH_P_8021Q) ||
2323			 type == htons(ETH_P_8021AD));
2324	}
2325
2326	*depth = vlan_depth;
2327
2328	return type;
2329}
2330
2331/**
2332 *	skb_mac_gso_segment - mac layer segmentation handler.
2333 *	@skb: buffer to segment
2334 *	@features: features for the output path (see dev->features)
2335 */
2336struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2337				    netdev_features_t features)
2338{
2339	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2340	struct packet_offload *ptype;
2341	int vlan_depth = skb->mac_len;
2342	__be16 type = skb_network_protocol(skb, &vlan_depth);
2343
2344	if (unlikely(!type))
2345		return ERR_PTR(-EINVAL);
2346
2347	__skb_pull(skb, vlan_depth);
2348
2349	rcu_read_lock();
2350	list_for_each_entry_rcu(ptype, &offload_base, list) {
2351		if (ptype->type == type && ptype->callbacks.gso_segment) {
 
2352			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2353				int err;
2354
2355				err = ptype->callbacks.gso_send_check(skb);
2356				segs = ERR_PTR(err);
2357				if (err || skb_gso_ok(skb, features))
2358					break;
2359				__skb_push(skb, (skb->data -
2360						 skb_network_header(skb)));
2361			}
2362			segs = ptype->callbacks.gso_segment(skb, features);
2363			break;
2364		}
2365	}
2366	rcu_read_unlock();
2367
2368	__skb_push(skb, skb->data - skb_mac_header(skb));
2369
2370	return segs;
2371}
2372EXPORT_SYMBOL(skb_mac_gso_segment);
2373
2374
2375/* openvswitch calls this on rx path, so we need a different check.
2376 */
2377static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2378{
2379	if (tx_path)
2380		return skb->ip_summed != CHECKSUM_PARTIAL;
2381	else
2382		return skb->ip_summed == CHECKSUM_NONE;
2383}
2384
2385/**
2386 *	__skb_gso_segment - Perform segmentation on skb.
2387 *	@skb: buffer to segment
2388 *	@features: features for the output path (see dev->features)
2389 *	@tx_path: whether it is called in TX path
2390 *
2391 *	This function segments the given skb and returns a list of segments.
2392 *
2393 *	It may return NULL if the skb requires no segmentation.  This is
2394 *	only possible when GSO is used for verifying header integrity.
2395 */
2396struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2397				  netdev_features_t features, bool tx_path)
2398{
2399	if (unlikely(skb_needs_check(skb, tx_path))) {
2400		int err;
2401
2402		skb_warn_bad_offload(skb);
2403
2404		if (skb_header_cloned(skb) &&
2405		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2406			return ERR_PTR(err);
2407	}
2408
2409	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2410	SKB_GSO_CB(skb)->encap_level = 0;
2411
2412	skb_reset_mac_header(skb);
2413	skb_reset_mac_len(skb);
2414
2415	return skb_mac_gso_segment(skb, features);
2416}
2417EXPORT_SYMBOL(__skb_gso_segment);
2418
2419/* Take action when hardware reception checksum errors are detected. */
2420#ifdef CONFIG_BUG
2421void netdev_rx_csum_fault(struct net_device *dev)
2422{
2423	if (net_ratelimit()) {
2424		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2425		dump_stack();
2426	}
2427}
2428EXPORT_SYMBOL(netdev_rx_csum_fault);
2429#endif
2430
/*
 * Check whether @skb has page fragments that @dev cannot DMA from.
 *
 * Only meaningful with CONFIG_HIGHMEM; without it the answer is always 0.
 * Returns 1 when the device lacks NETIF_F_HIGHDMA and a fragment lives in
 * high memory, or when the bus uses physical addresses and a fragment page
 * ends beyond the parent device's DMA mask (or the parent has no mask).
 *
 * This check should go away as soon as we know that:
 * 1. An IOMMU is present and allows mapping all of memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *parent;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[i])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		parent = dev->dev.parent;
		if (!parent)
			return 0;

		for (i = 0; i < shinfo->nr_frags; i++) {
			skb_frag_t *frag = &shinfo->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2463
2464struct dev_gso_cb {
2465	void (*destructor)(struct sk_buff *skb);
2466};
2467
2468#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2469
2470static void dev_gso_skb_destructor(struct sk_buff *skb)
2471{
2472	struct dev_gso_cb *cb;
2473
2474	kfree_skb_list(skb->next);
2475	skb->next = NULL;
 
 
 
 
 
2476
2477	cb = DEV_GSO_CB(skb);
2478	if (cb->destructor)
2479		cb->destructor(skb);
2480}
2481
2482/**
2483 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2484 *	@skb: buffer to segment
2485 *	@features: device features as applicable to this skb
2486 *
2487 *	This function segments the given skb and stores the list of segments
2488 *	in skb->next.
2489 */
2490static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2491{
2492	struct sk_buff *segs;
2493
2494	segs = skb_gso_segment(skb, features);
2495
2496	/* Verifying header integrity only. */
2497	if (!segs)
2498		return 0;
2499
2500	if (IS_ERR(segs))
2501		return PTR_ERR(segs);
2502
2503	skb->next = segs;
2504	DEV_GSO_CB(skb)->destructor = skb->destructor;
2505	skb->destructor = dev_gso_skb_destructor;
2506
2507	return 0;
2508}
2509
2510static netdev_features_t harmonize_features(struct sk_buff *skb,
2511	netdev_features_t features)
2512{
2513	int tmp;
 
 
 
 
 
 
 
2514
2515	if (skb->ip_summed != CHECKSUM_NONE &&
2516	    !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) {
 
 
2517		features &= ~NETIF_F_ALL_CSUM;
 
2518	} else if (illegal_highdma(skb->dev, skb)) {
2519		features &= ~NETIF_F_SG;
2520	}
2521
2522	return features;
2523}
2524
2525netdev_features_t netif_skb_features(struct sk_buff *skb)
2526{
2527	__be16 protocol = skb->protocol;
2528	netdev_features_t features = skb->dev->features;
2529
2530	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2531		features &= ~NETIF_F_GSO_MASK;
2532
2533	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2534		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2535		protocol = veh->h_vlan_encapsulated_proto;
2536	} else if (!vlan_tx_tag_present(skb)) {
2537		return harmonize_features(skb, features);
2538	}
2539
2540	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2541					       NETIF_F_HW_VLAN_STAG_TX);
2542
2543	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
 
 
2544		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2545				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2546				NETIF_F_HW_VLAN_STAG_TX;
2547
2548	return harmonize_features(skb, features);
2549}
2550EXPORT_SYMBOL(netif_skb_features);
2551
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2552int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2553			struct netdev_queue *txq)
2554{
2555	const struct net_device_ops *ops = dev->netdev_ops;
2556	int rc = NETDEV_TX_OK;
2557	unsigned int skb_len;
2558
2559	if (likely(!skb->next)) {
2560		netdev_features_t features;
2561
2562		/*
2563		 * If device doesn't need skb->dst, release it right now while
2564		 * its hot in this cpu cache
2565		 */
2566		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2567			skb_dst_drop(skb);
2568
 
 
 
2569		features = netif_skb_features(skb);
2570
2571		if (vlan_tx_tag_present(skb) &&
2572		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2573			skb = __vlan_put_tag(skb, skb->vlan_proto,
2574					     vlan_tx_tag_get(skb));
2575			if (unlikely(!skb))
2576				goto out;
2577
2578			skb->vlan_tci = 0;
2579		}
2580
2581		/* If encapsulation offload request, verify we are testing
2582		 * hardware encapsulation features instead of standard
2583		 * features for the netdev
2584		 */
2585		if (skb->encapsulation)
2586			features &= dev->hw_enc_features;
2587
2588		if (netif_needs_gso(skb, features)) {
2589			if (unlikely(dev_gso_segment(skb, features)))
2590				goto out_kfree_skb;
2591			if (skb->next)
2592				goto gso;
2593		} else {
2594			if (skb_needs_linearize(skb, features) &&
2595			    __skb_linearize(skb))
2596				goto out_kfree_skb;
2597
2598			/* If packet is not checksummed and device does not
2599			 * support checksumming for this protocol, complete
2600			 * checksumming here.
2601			 */
2602			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2603				if (skb->encapsulation)
2604					skb_set_inner_transport_header(skb,
2605						skb_checksum_start_offset(skb));
2606				else
2607					skb_set_transport_header(skb,
2608						skb_checksum_start_offset(skb));
2609				if (!(features & NETIF_F_ALL_CSUM) &&
2610				     skb_checksum_help(skb))
2611					goto out_kfree_skb;
2612			}
2613		}
2614
2615		if (!list_empty(&ptype_all))
2616			dev_queue_xmit_nit(skb, dev);
2617
2618		skb_len = skb->len;
2619		trace_net_dev_start_xmit(skb, dev);
2620		rc = ops->ndo_start_xmit(skb, dev);
2621		trace_net_dev_xmit(skb, rc, dev, skb_len);
2622		if (rc == NETDEV_TX_OK)
2623			txq_trans_update(txq);
2624		return rc;
2625	}
2626
2627gso:
2628	do {
2629		struct sk_buff *nskb = skb->next;
2630
2631		skb->next = nskb->next;
2632		nskb->next = NULL;
2633
2634		if (!list_empty(&ptype_all))
2635			dev_queue_xmit_nit(nskb, dev);
 
 
 
 
2636
2637		skb_len = nskb->len;
2638		trace_net_dev_start_xmit(nskb, dev);
2639		rc = ops->ndo_start_xmit(nskb, dev);
2640		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2641		if (unlikely(rc != NETDEV_TX_OK)) {
2642			if (rc & ~NETDEV_TX_MASK)
2643				goto out_kfree_gso_skb;
2644			nskb->next = skb->next;
2645			skb->next = nskb;
2646			return rc;
2647		}
2648		txq_trans_update(txq);
2649		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2650			return NETDEV_TX_BUSY;
2651	} while (skb->next);
2652
2653out_kfree_gso_skb:
2654	if (likely(skb->next == NULL)) {
2655		skb->destructor = DEV_GSO_CB(skb)->destructor;
2656		consume_skb(skb);
2657		return rc;
2658	}
2659out_kfree_skb:
2660	kfree_skb(skb);
2661out:
2662	return rc;
2663}
2664EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2665
2666static void qdisc_pkt_len_init(struct sk_buff *skb)
 
 
 
 
 
 
 
2667{
2668	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 
 
2669
2670	qdisc_skb_cb(skb)->pkt_len = skb->len;
 
 
 
 
 
2671
2672	/* To get more precise estimation of bytes sent on wire,
2673	 * we add to pkt_len the headers size of all segments
2674	 */
2675	if (shinfo->gso_size)  {
2676		unsigned int hdr_len;
2677		u16 gso_segs = shinfo->gso_segs;
2678
2679		/* mac layer + network layer */
2680		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2681
2682		/* + transport layer */
2683		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2684			hdr_len += tcp_hdrlen(skb);
2685		else
2686			hdr_len += sizeof(struct udphdr);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2687
2688		if (shinfo->gso_type & SKB_GSO_DODGY)
2689			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2690						shinfo->gso_size);
 
 
 
 
 
 
 
 
2691
2692		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 
 
 
2693	}
 
 
 
2694}
2695
2696static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2697				 struct net_device *dev,
2698				 struct netdev_queue *txq)
2699{
2700	spinlock_t *root_lock = qdisc_lock(q);
2701	bool contended;
2702	int rc;
2703
2704	qdisc_pkt_len_init(skb);
2705	qdisc_calculate_pkt_len(skb, q);
2706	/*
2707	 * Heuristic to force contended enqueues to serialize on a
2708	 * separate lock before trying to get qdisc main lock.
2709	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2710	 * and dequeue packets faster.
2711	 */
2712	contended = qdisc_is_running(q);
2713	if (unlikely(contended))
2714		spin_lock(&q->busylock);
2715
2716	spin_lock(root_lock);
2717	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2718		kfree_skb(skb);
2719		rc = NET_XMIT_DROP;
2720	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2721		   qdisc_run_begin(q)) {
2722		/*
2723		 * This is a work-conserving queue; there are no old skbs
2724		 * waiting to be sent out; and the qdisc is not running -
2725		 * xmit the skb directly.
2726		 */
2727		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2728			skb_dst_force(skb);
2729
2730		qdisc_bstats_update(q, skb);
2731
2732		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2733			if (unlikely(contended)) {
2734				spin_unlock(&q->busylock);
2735				contended = false;
2736			}
2737			__qdisc_run(q);
2738		} else
2739			qdisc_run_end(q);
2740
2741		rc = NET_XMIT_SUCCESS;
2742	} else {
2743		skb_dst_force(skb);
2744		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2745		if (qdisc_run_begin(q)) {
2746			if (unlikely(contended)) {
2747				spin_unlock(&q->busylock);
2748				contended = false;
2749			}
2750			__qdisc_run(q);
2751		}
2752	}
2753	spin_unlock(root_lock);
2754	if (unlikely(contended))
2755		spin_unlock(&q->busylock);
2756	return rc;
2757}
2758
2759#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2760static void skb_update_prio(struct sk_buff *skb)
2761{
2762	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2763
2764	if (!skb->priority && skb->sk && map) {
2765		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2766
2767		if (prioidx < map->priomap_len)
2768			skb->priority = map->priomap[prioidx];
2769	}
2770}
2771#else
2772#define skb_update_prio(skb)
2773#endif
2774
/*
 * Per-CPU counter that __dev_queue_xmit() increments before and decrements
 * after calling dev_hard_start_xmit() on its queueless path.
 */
2775static DEFINE_PER_CPU(int, xmit_recursion);
/* __dev_queue_xmit() refuses to transmit once xmit_recursion exceeds this. */
2776#define RECURSION_LIMIT 10
2777
2778/**
2779 *	dev_loopback_xmit - loop back @skb
2780 *	@skb: buffer to transmit
2781 */
2782int dev_loopback_xmit(struct sk_buff *skb)
2783{
2784	skb_reset_mac_header(skb);
2785	__skb_pull(skb, skb_network_offset(skb));
2786	skb->pkt_type = PACKET_LOOPBACK;
2787	skb->ip_summed = CHECKSUM_UNNECESSARY;
2788	WARN_ON(!skb_dst(skb));
2789	skb_dst_force(skb);
2790	netif_rx_ni(skb);
2791	return 0;
2792}
2793EXPORT_SYMBOL(dev_loopback_xmit);
2794
2795/**
2796 *	__dev_queue_xmit - transmit a buffer
2797 *	@skb: buffer to transmit
2798 *	@accel_priv: private data used for L2 forwarding offload
2799 *
2800 *	Queue a buffer for transmission to a network device. The caller must
2801 *	have set the device and priority and built the buffer before calling
2802 *	this function. The function can be called from an interrupt.
2803 *
2804 *	A negative errno code is returned on a failure. A success does not
2805 *	guarantee the frame will be transmitted as it may be dropped due
2806 *	to congestion or traffic shaping.
2807 *
2808 * -----------------------------------------------------------------------------------
2809 *      I notice this method can also return errors from the queue disciplines,
2810 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2811 *      be positive.
2812 *
2813 *      Regardless of the return value, the skb is consumed, so it is currently
2814 *      difficult to retry a send to this method.  (You can bump the ref count
2815 *      before sending to hold a reference for retry if you are careful.)
2816 *
2817 *      When calling this method, interrupts MUST be enabled.  This is because
2818 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2819 *          --BLG
2820 */
2821static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2822{
2823	struct net_device *dev = skb->dev;
2824	struct netdev_queue *txq;
2825	struct Qdisc *q;
2826	int rc = -ENOMEM;
2827
2828	skb_reset_mac_header(skb);
2829
2830	/* Disable soft irqs for various locks below. Also
2831	 * stops preemption for RCU.
2832	 */
2833	rcu_read_lock_bh();
2834
2835	skb_update_prio(skb);
2836
2837	txq = netdev_pick_tx(dev, skb, accel_priv);
2838	q = rcu_dereference_bh(txq->qdisc);
2839
2840#ifdef CONFIG_NET_CLS_ACT
2841	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2842#endif
2843	trace_net_dev_queue(skb);
2844	if (q->enqueue) {
2845		rc = __dev_xmit_skb(skb, q, dev, txq);
2846		goto out;
2847	}
2848
2849	/* The device has no queue. Common case for software devices:
2850	   loopback, all the sorts of tunnels...
2851
2852	   Really, it is unlikely that netif_tx_lock protection is necessary
2853	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2854	   counters.)
2855	   However, it is possible, that they rely on protection
2856	   made by us here.
2857
2858	   Check this and shot the lock. It is not prone from deadlocks.
2859	   Either shot noqueue qdisc, it is even simpler 8)
2860	 */
2861	if (dev->flags & IFF_UP) {
2862		int cpu = smp_processor_id(); /* ok because BHs are off */
2863
2864		if (txq->xmit_lock_owner != cpu) {
2865
2866			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2867				goto recursion_alert;
2868
2869			HARD_TX_LOCK(dev, txq, cpu);
2870
2871			if (!netif_xmit_stopped(txq)) {
2872				__this_cpu_inc(xmit_recursion);
2873				rc = dev_hard_start_xmit(skb, dev, txq);
2874				__this_cpu_dec(xmit_recursion);
2875				if (dev_xmit_complete(rc)) {
2876					HARD_TX_UNLOCK(dev, txq);
2877					goto out;
2878				}
2879			}
2880			HARD_TX_UNLOCK(dev, txq);
2881			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2882					     dev->name);
2883		} else {
2884			/* Recursion is detected! It is possible,
2885			 * unfortunately
2886			 */
2887recursion_alert:
2888			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2889					     dev->name);
2890		}
2891	}
2892
2893	rc = -ENETDOWN;
2894	rcu_read_unlock_bh();
2895
2896	atomic_long_inc(&dev->tx_dropped);
2897	kfree_skb(skb);
2898	return rc;
2899out:
2900	rcu_read_unlock_bh();
2901	return rc;
2902}
2903
2904int dev_queue_xmit(struct sk_buff *skb)
2905{
2906	return __dev_queue_xmit(skb, NULL);
2907}
2908EXPORT_SYMBOL(dev_queue_xmit);
2909
/*
 * Queue @skb for transmission, handing @accel_priv (private data used for
 * L2 forwarding offload) through to __dev_queue_xmit().  Returns its
 * result unchanged.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
2915
2916
2917/*=======================================================================
2918			Receiver routines
2919  =======================================================================*/
2920
/*
 * Limit on the length of a per-CPU input_pkt_queue: enqueue_to_backlog()
 * only queues a packet while the current queue length is <= this value,
 * and skb_flow_limit() starts working once the queue is at least half of it.
 */
2921int netdev_max_backlog __read_mostly = 1000;
2922EXPORT_SYMBOL(netdev_max_backlog);
2923
/*
 * NOTE(review): the users of the three tunables below are not in this part
 * of the file; their exact meaning should be confirmed at the use sites.
 */
2924int netdev_tstamp_prequeue __read_mostly = 1;
2925int netdev_budget __read_mostly = 300;
2926int weight_p __read_mostly = 64;            /* old backlog weight */
2927
2928/* Called with irq disabled */
2929static inline void ____napi_schedule(struct softnet_data *sd,
2930				     struct napi_struct *napi)
2931{
2932	list_add_tail(&napi->poll_list, &sd->poll_list);
2933	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2934}
2935
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2936#ifdef CONFIG_RPS
2937
2938/* One global table that all flow-based protocols share. */
/* Read under RCU by get_rps_cpu() to find the CPU desired for a flow hash. */
2939struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2940EXPORT_SYMBOL(rps_sock_flow_table);
2941
/*
 * NOTE(review): not referenced in this part of the file; presumably a
 * static key enabled while RPS is configured - confirm at the use sites.
 */
2942struct static_key rps_needed __read_mostly;
2943
2944static struct rps_dev_flow *
2945set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2946	    struct rps_dev_flow *rflow, u16 next_cpu)
2947{
2948	if (next_cpu != RPS_NO_CPU) {
2949#ifdef CONFIG_RFS_ACCEL
2950		struct netdev_rx_queue *rxqueue;
2951		struct rps_dev_flow_table *flow_table;
2952		struct rps_dev_flow *old_rflow;
2953		u32 flow_id;
2954		u16 rxq_index;
2955		int rc;
2956
2957		/* Should we steer this flow to a different hardware queue? */
2958		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2959		    !(dev->features & NETIF_F_NTUPLE))
2960			goto out;
2961		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2962		if (rxq_index == skb_get_rx_queue(skb))
2963			goto out;
2964
2965		rxqueue = dev->_rx + rxq_index;
2966		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2967		if (!flow_table)
2968			goto out;
2969		flow_id = skb_get_hash(skb) & flow_table->mask;
2970		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2971							rxq_index, flow_id);
2972		if (rc < 0)
2973			goto out;
2974		old_rflow = rflow;
2975		rflow = &flow_table->flows[flow_id];
2976		rflow->filter = rc;
2977		if (old_rflow->filter == rflow->filter)
2978			old_rflow->filter = RPS_NO_FILTER;
2979	out:
2980#endif
2981		rflow->last_qtail =
2982			per_cpu(softnet_data, next_cpu).input_queue_head;
2983	}
2984
2985	rflow->cpu = next_cpu;
2986	return rflow;
2987}
2988
2989/*
2990 * get_rps_cpu is called from netif_receive_skb and returns the target
2991 * CPU from the RPS map of the receiving queue for a given skb.
2992 * rcu_read_lock must be held on entry.
2993 */
2994static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2995		       struct rps_dev_flow **rflowp)
2996{
2997	struct netdev_rx_queue *rxqueue;
2998	struct rps_map *map;
2999	struct rps_dev_flow_table *flow_table;
3000	struct rps_sock_flow_table *sock_flow_table;
3001	int cpu = -1;
3002	u16 tcpu;
3003	u32 hash;
3004
3005	if (skb_rx_queue_recorded(skb)) {
3006		u16 index = skb_get_rx_queue(skb);
3007		if (unlikely(index >= dev->real_num_rx_queues)) {
3008			WARN_ONCE(dev->real_num_rx_queues > 1,
3009				  "%s received packet on queue %u, but number "
3010				  "of RX queues is %u\n",
3011				  dev->name, index, dev->real_num_rx_queues);
3012			goto done;
3013		}
3014		rxqueue = dev->_rx + index;
3015	} else
3016		rxqueue = dev->_rx;
3017
3018	map = rcu_dereference(rxqueue->rps_map);
3019	if (map) {
3020		if (map->len == 1 &&
3021		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
3022			tcpu = map->cpus[0];
3023			if (cpu_online(tcpu))
3024				cpu = tcpu;
3025			goto done;
3026		}
3027	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3028		goto done;
3029	}
3030
3031	skb_reset_network_header(skb);
3032	hash = skb_get_hash(skb);
3033	if (!hash)
3034		goto done;
3035
3036	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3037	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3038	if (flow_table && sock_flow_table) {
3039		u16 next_cpu;
3040		struct rps_dev_flow *rflow;
3041
3042		rflow = &flow_table->flows[hash & flow_table->mask];
3043		tcpu = rflow->cpu;
3044
3045		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
 
3046
3047		/*
3048		 * If the desired CPU (where last recvmsg was done) is
3049		 * different from current CPU (one in the rx-queue flow
3050		 * table entry), switch if one of the following holds:
3051		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3052		 *   - Current CPU is offline.
3053		 *   - The current CPU's queue tail has advanced beyond the
3054		 *     last packet that was enqueued using this table entry.
3055		 *     This guarantees that all previous packets for the flow
3056		 *     have been dequeued, thus preserving in order delivery.
3057		 */
3058		if (unlikely(tcpu != next_cpu) &&
3059		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3060		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3061		      rflow->last_qtail)) >= 0)) {
3062			tcpu = next_cpu;
3063			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3064		}
3065
3066		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3067			*rflowp = rflow;
3068			cpu = tcpu;
3069			goto done;
3070		}
3071	}
3072
3073	if (map) {
3074		tcpu = map->cpus[((u64) hash * map->len) >> 32];
3075
3076		if (cpu_online(tcpu)) {
3077			cpu = tcpu;
3078			goto done;
3079		}
3080	}
3081
3082done:
3083	return cpu;
3084}
3085
3086#ifdef CONFIG_RFS_ACCEL
3087
3088/**
3089 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3090 * @dev: Device on which the filter was set
3091 * @rxq_index: RX queue index
3092 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3093 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3094 *
3095 * Drivers that implement ndo_rx_flow_steer() should periodically call
3096 * this function for each installed filter and remove the filters for
3097 * which it returns %true.
3098 */
3099bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3100			 u32 flow_id, u16 filter_id)
3101{
3102	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3103	struct rps_dev_flow_table *flow_table;
3104	struct rps_dev_flow *rflow;
3105	bool expire = true;
3106	int cpu;
3107
3108	rcu_read_lock();
3109	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3110	if (flow_table && flow_id <= flow_table->mask) {
3111		rflow = &flow_table->flows[flow_id];
3112		cpu = ACCESS_ONCE(rflow->cpu);
3113		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3114		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3115			   rflow->last_qtail) <
3116		     (int)(10 * flow_table->mask)))
3117			expire = false;
3118	}
3119	rcu_read_unlock();
3120	return expire;
3121}
3122EXPORT_SYMBOL(rps_may_expire_flow);
3123
3124#endif /* CONFIG_RFS_ACCEL */
3125
3126/* Called from hardirq (IPI) context */
3127static void rps_trigger_softirq(void *data)
3128{
3129	struct softnet_data *sd = data;
3130
3131	____napi_schedule(sd, &sd->backlog);
3132	sd->received_rps++;
3133}
3134
3135#endif /* CONFIG_RPS */
3136
/*
 * Check whether @sd belongs to another CPU.  If it does, put it on this
 * CPU's IPI list, raise NET_RX_SOFTIRQ and return 1.  Return 0 when @sd is
 * this CPU's own structure, and always without CONFIG_RPS.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push sd on the front of our pending-IPI list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3157
3158#ifdef CONFIG_NET_FLOW_LIMIT
/*
 * Defaults to 4096 (1 << 12).
 * NOTE(review): presumably the number of buckets used when a flow limit
 * table is allocated; the user is not in this part of the file - confirm.
 */
3159int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3160#endif
3161
3162static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3163{
3164#ifdef CONFIG_NET_FLOW_LIMIT
3165	struct sd_flow_limit *fl;
3166	struct softnet_data *sd;
3167	unsigned int old_flow, new_flow;
3168
3169	if (qlen < (netdev_max_backlog >> 1))
3170		return false;
3171
3172	sd = &__get_cpu_var(softnet_data);
3173
3174	rcu_read_lock();
3175	fl = rcu_dereference(sd->flow_limit);
3176	if (fl) {
3177		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3178		old_flow = fl->history[fl->history_head];
3179		fl->history[fl->history_head] = new_flow;
3180
3181		fl->history_head++;
3182		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3183
3184		if (likely(fl->buckets[old_flow]))
3185			fl->buckets[old_flow]--;
3186
3187		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3188			fl->count++;
3189			rcu_read_unlock();
3190			return true;
3191		}
3192	}
3193	rcu_read_unlock();
3194#endif
3195	return false;
3196}
3197
3198/*
3199 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3200 * queue (may be a remote CPU queue).
3201 */
3202static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3203			      unsigned int *qtail)
3204{
3205	struct softnet_data *sd;
3206	unsigned long flags;
3207	unsigned int qlen;
3208
3209	sd = &per_cpu(softnet_data, cpu);
3210
3211	local_irq_save(flags);
3212
3213	rps_lock(sd);
3214	qlen = skb_queue_len(&sd->input_pkt_queue);
3215	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3216		if (skb_queue_len(&sd->input_pkt_queue)) {
3217enqueue:
3218			__skb_queue_tail(&sd->input_pkt_queue, skb);
3219			input_queue_tail_incr_save(sd, qtail);
3220			rps_unlock(sd);
3221			local_irq_restore(flags);
3222			return NET_RX_SUCCESS;
3223		}
3224
3225		/* Schedule NAPI for backlog device
3226		 * We can use non atomic operation since we own the queue lock
3227		 */
3228		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3229			if (!rps_ipi_queued(sd))
3230				____napi_schedule(sd, &sd->backlog);
3231		}
3232		goto enqueue;
3233	}
3234
3235	sd->dropped++;
3236	rps_unlock(sd);
3237
3238	local_irq_restore(flags);
3239
3240	atomic_long_inc(&skb->dev->rx_dropped);
3241	kfree_skb(skb);
3242	return NET_RX_DROP;
3243}
3244
3245static int netif_rx_internal(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3246{
3247	int ret;
3248
 
 
 
 
3249	net_timestamp_check(netdev_tstamp_prequeue, skb);
3250
3251	trace_netif_rx(skb);
3252#ifdef CONFIG_RPS
3253	if (static_key_false(&rps_needed)) {
3254		struct rps_dev_flow voidflow, *rflow = &voidflow;
3255		int cpu;
3256
3257		preempt_disable();
3258		rcu_read_lock();
3259
3260		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3261		if (cpu < 0)
3262			cpu = smp_processor_id();
3263
3264		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3265
3266		rcu_read_unlock();
3267		preempt_enable();
3268	} else
3269#endif
3270	{
3271		unsigned int qtail;
3272		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3273		put_cpu();
3274	}
3275	return ret;
3276}
3277
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet from a device driver and queues it on a per-cpu
 *	backlog so the upper (protocol) levels can process it later.
 *	The buffer may still be dropped, either here for congestion
 *	control or later by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);
	ret = netif_rx_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_rx);
3300
/*
 * Same as netif_rx(), but additionally runs pending softirqs before
 * returning. Preemption is disabled around the enqueue and the softirq
 * run. Returns the result of the enqueue.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3316
3317static void net_tx_action(struct softirq_action *h)
3318{
3319	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3320
3321	if (sd->completion_queue) {
3322		struct sk_buff *clist;
3323
3324		local_irq_disable();
3325		clist = sd->completion_queue;
3326		sd->completion_queue = NULL;
3327		local_irq_enable();
3328
3329		while (clist) {
3330			struct sk_buff *skb = clist;
3331			clist = clist->next;
3332
3333			WARN_ON(atomic_read(&skb->users));
3334			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3335				trace_consume_skb(skb);
3336			else
3337				trace_kfree_skb(skb, net_tx_action);
3338			__kfree_skb(skb);
3339		}
3340	}
3341
3342	if (sd->output_queue) {
3343		struct Qdisc *head;
3344
3345		local_irq_disable();
3346		head = sd->output_queue;
3347		sd->output_queue = NULL;
3348		sd->output_queue_tailp = &sd->output_queue;
3349		local_irq_enable();
3350
3351		while (head) {
3352			struct Qdisc *q = head;
3353			spinlock_t *root_lock;
3354
3355			head = head->next_sched;
3356
3357			root_lock = qdisc_lock(q);
3358			if (spin_trylock(root_lock)) {
3359				smp_mb__before_clear_bit();
3360				clear_bit(__QDISC_STATE_SCHED,
3361					  &q->state);
3362				qdisc_run(q);
3363				spin_unlock(root_lock);
3364			} else {
3365				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3366					      &q->state)) {
3367					__netif_reschedule(q);
3368				} else {
3369					smp_mb__before_clear_bit();
3370					clear_bit(__QDISC_STATE_SCHED,
3371						  &q->state);
3372				}
3373			}
3374		}
3375	}
3376}
3377
3378#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3379    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3380/* This hook is defined here for ATM LANE */
3381int (*br_fdb_test_addr_hook)(struct net_device *dev,
3382			     unsigned char *addr) __read_mostly;
3383EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3384#endif
3385
3386#ifdef CONFIG_NET_CLS_ACT
3387/* TODO: Maybe we should just force sch_ingress to be compiled in
3388 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3389 * a compare and 2 stores extra right now if we dont have it on
3390 * but have CONFIG_NET_CLS_ACT
3391 * NOTE: This doesn't stop any functionality; if you dont have
3392 * the ingress scheduler, you just can't add policies on ingress.
3393 *
3394 */
3395static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3396{
3397	struct net_device *dev = skb->dev;
3398	u32 ttl = G_TC_RTTL(skb->tc_verd);
3399	int result = TC_ACT_OK;
3400	struct Qdisc *q;
3401
3402	if (unlikely(MAX_RED_LOOP < ttl++)) {
3403		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3404				     skb->skb_iif, dev->ifindex);
3405		return TC_ACT_SHOT;
3406	}
3407
3408	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3409	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3410
3411	q = rxq->qdisc;
3412	if (q != &noop_qdisc) {
3413		spin_lock(qdisc_lock(q));
3414		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3415			result = qdisc_enqueue_root(skb, q);
3416		spin_unlock(qdisc_lock(q));
3417	}
3418
3419	return result;
3420}
3421
3422static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3423					 struct packet_type **pt_prev,
3424					 int *ret, struct net_device *orig_dev)
3425{
3426	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3427
3428	if (!rxq || rxq->qdisc == &noop_qdisc)
3429		goto out;
3430
3431	if (*pt_prev) {
3432		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3433		*pt_prev = NULL;
3434	}
3435
3436	switch (ing_filter(skb, rxq)) {
3437	case TC_ACT_SHOT:
3438	case TC_ACT_STOLEN:
3439		kfree_skb(skb);
3440		return NULL;
3441	}
3442
3443out:
3444	skb->tc_verd = 0;
3445	return skb;
3446}
3447#endif
3448
3449/**
3450 *	netdev_rx_handler_register - register receive handler
3451 *	@dev: device to register a handler for
3452 *	@rx_handler: receive handler to register
3453 *	@rx_handler_data: data pointer that is used by rx handler
3454 *
3455 *	Register a receive handler for a device. This handler will then be
3456 *	called from __netif_receive_skb. A negative errno code is returned
3457 *	on a failure.
3458 *
3459 *	The caller must hold the rtnl_mutex.
3460 *
3461 *	For a general description of rx_handler, see enum rx_handler_result.
3462 */
3463int netdev_rx_handler_register(struct net_device *dev,
3464			       rx_handler_func_t *rx_handler,
3465			       void *rx_handler_data)
3466{
3467	ASSERT_RTNL();
3468
3469	if (dev->rx_handler)
3470		return -EBUSY;
3471
3472	/* Note: rx_handler_data must be set before rx_handler */
3473	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3474	rcu_assign_pointer(dev->rx_handler, rx_handler);
3475
3476	return 0;
3477}
3478EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3479
3480/**
3481 *	netdev_rx_handler_unregister - unregister receive handler
3482 *	@dev: device to unregister a handler from
3483 *
3484 *	Unregister a receive handler from a device.
3485 *
3486 *	The caller must hold the rtnl_mutex.
3487 */
3488void netdev_rx_handler_unregister(struct net_device *dev)
3489{
3490
3491	ASSERT_RTNL();
3492	RCU_INIT_POINTER(dev->rx_handler, NULL);
3493	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3494	 * section has a guarantee to see a non NULL rx_handler_data
3495	 * as well.
3496	 */
3497	synchronize_net();
3498	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3499}
3500EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3501
3502/*
3503 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3504 * the special handling of PFMEMALLOC skbs.
3505 */
3506static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3507{
3508	switch (skb->protocol) {
3509	case htons(ETH_P_ARP):
3510	case htons(ETH_P_IP):
3511	case htons(ETH_P_IPV6):
3512	case htons(ETH_P_8021Q):
3513	case htons(ETH_P_8021AD):
3514		return true;
3515	default:
3516		return false;
3517	}
3518}
3519
3520static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3521{
3522	struct packet_type *ptype, *pt_prev;
3523	rx_handler_func_t *rx_handler;
3524	struct net_device *orig_dev;
3525	struct net_device *null_or_dev;
3526	bool deliver_exact = false;
3527	int ret = NET_RX_DROP;
3528	__be16 type;
3529
3530	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3531
3532	trace_netif_receive_skb(skb);
3533
 
 
 
 
 
 
3534	orig_dev = skb->dev;
3535
3536	skb_reset_network_header(skb);
3537	if (!skb_transport_header_was_set(skb))
3538		skb_reset_transport_header(skb);
3539	skb_reset_mac_len(skb);
3540
3541	pt_prev = NULL;
3542
3543	rcu_read_lock();
3544
3545another_round:
3546	skb->skb_iif = skb->dev->ifindex;
3547
3548	__this_cpu_inc(softnet_data.processed);
3549
3550	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3551	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3552		skb = vlan_untag(skb);
3553		if (unlikely(!skb))
3554			goto unlock;
3555	}
3556
3557#ifdef CONFIG_NET_CLS_ACT
3558	if (skb->tc_verd & TC_NCLS) {
3559		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3560		goto ncls;
3561	}
3562#endif
3563
3564	if (pfmemalloc)
3565		goto skip_taps;
3566
3567	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3568		if (!ptype->dev || ptype->dev == skb->dev) {
3569			if (pt_prev)
3570				ret = deliver_skb(skb, pt_prev, orig_dev);
3571			pt_prev = ptype;
3572		}
3573	}
3574
3575skip_taps:
3576#ifdef CONFIG_NET_CLS_ACT
3577	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3578	if (!skb)
3579		goto unlock;
3580ncls:
3581#endif
3582
3583	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3584		goto drop;
3585
3586	if (vlan_tx_tag_present(skb)) {
3587		if (pt_prev) {
3588			ret = deliver_skb(skb, pt_prev, orig_dev);
3589			pt_prev = NULL;
3590		}
3591		if (vlan_do_receive(&skb))
3592			goto another_round;
3593		else if (unlikely(!skb))
3594			goto unlock;
3595	}
3596
3597	rx_handler = rcu_dereference(skb->dev->rx_handler);
3598	if (rx_handler) {
3599		if (pt_prev) {
3600			ret = deliver_skb(skb, pt_prev, orig_dev);
3601			pt_prev = NULL;
3602		}
3603		switch (rx_handler(&skb)) {
3604		case RX_HANDLER_CONSUMED:
3605			ret = NET_RX_SUCCESS;
3606			goto unlock;
3607		case RX_HANDLER_ANOTHER:
3608			goto another_round;
3609		case RX_HANDLER_EXACT:
3610			deliver_exact = true;
3611		case RX_HANDLER_PASS:
3612			break;
3613		default:
3614			BUG();
3615		}
3616	}
3617
3618	if (unlikely(vlan_tx_tag_present(skb))) {
3619		if (vlan_tx_tag_get_id(skb))
3620			skb->pkt_type = PACKET_OTHERHOST;
3621		/* Note: we might in the future use prio bits
3622		 * and set skb->priority like in vlan_do_receive()
3623		 * For the time being, just ignore Priority Code Point
3624		 */
3625		skb->vlan_tci = 0;
3626	}
3627
3628	/* deliver only exact match when indicated */
3629	null_or_dev = deliver_exact ? skb->dev : NULL;
3630
3631	type = skb->protocol;
3632	list_for_each_entry_rcu(ptype,
3633			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3634		if (ptype->type == type &&
3635		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3636		     ptype->dev == orig_dev)) {
3637			if (pt_prev)
3638				ret = deliver_skb(skb, pt_prev, orig_dev);
3639			pt_prev = ptype;
3640		}
3641	}
3642
3643	if (pt_prev) {
3644		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3645			goto drop;
3646		else
3647			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3648	} else {
3649drop:
3650		atomic_long_inc(&skb->dev->rx_dropped);
3651		kfree_skb(skb);
3652		/* Jamal, now you will not able to escape explaining
3653		 * me how you were going to use this. :-)
3654		 */
3655		ret = NET_RX_DROP;
3656	}
3657
3658unlock:
3659	rcu_read_unlock();
3660	return ret;
3661}
3662
3663static int __netif_receive_skb(struct sk_buff *skb)
3664{
3665	int ret;
3666
3667	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3668		unsigned long pflags = current->flags;
3669
3670		/*
3671		 * PFMEMALLOC skbs are special, they should
3672		 * - be delivered to SOCK_MEMALLOC sockets only
3673		 * - stay away from userspace
3674		 * - have bounded memory usage
3675		 *
3676		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3677		 * context down to all allocation sites.
3678		 */
3679		current->flags |= PF_MEMALLOC;
3680		ret = __netif_receive_skb_core(skb, true);
3681		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3682	} else
3683		ret = __netif_receive_skb_core(skb, false);
3684
3685	return ret;
3686}
3687
3688static int netif_receive_skb_internal(struct sk_buff *skb)
3689{
3690	net_timestamp_check(netdev_tstamp_prequeue, skb);
3691
3692	if (skb_defer_rx_timestamp(skb))
3693		return NET_RX_SUCCESS;
3694
3695#ifdef CONFIG_RPS
3696	if (static_key_false(&rps_needed)) {
3697		struct rps_dev_flow voidflow, *rflow = &voidflow;
3698		int cpu, ret;
3699
3700		rcu_read_lock();
3701
3702		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3703
3704		if (cpu >= 0) {
3705			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3706			rcu_read_unlock();
3707			return ret;
3708		}
3709		rcu_read_unlock();
3710	}
3711#endif
3712	return __netif_receive_skb(skb);
3713}
3714
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	The buffer may be dropped during processing for congestion
 *	control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);
	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
3737
3738/* Network device is going away, flush any packets still pending
3739 * Called with irqs disabled.
3740 */
3741static void flush_backlog(void *arg)
3742{
3743	struct net_device *dev = arg;
3744	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3745	struct sk_buff *skb, *tmp;
3746
3747	rps_lock(sd);
3748	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3749		if (skb->dev == dev) {
3750			__skb_unlink(skb, &sd->input_pkt_queue);
3751			kfree_skb(skb);
3752			input_queue_head_incr(sd);
3753		}
3754	}
3755	rps_unlock(sd);
3756
3757	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3758		if (skb->dev == dev) {
3759			__skb_unlink(skb, &sd->process_queue);
3760			kfree_skb(skb);
3761			input_queue_head_incr(sd);
3762		}
3763	}
3764}
3765
3766static int napi_gro_complete(struct sk_buff *skb)
3767{
3768	struct packet_offload *ptype;
3769	__be16 type = skb->protocol;
3770	struct list_head *head = &offload_base;
3771	int err = -ENOENT;
3772
3773	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3774
3775	if (NAPI_GRO_CB(skb)->count == 1) {
3776		skb_shinfo(skb)->gso_size = 0;
3777		goto out;
3778	}
3779
3780	rcu_read_lock();
3781	list_for_each_entry_rcu(ptype, head, list) {
3782		if (ptype->type != type || !ptype->callbacks.gro_complete)
3783			continue;
3784
3785		err = ptype->callbacks.gro_complete(skb, 0);
3786		break;
3787	}
3788	rcu_read_unlock();
3789
3790	if (err) {
3791		WARN_ON(&ptype->list == head);
3792		kfree_skb(skb);
3793		return NET_RX_SUCCESS;
3794	}
3795
3796out:
3797	return netif_receive_skb_internal(skb);
3798}
3799
3800/* napi->gro_list contains packets ordered by age.
3801 * youngest packets at the head of it.
3802 * Complete skbs in reverse order to reduce latencies.
3803 */
3804void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3805{
3806	struct sk_buff *skb, *prev = NULL;
3807
3808	/* scan list and build reverse chain */
3809	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3810		skb->prev = prev;
3811		prev = skb;
3812	}
3813
3814	for (skb = prev; skb; skb = prev) {
 
3815		skb->next = NULL;
3816
3817		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3818			return;
3819
3820		prev = skb->prev;
3821		napi_gro_complete(skb);
3822		napi->gro_count--;
3823	}
3824
 
3825	napi->gro_list = NULL;
3826}
3827EXPORT_SYMBOL(napi_gro_flush);
3828
3829static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3830{
3831	struct sk_buff *p;
3832	unsigned int maclen = skb->dev->hard_header_len;
3833	u32 hash = skb_get_hash_raw(skb);
3834
3835	for (p = napi->gro_list; p; p = p->next) {
3836		unsigned long diffs;
3837
3838		NAPI_GRO_CB(p)->flush = 0;
3839
3840		if (hash != skb_get_hash_raw(p)) {
3841			NAPI_GRO_CB(p)->same_flow = 0;
3842			continue;
3843		}
3844
3845		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3846		diffs |= p->vlan_tci ^ skb->vlan_tci;
3847		if (maclen == ETH_HLEN)
3848			diffs |= compare_ether_header(skb_mac_header(p),
3849						      skb_mac_header(skb));
3850		else if (!diffs)
3851			diffs = memcmp(skb_mac_header(p),
3852				       skb_mac_header(skb),
3853				       maclen);
3854		NAPI_GRO_CB(p)->same_flow = !diffs;
3855	}
3856}
3857
3858static void skb_gro_reset_offset(struct sk_buff *skb)
3859{
3860	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3861	const skb_frag_t *frag0 = &pinfo->frags[0];
3862
3863	NAPI_GRO_CB(skb)->data_offset = 0;
3864	NAPI_GRO_CB(skb)->frag0 = NULL;
3865	NAPI_GRO_CB(skb)->frag0_len = 0;
3866
3867	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3868	    pinfo->nr_frags &&
3869	    !PageHighMem(skb_frag_page(frag0))) {
3870		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3871		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3872	}
3873}
3874
3875static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3876{
3877	struct skb_shared_info *pinfo = skb_shinfo(skb);
3878
3879	BUG_ON(skb->end - skb->tail < grow);
3880
3881	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3882
3883	skb->data_len -= grow;
3884	skb->tail += grow;
3885
3886	pinfo->frags[0].page_offset += grow;
3887	skb_frag_size_sub(&pinfo->frags[0], grow);
3888
3889	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3890		skb_frag_unref(skb, 0);
3891		memmove(pinfo->frags, pinfo->frags + 1,
3892			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
3893	}
3894}
3895
3896static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3897{
3898	struct sk_buff **pp = NULL;
3899	struct packet_offload *ptype;
3900	__be16 type = skb->protocol;
3901	struct list_head *head = &offload_base;
3902	int same_flow;
 
3903	enum gro_result ret;
3904	int grow;
3905
3906	if (!(skb->dev->features & NETIF_F_GRO))
3907		goto normal;
3908
3909	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3910		goto normal;
3911
3912	gro_list_prepare(napi, skb);
3913	NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3914
3915	rcu_read_lock();
3916	list_for_each_entry_rcu(ptype, head, list) {
3917		if (ptype->type != type || !ptype->callbacks.gro_receive)
3918			continue;
3919
3920		skb_set_network_header(skb, skb_gro_offset(skb));
3921		skb_reset_mac_len(skb);
 
3922		NAPI_GRO_CB(skb)->same_flow = 0;
3923		NAPI_GRO_CB(skb)->flush = 0;
3924		NAPI_GRO_CB(skb)->free = 0;
3925		NAPI_GRO_CB(skb)->udp_mark = 0;
3926
3927		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3928		break;
3929	}
3930	rcu_read_unlock();
3931
3932	if (&ptype->list == head)
3933		goto normal;
3934
3935	same_flow = NAPI_GRO_CB(skb)->same_flow;
3936	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3937
3938	if (pp) {
3939		struct sk_buff *nskb = *pp;
3940
3941		*pp = nskb->next;
3942		nskb->next = NULL;
3943		napi_gro_complete(nskb);
3944		napi->gro_count--;
3945	}
3946
3947	if (same_flow)
3948		goto ok;
3949
3950	if (NAPI_GRO_CB(skb)->flush)
3951		goto normal;
3952
3953	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
3954		struct sk_buff *nskb = napi->gro_list;
3955
3956		/* locate the end of the list to select the 'oldest' flow */
3957		while (nskb->next) {
3958			pp = &nskb->next;
3959			nskb = *pp;
3960		}
3961		*pp = NULL;
3962		nskb->next = NULL;
3963		napi_gro_complete(nskb);
3964	} else {
3965		napi->gro_count++;
3966	}
3967	NAPI_GRO_CB(skb)->count = 1;
3968	NAPI_GRO_CB(skb)->age = jiffies;
3969	NAPI_GRO_CB(skb)->last = skb;
3970	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3971	skb->next = napi->gro_list;
3972	napi->gro_list = skb;
3973	ret = GRO_HELD;
3974
3975pull:
3976	grow = skb_gro_offset(skb) - skb_headlen(skb);
3977	if (grow > 0)
3978		gro_pull_from_frag0(skb, grow);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3979ok:
3980	return ret;
3981
3982normal:
3983	ret = GRO_NORMAL;
3984	goto pull;
3985}
 
3986
3987struct packet_offload *gro_find_receive_by_type(__be16 type)
 
3988{
3989	struct list_head *offload_head = &offload_base;
3990	struct packet_offload *ptype;
3991
3992	list_for_each_entry_rcu(ptype, offload_head, list) {
3993		if (ptype->type != type || !ptype->callbacks.gro_receive)
3994			continue;
3995		return ptype;
3996	}
3997	return NULL;
3998}
3999EXPORT_SYMBOL(gro_find_receive_by_type);
4000
4001struct packet_offload *gro_find_complete_by_type(__be16 type)
4002{
4003	struct list_head *offload_head = &offload_base;
4004	struct packet_offload *ptype;
4005
4006	list_for_each_entry_rcu(ptype, offload_head, list) {
4007		if (ptype->type != type || !ptype->callbacks.gro_complete)
4008			continue;
4009		return ptype;
 
 
 
 
 
 
 
4010	}
4011	return NULL;
 
4012}
4013EXPORT_SYMBOL(gro_find_complete_by_type);
4014
4015static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4016{
4017	switch (ret) {
4018	case GRO_NORMAL:
4019		if (netif_receive_skb_internal(skb))
4020			ret = GRO_DROP;
4021		break;
4022
4023	case GRO_DROP:
4024		kfree_skb(skb);
4025		break;
4026
4027	case GRO_MERGED_FREE:
4028		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4029			kmem_cache_free(skbuff_head_cache, skb);
4030		else
4031			__kfree_skb(skb);
4032		break;
4033
4034	case GRO_HELD:
4035	case GRO_MERGED:
4036		break;
4037	}
4038
4039	return ret;
4040}
 
4041
4042gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4043{
4044	trace_napi_gro_receive_entry(skb);
 
 
 
 
 
 
 
 
 
 
 
4045
 
 
4046	skb_gro_reset_offset(skb);
4047
4048	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4049}
4050EXPORT_SYMBOL(napi_gro_receive);
4051
4052static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4053{
4054	__skb_pull(skb, skb_headlen(skb));
4055	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4056	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4057	skb->vlan_tci = 0;
4058	skb->dev = napi->dev;
4059	skb->skb_iif = 0;
4060	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4061
4062	napi->skb = skb;
4063}
4064
4065struct sk_buff *napi_get_frags(struct napi_struct *napi)
4066{
4067	struct sk_buff *skb = napi->skb;
4068
4069	if (!skb) {
4070		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4071		napi->skb = skb;
 
4072	}
4073	return skb;
4074}
4075EXPORT_SYMBOL(napi_get_frags);
4076
4077static gro_result_t napi_frags_finish(struct napi_struct *napi,
4078				      struct sk_buff *skb,
4079				      gro_result_t ret)
4080{
4081	switch (ret) {
4082	case GRO_NORMAL:
4083	case GRO_HELD:
4084		__skb_push(skb, ETH_HLEN);
4085		skb->protocol = eth_type_trans(skb, skb->dev);
4086		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
 
 
 
4087			ret = GRO_DROP;
4088		break;
4089
4090	case GRO_DROP:
4091	case GRO_MERGED_FREE:
4092		napi_reuse_skb(napi, skb);
4093		break;
4094
4095	case GRO_MERGED:
4096		break;
4097	}
4098
4099	return ret;
4100}
 
4101
4102/* Upper GRO stack assumes network header starts at gro_offset=0
4103 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4104 * We copy ethernet header into skb->data to have a common layout.
4105 */
4106static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4107{
4108	struct sk_buff *skb = napi->skb;
4109	const struct ethhdr *eth;
4110	unsigned int hlen = sizeof(*eth);
 
4111
4112	napi->skb = NULL;
4113
4114	skb_reset_mac_header(skb);
4115	skb_gro_reset_offset(skb);
4116
4117	eth = skb_gro_header_fast(skb, 0);
4118	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4119		eth = skb_gro_header_slow(skb, hlen, 0);
 
 
4120		if (unlikely(!eth)) {
4121			napi_reuse_skb(napi, skb);
4122			return NULL;
 
4123		}
4124	} else {
4125		gro_pull_from_frag0(skb, hlen);
4126		NAPI_GRO_CB(skb)->frag0 += hlen;
4127		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4128	}
4129	__skb_pull(skb, hlen);
 
4130
4131	/*
4132	 * This works because the only protocols we care about don't require
4133	 * special handling.
4134	 * We'll fix it up properly in napi_frags_finish()
4135	 */
4136	skb->protocol = eth->h_proto;
4137
 
4138	return skb;
4139}
4140
4141gro_result_t napi_gro_frags(struct napi_struct *napi)
4142{
4143	struct sk_buff *skb = napi_frags_skb(napi);
4144
4145	if (!skb)
4146		return GRO_DROP;
4147
4148	trace_napi_gro_frags_entry(skb);
4149
4150	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4151}
4152EXPORT_SYMBOL(napi_gro_frags);
4153
/*
 * net_rps_action_and_irq_enable - send the IPIs pending for RPS.
 * Note: must be called with local irqs disabled, returns with local
 * irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	/* Detach the list while irqs are still off. */
	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (remsd) {
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu,
						       &remsd->csd);
		remsd = next;
	}
#else
	local_irq_enable();
#endif
}
4181
4182static int process_backlog(struct napi_struct *napi, int quota)
4183{
4184	int work = 0;
4185	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4186
4187#ifdef CONFIG_RPS
4188	/* Check if we have pending ipi, its better to send them now,
4189	 * not waiting net_rx_action() end.
4190	 */
4191	if (sd->rps_ipi_list) {
4192		local_irq_disable();
4193		net_rps_action_and_irq_enable(sd);
4194	}
4195#endif
4196	napi->weight = weight_p;
4197	local_irq_disable();
4198	while (work < quota) {
4199		struct sk_buff *skb;
4200		unsigned int qlen;
4201
4202		while ((skb = __skb_dequeue(&sd->process_queue))) {
4203			local_irq_enable();
4204			__netif_receive_skb(skb);
4205			local_irq_disable();
4206			input_queue_head_incr(sd);
4207			if (++work >= quota) {
4208				local_irq_enable();
4209				return work;
4210			}
4211		}
4212
4213		rps_lock(sd);
4214		qlen = skb_queue_len(&sd->input_pkt_queue);
4215		if (qlen)
4216			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4217						   &sd->process_queue);
4218
4219		if (qlen < quota - work) {
4220			/*
4221			 * Inline a custom version of __napi_complete().
4222			 * only current cpu owns and manipulates this napi,
4223			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4224			 * we can use a plain write instead of clear_bit(),
4225			 * and we dont need an smp_mb() memory barrier.
4226			 */
4227			list_del(&napi->poll_list);
4228			napi->state = 0;
4229
4230			quota = work + qlen;
4231		}
4232		rps_unlock(sd);
4233	}
4234	local_irq_enable();
4235
4236	return work;
4237}
4238
4239/**
4240 * __napi_schedule - schedule for receive
4241 * @n: entry to schedule
4242 *
4243 * The entry's receive function will be scheduled to run
4244 */
4245void __napi_schedule(struct napi_struct *n)
4246{
4247	unsigned long flags;
4248
4249	local_irq_save(flags);
4250	____napi_schedule(&__get_cpu_var(softnet_data), n);
4251	local_irq_restore(flags);
4252}
4253EXPORT_SYMBOL(__napi_schedule);
4254
4255void __napi_complete(struct napi_struct *n)
4256{
4257	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4258	BUG_ON(n->gro_list);
4259
4260	list_del(&n->poll_list);
4261	smp_mb__before_clear_bit();
4262	clear_bit(NAPI_STATE_SCHED, &n->state);
4263}
4264EXPORT_SYMBOL(__napi_complete);
4265
4266void napi_complete(struct napi_struct *n)
4267{
4268	unsigned long flags;
4269
4270	/*
4271	 * don't let napi dequeue from the cpu poll list
4272	 * just in case its running on a different cpu
4273	 */
4274	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4275		return;
4276
4277	napi_gro_flush(n, false);
4278	local_irq_save(flags);
4279	__napi_complete(n);
4280	local_irq_restore(flags);
4281}
4282EXPORT_SYMBOL(napi_complete);
4283
4284/* must be called under rcu_read_lock(), as we dont take a reference */
4285struct napi_struct *napi_by_id(unsigned int napi_id)
4286{
4287	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4288	struct napi_struct *napi;
4289
4290	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4291		if (napi->napi_id == napi_id)
4292			return napi;
4293
4294	return NULL;
4295}
4296EXPORT_SYMBOL_GPL(napi_by_id);
4297
4298void napi_hash_add(struct napi_struct *napi)
4299{
4300	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4301
4302		spin_lock(&napi_hash_lock);
4303
4304		/* 0 is not a valid id, we also skip an id that is taken
4305		 * we expect both events to be extremely rare
4306		 */
4307		napi->napi_id = 0;
4308		while (!napi->napi_id) {
4309			napi->napi_id = ++napi_gen_id;
4310			if (napi_by_id(napi->napi_id))
4311				napi->napi_id = 0;
4312		}
4313
4314		hlist_add_head_rcu(&napi->napi_hash_node,
4315			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4316
4317		spin_unlock(&napi_hash_lock);
4318	}
4319}
4320EXPORT_SYMBOL_GPL(napi_hash_add);
4321
4322/* Warning : caller is responsible to make sure rcu grace period
4323 * is respected before freeing memory containing @napi
4324 */
4325void napi_hash_del(struct napi_struct *napi)
4326{
4327	spin_lock(&napi_hash_lock);
4328
4329	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4330		hlist_del_rcu(&napi->napi_hash_node);
4331
4332	spin_unlock(&napi_hash_lock);
4333}
4334EXPORT_SYMBOL_GPL(napi_hash_del);
4335
4336void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4337		    int (*poll)(struct napi_struct *, int), int weight)
4338{
4339	INIT_LIST_HEAD(&napi->poll_list);
4340	napi->gro_count = 0;
4341	napi->gro_list = NULL;
4342	napi->skb = NULL;
4343	napi->poll = poll;
4344	if (weight > NAPI_POLL_WEIGHT)
4345		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4346			    weight, dev->name);
4347	napi->weight = weight;
4348	list_add(&napi->dev_list, &dev->napi_list);
4349	napi->dev = dev;
4350#ifdef CONFIG_NETPOLL
4351	spin_lock_init(&napi->poll_lock);
4352	napi->poll_owner = -1;
4353#endif
4354	set_bit(NAPI_STATE_SCHED, &napi->state);
4355}
4356EXPORT_SYMBOL(netif_napi_add);
4357
4358void netif_napi_del(struct napi_struct *napi)
4359{
 
 
4360	list_del_init(&napi->dev_list);
4361	napi_free_frags(napi);
4362
4363	kfree_skb_list(napi->gro_list);
 
 
 
 
 
4364	napi->gro_list = NULL;
4365	napi->gro_count = 0;
4366}
4367EXPORT_SYMBOL(netif_napi_del);
4368
4369static void net_rx_action(struct softirq_action *h)
4370{
4371	struct softnet_data *sd = &__get_cpu_var(softnet_data);
4372	unsigned long time_limit = jiffies + 2;
4373	int budget = netdev_budget;
4374	void *have;
4375
4376	local_irq_disable();
4377
4378	while (!list_empty(&sd->poll_list)) {
4379		struct napi_struct *n;
4380		int work, weight;
4381
4382		/* If softirq window is exhuasted then punt.
4383		 * Allow this to run for 2 jiffies since which will allow
4384		 * an average latency of 1.5/HZ.
4385		 */
4386		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4387			goto softnet_break;
4388
4389		local_irq_enable();
4390
4391		/* Even though interrupts have been re-enabled, this
4392		 * access is safe because interrupts can only add new
4393		 * entries to the tail of this list, and only ->poll()
4394		 * calls can remove this head entry from the list.
4395		 */
4396		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4397
4398		have = netpoll_poll_lock(n);
4399
4400		weight = n->weight;
4401
4402		/* This NAPI_STATE_SCHED test is for avoiding a race
4403		 * with netpoll's poll_napi().  Only the entity which
4404		 * obtains the lock and sees NAPI_STATE_SCHED set will
4405		 * actually make the ->poll() call.  Therefore we avoid
4406		 * accidentally calling ->poll() when NAPI is not scheduled.
4407		 */
4408		work = 0;
4409		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4410			work = n->poll(n, weight);
4411			trace_napi_poll(n);
4412		}
4413
4414		WARN_ON_ONCE(work > weight);
4415
4416		budget -= work;
4417
4418		local_irq_disable();
4419
4420		/* Drivers must not modify the NAPI state if they
4421		 * consume the entire weight.  In such cases this code
4422		 * still "owns" the NAPI instance and therefore can
4423		 * move the instance around on the list at-will.
4424		 */
4425		if (unlikely(work == weight)) {
4426			if (unlikely(napi_disable_pending(n))) {
4427				local_irq_enable();
4428				napi_complete(n);
4429				local_irq_disable();
4430			} else {
4431				if (n->gro_list) {
4432					/* flush too old packets
4433					 * If HZ < 1000, flush all packets.
4434					 */
4435					local_irq_enable();
4436					napi_gro_flush(n, HZ >= 1000);
4437					local_irq_disable();
4438				}
4439				list_move_tail(&n->poll_list, &sd->poll_list);
4440			}
4441		}
4442
4443		netpoll_poll_unlock(have);
4444	}
4445out:
4446	net_rps_action_and_irq_enable(sd);
4447
4448#ifdef CONFIG_NET_DMA
4449	/*
4450	 * There may not be any more sk_buffs coming right now, so push
4451	 * any pending DMA copies to hardware
4452	 */
4453	dma_issue_pending_all();
4454#endif
4455
4456	return;
4457
4458softnet_break:
4459	sd->time_squeeze++;
4460	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4461	goto out;
4462}
4463
/*
 * One entry in a device's upper/lower adjacency list.  Entries are
 * allocated in __netdev_adjacent_dev_insert() and released through
 * kfree_rcu() in __netdev_adjacent_dev_remove().
 */
struct netdev_adjacent {
	/* the adjacent device this entry points at; a reference is held on
	 * it (dev_hold() on insert, dev_put() on removal)
	 */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us;
	 * the entry is only freed when the last of them is removed
	 */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning device's adjacency list */
	struct list_head list;
	/* used by kfree_rcu() when the entry is released */
	struct rcu_head rcu;
};
4479
4480static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4481						 struct net_device *adj_dev,
4482						 struct list_head *adj_list)
4483{
4484	struct netdev_adjacent *adj;
4485
4486	list_for_each_entry(adj, adj_list, list) {
4487		if (adj->dev == adj_dev)
4488			return adj;
4489	}
4490	return NULL;
4491}
4492
4493/**
4494 * netdev_has_upper_dev - Check if device is linked to an upper device
4495 * @dev: device
4496 * @upper_dev: upper device to check
4497 *
4498 * Find out if a device is linked to specified upper device and return true
4499 * in case it is. Note that this checks only immediate upper device,
4500 * not through a complete stack of devices. The caller must hold the RTNL lock.
4501 */
4502bool netdev_has_upper_dev(struct net_device *dev,
4503			  struct net_device *upper_dev)
4504{
4505	ASSERT_RTNL();
4506
4507	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
 
4508}
4509EXPORT_SYMBOL(netdev_has_upper_dev);
4510
4511/**
4512 * netdev_has_any_upper_dev - Check if device is linked to some device
4513 * @dev: device
4514 *
4515 * Find out if a device is linked to an upper device and return true in case
4516 * it is. The caller must hold the RTNL lock.
4517 */
4518static bool netdev_has_any_upper_dev(struct net_device *dev)
4519{
4520	ASSERT_RTNL();
4521
4522	return !list_empty(&dev->all_adj_list.upper);
4523}
 
4524
4525/**
4526 * netdev_master_upper_dev_get - Get master upper device
4527 * @dev: device
4528 *
4529 * Find a master upper device and return pointer to it or NULL in case
4530 * it's not there. The caller must hold the RTNL lock.
4531 */
4532struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4533{
4534	struct netdev_adjacent *upper;
4535
4536	ASSERT_RTNL();
 
 
 
4537
4538	if (list_empty(&dev->adj_list.upper))
4539		return NULL;
 
4540
4541	upper = list_first_entry(&dev->adj_list.upper,
4542				 struct netdev_adjacent, list);
4543	if (likely(upper->master))
4544		return upper->dev;
4545	return NULL;
4546}
4547EXPORT_SYMBOL(netdev_master_upper_dev_get);
4548
4549void *netdev_adjacent_get_private(struct list_head *adj_list)
4550{
4551	struct netdev_adjacent *adj;
 
 
 
4552
4553	adj = list_entry(adj_list, struct netdev_adjacent, list);
 
4554
4555	return adj->private;
 
 
4556}
4557EXPORT_SYMBOL(netdev_adjacent_get_private);
4558
4559/**
4560 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4561 * @dev: device
4562 * @iter: list_head ** of the current position
4563 *
4564 * Gets the next device from the dev's upper list, starting from iter
4565 * position. The caller must hold RCU read lock.
4566 */
4567struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4568						 struct list_head **iter)
4569{
4570	struct netdev_adjacent *upper;
4571
4572	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4573
4574	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4575
4576	if (&upper->list == &dev->adj_list.upper)
4577		return NULL;
4578
4579	*iter = &upper->list;
 
 
 
 
 
 
 
4580
4581	return upper->dev;
4582}
4583EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4584
4585/**
4586 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4587 * @dev: device
4588 * @iter: list_head ** of the current position
4589 *
4590 * Gets the next device from the dev's upper list, starting from iter
4591 * position. The caller must hold RCU read lock.
4592 */
4593struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4594						     struct list_head **iter)
4595{
4596	struct netdev_adjacent *upper;
4597
4598	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 
4599
4600	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 
 
4601
4602	if (&upper->list == &dev->all_adj_list.upper)
4603		return NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4604
4605	*iter = &upper->list;
 
 
 
4606
4607	return upper->dev;
 
 
 
4608}
4609EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4610
4611/**
4612 * netdev_lower_get_next_private - Get the next ->private from the
4613 *				   lower neighbour list
4614 * @dev: device
4615 * @iter: list_head ** of the current position
4616 *
4617 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4618 * list, starting from iter position. The caller must hold either hold the
4619 * RTNL lock or its own locking that guarantees that the neighbour lower
4620 * list will remain unchainged.
4621 */
4622void *netdev_lower_get_next_private(struct net_device *dev,
4623				    struct list_head **iter)
4624{
4625	struct netdev_adjacent *lower;
4626
4627	lower = list_entry(*iter, struct netdev_adjacent, list);
4628
4629	if (&lower->list == &dev->adj_list.lower)
4630		return NULL;
 
4631
4632	*iter = lower->list.next;
 
 
 
 
 
 
 
 
 
 
 
 
4633
4634	return lower->private;
4635}
4636EXPORT_SYMBOL(netdev_lower_get_next_private);
4637
4638/**
4639 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4640 *				       lower neighbour list, RCU
4641 *				       variant
4642 * @dev: device
4643 * @iter: list_head ** of the current position
4644 *
4645 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4646 * list, starting from iter position. The caller must hold RCU read lock.
4647 */
4648void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4649					struct list_head **iter)
4650{
4651	struct netdev_adjacent *lower;
4652
4653	WARN_ON_ONCE(!rcu_read_lock_held());
4654
4655	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4656
4657	if (&lower->list == &dev->adj_list.lower)
4658		return NULL;
 
 
4659
4660	*iter = &lower->list;
 
 
4661
4662	return lower->private;
4663}
4664EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4665
4666/**
4667 * netdev_lower_get_next - Get the next device from the lower neighbour
4668 *                         list
4669 * @dev: device
4670 * @iter: list_head ** of the current position
4671 *
4672 * Gets the next netdev_adjacent from the dev's lower neighbour
4673 * list, starting from iter position. The caller must hold RTNL lock or
4674 * its own locking that guarantees that the neighbour lower
4675 * list will remain unchainged.
4676 */
4677void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 
4678{
4679	struct netdev_adjacent *lower;
4680
4681	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4682
4683	if (&lower->list == &dev->adj_list.lower)
4684		return NULL;
4685
4686	*iter = &lower->list;
4687
4688	return lower->dev;
4689}
4690EXPORT_SYMBOL(netdev_lower_get_next);
4691
4692/**
4693 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4694 *				       lower neighbour list, RCU
4695 *				       variant
4696 * @dev: device
4697 *
4698 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4699 * list. The caller must hold RCU read lock.
4700 */
4701void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4702{
4703	struct netdev_adjacent *lower;
4704
4705	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4706			struct netdev_adjacent, list);
4707	if (lower)
4708		return lower->private;
4709	return NULL;
4710}
4711EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4712
4713/**
4714 * netdev_master_upper_dev_get_rcu - Get master upper device
4715 * @dev: device
4716 *
4717 * Find a master upper device and return pointer to it or NULL in case
4718 * it's not there. The caller must hold the RCU read lock.
4719 */
4720struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4721{
4722	struct netdev_adjacent *upper;
4723
4724	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4725				       struct netdev_adjacent, list);
4726	if (upper && likely(upper->master))
4727		return upper->dev;
4728	return NULL;
4729}
4730EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4731
4732static int netdev_adjacent_sysfs_add(struct net_device *dev,
4733			      struct net_device *adj_dev,
4734			      struct list_head *dev_list)
4735{
4736	char linkname[IFNAMSIZ+7];
4737	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4738		"upper_%s" : "lower_%s", adj_dev->name);
4739	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4740				 linkname);
4741}
4742static void netdev_adjacent_sysfs_del(struct net_device *dev,
4743			       char *name,
4744			       struct list_head *dev_list)
4745{
4746	char linkname[IFNAMSIZ+7];
4747	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4748		"upper_%s" : "lower_%s", name);
4749	sysfs_remove_link(&(dev->dev.kobj), linkname);
4750}
4751
/*
 * True when @dev_list is one of @dev's neighbour lists (adj_list.upper or
 * adj_list.lower).  Both arguments are parenthesized so that arbitrary
 * expressions can be passed; note that each is evaluated more than once.
 */
#define netdev_adjacent_is_neigh_list(dev, dev_list) \
		((dev_list) == &(dev)->adj_list.upper || \
		 (dev_list) == &(dev)->adj_list.lower)
4755
4756static int __netdev_adjacent_dev_insert(struct net_device *dev,
4757					struct net_device *adj_dev,
4758					struct list_head *dev_list,
4759					void *private, bool master)
4760{
4761	struct netdev_adjacent *adj;
4762	int ret;
4763
4764	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4765
4766	if (adj) {
4767		adj->ref_nr++;
4768		return 0;
4769	}
4770
4771	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4772	if (!adj)
4773		return -ENOMEM;
4774
4775	adj->dev = adj_dev;
4776	adj->master = master;
4777	adj->ref_nr = 1;
4778	adj->private = private;
4779	dev_hold(adj_dev);
4780
4781	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4782		 adj_dev->name, dev->name, adj_dev->name);
4783
4784	if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4785		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4786		if (ret)
4787			goto free_adj;
4788	}
4789
4790	/* Ensure that master link is always the first item in list. */
4791	if (master) {
4792		ret = sysfs_create_link(&(dev->dev.kobj),
4793					&(adj_dev->dev.kobj), "master");
4794		if (ret)
4795			goto remove_symlinks;
4796
4797		list_add_rcu(&adj->list, dev_list);
4798	} else {
4799		list_add_tail_rcu(&adj->list, dev_list);
4800	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4801
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4802	return 0;
 
4803
4804remove_symlinks:
4805	if (netdev_adjacent_is_neigh_list(dev, dev_list))
4806		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4807free_adj:
4808	kfree(adj);
4809	dev_put(adj_dev);
4810
4811	return ret;
 
 
 
 
 
 
4812}
4813
4814static void __netdev_adjacent_dev_remove(struct net_device *dev,
4815					 struct net_device *adj_dev,
4816					 struct list_head *dev_list)
4817{
4818	struct netdev_adjacent *adj;
 
4819
4820	adj = __netdev_find_adj(dev, adj_dev, dev_list);
 
 
 
 
4821
4822	if (!adj) {
4823		pr_err("tried to remove device %s from %s\n",
4824		       dev->name, adj_dev->name);
4825		BUG();
4826	}
4827
4828	if (adj->ref_nr > 1) {
4829		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4830			 adj->ref_nr-1);
4831		adj->ref_nr--;
4832		return;
4833	}
4834
4835	if (adj->master)
4836		sysfs_remove_link(&(dev->dev.kobj), "master");
 
 
 
 
4837
4838	if (netdev_adjacent_is_neigh_list(dev, dev_list))
4839		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 
 
 
 
4840
4841	list_del_rcu(&adj->list);
4842	pr_debug("dev_put for %s, because link removed from %s to %s\n",
4843		 adj_dev->name, dev->name, adj_dev->name);
4844	dev_put(adj_dev);
4845	kfree_rcu(adj, rcu);
4846}
4847
4848static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4849					    struct net_device *upper_dev,
4850					    struct list_head *up_list,
4851					    struct list_head *down_list,
4852					    void *private, bool master)
4853{
4854	int ret;
4855
4856	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4857					   master);
4858	if (ret)
4859		return ret;
4860
4861	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4862					   false);
4863	if (ret) {
4864		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4865		return ret;
4866	}
 
4867
4868	return 0;
4869}
 
 
 
 
4870
4871static int __netdev_adjacent_dev_link(struct net_device *dev,
4872				      struct net_device *upper_dev)
4873{
4874	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4875						&dev->all_adj_list.upper,
4876						&upper_dev->all_adj_list.lower,
4877						NULL, false);
4878}
4879
/*
 * Undo __netdev_adjacent_dev_link_lists(): drop @upper_dev from @up_list
 * and @dev from @down_list, in that order.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction first ... */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* ... then the matching downward entry */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
4888
4889static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4890					 struct net_device *upper_dev)
4891{
4892	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4893					   &dev->all_adj_list.upper,
4894					   &upper_dev->all_adj_list.lower);
4895}
4896
4897static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4898						struct net_device *upper_dev,
4899						void *private, bool master)
4900{
4901	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4902
4903	if (ret)
4904		return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
4905
4906	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4907					       &dev->adj_list.upper,
4908					       &upper_dev->adj_list.lower,
4909					       private, master);
4910	if (ret) {
4911		__netdev_adjacent_dev_unlink(dev, upper_dev);
4912		return ret;
4913	}
4914
4915	return 0;
4916}
4917
4918static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4919						   struct net_device *upper_dev)
4920{
4921	__netdev_adjacent_dev_unlink(dev, upper_dev);
4922	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4923					   &dev->adj_list.upper,
4924					   &upper_dev->adj_list.lower);
4925}
4926
/*
 * Make @upper_dev an upper device of @dev and update the adjacency lists
 * of every device above @upper_dev and below @dev accordingly.
 *
 * @master marks the new link as the master link, @private is stored in
 * the neighbour entries.  The RTNL lock must be held (ASSERT_RTNL()).
 *
 * Returns 0 on success, -EBUSY when @dev and @upper_dev are the same
 * device, when @dev is already an upper device of @upper_dev (the link
 * would create a loop), or when a master link is requested while @dev
 * already has a master; -EEXIST when @upper_dev is already an upper
 * device of @dev; otherwise the error of the failing link step, after
 * every link created so far has been removed again.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices itself. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
	return 0;

	/* Error unwinding.  Each stage walks the same list as the loop it
	 * undoes and stops at the entry on which that loop failed (to_i /
	 * to_j).  Before falling through to the next stage the cursors are
	 * reset to NULL so that the earlier, fully completed loops are
	 * undone in their entirety.
	 */
rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		if (i == to_i)
			break;
	}

	/* finally drop the neighbour link created at the very beginning */
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
5027
5028/**
5029 * netdev_upper_dev_link - Add a link to the upper device
5030 * @dev: device
5031 * @upper_dev: new upper device
5032 *
5033 * Adds a link to device which is upper to this one. The caller must hold
5034 * the RTNL lock. On a failure a negative errno code is returned.
5035 * On success the reference counts are adjusted and the function
5036 * returns zero.
5037 */
5038int netdev_upper_dev_link(struct net_device *dev,
5039			  struct net_device *upper_dev)
5040{
5041	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5042}
5043EXPORT_SYMBOL(netdev_upper_dev_link);
5044
5045/**
5046 * netdev_master_upper_dev_link - Add a master link to the upper device
5047 * @dev: device
5048 * @upper_dev: new upper device
5049 *
5050 * Adds a link to device which is upper to this one. In this case, only
5051 * one master upper device can be linked, although other non-master devices
5052 * might be linked as well. The caller must hold the RTNL lock.
5053 * On a failure a negative errno code is returned. On success the reference
5054 * counts are adjusted and the function returns zero.
5055 */
5056int netdev_master_upper_dev_link(struct net_device *dev,
5057				 struct net_device *upper_dev)
5058{
5059	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5060}
5061EXPORT_SYMBOL(netdev_master_upper_dev_link);
5062
5063int netdev_master_upper_dev_link_private(struct net_device *dev,
5064					 struct net_device *upper_dev,
5065					 void *private)
 
 
 
5066{
5067	return __netdev_upper_dev_link(dev, upper_dev, true, private);
5068}
5069EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
 
 
 
5070
5071/**
5072 * netdev_upper_dev_unlink - Removes a link to upper device
5073 * @dev: device
5074 * @upper_dev: new upper device
5075 *
5076 * Removes a link to device which is upper to this one. The caller must hold
5077 * the RTNL lock.
 
 
5078 */
5079void netdev_upper_dev_unlink(struct net_device *dev,
5080			     struct net_device *upper_dev)
5081{
5082	struct netdev_adjacent *i, *j;
5083	ASSERT_RTNL();
5084
5085	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5086
5087	/* Here is the tricky part. We must remove all dev's lower
5088	 * devices from all upper_dev's upper devices and vice
5089	 * versa, to maintain the graph relationship.
5090	 */
5091	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5092		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5093			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5094
5095	/* remove also the devices itself from lower/upper device
5096	 * list
5097	 */
5098	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5099		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5100
5101	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5102		__netdev_adjacent_dev_unlink(dev, i->dev);
5103
5104	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5105}
5106EXPORT_SYMBOL(netdev_upper_dev_unlink);
5107
5108void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5109{
5110	struct netdev_adjacent *iter;
5111
5112	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5113		netdev_adjacent_sysfs_del(iter->dev, oldname,
5114					  &iter->dev->adj_list.lower);
5115		netdev_adjacent_sysfs_add(iter->dev, dev,
5116					  &iter->dev->adj_list.lower);
5117	}
5118
5119	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5120		netdev_adjacent_sysfs_del(iter->dev, oldname,
5121					  &iter->dev->adj_list.upper);
5122		netdev_adjacent_sysfs_add(iter->dev, dev,
5123					  &iter->dev->adj_list.upper);
5124	}
5125}
5126
5127void *netdev_lower_dev_get_private(struct net_device *dev,
5128				   struct net_device *lower_dev)
5129{
5130	struct netdev_adjacent *lower;
5131
5132	if (!lower_dev)
5133		return NULL;
5134	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5135	if (!lower)
5136		return NULL;
5137
5138	return lower->private;
5139}
5140EXPORT_SYMBOL(netdev_lower_dev_get_private);
5141
5142
5143int dev_get_nest_level(struct net_device *dev,
5144		       bool (*type_check)(struct net_device *dev))
 
 
 
 
 
 
 
 
5145{
5146	struct net_device *lower = NULL;
5147	struct list_head *iter;
5148	int max_nest = -1;
5149	int nest;
5150
5151	ASSERT_RTNL();
5152
5153	netdev_for_each_lower_dev(dev, lower, iter) {
5154		nest = dev_get_nest_level(lower, type_check);
5155		if (max_nest < nest)
5156			max_nest = nest;
5157	}
5158
5159	if (type_check(dev))
5160		max_nest++;
5161
5162	return max_nest;
 
5163}
5164EXPORT_SYMBOL(dev_get_nest_level);
5165
5166static void dev_change_rx_flags(struct net_device *dev, int flags)
5167{
5168	const struct net_device_ops *ops = dev->netdev_ops;
5169
5170	if (ops->ndo_change_rx_flags)
5171		ops->ndo_change_rx_flags(dev, flags);
5172}
5173
5174static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5175{
5176	unsigned int old_flags = dev->flags;
5177	kuid_t uid;
5178	kgid_t gid;
5179
5180	ASSERT_RTNL();
5181
5182	dev->flags |= IFF_PROMISC;
5183	dev->promiscuity += inc;
5184	if (dev->promiscuity == 0) {
5185		/*
5186		 * Avoid overflow.
5187		 * If inc causes overflow, untouch promisc and return error.
5188		 */
5189		if (inc < 0)
5190			dev->flags &= ~IFF_PROMISC;
5191		else {
5192			dev->promiscuity -= inc;
5193			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5194				dev->name);
5195			return -EOVERFLOW;
5196		}
5197	}
5198	if (dev->flags != old_flags) {
5199		pr_info("device %s %s promiscuous mode\n",
5200			dev->name,
5201			dev->flags & IFF_PROMISC ? "entered" : "left");
5202		if (audit_enabled) {
5203			current_uid_gid(&uid, &gid);
5204			audit_log(current->audit_context, GFP_ATOMIC,
5205				AUDIT_ANOM_PROMISCUOUS,
5206				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5207				dev->name, (dev->flags & IFF_PROMISC),
5208				(old_flags & IFF_PROMISC),
5209				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5210				from_kuid(&init_user_ns, uid),
5211				from_kgid(&init_user_ns, gid),
5212				audit_get_sessionid(current));
5213		}
5214
5215		dev_change_rx_flags(dev, IFF_PROMISC);
5216	}
5217	if (notify)
5218		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5219	return 0;
5220}
5221
5222/**
5223 *	dev_set_promiscuity	- update promiscuity count on a device
5224 *	@dev: device
5225 *	@inc: modifier
5226 *
5227 *	Add or remove promiscuity from a device. While the count in the device
5228 *	remains above zero the interface remains promiscuous. Once it hits zero
5229 *	the device reverts back to normal filtering operation. A negative inc
5230 *	value is used to drop promiscuity on the device.
5231 *	Return 0 if successful or a negative errno code on error.
5232 */
5233int dev_set_promiscuity(struct net_device *dev, int inc)
5234{
5235	unsigned int old_flags = dev->flags;
5236	int err;
5237
5238	err = __dev_set_promiscuity(dev, inc, true);
5239	if (err < 0)
5240		return err;
5241	if (dev->flags != old_flags)
5242		dev_set_rx_mode(dev);
5243	return err;
5244}
5245EXPORT_SYMBOL(dev_set_promiscuity);
5246
5247static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 
 
 
 
 
 
 
 
 
 
 
 
 
5248{
5249	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5250
5251	ASSERT_RTNL();
5252
5253	dev->flags |= IFF_ALLMULTI;
5254	dev->allmulti += inc;
5255	if (dev->allmulti == 0) {
5256		/*
5257		 * Avoid overflow.
5258		 * If inc causes overflow, untouch allmulti and return error.
5259		 */
5260		if (inc < 0)
5261			dev->flags &= ~IFF_ALLMULTI;
5262		else {
5263			dev->allmulti -= inc;
5264			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5265				dev->name);
5266			return -EOVERFLOW;
5267		}
5268	}
5269	if (dev->flags ^ old_flags) {
5270		dev_change_rx_flags(dev, IFF_ALLMULTI);
5271		dev_set_rx_mode(dev);
5272		if (notify)
5273			__dev_notify_flags(dev, old_flags,
5274					   dev->gflags ^ old_gflags);
5275	}
5276	return 0;
5277}
5278
5279/**
5280 *	dev_set_allmulti	- update allmulti count on a device
5281 *	@dev: device
5282 *	@inc: modifier
5283 *
5284 *	Add or remove reception of all multicast frames to a device. While the
5285 *	count in the device remains above zero the interface remains listening
5286 *	to all interfaces. Once it hits zero the device reverts back to normal
5287 *	filtering operation. A negative @inc value is used to drop the counter
5288 *	when releasing a resource needing all multicasts.
5289 *	Return 0 if successful or a negative errno code on error.
5290 */
5291
5292int dev_set_allmulti(struct net_device *dev, int inc)
5293{
5294	return __dev_set_allmulti(dev, inc, true);
5295}
5296EXPORT_SYMBOL(dev_set_allmulti);
5297
5298/*
5299 *	Upload unicast and multicast address lists to device and
5300 *	configure RX filtering. When the device doesn't support unicast
5301 *	filtering it is put in promiscuous mode while unicast addresses
5302 *	are present.
5303 */
5304void __dev_set_rx_mode(struct net_device *dev)
5305{
5306	const struct net_device_ops *ops = dev->netdev_ops;
5307
5308	/* dev_open will call this function so the list will stay sane. */
5309	if (!(dev->flags&IFF_UP))
5310		return;
5311
5312	if (!netif_device_present(dev))
5313		return;
5314
5315	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5316		/* Unicast addresses changes may only happen under the rtnl,
5317		 * therefore calling __dev_set_promiscuity here is safe.
5318		 */
5319		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5320			__dev_set_promiscuity(dev, 1, false);
5321			dev->uc_promisc = true;
5322		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5323			__dev_set_promiscuity(dev, -1, false);
5324			dev->uc_promisc = false;
5325		}
5326	}
5327
5328	if (ops->ndo_set_rx_mode)
5329		ops->ndo_set_rx_mode(dev);
5330}
5331
/*
 * Locked wrapper around __dev_set_rx_mode(): the RX mode of @dev is
 * reprogrammed with the device's address list lock held
 * (netif_addr_lock_bh()/netif_addr_unlock_bh()).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5338
5339/**
5340 *	dev_get_flags - get flags reported to userspace
5341 *	@dev: device
5342 *
5343 *	Get the combination of flag bits exported through APIs to userspace.
5344 */
5345unsigned int dev_get_flags(const struct net_device *dev)
5346{
5347	unsigned int flags;
5348
5349	flags = (dev->flags & ~(IFF_PROMISC |
5350				IFF_ALLMULTI |
5351				IFF_RUNNING |
5352				IFF_LOWER_UP |
5353				IFF_DORMANT)) |
5354		(dev->gflags & (IFF_PROMISC |
5355				IFF_ALLMULTI));
5356
5357	if (netif_running(dev)) {
5358		if (netif_oper_up(dev))
5359			flags |= IFF_RUNNING;
5360		if (netif_carrier_ok(dev))
5361			flags |= IFF_LOWER_UP;
5362		if (netif_dormant(dev))
5363			flags |= IFF_DORMANT;
5364	}
5365
5366	return flags;
5367}
5368EXPORT_SYMBOL(dev_get_flags);
5369
5370int __dev_change_flags(struct net_device *dev, unsigned int flags)
5371{
5372	unsigned int old_flags = dev->flags;
5373	int ret;
5374
5375	ASSERT_RTNL();
5376
5377	/*
5378	 *	Set the flags on our device.
5379	 */
5380
5381	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5382			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5383			       IFF_AUTOMEDIA)) |
5384		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5385				    IFF_ALLMULTI));
5386
5387	/*
5388	 *	Load in the correct multicast list now the flags have changed.
5389	 */
5390
5391	if ((old_flags ^ flags) & IFF_MULTICAST)
5392		dev_change_rx_flags(dev, IFF_MULTICAST);
5393
5394	dev_set_rx_mode(dev);
5395
5396	/*
5397	 *	Have we downed the interface. We handle IFF_UP ourselves
5398	 *	according to user attempts to set it, rather than blindly
5399	 *	setting it.
5400	 */
5401
5402	ret = 0;
5403	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
5404		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5405
5406		if (!ret)
5407			dev_set_rx_mode(dev);
5408	}
5409
5410	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5411		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5412		unsigned int old_flags = dev->flags;
5413
5414		dev->gflags ^= IFF_PROMISC;
5415
5416		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5417			if (dev->flags != old_flags)
5418				dev_set_rx_mode(dev);
5419	}
5420
5421	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5422	   is important. Some (broken) drivers set IFF_PROMISC, when
5423	   IFF_ALLMULTI is requested not asking us and not reporting.
5424	 */
5425	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5426		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5427
5428		dev->gflags ^= IFF_ALLMULTI;
5429		__dev_set_allmulti(dev, inc, false);
5430	}
5431
5432	return ret;
5433}
5434
5435void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5436			unsigned int gchanges)
5437{
5438	unsigned int changes = dev->flags ^ old_flags;
5439
5440	if (gchanges)
5441		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5442
5443	if (changes & IFF_UP) {
5444		if (dev->flags & IFF_UP)
5445			call_netdevice_notifiers(NETDEV_UP, dev);
5446		else
5447			call_netdevice_notifiers(NETDEV_DOWN, dev);
5448	}
5449
5450	if (dev->flags & IFF_UP &&
5451	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5452		struct netdev_notifier_change_info change_info;
5453
5454		change_info.flags_changed = changes;
5455		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5456					      &change_info.info);
5457	}
5458}
5459
5460/**
5461 *	dev_change_flags - change device settings
5462 *	@dev: device
5463 *	@flags: device state flags
5464 *
5465 *	Change settings on device based state flags. The flags are
5466 *	in the userspace exported format.
5467 */
5468int dev_change_flags(struct net_device *dev, unsigned int flags)
5469{
5470	int ret;
5471	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5472
5473	ret = __dev_change_flags(dev, flags);
5474	if (ret < 0)
5475		return ret;
5476
5477	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5478	__dev_notify_flags(dev, old_flags, changes);
 
 
 
5479	return ret;
5480}
5481EXPORT_SYMBOL(dev_change_flags);
5482
5483static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5484{
5485	const struct net_device_ops *ops = dev->netdev_ops;
5486
5487	if (ops->ndo_change_mtu)
5488		return ops->ndo_change_mtu(dev, new_mtu);
5489
5490	dev->mtu = new_mtu;
5491	return 0;
5492}
5493
5494/**
5495 *	dev_set_mtu - Change maximum transfer unit
5496 *	@dev: device
5497 *	@new_mtu: new transfer unit
5498 *
5499 *	Change the maximum transfer size of the network device.
5500 */
5501int dev_set_mtu(struct net_device *dev, int new_mtu)
5502{
5503	int err, orig_mtu;
 
5504
5505	if (new_mtu == dev->mtu)
5506		return 0;
5507
5508	/*	MTU must be positive.	 */
5509	if (new_mtu < 0)
5510		return -EINVAL;
5511
5512	if (!netif_device_present(dev))
5513		return -ENODEV;
5514
5515	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5516	err = notifier_to_errno(err);
5517	if (err)
5518		return err;
5519
5520	orig_mtu = dev->mtu;
5521	err = __dev_set_mtu(dev, new_mtu);
5522
5523	if (!err) {
5524		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5525		err = notifier_to_errno(err);
5526		if (err) {
5527			/* setting mtu back and notifying everyone again,
5528			 * so that they have a chance to revert changes.
5529			 */
5530			__dev_set_mtu(dev, orig_mtu);
5531			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5532		}
5533	}
5534	return err;
5535}
5536EXPORT_SYMBOL(dev_set_mtu);
5537
5538/**
5539 *	dev_set_group - Change group this device belongs to
5540 *	@dev: device
5541 *	@new_group: group this device should belong to
5542 */
5543void dev_set_group(struct net_device *dev, int new_group)
5544{
5545	dev->group = new_group;
5546}
5547EXPORT_SYMBOL(dev_set_group);
5548
5549/**
5550 *	dev_set_mac_address - Change Media Access Control Address
5551 *	@dev: device
5552 *	@sa: new address
5553 *
5554 *	Change the hardware (MAC) address of the device
5555 */
5556int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5557{
5558	const struct net_device_ops *ops = dev->netdev_ops;
5559	int err;
5560
5561	if (!ops->ndo_set_mac_address)
5562		return -EOPNOTSUPP;
5563	if (sa->sa_family != dev->type)
5564		return -EINVAL;
5565	if (!netif_device_present(dev))
5566		return -ENODEV;
5567	err = ops->ndo_set_mac_address(dev, sa);
5568	if (err)
5569		return err;
5570	dev->addr_assign_type = NET_ADDR_SET;
5571	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5572	add_device_randomness(dev->dev_addr, dev->addr_len);
5573	return 0;
5574}
5575EXPORT_SYMBOL(dev_set_mac_address);
5576
5577/**
5578 *	dev_change_carrier - Change device carrier
5579 *	@dev: device
5580 *	@new_carrier: new value
5581 *
5582 *	Change device carrier
5583 */
5584int dev_change_carrier(struct net_device *dev, bool new_carrier)
5585{
5586	const struct net_device_ops *ops = dev->netdev_ops;
 
5587
5588	if (!ops->ndo_change_carrier)
5589		return -EOPNOTSUPP;
5590	if (!netif_device_present(dev))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5591		return -ENODEV;
5592	return ops->ndo_change_carrier(dev, new_carrier);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5593}
5594EXPORT_SYMBOL(dev_change_carrier);
 
 
 
 
5595
5596/**
5597 *	dev_get_phys_port_id - Get device physical port ID
5598 *	@dev: device
5599 *	@ppid: port ID
 
5600 *
5601 *	Get device physical port ID
 
 
 
5602 */
5603int dev_get_phys_port_id(struct net_device *dev,
5604			 struct netdev_phys_port_id *ppid)
5605{
5606	const struct net_device_ops *ops = dev->netdev_ops;
 
 
5607
5608	if (!ops->ndo_get_phys_port_id)
5609		return -EOPNOTSUPP;
5610	return ops->ndo_get_phys_port_id(dev, ppid);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5611}
5612EXPORT_SYMBOL(dev_get_phys_port_id);
5613
5614/**
5615 *	dev_new_index	-	allocate an ifindex
5616 *	@net: the applicable net namespace
5617 *
5618 *	Returns a suitable unique value for a new device interface
5619 *	number.  The caller must hold the rtnl semaphore or the
5620 *	dev_base_lock to be sure it remains unique.
5621 */
5622static int dev_new_index(struct net *net)
5623{
5624	int ifindex = net->ifindex;
5625	for (;;) {
5626		if (++ifindex <= 0)
5627			ifindex = 1;
5628		if (!__dev_get_by_index(net, ifindex))
5629			return net->ifindex = ifindex;
5630	}
5631}
5632
5633/* Delayed registration/unregisteration */
5634static LIST_HEAD(net_todo_list);
5635DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5636
5637static void net_set_todo(struct net_device *dev)
5638{
5639	list_add_tail(&dev->todo_list, &net_todo_list);
5640	dev_net(dev)->dev_unreg_count++;
5641}
5642
5643static void rollback_registered_many(struct list_head *head)
5644{
5645	struct net_device *dev, *tmp;
5646	LIST_HEAD(close_head);
5647
5648	BUG_ON(dev_boot_phase);
5649	ASSERT_RTNL();
5650
5651	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5652		/* Some devices call without registering
5653		 * for initialization unwind. Remove those
5654		 * devices and proceed with the remaining.
5655		 */
5656		if (dev->reg_state == NETREG_UNINITIALIZED) {
5657			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5658				 dev->name, dev);
5659
5660			WARN_ON(1);
5661			list_del(&dev->unreg_list);
5662			continue;
5663		}
5664		dev->dismantle = true;
5665		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5666	}
5667
5668	/* If device is running, close it first. */
5669	list_for_each_entry(dev, head, unreg_list)
5670		list_add_tail(&dev->close_list, &close_head);
5671	dev_close_many(&close_head);
5672
5673	list_for_each_entry(dev, head, unreg_list) {
5674		/* And unlink it from device chain. */
5675		unlist_netdevice(dev);
5676
5677		dev->reg_state = NETREG_UNREGISTERING;
5678	}
5679
5680	synchronize_net();
5681
5682	list_for_each_entry(dev, head, unreg_list) {
5683		/* Shutdown queueing discipline. */
5684		dev_shutdown(dev);
5685
5686
5687		/* Notify protocols, that we are about to destroy
5688		   this device. They should clean all the things.
5689		*/
5690		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5691
5692		if (!dev->rtnl_link_ops ||
5693		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5694			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5695
5696		/*
5697		 *	Flush the unicast and multicast chains
5698		 */
5699		dev_uc_flush(dev);
5700		dev_mc_flush(dev);
5701
5702		if (dev->netdev_ops->ndo_uninit)
5703			dev->netdev_ops->ndo_uninit(dev);
5704
5705		/* Notifier chain MUST detach us all upper devices. */
5706		WARN_ON(netdev_has_any_upper_dev(dev));
5707
5708		/* Remove entries from kobject tree */
5709		netdev_unregister_kobject(dev);
5710#ifdef CONFIG_XPS
5711		/* Remove XPS queueing entries */
5712		netif_reset_xps_queues_gt(dev, 0);
5713#endif
5714	}
5715
 
 
 
 
5716	synchronize_net();
5717
5718	list_for_each_entry(dev, head, unreg_list)
5719		dev_put(dev);
5720}
5721
5722static void rollback_registered(struct net_device *dev)
5723{
5724	LIST_HEAD(single);
5725
5726	list_add(&dev->unreg_list, &single);
5727	rollback_registered_many(&single);
5728	list_del(&single);
5729}
5730
5731static netdev_features_t netdev_fix_features(struct net_device *dev,
5732	netdev_features_t features)
5733{
5734	/* Fix illegal checksum combinations */
5735	if ((features & NETIF_F_HW_CSUM) &&
5736	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5737		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5738		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5739	}
5740
 
 
 
 
 
 
 
 
5741	/* TSO requires that SG is present as well. */
5742	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5743		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5744		features &= ~NETIF_F_ALL_TSO;
5745	}
5746
5747	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5748					!(features & NETIF_F_IP_CSUM)) {
5749		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5750		features &= ~NETIF_F_TSO;
5751		features &= ~NETIF_F_TSO_ECN;
5752	}
5753
5754	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5755					 !(features & NETIF_F_IPV6_CSUM)) {
5756		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5757		features &= ~NETIF_F_TSO6;
5758	}
5759
5760	/* TSO ECN requires that TSO is present as well. */
5761	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5762		features &= ~NETIF_F_TSO_ECN;
5763
5764	/* Software GSO depends on SG. */
5765	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5766		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5767		features &= ~NETIF_F_GSO;
5768	}
5769
5770	/* UFO needs SG and checksumming */
5771	if (features & NETIF_F_UFO) {
5772		/* maybe split UFO into V4 and V6? */
5773		if (!((features & NETIF_F_GEN_CSUM) ||
5774		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5775			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5776			netdev_dbg(dev,
5777				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5778			features &= ~NETIF_F_UFO;
5779		}
5780
5781		if (!(features & NETIF_F_SG)) {
5782			netdev_dbg(dev,
5783				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5784			features &= ~NETIF_F_UFO;
5785		}
5786	}
5787
5788#ifdef CONFIG_NET_RX_BUSY_POLL
5789	if (dev->netdev_ops->ndo_busy_poll)
5790		features |= NETIF_F_BUSY_POLL;
5791	else
5792#endif
5793		features &= ~NETIF_F_BUSY_POLL;
5794
5795	return features;
5796}
5797
5798int __netdev_update_features(struct net_device *dev)
5799{
5800	netdev_features_t features;
5801	int err = 0;
5802
5803	ASSERT_RTNL();
5804
5805	features = netdev_get_wanted_features(dev);
5806
5807	if (dev->netdev_ops->ndo_fix_features)
5808		features = dev->netdev_ops->ndo_fix_features(dev, features);
5809
5810	/* driver might be less strict about feature dependencies */
5811	features = netdev_fix_features(dev, features);
5812
5813	if (dev->features == features)
5814		return 0;
5815
5816	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5817		&dev->features, &features);
5818
5819	if (dev->netdev_ops->ndo_set_features)
5820		err = dev->netdev_ops->ndo_set_features(dev, features);
5821
5822	if (unlikely(err < 0)) {
5823		netdev_err(dev,
5824			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5825			err, &features, &dev->features);
5826		return -1;
5827	}
5828
5829	if (!err)
5830		dev->features = features;
5831
5832	return 1;
5833}
5834
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	The notification is sent whenever __netdev_update_features()
 *	returns a non-zero value.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5849
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we always notify. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5866
5867/**
5868 *	netif_stacked_transfer_operstate -	transfer operstate
5869 *	@rootdev: the root or lower level device to transfer state from
5870 *	@dev: the device to transfer operstate to
5871 *
5872 *	Transfer operational state from root to device. This is normally
5873 *	called when a stacking relationship exists between the root
5874 *	device and the device(a leaf device).
5875 */
5876void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5877					struct net_device *dev)
5878{
5879	if (rootdev->operstate == IF_OPER_DORMANT)
5880		netif_dormant_on(dev);
5881	else
5882		netif_dormant_off(dev);
5883
5884	if (netif_carrier_ok(rootdev)) {
5885		if (!netif_carrier_ok(dev))
5886			netif_carrier_on(dev);
5887	} else {
5888		if (netif_carrier_ok(dev))
5889			netif_carrier_off(dev);
5890	}
5891}
5892EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5893
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed dev->_rx array with dev->num_rx_queues entries and
 * point each entry back at @dev.  Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
5913
5914static void netdev_init_one_queue(struct net_device *dev,
5915				  struct netdev_queue *queue, void *_unused)
5916{
5917	/* Initialize queue lock */
5918	spin_lock_init(&queue->_xmit_lock);
5919	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5920	queue->xmit_lock_owner = -1;
5921	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5922	queue->dev = dev;
5923#ifdef CONFIG_BQL
5924	dql_init(&queue->dql, HZ);
5925#endif
5926}
5927
5928static void netif_free_tx_queues(struct net_device *dev)
5929{
5930	if (is_vmalloc_addr(dev->_tx))
5931		vfree(dev->_tx);
5932	else
5933		kfree(dev->_tx);
5934}
5935
5936static int netif_alloc_netdev_queues(struct net_device *dev)
5937{
5938	unsigned int count = dev->num_tx_queues;
5939	struct netdev_queue *tx;
5940	size_t sz = count * sizeof(*tx);
5941
5942	BUG_ON(count < 1 || count > 0xffff);
5943
5944	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5945	if (!tx) {
5946		tx = vzalloc(sz);
5947		if (!tx)
5948			return -ENOMEM;
5949	}
5950	dev->_tx = tx;
5951
5952	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5953	spin_lock_init(&dev->tx_global_lock);
5954
5955	return 0;
5956}
5957
5958/**
5959 *	register_netdevice	- register a network device
5960 *	@dev: device to register
5961 *
5962 *	Take a completed network device structure and add it to the kernel
5963 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5964 *	chain. 0 is returned on success. A negative errno code is returned
5965 *	on a failure to set up the device, or if the name is a duplicate.
5966 *
5967 *	Callers must hold the rtnl semaphore. You may want
5968 *	register_netdev() instead of this.
5969 *
5970 *	BUGS:
5971 *	The locking appears insufficient to guarantee two parallel registers
5972 *	will not get the same name.
5973 */
5974
5975int register_netdevice(struct net_device *dev)
5976{
5977	int ret;
5978	struct net *net = dev_net(dev);
5979
5980	BUG_ON(dev_boot_phase);
5981	ASSERT_RTNL();
5982
5983	might_sleep();
5984
5985	/* When net_device's are persistent, this will be fatal. */
5986	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5987	BUG_ON(!net);
5988
5989	spin_lock_init(&dev->addr_list_lock);
5990	netdev_set_addr_lockdep_class(dev);
5991
5992	dev->iflink = -1;
5993
5994	ret = dev_get_valid_name(net, dev, dev->name);
5995	if (ret < 0)
5996		goto out;
5997
5998	/* Init, if this function is available */
5999	if (dev->netdev_ops->ndo_init) {
6000		ret = dev->netdev_ops->ndo_init(dev);
6001		if (ret) {
6002			if (ret > 0)
6003				ret = -EIO;
6004			goto out;
6005		}
6006	}
6007
6008	if (((dev->hw_features | dev->features) &
6009	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6010	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6011	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6012		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6013		ret = -EINVAL;
6014		goto err_uninit;
6015	}
6016
6017	ret = -EBUSY;
6018	if (!dev->ifindex)
6019		dev->ifindex = dev_new_index(net);
6020	else if (__dev_get_by_index(net, dev->ifindex))
6021		goto err_uninit;
6022
6023	if (dev->iflink == -1)
6024		dev->iflink = dev->ifindex;
6025
6026	/* Transfer changeable features to wanted_features and enable
6027	 * software offloads (GSO and GRO).
6028	 */
6029	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6030	dev->features |= NETIF_F_SOFT_FEATURES;
6031	dev->wanted_features = dev->features & dev->hw_features;
6032
 
6033	if (!(dev->flags & IFF_LOOPBACK)) {
6034		dev->hw_features |= NETIF_F_NOCACHE_COPY;
 
 
 
 
6035	}
6036
6037	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6038	 */
6039	dev->vlan_features |= NETIF_F_HIGHDMA;
6040
6041	/* Make NETIF_F_SG inheritable to tunnel devices.
6042	 */
6043	dev->hw_enc_features |= NETIF_F_SG;
6044
6045	/* Make NETIF_F_SG inheritable to MPLS.
6046	 */
6047	dev->mpls_features |= NETIF_F_SG;
6048
6049	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6050	ret = notifier_to_errno(ret);
6051	if (ret)
6052		goto err_uninit;
6053
6054	ret = netdev_register_kobject(dev);
6055	if (ret)
6056		goto err_uninit;
6057	dev->reg_state = NETREG_REGISTERED;
6058
6059	__netdev_update_features(dev);
6060
6061	/*
6062	 *	Default initial state at registry is that the
6063	 *	device is present.
6064	 */
6065
6066	set_bit(__LINK_STATE_PRESENT, &dev->state);
6067
6068	linkwatch_init_dev(dev);
6069
6070	dev_init_scheduler(dev);
6071	dev_hold(dev);
6072	list_netdevice(dev);
6073	add_device_randomness(dev->dev_addr, dev->addr_len);
6074
6075	/* If the device has permanent device address, driver should
6076	 * set dev_addr and also addr_assign_type should be set to
6077	 * NET_ADDR_PERM (default value).
6078	 */
6079	if (dev->addr_assign_type == NET_ADDR_PERM)
6080		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6081
6082	/* Notify protocols, that a new device appeared. */
6083	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6084	ret = notifier_to_errno(ret);
6085	if (ret) {
6086		rollback_registered(dev);
6087		dev->reg_state = NETREG_UNREGISTERED;
6088	}
6089	/*
6090	 *	Prevent userspace races by waiting until the network
6091	 *	device is fully setup before sending notifications.
6092	 */
6093	if (!dev->rtnl_link_ops ||
6094	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6095		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6096
6097out:
6098	return ret;
6099
6100err_uninit:
6101	if (dev->netdev_ops->ndo_uninit)
6102		dev->netdev_ops->ndo_uninit(dev);
6103	goto out;
6104}
6105EXPORT_SYMBOL(register_netdevice);
6106
6107/**
6108 *	init_dummy_netdev	- init a dummy network device for NAPI
6109 *	@dev: device to init
6110 *
6111 *	This takes a network device structure and initialize the minimum
6112 *	amount of fields so it can be used to schedule NAPI polls without
6113 *	registering a full blown interface. This is to be used by drivers
6114 *	that need to tie several hardware interfaces to a single NAPI
6115 *	poll scheduler due to HW limitations.
6116 */
6117int init_dummy_netdev(struct net_device *dev)
6118{
6119	/* Clear everything. Note we don't initialize spinlocks
6120	 * are they aren't supposed to be taken by any of the
6121	 * NAPI code and this dummy netdev is supposed to be
6122	 * only ever used for NAPI polls
6123	 */
6124	memset(dev, 0, sizeof(struct net_device));
6125
6126	/* make sure we BUG if trying to hit standard
6127	 * register/unregister code path
6128	 */
6129	dev->reg_state = NETREG_DUMMY;
6130
6131	/* NAPI wants this */
6132	INIT_LIST_HEAD(&dev->napi_list);
6133
6134	/* a dummy interface is started by default */
6135	set_bit(__LINK_STATE_PRESENT, &dev->state);
6136	set_bit(__LINK_STATE_START, &dev->state);
6137
6138	/* Note : We dont allocate pcpu_refcnt for dummy devices,
6139	 * because users of this 'device' dont need to change
6140	 * its refcount.
6141	 */
6142
6143	return 0;
6144}
6145EXPORT_SYMBOL_GPL(init_dummy_netdev);
6146
6147
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
6171
6172int netdev_refcnt_read(const struct net_device *dev)
6173{
6174	int i, refcnt = 0;
6175
6176	for_each_possible_cpu(i)
6177		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6178	return refcnt;
6179}
6180EXPORT_SYMBOL(netdev_refcnt_read);
6181
6182/**
6183 * netdev_wait_allrefs - wait until all references are gone.
6184 * @dev: target net_device
6185 *
6186 * This is called when unregistering network devices.
6187 *
6188 * Any protocol or device that holds a reference should register
6189 * for netdevice notification, and cleanup and put back the
6190 * reference if they receive an UNREGISTER event.
6191 * We can get stuck here if buggy protocols don't correctly
6192 * call dev_put.
6193 */
6194static void netdev_wait_allrefs(struct net_device *dev)
6195{
6196	unsigned long rebroadcast_time, warning_time;
6197	int refcnt;
6198
6199	linkwatch_forget_dev(dev);
6200
6201	rebroadcast_time = warning_time = jiffies;
6202	refcnt = netdev_refcnt_read(dev);
6203
6204	while (refcnt != 0) {
6205		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6206			rtnl_lock();
6207
6208			/* Rebroadcast unregister notification */
6209			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
 
6210
6211			__rtnl_unlock();
6212			rcu_barrier();
6213			rtnl_lock();
6214
6215			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6216			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6217				     &dev->state)) {
6218				/* We must not have linkwatch events
6219				 * pending on unregister. If this
6220				 * happens, we simply run the queue
6221				 * unscheduled, resulting in a noop
6222				 * for this device.
6223				 */
6224				linkwatch_run_queue();
6225			}
6226
6227			__rtnl_unlock();
6228
6229			rebroadcast_time = jiffies;
6230		}
6231
6232		msleep(250);
6233
6234		refcnt = netdev_refcnt_read(dev);
6235
6236		if (time_after(jiffies, warning_time + 10 * HZ)) {
6237			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6238				 dev->name, refcnt);
6239			warning_time = jiffies;
6240		}
6241	}
6242}
6243
6244/* The sequence is:
6245 *
6246 *	rtnl_lock();
6247 *	...
6248 *	register_netdevice(x1);
6249 *	register_netdevice(x2);
6250 *	...
6251 *	unregister_netdevice(y1);
6252 *	unregister_netdevice(y2);
6253 *      ...
6254 *	rtnl_unlock();
6255 *	free_netdev(y1);
6256 *	free_netdev(y2);
6257 *
6258 * We are invoked by rtnl_unlock().
6259 * This allows us to deal with problems:
6260 * 1) We can delete sysfs objects which invoke hotplug
6261 *    without deadlocking with linkwatch via keventd.
6262 * 2) Since we run with the RTNL semaphore not held, we can sleep
6263 *    safely in order to wait for the netdev refcnt to drop to zero.
6264 *
6265 * We must not return until all unregister events added during
6266 * the interval the lock was held have been completed.
6267 */
6268void netdev_run_todo(void)
6269{
6270	struct list_head list;
6271
6272	/* Snapshot list, allow later requests */
6273	list_replace_init(&net_todo_list, &list);
6274
6275	__rtnl_unlock();
6276
6277
6278	/* Wait for rcu callbacks to finish before next phase */
 
6279	if (!list_empty(&list))
6280		rcu_barrier();
6281
6282	while (!list_empty(&list)) {
6283		struct net_device *dev
6284			= list_first_entry(&list, struct net_device, todo_list);
6285		list_del(&dev->todo_list);
6286
6287		rtnl_lock();
6288		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6289		__rtnl_unlock();
6290
6291		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6292			pr_err("network todo '%s' but state %d\n",
6293			       dev->name, dev->reg_state);
6294			dump_stack();
6295			continue;
6296		}
6297
6298		dev->reg_state = NETREG_UNREGISTERED;
6299
6300		on_each_cpu(flush_backlog, dev, 1);
6301
6302		netdev_wait_allrefs(dev);
6303
6304		/* paranoia */
6305		BUG_ON(netdev_refcnt_read(dev));
6306		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6307		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6308		WARN_ON(dev->dn_ptr);
6309
6310		if (dev->destructor)
6311			dev->destructor(dev);
6312
6313		/* Report a network device has been unregistered */
6314		rtnl_lock();
6315		dev_net(dev)->dev_unreg_count--;
6316		__rtnl_unlock();
6317		wake_up(&netdev_unregistering_wq);
6318
6319		/* Free network device */
6320		kobject_put(&dev->dev.kobj);
6321	}
6322}
6323
6324/* Convert net_device_stats to rtnl_link_stats64.  They have the same
6325 * fields in the same order, with only the type differing.
6326 */
6327void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6328			     const struct net_device_stats *netdev_stats)
6329{
6330#if BITS_PER_LONG == 64
6331	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6332	memcpy(stats64, netdev_stats, sizeof(*stats64));
6333#else
6334	size_t i, n = sizeof(*stats64) / sizeof(u64);
6335	const unsigned long *src = (const unsigned long *)netdev_stats;
6336	u64 *dst = (u64 *)stats64;
6337
6338	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6339		     sizeof(*stats64) / sizeof(u64));
6340	for (i = 0; i < n; i++)
6341		dst[i] = src[i];
6342#endif
6343}
6344EXPORT_SYMBOL(netdev_stats_to_stats64);
6345
6346/**
6347 *	dev_get_stats	- get network device statistics
6348 *	@dev: device to get statistics from
6349 *	@storage: place to store stats
6350 *
6351 *	Get network statistics from device. Return @storage.
6352 *	The device driver may provide its own method by setting
6353 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6354 *	otherwise the internal statistics structure is used.
6355 */
6356struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6357					struct rtnl_link_stats64 *storage)
6358{
6359	const struct net_device_ops *ops = dev->netdev_ops;
6360
6361	if (ops->ndo_get_stats64) {
6362		memset(storage, 0, sizeof(*storage));
6363		ops->ndo_get_stats64(dev, storage);
6364	} else if (ops->ndo_get_stats) {
6365		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6366	} else {
6367		netdev_stats_to_stats64(storage, &dev->stats);
6368	}
6369	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6370	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6371	return storage;
6372}
6373EXPORT_SYMBOL(dev_get_stats);
6374
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with noop_qdisc and published through
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6392
6393static const struct ethtool_ops default_ethtool_ops;
6394
6395void netdev_set_default_ethtool_ops(struct net_device *dev,
6396				    const struct ethtool_ops *ops)
6397{
6398	if (dev->ethtool_ops == &default_ethtool_ops)
6399		dev->ethtool_ops = ops;
6400}
6401EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6402
6403void netdev_freemem(struct net_device *dev)
6404{
6405	char *addr = (char *)dev - dev->padded;
6406
6407	if (is_vmalloc_addr(addr))
6408		vfree(addr);
6409	else
6410		kfree(addr);
6411}
6412
6413/**
6414 *	alloc_netdev_mqs - allocate network device
6415 *	@sizeof_priv:	size of private data to allocate space for
6416 *	@name:		device name format string
6417 *	@setup:		callback to initialize device
6418 *	@txqs:		the number of TX subqueues to allocate
6419 *	@rxqs:		the number of RX subqueues to allocate
6420 *
6421 *	Allocates a struct net_device with private data area for driver use
6422 *	and performs basic initialization.  Also allocates subqueue structs
6423 *	for each queue on the device.
6424 */
6425struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6426		void (*setup)(struct net_device *),
6427		unsigned int txqs, unsigned int rxqs)
6428{
6429	struct net_device *dev;
6430	size_t alloc_size;
6431	struct net_device *p;
6432
6433	BUG_ON(strlen(name) >= sizeof(dev->name));
6434
6435	if (txqs < 1) {
6436		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6437		return NULL;
6438	}
6439
6440#ifdef CONFIG_SYSFS
6441	if (rxqs < 1) {
6442		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6443		return NULL;
6444	}
6445#endif
6446
6447	alloc_size = sizeof(struct net_device);
6448	if (sizeof_priv) {
6449		/* ensure 32-byte alignment of private area */
6450		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6451		alloc_size += sizeof_priv;
6452	}
6453	/* ensure 32-byte alignment of whole construct */
6454	alloc_size += NETDEV_ALIGN - 1;
6455
6456	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6457	if (!p)
6458		p = vzalloc(alloc_size);
6459	if (!p)
6460		return NULL;
 
6461
6462	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6463	dev->padded = (char *)dev - (char *)p;
6464
6465	dev->pcpu_refcnt = alloc_percpu(int);
6466	if (!dev->pcpu_refcnt)
6467		goto free_dev;
6468
6469	if (dev_addr_init(dev))
6470		goto free_pcpu;
6471
6472	dev_mc_init(dev);
6473	dev_uc_init(dev);
6474
6475	dev_net_set(dev, &init_net);
6476
6477	dev->gso_max_size = GSO_MAX_SIZE;
6478	dev->gso_max_segs = GSO_MAX_SEGS;
6479
6480	INIT_LIST_HEAD(&dev->napi_list);
6481	INIT_LIST_HEAD(&dev->unreg_list);
6482	INIT_LIST_HEAD(&dev->close_list);
6483	INIT_LIST_HEAD(&dev->link_watch_list);
6484	INIT_LIST_HEAD(&dev->adj_list.upper);
6485	INIT_LIST_HEAD(&dev->adj_list.lower);
6486	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6487	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6488	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6489	setup(dev);
6490
6491	dev->num_tx_queues = txqs;
6492	dev->real_num_tx_queues = txqs;
6493	if (netif_alloc_netdev_queues(dev))
6494		goto free_all;
6495
6496#ifdef CONFIG_SYSFS
6497	dev->num_rx_queues = rxqs;
6498	dev->real_num_rx_queues = rxqs;
6499	if (netif_alloc_rx_queues(dev))
6500		goto free_all;
6501#endif
6502
6503	strcpy(dev->name, name);
6504	dev->group = INIT_NETDEV_GROUP;
6505	if (!dev->ethtool_ops)
6506		dev->ethtool_ops = &default_ethtool_ops;
6507	return dev;
6508
6509free_all:
6510	free_netdev(dev);
6511	return NULL;
6512
6513free_pcpu:
6514	free_percpu(dev->pcpu_refcnt);
6515	netif_free_tx_queues(dev);
6516#ifdef CONFIG_SYSFS
6517	kfree(dev->_rx);
6518#endif
6519
6520free_dev:
6521	netdev_freemem(dev);
6522	return NULL;
6523}
6524EXPORT_SYMBOL(alloc_netdev_mqs);
6525
6526/**
6527 *	free_netdev - free network device
6528 *	@dev: device
6529 *
6530 *	This function does the last stage of destroying an allocated device
6531 * 	interface. The reference to the device object is released.
6532 *	If this is the last reference then it will be freed.
6533 */
6534void free_netdev(struct net_device *dev)
6535{
6536	struct napi_struct *p, *n;
6537
6538	release_net(dev_net(dev));
6539
6540	netif_free_tx_queues(dev);
6541#ifdef CONFIG_SYSFS
6542	kfree(dev->_rx);
6543#endif
6544
6545	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6546
6547	/* Flush device addresses */
6548	dev_addr_flush(dev);
6549
6550	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6551		netif_napi_del(p);
6552
6553	free_percpu(dev->pcpu_refcnt);
6554	dev->pcpu_refcnt = NULL;
6555
6556	/*  Compatibility with error handling in drivers */
6557	if (dev->reg_state == NETREG_UNINITIALIZED) {
6558		netdev_freemem(dev);
6559		return;
6560	}
6561
6562	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6563	dev->reg_state = NETREG_RELEASED;
6564
6565	/* will free via device release */
6566	put_device(&dev->dev);
6567}
6568EXPORT_SYMBOL(free_netdev);
6569
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits for an RCU grace period, so packets currently being received
 *	are done.  Does not block later packets from starting.  The
 *	expedited grace period is used when the rtnl lock is held.
 *	May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}
	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6585
6586/**
6587 *	unregister_netdevice_queue - remove device from the kernel
6588 *	@dev: device
6589 *	@head: list
6590 *
6591 *	This function shuts down a device interface and removes it
6592 *	from the kernel tables.
6593 *	If head not NULL, device is queued to be unregistered later.
6594 *
6595 *	Callers must hold the rtnl semaphore.  You may want
6596 *	unregister_netdev() instead of this.
6597 */
6598
6599void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6600{
6601	ASSERT_RTNL();
6602
6603	if (head) {
6604		list_move_tail(&dev->unreg_list, head);
6605	} else {
6606		rollback_registered(dev);
6607		/* Finish processing unregister after unlock */
6608		net_set_todo(dev);
6609	}
6610}
6611EXPORT_SYMBOL(unregister_netdevice_queue);
6612
6613/**
6614 *	unregister_netdevice_many - unregister many devices
6615 *	@head: list of devices
6616 */
6617void unregister_netdevice_many(struct list_head *head)
6618{
6619	struct net_device *dev;
6620
6621	if (!list_empty(head)) {
6622		rollback_registered_many(head);
6623		list_for_each_entry(dev, head, unreg_list)
6624			net_set_todo(dev);
6625	}
6626}
6627EXPORT_SYMBOL(unregister_netdevice_many);
6628
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  In
 *	general this is the variant to use.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6647
6648/**
6649 *	dev_change_net_namespace - move device to different nethost namespace
6650 *	@dev: device
6651 *	@net: network namespace
6652 *	@pat: If not NULL name pattern to try if the current device name
6653 *	      is already taken in the destination network namespace.
6654 *
6655 *	This function shuts down a device interface and moves it
6656 *	to a new network namespace. On success 0 is returned, on
6657 *	a failure a netagive errno code is returned.
6658 *
6659 *	Callers must hold the rtnl semaphore.
6660 */
6661
6662int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6663{
6664	int err;
6665
6666	ASSERT_RTNL();
6667
6668	/* Don't allow namespace local devices to be moved. */
6669	err = -EINVAL;
6670	if (dev->features & NETIF_F_NETNS_LOCAL)
6671		goto out;
6672
6673	/* Ensure the device has been registrered */
 
6674	if (dev->reg_state != NETREG_REGISTERED)
6675		goto out;
6676
6677	/* Get out if there is nothing todo */
6678	err = 0;
6679	if (net_eq(dev_net(dev), net))
6680		goto out;
6681
6682	/* Pick the destination device name, and ensure
6683	 * we can use it in the destination network namespace.
6684	 */
6685	err = -EEXIST;
6686	if (__dev_get_by_name(net, dev->name)) {
6687		/* We get here if we can't use the current device name */
6688		if (!pat)
6689			goto out;
6690		if (dev_get_valid_name(net, dev, pat) < 0)
6691			goto out;
6692	}
6693
6694	/*
6695	 * And now a mini version of register_netdevice unregister_netdevice.
6696	 */
6697
6698	/* If device is running close it first. */
6699	dev_close(dev);
6700
6701	/* And unlink it from device chain */
6702	err = -ENODEV;
6703	unlist_netdevice(dev);
6704
6705	synchronize_net();
6706
6707	/* Shutdown queueing discipline. */
6708	dev_shutdown(dev);
6709
6710	/* Notify protocols, that we are about to destroy
6711	   this device. They should clean all the things.
6712
6713	   Note that dev->reg_state stays at NETREG_REGISTERED.
6714	   This is wanted because this way 8021q and macvlan know
6715	   the device is just moving and can keep their slaves up.
6716	*/
6717	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6718	rcu_barrier();
6719	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6720	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6721
6722	/*
6723	 *	Flush the unicast and multicast chains
6724	 */
6725	dev_uc_flush(dev);
6726	dev_mc_flush(dev);
6727
6728	/* Send a netdev-removed uevent to the old namespace */
6729	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6730
6731	/* Actually switch the network namespace */
6732	dev_net_set(dev, net);
6733
6734	/* If there is an ifindex conflict assign a new one */
6735	if (__dev_get_by_index(net, dev->ifindex)) {
6736		int iflink = (dev->iflink == dev->ifindex);
6737		dev->ifindex = dev_new_index(net);
6738		if (iflink)
6739			dev->iflink = dev->ifindex;
6740	}
6741
6742	/* Send a netdev-add uevent to the new namespace */
6743	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6744
6745	/* Fixup kobjects */
6746	err = device_rename(&dev->dev, dev->name);
6747	WARN_ON(err);
6748
6749	/* Add the device back in the hashes */
6750	list_netdevice(dev);
6751
6752	/* Notify protocols, that a new device appeared. */
6753	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6754
6755	/*
6756	 *	Prevent userspace races by waiting until the network
6757	 *	device is fully setup before sending notifications.
6758	 */
6759	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6760
6761	synchronize_net();
6762	err = 0;
6763out:
6764	return err;
6765}
6766EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6767
6768static int dev_cpu_callback(struct notifier_block *nfb,
6769			    unsigned long action,
6770			    void *ocpu)
6771{
6772	struct sk_buff **list_skb;
6773	struct sk_buff *skb;
6774	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6775	struct softnet_data *sd, *oldsd;
6776
6777	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6778		return NOTIFY_OK;
6779
6780	local_irq_disable();
6781	cpu = smp_processor_id();
6782	sd = &per_cpu(softnet_data, cpu);
6783	oldsd = &per_cpu(softnet_data, oldcpu);
6784
6785	/* Find end of our completion_queue. */
6786	list_skb = &sd->completion_queue;
6787	while (*list_skb)
6788		list_skb = &(*list_skb)->next;
6789	/* Append completion queue from offline CPU. */
6790	*list_skb = oldsd->completion_queue;
6791	oldsd->completion_queue = NULL;
6792
6793	/* Append output queue from offline CPU. */
6794	if (oldsd->output_queue) {
6795		*sd->output_queue_tailp = oldsd->output_queue;
6796		sd->output_queue_tailp = oldsd->output_queue_tailp;
6797		oldsd->output_queue = NULL;
6798		oldsd->output_queue_tailp = &oldsd->output_queue;
6799	}
6800	/* Append NAPI poll list from offline CPU. */
6801	if (!list_empty(&oldsd->poll_list)) {
6802		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6803		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6804	}
6805
6806	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6807	local_irq_enable();
6808
6809	/* Process offline CPU's input_pkt_queue */
6810	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6811		netif_rx_internal(skb);
6812		input_queue_head_incr(oldsd);
6813	}
6814	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6815		netif_rx_internal(skb);
6816		input_queue_head_incr(oldsd);
6817	}
6818
6819	return NOTIFY_OK;
6820}
6821
6822
6823/**
6824 *	netdev_increment_features - increment feature set by one
6825 *	@all: current feature set
6826 *	@one: new feature set
6827 *	@mask: mask feature set
6828 *
6829 *	Computes a new feature set after adding a device with feature set
6830 *	@one to the master device with current feature set @all.  Will not
6831 *	enable anything that is off in @mask. Returns the new feature set.
6832 */
6833netdev_features_t netdev_increment_features(netdev_features_t all,
6834	netdev_features_t one, netdev_features_t mask)
6835{
6836	if (mask & NETIF_F_GEN_CSUM)
6837		mask |= NETIF_F_ALL_CSUM;
6838	mask |= NETIF_F_VLAN_CHALLENGED;
6839
6840	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6841	all &= one | ~NETIF_F_ALL_FOR_ALL;
6842
6843	/* If one device supports hw checksumming, set for all. */
6844	if (all & NETIF_F_GEN_CSUM)
6845		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6846
6847	return all;
6848}
6849EXPORT_SYMBOL(netdev_increment_features);
6850
6851static struct hlist_head * __net_init netdev_create_hash(void)
6852{
6853	int i;
6854	struct hlist_head *hash;
6855
6856	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6857	if (hash != NULL)
6858		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6859			INIT_HLIST_HEAD(&hash[i]);
6860
6861	return hash;
6862}
6863
6864/* Initialize per network namespace state */
6865static int __net_init netdev_init(struct net *net)
6866{
6867	if (net != &init_net)
6868		INIT_LIST_HEAD(&net->dev_base_head);
6869
6870	net->dev_name_head = netdev_create_hash();
6871	if (net->dev_name_head == NULL)
6872		goto err_name;
6873
6874	net->dev_index_head = netdev_create_hash();
6875	if (net->dev_index_head == NULL)
6876		goto err_idx;
6877
6878	return 0;
6879
6880err_idx:
6881	kfree(net->dev_name_head);
6882err_name:
6883	return -ENOMEM;
6884}
6885
6886/**
6887 *	netdev_drivername - network driver for the device
6888 *	@dev: network device
6889 *
6890 *	Determine network driver for device.
6891 */
6892const char *netdev_drivername(const struct net_device *dev)
6893{
6894	const struct device_driver *driver;
6895	const struct device *parent;
6896	const char *empty = "";
6897
6898	parent = dev->dev.parent;
6899	if (!parent)
6900		return empty;
6901
6902	driver = parent->driver;
6903	if (driver && driver->name)
6904		return driver->name;
6905	return empty;
6906}
6907
6908static int __netdev_printk(const char *level, const struct net_device *dev,
6909			   struct va_format *vaf)
6910{
6911	int r;
6912
6913	if (dev && dev->dev.parent) {
6914		r = dev_printk_emit(level[1] - '0',
6915				    dev->dev.parent,
6916				    "%s %s %s: %pV",
6917				    dev_driver_string(dev->dev.parent),
6918				    dev_name(dev->dev.parent),
6919				    netdev_name(dev), vaf);
6920	} else if (dev) {
6921		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6922	} else {
6923		r = printk("%s(NULL net_device): %pV", level, vaf);
6924	}
6925
6926	return r;
6927}
 
6928
6929int netdev_printk(const char *level, const struct net_device *dev,
6930		  const char *format, ...)
6931{
6932	struct va_format vaf;
6933	va_list args;
6934	int r;
6935
6936	va_start(args, format);
6937
6938	vaf.fmt = format;
6939	vaf.va = &args;
6940
6941	r = __netdev_printk(level, dev, &vaf);
6942
6943	va_end(args);
6944
6945	return r;
6946}
6947EXPORT_SYMBOL(netdev_printk);
6948
6949#define define_netdev_printk_level(func, level)			\
6950int func(const struct net_device *dev, const char *fmt, ...)	\
6951{								\
6952	int r;							\
6953	struct va_format vaf;					\
6954	va_list args;						\
6955								\
6956	va_start(args, fmt);					\
6957								\
6958	vaf.fmt = fmt;						\
6959	vaf.va = &args;						\
6960								\
6961	r = __netdev_printk(level, dev, &vaf);			\
6962								\
6963	va_end(args);						\
6964								\
6965	return r;						\
6966}								\
6967EXPORT_SYMBOL(func);
6968
6969define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6970define_netdev_printk_level(netdev_alert, KERN_ALERT);
6971define_netdev_printk_level(netdev_crit, KERN_CRIT);
6972define_netdev_printk_level(netdev_err, KERN_ERR);
6973define_netdev_printk_level(netdev_warn, KERN_WARNING);
6974define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6975define_netdev_printk_level(netdev_info, KERN_INFO);
6976
6977static void __net_exit netdev_exit(struct net *net)
6978{
6979	kfree(net->dev_name_head);
6980	kfree(net->dev_index_head);
6981}
6982
6983static struct pernet_operations __net_initdata netdev_net_ops = {
6984	.init = netdev_init,
6985	.exit = netdev_exit,
6986};
6987
6988static void __net_exit default_device_exit(struct net *net)
6989{
6990	struct net_device *dev, *aux;
6991	/*
6992	 * Push all migratable network devices back to the
6993	 * initial network namespace
6994	 */
6995	rtnl_lock();
6996	for_each_netdev_safe(net, dev, aux) {
6997		int err;
6998		char fb_name[IFNAMSIZ];
6999
7000		/* Ignore unmoveable devices (i.e. loopback) */
7001		if (dev->features & NETIF_F_NETNS_LOCAL)
7002			continue;
7003
7004		/* Leave virtual devices for the generic cleanup */
7005		if (dev->rtnl_link_ops)
7006			continue;
7007
7008		/* Push remaining network devices to init_net */
7009		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7010		err = dev_change_net_namespace(dev, &init_net, fb_name);
7011		if (err) {
7012			pr_emerg("%s: failed to move %s to init_net: %d\n",
7013				 __func__, dev->name, err);
7014			BUG();
7015		}
7016	}
7017	rtnl_unlock();
7018}
7019
7020static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7021{
7022	/* Return with the rtnl_lock held when there are no network
7023	 * devices unregistering in any network namespace in net_list.
7024	 */
7025	struct net *net;
7026	bool unregistering;
7027	DEFINE_WAIT(wait);
7028
7029	for (;;) {
7030		prepare_to_wait(&netdev_unregistering_wq, &wait,
7031				TASK_UNINTERRUPTIBLE);
7032		unregistering = false;
7033		rtnl_lock();
7034		list_for_each_entry(net, net_list, exit_list) {
7035			if (net->dev_unreg_count > 0) {
7036				unregistering = true;
7037				break;
7038			}
7039		}
7040		if (!unregistering)
7041			break;
7042		__rtnl_unlock();
7043		schedule();
7044	}
7045	finish_wait(&netdev_unregistering_wq, &wait);
7046}
7047
7048static void __net_exit default_device_exit_batch(struct list_head *net_list)
7049{
7050	/* At exit all network devices most be removed from a network
7051	 * namespace.  Do this in the reverse order of registration.
7052	 * Do this across as many network namespaces as possible to
7053	 * improve batching efficiency.
7054	 */
7055	struct net_device *dev;
7056	struct net *net;
7057	LIST_HEAD(dev_kill_list);
7058
7059	/* To prevent network device cleanup code from dereferencing
7060	 * loopback devices or network devices that have been freed
7061	 * wait here for all pending unregistrations to complete,
7062	 * before unregistring the loopback device and allowing the
7063	 * network namespace be freed.
7064	 *
7065	 * The netdev todo list containing all network devices
7066	 * unregistrations that happen in default_device_exit_batch
7067	 * will run in the rtnl_unlock() at the end of
7068	 * default_device_exit_batch.
7069	 */
7070	rtnl_lock_unregistering(net_list);
7071	list_for_each_entry(net, net_list, exit_list) {
7072		for_each_netdev_reverse(net, dev) {
7073			if (dev->rtnl_link_ops)
7074				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7075			else
7076				unregister_netdevice_queue(dev, &dev_kill_list);
7077		}
7078	}
7079	unregister_netdevice_many(&dev_kill_list);
7080	list_del(&dev_kill_list);
7081	rtnl_unlock();
7082}
7083
7084static struct pernet_operations __net_initdata default_device_ops = {
7085	.exit = default_device_exit,
7086	.exit_batch = default_device_exit_batch,
7087};
7088
7089/*
7090 *	Initialize the DEV module. At boot time this walks the device list and
7091 *	unhooks any devices that fail to initialise (normally hardware not
7092 *	present) and leaves us with a valid list of present and active devices.
7093 *
7094 */
7095
7096/*
7097 *       This is called single threaded during boot, so no need
7098 *       to take the rtnl semaphore.
7099 */
7100static int __init net_dev_init(void)
7101{
7102	int i, rc = -ENOMEM;
7103
7104	BUG_ON(!dev_boot_phase);
7105
7106	if (dev_proc_init())
7107		goto out;
7108
7109	if (netdev_kobject_init())
7110		goto out;
7111
7112	INIT_LIST_HEAD(&ptype_all);
7113	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7114		INIT_LIST_HEAD(&ptype_base[i]);
7115
7116	INIT_LIST_HEAD(&offload_base);
7117
7118	if (register_pernet_subsys(&netdev_net_ops))
7119		goto out;
7120
7121	/*
7122	 *	Initialise the packet receive queues.
7123	 */
7124
7125	for_each_possible_cpu(i) {
7126		struct softnet_data *sd = &per_cpu(softnet_data, i);
7127
 
7128		skb_queue_head_init(&sd->input_pkt_queue);
7129		skb_queue_head_init(&sd->process_queue);
 
7130		INIT_LIST_HEAD(&sd->poll_list);
 
7131		sd->output_queue_tailp = &sd->output_queue;
7132#ifdef CONFIG_RPS
7133		sd->csd.func = rps_trigger_softirq;
7134		sd->csd.info = sd;
 
7135		sd->cpu = i;
7136#endif
7137
7138		sd->backlog.poll = process_backlog;
7139		sd->backlog.weight = weight_p;
 
 
7140	}
7141
7142	dev_boot_phase = 0;
7143
7144	/* The loopback device is special if any other network devices
7145	 * is present in a network namespace the loopback device must
7146	 * be present. Since we now dynamically allocate and free the
7147	 * loopback device ensure this invariant is maintained by
7148	 * keeping the loopback device as the first device on the
7149	 * list of network devices.  Ensuring the loopback devices
7150	 * is the first device that appears and the last network device
7151	 * that disappears.
7152	 */
7153	if (register_pernet_device(&loopback_net_ops))
7154		goto out;
7155
7156	if (register_pernet_device(&default_device_ops))
7157		goto out;
7158
7159	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7160	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7161
7162	hotcpu_notifier(dev_cpu_callback, 0);
7163	dst_init();
 
7164	rc = 0;
7165out:
7166	return rc;
7167}
7168
7169subsys_initcall(net_dev_init);
v3.5.6
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/proc_fs.h>
 101#include <linux/seq_file.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/net_tstamp.h>
 136#include <linux/static_key.h>
 137#include <net/flow_keys.h>
 
 
 138
 139#include "net-sysfs.h"
 140
 /* Rather than raising this limit, switch to a hash table. */
#define MAX_GRO_SKBS 8

/* Needs to grow if a protocol with a bigger header is ever added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147/*
 148 *	The list of packet types we will receive (as opposed to discard)
 149 *	and the routines to invoke.
 150 *
 151 *	Why 16. Because with 16 the only overlap we get on a hash of the
 152 *	low nibble of the protocol value is RARP/SNAP/X.25.
 153 *
 154 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155 *             sure which should go first, but I bet it won't make much
 156 *             difference if we are running VLANs.  The good news is that
 157 *             this protocol won't be in the list unless compiled in, so
 158 *             the average user (w/out VLANs) will not be adversely affected.
 159 *             --BLG
 160 *
 161 *		0800	IP
 162 *		8100    802.1Q VLAN
 163 *		0001	802.3
 164 *		0002	AX.25
 165 *		0004	802.2
 166 *		8035	RARP
 167 *		0005	SNAP
 168 *		0805	X.25
 169 *		0806	ARP
 170 *		8137	IPX
 171 *		0009	Localtalk
 172 *		86DD	IPv6
 173 */
 174
 175#define PTYPE_HASH_SIZE	(16)
 176#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 177
 178static DEFINE_SPINLOCK(ptype_lock);
 179static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 180static struct list_head ptype_all __read_mostly;	/* Taps */
 181
 /*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they walk the
 * dev_base_head list, and take dev_base_lock for writing around the
 * actual updates.  Pure readers can therefore access the list even
 * while a writer is preparing to update it.
 *
 * Put differently: dev_base_lock is write-held only to keep pure
 * readers out; the rtnl semaphore is what serializes writers against
 * each other.
 *
 * For example usages see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 203
 
 
 
 
 
 
 
 
 204static inline void dev_base_seq_inc(struct net *net)
 205{
 206	while (++net->dev_base_seq == 0);
 207}
 208
 209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 210{
 211	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 212
 213	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214}
 215
 216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217{
 218	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219}
 220
 /*
 * Take the spinlock of @sd's input_pkt_queue.  Compiles to nothing
 * unless CONFIG_RPS is set.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}
 227
 /*
 * Release the spinlock taken by rps_lock().  Compiles to nothing
 * unless CONFIG_RPS is set.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
 234
 235/* Device list insertion */
 236static int list_netdevice(struct net_device *dev)
 237{
 238	struct net *net = dev_net(dev);
 239
 240	ASSERT_RTNL();
 241
 242	write_lock_bh(&dev_base_lock);
 243	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 244	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 245	hlist_add_head_rcu(&dev->index_hlist,
 246			   dev_index_hash(net, dev->ifindex));
 247	write_unlock_bh(&dev_base_lock);
 248
 249	dev_base_seq_inc(net);
 250
 251	return 0;
 252}
 253
 254/* Device list removal
 255 * caller must respect a RCU grace period before freeing/reusing dev
 256 */
 257static void unlist_netdevice(struct net_device *dev)
 258{
 259	ASSERT_RTNL();
 260
 261	/* Unlink dev from the device chain */
 262	write_lock_bh(&dev_base_lock);
 263	list_del_rcu(&dev->dev_list);
 264	hlist_del_rcu(&dev->name_hlist);
 265	hlist_del_rcu(&dev->index_hlist);
 266	write_unlock_bh(&dev_base_lock);
 267
 268	dev_base_seq_inc(dev_net(dev));
 269}
 270
 /*
 *	Notifier chain for network device events.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 276
 277/*
 278 *	Device drivers call our routines to queue packets here. We empty the
 279 *	queue in the local softnet handler.
 280 */
 281
 282DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 283EXPORT_PER_CPU_SYMBOL(softnet_data);
 284
 285#ifdef CONFIG_LOCKDEP
 286/*
 287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 288 * according to dev->type
 289 */
 290static const unsigned short netdev_lock_type[] =
 291	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 292	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 293	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 294	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 295	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 296	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 297	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 298	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 299	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 300	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 301	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 302	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 303	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 304	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 305	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 306
 307static const char *const netdev_lock_name[] =
 308	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 309	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 310	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 311	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 312	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 313	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 314	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 315	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 316	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 317	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 318	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 319	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 320	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 321	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 322	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 323
 324static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 325static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 326
 327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 328{
 329	int i;
 330
 331	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 332		if (netdev_lock_type[i] == dev_type)
 333			return i;
 334	/* the last key is used by default */
 335	return ARRAY_SIZE(netdev_lock_type) - 1;
 336}
 337
 338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 339						 unsigned short dev_type)
 340{
 341	int i;
 342
 343	i = netdev_lock_pos(dev_type);
 344	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 345				   netdev_lock_name[i]);
 346}
 347
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350	int i;
 351
 352	i = netdev_lock_pos(dev->type);
 353	lockdep_set_class_and_name(&dev->addr_list_lock,
 354				   &netdev_addr_lock_key[i],
 355				   netdev_lock_name[i]);
 356}
 357#else
 358static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 359						 unsigned short dev_type)
 360{
 361}
 362static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 363{
 364}
 365#endif
 366
 367/*******************************************************************************
 368
 369		Protocol management and registration routines
 370
 371*******************************************************************************/
 372
 373/*
 374 *	Add a protocol ID to the list. Now that the input handler is
 375 *	smarter we can dispense with all the messy stuff that used to be
 376 *	here.
 377 *
 378 *	BEWARE!!! Protocol handlers, mangling input packets,
 379 *	MUST BE last in hash buckets and checking protocol handlers
 380 *	MUST start from promiscuous ptype_all chain in net_bh.
 381 *	It is true now, do not change it.
 382 *	Explanation follows: if protocol handler, mangling packet, will
 383 *	be the first on list, it is not able to sense, that packet
 384 *	is cloned and should be copied-on-write, so that it will
 385 *	change it and subsequent readers will get broken packet.
 386 *							--ANK (980803)
 387 */
 388
 389static inline struct list_head *ptype_head(const struct packet_type *pt)
 390{
 391	if (pt->type == htons(ETH_P_ALL))
 392		return &ptype_all;
 393	else
 394		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 395}
 396
 397/**
 398 *	dev_add_pack - add packet handler
 399 *	@pt: packet type declaration
 400 *
 401 *	Add a protocol handler to the networking stack. The passed &packet_type
 402 *	is linked into kernel lists and may not be freed until it has been
 403 *	removed from the kernel lists.
 404 *
 405 *	This call does not sleep therefore it can not
 406 *	guarantee all CPU's that are in middle of receiving packets
 407 *	will see the new packet type (until the next received packet).
 408 */
 409
 410void dev_add_pack(struct packet_type *pt)
 411{
 412	struct list_head *head = ptype_head(pt);
 413
 414	spin_lock(&ptype_lock);
 415	list_add_rcu(&pt->list, head);
 416	spin_unlock(&ptype_lock);
 417}
 418EXPORT_SYMBOL(dev_add_pack);
 419
 420/**
 421 *	__dev_remove_pack	 - remove packet handler
 422 *	@pt: packet type declaration
 423 *
 424 *	Remove a protocol handler that was previously added to the kernel
 425 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 426 *	from the kernel lists and can be freed or reused once this function
 427 *	returns.
 428 *
 429 *      The packet type might still be in use by receivers
 430 *	and must not be freed until after all the CPU's have gone
 431 *	through a quiescent state.
 432 */
 433void __dev_remove_pack(struct packet_type *pt)
 434{
 435	struct list_head *head = ptype_head(pt);
 436	struct packet_type *pt1;
 437
 438	spin_lock(&ptype_lock);
 439
 440	list_for_each_entry(pt1, head, list) {
 441		if (pt == pt1) {
 442			list_del_rcu(&pt->list);
 443			goto out;
 444		}
 445	}
 446
 447	pr_warn("dev_remove_pack: %p not found\n", pt);
 448out:
 449	spin_unlock(&ptype_lock);
 450}
 451EXPORT_SYMBOL(__dev_remove_pack);
 452
 453/**
 454 *	dev_remove_pack	 - remove packet handler
 455 *	@pt: packet type declaration
 456 *
 457 *	Remove a protocol handler that was previously added to the kernel
 458 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 459 *	from the kernel lists and can be freed or reused once this function
 460 *	returns.
 461 *
 462 *	This call sleeps to guarantee that no CPU is looking at the packet
 463 *	type after return.
 464 */
 465void dev_remove_pack(struct packet_type *pt)
 466{
 467	__dev_remove_pack(pt);
 468
 469	synchronize_net();
 470}
 471EXPORT_SYMBOL(dev_remove_pack);
 472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 473/******************************************************************************
 474
 475		      Device Boot-time Settings Routines
 476
 477*******************************************************************************/
 478
 479/* Boot time configuration table */
 480static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 481
 482/**
 483 *	netdev_boot_setup_add	- add new setup entry
 484 *	@name: name of the device
 485 *	@map: configured settings for the device
 486 *
 487 *	Adds new setup entry to the dev_boot_setup list.  The function
 488 *	returns 0 on error and 1 on success.  This is a generic routine to
 489 *	all netdevices.
 490 */
 491static int netdev_boot_setup_add(char *name, struct ifmap *map)
 492{
 493	struct netdev_boot_setup *s;
 494	int i;
 495
 496	s = dev_boot_setup;
 497	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 498		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 499			memset(s[i].name, 0, sizeof(s[i].name));
 500			strlcpy(s[i].name, name, IFNAMSIZ);
 501			memcpy(&s[i].map, map, sizeof(s[i].map));
 502			break;
 503		}
 504	}
 505
 506	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 507}
 508
 509/**
 510 *	netdev_boot_setup_check	- check boot time settings
 511 *	@dev: the netdevice
 512 *
 513 * 	Check boot time settings for the device.
 514 *	The found settings are set for the device to be used
 515 *	later in the device probing.
 516 *	Returns 0 if no settings found, 1 if they are.
 517 */
 518int netdev_boot_setup_check(struct net_device *dev)
 519{
 520	struct netdev_boot_setup *s = dev_boot_setup;
 521	int i;
 522
 523	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 524		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 525		    !strcmp(dev->name, s[i].name)) {
 526			dev->irq 	= s[i].map.irq;
 527			dev->base_addr 	= s[i].map.base_addr;
 528			dev->mem_start 	= s[i].map.mem_start;
 529			dev->mem_end 	= s[i].map.mem_end;
 530			return 1;
 531		}
 532	}
 533	return 0;
 534}
 535EXPORT_SYMBOL(netdev_boot_setup_check);
 536
 537
 538/**
 539 *	netdev_boot_base	- get address from boot time settings
 540 *	@prefix: prefix for network device
 541 *	@unit: id for network device
 542 *
 543 * 	Check boot time settings for the base address of device.
 544 *	The found settings are set for the device to be used
 545 *	later in the device probing.
 546 *	Returns 0 if no settings found.
 547 */
 548unsigned long netdev_boot_base(const char *prefix, int unit)
 549{
 550	const struct netdev_boot_setup *s = dev_boot_setup;
 551	char name[IFNAMSIZ];
 552	int i;
 553
 554	sprintf(name, "%s%d", prefix, unit);
 555
 556	/*
 557	 * If device already registered then return base of 1
 558	 * to indicate not to probe for this interface
 559	 */
 560	if (__dev_get_by_name(&init_net, name))
 561		return 1;
 562
 563	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 564		if (!strcmp(name, s[i].name))
 565			return s[i].map.base_addr;
 566	return 0;
 567}
 568
 569/*
 570 * Saves at boot time configured settings for any netdevice.
 571 */
 572int __init netdev_boot_setup(char *str)
 573{
 574	int ints[5];
 575	struct ifmap map;
 576
 577	str = get_options(str, ARRAY_SIZE(ints), ints);
 578	if (!str || !*str)
 579		return 0;
 580
 581	/* Save settings */
 582	memset(&map, 0, sizeof(map));
 583	if (ints[0] > 0)
 584		map.irq = ints[1];
 585	if (ints[0] > 1)
 586		map.base_addr = ints[2];
 587	if (ints[0] > 2)
 588		map.mem_start = ints[3];
 589	if (ints[0] > 3)
 590		map.mem_end = ints[4];
 591
 592	/* Add new entry to the list */
 593	return netdev_boot_setup_add(str, &map);
 594}
 595
 596__setup("netdev=", netdev_boot_setup);
 597
 598/*******************************************************************************
 599
 600			    Device Interface Subroutines
 601
 602*******************************************************************************/
 603
 604/**
 605 *	__dev_get_by_name	- find a device by its name
 606 *	@net: the applicable net namespace
 607 *	@name: name to find
 608 *
 609 *	Find an interface by name. Must be called under RTNL semaphore
 610 *	or @dev_base_lock. If the name is found a pointer to the device
 611 *	is returned. If the name is not found then %NULL is returned. The
 612 *	reference counters are not incremented so the caller must be
 613 *	careful with locks.
 614 */
 615
 616struct net_device *__dev_get_by_name(struct net *net, const char *name)
 617{
 618	struct hlist_node *p;
 619	struct net_device *dev;
 620	struct hlist_head *head = dev_name_hash(net, name);
 621
 622	hlist_for_each_entry(dev, p, head, name_hlist)
 623		if (!strncmp(dev->name, name, IFNAMSIZ))
 624			return dev;
 625
 626	return NULL;
 627}
 628EXPORT_SYMBOL(__dev_get_by_name);
 629
 630/**
 631 *	dev_get_by_name_rcu	- find a device by its name
 632 *	@net: the applicable net namespace
 633 *	@name: name to find
 634 *
 635 *	Find an interface by name.
 636 *	If the name is found a pointer to the device is returned.
 637 * 	If the name is not found then %NULL is returned.
 638 *	The reference counters are not incremented so the caller must be
 639 *	careful with locks. The caller must hold RCU lock.
 640 */
 641
 642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 643{
 644	struct hlist_node *p;
 645	struct net_device *dev;
 646	struct hlist_head *head = dev_name_hash(net, name);
 647
 648	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 649		if (!strncmp(dev->name, name, IFNAMSIZ))
 650			return dev;
 651
 652	return NULL;
 653}
 654EXPORT_SYMBOL(dev_get_by_name_rcu);
 655
 656/**
 657 *	dev_get_by_name		- find a device by its name
 658 *	@net: the applicable net namespace
 659 *	@name: name to find
 660 *
 661 *	Find an interface by name. This can be called from any
 662 *	context and does its own locking. The returned handle has
 663 *	the usage count incremented and the caller must use dev_put() to
 664 *	release it when it is no longer needed. %NULL is returned if no
 665 *	matching device is found.
 666 */
 667
 668struct net_device *dev_get_by_name(struct net *net, const char *name)
 669{
 670	struct net_device *dev;
 671
 672	rcu_read_lock();
 673	dev = dev_get_by_name_rcu(net, name);
 674	if (dev)
 675		dev_hold(dev);
 676	rcu_read_unlock();
 677	return dev;
 678}
 679EXPORT_SYMBOL(dev_get_by_name);
 680
 681/**
 682 *	__dev_get_by_index - find a device by its ifindex
 683 *	@net: the applicable net namespace
 684 *	@ifindex: index of device
 685 *
 686 *	Search for an interface by index. Returns %NULL if the device
 687 *	is not found or a pointer to the device. The device has not
 688 *	had its reference counter increased so the caller must be careful
 689 *	about locking. The caller must hold either the RTNL semaphore
 690 *	or @dev_base_lock.
 691 */
 692
 693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 694{
 695	struct hlist_node *p;
 696	struct net_device *dev;
 697	struct hlist_head *head = dev_index_hash(net, ifindex);
 698
 699	hlist_for_each_entry(dev, p, head, index_hlist)
 700		if (dev->ifindex == ifindex)
 701			return dev;
 702
 703	return NULL;
 704}
 705EXPORT_SYMBOL(__dev_get_by_index);
 706
 707/**
 708 *	dev_get_by_index_rcu - find a device by its ifindex
 709 *	@net: the applicable net namespace
 710 *	@ifindex: index of device
 711 *
 712 *	Search for an interface by index. Returns %NULL if the device
 713 *	is not found or a pointer to the device. The device has not
 714 *	had its reference counter increased so the caller must be careful
 715 *	about locking. The caller must hold RCU lock.
 716 */
 717
 718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 719{
 720	struct hlist_node *p;
 721	struct net_device *dev;
 722	struct hlist_head *head = dev_index_hash(net, ifindex);
 723
 724	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 725		if (dev->ifindex == ifindex)
 726			return dev;
 727
 728	return NULL;
 729}
 730EXPORT_SYMBOL(dev_get_by_index_rcu);
 731
 732
 733/**
 734 *	dev_get_by_index - find a device by its ifindex
 735 *	@net: the applicable net namespace
 736 *	@ifindex: index of device
 737 *
 738 *	Search for an interface by index. Returns NULL if the device
 739 *	is not found or a pointer to the device. The device returned has
 740 *	had a reference added and the pointer is safe until the user calls
 741 *	dev_put to indicate they have finished with it.
 742 */
 743
 744struct net_device *dev_get_by_index(struct net *net, int ifindex)
 745{
 746	struct net_device *dev;
 747
 748	rcu_read_lock();
 749	dev = dev_get_by_index_rcu(net, ifindex);
 750	if (dev)
 751		dev_hold(dev);
 752	rcu_read_unlock();
 753	return dev;
 754}
 755EXPORT_SYMBOL(dev_get_by_index);
 756
 757/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 758 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 759 *	@net: the applicable net namespace
 760 *	@type: media type of device
 761 *	@ha: hardware address
 762 *
 763 *	Search for an interface by MAC address. Returns NULL if the device
 764 *	is not found or a pointer to the device.
 765 *	The caller must hold RCU or RTNL.
 766 *	The returned device has not had its ref count increased
 767 *	and the caller must therefore be careful about locking
 768 *
 769 */
 770
 771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 772				       const char *ha)
 773{
 774	struct net_device *dev;
 775
 776	for_each_netdev_rcu(net, dev)
 777		if (dev->type == type &&
 778		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 779			return dev;
 780
 781	return NULL;
 782}
 783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 784
 785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786{
 787	struct net_device *dev;
 788
 789	ASSERT_RTNL();
 790	for_each_netdev(net, dev)
 791		if (dev->type == type)
 792			return dev;
 793
 794	return NULL;
 795}
 796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 797
 798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 799{
 800	struct net_device *dev, *ret = NULL;
 801
 802	rcu_read_lock();
 803	for_each_netdev_rcu(net, dev)
 804		if (dev->type == type) {
 805			dev_hold(dev);
 806			ret = dev;
 807			break;
 808		}
 809	rcu_read_unlock();
 810	return ret;
 811}
 812EXPORT_SYMBOL(dev_getfirstbyhwtype);
 813
 814/**
 815 *	dev_get_by_flags_rcu - find any device with given flags
 816 *	@net: the applicable net namespace
 817 *	@if_flags: IFF_* values
 818 *	@mask: bitmask of bits in if_flags to check
 819 *
 820 *	Search for any interface with the given flags. Returns NULL if a device
 821 *	is not found or a pointer to the device. Must be called inside
 822 *	rcu_read_lock(), and result refcount is unchanged.
 823 */
 824
 825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 826				    unsigned short mask)
 827{
 828	struct net_device *dev, *ret;
 829
 830	ret = NULL;
 831	for_each_netdev_rcu(net, dev) {
 832		if (((dev->flags ^ if_flags) & mask) == 0) {
 833			ret = dev;
 834			break;
 835		}
 836	}
 837	return ret;
 838}
 839EXPORT_SYMBOL(dev_get_by_flags_rcu);
 840
 841/**
 842 *	dev_valid_name - check if name is okay for network device
 843 *	@name: name string
 844 *
 845 *	Network device names need to be valid file names to
 846 *	to allow sysfs to work.  We also disallow any kind of
 847 *	whitespace.
 848 */
 849bool dev_valid_name(const char *name)
 850{
 851	if (*name == '\0')
 852		return false;
 853	if (strlen(name) >= IFNAMSIZ)
 854		return false;
 855	if (!strcmp(name, ".") || !strcmp(name, ".."))
 856		return false;
 857
 858	while (*name) {
 859		if (*name == '/' || isspace(*name))
 860			return false;
 861		name++;
 862	}
 863	return true;
 864}
 865EXPORT_SYMBOL(dev_valid_name);
 866
 867/**
 868 *	__dev_alloc_name - allocate a name for a device
 869 *	@net: network namespace to allocate the device name in
 870 *	@name: name format string
 871 *	@buf:  scratch buffer and result name string
 872 *
 873 *	Passed a format string - eg "lt%d" it will try and find a suitable
 874 *	id. It scans list of devices to build up a free map, then chooses
 875 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 876 *	while allocating the name and adding the device in order to avoid
 877 *	duplicates.
 878 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 879 *	Returns the number of the unit assigned or a negative errno code.
 880 */
 881
 882static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 883{
 884	int i = 0;
 885	const char *p;
 886	const int max_netdevices = 8*PAGE_SIZE;
 887	unsigned long *inuse;
 888	struct net_device *d;
 889
 890	p = strnchr(name, IFNAMSIZ-1, '%');
 891	if (p) {
 892		/*
 893		 * Verify the string as this thing may have come from
 894		 * the user.  There must be either one "%d" and no other "%"
 895		 * characters.
 896		 */
 897		if (p[1] != 'd' || strchr(p + 2, '%'))
 898			return -EINVAL;
 899
 900		/* Use one page as a bit array of possible slots */
 901		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 902		if (!inuse)
 903			return -ENOMEM;
 904
 905		for_each_netdev(net, d) {
 906			if (!sscanf(d->name, name, &i))
 907				continue;
 908			if (i < 0 || i >= max_netdevices)
 909				continue;
 910
 911			/*  avoid cases where sscanf is not exact inverse of printf */
 912			snprintf(buf, IFNAMSIZ, name, i);
 913			if (!strncmp(buf, d->name, IFNAMSIZ))
 914				set_bit(i, inuse);
 915		}
 916
 917		i = find_first_zero_bit(inuse, max_netdevices);
 918		free_page((unsigned long) inuse);
 919	}
 920
 921	if (buf != name)
 922		snprintf(buf, IFNAMSIZ, name, i);
 923	if (!__dev_get_by_name(net, buf))
 924		return i;
 925
 926	/* It is possible to run out of possible slots
 927	 * when the name is long and there isn't enough space left
 928	 * for the digits, or if all bits are used.
 929	 */
 930	return -ENFILE;
 931}
 932
 933/**
 934 *	dev_alloc_name - allocate a name for a device
 935 *	@dev: device
 936 *	@name: name format string
 937 *
 938 *	Passed a format string - eg "lt%d" it will try and find a suitable
 939 *	id. It scans list of devices to build up a free map, then chooses
 940 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 941 *	while allocating the name and adding the device in order to avoid
 942 *	duplicates.
 943 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 944 *	Returns the number of the unit assigned or a negative errno code.
 945 */
 946
 947int dev_alloc_name(struct net_device *dev, const char *name)
 948{
 949	char buf[IFNAMSIZ];
 950	struct net *net;
 951	int ret;
 952
 953	BUG_ON(!dev_net(dev));
 954	net = dev_net(dev);
 955	ret = __dev_alloc_name(net, name, buf);
 956	if (ret >= 0)
 957		strlcpy(dev->name, buf, IFNAMSIZ);
 958	return ret;
 959}
 960EXPORT_SYMBOL(dev_alloc_name);
 961
 962static int dev_get_valid_name(struct net_device *dev, const char *name)
 
 
 963{
 964	struct net *net;
 
 
 
 
 
 
 
 965
 966	BUG_ON(!dev_net(dev));
 967	net = dev_net(dev);
 
 
 
 968
 969	if (!dev_valid_name(name))
 970		return -EINVAL;
 971
 972	if (strchr(name, '%'))
 973		return dev_alloc_name(dev, name);
 974	else if (__dev_get_by_name(net, name))
 975		return -EEXIST;
 976	else if (dev->name != name)
 977		strlcpy(dev->name, name, IFNAMSIZ);
 978
 979	return 0;
 980}
 981
 982/**
 983 *	dev_change_name - change name of a device
 984 *	@dev: device
 985 *	@newname: name (or format string) must be at least IFNAMSIZ
 986 *
 987 *	Change name of a device, can pass format strings "eth%d".
 988 *	for wildcarding.
 989 */
 990int dev_change_name(struct net_device *dev, const char *newname)
 991{
 992	char oldname[IFNAMSIZ];
 993	int err = 0;
 994	int ret;
 995	struct net *net;
 996
 997	ASSERT_RTNL();
 998	BUG_ON(!dev_net(dev));
 999
1000	net = dev_net(dev);
1001	if (dev->flags & IFF_UP)
1002		return -EBUSY;
1003
1004	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 
 
 
1005		return 0;
 
1006
1007	memcpy(oldname, dev->name, IFNAMSIZ);
1008
1009	err = dev_get_valid_name(dev, newname);
1010	if (err < 0)
 
1011		return err;
 
1012
1013rollback:
1014	ret = device_rename(&dev->dev, dev->name);
1015	if (ret) {
1016		memcpy(dev->name, oldname, IFNAMSIZ);
 
1017		return ret;
1018	}
1019
 
 
 
 
1020	write_lock_bh(&dev_base_lock);
1021	hlist_del_rcu(&dev->name_hlist);
1022	write_unlock_bh(&dev_base_lock);
1023
1024	synchronize_rcu();
1025
1026	write_lock_bh(&dev_base_lock);
1027	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028	write_unlock_bh(&dev_base_lock);
1029
1030	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031	ret = notifier_to_errno(ret);
1032
1033	if (ret) {
1034		/* err >= 0 after dev_alloc_name() or stores the first errno */
1035		if (err >= 0) {
1036			err = ret;
 
1037			memcpy(dev->name, oldname, IFNAMSIZ);
 
1038			goto rollback;
1039		} else {
1040			pr_err("%s: name change rollback failed: %d\n",
1041			       dev->name, ret);
1042		}
1043	}
1044
1045	return err;
1046}
1047
1048/**
1049 *	dev_set_alias - change ifalias of a device
1050 *	@dev: device
1051 *	@alias: name up to IFALIASZ
1052 *	@len: limit of bytes to copy from info
1053 *
1054 *	Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058	char *new_ifalias;
1059
1060	ASSERT_RTNL();
1061
1062	if (len >= IFALIASZ)
1063		return -EINVAL;
1064
1065	if (!len) {
1066		if (dev->ifalias) {
1067			kfree(dev->ifalias);
1068			dev->ifalias = NULL;
1069		}
1070		return 0;
1071	}
1072
1073	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074	if (!new_ifalias)
1075		return -ENOMEM;
1076	dev->ifalias = new_ifalias;
1077
1078	strlcpy(dev->ifalias, alias, len+1);
1079	return len;
1080}
1081
1082
1083/**
1084 *	netdev_features_change - device changes features
1085 *	@dev: device to cause notification
1086 *
1087 *	Called to indicate a device has changed features.
1088 */
1089void netdev_features_change(struct net_device *dev)
1090{
1091	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092}
1093EXPORT_SYMBOL(netdev_features_change);
1094
1095/**
1096 *	netdev_state_change - device changes state
1097 *	@dev: device to cause notification
1098 *
1099 *	Called to indicate a device has changed state. This function calls
1100 *	the notifier chains for netdev_chain and sends a NEWLINK message
1101 *	to the routing socket.
1102 */
1103void netdev_state_change(struct net_device *dev)
1104{
1105	if (dev->flags & IFF_UP) {
1106		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1108	}
1109}
1110EXPORT_SYMBOL(netdev_state_change);
1111
/*
 * Pass @event for @dev to the netdev notifier chain and return the
 * chain's result unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1117
1118/**
1119 *	dev_load 	- load a network module
1120 *	@net: the applicable net namespace
1121 *	@name: name of interface
1122 *
1123 *	If a network interface is not present and the process has suitable
1124 *	privileges this function loads the module. If module loading is not
1125 *	available in this kernel then it becomes a nop.
 
 
1126 */
1127
1128void dev_load(struct net *net, const char *name)
1129{
1130	struct net_device *dev;
1131	int no_module;
1132
1133	rcu_read_lock();
1134	dev = dev_get_by_name_rcu(net, name);
1135	rcu_read_unlock();
1136
1137	no_module = !dev;
1138	if (no_module && capable(CAP_NET_ADMIN))
1139		no_module = request_module("netdev-%s", name);
1140	if (no_module && capable(CAP_SYS_MODULE)) {
1141		if (!request_module("%s", name))
1142			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1143				name);
1144	}
1145}
1146EXPORT_SYMBOL(dev_load);
1147
1148static int __dev_open(struct net_device *dev)
1149{
1150	const struct net_device_ops *ops = dev->netdev_ops;
1151	int ret;
1152
1153	ASSERT_RTNL();
1154
1155	if (!netif_device_present(dev))
1156		return -ENODEV;
1157
 
 
 
 
 
 
1158	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1159	ret = notifier_to_errno(ret);
1160	if (ret)
1161		return ret;
1162
1163	set_bit(__LINK_STATE_START, &dev->state);
1164
1165	if (ops->ndo_validate_addr)
1166		ret = ops->ndo_validate_addr(dev);
1167
1168	if (!ret && ops->ndo_open)
1169		ret = ops->ndo_open(dev);
1170
 
 
1171	if (ret)
1172		clear_bit(__LINK_STATE_START, &dev->state);
1173	else {
1174		dev->flags |= IFF_UP;
1175		net_dmaengine_get();
1176		dev_set_rx_mode(dev);
1177		dev_activate(dev);
1178		add_device_randomness(dev->dev_addr, dev->addr_len);
1179	}
1180
1181	return ret;
1182}
1183
1184/**
1185 *	dev_open	- prepare an interface for use.
1186 *	@dev:	device to open
1187 *
1188 *	Takes a device from down to up state. The device's private open
1189 *	function is invoked and then the multicast lists are loaded. Finally
1190 *	the device is moved into the up state and a %NETDEV_UP message is
1191 *	sent to the netdev notifier chain.
1192 *
1193 *	Calling this function on an active interface is a nop. On a failure
1194 *	a negative errno code is returned.
1195 */
1196int dev_open(struct net_device *dev)
1197{
1198	int ret;
1199
1200	if (dev->flags & IFF_UP)
1201		return 0;
1202
1203	ret = __dev_open(dev);
1204	if (ret < 0)
1205		return ret;
1206
1207	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208	call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210	return ret;
1211}
1212EXPORT_SYMBOL(dev_open);
1213
1214static int __dev_close_many(struct list_head *head)
1215{
1216	struct net_device *dev;
1217
1218	ASSERT_RTNL();
1219	might_sleep();
1220
1221	list_for_each_entry(dev, head, unreg_list) {
 
 
 
1222		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1223
1224		clear_bit(__LINK_STATE_START, &dev->state);
1225
1226		/* Synchronize to scheduled poll. We cannot touch poll list, it
1227		 * can be even on different cpu. So just clear netif_running().
1228		 *
1229		 * dev->stop() will invoke napi_disable() on all of it's
1230		 * napi_struct instances on this device.
1231		 */
1232		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1233	}
1234
1235	dev_deactivate_many(head);
1236
1237	list_for_each_entry(dev, head, unreg_list) {
1238		const struct net_device_ops *ops = dev->netdev_ops;
1239
1240		/*
1241		 *	Call the device specific close. This cannot fail.
1242		 *	Only if device is UP
1243		 *
1244		 *	We allow it to be called even after a DETACH hot-plug
1245		 *	event.
1246		 */
1247		if (ops->ndo_stop)
1248			ops->ndo_stop(dev);
1249
1250		dev->flags &= ~IFF_UP;
1251		net_dmaengine_put();
 
1252	}
1253
1254	return 0;
1255}
1256
1257static int __dev_close(struct net_device *dev)
1258{
1259	int retval;
1260	LIST_HEAD(single);
1261
1262	list_add(&dev->unreg_list, &single);
1263	retval = __dev_close_many(&single);
1264	list_del(&single);
 
1265	return retval;
1266}
1267
1268static int dev_close_many(struct list_head *head)
1269{
1270	struct net_device *dev, *tmp;
1271	LIST_HEAD(tmp_list);
1272
1273	list_for_each_entry_safe(dev, tmp, head, unreg_list)
 
1274		if (!(dev->flags & IFF_UP))
1275			list_move(&dev->unreg_list, &tmp_list);
1276
1277	__dev_close_many(head);
1278
1279	list_for_each_entry(dev, head, unreg_list) {
1280		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281		call_netdevice_notifiers(NETDEV_DOWN, dev);
 
1282	}
1283
1284	/* rollback_registered_many needs the complete original list */
1285	list_splice(&tmp_list, head);
1286	return 0;
1287}
1288
1289/**
1290 *	dev_close - shutdown an interface.
1291 *	@dev: device to shutdown
1292 *
1293 *	This function moves an active device into down state. A
1294 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296 *	chain.
1297 */
1298int dev_close(struct net_device *dev)
1299{
1300	if (dev->flags & IFF_UP) {
1301		LIST_HEAD(single);
1302
1303		list_add(&dev->unreg_list, &single);
1304		dev_close_many(&single);
1305		list_del(&single);
1306	}
1307	return 0;
1308}
1309EXPORT_SYMBOL(dev_close);
1310
1311
1312/**
1313 *	dev_disable_lro - disable Large Receive Offload on a device
1314 *	@dev: device
1315 *
1316 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1317 *	called under RTNL.  This is needed if received packets may be
1318 *	forwarded to another interface.
1319 */
1320void dev_disable_lro(struct net_device *dev)
1321{
1322	/*
1323	 * If we're trying to disable lro on a vlan device
1324	 * use the underlying physical device instead
1325	 */
1326	if (is_vlan_dev(dev))
1327		dev = vlan_dev_real_dev(dev);
1328
 
 
 
 
1329	dev->wanted_features &= ~NETIF_F_LRO;
1330	netdev_update_features(dev);
1331
1332	if (unlikely(dev->features & NETIF_F_LRO))
1333		netdev_WARN(dev, "failed to disable LRO!\n");
1334}
1335EXPORT_SYMBOL(dev_disable_lro);
1336
 
 
 
 
 
 
 
 
1337
/*
 * Starts out as 1.  While it is non-zero, register_netdevice_notifier()
 * does not replay device events to a newly registered notifier.
 */
static int dev_boot_phase = 1;
1339
1340/**
1341 *	register_netdevice_notifier - register a network notifier block
1342 *	@nb: notifier
1343 *
1344 *	Register a notifier to be called when network device events occur.
1345 *	The notifier passed is linked into the kernel structures and must
1346 *	not be reused until it has been unregistered. A negative errno code
1347 *	is returned on a failure.
1348 *
1349 * 	When registered all registration and up events are replayed
1350 *	to the new notifier to allow device to have a race free
1351 *	view of the network device list.
1352 */
1353
1354int register_netdevice_notifier(struct notifier_block *nb)
1355{
1356	struct net_device *dev;
1357	struct net_device *last;
1358	struct net *net;
1359	int err;
1360
1361	rtnl_lock();
1362	err = raw_notifier_chain_register(&netdev_chain, nb);
1363	if (err)
1364		goto unlock;
1365	if (dev_boot_phase)
1366		goto unlock;
1367	for_each_net(net) {
1368		for_each_netdev(net, dev) {
1369			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370			err = notifier_to_errno(err);
1371			if (err)
1372				goto rollback;
1373
1374			if (!(dev->flags & IFF_UP))
1375				continue;
1376
1377			nb->notifier_call(nb, NETDEV_UP, dev);
1378		}
1379	}
1380
1381unlock:
1382	rtnl_unlock();
1383	return err;
1384
1385rollback:
1386	last = dev;
1387	for_each_net(net) {
1388		for_each_netdev(net, dev) {
1389			if (dev == last)
1390				goto outroll;
1391
1392			if (dev->flags & IFF_UP) {
1393				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394				nb->notifier_call(nb, NETDEV_DOWN, dev);
 
1395			}
1396			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1397			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1398		}
1399	}
1400
1401outroll:
1402	raw_notifier_chain_unregister(&netdev_chain, nb);
1403	goto unlock;
1404}
1405EXPORT_SYMBOL(register_netdevice_notifier);
1406
1407/**
1408 *	unregister_netdevice_notifier - unregister a network notifier block
1409 *	@nb: notifier
1410 *
1411 *	Unregister a notifier previously registered by
1412 *	register_netdevice_notifier(). The notifier is unlinked into the
1413 *	kernel structures and may then be reused. A negative errno code
1414 *	is returned on a failure.
1415 *
1416 * 	After unregistering unregister and down device events are synthesized
1417 *	for all devices on the device list to the removed notifier to remove
1418 *	the need for special case cleanup code.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423	struct net_device *dev;
1424	struct net *net;
1425	int err;
1426
1427	rtnl_lock();
1428	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1429	if (err)
1430		goto unlock;
1431
1432	for_each_net(net) {
1433		for_each_netdev(net, dev) {
1434			if (dev->flags & IFF_UP) {
1435				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1436				nb->notifier_call(nb, NETDEV_DOWN, dev);
 
1437			}
1438			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1439			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1440		}
1441	}
1442unlock:
1443	rtnl_unlock();
1444	return err;
1445}
1446EXPORT_SYMBOL(unregister_netdevice_notifier);
1447
1448/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1449 *	call_netdevice_notifiers - call all network notifier blocks
1450 *      @val: value passed unmodified to notifier function
1451 *      @dev: net_device pointer passed unmodified to notifier function
1452 *
1453 *	Call all network notifier blocks.  Parameters and return value
1454 *	are as for raw_notifier_call_chain().
1455 */
1456
1457int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1458{
1459	ASSERT_RTNL();
1460	return raw_notifier_call_chain(&netdev_chain, val, dev);
 
1461}
1462EXPORT_SYMBOL(call_netdevice_notifiers);
1463
1464static struct static_key netstamp_needed __read_mostly;
1465#ifdef HAVE_JUMP_LABEL
1466/* We are not allowed to call static_key_slow_dec() from irq context
1467 * If net_disable_timestamp() is called from irq context, defer the
1468 * static_key_slow_dec() calls.
1469 */
1470static atomic_t netstamp_needed_deferred;
1471#endif
1472
1473void net_enable_timestamp(void)
1474{
1475#ifdef HAVE_JUMP_LABEL
1476	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1477
1478	if (deferred) {
1479		while (--deferred)
1480			static_key_slow_dec(&netstamp_needed);
1481		return;
1482	}
1483#endif
1484	WARN_ON(in_interrupt());
1485	static_key_slow_inc(&netstamp_needed);
1486}
1487EXPORT_SYMBOL(net_enable_timestamp);
1488
1489void net_disable_timestamp(void)
1490{
1491#ifdef HAVE_JUMP_LABEL
1492	if (in_interrupt()) {
1493		atomic_inc(&netstamp_needed_deferred);
1494		return;
1495	}
1496#endif
1497	static_key_slow_dec(&netstamp_needed);
1498}
1499EXPORT_SYMBOL(net_disable_timestamp);
1500
1501static inline void net_timestamp_set(struct sk_buff *skb)
1502{
1503	skb->tstamp.tv64 = 0;
1504	if (static_key_false(&netstamp_needed))
1505		__net_timestamp(skb);
1506}
1507
/*
 * Stamp SKB if timestamping is enabled, COND holds and SKB carries no
 * timestamp yet.  Wrapped in do { } while (0) so the macro behaves as a
 * single statement (safe before an "else", no dangling continuation).
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1513
1514static int net_hwtstamp_validate(struct ifreq *ifr)
1515{
1516	struct hwtstamp_config cfg;
1517	enum hwtstamp_tx_types tx_type;
1518	enum hwtstamp_rx_filters rx_filter;
1519	int tx_type_valid = 0;
1520	int rx_filter_valid = 0;
1521
1522	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1523		return -EFAULT;
1524
1525	if (cfg.flags) /* reserved for future extensions */
1526		return -EINVAL;
1527
1528	tx_type = cfg.tx_type;
1529	rx_filter = cfg.rx_filter;
1530
1531	switch (tx_type) {
1532	case HWTSTAMP_TX_OFF:
1533	case HWTSTAMP_TX_ON:
1534	case HWTSTAMP_TX_ONESTEP_SYNC:
1535		tx_type_valid = 1;
1536		break;
1537	}
1538
1539	switch (rx_filter) {
1540	case HWTSTAMP_FILTER_NONE:
1541	case HWTSTAMP_FILTER_ALL:
1542	case HWTSTAMP_FILTER_SOME:
1543	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1544	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1545	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1546	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1547	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1548	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1549	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1550	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1551	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1552	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1553	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1554	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1555		rx_filter_valid = 1;
1556		break;
1557	}
1558
1559	if (!tx_type_valid || !rx_filter_valid)
1560		return -ERANGE;
1561
1562	return 0;
1563}
1564
1565static inline bool is_skb_forwardable(struct net_device *dev,
1566				      struct sk_buff *skb)
1567{
1568	unsigned int len;
1569
1570	if (!(dev->flags & IFF_UP))
1571		return false;
1572
1573	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1574	if (skb->len <= len)
1575		return true;
1576
1577	/* if TSO is enabled, we don't care about the length as the packet
1578	 * could be forwarded without being segmented before
1579	 */
1580	if (skb_is_gso(skb))
1581		return true;
1582
1583	return false;
1584}
 
1585
1586/**
1587 * dev_forward_skb - loopback an skb to another netif
1588 *
1589 * @dev: destination network device
1590 * @skb: buffer to forward
1591 *
1592 * return values:
1593 *	NET_RX_SUCCESS	(no congestion)
1594 *	NET_RX_DROP     (packet was dropped, but freed)
1595 *
1596 * dev_forward_skb can be used for injecting an skb from the
1597 * start_xmit function of one device into the receive queue
1598 * of another device.
1599 *
1600 * The receiving device may be in another namespace, so
1601 * we have to clear all information in the skb that could
1602 * impact namespace isolation.
1603 */
1604int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1605{
1606	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1607		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1608			atomic_long_inc(&dev->rx_dropped);
1609			kfree_skb(skb);
1610			return NET_RX_DROP;
1611		}
1612	}
1613
1614	skb_orphan(skb);
1615	nf_reset(skb);
1616
1617	if (unlikely(!is_skb_forwardable(dev, skb))) {
1618		atomic_long_inc(&dev->rx_dropped);
1619		kfree_skb(skb);
1620		return NET_RX_DROP;
1621	}
1622	skb->skb_iif = 0;
1623	skb->dev = dev;
1624	skb_dst_drop(skb);
1625	skb->tstamp.tv64 = 0;
1626	skb->pkt_type = PACKET_HOST;
1627	skb->protocol = eth_type_trans(skb, dev);
1628	skb->mark = 0;
1629	secpath_reset(skb);
1630	nf_reset(skb);
1631	return netif_rx(skb);
1632}
1633EXPORT_SYMBOL_GPL(dev_forward_skb);
1634
1635static inline int deliver_skb(struct sk_buff *skb,
1636			      struct packet_type *pt_prev,
1637			      struct net_device *orig_dev)
1638{
 
 
1639	atomic_inc(&skb->users);
1640	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1641}
1642
1643static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1644{
1645	if (ptype->af_packet_priv == NULL)
1646		return false;
1647
1648	if (ptype->id_match)
1649		return ptype->id_match(ptype, skb->sk);
1650	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1651		return true;
1652
1653	return false;
1654}
1655
1656/*
1657 *	Support routine. Sends outgoing frames to any network
1658 *	taps currently in use.
1659 */
1660
1661static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1662{
1663	struct packet_type *ptype;
1664	struct sk_buff *skb2 = NULL;
1665	struct packet_type *pt_prev = NULL;
1666
1667	rcu_read_lock();
1668	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1669		/* Never send packets back to the socket
1670		 * they originated from - MvS (miquels@drinkel.ow.org)
1671		 */
1672		if ((ptype->dev == dev || !ptype->dev) &&
1673		    (!skb_loop_sk(ptype, skb))) {
1674			if (pt_prev) {
1675				deliver_skb(skb2, pt_prev, skb->dev);
1676				pt_prev = ptype;
1677				continue;
1678			}
1679
1680			skb2 = skb_clone(skb, GFP_ATOMIC);
1681			if (!skb2)
1682				break;
1683
1684			net_timestamp_set(skb2);
1685
1686			/* skb->nh should be correctly
1687			   set by sender, so that the second statement is
1688			   just protection against buggy protocols.
1689			 */
1690			skb_reset_mac_header(skb2);
1691
1692			if (skb_network_header(skb2) < skb2->data ||
1693			    skb2->network_header > skb2->tail) {
1694				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1695						     ntohs(skb2->protocol),
1696						     dev->name);
1697				skb_reset_network_header(skb2);
1698			}
1699
1700			skb2->transport_header = skb2->network_header;
1701			skb2->pkt_type = PACKET_OUTGOING;
1702			pt_prev = ptype;
1703		}
1704	}
1705	if (pt_prev)
1706		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1707	rcu_read_unlock();
1708}
1709
1710/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 
1711 * @dev: Network device
1712 * @txq: number of queues available
1713 *
1714 * If real_num_tx_queues is changed the tc mappings may no longer be
1715 * valid. To resolve this verify the tc mapping remains valid and if
1716 * not NULL the mapping. With no priorities mapping to this
1717 * offset/count pair it will no longer be used. In the worst case TC0
1718 * is invalid nothing can be done so disable priority mappings. If is
1719 * expected that drivers will fix this mapping if they can before
1720 * calling netif_set_real_num_tx_queues.
1721 */
1722static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1723{
1724	int i;
1725	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1726
1727	/* If TC0 is invalidated disable TC mapping */
1728	if (tc->offset + tc->count > txq) {
1729		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1730		dev->num_tc = 0;
1731		return;
1732	}
1733
1734	/* Invalidated prio to tc mappings set to TC0 */
1735	for (i = 1; i < TC_BITMASK + 1; i++) {
1736		int q = netdev_get_prio_tc_map(dev, i);
1737
1738		tc = &dev->tc_to_txq[q];
1739		if (tc->offset + tc->count > txq) {
1740			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1741				i, q);
1742			netdev_set_prio_tc_map(dev, i, 0);
1743		}
1744	}
1745}
1746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1747/*
1748 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1749 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1750 */
1751int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1752{
1753	int rc;
1754
1755	if (txq < 1 || txq > dev->num_tx_queues)
1756		return -EINVAL;
1757
1758	if (dev->reg_state == NETREG_REGISTERED ||
1759	    dev->reg_state == NETREG_UNREGISTERING) {
1760		ASSERT_RTNL();
1761
1762		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1763						  txq);
1764		if (rc)
1765			return rc;
1766
1767		if (dev->num_tc)
1768			netif_setup_tc(dev, txq);
1769
1770		if (txq < dev->real_num_tx_queues)
1771			qdisc_reset_all_tx_gt(dev, txq);
 
 
 
 
1772	}
1773
1774	dev->real_num_tx_queues = txq;
1775	return 0;
1776}
1777EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1778
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Must be called either with the rtnl_lock held or before the net
 *	device is registered.  Returns 0 on success, -EINVAL when @rxq is
 *	0 or exceeds dev->num_rx_queues, or the error code from the
 *	kobject update.  Before registration no kobject update is done, so
 *	the call can only fail on the range check.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1811
 
 
 
 
 
 
 
 
 
 
 
 
1812static inline void __netif_reschedule(struct Qdisc *q)
1813{
1814	struct softnet_data *sd;
1815	unsigned long flags;
1816
1817	local_irq_save(flags);
1818	sd = &__get_cpu_var(softnet_data);
1819	q->next_sched = NULL;
1820	*sd->output_queue_tailp = q;
1821	sd->output_queue_tailp = &q->next_sched;
1822	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1823	local_irq_restore(flags);
1824}
1825
1826void __netif_schedule(struct Qdisc *q)
1827{
1828	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1829		__netif_reschedule(q);
1830}
1831EXPORT_SYMBOL(__netif_schedule);
1832
1833void dev_kfree_skb_irq(struct sk_buff *skb)
 
 
 
 
1834{
1835	if (atomic_dec_and_test(&skb->users)) {
1836		struct softnet_data *sd;
1837		unsigned long flags;
 
 
 
1838
1839		local_irq_save(flags);
1840		sd = &__get_cpu_var(softnet_data);
1841		skb->next = sd->completion_queue;
1842		sd->completion_queue = skb;
1843		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1844		local_irq_restore(flags);
1845	}
 
 
 
 
 
 
1846}
1847EXPORT_SYMBOL(dev_kfree_skb_irq);
1848
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() when in hard irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1857
1858
1859/**
1860 * netif_device_detach - mark device as removed
1861 * @dev: network device
1862 *
1863 * Mark device as removed from system and therefore no longer available.
1864 */
1865void netif_device_detach(struct net_device *dev)
1866{
1867	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1868	    netif_running(dev)) {
1869		netif_tx_stop_all_queues(dev);
1870	}
1871}
1872EXPORT_SYMBOL(netif_device_detach);
1873
1874/**
1875 * netif_device_attach - mark device as attached
1876 * @dev: network device
1877 *
1878 * Mark device as attached from system and restart if needed.
1879 */
1880void netif_device_attach(struct net_device *dev)
1881{
1882	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1883	    netif_running(dev)) {
1884		netif_tx_wake_all_queues(dev);
1885		__netdev_watchdog_up(dev);
1886	}
1887}
1888EXPORT_SYMBOL(netif_device_attach);
1889
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892	static const netdev_features_t null_features = 0;
1893	struct net_device *dev = skb->dev;
1894	const char *driver = "";
1895
 
 
 
1896	if (dev && dev->dev.parent)
1897		driver = dev_driver_string(dev->dev.parent);
1898
1899	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900	     "gso_type=%d ip_summed=%d\n",
1901	     driver, dev ? &dev->features : &null_features,
1902	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904	     skb_shinfo(skb)->gso_type, skb->ip_summed);
1905}
1906
1907/*
1908 * Invalidate hardware checksum when packet is to be mangled, and
1909 * complete checksum manually on outgoing path.
1910 */
1911int skb_checksum_help(struct sk_buff *skb)
1912{
1913	__wsum csum;
1914	int ret = 0, offset;
1915
1916	if (skb->ip_summed == CHECKSUM_COMPLETE)
1917		goto out_set_summed;
1918
1919	if (unlikely(skb_shinfo(skb)->gso_size)) {
1920		skb_warn_bad_offload(skb);
1921		return -EINVAL;
1922	}
1923
 
 
 
 
 
 
 
 
 
1924	offset = skb_checksum_start_offset(skb);
1925	BUG_ON(offset >= skb_headlen(skb));
1926	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1927
1928	offset += skb->csum_offset;
1929	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1930
1931	if (skb_cloned(skb) &&
1932	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1933		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1934		if (ret)
1935			goto out;
1936	}
1937
1938	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1939out_set_summed:
1940	skb->ip_summed = CHECKSUM_NONE;
1941out:
1942	return ret;
1943}
1944EXPORT_SYMBOL(skb_checksum_help);
1945
1946/**
1947 *	skb_gso_segment - Perform segmentation on skb.
1948 *	@skb: buffer to segment
1949 *	@features: features for the output path (see dev->features)
1950 *
1951 *	This function segments the given skb and returns a list of segments.
1952 *
1953 *	It may return NULL if the skb requires no segmentation.  This is
1954 *	only possible when GSO is used for verifying header integrity.
1955 */
1956struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1957	netdev_features_t features)
1958{
1959	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1960	struct packet_type *ptype;
1961	__be16 type = skb->protocol;
1962	int vlan_depth = ETH_HLEN;
1963	int err;
1964
1965	while (type == htons(ETH_P_8021Q)) {
1966		struct vlan_hdr *vh;
 
1967
1968		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1969			return ERR_PTR(-EINVAL);
1970
1971		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1972		type = vh->h_vlan_encapsulated_proto;
1973		vlan_depth += VLAN_HLEN;
1974	}
1975
1976	skb_reset_mac_header(skb);
1977	skb->mac_len = skb->network_header - skb->mac_header;
1978	__skb_pull(skb, skb->mac_len);
 
 
 
 
 
 
 
 
 
 
 
1979
1980	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1981		skb_warn_bad_offload(skb);
 
1982
1983		if (skb_header_cloned(skb) &&
1984		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1985			return ERR_PTR(err);
 
 
1986	}
1987
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1988	rcu_read_lock();
1989	list_for_each_entry_rcu(ptype,
1990			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1991		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1992			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1993				err = ptype->gso_send_check(skb);
 
 
1994				segs = ERR_PTR(err);
1995				if (err || skb_gso_ok(skb, features))
1996					break;
1997				__skb_push(skb, (skb->data -
1998						 skb_network_header(skb)));
1999			}
2000			segs = ptype->gso_segment(skb, features);
2001			break;
2002		}
2003	}
2004	rcu_read_unlock();
2005
2006	__skb_push(skb, skb->data - skb_mac_header(skb));
2007
2008	return segs;
2009}
2010EXPORT_SYMBOL(skb_gso_segment);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2011
/* Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>") and dump the stack, rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2023
/* Actually, this check should go away as soon as we know that:
 * 1. an IOMMU is present and allows mapping all of memory, and
 * 2. no high memory really exists on this machine.
 *
 * Returns 1 if one of the page fragments of @skb cannot be handled by
 * @dev: either it sits in high memory while the device lacks
 * NETIF_F_HIGHDMA, or (when bus addresses are physical) it lies beyond
 * the parent device's DMA mask.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2056
2057struct dev_gso_cb {
2058	void (*destructor)(struct sk_buff *skb);
2059};
2060
2061#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2062
2063static void dev_gso_skb_destructor(struct sk_buff *skb)
2064{
2065	struct dev_gso_cb *cb;
2066
2067	do {
2068		struct sk_buff *nskb = skb->next;
2069
2070		skb->next = nskb->next;
2071		nskb->next = NULL;
2072		kfree_skb(nskb);
2073	} while (skb->next);
2074
2075	cb = DEV_GSO_CB(skb);
2076	if (cb->destructor)
2077		cb->destructor(skb);
2078}
2079
2080/**
2081 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2082 *	@skb: buffer to segment
2083 *	@features: device features as applicable to this skb
2084 *
2085 *	This function segments the given skb and stores the list of segments
2086 *	in skb->next.
2087 */
2088static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2089{
2090	struct sk_buff *segs;
2091
2092	segs = skb_gso_segment(skb, features);
2093
2094	/* Verifying header integrity only. */
2095	if (!segs)
2096		return 0;
2097
2098	if (IS_ERR(segs))
2099		return PTR_ERR(segs);
2100
2101	skb->next = segs;
2102	DEV_GSO_CB(skb)->destructor = skb->destructor;
2103	skb->destructor = dev_gso_skb_destructor;
2104
2105	return 0;
2106}
2107
2108static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
 
2109{
2110	return ((features & NETIF_F_GEN_CSUM) ||
2111		((features & NETIF_F_V4_CSUM) &&
2112		 protocol == htons(ETH_P_IP)) ||
2113		((features & NETIF_F_V6_CSUM) &&
2114		 protocol == htons(ETH_P_IPV6)) ||
2115		((features & NETIF_F_FCOE_CRC) &&
2116		 protocol == htons(ETH_P_FCOE)));
2117}
2118
2119static netdev_features_t harmonize_features(struct sk_buff *skb,
2120	__be16 protocol, netdev_features_t features)
2121{
2122	if (!can_checksum_protocol(features, protocol)) {
2123		features &= ~NETIF_F_ALL_CSUM;
2124		features &= ~NETIF_F_SG;
2125	} else if (illegal_highdma(skb->dev, skb)) {
2126		features &= ~NETIF_F_SG;
2127	}
2128
2129	return features;
2130}
2131
2132netdev_features_t netif_skb_features(struct sk_buff *skb)
2133{
2134	__be16 protocol = skb->protocol;
2135	netdev_features_t features = skb->dev->features;
2136
2137	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2138		features &= ~NETIF_F_GSO_MASK;
2139
2140	if (protocol == htons(ETH_P_8021Q)) {
2141		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142		protocol = veh->h_vlan_encapsulated_proto;
2143	} else if (!vlan_tx_tag_present(skb)) {
2144		return harmonize_features(skb, protocol, features);
2145	}
2146
2147	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
 
2148
2149	if (protocol != htons(ETH_P_8021Q)) {
2150		return harmonize_features(skb, protocol, features);
2151	} else {
2152		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154		return harmonize_features(skb, protocol, features);
2155	}
 
2156}
2157EXPORT_SYMBOL(netif_skb_features);
2158
2159/*
2160 * Returns true if either:
2161 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2162 *	2. skb is fragmented and the device does not support SG, or if
2163 *	   at least one of fragments is in highmem and device does not
2164 *	   support DMA from it.
2165 */
2166static inline int skb_needs_linearize(struct sk_buff *skb,
2167				      int features)
2168{
2169	return skb_is_nonlinear(skb) &&
2170			((skb_has_frag_list(skb) &&
2171				!(features & NETIF_F_FRAGLIST)) ||
2172			(skb_shinfo(skb)->nr_frags &&
2173				!(features & NETIF_F_SG)));
2174}
2175
2176int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2177			struct netdev_queue *txq)
2178{
2179	const struct net_device_ops *ops = dev->netdev_ops;
2180	int rc = NETDEV_TX_OK;
2181	unsigned int skb_len;
2182
2183	if (likely(!skb->next)) {
2184		netdev_features_t features;
2185
2186		/*
2187		 * If device doesn't need skb->dst, release it right now while
2188		 * its hot in this cpu cache
2189		 */
2190		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2191			skb_dst_drop(skb);
2192
2193		if (!list_empty(&ptype_all))
2194			dev_queue_xmit_nit(skb, dev);
2195
2196		features = netif_skb_features(skb);
2197
2198		if (vlan_tx_tag_present(skb) &&
2199		    !(features & NETIF_F_HW_VLAN_TX)) {
2200			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
 
2201			if (unlikely(!skb))
2202				goto out;
2203
2204			skb->vlan_tci = 0;
2205		}
2206
 
 
 
 
 
 
 
2207		if (netif_needs_gso(skb, features)) {
2208			if (unlikely(dev_gso_segment(skb, features)))
2209				goto out_kfree_skb;
2210			if (skb->next)
2211				goto gso;
2212		} else {
2213			if (skb_needs_linearize(skb, features) &&
2214			    __skb_linearize(skb))
2215				goto out_kfree_skb;
2216
2217			/* If packet is not checksummed and device does not
2218			 * support checksumming for this protocol, complete
2219			 * checksumming here.
2220			 */
2221			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2222				skb_set_transport_header(skb,
2223					skb_checksum_start_offset(skb));
 
 
 
 
2224				if (!(features & NETIF_F_ALL_CSUM) &&
2225				     skb_checksum_help(skb))
2226					goto out_kfree_skb;
2227			}
2228		}
2229
 
 
 
2230		skb_len = skb->len;
 
2231		rc = ops->ndo_start_xmit(skb, dev);
2232		trace_net_dev_xmit(skb, rc, dev, skb_len);
2233		if (rc == NETDEV_TX_OK)
2234			txq_trans_update(txq);
2235		return rc;
2236	}
2237
2238gso:
2239	do {
2240		struct sk_buff *nskb = skb->next;
2241
2242		skb->next = nskb->next;
2243		nskb->next = NULL;
2244
2245		/*
2246		 * If device doesn't need nskb->dst, release it right now while
2247		 * its hot in this cpu cache
2248		 */
2249		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2250			skb_dst_drop(nskb);
2251
2252		skb_len = nskb->len;
 
2253		rc = ops->ndo_start_xmit(nskb, dev);
2254		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2255		if (unlikely(rc != NETDEV_TX_OK)) {
2256			if (rc & ~NETDEV_TX_MASK)
2257				goto out_kfree_gso_skb;
2258			nskb->next = skb->next;
2259			skb->next = nskb;
2260			return rc;
2261		}
2262		txq_trans_update(txq);
2263		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2264			return NETDEV_TX_BUSY;
2265	} while (skb->next);
2266
2267out_kfree_gso_skb:
2268	if (likely(skb->next == NULL))
2269		skb->destructor = DEV_GSO_CB(skb)->destructor;
 
 
 
2270out_kfree_skb:
2271	kfree_skb(skb);
2272out:
2273	return rc;
2274}
 
2275
2276static u32 hashrnd __read_mostly;
2277
2278/*
2279 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2280 * to be used as a distribution range.
2281 */
2282u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2283		  unsigned int num_tx_queues)
2284{
2285	u32 hash;
2286	u16 qoffset = 0;
2287	u16 qcount = num_tx_queues;
2288
2289	if (skb_rx_queue_recorded(skb)) {
2290		hash = skb_get_rx_queue(skb);
2291		while (unlikely(hash >= num_tx_queues))
2292			hash -= num_tx_queues;
2293		return hash;
2294	}
2295
2296	if (dev->num_tc) {
2297		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2298		qoffset = dev->tc_to_txq[tc].offset;
2299		qcount = dev->tc_to_txq[tc].count;
2300	}
2301
2302	if (skb->sk && skb->sk->sk_hash)
2303		hash = skb->sk->sk_hash;
2304	else
2305		hash = (__force u16) skb->protocol;
2306	hash = jhash_1word(hash, hashrnd);
2307
2308	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2309}
2310EXPORT_SYMBOL(__skb_tx_hash);
2311
2312static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2313{
2314	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2315		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2316				     dev->name, queue_index,
2317				     dev->real_num_tx_queues);
2318		return 0;
2319	}
2320	return queue_index;
2321}
2322
/*
 * Look up a TX queue for @skb in the device's XPS map for the current
 * CPU.  A single-entry map is used directly; otherwise an entry is
 * picked by hashing the socket hash (or protocol ^ rxhash).  Returns the
 * queue index, or -1 if there is no map, the result is out of range, or
 * CONFIG_XPS is off.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto out;

	map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
	if (!map)
		goto out;

	if (map->len == 1) {
		queue_index = map->queues[0];
	} else {
		u32 hash;

		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = map->queues[((u64)hash * map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;
out:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2360
2361static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2362					struct sk_buff *skb)
2363{
2364	int queue_index;
2365	const struct net_device_ops *ops = dev->netdev_ops;
2366
2367	if (dev->real_num_tx_queues == 1)
2368		queue_index = 0;
2369	else if (ops->ndo_select_queue) {
2370		queue_index = ops->ndo_select_queue(dev, skb);
2371		queue_index = dev_cap_txqueue(dev, queue_index);
2372	} else {
2373		struct sock *sk = skb->sk;
2374		queue_index = sk_tx_queue_get(sk);
2375
2376		if (queue_index < 0 || skb->ooo_okay ||
2377		    queue_index >= dev->real_num_tx_queues) {
2378			int old_index = queue_index;
2379
2380			queue_index = get_xps_queue(dev, skb);
2381			if (queue_index < 0)
2382				queue_index = skb_tx_hash(dev, skb);
2383
2384			if (queue_index != old_index && sk) {
2385				struct dst_entry *dst =
2386				    rcu_dereference_check(sk->sk_dst_cache, 1);
2387
2388				if (dst && skb_dst(skb) == dst)
2389					sk_tx_queue_set(sk, queue_index);
2390			}
2391		}
2392	}
2393
2394	skb_set_queue_mapping(skb, queue_index);
2395	return netdev_get_tx_queue(dev, queue_index);
2396}
2397
2398static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2399				 struct net_device *dev,
2400				 struct netdev_queue *txq)
2401{
2402	spinlock_t *root_lock = qdisc_lock(q);
2403	bool contended;
2404	int rc;
2405
2406	qdisc_skb_cb(skb)->pkt_len = skb->len;
2407	qdisc_calculate_pkt_len(skb, q);
2408	/*
2409	 * Heuristic to force contended enqueues to serialize on a
2410	 * separate lock before trying to get qdisc main lock.
2411	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2412	 * and dequeue packets faster.
2413	 */
2414	contended = qdisc_is_running(q);
2415	if (unlikely(contended))
2416		spin_lock(&q->busylock);
2417
2418	spin_lock(root_lock);
2419	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2420		kfree_skb(skb);
2421		rc = NET_XMIT_DROP;
2422	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2423		   qdisc_run_begin(q)) {
2424		/*
2425		 * This is a work-conserving queue; there are no old skbs
2426		 * waiting to be sent out; and the qdisc is not running -
2427		 * xmit the skb directly.
2428		 */
2429		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2430			skb_dst_force(skb);
2431
2432		qdisc_bstats_update(q, skb);
2433
2434		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2435			if (unlikely(contended)) {
2436				spin_unlock(&q->busylock);
2437				contended = false;
2438			}
2439			__qdisc_run(q);
2440		} else
2441			qdisc_run_end(q);
2442
2443		rc = NET_XMIT_SUCCESS;
2444	} else {
2445		skb_dst_force(skb);
2446		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2447		if (qdisc_run_begin(q)) {
2448			if (unlikely(contended)) {
2449				spin_unlock(&q->busylock);
2450				contended = false;
2451			}
2452			__qdisc_run(q);
2453		}
2454	}
2455	spin_unlock(root_lock);
2456	if (unlikely(contended))
2457		spin_unlock(&q->busylock);
2458	return rc;
2459}
2460
2461#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2462static void skb_update_prio(struct sk_buff *skb)
2463{
2464	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2465
2466	if (!skb->priority && skb->sk && map) {
2467		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2468
2469		if (prioidx < map->priomap_len)
2470			skb->priority = map->priomap[prioidx];
2471	}
2472}
2473#else
2474#define skb_update_prio(skb)
2475#endif
2476
/* Per-CPU counter that dev_queue_xmit() increments around dev_hard_start_xmit(). */
2477static DEFINE_PER_CPU(int, xmit_recursion);
/* dev_queue_xmit() reports a dead loop once xmit_recursion exceeds this value. */
2478#define RECURSION_LIMIT 10
2479
2480/**
2481 *	dev_queue_xmit - transmit a buffer
2482 *	@skb: buffer to transmit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2483 *
2484 *	Queue a buffer for transmission to a network device. The caller must
2485 *	have set the device and priority and built the buffer before calling
2486 *	this function. The function can be called from an interrupt.
2487 *
2488 *	A negative errno code is returned on a failure. A success does not
2489 *	guarantee the frame will be transmitted as it may be dropped due
2490 *	to congestion or traffic shaping.
2491 *
2492 * -----------------------------------------------------------------------------------
2493 *      I notice this method can also return errors from the queue disciplines,
2494 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2495 *      be positive.
2496 *
2497 *      Regardless of the return value, the skb is consumed, so it is currently
2498 *      difficult to retry a send to this method.  (You can bump the ref count
2499 *      before sending to hold a reference for retry if you are careful.)
2500 *
2501 *      When calling this method, interrupts MUST be enabled.  This is because
2502 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2503 *          --BLG
2504 */
2505int dev_queue_xmit(struct sk_buff *skb)
2506{
2507	struct net_device *dev = skb->dev;
2508	struct netdev_queue *txq;
2509	struct Qdisc *q;
2510	int rc = -ENOMEM;
2511
 
 
2512	/* Disable soft irqs for various locks below. Also
2513	 * stops preemption for RCU.
2514	 */
2515	rcu_read_lock_bh();
2516
2517	skb_update_prio(skb);
2518
2519	txq = dev_pick_tx(dev, skb);
2520	q = rcu_dereference_bh(txq->qdisc);
2521
2522#ifdef CONFIG_NET_CLS_ACT
2523	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2524#endif
2525	trace_net_dev_queue(skb);
2526	if (q->enqueue) {
2527		rc = __dev_xmit_skb(skb, q, dev, txq);
2528		goto out;
2529	}
2530
2531	/* The device has no queue. Common case for software devices:
2532	   loopback, all the sorts of tunnels...
2533
2534	   Really, it is unlikely that netif_tx_lock protection is necessary
2535	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2536	   counters.)
2537	   However, it is possible, that they rely on protection
2538	   made by us here.
2539
2540	   Check this and shot the lock. It is not prone from deadlocks.
2541	   Either shot noqueue qdisc, it is even simpler 8)
2542	 */
2543	if (dev->flags & IFF_UP) {
2544		int cpu = smp_processor_id(); /* ok because BHs are off */
2545
2546		if (txq->xmit_lock_owner != cpu) {
2547
2548			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2549				goto recursion_alert;
2550
2551			HARD_TX_LOCK(dev, txq, cpu);
2552
2553			if (!netif_xmit_stopped(txq)) {
2554				__this_cpu_inc(xmit_recursion);
2555				rc = dev_hard_start_xmit(skb, dev, txq);
2556				__this_cpu_dec(xmit_recursion);
2557				if (dev_xmit_complete(rc)) {
2558					HARD_TX_UNLOCK(dev, txq);
2559					goto out;
2560				}
2561			}
2562			HARD_TX_UNLOCK(dev, txq);
2563			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2564					     dev->name);
2565		} else {
2566			/* Recursion is detected! It is possible,
2567			 * unfortunately
2568			 */
2569recursion_alert:
2570			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2571					     dev->name);
2572		}
2573	}
2574
2575	rc = -ENETDOWN;
2576	rcu_read_unlock_bh();
2577
 
2578	kfree_skb(skb);
2579	return rc;
2580out:
2581	rcu_read_unlock_bh();
2582	return rc;
2583}
 
 
 
 
 
2584EXPORT_SYMBOL(dev_queue_xmit);
2585
 
 
 
 
 
 
2586
2587/*=======================================================================
2588			Receiver routines
2589  =======================================================================*/
2590
/* enqueue_to_backlog() drops a packet once input_pkt_queue is longer than this. */
2591int netdev_max_backlog __read_mostly = 1000;
/*
 * Passed as the condition to net_timestamp_check(): as-is in netif_rx() and
 * netif_receive_skb(), negated in __netif_receive_skb().
 */
2592int netdev_tstamp_prequeue __read_mostly = 1;
2593int netdev_budget __read_mostly = 300;
2594int weight_p __read_mostly = 64;            /* old backlog weight */
2595
2596/* Called with irq disabled */
2597static inline void ____napi_schedule(struct softnet_data *sd,
2598				     struct napi_struct *napi)
2599{
2600	list_add_tail(&napi->poll_list, &sd->poll_list);
2601	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2602}
2603
2604/*
2605 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2606 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2607 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2608 * if hash is a canonical 4-tuple hash over transport ports.
2609 */
2610void __skb_get_rxhash(struct sk_buff *skb)
2611{
2612	struct flow_keys keys;
2613	u32 hash;
2614
2615	if (!skb_flow_dissect(skb, &keys))
2616		return;
2617
2618	if (keys.ports) {
2619		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2620			swap(keys.port16[0], keys.port16[1]);
2621		skb->l4_rxhash = 1;
2622	}
2623
2624	/* get a consistent hash (same value on both flow directions) */
2625	if ((__force u32)keys.dst < (__force u32)keys.src)
2626		swap(keys.dst, keys.src);
2627
2628	hash = jhash_3words((__force u32)keys.dst,
2629			    (__force u32)keys.src,
2630			    (__force u32)keys.ports, hashrnd);
2631	if (!hash)
2632		hash = 1;
2633
2634	skb->rxhash = hash;
2635}
2636EXPORT_SYMBOL(__skb_get_rxhash);
2637
2638#ifdef CONFIG_RPS
2639
2640/* One global table that all flow-based protocols share. */
2641struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2642EXPORT_SYMBOL(rps_sock_flow_table);
2643
/* Static key tested by netif_rx() and netif_receive_skb() before they call get_rps_cpu(). */
2644struct static_key rps_needed __read_mostly;
2645
2646static struct rps_dev_flow *
2647set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2648	    struct rps_dev_flow *rflow, u16 next_cpu)
2649{
2650	if (next_cpu != RPS_NO_CPU) {
2651#ifdef CONFIG_RFS_ACCEL
2652		struct netdev_rx_queue *rxqueue;
2653		struct rps_dev_flow_table *flow_table;
2654		struct rps_dev_flow *old_rflow;
2655		u32 flow_id;
2656		u16 rxq_index;
2657		int rc;
2658
2659		/* Should we steer this flow to a different hardware queue? */
2660		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2661		    !(dev->features & NETIF_F_NTUPLE))
2662			goto out;
2663		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2664		if (rxq_index == skb_get_rx_queue(skb))
2665			goto out;
2666
2667		rxqueue = dev->_rx + rxq_index;
2668		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669		if (!flow_table)
2670			goto out;
2671		flow_id = skb->rxhash & flow_table->mask;
2672		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2673							rxq_index, flow_id);
2674		if (rc < 0)
2675			goto out;
2676		old_rflow = rflow;
2677		rflow = &flow_table->flows[flow_id];
2678		rflow->filter = rc;
2679		if (old_rflow->filter == rflow->filter)
2680			old_rflow->filter = RPS_NO_FILTER;
2681	out:
2682#endif
2683		rflow->last_qtail =
2684			per_cpu(softnet_data, next_cpu).input_queue_head;
2685	}
2686
2687	rflow->cpu = next_cpu;
2688	return rflow;
2689}
2690
2691/*
2692 * get_rps_cpu is called from netif_receive_skb and returns the target
2693 * CPU from the RPS map of the receiving queue for a given skb.
2694 * rcu_read_lock must be held on entry.
2695 */
2696static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2697		       struct rps_dev_flow **rflowp)
2698{
2699	struct netdev_rx_queue *rxqueue;
2700	struct rps_map *map;
2701	struct rps_dev_flow_table *flow_table;
2702	struct rps_sock_flow_table *sock_flow_table;
2703	int cpu = -1;
2704	u16 tcpu;
 
2705
2706	if (skb_rx_queue_recorded(skb)) {
2707		u16 index = skb_get_rx_queue(skb);
2708		if (unlikely(index >= dev->real_num_rx_queues)) {
2709			WARN_ONCE(dev->real_num_rx_queues > 1,
2710				  "%s received packet on queue %u, but number "
2711				  "of RX queues is %u\n",
2712				  dev->name, index, dev->real_num_rx_queues);
2713			goto done;
2714		}
2715		rxqueue = dev->_rx + index;
2716	} else
2717		rxqueue = dev->_rx;
2718
2719	map = rcu_dereference(rxqueue->rps_map);
2720	if (map) {
2721		if (map->len == 1 &&
2722		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2723			tcpu = map->cpus[0];
2724			if (cpu_online(tcpu))
2725				cpu = tcpu;
2726			goto done;
2727		}
2728	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2729		goto done;
2730	}
2731
2732	skb_reset_network_header(skb);
2733	if (!skb_get_rxhash(skb))
 
2734		goto done;
2735
2736	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2738	if (flow_table && sock_flow_table) {
2739		u16 next_cpu;
2740		struct rps_dev_flow *rflow;
2741
2742		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2743		tcpu = rflow->cpu;
2744
2745		next_cpu = sock_flow_table->ents[skb->rxhash &
2746		    sock_flow_table->mask];
2747
2748		/*
2749		 * If the desired CPU (where last recvmsg was done) is
2750		 * different from current CPU (one in the rx-queue flow
2751		 * table entry), switch if one of the following holds:
2752		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2753		 *   - Current CPU is offline.
2754		 *   - The current CPU's queue tail has advanced beyond the
2755		 *     last packet that was enqueued using this table entry.
2756		 *     This guarantees that all previous packets for the flow
2757		 *     have been dequeued, thus preserving in order delivery.
2758		 */
2759		if (unlikely(tcpu != next_cpu) &&
2760		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2761		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2762		      rflow->last_qtail)) >= 0))
 
2763			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 
2764
2765		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2766			*rflowp = rflow;
2767			cpu = tcpu;
2768			goto done;
2769		}
2770	}
2771
2772	if (map) {
2773		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2774
2775		if (cpu_online(tcpu)) {
2776			cpu = tcpu;
2777			goto done;
2778		}
2779	}
2780
2781done:
2782	return cpu;
2783}
2784
2785#ifdef CONFIG_RFS_ACCEL
2786
2787/**
2788 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2789 * @dev: Device on which the filter was set
2790 * @rxq_index: RX queue index
2791 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2792 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2793 *
2794 * Drivers that implement ndo_rx_flow_steer() should periodically call
2795 * this function for each installed filter and remove the filters for
2796 * which it returns %true.
2797 */
2798bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2799			 u32 flow_id, u16 filter_id)
2800{
2801	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2802	struct rps_dev_flow_table *flow_table;
2803	struct rps_dev_flow *rflow;
2804	bool expire = true;
2805	int cpu;
2806
2807	rcu_read_lock();
2808	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2809	if (flow_table && flow_id <= flow_table->mask) {
2810		rflow = &flow_table->flows[flow_id];
2811		cpu = ACCESS_ONCE(rflow->cpu);
2812		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2813		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2814			   rflow->last_qtail) <
2815		     (int)(10 * flow_table->mask)))
2816			expire = false;
2817	}
2818	rcu_read_unlock();
2819	return expire;
2820}
2821EXPORT_SYMBOL(rps_may_expire_flow);
2822
2823#endif /* CONFIG_RFS_ACCEL */
2824
2825/* Called from hardirq (IPI) context */
2826static void rps_trigger_softirq(void *data)
2827{
2828	struct softnet_data *sd = data;
2829
2830	____napi_schedule(sd, &sd->backlog);
2831	sd->received_rps++;
2832}
2833
2834#endif /* CONFIG_RPS */
2835
/*
 * Check whether @sd belongs to another cpu.
 * If it does, push it on this cpu's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* link sd in at the head of our pending-IPI list */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2856
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2857/*
2858 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2859 * queue (may be a remote CPU queue).
2860 */
2861static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2862			      unsigned int *qtail)
2863{
2864	struct softnet_data *sd;
2865	unsigned long flags;
 
2866
2867	sd = &per_cpu(softnet_data, cpu);
2868
2869	local_irq_save(flags);
2870
2871	rps_lock(sd);
2872	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
 
2873		if (skb_queue_len(&sd->input_pkt_queue)) {
2874enqueue:
2875			__skb_queue_tail(&sd->input_pkt_queue, skb);
2876			input_queue_tail_incr_save(sd, qtail);
2877			rps_unlock(sd);
2878			local_irq_restore(flags);
2879			return NET_RX_SUCCESS;
2880		}
2881
2882		/* Schedule NAPI for backlog device
2883		 * We can use non atomic operation since we own the queue lock
2884		 */
2885		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2886			if (!rps_ipi_queued(sd))
2887				____napi_schedule(sd, &sd->backlog);
2888		}
2889		goto enqueue;
2890	}
2891
2892	sd->dropped++;
2893	rps_unlock(sd);
2894
2895	local_irq_restore(flags);
2896
2897	atomic_long_inc(&skb->dev->rx_dropped);
2898	kfree_skb(skb);
2899	return NET_RX_DROP;
2900}
2901
2902/**
2903 *	netif_rx	-	post buffer to the network code
2904 *	@skb: buffer to post
2905 *
2906 *	This function receives a packet from a device driver and queues it for
2907 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2908 *	may be dropped during processing for congestion control or by the
2909 *	protocol layers.
2910 *
2911 *	return values:
2912 *	NET_RX_SUCCESS	(no congestion)
2913 *	NET_RX_DROP     (packet was dropped)
2914 *
2915 */
2916
2917int netif_rx(struct sk_buff *skb)
2918{
2919	int ret;
2920
2921	/* if netpoll wants it, pretend we never saw it */
2922	if (netpoll_rx(skb))
2923		return NET_RX_DROP;
2924
2925	net_timestamp_check(netdev_tstamp_prequeue, skb);
2926
2927	trace_netif_rx(skb);
2928#ifdef CONFIG_RPS
2929	if (static_key_false(&rps_needed)) {
2930		struct rps_dev_flow voidflow, *rflow = &voidflow;
2931		int cpu;
2932
2933		preempt_disable();
2934		rcu_read_lock();
2935
2936		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2937		if (cpu < 0)
2938			cpu = smp_processor_id();
2939
2940		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2941
2942		rcu_read_unlock();
2943		preempt_enable();
2944	} else
2945#endif
2946	{
2947		unsigned int qtail;
2948		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2949		put_cpu();
2950	}
2951	return ret;
2952}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2953EXPORT_SYMBOL(netif_rx);
2954
/*
 * Call netif_rx() with preemption disabled and, before re-enabling it, run
 * any softirqs that are pending.  Returns what netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2968
2969static void net_tx_action(struct softirq_action *h)
2970{
2971	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2972
2973	if (sd->completion_queue) {
2974		struct sk_buff *clist;
2975
2976		local_irq_disable();
2977		clist = sd->completion_queue;
2978		sd->completion_queue = NULL;
2979		local_irq_enable();
2980
2981		while (clist) {
2982			struct sk_buff *skb = clist;
2983			clist = clist->next;
2984
2985			WARN_ON(atomic_read(&skb->users));
2986			trace_kfree_skb(skb, net_tx_action);
 
 
 
2987			__kfree_skb(skb);
2988		}
2989	}
2990
2991	if (sd->output_queue) {
2992		struct Qdisc *head;
2993
2994		local_irq_disable();
2995		head = sd->output_queue;
2996		sd->output_queue = NULL;
2997		sd->output_queue_tailp = &sd->output_queue;
2998		local_irq_enable();
2999
3000		while (head) {
3001			struct Qdisc *q = head;
3002			spinlock_t *root_lock;
3003
3004			head = head->next_sched;
3005
3006			root_lock = qdisc_lock(q);
3007			if (spin_trylock(root_lock)) {
3008				smp_mb__before_clear_bit();
3009				clear_bit(__QDISC_STATE_SCHED,
3010					  &q->state);
3011				qdisc_run(q);
3012				spin_unlock(root_lock);
3013			} else {
3014				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3015					      &q->state)) {
3016					__netif_reschedule(q);
3017				} else {
3018					smp_mb__before_clear_bit();
3019					clear_bit(__QDISC_STATE_SCHED,
3020						  &q->state);
3021				}
3022			}
3023		}
3024	}
3025}
3026
3027#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3028    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3029/* This hook is defined here for ATM LANE */
/*
 * Function pointer taking a device and an address.  It is only defined and
 * exported (GPL) here, without an initializer.
 * NOTE(review): presumably set by the bridge code and called by ATM LANE;
 * confirm against those users.
 */
3030int (*br_fdb_test_addr_hook)(struct net_device *dev,
3031			     unsigned char *addr) __read_mostly;
3032EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3033#endif
3034
3035#ifdef CONFIG_NET_CLS_ACT
3036/* TODO: Maybe we should just force sch_ingress to be compiled in
3037 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3038 * a compare and 2 stores extra right now if we dont have it on
3039 * but have CONFIG_NET_CLS_ACT
3040 * NOTE: This doesn't stop any functionality; if you dont have
3041 * the ingress scheduler, you just can't add policies on ingress.
3042 *
3043 */
3044static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3045{
3046	struct net_device *dev = skb->dev;
3047	u32 ttl = G_TC_RTTL(skb->tc_verd);
3048	int result = TC_ACT_OK;
3049	struct Qdisc *q;
3050
3051	if (unlikely(MAX_RED_LOOP < ttl++)) {
3052		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3053				     skb->skb_iif, dev->ifindex);
3054		return TC_ACT_SHOT;
3055	}
3056
3057	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3058	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3059
3060	q = rxq->qdisc;
3061	if (q != &noop_qdisc) {
3062		spin_lock(qdisc_lock(q));
3063		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3064			result = qdisc_enqueue_root(skb, q);
3065		spin_unlock(qdisc_lock(q));
3066	}
3067
3068	return result;
3069}
3070
3071static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3072					 struct packet_type **pt_prev,
3073					 int *ret, struct net_device *orig_dev)
3074{
3075	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3076
3077	if (!rxq || rxq->qdisc == &noop_qdisc)
3078		goto out;
3079
3080	if (*pt_prev) {
3081		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3082		*pt_prev = NULL;
3083	}
3084
3085	switch (ing_filter(skb, rxq)) {
3086	case TC_ACT_SHOT:
3087	case TC_ACT_STOLEN:
3088		kfree_skb(skb);
3089		return NULL;
3090	}
3091
3092out:
3093	skb->tc_verd = 0;
3094	return skb;
3095}
3096#endif
3097
3098/**
3099 *	netdev_rx_handler_register - register receive handler
3100 *	@dev: device to register a handler for
3101 *	@rx_handler: receive handler to register
3102 *	@rx_handler_data: data pointer that is used by rx handler
3103 *
3104 *	Register a receive hander for a device. This handler will then be
3105 *	called from __netif_receive_skb. A negative errno code is returned
3106 *	on a failure.
3107 *
3108 *	The caller must hold the rtnl_mutex.
3109 *
3110 *	For a general description of rx_handler, see enum rx_handler_result.
3111 */
3112int netdev_rx_handler_register(struct net_device *dev,
3113			       rx_handler_func_t *rx_handler,
3114			       void *rx_handler_data)
3115{
3116	ASSERT_RTNL();
3117
3118	if (dev->rx_handler)
3119		return -EBUSY;
3120
 
3121	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3122	rcu_assign_pointer(dev->rx_handler, rx_handler);
3123
3124	return 0;
3125}
3126EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3127
3128/**
3129 *	netdev_rx_handler_unregister - unregister receive handler
3130 *	@dev: device to unregister a handler from
3131 *
3132 *	Unregister a receive hander from a device.
3133 *
3134 *	The caller must hold the rtnl_mutex.
3135 */
3136void netdev_rx_handler_unregister(struct net_device *dev)
3137{
3138
3139	ASSERT_RTNL();
3140	RCU_INIT_POINTER(dev->rx_handler, NULL);
 
 
 
 
 
3141	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3142}
3143EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3144
3145static int __netif_receive_skb(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3146{
3147	struct packet_type *ptype, *pt_prev;
3148	rx_handler_func_t *rx_handler;
3149	struct net_device *orig_dev;
3150	struct net_device *null_or_dev;
3151	bool deliver_exact = false;
3152	int ret = NET_RX_DROP;
3153	__be16 type;
3154
3155	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3156
3157	trace_netif_receive_skb(skb);
3158
3159	/* if we've gotten here through NAPI, check netpoll */
3160	if (netpoll_receive_skb(skb))
3161		return NET_RX_DROP;
3162
3163	if (!skb->skb_iif)
3164		skb->skb_iif = skb->dev->ifindex;
3165	orig_dev = skb->dev;
3166
3167	skb_reset_network_header(skb);
3168	skb_reset_transport_header(skb);
 
3169	skb_reset_mac_len(skb);
3170
3171	pt_prev = NULL;
3172
3173	rcu_read_lock();
3174
3175another_round:
 
3176
3177	__this_cpu_inc(softnet_data.processed);
3178
3179	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
 
3180		skb = vlan_untag(skb);
3181		if (unlikely(!skb))
3182			goto out;
3183	}
3184
3185#ifdef CONFIG_NET_CLS_ACT
3186	if (skb->tc_verd & TC_NCLS) {
3187		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3188		goto ncls;
3189	}
3190#endif
3191
 
 
 
3192	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3193		if (!ptype->dev || ptype->dev == skb->dev) {
3194			if (pt_prev)
3195				ret = deliver_skb(skb, pt_prev, orig_dev);
3196			pt_prev = ptype;
3197		}
3198	}
3199
 
3200#ifdef CONFIG_NET_CLS_ACT
3201	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3202	if (!skb)
3203		goto out;
3204ncls:
3205#endif
3206
3207	rx_handler = rcu_dereference(skb->dev->rx_handler);
 
 
3208	if (vlan_tx_tag_present(skb)) {
3209		if (pt_prev) {
3210			ret = deliver_skb(skb, pt_prev, orig_dev);
3211			pt_prev = NULL;
3212		}
3213		if (vlan_do_receive(&skb, !rx_handler))
3214			goto another_round;
3215		else if (unlikely(!skb))
3216			goto out;
3217	}
3218
 
3219	if (rx_handler) {
3220		if (pt_prev) {
3221			ret = deliver_skb(skb, pt_prev, orig_dev);
3222			pt_prev = NULL;
3223		}
3224		switch (rx_handler(&skb)) {
3225		case RX_HANDLER_CONSUMED:
3226			goto out;
 
3227		case RX_HANDLER_ANOTHER:
3228			goto another_round;
3229		case RX_HANDLER_EXACT:
3230			deliver_exact = true;
3231		case RX_HANDLER_PASS:
3232			break;
3233		default:
3234			BUG();
3235		}
3236	}
3237
 
 
 
 
 
 
 
 
 
 
3238	/* deliver only exact match when indicated */
3239	null_or_dev = deliver_exact ? skb->dev : NULL;
3240
3241	type = skb->protocol;
3242	list_for_each_entry_rcu(ptype,
3243			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3244		if (ptype->type == type &&
3245		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3246		     ptype->dev == orig_dev)) {
3247			if (pt_prev)
3248				ret = deliver_skb(skb, pt_prev, orig_dev);
3249			pt_prev = ptype;
3250		}
3251	}
3252
3253	if (pt_prev) {
3254		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 
 
 
3255	} else {
 
3256		atomic_long_inc(&skb->dev->rx_dropped);
3257		kfree_skb(skb);
3258		/* Jamal, now you will not able to escape explaining
3259		 * me how you were going to use this. :-)
3260		 */
3261		ret = NET_RX_DROP;
3262	}
3263
3264out:
3265	rcu_read_unlock();
3266	return ret;
3267}
3268
3269/**
3270 *	netif_receive_skb - process receive buffer from network
3271 *	@skb: buffer to process
3272 *
3273 *	netif_receive_skb() is the main receive data processing function.
3274 *	It always succeeds. The buffer may be dropped during processing
3275 *	for congestion control or by the protocol layers.
3276 *
3277 *	This function may only be called from softirq context and interrupts
3278 *	should be enabled.
3279 *
3280 *	Return values (usually ignored):
3281 *	NET_RX_SUCCESS: no congestion
3282 *	NET_RX_DROP: packet was dropped
3283 */
3284int netif_receive_skb(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
3285{
3286	net_timestamp_check(netdev_tstamp_prequeue, skb);
3287
3288	if (skb_defer_rx_timestamp(skb))
3289		return NET_RX_SUCCESS;
3290
3291#ifdef CONFIG_RPS
3292	if (static_key_false(&rps_needed)) {
3293		struct rps_dev_flow voidflow, *rflow = &voidflow;
3294		int cpu, ret;
3295
3296		rcu_read_lock();
3297
3298		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3299
3300		if (cpu >= 0) {
3301			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3302			rcu_read_unlock();
3303			return ret;
3304		}
3305		rcu_read_unlock();
3306	}
3307#endif
3308	return __netif_receive_skb(skb);
3309}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3310EXPORT_SYMBOL(netif_receive_skb);
3311
3312/* Network device is going away, flush any packets still pending
3313 * Called with irqs disabled.
3314 */
3315static void flush_backlog(void *arg)
3316{
3317	struct net_device *dev = arg;
3318	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3319	struct sk_buff *skb, *tmp;
3320
3321	rps_lock(sd);
3322	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3323		if (skb->dev == dev) {
3324			__skb_unlink(skb, &sd->input_pkt_queue);
3325			kfree_skb(skb);
3326			input_queue_head_incr(sd);
3327		}
3328	}
3329	rps_unlock(sd);
3330
3331	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3332		if (skb->dev == dev) {
3333			__skb_unlink(skb, &sd->process_queue);
3334			kfree_skb(skb);
3335			input_queue_head_incr(sd);
3336		}
3337	}
3338}
3339
3340static int napi_gro_complete(struct sk_buff *skb)
3341{
3342	struct packet_type *ptype;
3343	__be16 type = skb->protocol;
3344	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3345	int err = -ENOENT;
3346
 
 
3347	if (NAPI_GRO_CB(skb)->count == 1) {
3348		skb_shinfo(skb)->gso_size = 0;
3349		goto out;
3350	}
3351
3352	rcu_read_lock();
3353	list_for_each_entry_rcu(ptype, head, list) {
3354		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3355			continue;
3356
3357		err = ptype->gro_complete(skb);
3358		break;
3359	}
3360	rcu_read_unlock();
3361
3362	if (err) {
3363		WARN_ON(&ptype->list == head);
3364		kfree_skb(skb);
3365		return NET_RX_SUCCESS;
3366	}
3367
3368out:
3369	return netif_receive_skb(skb);
3370}
3371
3372inline void napi_gro_flush(struct napi_struct *napi)
 
 
 
 
3373{
3374	struct sk_buff *skb, *next;
 
 
 
 
 
 
3375
3376	for (skb = napi->gro_list; skb; skb = next) {
3377		next = skb->next;
3378		skb->next = NULL;
 
 
 
 
 
3379		napi_gro_complete(skb);
 
3380	}
3381
3382	napi->gro_count = 0;
3383	napi->gro_list = NULL;
3384}
3385EXPORT_SYMBOL(napi_gro_flush);
3386
3387enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3388{
3389	struct sk_buff **pp = NULL;
3390	struct packet_type *ptype;
3391	__be16 type = skb->protocol;
3392	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3393	int same_flow;
3394	int mac_len;
3395	enum gro_result ret;
 
3396
3397	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3398		goto normal;
3399
3400	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3401		goto normal;
3402
 
 
 
3403	rcu_read_lock();
3404	list_for_each_entry_rcu(ptype, head, list) {
3405		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3406			continue;
3407
3408		skb_set_network_header(skb, skb_gro_offset(skb));
3409		mac_len = skb->network_header - skb->mac_header;
3410		skb->mac_len = mac_len;
3411		NAPI_GRO_CB(skb)->same_flow = 0;
3412		NAPI_GRO_CB(skb)->flush = 0;
3413		NAPI_GRO_CB(skb)->free = 0;
 
3414
3415		pp = ptype->gro_receive(&napi->gro_list, skb);
3416		break;
3417	}
3418	rcu_read_unlock();
3419
3420	if (&ptype->list == head)
3421		goto normal;
3422
3423	same_flow = NAPI_GRO_CB(skb)->same_flow;
3424	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3425
3426	if (pp) {
3427		struct sk_buff *nskb = *pp;
3428
3429		*pp = nskb->next;
3430		nskb->next = NULL;
3431		napi_gro_complete(nskb);
3432		napi->gro_count--;
3433	}
3434
3435	if (same_flow)
3436		goto ok;
3437
3438	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3439		goto normal;
3440
3441	napi->gro_count++;
 
 
 
 
 
 
 
 
 
 
 
 
 
3442	NAPI_GRO_CB(skb)->count = 1;
 
 
3443	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3444	skb->next = napi->gro_list;
3445	napi->gro_list = skb;
3446	ret = GRO_HELD;
3447
3448pull:
3449	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3450		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3451
3452		BUG_ON(skb->end - skb->tail < grow);
3453
3454		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3455
3456		skb->tail += grow;
3457		skb->data_len -= grow;
3458
3459		skb_shinfo(skb)->frags[0].page_offset += grow;
3460		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3461
3462		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3463			skb_frag_unref(skb, 0);
3464			memmove(skb_shinfo(skb)->frags,
3465				skb_shinfo(skb)->frags + 1,
3466				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3467		}
3468	}
3469
3470ok:
3471	return ret;
3472
3473normal:
3474	ret = GRO_NORMAL;
3475	goto pull;
3476}
3477EXPORT_SYMBOL(dev_gro_receive);
3478
3479static inline gro_result_t
3480__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3481{
3482	struct sk_buff *p;
3483	unsigned int maclen = skb->dev->hard_header_len;
 
 
 
 
 
 
 
 
 
3484
3485	for (p = napi->gro_list; p; p = p->next) {
3486		unsigned long diffs;
 
 
3487
3488		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3489		diffs |= p->vlan_tci ^ skb->vlan_tci;
3490		if (maclen == ETH_HLEN)
3491			diffs |= compare_ether_header(skb_mac_header(p),
3492						      skb_gro_mac_header(skb));
3493		else if (!diffs)
3494			diffs = memcmp(skb_mac_header(p),
3495				       skb_gro_mac_header(skb),
3496				       maclen);
3497		NAPI_GRO_CB(p)->same_flow = !diffs;
3498		NAPI_GRO_CB(p)->flush = 0;
3499	}
3500
3501	return dev_gro_receive(napi, skb);
3502}
 
3503
3504gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3505{
3506	switch (ret) {
3507	case GRO_NORMAL:
3508		if (netif_receive_skb(skb))
3509			ret = GRO_DROP;
3510		break;
3511
3512	case GRO_DROP:
3513		kfree_skb(skb);
3514		break;
3515
3516	case GRO_MERGED_FREE:
3517		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3518			kmem_cache_free(skbuff_head_cache, skb);
3519		else
3520			__kfree_skb(skb);
3521		break;
3522
3523	case GRO_HELD:
3524	case GRO_MERGED:
3525		break;
3526	}
3527
3528	return ret;
3529}
3530EXPORT_SYMBOL(napi_skb_finish);
3531
3532void skb_gro_reset_offset(struct sk_buff *skb)
3533{
3534	NAPI_GRO_CB(skb)->data_offset = 0;
3535	NAPI_GRO_CB(skb)->frag0 = NULL;
3536	NAPI_GRO_CB(skb)->frag0_len = 0;
3537
3538	if (skb->mac_header == skb->tail &&
3539	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3540		NAPI_GRO_CB(skb)->frag0 =
3541			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3542		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3543	}
3544}
3545EXPORT_SYMBOL(skb_gro_reset_offset);
3546
3547gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3548{
3549	skb_gro_reset_offset(skb);
3550
3551	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3552}
3553EXPORT_SYMBOL(napi_gro_receive);
3554
3555static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3556{
3557	__skb_pull(skb, skb_headlen(skb));
3558	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3559	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3560	skb->vlan_tci = 0;
3561	skb->dev = napi->dev;
3562	skb->skb_iif = 0;
 
3563
3564	napi->skb = skb;
3565}
3566
3567struct sk_buff *napi_get_frags(struct napi_struct *napi)
3568{
3569	struct sk_buff *skb = napi->skb;
3570
3571	if (!skb) {
3572		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3573		if (skb)
3574			napi->skb = skb;
3575	}
3576	return skb;
3577}
3578EXPORT_SYMBOL(napi_get_frags);
3579
3580gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3581			       gro_result_t ret)
 
3582{
3583	switch (ret) {
3584	case GRO_NORMAL:
3585	case GRO_HELD:
 
3586		skb->protocol = eth_type_trans(skb, skb->dev);
3587
3588		if (ret == GRO_HELD)
3589			skb_gro_pull(skb, -ETH_HLEN);
3590		else if (netif_receive_skb(skb))
3591			ret = GRO_DROP;
3592		break;
3593
3594	case GRO_DROP:
3595	case GRO_MERGED_FREE:
3596		napi_reuse_skb(napi, skb);
3597		break;
3598
3599	case GRO_MERGED:
3600		break;
3601	}
3602
3603	return ret;
3604}
3605EXPORT_SYMBOL(napi_frags_finish);
3606
 
 
 
 
3607static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3608{
3609	struct sk_buff *skb = napi->skb;
3610	struct ethhdr *eth;
3611	unsigned int hlen;
3612	unsigned int off;
3613
3614	napi->skb = NULL;
3615
3616	skb_reset_mac_header(skb);
3617	skb_gro_reset_offset(skb);
3618
3619	off = skb_gro_offset(skb);
3620	hlen = off + sizeof(*eth);
3621	eth = skb_gro_header_fast(skb, off);
3622	if (skb_gro_header_hard(skb, hlen)) {
3623		eth = skb_gro_header_slow(skb, hlen, off);
3624		if (unlikely(!eth)) {
3625			napi_reuse_skb(napi, skb);
3626			skb = NULL;
3627			goto out;
3628		}
 
 
 
 
3629	}
3630
3631	skb_gro_pull(skb, sizeof(*eth));
3632
3633	/*
3634	 * This works because the only protocols we care about don't require
3635	 * special handling.  We'll fix it up properly at the end.
 
3636	 */
3637	skb->protocol = eth->h_proto;
3638
3639out:
3640	return skb;
3641}
3642
3643gro_result_t napi_gro_frags(struct napi_struct *napi)
3644{
3645	struct sk_buff *skb = napi_frags_skb(napi);
3646
3647	if (!skb)
3648		return GRO_DROP;
3649
3650	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
 
 
3651}
3652EXPORT_SYMBOL(napi_gro_frags);
3653
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *cur = sd->rps_ipi_list;

	if (cur) {
		struct softnet_data *following;

		/* Detach the whole list before interrupts are re-enabled. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		for (; cur; cur = following) {
			/* Fetch the link before the IPI is issued. */
			following = cur->rps_ipi_next;

			if (cpu_online(cur->cpu))
				__smp_call_function_single(cur->cpu,
							   &cur->csd, 0);
		}
		return;
	}
#endif
	local_irq_enable();
}
3681
/*
 * NAPI poll callback for the backlog queue embedded in struct softnet_data.
 *
 * @napi is the "backlog" member of a softnet_data (recovered below with
 * container_of).  Packets are dequeued from sd->process_queue and handed to
 * __netif_receive_skb() with local interrupts enabled; once that queue is
 * empty, sd->input_pkt_queue is spliced onto it under rps_lock().
 *
 * Returns the number of packets processed, which never exceeds @quota.
 */
3682static int process_backlog(struct napi_struct *napi, int quota)
3683{
3684	int work = 0;
3685	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3686
3687#ifdef CONFIG_RPS
3688	/* Check if we have pending IPIs; it's better to send them now
3689	 * than to wait for the end of net_rx_action().
3690	 */
3691	if (sd->rps_ipi_list) {
3692		local_irq_disable();
3693		net_rps_action_and_irq_enable(sd);
3694	}
3695#endif
3696	napi->weight = weight_p;
	/* Queue manipulation below runs with local interrupts disabled. */
3697	local_irq_disable();
3698	while (work < quota) {
3699		struct sk_buff *skb;
3700		unsigned int qlen;
3701
3702		while ((skb = __skb_dequeue(&sd->process_queue))) {
			/* Interrupts are only enabled around the delivery itself. */
3703			local_irq_enable();
3704			__netif_receive_skb(skb);
3705			local_irq_disable();
3706			input_queue_head_incr(sd);
3707			if (++work >= quota) {
3708				local_irq_enable();
3709				return work;
3710			}
3711		}
3712
		/* process_queue is empty: refill it from input_pkt_queue. */
3713		rps_lock(sd);
3714		qlen = skb_queue_len(&sd->input_pkt_queue);
3715		if (qlen)
3716			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3717						   &sd->process_queue);
3718
		/* Fewer packets pending than quota left: complete this napi
		 * and shrink quota so the outer loop ends after them.
		 */
3719		if (qlen < quota - work) {
3720			/*
3721			 * Inline a custom version of __napi_complete().
3722			 * Only the current cpu owns and manipulates this napi,
3723			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3724			 * We can use a plain write instead of clear_bit(),
3725			 * and we don't need an smp_mb() memory barrier.
3726			 */
3727			list_del(&napi->poll_list);
3728			napi->state = 0;
3729
3730			quota = work + qlen;
3731		}
3732		rps_unlock(sd);
3733	}
3734	local_irq_enable();
3735
3736	return work;
3737}
3738
3739/**
3740 * __napi_schedule - schedule for receive
3741 * @n: entry to schedule
3742 *
3743 * The entry's receive function will be scheduled to run
3744 */
3745void __napi_schedule(struct napi_struct *n)
3746{
3747	unsigned long flags;
3748
3749	local_irq_save(flags);
3750	____napi_schedule(&__get_cpu_var(softnet_data), n);
3751	local_irq_restore(flags);
3752}
3753EXPORT_SYMBOL(__napi_schedule);
3754
/*
 * Take @n off its poll list and clear NAPI_STATE_SCHED.
 *
 * BUGs if @n is not currently marked NAPI_STATE_SCHED or if it still has
 * skbs on its gro_list.  No interrupt state is changed here; napi_complete()
 * wraps this call in local_irq_save()/local_irq_restore().
 */
3755void __napi_complete(struct napi_struct *n)
3756{
3757	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3758	BUG_ON(n->gro_list);
3759
3760	list_del(&n->poll_list);
	/* Barrier first, so the list removal is ordered before the bit clear. */
3761	smp_mb__before_clear_bit();
3762	clear_bit(NAPI_STATE_SCHED, &n->state);
3763}
3764EXPORT_SYMBOL(__napi_complete);
3765
3766void napi_complete(struct napi_struct *n)
3767{
3768	unsigned long flags;
3769
3770	/*
3771	 * don't let napi dequeue from the cpu poll list
3772	 * just in case its running on a different cpu
3773	 */
3774	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3775		return;
3776
3777	napi_gro_flush(n);
3778	local_irq_save(flags);
3779	__napi_complete(n);
3780	local_irq_restore(flags);
3781}
3782EXPORT_SYMBOL(napi_complete);
3783
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3784void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3785		    int (*poll)(struct napi_struct *, int), int weight)
3786{
3787	INIT_LIST_HEAD(&napi->poll_list);
3788	napi->gro_count = 0;
3789	napi->gro_list = NULL;
3790	napi->skb = NULL;
3791	napi->poll = poll;
 
 
 
3792	napi->weight = weight;
3793	list_add(&napi->dev_list, &dev->napi_list);
3794	napi->dev = dev;
3795#ifdef CONFIG_NETPOLL
3796	spin_lock_init(&napi->poll_lock);
3797	napi->poll_owner = -1;
3798#endif
3799	set_bit(NAPI_STATE_SCHED, &napi->state);
3800}
3801EXPORT_SYMBOL(netif_napi_add);
3802
3803void netif_napi_del(struct napi_struct *napi)
3804{
3805	struct sk_buff *skb, *next;
3806
3807	list_del_init(&napi->dev_list);
3808	napi_free_frags(napi);
3809
3810	for (skb = napi->gro_list; skb; skb = next) {
3811		next = skb->next;
3812		skb->next = NULL;
3813		kfree_skb(skb);
3814	}
3815
3816	napi->gro_list = NULL;
3817	napi->gro_count = 0;
3818}
3819EXPORT_SYMBOL(netif_napi_del);
3820
/*
 * Receive softirq handler: poll the NAPI instances queued on this cpu's
 * softnet_data poll list.
 *
 * Polling stops when the list is empty, when the packet budget
 * (netdev_budget) is used up, or when more than two jiffies have passed.
 * In the latter two cases time_squeeze is incremented and NET_RX_SOFTIRQ is
 * raised again.  The function always finishes through
 * net_rps_action_and_irq_enable().
 */
3821static void net_rx_action(struct softirq_action *h)
3822{
3823	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3824	unsigned long time_limit = jiffies + 2;
3825	int budget = netdev_budget;
3826	void *have;
3827
3828	local_irq_disable();
3829
3830	while (!list_empty(&sd->poll_list)) {
3831		struct napi_struct *n;
3832		int work, weight;
3833
3834		/* If the softirq window is exhausted then punt.
3835		 * Allow this to run for 2 jiffies, which will allow
3836		 * an average latency of 1.5/HZ.
3837		 */
3838		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3839			goto softnet_break;
3840
3841		local_irq_enable();
3842
3843		/* Even though interrupts have been re-enabled, this
3844		 * access is safe because interrupts can only add new
3845		 * entries to the tail of this list, and only ->poll()
3846		 * calls can remove this head entry from the list.
3847		 */
3848		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3849
3850		have = netpoll_poll_lock(n);
3851
3852		weight = n->weight;
3853
3854		/* This NAPI_STATE_SCHED test is for avoiding a race
3855		 * with netpoll's poll_napi().  Only the entity which
3856		 * obtains the lock and sees NAPI_STATE_SCHED set will
3857		 * actually make the ->poll() call.  Therefore we avoid
3858		 * accidentally calling ->poll() when NAPI is not scheduled.
3859		 */
3860		work = 0;
3861		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3862			work = n->poll(n, weight);
3863			trace_napi_poll(n);
3864		}
3865
		/* A poll routine must not report more work than its weight. */
3866		WARN_ON_ONCE(work > weight);
3867
3868		budget -= work;
3869
3870		local_irq_disable();
3871
3872		/* Drivers must not modify the NAPI state if they
3873		 * consume the entire weight.  In such cases this code
3874		 * still "owns" the NAPI instance and therefore can
3875		 * move the instance around on the list at-will.
3876		 */
3877		if (unlikely(work == weight)) {
3878			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() is called with interrupts enabled. */
3879				local_irq_enable();
3880				napi_complete(n);
3881				local_irq_disable();
3882			} else








3883				list_move_tail(&n->poll_list, &sd->poll_list);

3884		}
3885
3886		netpoll_poll_unlock(have);
3887	}
3888out:
3889	net_rps_action_and_irq_enable(sd);
3890
3891#ifdef CONFIG_NET_DMA
3892	/*
3893	 * There may not be any more sk_buffs coming right now, so push
3894	 * any pending DMA copies to hardware
3895	 */
3896	dma_issue_pending_all();
3897#endif
3898
3899	return;
3900
3901softnet_break:
	/* Out of budget or time: count the squeeze and reschedule ourselves. */
3902	sd->time_squeeze++;
3903	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3904	goto out;
3905}
3906
/*
 * Per-address-family SIOCGIFCONF handlers, indexed by family number.
 * Slots are filled by register_gifconf() and walked by dev_ifconf();
 * a NULL slot means no handler is registered for that family.
 */
3907static gifconf_func_t *gifconf_list[NPROTO];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3908
3909/**
3910 *	register_gifconf	-	register a SIOCGIF handler
3911 *	@family: Address family
3912 *	@gifconf: Function handler
3913 *
3914 *	Register protocol dependent address dumping routines. The handler
3915 *	that is passed must not be freed or reused until it has been replaced
3916 *	by another handler.
3917 */
3918int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
 
3919{
3920	if (family >= NPROTO)
3921		return -EINVAL;
3922	gifconf_list[family] = gifconf;
3923	return 0;
3924}
3925EXPORT_SYMBOL(register_gifconf);
3926
 
 
 
 
 
 
 
 
 
 
3927
3928/*
3929 *	Map an interface index to its name (SIOCGIFNAME)
3930 */
3931
3932/*
3933 *	We need this ioctl for efficient implementation of the
3934 *	if_indextoname() function required by the IPv6 API.  Without
3935 *	it, we would have to search all the interfaces to find a
3936 *	match.  --pb
 
3937 */
 
 
 
3938
3939static int dev_ifname(struct net *net, struct ifreq __user *arg)
3940{
3941	struct net_device *dev;
3942	struct ifreq ifr;
3943
3944	/*
3945	 *	Fetch the caller's info block.
3946	 */
3947
3948	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3949		return -EFAULT;
 
 
 
 
 
3950
3951	rcu_read_lock();
3952	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3953	if (!dev) {
3954		rcu_read_unlock();
3955		return -ENODEV;
3956	}
3957
3958	strcpy(ifr.ifr_name, dev->name);
3959	rcu_read_unlock();
3960
3961	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3962		return -EFAULT;
3963	return 0;
3964}
 
3965
3966/*
3967 *	Perform a SIOCGIFCONF call. This structure will change
3968 *	size eventually, and there is nothing I can do about it.
3969 *	Thus we will need a 'compatibility mode'.
 
 
 
3970 */
 
 
 
 
 
 
 
 
 
 
 
3971
3972static int dev_ifconf(struct net *net, char __user *arg)
3973{
3974	struct ifconf ifc;
3975	struct net_device *dev;
3976	char __user *pos;
3977	int len;
3978	int total;
3979	int i;
3980
3981	/*
3982	 *	Fetch the caller's info block.
3983	 */
3984
3985	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3986		return -EFAULT;
 
 
 
 
 
 
 
 
 
 
3987
3988	pos = ifc.ifc_buf;
3989	len = ifc.ifc_len;
3990
3991	/*
3992	 *	Loop over the interfaces, and write an info block for each.
3993	 */
3994
3995	total = 0;
3996	for_each_netdev(net, dev) {
3997		for (i = 0; i < NPROTO; i++) {
3998			if (gifconf_list[i]) {
3999				int done;
4000				if (!pos)
4001					done = gifconf_list[i](dev, NULL, 0);
4002				else
4003					done = gifconf_list[i](dev, pos + total,
4004							       len - total);
4005				if (done < 0)
4006					return -EFAULT;
4007				total += done;
4008			}
4009		}
4010	}
4011
4012	/*
4013	 *	All done.  Write the updated control block back to the caller.
4014	 */
4015	ifc.ifc_len = total;
4016
4017	/*
4018	 * 	Both BSD and Solaris return 0 here, so we do too.
4019	 */
4020	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4021}
 
4022
4023#ifdef CONFIG_PROC_FS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4024
/*
 * A /proc/net/dev seq position packs two values: the hash bucket number in
 * the upper bits and a 1-based offset within that bucket in the low
 * BUCKET_SPACE bits.
 */
4025#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4026
/* Bucket number encoded in position x. */
4027#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket encoded in position x. */
4028#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
/* Build a position from bucket b and offset o. */
4029#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4030
4031static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4032{
4033	struct net *net = seq_file_net(seq);
4034	struct net_device *dev;
4035	struct hlist_node *p;
4036	struct hlist_head *h;
4037	unsigned int count = 0, offset = get_offset(*pos);
4038
4039	h = &net->dev_name_head[get_bucket(*pos)];
4040	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4041		if (++count == offset)
4042			return dev;
4043	}
4044
4045	return NULL;
4046}
 
4047
4048static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
 
 
 
 
 
 
 
 
 
 
 
4049{
4050	struct net_device *dev;
4051	unsigned int bucket;
 
 
 
4052
4053	do {
4054		dev = dev_from_same_bucket(seq, pos);
4055		if (dev)
4056			return dev;
4057
4058		bucket = get_bucket(*pos) + 1;
4059		*pos = set_bucket_offset(bucket, 1);
4060	} while (bucket < NETDEV_HASHENTRIES);
4061
4062	return NULL;
4063}
 
4064
4065/*
4066 *	This is invoked by the /proc filesystem handler to display a device
4067 *	in detail.
 
 
 
 
 
 
 
4068 */
4069void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4070	__acquires(RCU)
4071{
4072	rcu_read_lock();
4073	if (!*pos)
4074		return SEQ_START_TOKEN;
4075
4076	if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4077		return NULL;
4078
4079	return dev_from_bucket(seq, pos);
 
 
4080}
 
4081
4082void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
 
 
 
 
 
 
 
4083{
4084	++*pos;
4085	return dev_from_bucket(seq, pos);
 
 
 
 
 
4086}
 
4087
/*
 * seq_file stop callback for the device listing: drops the RCU read lock
 * taken in dev_seq_start().
 */
4088void dev_seq_stop(struct seq_file *seq, void *v)
4089	__releases(RCU)






4090{
4091	rcu_read_unlock();






4092}
 
4093
4094static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4095{
4096	struct rtnl_link_stats64 temp;
4097	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4098
4099	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4100		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4101		   dev->name, stats->rx_bytes, stats->rx_packets,
4102		   stats->rx_errors,
4103		   stats->rx_dropped + stats->rx_missed_errors,
4104		   stats->rx_fifo_errors,
4105		   stats->rx_length_errors + stats->rx_over_errors +
4106		    stats->rx_crc_errors + stats->rx_frame_errors,
4107		   stats->rx_compressed, stats->multicast,
4108		   stats->tx_bytes, stats->tx_packets,
4109		   stats->tx_errors, stats->tx_dropped,
4110		   stats->tx_fifo_errors, stats->collisions,
4111		   stats->tx_carrier_errors +
4112		    stats->tx_aborted_errors +
4113		    stats->tx_window_errors +
4114		    stats->tx_heartbeat_errors,
4115		   stats->tx_compressed);
4116}
4117
4118/*
4119 *	Called from the PROCfs module. This now uses the new arbitrary sized
4120 *	/proc/net interface to create /proc/net/dev
4121 */
4122static int dev_seq_show(struct seq_file *seq, void *v)
4123{
4124	if (v == SEQ_START_TOKEN)
4125		seq_puts(seq, "Inter-|   Receive                            "
4126			      "                    |  Transmit\n"
4127			      " face |bytes    packets errs drop fifo frame "
4128			      "compressed multicast|bytes    packets errs "
4129			      "drop fifo colls carrier compressed\n");
4130	else
4131		dev_seq_printf_stats(seq, v);
4132	return 0;
4133}
4134
4135static struct softnet_data *softnet_get_online(loff_t *pos)
4136{
4137	struct softnet_data *sd = NULL;
 
 
 
4138
4139	while (*pos < nr_cpu_ids)
4140		if (cpu_online(*pos)) {
4141			sd = &per_cpu(softnet_data, *pos);
4142			break;
4143		} else
4144			++*pos;
4145	return sd;
4146}
4147
/*
 * seq_file start callback for softnet_stat: returns the softnet_data of
 * the first online cpu at or after *pos, or NULL when there is none.
 */
4148static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)


4149{
4150	return softnet_get_online(pos);
4151}
4152
4153static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4154{
4155	++*pos;
4156	return softnet_get_online(pos);
4157}
4158
/* seq_file stop callback for softnet_stat: nothing to release. */
4159static void softnet_seq_stop(struct seq_file *seq, void *v)
4160{
4161}
 
 
4162
4163static int softnet_seq_show(struct seq_file *seq, void *v)
4164{
4165	struct softnet_data *sd = v;
 
 
 
4166
4167	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4168		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4169		   0, 0, 0, 0, /* was fastroute */
4170		   sd->cpu_collision, sd->received_rps);
4171	return 0;
4172}
4173
4174static const struct seq_operations dev_seq_ops = {
4175	.start = dev_seq_start,
4176	.next  = dev_seq_next,
4177	.stop  = dev_seq_stop,
4178	.show  = dev_seq_show,
4179};
4180
4181static int dev_seq_open(struct inode *inode, struct file *file)
 
 
 
 
 
 
 
 
 
 
 
4182{
4183	return seq_open_net(inode, file, &dev_seq_ops,
4184			    sizeof(struct seq_net_private));
4185}
 
 
 
4186
4187static const struct file_operations dev_seq_fops = {
4188	.owner	 = THIS_MODULE,
4189	.open    = dev_seq_open,
4190	.read    = seq_read,
4191	.llseek  = seq_lseek,
4192	.release = seq_release_net,
4193};
4194
4195static const struct seq_operations softnet_seq_ops = {
4196	.start = softnet_seq_start,
4197	.next  = softnet_seq_next,
4198	.stop  = softnet_seq_stop,
4199	.show  = softnet_seq_show,
4200};
4201
4202static int softnet_seq_open(struct inode *inode, struct file *file)
 
4203{
4204	return seq_open(file, &softnet_seq_ops);
 
 
 
4205}
4206
4207static const struct file_operations softnet_seq_fops = {
4208	.owner	 = THIS_MODULE,
4209	.open    = softnet_seq_open,
4210	.read    = seq_read,
4211	.llseek  = seq_lseek,
4212	.release = seq_release,
4213};
4214
4215static void *ptype_get_idx(loff_t pos)
4216{
4217	struct packet_type *pt = NULL;
4218	loff_t i = 0;
4219	int t;
4220
4221	list_for_each_entry_rcu(pt, &ptype_all, list) {
4222		if (i == pos)
4223			return pt;
4224		++i;
4225	}
4226
4227	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4228		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4229			if (i == pos)
4230				return pt;
4231			++i;
4232		}
4233	}
4234	return NULL;
4235}
4236
4237static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4238	__acquires(RCU)
4239{
4240	rcu_read_lock();
4241	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
 
4242}
4243
4244static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
4245{
4246	struct packet_type *pt;
4247	struct list_head *nxt;
4248	int hash;
4249
4250	++*pos;
4251	if (v == SEQ_START_TOKEN)
4252		return ptype_get_idx(0);
4253
4254	pt = v;
4255	nxt = pt->list.next;
4256	if (pt->type == htons(ETH_P_ALL)) {
4257		if (nxt != &ptype_all)
4258			goto found;
4259		hash = 0;
4260		nxt = ptype_base[0].next;
4261	} else
4262		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4263
4264	while (nxt == &ptype_base[hash]) {
4265		if (++hash >= PTYPE_HASH_SIZE)
4266			return NULL;
4267		nxt = ptype_base[hash].next;
 
 
 
4268	}
4269found:
4270	return list_entry(nxt, struct packet_type, list);
4271}
4272
/*
 * seq_file stop callback for the packet-type listing: drops the RCU read
 * lock taken in ptype_seq_start().
 */
4273static void ptype_seq_stop(struct seq_file *seq, void *v)
4274	__releases(RCU)
4275{
4276	rcu_read_unlock();



4277}
4278
4279static int ptype_seq_show(struct seq_file *seq, void *v)
 
 
4280{
4281	struct packet_type *pt = v;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4282
4283	if (v == SEQ_START_TOKEN)
4284		seq_puts(seq, "Type Device      Function\n");
4285	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4286		if (pt->type == htons(ETH_P_ALL))
4287			seq_puts(seq, "ALL ");
4288		else
4289			seq_printf(seq, "%04x", ntohs(pt->type));
 
4290
4291		seq_printf(seq, " %-8s %pF\n",
4292			   pt->dev ? pt->dev->name : "", pt->func);
 
 
 
 
 
4293	}
4294
 
4295	return 0;
4296}
4297
4298static const struct seq_operations ptype_seq_ops = {
4299	.start = ptype_seq_start,
4300	.next  = ptype_seq_next,
4301	.stop  = ptype_seq_stop,
4302	.show  = ptype_seq_show,
4303};
 
4304
4305static int ptype_seq_open(struct inode *inode, struct file *file)
4306{
4307	return seq_open_net(inode, file, &ptype_seq_ops,
4308			sizeof(struct seq_net_private));
4309}
4310
4311static const struct file_operations ptype_seq_fops = {
4312	.owner	 = THIS_MODULE,
4313	.open    = ptype_seq_open,
4314	.read    = seq_read,
4315	.llseek  = seq_lseek,
4316	.release = seq_release_net,
4317};
4318
 
4319
4320static int __net_init dev_proc_net_init(struct net *net)
4321{
4322	int rc = -ENOMEM;
 
 
 
 
 
 
 
 
 
4323
4324	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4325		goto out;
4326	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4327		goto out_dev;
4328	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4329		goto out_softnet;
4330
4331	if (wext_proc_init(net))
4332		goto out_ptype;
4333	rc = 0;
4334out:
4335	return rc;
4336out_ptype:
4337	proc_net_remove(net, "ptype");
4338out_softnet:
4339	proc_net_remove(net, "softnet_stat");
4340out_dev:
4341	proc_net_remove(net, "dev");
4342	goto out;
4343}
4344
/*
 * Per-namespace exit: undo dev_proc_net_init() in reverse order, i.e.
 * wext_proc_exit() first, then removal of the "ptype", "softnet_stat" and
 * "dev" proc entries.
 */
4345static void __net_exit dev_proc_net_exit(struct net *net)











4346{
4347	wext_proc_exit(net);


4348
4349	proc_net_remove(net, "ptype");
4350	proc_net_remove(net, "softnet_stat");
4351	proc_net_remove(net, "dev");












4352}
 
4353
4354static struct pernet_operations __net_initdata dev_proc_ops = {
4355	.init = dev_proc_net_init,
4356	.exit = dev_proc_net_exit,
4357};
4358
/*
 * Register dev_proc_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys().
 */
4359static int __init dev_proc_init(void)
4360{
4361	return register_pernet_subsys(&dev_proc_ops);
4362}
4363#else
4364#define dev_proc_init() 0
4365#endif	/* CONFIG_PROC_FS */
4366
4367
4368/**
4369 *	netdev_set_master	-	set up master pointer
4370 *	@slave: slave device
4371 *	@master: new master device
4372 *
4373 *	Changes the master device of the slave. Pass %NULL to break the
4374 *	bonding. The caller must hold the RTNL semaphore. On a failure
4375 *	a negative errno code is returned. On success the reference counts
4376 *	are adjusted and the function returns zero.
4377 */
4378int netdev_set_master(struct net_device *slave, struct net_device *master)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4379{
4380	struct net_device *old = slave->master;
4381
4382	ASSERT_RTNL();
 
 
 
 
 
4383
4384	if (master) {
4385		if (old)
4386			return -EBUSY;
4387		dev_hold(master);
 
4388	}
 
4389
4390	slave->master = master;
 
 
 
4391
4392	if (old)
4393		dev_put(old);
4394	return 0;
 
 
 
 
4395}
4396EXPORT_SYMBOL(netdev_set_master);
4397
4398/**
4399 *	netdev_set_bond_master	-	set up bonding master/slave pair
4400 *	@slave: slave device
4401 *	@master: new master device
4402 *
4403 *	Changes the master device of the slave. Pass %NULL to break the
4404 *	bonding. The caller must hold the RTNL semaphore. On a failure
4405 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4406 *	to the routing socket and the function returns zero.
4407 */
4408int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4409{
4410	int err;
 
 
 
4411
4412	ASSERT_RTNL();
4413
4414	err = netdev_set_master(slave, master);
4415	if (err)
4416		return err;
4417	if (master)
4418		slave->flags |= IFF_SLAVE;
4419	else
4420		slave->flags &= ~IFF_SLAVE;
 
4421
4422	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4423	return 0;
4424}
4425EXPORT_SYMBOL(netdev_set_bond_master);
4426
4427static void dev_change_rx_flags(struct net_device *dev, int flags)
4428{
4429	const struct net_device_ops *ops = dev->netdev_ops;
4430
4431	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4432		ops->ndo_change_rx_flags(dev, flags);
4433}
4434
4435static int __dev_set_promiscuity(struct net_device *dev, int inc)
4436{
4437	unsigned int old_flags = dev->flags;
4438	uid_t uid;
4439	gid_t gid;
4440
4441	ASSERT_RTNL();
4442
4443	dev->flags |= IFF_PROMISC;
4444	dev->promiscuity += inc;
4445	if (dev->promiscuity == 0) {
4446		/*
4447		 * Avoid overflow.
4448		 * If inc causes overflow, untouch promisc and return error.
4449		 */
4450		if (inc < 0)
4451			dev->flags &= ~IFF_PROMISC;
4452		else {
4453			dev->promiscuity -= inc;
4454			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4455				dev->name);
4456			return -EOVERFLOW;
4457		}
4458	}
4459	if (dev->flags != old_flags) {
4460		pr_info("device %s %s promiscuous mode\n",
4461			dev->name,
4462			dev->flags & IFF_PROMISC ? "entered" : "left");
4463		if (audit_enabled) {
4464			current_uid_gid(&uid, &gid);
4465			audit_log(current->audit_context, GFP_ATOMIC,
4466				AUDIT_ANOM_PROMISCUOUS,
4467				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4468				dev->name, (dev->flags & IFF_PROMISC),
4469				(old_flags & IFF_PROMISC),
4470				audit_get_loginuid(current),
4471				uid, gid,
 
4472				audit_get_sessionid(current));
4473		}
4474
4475		dev_change_rx_flags(dev, IFF_PROMISC);
4476	}
 
 
4477	return 0;
4478}
4479
4480/**
4481 *	dev_set_promiscuity	- update promiscuity count on a device
4482 *	@dev: device
4483 *	@inc: modifier
4484 *
4485 *	Add or remove promiscuity from a device. While the count in the device
4486 *	remains above zero the interface remains promiscuous. Once it hits zero
4487 *	the device reverts back to normal filtering operation. A negative inc
4488 *	value is used to drop promiscuity on the device.
4489 *	Return 0 if successful or a negative errno code on error.
4490 */
4491int dev_set_promiscuity(struct net_device *dev, int inc)
4492{
4493	unsigned int old_flags = dev->flags;
4494	int err;
4495
4496	err = __dev_set_promiscuity(dev, inc);
4497	if (err < 0)
4498		return err;
4499	if (dev->flags != old_flags)
4500		dev_set_rx_mode(dev);
4501	return err;
4502}
4503EXPORT_SYMBOL(dev_set_promiscuity);
4504
4505/**
4506 *	dev_set_allmulti	- update allmulti count on a device
4507 *	@dev: device
4508 *	@inc: modifier
4509 *
4510 *	Add or remove reception of all multicast frames to a device. While the
4511 *	count in the device remains above zero the interface remains listening
4512 *	to all interfaces. Once it hits zero the device reverts back to normal
4513 *	filtering operation. A negative @inc value is used to drop the counter
4514 *	when releasing a resource needing all multicasts.
4515 *	Return 0 if successful or a negative errno code on error.
4516 */
4517
4518int dev_set_allmulti(struct net_device *dev, int inc)
4519{
4520	unsigned int old_flags = dev->flags;
4521
4522	ASSERT_RTNL();
4523
4524	dev->flags |= IFF_ALLMULTI;
4525	dev->allmulti += inc;
4526	if (dev->allmulti == 0) {
4527		/*
4528		 * Avoid overflow.
4529		 * If inc causes overflow, untouch allmulti and return error.
4530		 */
4531		if (inc < 0)
4532			dev->flags &= ~IFF_ALLMULTI;
4533		else {
4534			dev->allmulti -= inc;
4535			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4536				dev->name);
4537			return -EOVERFLOW;
4538		}
4539	}
4540	if (dev->flags ^ old_flags) {
4541		dev_change_rx_flags(dev, IFF_ALLMULTI);
4542		dev_set_rx_mode(dev);
 
 
 
4543	}
4544	return 0;
4545}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4546EXPORT_SYMBOL(dev_set_allmulti);
4547
4548/*
4549 *	Upload unicast and multicast address lists to device and
4550 *	configure RX filtering. When the device doesn't support unicast
4551 *	filtering it is put in promiscuous mode while unicast addresses
4552 *	are present.
4553 */
4554void __dev_set_rx_mode(struct net_device *dev)
4555{
4556	const struct net_device_ops *ops = dev->netdev_ops;
4557
4558	/* dev_open will call this function so the list will stay sane. */
4559	if (!(dev->flags&IFF_UP))
4560		return;
4561
4562	if (!netif_device_present(dev))
4563		return;
4564
4565	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4566		/* Unicast addresses changes may only happen under the rtnl,
4567		 * therefore calling __dev_set_promiscuity here is safe.
4568		 */
4569		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4570			__dev_set_promiscuity(dev, 1);
4571			dev->uc_promisc = true;
4572		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4573			__dev_set_promiscuity(dev, -1);
4574			dev->uc_promisc = false;
4575		}
4576	}
4577
4578	if (ops->ndo_set_rx_mode)
4579		ops->ndo_set_rx_mode(dev);
4580}
4581
/*
 * Locked wrapper around __dev_set_rx_mode(): the call is made between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
4582void dev_set_rx_mode(struct net_device *dev)
4583{
4584	netif_addr_lock_bh(dev);
4585	__dev_set_rx_mode(dev);
4586	netif_addr_unlock_bh(dev);
4587}
4588
4589/**
4590 *	dev_get_flags - get flags reported to userspace
4591 *	@dev: device
4592 *
4593 *	Get the combination of flag bits exported through APIs to userspace.
4594 */
4595unsigned int dev_get_flags(const struct net_device *dev)
4596{
4597	unsigned int flags;
4598
4599	flags = (dev->flags & ~(IFF_PROMISC |
4600				IFF_ALLMULTI |
4601				IFF_RUNNING |
4602				IFF_LOWER_UP |
4603				IFF_DORMANT)) |
4604		(dev->gflags & (IFF_PROMISC |
4605				IFF_ALLMULTI));
4606
4607	if (netif_running(dev)) {
4608		if (netif_oper_up(dev))
4609			flags |= IFF_RUNNING;
4610		if (netif_carrier_ok(dev))
4611			flags |= IFF_LOWER_UP;
4612		if (netif_dormant(dev))
4613			flags |= IFF_DORMANT;
4614	}
4615
4616	return flags;
4617}
4618EXPORT_SYMBOL(dev_get_flags);
4619
4620int __dev_change_flags(struct net_device *dev, unsigned int flags)
4621{
4622	unsigned int old_flags = dev->flags;
4623	int ret;
4624
4625	ASSERT_RTNL();
4626
4627	/*
4628	 *	Set the flags on our device.
4629	 */
4630
4631	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4632			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4633			       IFF_AUTOMEDIA)) |
4634		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4635				    IFF_ALLMULTI));
4636
4637	/*
4638	 *	Load in the correct multicast list now the flags have changed.
4639	 */
4640
4641	if ((old_flags ^ flags) & IFF_MULTICAST)
4642		dev_change_rx_flags(dev, IFF_MULTICAST);
4643
4644	dev_set_rx_mode(dev);
4645
4646	/*
4647	 *	Have we downed the interface. We handle IFF_UP ourselves
4648	 *	according to user attempts to set it, rather than blindly
4649	 *	setting it.
4650	 */
4651
4652	ret = 0;
4653	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4654		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4655
4656		if (!ret)
4657			dev_set_rx_mode(dev);
4658	}
4659
4660	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4661		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 
4662
4663		dev->gflags ^= IFF_PROMISC;
4664		dev_set_promiscuity(dev, inc);
 
 
 
4665	}
4666
4667	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4668	   is important. Some (broken) drivers set IFF_PROMISC, when
4669	   IFF_ALLMULTI is requested not asking us and not reporting.
4670	 */
4671	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4672		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4673
4674		dev->gflags ^= IFF_ALLMULTI;
4675		dev_set_allmulti(dev, inc);
4676	}
4677
4678	return ret;
4679}
4680
4681void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
 
4682{
4683	unsigned int changes = dev->flags ^ old_flags;
4684
 
 
 
4685	if (changes & IFF_UP) {
4686		if (dev->flags & IFF_UP)
4687			call_netdevice_notifiers(NETDEV_UP, dev);
4688		else
4689			call_netdevice_notifiers(NETDEV_DOWN, dev);
4690	}
4691
4692	if (dev->flags & IFF_UP &&
4693	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4694		call_netdevice_notifiers(NETDEV_CHANGE, dev);
 
 
 
 
 
4695}
4696
4697/**
4698 *	dev_change_flags - change device settings
4699 *	@dev: device
4700 *	@flags: device state flags
4701 *
4702 *	Change settings on device based state flags. The flags are
4703 *	in the userspace exported format.
4704 */
4705int dev_change_flags(struct net_device *dev, unsigned int flags)
4706{
4707	int ret;
4708	unsigned int changes, old_flags = dev->flags;
4709
4710	ret = __dev_change_flags(dev, flags);
4711	if (ret < 0)
4712		return ret;
4713
4714	changes = old_flags ^ dev->flags;
4715	if (changes)
4716		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4717
4718	__dev_notify_flags(dev, old_flags);
4719	return ret;
4720}
4721EXPORT_SYMBOL(dev_change_flags);
4722
 
 
 
 
 
 
 
 
 
 
 
4723/**
4724 *	dev_set_mtu - Change maximum transfer unit
4725 *	@dev: device
4726 *	@new_mtu: new transfer unit
4727 *
4728 *	Change the maximum transfer size of the network device.
4729 */
4730int dev_set_mtu(struct net_device *dev, int new_mtu)
4731{
4732	const struct net_device_ops *ops = dev->netdev_ops;
4733	int err;
4734
4735	if (new_mtu == dev->mtu)
4736		return 0;
4737
4738	/*	MTU must be positive.	 */
4739	if (new_mtu < 0)
4740		return -EINVAL;
4741
4742	if (!netif_device_present(dev))
4743		return -ENODEV;
4744
4745	err = 0;
4746	if (ops->ndo_change_mtu)
4747		err = ops->ndo_change_mtu(dev, new_mtu);
4748	else
4749		dev->mtu = new_mtu;
 
 
4750
4751	if (!err && dev->flags & IFF_UP)
4752		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
 
 
 
 
 
 
 
 
4753	return err;
4754}
4755EXPORT_SYMBOL(dev_set_mtu);
4756
4757/**
4758 *	dev_set_group - Change group this device belongs to
4759 *	@dev: device
4760 *	@new_group: group this device should belong to
4761 */
4762void dev_set_group(struct net_device *dev, int new_group)
4763{
4764	dev->group = new_group;
4765}
4766EXPORT_SYMBOL(dev_set_group);
4767
4768/**
4769 *	dev_set_mac_address - Change Media Access Control Address
4770 *	@dev: device
4771 *	@sa: new address
4772 *
4773 *	Change the hardware (MAC) address of the device
4774 */
4775int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4776{
4777	const struct net_device_ops *ops = dev->netdev_ops;
4778	int err;
4779
4780	if (!ops->ndo_set_mac_address)
4781		return -EOPNOTSUPP;
4782	if (sa->sa_family != dev->type)
4783		return -EINVAL;
4784	if (!netif_device_present(dev))
4785		return -ENODEV;
4786	err = ops->ndo_set_mac_address(dev, sa);
4787	if (!err)
4788		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 
 
4789	add_device_randomness(dev->dev_addr, dev->addr_len);
4790	return err;
4791}
4792EXPORT_SYMBOL(dev_set_mac_address);
4793
4794/*
4795 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
 
 
 
 
4796 */
4797static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4798{
4799	int err;
4800	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4801
4802	if (!dev)
4803		return -ENODEV;
4804
4805	switch (cmd) {
4806	case SIOCGIFFLAGS:	/* Get interface flags */
4807		ifr->ifr_flags = (short) dev_get_flags(dev);
4808		return 0;
4809
4810	case SIOCGIFMETRIC:	/* Get the metric on the interface
4811				   (currently unused) */
4812		ifr->ifr_metric = 0;
4813		return 0;
4814
4815	case SIOCGIFMTU:	/* Get the MTU of a device */
4816		ifr->ifr_mtu = dev->mtu;
4817		return 0;
4818
4819	case SIOCGIFHWADDR:
4820		if (!dev->addr_len)
4821			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4822		else
4823			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4824			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4825		ifr->ifr_hwaddr.sa_family = dev->type;
4826		return 0;
4827
4828	case SIOCGIFSLAVE:
4829		err = -EINVAL;
4830		break;
4831
4832	case SIOCGIFMAP:
4833		ifr->ifr_map.mem_start = dev->mem_start;
4834		ifr->ifr_map.mem_end   = dev->mem_end;
4835		ifr->ifr_map.base_addr = dev->base_addr;
4836		ifr->ifr_map.irq       = dev->irq;
4837		ifr->ifr_map.dma       = dev->dma;
4838		ifr->ifr_map.port      = dev->if_port;
4839		return 0;
4840
4841	case SIOCGIFINDEX:
4842		ifr->ifr_ifindex = dev->ifindex;
4843		return 0;
4844
4845	case SIOCGIFTXQLEN:
4846		ifr->ifr_qlen = dev->tx_queue_len;
4847		return 0;
4848
4849	default:
4850		/* dev_ioctl() should ensure this case
4851		 * is never reached
4852		 */
4853		WARN_ON(1);
4854		err = -ENOTTY;
4855		break;
4856
4857	}
4858	return err;
4859}
4860
4861/*
4862 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4863 */
4864static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4865{
4866	int err;
4867	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4868	const struct net_device_ops *ops;
4869
4870	if (!dev)
4871		return -ENODEV;
4872
4873	ops = dev->netdev_ops;
4874
4875	switch (cmd) {
4876	case SIOCSIFFLAGS:	/* Set interface flags */
4877		return dev_change_flags(dev, ifr->ifr_flags);
4878
4879	case SIOCSIFMETRIC:	/* Set the metric on the interface
4880				   (currently unused) */
4881		return -EOPNOTSUPP;
4882
4883	case SIOCSIFMTU:	/* Set the MTU of a device */
4884		return dev_set_mtu(dev, ifr->ifr_mtu);
4885
4886	case SIOCSIFHWADDR:
4887		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4888
4889	case SIOCSIFHWBROADCAST:
4890		if (ifr->ifr_hwaddr.sa_family != dev->type)
4891			return -EINVAL;
4892		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4893		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4894		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4895		return 0;
4896
4897	case SIOCSIFMAP:
4898		if (ops->ndo_set_config) {
4899			if (!netif_device_present(dev))
4900				return -ENODEV;
4901			return ops->ndo_set_config(dev, &ifr->ifr_map);
4902		}
4903		return -EOPNOTSUPP;
4904
4905	case SIOCADDMULTI:
4906		if (!ops->ndo_set_rx_mode ||
4907		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4908			return -EINVAL;
4909		if (!netif_device_present(dev))
4910			return -ENODEV;
4911		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4912
4913	case SIOCDELMULTI:
4914		if (!ops->ndo_set_rx_mode ||
4915		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4916			return -EINVAL;
4917		if (!netif_device_present(dev))
4918			return -ENODEV;
4919		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4920
4921	case SIOCSIFTXQLEN:
4922		if (ifr->ifr_qlen < 0)
4923			return -EINVAL;
4924		dev->tx_queue_len = ifr->ifr_qlen;
4925		return 0;
4926
4927	case SIOCSIFNAME:
4928		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4929		return dev_change_name(dev, ifr->ifr_newname);
4930
4931	case SIOCSHWTSTAMP:
4932		err = net_hwtstamp_validate(ifr);
4933		if (err)
4934			return err;
4935		/* fall through */
4936
4937	/*
4938	 *	Unknown or private ioctl
4939	 */
4940	default:
4941		if ((cmd >= SIOCDEVPRIVATE &&
4942		    cmd <= SIOCDEVPRIVATE + 15) ||
4943		    cmd == SIOCBONDENSLAVE ||
4944		    cmd == SIOCBONDRELEASE ||
4945		    cmd == SIOCBONDSETHWADDR ||
4946		    cmd == SIOCBONDSLAVEINFOQUERY ||
4947		    cmd == SIOCBONDINFOQUERY ||
4948		    cmd == SIOCBONDCHANGEACTIVE ||
4949		    cmd == SIOCGMIIPHY ||
4950		    cmd == SIOCGMIIREG ||
4951		    cmd == SIOCSMIIREG ||
4952		    cmd == SIOCBRADDIF ||
4953		    cmd == SIOCBRDELIF ||
4954		    cmd == SIOCSHWTSTAMP ||
4955		    cmd == SIOCWANDEV) {
4956			err = -EOPNOTSUPP;
4957			if (ops->ndo_do_ioctl) {
4958				if (netif_device_present(dev))
4959					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4960				else
4961					err = -ENODEV;
4962			}
4963		} else
4964			err = -EINVAL;
4965
4966	}
4967	return err;
4968}
4969
4970/*
4971 *	This function handles all "interface"-type I/O control requests. The actual
4972 *	'doing' part of this is dev_ifsioc above.
4973 */
4974
4975/**
4976 *	dev_ioctl	-	network device ioctl
4977 *	@net: the applicable net namespace
4978 *	@cmd: command to issue
4979 *	@arg: pointer to a struct ifreq in user space
4980 *
4981 *	Issue ioctl functions to devices. This is normally called by the
4982 *	user space syscall interfaces but can sometimes be useful for
4983 *	other purposes. The return value is the return from the syscall if
4984 *	positive or a negative errno code on error.
 *
 *	SIOCGIFCONF and SIOCGIFNAME are dispatched before anything is
 *	copied from @arg. Every other command works on a kernel copy of
 *	the struct ifreq; -EFAULT is returned when that copy, or the
 *	copy back to user space, fails. Commands in the privileged groups
 *	return -EPERM without CAP_NET_ADMIN. Commands that match nothing
 *	here return -ENOTTY.
4985 */
4986
4987int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4988{
4989	struct ifreq ifr;
4990	int ret;
4991	char *colon;
4992
4993	/* One special case: SIOCGIFCONF takes ifconf argument
4994	   and requires shared lock, because it sleeps writing
4995	   to user space.
4996	 */
4997
4998	if (cmd == SIOCGIFCONF) {
4999		rtnl_lock();
5000		ret = dev_ifconf(net, (char __user *) arg);
5001		rtnl_unlock();
5002		return ret;
5003	}
5004	if (cmd == SIOCGIFNAME)
5005		return dev_ifname(net, (struct ifreq __user *)arg);
5006
	/* All remaining commands operate on a kernel copy of the ifreq. */
5007	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5008		return -EFAULT;
5009
	/* The name came from user space: force NUL termination. */
5010	ifr.ifr_name[IFNAMSIZ-1] = 0;
5011
	/* Cut the name at the first ':' for the device lookup. The
	 * branches below that copy the ifreq back with the "if (colon)"
	 * test put the ':' back first.
	 */
5012	colon = strchr(ifr.ifr_name, ':');
5013	if (colon)
5014		*colon = 0;
5015
5016	/*
5017	 *	See which interface the caller is talking about.
5018	 */
5019
5020	switch (cmd) {
5021	/*
5022	 *	These ioctl calls:
5023	 *	- can be done by all.
5024	 *	- atomic and do not require locking.
5025	 *	- return a value
5026	 */
5027	case SIOCGIFFLAGS:
5028	case SIOCGIFMETRIC:
5029	case SIOCGIFMTU:
5030	case SIOCGIFHWADDR:
5031	case SIOCGIFSLAVE:
5032	case SIOCGIFMAP:
5033	case SIOCGIFINDEX:
5034	case SIOCGIFTXQLEN:
5035		dev_load(net, ifr.ifr_name);
		/* Read-only queries run under rcu_read_lock() only. */
5036		rcu_read_lock();
5037		ret = dev_ifsioc_locked(net, &ifr, cmd);
5038		rcu_read_unlock();
5039		if (!ret) {
5040			if (colon)
5041				*colon = ':';
5042			if (copy_to_user(arg, &ifr,
5043					 sizeof(struct ifreq)))
5044				ret = -EFAULT;
5045		}
5046		return ret;
5047
5048	case SIOCETHTOOL:
5049		dev_load(net, ifr.ifr_name);
5050		rtnl_lock();
5051		ret = dev_ethtool(net, &ifr);
5052		rtnl_unlock();
5053		if (!ret) {
5054			if (colon)
5055				*colon = ':';
5056			if (copy_to_user(arg, &ifr,
5057					 sizeof(struct ifreq)))
5058				ret = -EFAULT;
5059		}
5060		return ret;
5061
5062	/*
5063	 *	These ioctl calls:
5064	 *	- require superuser power.
5065	 *	- require strict serialization.
5066	 *	- return a value
5067	 */
5068	case SIOCGMIIPHY:
5069	case SIOCGMIIREG:
5070	case SIOCSIFNAME:
5071		if (!capable(CAP_NET_ADMIN))
5072			return -EPERM;
5073		dev_load(net, ifr.ifr_name);
5074		rtnl_lock();
5075		ret = dev_ifsioc(net, &ifr, cmd);
5076		rtnl_unlock();
5077		if (!ret) {
5078			if (colon)
5079				*colon = ':';
5080			if (copy_to_user(arg, &ifr,
5081					 sizeof(struct ifreq)))
5082				ret = -EFAULT;
5083		}
5084		return ret;
5085
5086	/*
5087	 *	These ioctl calls:
5088	 *	- require superuser power.
5089	 *	- require strict serialization.
5090	 *	- do not return a value
5091	 */
5092	case SIOCSIFFLAGS:
5093	case SIOCSIFMETRIC:
5094	case SIOCSIFMTU:
5095	case SIOCSIFMAP:
5096	case SIOCSIFHWADDR:
5097	case SIOCSIFSLAVE:
5098	case SIOCADDMULTI:
5099	case SIOCDELMULTI:
5100	case SIOCSIFHWBROADCAST:
5101	case SIOCSIFTXQLEN:
5102	case SIOCSMIIREG:
5103	case SIOCBONDENSLAVE:
5104	case SIOCBONDRELEASE:
5105	case SIOCBONDSETHWADDR:
5106	case SIOCBONDCHANGEACTIVE:
5107	case SIOCBRADDIF:
5108	case SIOCBRDELIF:
5109	case SIOCSHWTSTAMP:
5110		if (!capable(CAP_NET_ADMIN))
5111			return -EPERM;
5112		/* fall through */
	/* The two bonding queries below skip the capability check but
	 * share the rest of the path; nothing is copied back to @arg.
	 */
5113	case SIOCBONDSLAVEINFOQUERY:
5114	case SIOCBONDINFOQUERY:
5115		dev_load(net, ifr.ifr_name);
5116		rtnl_lock();
5117		ret = dev_ifsioc(net, &ifr, cmd);
5118		rtnl_unlock();
5119		return ret;
5120
5121	case SIOCGIFMEM:
5122		/* Get the per device memory space. We can add this but
5123		 * currently do not support it */
5124	case SIOCSIFMEM:
5125		/* Set the per device memory buffer space.
5126		 * Not applicable in our case */
5127	case SIOCSIFLINK:
5128		return -ENOTTY;
5129
5130	/*
5131	 *	Unknown or private ioctl.
5132	 */
5133	default:
5134		if (cmd == SIOCWANDEV ||
5135		    (cmd >= SIOCDEVPRIVATE &&
5136		     cmd <= SIOCDEVPRIVATE + 15)) {
5137			dev_load(net, ifr.ifr_name);
5138			rtnl_lock();
5139			ret = dev_ifsioc(net, &ifr, cmd);
5140			rtnl_unlock();
			/* NOTE(review): unlike the branches above, this
			 * copy-back does not restore a ':' cut from
			 * ifr_name - confirm that this is intended.
			 */
5141			if (!ret && copy_to_user(arg, &ifr,
5142						 sizeof(struct ifreq)))
5143				ret = -EFAULT;
5144			return ret;
5145		}
5146		/* Take care of Wireless Extensions */
5147		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5148			return wext_handle_ioctl(net, &ifr, cmd, arg);
5149		return -ENOTTY;
5150	}
5151}
5152
5153
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	A single static counter is advanced, wrapping back to 1 once it
 *	is no longer positive, until a value is found that
 *	__dev_get_by_index() does not know in @net.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		++ifindex;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5172
5173/* Delayed registration/unregisteration */
5174static LIST_HEAD(net_todo_list);
 
5175
5176static void net_set_todo(struct net_device *dev)
5177{
5178	list_add_tail(&dev->todo_list, &net_todo_list);
 
5179}
5180
/*
 *	Tear down every device linked on @head through its unreg_list
 *	member. Must run with the RTNL held and after the boot phase.
 *
 *	Phases, in order: drop entries that were never registered; close
 *	the remaining devices; unlist them and mark them
 *	NETREG_UNREGISTERING; after synchronize_net(), shut down queueing,
 *	notify, flush address lists, call ndo_uninit and remove the kobject
 *	for each device; send one NETDEV_UNREGISTER_BATCH; finally drop one
 *	reference per device.
 */
5181static void rollback_registered_many(struct list_head *head)
5182{
5183	struct net_device *dev, *tmp;

5184
5185	BUG_ON(dev_boot_phase);
5186	ASSERT_RTNL();
5187
5188	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5189		/* Some devices call without registering
5190		 * for initialization unwind. Remove those
5191		 * devices and proceed with the remaining.
5192		 */
5193		if (dev->reg_state == NETREG_UNINITIALIZED) {
5194			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5195				 dev->name, dev);
5196
5197			WARN_ON(1);
5198			list_del(&dev->unreg_list);
5199			continue;
5200		}
5201		dev->dismantle = true;
		/* Any state other than REGISTERED is fatal at this point. */
5202		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5203	}
5204
5205	/* If device is running, close it first. */
5206	dev_close_many(head);


5207
5208	list_for_each_entry(dev, head, unreg_list) {
5209		/* And unlink it from device chain. */
5210		unlist_netdevice(dev);
5211
5212		dev->reg_state = NETREG_UNREGISTERING;
5213	}
5214
5215	synchronize_net();
5216
5217	list_for_each_entry(dev, head, unreg_list) {
5218		/* Shutdown queueing discipline. */
5219		dev_shutdown(dev);
5220
5221
5222		/* Notify protocols, that we are about to destroy
5223		   this device. They should clean all the things.
5224		*/
5225		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5226
		/* Same condition as used for RTM_NEWLINK in
		 * register_netdevice().
		 */
5227		if (!dev->rtnl_link_ops ||
5228		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5229			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5230
5231		/*
5232		 *	Flush the unicast and multicast chains
5233		 */
5234		dev_uc_flush(dev);
5235		dev_mc_flush(dev);
5236
5237		if (dev->netdev_ops->ndo_uninit)
5238			dev->netdev_ops->ndo_uninit(dev);
5239
5240		/* Notifier chain MUST detach us from master device. */
5241		WARN_ON(dev->master);
5242
5243		/* Remove entries from kobject tree */
5244		netdev_unregister_kobject(dev);




5245	}
5246
5247	/* Process any work delayed until the end of the batch */
	/* NOTE(review): if the first loop removed every entry, @head is
	 * empty here and list_first_entry() does not yield a real device -
	 * confirm that callers cannot reach this with such a list.
	 */
5248	dev = list_first_entry(head, struct net_device, unreg_list);
5249	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5250
5251	synchronize_net();
5252
	/* Presumably pairs with the dev_hold() in register_netdevice(). */
5253	list_for_each_entry(dev, head, unreg_list)
5254		dev_put(dev);
5255}
5256
5257static void rollback_registered(struct net_device *dev)
5258{
5259	LIST_HEAD(single);
5260
5261	list_add(&dev->unreg_list, &single);
5262	rollback_registered_many(&single);
5263	list_del(&single);
5264}
5265
5266static netdev_features_t netdev_fix_features(struct net_device *dev,
5267	netdev_features_t features)
5268{
5269	/* Fix illegal checksum combinations */
5270	if ((features & NETIF_F_HW_CSUM) &&
5271	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5272		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5273		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5274	}
5275
5276	/* Fix illegal SG+CSUM combinations. */
5277	if ((features & NETIF_F_SG) &&
5278	    !(features & NETIF_F_ALL_CSUM)) {
5279		netdev_dbg(dev,
5280			"Dropping NETIF_F_SG since no checksum feature.\n");
5281		features &= ~NETIF_F_SG;
5282	}
5283
5284	/* TSO requires that SG is present as well. */
5285	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5286		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5287		features &= ~NETIF_F_ALL_TSO;
5288	}
5289
 
 
 
 
 
 
 
 
 
 
 
 
 
5290	/* TSO ECN requires that TSO is present as well. */
5291	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5292		features &= ~NETIF_F_TSO_ECN;
5293
5294	/* Software GSO depends on SG. */
5295	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5296		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5297		features &= ~NETIF_F_GSO;
5298	}
5299
5300	/* UFO needs SG and checksumming */
5301	if (features & NETIF_F_UFO) {
5302		/* maybe split UFO into V4 and V6? */
5303		if (!((features & NETIF_F_GEN_CSUM) ||
5304		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5305			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5306			netdev_dbg(dev,
5307				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5308			features &= ~NETIF_F_UFO;
5309		}
5310
5311		if (!(features & NETIF_F_SG)) {
5312			netdev_dbg(dev,
5313				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5314			features &= ~NETIF_F_UFO;
5315		}
5316	}
5317
 
 
 
 
 
 
 
5318	return features;
5319}
5320
5321int __netdev_update_features(struct net_device *dev)
5322{
5323	netdev_features_t features;
5324	int err = 0;
5325
5326	ASSERT_RTNL();
5327
5328	features = netdev_get_wanted_features(dev);
5329
5330	if (dev->netdev_ops->ndo_fix_features)
5331		features = dev->netdev_ops->ndo_fix_features(dev, features);
5332
5333	/* driver might be less strict about feature dependencies */
5334	features = netdev_fix_features(dev, features);
5335
5336	if (dev->features == features)
5337		return 0;
5338
5339	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5340		&dev->features, &features);
5341
5342	if (dev->netdev_ops->ndo_set_features)
5343		err = dev->netdev_ops->ndo_set_features(dev, features);
5344
5345	if (unlikely(err < 0)) {
5346		netdev_err(dev,
5347			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5348			err, &features, &dev->features);
5349		return -1;
5350	}
5351
5352	if (!err)
5353		dev->features = features;
5354
5355	return 1;
5356}
5357
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	The notification is sent whenever __netdev_update_features()
 *	returns a non-zero value.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5372
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately unused: the notification goes out regardless. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5389
5390/**
5391 *	netif_stacked_transfer_operstate -	transfer operstate
5392 *	@rootdev: the root or lower level device to transfer state from
5393 *	@dev: the device to transfer operstate to
5394 *
5395 *	Transfer operational state from root to device. This is normally
5396 *	called when a stacking relationship exists between the root
5397 *	device and the device(a leaf device).
5398 */
5399void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5400					struct net_device *dev)
5401{
5402	if (rootdev->operstate == IF_OPER_DORMANT)
5403		netif_dormant_on(dev);
5404	else
5405		netif_dormant_off(dev);
5406
5407	if (netif_carrier_ok(rootdev)) {
5408		if (!netif_carrier_ok(dev))
5409			netif_carrier_on(dev);
5410	} else {
5411		if (netif_carrier_ok(dev))
5412			netif_carrier_off(dev);
5413	}
5414}
5415EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5416
#ifdef CONFIG_RPS
/*
 *	Allocate the zero-filled dev->_rx array with dev->num_rx_queues
 *	entries and point each entry back at @dev.  Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nr_rxq = dev->num_rx_queues;
	struct netdev_rx_queue *rxqs, *q;

	BUG_ON(nr_rxq < 1);

	rxqs = kcalloc(nr_rxq, sizeof(*rxqs), GFP_KERNEL);
	if (rxqs == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues\n", nr_rxq);
		return -ENOMEM;
	}
	dev->_rx = rxqs;

	for (q = rxqs; q < rxqs + nr_rxq; q++)
		q->dev = dev;

	return 0;
}
#endif
5437
5438static void netdev_init_one_queue(struct net_device *dev,
5439				  struct netdev_queue *queue, void *_unused)
5440{
5441	/* Initialize queue lock */
5442	spin_lock_init(&queue->_xmit_lock);
5443	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5444	queue->xmit_lock_owner = -1;
5445	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5446	queue->dev = dev;
5447#ifdef CONFIG_BQL
5448	dql_init(&queue->dql, HZ);
5449#endif
5450}
5451
 
 
 
 
 
 
 
 
5452static int netif_alloc_netdev_queues(struct net_device *dev)
5453{
5454	unsigned int count = dev->num_tx_queues;
5455	struct netdev_queue *tx;
 
5456
5457	BUG_ON(count < 1);
5458
5459	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5460	if (!tx) {
5461		pr_err("netdev: Unable to allocate %u tx queues\n", count);
5462		return -ENOMEM;
 
5463	}
5464	dev->_tx = tx;
5465
5466	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5467	spin_lock_init(&dev->tx_global_lock);
5468
5469	return 0;
5470}
5471
5472/**
5473 *	register_netdevice	- register a network device
5474 *	@dev: device to register
5475 *
5476 *	Take a completed network device structure and add it to the kernel
5477 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5478 *	chain. 0 is returned on success. A negative errno code is returned
5479 *	on a failure to set up the device, or if the name is a duplicate.
5480 *
5481 *	Callers must hold the rtnl semaphore. You may want
5482 *	register_netdev() instead of this.
5483 *
5484 *	BUGS:
5485 *	The locking appears insufficient to guarantee two parallel registers
5486 *	will not get the same name.
5487 */
5488
5489int register_netdevice(struct net_device *dev)
5490{
5491	int ret;
5492	struct net *net = dev_net(dev);
5493
5494	BUG_ON(dev_boot_phase);
5495	ASSERT_RTNL();
5496
5497	might_sleep();
5498
5499	/* When net_device's are persistent, this will be fatal. */
5500	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5501	BUG_ON(!net);
5502
5503	spin_lock_init(&dev->addr_list_lock);
5504	netdev_set_addr_lockdep_class(dev);
5505
	/* -1 marks iflink as "not set"; it is checked again once
	 * ndo_init has run.
	 */
5506	dev->iflink = -1;
5507
5508	ret = dev_get_valid_name(dev, dev->name);
5509	if (ret < 0)
5510		goto out;
5511
5512	/* Init, if this function is available */
5513	if (dev->netdev_ops->ndo_init) {
5514		ret = dev->netdev_ops->ndo_init(dev);
5515		if (ret) {
			/* A positive result from the driver is reported as -EIO. */
5516			if (ret > 0)
5517				ret = -EIO;
5518			goto out;
5519		}
5520	}
5521
5522	dev->ifindex = dev_new_index(net);














	/* Still unset after ndo_init: default to the device's own index. */
5523	if (dev->iflink == -1)
5524		dev->iflink = dev->ifindex;
5525
5526	/* Transfer changeable features to wanted_features and enable
5527	 * software offloads (GSO and GRO).
5528	 */
5529	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5530	dev->features |= NETIF_F_SOFT_FEATURES;
5531	dev->wanted_features = dev->features & dev->hw_features;
5532
5533	/* Turn on no cache copy if HW is doing checksum */
5534	if (!(dev->flags & IFF_LOOPBACK)) {
5535		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5536		if (dev->features & NETIF_F_ALL_CSUM) {
5537			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5538			dev->features |= NETIF_F_NOCACHE_COPY;
5539		}
5540	}
5541
5542	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5543	 */
5544	dev->vlan_features |= NETIF_F_HIGHDMA;
5545








	/* From here on a failure must undo ndo_init, hence err_uninit. */
5546	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5547	ret = notifier_to_errno(ret);
5548	if (ret)
5549		goto err_uninit;
5550
5551	ret = netdev_register_kobject(dev);
5552	if (ret)
5553		goto err_uninit;
5554	dev->reg_state = NETREG_REGISTERED;
5555
5556	__netdev_update_features(dev);
5557
5558	/*
5559	 *	Default initial state at registry is that the
5560	 *	device is present.
5561	 */
5562
5563	set_bit(__LINK_STATE_PRESENT, &dev->state);
5564


5565	dev_init_scheduler(dev);
	/* Reference taken at registration; presumably released by the
	 * dev_put() at the end of rollback_registered_many().
	 */
5566	dev_hold(dev);
5567	list_netdevice(dev);
5568	add_device_randomness(dev->dev_addr, dev->addr_len);
5569







5570	/* Notify protocols, that a new device appeared. */
5571	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5572	ret = notifier_to_errno(ret);
5573	if (ret) {
		/* A notifier vetoed the device: undo the registration. */
5574		rollback_registered(dev);
5575		dev->reg_state = NETREG_UNREGISTERED;
5576	}
5577	/*
5578	 *	Prevent userspace races by waiting until the network
5579	 *	device is fully setup before sending notifications.
5580	 */
	/* NOTE(review): this also runs after the rollback above, so
	 * RTM_NEWLINK is sent for a device that was just torn down -
	 * confirm that this is intended.
	 */
5581	if (!dev->rtnl_link_ops ||
5582	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5583		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5584
5585out:
5586	return ret;
5587
5588err_uninit:
	/* Counterpart of the ndo_init call made earlier. */
5589	if (dev->netdev_ops->ndo_uninit)
5590		dev->netdev_ops->ndo_uninit(dev);
5591	goto out;
5592}
5593EXPORT_SYMBOL(register_netdevice);
5594
5595/**
5596 *	init_dummy_netdev	- init a dummy network device for NAPI
5597 *	@dev: device to init
5598 *
5599 *	This takes a network device structure and initialize the minimum
5600 *	amount of fields so it can be used to schedule NAPI polls without
5601 *	registering a full blown interface. This is to be used by drivers
5602 *	that need to tie several hardware interfaces to a single NAPI
5603 *	poll scheduler due to HW limitations.
5604 */
5605int init_dummy_netdev(struct net_device *dev)
5606{
5607	/* Clear everything. Note we don't initialize spinlocks
5608	 * are they aren't supposed to be taken by any of the
5609	 * NAPI code and this dummy netdev is supposed to be
5610	 * only ever used for NAPI polls
5611	 */
5612	memset(dev, 0, sizeof(struct net_device));
5613
5614	/* make sure we BUG if trying to hit standard
5615	 * register/unregister code path
5616	 */
5617	dev->reg_state = NETREG_DUMMY;
5618
5619	/* NAPI wants this */
5620	INIT_LIST_HEAD(&dev->napi_list);
5621
5622	/* a dummy interface is started by default */
5623	set_bit(__LINK_STATE_PRESENT, &dev->state);
5624	set_bit(__LINK_STATE_START, &dev->state);
5625
5626	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5627	 * because users of this 'device' dont need to change
5628	 * its refcount.
5629	 */
5630
5631	return 0;
5632}
5633EXPORT_SYMBOL_GPL(init_dummy_netdev);
5634
5635
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
5659
5660int netdev_refcnt_read(const struct net_device *dev)
5661{
5662	int i, refcnt = 0;
5663
5664	for_each_possible_cpu(i)
5665		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5666	return refcnt;
5667}
5668EXPORT_SYMBOL(netdev_refcnt_read);
5669
5670/*
5671 * netdev_wait_allrefs - wait until all references are gone.

5672 *
5673 * This is called when unregistering network devices.
5674 *
5675 * Any protocol or device that holds a reference should register
5676 * for netdevice notification, and cleanup and put back the
5677 * reference if they receive an UNREGISTER event.
5678 * We can get stuck here if buggy protocols don't correctly
5679 * call dev_put.
 *
 * The count is polled every 250 ms. NETDEV_UNREGISTER is sent again
 * under the RTNL once more than a second has passed since the last
 * broadcast, and an emergency message is logged once more than ten
 * seconds have passed since the last warning.
5680 */
5681static void netdev_wait_allrefs(struct net_device *dev)
5682{
5683	unsigned long rebroadcast_time, warning_time;
5684	int refcnt;
5685
5686	linkwatch_forget_dev(dev);
5687
5688	rebroadcast_time = warning_time = jiffies;
5689	refcnt = netdev_refcnt_read(dev);
5690
5691	while (refcnt != 0) {
5692		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5693			rtnl_lock();
5694
5695			/* Rebroadcast unregister notification */
5696			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5697			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5698			 * should have already handle it the first time */
5699





5700			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5701				     &dev->state)) {
5702				/* We must not have linkwatch events
5703				 * pending on unregister. If this
5704				 * happens, we simply run the queue
5705				 * unscheduled, resulting in a noop
5706				 * for this device.
5707				 */
5708				linkwatch_run_queue();
5709			}
5710
			/* __rtnl_unlock() rather than rtnl_unlock():
			 * presumably to avoid re-entering netdev_run_todo(),
			 * which is what called us - confirm.
			 */
5711			__rtnl_unlock();
5712
5713			rebroadcast_time = jiffies;
5714		}
5715
5716		msleep(250);
5717
5718		refcnt = netdev_refcnt_read(dev);
5719
		/* Rate-limit the complaint to one per ten seconds. */
5720		if (time_after(jiffies, warning_time + 10 * HZ)) {
5721			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5722				 dev->name, refcnt);
5723			warning_time = jiffies;
5724		}
5725	}
5726}
5727
5728/* The sequence is:
5729 *
5730 *	rtnl_lock();
5731 *	...
5732 *	register_netdevice(x1);
5733 *	register_netdevice(x2);
5734 *	...
5735 *	unregister_netdevice(y1);
5736 *	unregister_netdevice(y2);
5737 *      ...
5738 *	rtnl_unlock();
5739 *	free_netdev(y1);
5740 *	free_netdev(y2);
5741 *
5742 * We are invoked by rtnl_unlock().
5743 * This allows us to deal with problems:
5744 * 1) We can delete sysfs objects which invoke hotplug
5745 *    without deadlocking with linkwatch via keventd.
5746 * 2) Since we run with the RTNL semaphore not held, we can sleep
5747 *    safely in order to wait for the netdev refcnt to drop to zero.
5748 *
5749 * We must not return until all unregister events added during
5750 * the interval the lock was held have been completed.
5751 */
5752void netdev_run_todo(void)
5753{
5754	struct list_head list;
5755
5756	/* Snapshot list, allow later requests */
5757	list_replace_init(&net_todo_list, &list);
5758
	/* The lock is dropped here; everything below runs without it. */
5759	__rtnl_unlock();
5760
5761	/* Wait for rcu callbacks to finish before attempting to drain
5762	 * the device list.  This usually avoids a 250ms wait.
5763	 */
5764	if (!list_empty(&list))
5765		rcu_barrier();
5766
5767	while (!list_empty(&list)) {
5768		struct net_device *dev
5769			= list_first_entry(&list, struct net_device, todo_list);
5770		list_del(&dev->todo_list);
5771




		/* Only devices marked NETREG_UNREGISTERING are processed;
		 * anything else is reported and skipped.
		 */
5772		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5773			pr_err("network todo '%s' but state %d\n",
5774			       dev->name, dev->reg_state);
5775			dump_stack();
5776			continue;
5777		}
5778
5779		dev->reg_state = NETREG_UNREGISTERED;
5780
		/* Run flush_backlog for this device on every CPU. */
5781		on_each_cpu(flush_backlog, dev, 1);
5782
		/* Blocks until the reference count reads zero. */
5783		netdev_wait_allrefs(dev);
5784
5785		/* paranoia */
5786		BUG_ON(netdev_refcnt_read(dev));
5787		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5788		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5789		WARN_ON(dev->dn_ptr);
5790
5791		if (dev->destructor)
5792			dev->destructor(dev);
5793






5794		/* Free network device */
5795		kobject_put(&dev->dev.kobj);
5796	}
5797}
5798
5799/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5800 * fields in the same order, with only the type differing.
5801 */
5802void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5803			     const struct net_device_stats *netdev_stats)
5804{
5805#if BITS_PER_LONG == 64
5806	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5807	memcpy(stats64, netdev_stats, sizeof(*stats64));
5808#else
5809	size_t i, n = sizeof(*stats64) / sizeof(u64);
5810	const unsigned long *src = (const unsigned long *)netdev_stats;
5811	u64 *dst = (u64 *)stats64;
5812
5813	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5814		     sizeof(*stats64) / sizeof(u64));
5815	for (i = 0; i < n; i++)
5816		dst[i] = src[i];
5817#endif
5818}
5819EXPORT_SYMBOL(netdev_stats_to_stats64);
5820
5821/**
5822 *	dev_get_stats	- get network device statistics
5823 *	@dev: device to get statistics from
5824 *	@storage: place to store stats
5825 *
5826 *	Get network statistics from device. Return @storage.
5827 *	The device driver may provide its own method by setting
5828 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5829 *	otherwise the internal statistics structure is used.
5830 */
5831struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5832					struct rtnl_link_stats64 *storage)
5833{
5834	const struct net_device_ops *ops = dev->netdev_ops;
5835
5836	if (ops->ndo_get_stats64) {
5837		memset(storage, 0, sizeof(*storage));
5838		ops->ndo_get_stats64(dev, storage);
5839	} else if (ops->ndo_get_stats) {
5840		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5841	} else {
5842		netdev_stats_to_stats64(storage, &dev->stats);
5843	}
5844	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 
5845	return storage;
5846}
5847EXPORT_SYMBOL(dev_get_stats);
5848
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised, pointed at noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.  Without
 * CONFIG_NET_CLS_ACT whatever dev_ingress_queue() reports is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5867/**
5868 *	alloc_netdev_mqs - allocate network device
5869 *	@sizeof_priv:	size of private data to allocate space for
5870 *	@name:		device name format string (must fit in dev->name)
5871 *	@setup:		callback to initialize device
5872 *	@txqs:		the number of TX subqueues to allocate (at least 1)
5873 *	@rxqs:		the number of RX subqueues to allocate (at least 1 with CONFIG_RPS)
5874 *
5875 *	Allocates a struct net_device with private data area for driver use
5876 *	and performs basic initialization.  Also allocates subqueue structs
5877 *	for each queue on the device.  Returns the device, or NULL on failure.
5878 */
5879struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5880		void (*setup)(struct net_device *),
5881		unsigned int txqs, unsigned int rxqs)
5882{
5883	struct net_device *dev;
5884	size_t alloc_size;
5885	struct net_device *p;
5886
	/* The name is copied with strcpy() below, so it must fit. */
5887	BUG_ON(strlen(name) >= sizeof(dev->name));
5888
5889	if (txqs < 1) {
5890		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5891		return NULL;
5892	}
5893
5894#ifdef CONFIG_RPS
5895	if (rxqs < 1) {
5896		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5897		return NULL;
5898	}
5899#endif
5900
5901	alloc_size = sizeof(struct net_device);
5902	if (sizeof_priv) {
5903		/* round up to NETDEV_ALIGN so the private area starts aligned */
5904		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5905		alloc_size += sizeof_priv;
5906	}
5907	/* slack so the whole construct can be shifted to a NETDEV_ALIGN boundary */
5908	alloc_size += NETDEV_ALIGN - 1;
5909
5910	p = kzalloc(alloc_size, GFP_KERNEL);
5911	if (!p) {
5912		pr_err("alloc_netdev: Unable to allocate device\n");
 
5913		return NULL;
5914	}
5915
	/* dev is the aligned view of p; padded records the shift so that
	 * free_netdev() can recover the pointer that was actually allocated.
	 */
5916	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5917	dev->padded = (char *)dev - (char *)p;
5918
5919	dev->pcpu_refcnt = alloc_percpu(int);
5920	if (!dev->pcpu_refcnt)
5921		goto free_p;
5922
5923	if (dev_addr_init(dev))
5924		goto free_pcpu;
5925
5926	dev_mc_init(dev);
5927	dev_uc_init(dev);
5928
5929	dev_net_set(dev, &init_net);
5930
5931	dev->gso_max_size = GSO_MAX_SIZE;
5932	dev->gso_max_segs = GSO_MAX_SEGS;
5933
5934	INIT_LIST_HEAD(&dev->napi_list);
5935	INIT_LIST_HEAD(&dev->unreg_list);
 
5936	INIT_LIST_HEAD(&dev->link_watch_list);
 
 
 
 
	/* Defaults are in place; let the caller's setup() customise the device. */
5937	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5938	setup(dev);
5939
5940	dev->num_tx_queues = txqs;
5941	dev->real_num_tx_queues = txqs;
5942	if (netif_alloc_netdev_queues(dev))
5943		goto free_all;
5944
5945#ifdef CONFIG_RPS
5946	dev->num_rx_queues = rxqs;
5947	dev->real_num_rx_queues = rxqs;
5948	if (netif_alloc_rx_queues(dev))
5949		goto free_all;
5950#endif
5951
5952	strcpy(dev->name, name);
5953	dev->group = INIT_NETDEV_GROUP;
 
 
5954	return dev;
5955
	/* From here on the device is set up far enough for free_netdev(). */
5956free_all:
5957	free_netdev(dev);
5958	return NULL;
5959
	/* Only reached when dev_addr_init() fails, i.e. before any queue was
	 * allocated; _tx/_rx still hold the zeroes from kzalloc() here.
	 */
5960free_pcpu:
5961	free_percpu(dev->pcpu_refcnt);
5962	kfree(dev->_tx);
5963#ifdef CONFIG_RPS
5964	kfree(dev->_rx);
5965#endif
5966
	/* Free the original (unaligned) allocation. */
5967free_p:
5968	kfree(p);
5969	return NULL;
5970}
5971EXPORT_SYMBOL(alloc_netdev_mqs);
5972
5973/**
5974 *	free_netdev - free network device
5975 *	@dev: device
5976 *
5977 *	This function does the last stage of destroying an allocated device
5978 *	interface.  Queues, addresses, NAPI contexts and the per-cpu refcount
5979 *	are released here.  A device that was never registered is freed
5980 *	directly; otherwise the final put_device() lets the device release free it.
5980 */
5981void free_netdev(struct net_device *dev)
5982{
5983	struct napi_struct *p, *n;
5984
5985	release_net(dev_net(dev));
5986
5987	kfree(dev->_tx);
5988#ifdef CONFIG_RPS
5989	kfree(dev->_rx);
5990#endif
5991
5992	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5993
5994	/* Flush device addresses */
5995	dev_addr_flush(dev);
5996
	/* _safe iteration: netif_napi_del() is called on the entry being visited */
5997	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5998		netif_napi_del(p);
5999
6000	free_percpu(dev->pcpu_refcnt);
6001	dev->pcpu_refcnt = NULL;
6002
6003	/* Never registered (e.g. a driver error path): free the raw allocation, undoing the alignment shift */
6004	if (dev->reg_state == NETREG_UNINITIALIZED) {
6005		kfree((char *)dev - dev->padded);
6006		return;
6007	}
6008
	/* Any other state than UNREGISTERED at this point is a bug. */
6009	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6010	dev->reg_state = NETREG_RELEASED;
6011
6012	/* will free via device release */
6013	put_device(&dev->dev);
6014}
6015EXPORT_SYMBOL(free_netdev);
6016
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep.  The expedited RCU wait is used when the rtnl lock is
 *	held, the ordinary one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6032
6033/**
6034 *	unregister_netdevice_queue - remove device from the kernel
6035 *	@dev: device
6036 *	@head: list
6037 *
6038 *	This function shuts down a device interface and removes it
6039 *	from the kernel tables.
6040 *	If head not NULL, device is queued to be unregistered later.
6041 *
6042 *	Callers must hold the rtnl semaphore.  You may want
6043 *	unregister_netdev() instead of this.
6044 */
6045
6046void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6047{
6048	ASSERT_RTNL();
6049
6050	if (head) {
6051		list_move_tail(&dev->unreg_list, head);
6052	} else {
6053		rollback_registered(dev);
6054		/* Finish processing unregister after unlock */
6055		net_set_todo(dev);
6056	}
6057}
6058EXPORT_SYMBOL(unregister_netdevice_queue);
6059
6060/**
6061 *	unregister_netdevice_many - unregister many devices
6062 *	@head: list of devices
6063 */
6064void unregister_netdevice_many(struct list_head *head)
6065{
6066	struct net_device *dev;
6067
6068	if (!list_empty(head)) {
6069		rollback_registered_many(head);
6070		list_for_each_entry(dev, head, unreg_list)
6071			net_set_todo(dev);
6072	}
6073}
6074EXPORT_SYMBOL(unregister_netdevice_many);
6075
6076/**
6077 *	unregister_netdev - remove device from the kernel
6078 *	@dev: device
6079 *
6080 *	This function shuts down a device interface and removes it
6081 *	from the kernel tables.
6082 *
6083 *	This is just a wrapper for unregister_netdevice that takes
6084 *	the rtnl semaphore itself, around the call.  In general you want
6085 *	to use this and not unregister_netdevice.
6086 */
6087void unregister_netdev(struct net_device *dev)
6088{
6089	rtnl_lock();
6090	unregister_netdevice(dev);
6091	rtnl_unlock();
6092}
6093EXPORT_SYMBOL(unregister_netdev);
6094
6095/**
6096 *	dev_change_net_namespace - move device to a different network namespace
6097 *	@dev: device
6098 *	@net: network namespace
6099 *	@pat: If not NULL name pattern to try if the current device name
6100 *	      is already taken in the destination network namespace.
6101 *
6102 *	This function shuts down a device interface and moves it
6103 *	to a new network namespace. On success 0 is returned, on
6104 *	a failure a negative errno code is returned.
6105 *
6106 *	Callers must hold the rtnl semaphore.
6107 */
6108
6109int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6110{
6111	int err;
6112
6113	ASSERT_RTNL();
6114
6115	/* Don't allow namespace local devices to be moved. */
6116	err = -EINVAL;
6117	if (dev->features & NETIF_F_NETNS_LOCAL)
6118		goto out;
6119
6120	/* Ensure the device has been registered */
6121	err = -EINVAL;
6122	if (dev->reg_state != NETREG_REGISTERED)
6123		goto out;
6124
6125	/* Get out if there is nothing to do: already in the target namespace */
6126	err = 0;
6127	if (net_eq(dev_net(dev), net))
6128		goto out;
6129
6130	/* Pick the destination device name, and ensure
6131	 * we can use it in the destination network namespace.
6132	 * Both failure cases below return -EEXIST.
6132	 */
6133	err = -EEXIST;
6134	if (__dev_get_by_name(net, dev->name)) {
6135		/* We get here if we can't use the current device name */
6136		if (!pat)
6137			goto out;
6138		if (dev_get_valid_name(dev, pat) < 0)
6139			goto out;
6140	}
6141
6142	/*
6143	 * And now a mini version of unregister_netdevice followed by
6143	 * register_netdevice.  Nothing below this point jumps to out.
6144	 */
6145
6146	/* If device is running close it first. */
6147	dev_close(dev);
6148
6149	/* And unlink it from device chain */
	/* NOTE(review): this -ENODEV is never returned; err is reassigned
	 * by device_rename() below before any exit.
	 */
6150	err = -ENODEV;
6151	unlist_netdevice(dev);
6152
6153	synchronize_net();
6154
6155	/* Shutdown queueing discipline. */
6156	dev_shutdown(dev);
6157
6158	/* Notify protocols, that we are about to destroy
6159	   this device. They should clean all the things.
6160
6161	   Note that dev->reg_state stays at NETREG_REGISTERED.
6162	   This is wanted because this way 8021q and macvlan know
6163	   the device is just moving and can keep their slaves up.
6164	*/
6165	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6166	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6167	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
6168
6169	/*
6170	 *	Flush the unicast and multicast chains
6171	 */
6172	dev_uc_flush(dev);
6173	dev_mc_flush(dev);
6174
 
 
 
6175	/* Actually switch the network namespace */
6176	dev_net_set(dev, net);
6177
6178	/* If there is an ifindex conflict assign a new one; an iflink that
6178	 * pointed at the device itself is kept pointing at itself.
6178	 */
6179	if (__dev_get_by_index(net, dev->ifindex)) {
6180		int iflink = (dev->iflink == dev->ifindex);
6181		dev->ifindex = dev_new_index(net);
6182		if (iflink)
6183			dev->iflink = dev->ifindex;
6184	}
6185
 
 
 
6186	/* Fixup kobjects; a rename failure only warns, it is not returned */
6187	err = device_rename(&dev->dev, dev->name);
6188	WARN_ON(err);
6189
6190	/* Add the device back in the hashes */
6191	list_netdevice(dev);
6192
6193	/* Notify protocols, that a new device appeared. */
6194	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6195
6196	/*
6197	 *	Prevent userspace races by waiting until the network
6198	 *	device is fully setup before sending notifications.
6199	 */
6200	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6201
6202	synchronize_net();
6203	err = 0;
6204out:
6205	return err;
6206}
6207EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6208
/*
 * CPU notifier callback.  For CPU_DEAD / CPU_DEAD_FROZEN events it moves
 * the pending softnet_data work of the dead CPU (@ocpu carries its number)
 * over to the CPU running this callback; every other action is ignored.
 * Always returns NOTIFY_OK.
 */
6209static int dev_cpu_callback(struct notifier_block *nfb,
6210			    unsigned long action,
6211			    void *ocpu)
6212{
6213	struct sk_buff **list_skb;
6214	struct sk_buff *skb;
6215	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6216	struct softnet_data *sd, *oldsd;
6217
6218	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6219		return NOTIFY_OK;
6220
	/* The queue splicing below runs with local interrupts disabled. */
6221	local_irq_disable();
6222	cpu = smp_processor_id();
6223	sd = &per_cpu(softnet_data, cpu);
6224	oldsd = &per_cpu(softnet_data, oldcpu);
6225
6226	/* Find end of our completion_queue. */
6227	list_skb = &sd->completion_queue;
6228	while (*list_skb)
6229		list_skb = &(*list_skb)->next;
6230	/* Append completion queue from offline CPU. */
6231	*list_skb = oldsd->completion_queue;
6232	oldsd->completion_queue = NULL;
6233
6234	/* Append output queue from offline CPU and reset the old one to empty. */
6235	if (oldsd->output_queue) {
6236		*sd->output_queue_tailp = oldsd->output_queue;
6237		sd->output_queue_tailp = oldsd->output_queue_tailp;
6238		oldsd->output_queue = NULL;
6239		oldsd->output_queue_tailp = &oldsd->output_queue;
6240	}
6241	/* Append NAPI poll list from offline CPU. */
6242	if (!list_empty(&oldsd->poll_list)) {
6243		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6244		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6245	}
6246
6247	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6248	local_irq_enable();
6249
6250	/* Re-inject packets left on the offline CPU's process_queue, then its input_pkt_queue */
6251	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6252		netif_rx(skb);
6253		input_queue_head_incr(oldsd);
6254	}
6255	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6256		netif_rx(skb);
6257		input_queue_head_incr(oldsd);
6258	}
6259
6260	return NOTIFY_OK;
6261}
6262
6263
6264/**
6265 *	netdev_increment_features - increment feature set by one
6266 *	@all: current feature set
6267 *	@one: new feature set
6268 *	@mask: mask feature set
6269 *
6270 *	Computes a new feature set after adding a device with feature set
6271 *	@one to the master device with current feature set @all.  Will not
6272 *	enable anything that is off in @mask. Returns the new feature set.
6273 */
6274netdev_features_t netdev_increment_features(netdev_features_t all,
6275	netdev_features_t one, netdev_features_t mask)
6276{
6277	if (mask & NETIF_F_GEN_CSUM)
6278		mask |= NETIF_F_ALL_CSUM;
6279	mask |= NETIF_F_VLAN_CHALLENGED;
6280
6281	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6282	all &= one | ~NETIF_F_ALL_FOR_ALL;
6283
6284	/* If one device supports hw checksumming, set for all. */
6285	if (all & NETIF_F_GEN_CSUM)
6286		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6287
6288	return all;
6289}
6290EXPORT_SYMBOL(netdev_increment_features);
6291
6292static struct hlist_head *netdev_create_hash(void)
6293{
6294	int i;
6295	struct hlist_head *hash;
6296
6297	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6298	if (hash != NULL)
6299		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6300			INIT_HLIST_HEAD(&hash[i]);
6301
6302	return hash;
6303}
6304
6305/* Initialize per network namespace state */
6306static int __net_init netdev_init(struct net *net)
6307{
6308	if (net != &init_net)
6309		INIT_LIST_HEAD(&net->dev_base_head);
6310
6311	net->dev_name_head = netdev_create_hash();
6312	if (net->dev_name_head == NULL)
6313		goto err_name;
6314
6315	net->dev_index_head = netdev_create_hash();
6316	if (net->dev_index_head == NULL)
6317		goto err_idx;
6318
6319	return 0;
6320
6321err_idx:
6322	kfree(net->dev_name_head);
6323err_name:
6324	return -ENOMEM;
6325}
6326
6327/**
6328 *	netdev_drivername - network driver for the device
6329 *	@dev: network device
6330 *
6331 *	Determine network driver for device.
6332 */
6333const char *netdev_drivername(const struct net_device *dev)
6334{
6335	const struct device_driver *driver;
6336	const struct device *parent;
6337	const char *empty = "";
6338
6339	parent = dev->dev.parent;
6340	if (!parent)
6341		return empty;
6342
6343	driver = parent->driver;
6344	if (driver && driver->name)
6345		return driver->name;
6346	return empty;
6347}
6348
6349int __netdev_printk(const char *level, const struct net_device *dev,
6350			   struct va_format *vaf)
6351{
6352	int r;
6353
6354	if (dev && dev->dev.parent)
6355		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6356			       netdev_name(dev), vaf);
6357	else if (dev)
 
 
 
 
6358		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6359	else
6360		r = printk("%s(NULL net_device): %pV", level, vaf);
 
6361
6362	return r;
6363}
6364EXPORT_SYMBOL(__netdev_printk);
6365
/*
 * printf-style logging for a net_device at an explicit log @level.
 * Packs @format and its arguments into a struct va_format and hands them
 * to __netdev_printk(); returns that function's result.
 */
6366int netdev_printk(const char *level, const struct net_device *dev,
6367		  const char *format, ...)
6368{
6369	struct va_format vaf;
6370	va_list args;
6371	int r;
6372
6373	va_start(args, format);
6374
	/* vaf only points at args, so args must stay live across the call */
6375	vaf.fmt = format;
6376	vaf.va = &args;
6377
6378	r = __netdev_printk(level, dev, &vaf);
 
6379	va_end(args);
6380
6381	return r;
6382}
6383EXPORT_SYMBOL(netdev_printk);
6384
/*
 * Generate and export a printf-style logging function @func that forwards
 * to __netdev_printk() with the fixed log level @level.  The generated
 * body follows the same sequence as netdev_printk().
 */
6385#define define_netdev_printk_level(func, level)			\
6386int func(const struct net_device *dev, const char *fmt, ...)	\
6387{								\
6388	int r;							\
6389	struct va_format vaf;					\
6390	va_list args;						\
6391								\
6392	va_start(args, fmt);					\
6393								\
6394	vaf.fmt = fmt;						\
6395	vaf.va = &args;						\
6396								\
6397	r = __netdev_printk(level, dev, &vaf);			\
 
6398	va_end(args);						\
6399								\
6400	return r;						\
6401}								\
6402EXPORT_SYMBOL(func);
6403
/* One exported helper per log level, from KERN_EMERG down to KERN_INFO. */
6404define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6405define_netdev_printk_level(netdev_alert, KERN_ALERT);
6406define_netdev_printk_level(netdev_crit, KERN_CRIT);
6407define_netdev_printk_level(netdev_err, KERN_ERR);
6408define_netdev_printk_level(netdev_warn, KERN_WARNING);
6409define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6410define_netdev_printk_level(netdev_info, KERN_INFO);
6411
/* Per network namespace teardown: free the two hash tables set up by netdev_init(). */
6412static void __net_exit netdev_exit(struct net *net)
6413{
6414	kfree(net->dev_name_head);
6415	kfree(net->dev_index_head);
6416}
6417
/* Per-namespace init/exit hooks; registered by net_dev_init() with register_pernet_subsys(). */
6418static struct pernet_operations __net_initdata netdev_net_ops = {
6419	.init = netdev_init,
6420	.exit = netdev_exit,
6421};
6422
6423static void __net_exit default_device_exit(struct net *net)
6424{
6425	struct net_device *dev, *aux;
6426	/*
6427	 * Push all migratable network devices back to the
6428	 * initial network namespace
6429	 */
6430	rtnl_lock();
6431	for_each_netdev_safe(net, dev, aux) {
6432		int err;
6433		char fb_name[IFNAMSIZ];
6434
6435		/* Ignore unmoveable devices (i.e. loopback) */
6436		if (dev->features & NETIF_F_NETNS_LOCAL)
6437			continue;
6438
6439		/* Leave virtual devices for the generic cleanup */
6440		if (dev->rtnl_link_ops)
6441			continue;
6442
6443		/* Push remaining network devices to init_net */
6444		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6445		err = dev_change_net_namespace(dev, &init_net, fb_name);
6446		if (err) {
6447			pr_emerg("%s: failed to move %s to init_net: %d\n",
6448				 __func__, dev->name, err);
6449			BUG();
6450		}
6451	}
6452	rtnl_unlock();
6453}
6454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6455static void __net_exit default_device_exit_batch(struct list_head *net_list)
6456{
6457	/* At exit all network devices must be removed from a network
6458	 * namespace.  Do this in the reverse order of registration.
6459	 * Do this across as many network namespaces as possible to
6460	 * improve batching efficiency.
6461	 */
6462	struct net_device *dev;
6463	struct net *net;
6464	LIST_HEAD(dev_kill_list);
6465
6466	rtnl_lock();
 
 
 
 
 
 
 
 
 
 
 
	/* Collect every remaining device of every exiting namespace on
	 * dev_kill_list: through the link type's dellink() when the device
	 * has rtnl_link_ops, through unregister_netdevice_queue() otherwise.
	 */
6467	list_for_each_entry(net, net_list, exit_list) {
6468		for_each_netdev_reverse(net, dev) {
6469			if (dev->rtnl_link_ops)
6470				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6471			else
6472				unregister_netdevice_queue(dev, &dev_kill_list);
6473		}
6474	}
	/* Unregister the whole batch in one go, then detach the on-stack list head. */
6475	unregister_netdevice_many(&dev_kill_list);
6476	list_del(&dev_kill_list);
6477	rtnl_unlock();
6478}
6479
/* Namespace exit hooks for the devices left in it; registered by net_dev_init() with register_pernet_device(). */
6480static struct pernet_operations __net_initdata default_device_ops = {
6481	.exit = default_device_exit,
6482	.exit_batch = default_device_exit_batch,
6483};
6484
6485/*
6486 *	Initialize the DEV module: proc and kobject support, the packet type
6487 *	lists, per-namespace state, the per-cpu softnet_data, the loopback and
6488 *	default pernet device ops, the TX/RX softirqs and the CPU notifier.
6489 *	NOTE(review): the older text here spoke of walking the device list and
6489 *	unhooking failed devices; nothing in the body below does that.
6490 */
6491
6492/*
6493 *       This is called single threaded during boot, so no need
6494 *       to take the rtnl semaphore.
6494 *
6494 *       Returns 0 on success and -ENOMEM if any step fails; steps that
6494 *       already succeeded are not undone on failure.
6495 */
6496static int __init net_dev_init(void)
6497{
6498	int i, rc = -ENOMEM;
6499
6500	BUG_ON(!dev_boot_phase);
6501
6502	if (dev_proc_init())
6503		goto out;
6504
6505	if (netdev_kobject_init())
6506		goto out;
6507
	/* Start with empty packet type lists. */
6508	INIT_LIST_HEAD(&ptype_all);
6509	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6510		INIT_LIST_HEAD(&ptype_base[i]);
6511
 
 
6512	if (register_pernet_subsys(&netdev_net_ops))
6513		goto out;
6514
6515	/*
6516	 *	Initialise the packet receive queues.
6517	 */
6518
6519	for_each_possible_cpu(i) {
6520		struct softnet_data *sd = &per_cpu(softnet_data, i);
6521
6522		memset(sd, 0, sizeof(*sd));
6523		skb_queue_head_init(&sd->input_pkt_queue);
6524		skb_queue_head_init(&sd->process_queue);
6525		sd->completion_queue = NULL;
6526		INIT_LIST_HEAD(&sd->poll_list);
6527		sd->output_queue = NULL;
		/* empty output queue: the tail pointer refers to the head itself */
6528		sd->output_queue_tailp = &sd->output_queue;
6529#ifdef CONFIG_RPS
6530		sd->csd.func = rps_trigger_softirq;
6531		sd->csd.info = sd;
6532		sd->csd.flags = 0;
6533		sd->cpu = i;
6534#endif
6535
		/* the per-cpu backlog is polled through process_backlog() */
6536		sd->backlog.poll = process_backlog;
6537		sd->backlog.weight = weight_p;
6538		sd->backlog.gro_list = NULL;
6539		sd->backlog.gro_count = 0;
6540	}
6541
	/* Clears the flag that the BUG_ON() at the top insists on. */
6542	dev_boot_phase = 0;
6543
6544	/* The loopback device is special: if any other network device
6545	 * is present in a network namespace the loopback device must
6546	 * be present. Since we now dynamically allocate and free the
6547	 * loopback device, ensure this invariant is maintained by
6548	 * keeping the loopback device as the first device on the
6549	 * list of network devices.  This ensures the loopback device
6550	 * is the first device that appears and the last network device
6551	 * that disappears.
6552	 */
6553	if (register_pernet_device(&loopback_net_ops))
6554		goto out;
6555
6556	if (register_pernet_device(&default_device_ops))
6557		goto out;
6558
6559	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6560	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6561
6562	hotcpu_notifier(dev_cpu_callback, 0);
6563	dst_init();
6564	dev_mcast_init();
6565	rc = 0;
6566out:
6567	return rc;
6568}
6569
6570subsys_initcall(net_dev_init);
6571
/* Fill hashrnd with random bytes; run as a late_initcall_sync.  Always returns 0. */
6572static int __init initialize_hashrnd(void)
6573{
6574	get_random_bytes(&hashrnd, sizeof(hashrnd));
6575	return 0;
6576}
6577
6578late_initcall_sync(initialize_hashrnd);
6579