Linux Audio

Check our new training course

Linux kernel drivers training

Mar 31-Apr 9, 2025, special US time zones
Register
Loading...
v4.6
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <net/busy_poll.h>
 100#include <linux/rtnetlink.h>
 
 
 101#include <linux/stat.h>
 102#include <net/dst.h>
 103#include <net/dst_metadata.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 
 109#include <linux/module.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 
 113#include <net/iw_handler.h>
 114#include <asm/current.h>
 115#include <linux/audit.h>
 116#include <linux/dmaengine.h>
 117#include <linux/err.h>
 118#include <linux/ctype.h>
 119#include <linux/if_arp.h>
 120#include <linux/if_vlan.h>
 121#include <linux/ip.h>
 122#include <net/ip.h>
 123#include <net/mpls.h>
 124#include <linux/ipv6.h>
 125#include <linux/in.h>
 126#include <linux/jhash.h>
 127#include <linux/random.h>
 128#include <trace/events/napi.h>
 129#include <trace/events/net.h>
 130#include <trace/events/skb.h>
 131#include <linux/pci.h>
 132#include <linux/inetdevice.h>
 133#include <linux/cpu_rmap.h>
 
 134#include <linux/static_key.h>
 135#include <linux/hashtable.h>
 136#include <linux/vmalloc.h>
 137#include <linux/if_macvlan.h>
 138#include <linux/errqueue.h>
 139#include <linux/hrtimer.h>
 140#include <linux/netfilter_ingress.h>
 141#include <linux/sctp.h>
 142
 143#include "net-sysfs.h"
 144
 145/* Instead of increasing this, you should create a hash table. */
 146#define MAX_GRO_SKBS 8
 147
 148/* This should be increased if a protocol with a bigger head is added. */
 149#define GRO_MAX_HEAD (MAX_HEADER + 128)
 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* Held around list_add_rcu()/list_del_rcu() on the packet-type lists. */
  151static DEFINE_SPINLOCK(ptype_lock);
/* Held around list_add_rcu()/list_del_rcu() on offload_base. */
  152static DEFINE_SPINLOCK(offload_lock);
/* Device-less handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
  153struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Device-less ETH_P_ALL handlers (see ptype_head()). */
  154struct list_head ptype_all __read_mostly;	/* Taps */
/* Offload handlers; dev_add_offload() keeps them in ascending priority. */
  155static struct list_head offload_base __read_mostly;
 156
 157static int netif_rx_internal(struct sk_buff *skb);
 158static int call_netdevice_notifiers_info(unsigned long val,
 159					 struct net_device *dev,
 160					 struct netdev_notifier_info *info);
 161
 162/*
 163 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 164 * semaphore.
 165 *
 166 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 167 *
 168 * Writers must hold the rtnl semaphore while they loop through the
 169 * dev_base_head list, and hold dev_base_lock for writing when they do the
 170 * actual updates.  This allows pure readers to access the list even
 171 * while a writer is preparing to update it.
 172 *
 173 * To put it another way, dev_base_lock is held for writing only to
 174 * protect against pure readers; the rtnl semaphore provides the
 175 * protection against other writers.
 176 *
 177 * See, for example usages, register_netdevice() and
 178 * unregister_netdevice(), which must be called with the rtnl
 179 * semaphore held.
 180 */
 181DEFINE_RWLOCK(dev_base_lock);
 182EXPORT_SYMBOL(dev_base_lock);
 183
 184/* protects napi_hash addition/deletion and napi_gen_id */
 185static DEFINE_SPINLOCK(napi_hash_lock);
 186
 187static unsigned int napi_gen_id = NR_CPUS;
 188static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 189
 190static seqcount_t devnet_rename_seq;
 191
 192static inline void dev_base_seq_inc(struct net *net)
 193{
 194	while (++net->dev_base_seq == 0);
 195}
 196
 197static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 198{
 199	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200
 201	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 202}
 203
 204static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 205{
 206	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 207}
 208
 209static inline void rps_lock(struct softnet_data *sd)
 210{
 211#ifdef CONFIG_RPS
 212	spin_lock(&sd->input_pkt_queue.lock);
 213#endif
 214}
 215
 216static inline void rps_unlock(struct softnet_data *sd)
 217{
 218#ifdef CONFIG_RPS
 219	spin_unlock(&sd->input_pkt_queue.lock);
 220#endif
 221}
 222
 223/* Device list insertion */
 224static void list_netdevice(struct net_device *dev)
 225{
 226	struct net *net = dev_net(dev);
 227
 228	ASSERT_RTNL();
 229
 230	write_lock_bh(&dev_base_lock);
 231	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 232	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 233	hlist_add_head_rcu(&dev->index_hlist,
 234			   dev_index_hash(net, dev->ifindex));
 235	write_unlock_bh(&dev_base_lock);
 236
 237	dev_base_seq_inc(net);
 
 
 238}
 239
 240/* Device list removal
 241 * caller must respect a RCU grace period before freeing/reusing dev
 242 */
 243static void unlist_netdevice(struct net_device *dev)
 244{
 245	ASSERT_RTNL();
 246
 247	/* Unlink dev from the device chain */
 248	write_lock_bh(&dev_base_lock);
 249	list_del_rcu(&dev->dev_list);
 250	hlist_del_rcu(&dev->name_hlist);
 251	hlist_del_rcu(&dev->index_hlist);
 252	write_unlock_bh(&dev_base_lock);
 253
 254	dev_base_seq_inc(dev_net(dev));
 255}
 256
 257/*
 258 *	Our notifier list
 259 */
 260
 261static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263/*
 264 *	Device drivers call our routines to queue packets here. We empty the
 265 *	queue in the local softnet handler.
 266 */
 267
 268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271#ifdef CONFIG_LOCKDEP
 272/*
 273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274 * according to dev->type
 275 */
 276static const unsigned short netdev_lock_type[] =
 277	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 290	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 291	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 292
 293static const char *const netdev_lock_name[] =
 294	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 295	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 296	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 297	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 298	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 299	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 300	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 301	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 302	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 303	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 304	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 305	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 306	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 307	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 308	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 309
/*
 * One lock class key per netdev_lock_type[] entry: the first array is
 * used by netdev_set_xmit_lockdep_class(), the second by
 * netdev_set_addr_lockdep_class().
 */
  310static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  311static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312
 313static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 314{
 315	int i;
 316
 317	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 318		if (netdev_lock_type[i] == dev_type)
 319			return i;
 320	/* the last key is used by default */
 321	return ARRAY_SIZE(netdev_lock_type) - 1;
 322}
 323
 324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325						 unsigned short dev_type)
 326{
 327	int i;
 328
 329	i = netdev_lock_pos(dev_type);
 330	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 331				   netdev_lock_name[i]);
 332}
 333
 334static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335{
 336	int i;
 337
 338	i = netdev_lock_pos(dev->type);
 339	lockdep_set_class_and_name(&dev->addr_list_lock,
 340				   &netdev_addr_lock_key[i],
 341				   netdev_lock_name[i]);
 342}
 343#else
 344static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 345						 unsigned short dev_type)
 346{
 347}
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350}
 351#endif
 352
 353/*******************************************************************************
 354
 355		Protocol management and registration routines
 356
 357*******************************************************************************/
 358
 359/*
 360 *	Add a protocol ID to the list. Now that the input handler is
 361 *	smarter we can dispense with all the messy stuff that used to be
 362 *	here.
 363 *
 364 *	BEWARE!!! Protocol handlers, mangling input packets,
 365 *	MUST BE last in hash buckets and checking protocol handlers
 366 *	MUST start from promiscuous ptype_all chain in net_bh.
 367 *	It is true now, do not change it.
 368 *	Explanation follows: if protocol handler, mangling packet, will
 369 *	be the first on list, it is not able to sense, that packet
 370 *	is cloned and should be copied-on-write, so that it will
 371 *	change it and subsequent readers will get broken packet.
 372 *							--ANK (980803)
 373 */
 374
 375static inline struct list_head *ptype_head(const struct packet_type *pt)
 376{
 377	if (pt->type == htons(ETH_P_ALL))
 378		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 379	else
 380		return pt->dev ? &pt->dev->ptype_specific :
 381				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 382}
 383
 384/**
 385 *	dev_add_pack - add packet handler
 386 *	@pt: packet type declaration
 387 *
 388 *	Add a protocol handler to the networking stack. The passed &packet_type
 389 *	is linked into kernel lists and may not be freed until it has been
 390 *	removed from the kernel lists.
 391 *
 392 *	This call does not sleep therefore it can not
 393 *	guarantee all CPU's that are in middle of receiving packets
 394 *	will see the new packet type (until the next received packet).
 395 */
 396
 397void dev_add_pack(struct packet_type *pt)
 398{
 399	struct list_head *head = ptype_head(pt);
 400
 401	spin_lock(&ptype_lock);
 402	list_add_rcu(&pt->list, head);
 403	spin_unlock(&ptype_lock);
 404}
 405EXPORT_SYMBOL(dev_add_pack);
 406
 407/**
 408 *	__dev_remove_pack	 - remove packet handler
 409 *	@pt: packet type declaration
 410 *
 411 *	Remove a protocol handler that was previously added to the kernel
 412 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 413 *	from the kernel lists and can be freed or reused once this function
 414 *	returns.
 415 *
 416 *      The packet type might still be in use by receivers
 417 *	and must not be freed until after all the CPU's have gone
 418 *	through a quiescent state.
 419 */
 420void __dev_remove_pack(struct packet_type *pt)
 421{
 422	struct list_head *head = ptype_head(pt);
 423	struct packet_type *pt1;
 424
 425	spin_lock(&ptype_lock);
 426
 427	list_for_each_entry(pt1, head, list) {
 428		if (pt == pt1) {
 429			list_del_rcu(&pt->list);
 430			goto out;
 431		}
 432	}
 433
 434	pr_warn("dev_remove_pack: %p not found\n", pt);
 435out:
 436	spin_unlock(&ptype_lock);
 437}
 438EXPORT_SYMBOL(__dev_remove_pack);
 439
 440/**
 441 *	dev_remove_pack	 - remove packet handler
 442 *	@pt: packet type declaration
 443 *
 444 *	Remove a protocol handler that was previously added to the kernel
 445 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 446 *	from the kernel lists and can be freed or reused once this function
 447 *	returns.
 448 *
 449 *	This call sleeps to guarantee that no CPU is looking at the packet
 450 *	type after return.
 451 */
 452void dev_remove_pack(struct packet_type *pt)
 453{
 454	__dev_remove_pack(pt);
 455
 456	synchronize_net();
 457}
 458EXPORT_SYMBOL(dev_remove_pack);
 459
 460
 461/**
 462 *	dev_add_offload - register offload handlers
 463 *	@po: protocol offload declaration
 464 *
 465 *	Add protocol offload handlers to the networking stack. The passed
 466 *	&proto_offload is linked into kernel lists and may not be freed until
 467 *	it has been removed from the kernel lists.
 468 *
 469 *	This call does not sleep therefore it can not
 470 *	guarantee all CPU's that are in middle of receiving packets
 471 *	will see the new offload handlers (until the next received packet).
 472 */
 473void dev_add_offload(struct packet_offload *po)
 474{
 475	struct packet_offload *elem;
 476
 477	spin_lock(&offload_lock);
 478	list_for_each_entry(elem, &offload_base, list) {
 479		if (po->priority < elem->priority)
 480			break;
 481	}
 482	list_add_rcu(&po->list, elem->list.prev);
 483	spin_unlock(&offload_lock);
 484}
 485EXPORT_SYMBOL(dev_add_offload);
 486
 487/**
 488 *	__dev_remove_offload	 - remove offload handler
 489 *	@po: packet offload declaration
 490 *
 491 *	Remove a protocol offload handler that was previously added to the
 492 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 493 *	is removed from the kernel lists and can be freed or reused once this
 494 *	function returns.
 495 *
 496 *      The packet type might still be in use by receivers
 497 *	and must not be freed until after all the CPU's have gone
 498 *	through a quiescent state.
 499 */
 500static void __dev_remove_offload(struct packet_offload *po)
 501{
 502	struct list_head *head = &offload_base;
 503	struct packet_offload *po1;
 504
 505	spin_lock(&offload_lock);
 506
 507	list_for_each_entry(po1, head, list) {
 508		if (po == po1) {
 509			list_del_rcu(&po->list);
 510			goto out;
 511		}
 512	}
 513
 514	pr_warn("dev_remove_offload: %p not found\n", po);
 515out:
 516	spin_unlock(&offload_lock);
 517}
 518
 519/**
 520 *	dev_remove_offload	 - remove packet offload handler
 521 *	@po: packet offload declaration
 522 *
 523 *	Remove a packet offload handler that was previously added to the kernel
 524 *	offload handlers by dev_add_offload(). The passed &offload_type is
 525 *	removed from the kernel lists and can be freed or reused once this
 526 *	function returns.
 527 *
 528 *	This call sleeps to guarantee that no CPU is looking at the packet
 529 *	type after return.
 530 */
 531void dev_remove_offload(struct packet_offload *po)
 532{
 533	__dev_remove_offload(po);
 534
 535	synchronize_net();
 536}
 537EXPORT_SYMBOL(dev_remove_offload);
 538
 539/******************************************************************************
 540
 541		      Device Boot-time Settings Routines
 542
 543*******************************************************************************/
 544
 545/* Boot time configuration table */
 546static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 547
 548/**
 549 *	netdev_boot_setup_add	- add new setup entry
 550 *	@name: name of the device
 551 *	@map: configured settings for the device
 552 *
 553 *	Adds new setup entry to the dev_boot_setup list.  The function
 554 *	returns 0 on error and 1 on success.  This is a generic routine to
 555 *	all netdevices.
 556 */
 557static int netdev_boot_setup_add(char *name, struct ifmap *map)
 558{
 559	struct netdev_boot_setup *s;
 560	int i;
 561
 562	s = dev_boot_setup;
 563	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 564		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 565			memset(s[i].name, 0, sizeof(s[i].name));
 566			strlcpy(s[i].name, name, IFNAMSIZ);
 567			memcpy(&s[i].map, map, sizeof(s[i].map));
 568			break;
 569		}
 570	}
 571
 572	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 573}
 574
 575/**
 576 *	netdev_boot_setup_check	- check boot time settings
 577 *	@dev: the netdevice
 578 *
 579 * 	Check boot time settings for the device.
 580 *	The found settings are set for the device to be used
 581 *	later in the device probing.
 582 *	Returns 0 if no settings found, 1 if they are.
 583 */
 584int netdev_boot_setup_check(struct net_device *dev)
 585{
 586	struct netdev_boot_setup *s = dev_boot_setup;
 587	int i;
 588
 589	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 590		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 591		    !strcmp(dev->name, s[i].name)) {
 592			dev->irq 	= s[i].map.irq;
 593			dev->base_addr 	= s[i].map.base_addr;
 594			dev->mem_start 	= s[i].map.mem_start;
 595			dev->mem_end 	= s[i].map.mem_end;
 596			return 1;
 597		}
 598	}
 599	return 0;
 600}
 601EXPORT_SYMBOL(netdev_boot_setup_check);
 602
 603
 604/**
 605 *	netdev_boot_base	- get address from boot time settings
 606 *	@prefix: prefix for network device
 607 *	@unit: id for network device
 608 *
 609 * 	Check boot time settings for the base address of device.
 610 *	The found settings are set for the device to be used
 611 *	later in the device probing.
 612 *	Returns 0 if no settings found.
 613 */
 614unsigned long netdev_boot_base(const char *prefix, int unit)
 615{
 616	const struct netdev_boot_setup *s = dev_boot_setup;
 617	char name[IFNAMSIZ];
 618	int i;
 619
 620	sprintf(name, "%s%d", prefix, unit);
 621
 622	/*
 623	 * If device already registered then return base of 1
 624	 * to indicate not to probe for this interface
 625	 */
 626	if (__dev_get_by_name(&init_net, name))
 627		return 1;
 628
 629	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 630		if (!strcmp(name, s[i].name))
 631			return s[i].map.base_addr;
 632	return 0;
 633}
 634
 635/*
 636 * Saves at boot time configured settings for any netdevice.
 637 */
 638int __init netdev_boot_setup(char *str)
 639{
 640	int ints[5];
 641	struct ifmap map;
 642
 643	str = get_options(str, ARRAY_SIZE(ints), ints);
 644	if (!str || !*str)
 645		return 0;
 646
 647	/* Save settings */
 648	memset(&map, 0, sizeof(map));
 649	if (ints[0] > 0)
 650		map.irq = ints[1];
 651	if (ints[0] > 1)
 652		map.base_addr = ints[2];
 653	if (ints[0] > 2)
 654		map.mem_start = ints[3];
 655	if (ints[0] > 3)
 656		map.mem_end = ints[4];
 657
 658	/* Add new entry to the list */
 659	return netdev_boot_setup_add(str, &map);
 660}
 661
 662__setup("netdev=", netdev_boot_setup);
 663
 664/*******************************************************************************
 665
 666			    Device Interface Subroutines
 667
 668*******************************************************************************/
 669
 670/**
 671 *	dev_get_iflink	- get 'iflink' value of a interface
 672 *	@dev: targeted interface
 673 *
 674 *	Indicates the ifindex the interface is linked to.
 675 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 676 */
 677
 678int dev_get_iflink(const struct net_device *dev)
 679{
 680	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 681		return dev->netdev_ops->ndo_get_iflink(dev);
 682
 683	return dev->ifindex;
 684}
 685EXPORT_SYMBOL(dev_get_iflink);
 686
 687/**
 688 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 689 *	@dev: targeted interface
 690 *	@skb: The packet.
 691 *
 692 *	For better visibility of tunnel traffic OVS needs to retrieve
 693 *	egress tunnel information for a packet. Following API allows
 694 *	user to get this info.
 695 */
 696int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 697{
 698	struct ip_tunnel_info *info;
 699
 700	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 701		return -EINVAL;
 702
 703	info = skb_tunnel_info_unclone(skb);
 704	if (!info)
 705		return -ENOMEM;
 706	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 707		return -EINVAL;
 708
 709	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 710}
 711EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 712
 713/**
 714 *	__dev_get_by_name	- find a device by its name
 715 *	@net: the applicable net namespace
 716 *	@name: name to find
 717 *
 718 *	Find an interface by name. Must be called under RTNL semaphore
 719 *	or @dev_base_lock. If the name is found a pointer to the device
 720 *	is returned. If the name is not found then %NULL is returned. The
 721 *	reference counters are not incremented so the caller must be
 722 *	careful with locks.
 723 */
 724
 725struct net_device *__dev_get_by_name(struct net *net, const char *name)
 726{
 
 727	struct net_device *dev;
 728	struct hlist_head *head = dev_name_hash(net, name);
 729
 730	hlist_for_each_entry(dev, head, name_hlist)
 731		if (!strncmp(dev->name, name, IFNAMSIZ))
 732			return dev;
 733
 734	return NULL;
 735}
 736EXPORT_SYMBOL(__dev_get_by_name);
 737
 738/**
 739 *	dev_get_by_name_rcu	- find a device by its name
 740 *	@net: the applicable net namespace
 741 *	@name: name to find
 742 *
 743 *	Find an interface by name.
 744 *	If the name is found a pointer to the device is returned.
 745 * 	If the name is not found then %NULL is returned.
 746 *	The reference counters are not incremented so the caller must be
 747 *	careful with locks. The caller must hold RCU lock.
 748 */
 749
 750struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 751{
 
 752	struct net_device *dev;
 753	struct hlist_head *head = dev_name_hash(net, name);
 754
 755	hlist_for_each_entry_rcu(dev, head, name_hlist)
 756		if (!strncmp(dev->name, name, IFNAMSIZ))
 757			return dev;
 758
 759	return NULL;
 760}
 761EXPORT_SYMBOL(dev_get_by_name_rcu);
 762
 763/**
 764 *	dev_get_by_name		- find a device by its name
 765 *	@net: the applicable net namespace
 766 *	@name: name to find
 767 *
 768 *	Find an interface by name. This can be called from any
 769 *	context and does its own locking. The returned handle has
 770 *	the usage count incremented and the caller must use dev_put() to
 771 *	release it when it is no longer needed. %NULL is returned if no
 772 *	matching device is found.
 773 */
 774
 775struct net_device *dev_get_by_name(struct net *net, const char *name)
 776{
 777	struct net_device *dev;
 778
 779	rcu_read_lock();
 780	dev = dev_get_by_name_rcu(net, name);
 781	if (dev)
 782		dev_hold(dev);
 783	rcu_read_unlock();
 784	return dev;
 785}
 786EXPORT_SYMBOL(dev_get_by_name);
 787
 788/**
 789 *	__dev_get_by_index - find a device by its ifindex
 790 *	@net: the applicable net namespace
 791 *	@ifindex: index of device
 792 *
 793 *	Search for an interface by index. Returns %NULL if the device
 794 *	is not found or a pointer to the device. The device has not
 795 *	had its reference counter increased so the caller must be careful
 796 *	about locking. The caller must hold either the RTNL semaphore
 797 *	or @dev_base_lock.
 798 */
 799
 800struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 801{
 
 802	struct net_device *dev;
 803	struct hlist_head *head = dev_index_hash(net, ifindex);
 804
 805	hlist_for_each_entry(dev, head, index_hlist)
 806		if (dev->ifindex == ifindex)
 807			return dev;
 808
 809	return NULL;
 810}
 811EXPORT_SYMBOL(__dev_get_by_index);
 812
 813/**
 814 *	dev_get_by_index_rcu - find a device by its ifindex
 815 *	@net: the applicable net namespace
 816 *	@ifindex: index of device
 817 *
 818 *	Search for an interface by index. Returns %NULL if the device
 819 *	is not found or a pointer to the device. The device has not
 820 *	had its reference counter increased so the caller must be careful
 821 *	about locking. The caller must hold RCU lock.
 822 */
 823
 824struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 825{
 
 826	struct net_device *dev;
 827	struct hlist_head *head = dev_index_hash(net, ifindex);
 828
 829	hlist_for_each_entry_rcu(dev, head, index_hlist)
 830		if (dev->ifindex == ifindex)
 831			return dev;
 832
 833	return NULL;
 834}
 835EXPORT_SYMBOL(dev_get_by_index_rcu);
 836
 837
 838/**
 839 *	dev_get_by_index - find a device by its ifindex
 840 *	@net: the applicable net namespace
 841 *	@ifindex: index of device
 842 *
 843 *	Search for an interface by index. Returns NULL if the device
 844 *	is not found or a pointer to the device. The device returned has
 845 *	had a reference added and the pointer is safe until the user calls
 846 *	dev_put to indicate they have finished with it.
 847 */
 848
 849struct net_device *dev_get_by_index(struct net *net, int ifindex)
 850{
 851	struct net_device *dev;
 852
 853	rcu_read_lock();
 854	dev = dev_get_by_index_rcu(net, ifindex);
 855	if (dev)
 856		dev_hold(dev);
 857	rcu_read_unlock();
 858	return dev;
 859}
 860EXPORT_SYMBOL(dev_get_by_index);
 861
 862/**
 863 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 864 *	@net: network namespace
 865 *	@name: a pointer to the buffer where the name will be stored.
 866 *	@ifindex: the ifindex of the interface to get the name from.
 867 *
 868 *	The use of raw_seqcount_begin() and cond_resched() before
 869 *	retrying is required as we want to give the writers a chance
 870 *	to complete when CONFIG_PREEMPT is not set.
 871 */
 872int netdev_get_name(struct net *net, char *name, int ifindex)
 873{
 874	struct net_device *dev;
 875	unsigned int seq;
 876
 877retry:
 878	seq = raw_seqcount_begin(&devnet_rename_seq);
 879	rcu_read_lock();
 880	dev = dev_get_by_index_rcu(net, ifindex);
 881	if (!dev) {
 882		rcu_read_unlock();
 883		return -ENODEV;
 884	}
 885
 886	strcpy(name, dev->name);
 887	rcu_read_unlock();
 888	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 889		cond_resched();
 890		goto retry;
 891	}
 892
 893	return 0;
 894}
 895
 896/**
 897 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 898 *	@net: the applicable net namespace
 899 *	@type: media type of device
 900 *	@ha: hardware address
 901 *
 902 *	Search for an interface by MAC address. Returns NULL if the device
 903 *	is not found or a pointer to the device.
 904 *	The caller must hold RCU or RTNL.
 905 *	The returned device has not had its ref count increased
 906 *	and the caller must therefore be careful about locking
 907 *
 908 */
 909
 910struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 911				       const char *ha)
 912{
 913	struct net_device *dev;
 914
 915	for_each_netdev_rcu(net, dev)
 916		if (dev->type == type &&
 917		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 918			return dev;
 919
 920	return NULL;
 921}
 922EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 923
 924struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 925{
 926	struct net_device *dev;
 927
 928	ASSERT_RTNL();
 929	for_each_netdev(net, dev)
 930		if (dev->type == type)
 931			return dev;
 932
 933	return NULL;
 934}
 935EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 936
 937struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 938{
 939	struct net_device *dev, *ret = NULL;
 940
 941	rcu_read_lock();
 942	for_each_netdev_rcu(net, dev)
 943		if (dev->type == type) {
 944			dev_hold(dev);
 945			ret = dev;
 946			break;
 947		}
 948	rcu_read_unlock();
 949	return ret;
 950}
 951EXPORT_SYMBOL(dev_getfirstbyhwtype);
 952
 953/**
 954 *	__dev_get_by_flags - find any device with given flags
 955 *	@net: the applicable net namespace
 956 *	@if_flags: IFF_* values
 957 *	@mask: bitmask of bits in if_flags to check
 958 *
 959 *	Search for any interface with the given flags. Returns NULL if a device
 960 *	is not found or a pointer to the device. Must be called inside
 961 *	rtnl_lock(), and result refcount is unchanged.
 962 */
 963
 964struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 965				      unsigned short mask)
 966{
 967	struct net_device *dev, *ret;
 968
 969	ASSERT_RTNL();
 970
 971	ret = NULL;
 972	for_each_netdev(net, dev) {
 973		if (((dev->flags ^ if_flags) & mask) == 0) {
 974			ret = dev;
 975			break;
 976		}
 977	}
 978	return ret;
 979}
 980EXPORT_SYMBOL(__dev_get_by_flags);
 981
 982/**
 983 *	dev_valid_name - check if name is okay for network device
 984 *	@name: name string
 985 *
 986 *	Network device names need to be valid file names to
 987 *	to allow sysfs to work.  We also disallow any kind of
 988 *	whitespace.
 989 */
 990bool dev_valid_name(const char *name)
 991{
 992	if (*name == '\0')
 993		return false;
 994	if (strlen(name) >= IFNAMSIZ)
 995		return false;
 996	if (!strcmp(name, ".") || !strcmp(name, ".."))
 997		return false;
 998
 999	while (*name) {
1000		if (*name == '/' || *name == ':' || isspace(*name))
1001			return false;
1002		name++;
1003	}
1004	return true;
1005}
1006EXPORT_SYMBOL(dev_valid_name);
1007
1008/**
1009 *	__dev_alloc_name - allocate a name for a device
1010 *	@net: network namespace to allocate the device name in
1011 *	@name: name format string
1012 *	@buf:  scratch buffer and result name string
1013 *
1014 *	Passed a format string - eg "lt%d" it will try and find a suitable
1015 *	id. It scans list of devices to build up a free map, then chooses
1016 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1017 *	while allocating the name and adding the device in order to avoid
1018 *	duplicates.
1019 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1020 *	Returns the number of the unit assigned or a negative errno code.
1021 */
1022
1023static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1024{
1025	int i = 0;
1026	const char *p;
1027	const int max_netdevices = 8*PAGE_SIZE;
1028	unsigned long *inuse;
1029	struct net_device *d;
1030
1031	p = strnchr(name, IFNAMSIZ-1, '%');
1032	if (p) {
1033		/*
1034		 * Verify the string as this thing may have come from
1035		 * the user.  There must be either one "%d" and no other "%"
1036		 * characters.
1037		 */
1038		if (p[1] != 'd' || strchr(p + 2, '%'))
1039			return -EINVAL;
1040
1041		/* Use one page as a bit array of possible slots */
1042		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1043		if (!inuse)
1044			return -ENOMEM;
1045
1046		for_each_netdev(net, d) {
1047			if (!sscanf(d->name, name, &i))
1048				continue;
1049			if (i < 0 || i >= max_netdevices)
1050				continue;
1051
1052			/*  avoid cases where sscanf is not exact inverse of printf */
1053			snprintf(buf, IFNAMSIZ, name, i);
1054			if (!strncmp(buf, d->name, IFNAMSIZ))
1055				set_bit(i, inuse);
1056		}
1057
1058		i = find_first_zero_bit(inuse, max_netdevices);
1059		free_page((unsigned long) inuse);
1060	}
1061
1062	if (buf != name)
1063		snprintf(buf, IFNAMSIZ, name, i);
1064	if (!__dev_get_by_name(net, buf))
1065		return i;
1066
1067	/* It is possible to run out of possible slots
1068	 * when the name is long and there isn't enough space left
1069	 * for the digits, or if all bits are used.
1070	 */
1071	return -ENFILE;
1072}
1073
1074/**
1075 *	dev_alloc_name - allocate a name for a device
1076 *	@dev: device
1077 *	@name: name format string
1078 *
1079 *	Passed a format string - eg "lt%d" it will try and find a suitable
1080 *	id. It scans list of devices to build up a free map, then chooses
1081 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1082 *	while allocating the name and adding the device in order to avoid
1083 *	duplicates.
1084 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085 *	Returns the number of the unit assigned or a negative errno code.
1086 */
1087
1088int dev_alloc_name(struct net_device *dev, const char *name)
1089{
1090	char buf[IFNAMSIZ];
1091	struct net *net;
1092	int ret;
1093
1094	BUG_ON(!dev_net(dev));
1095	net = dev_net(dev);
1096	ret = __dev_alloc_name(net, name, buf);
1097	if (ret >= 0)
1098		strlcpy(dev->name, buf, IFNAMSIZ);
1099	return ret;
1100}
1101EXPORT_SYMBOL(dev_alloc_name);
1102
1103static int dev_alloc_name_ns(struct net *net,
1104			     struct net_device *dev,
1105			     const char *name)
1106{
1107	char buf[IFNAMSIZ];
1108	int ret;
1109
1110	ret = __dev_alloc_name(net, name, buf);
1111	if (ret >= 0)
1112		strlcpy(dev->name, buf, IFNAMSIZ);
1113	return ret;
1114}
1115
1116static int dev_get_valid_name(struct net *net,
1117			      struct net_device *dev,
1118			      const char *name)
1119{
1120	BUG_ON(!net);
1121
1122	if (!dev_valid_name(name))
1123		return -EINVAL;
1124
1125	if (strchr(name, '%'))
1126		return dev_alloc_name_ns(net, dev, name);
1127	else if (__dev_get_by_name(net, name))
1128		return -EEXIST;
1129	else if (dev->name != name)
1130		strlcpy(dev->name, name, IFNAMSIZ);
1131
1132	return 0;
1133}
1134
1135/**
1136 *	dev_change_name - change name of a device
1137 *	@dev: device
1138 *	@newname: name (or format string); the buffer must be at least
 *		IFNAMSIZ bytes long, because the rollback path copies
 *		IFNAMSIZ bytes out of it
1139 *
1140 *	Change the name of a device. A format string such as "eth%d" may
1141 *	be passed for wildcarding.
 *
 *	Asserts that RTNL is held. Returns -EBUSY if the device is up and
 *	0 if @newname equals the current name. Errors from name validation
 *	and from device_rename() are returned after the old name has been
 *	restored. If a NETDEV_CHANGENAME notifier fails, the old name is
 *	put back through the same rename path and the notifier's errno is
 *	returned.
1142 */
1143int dev_change_name(struct net_device *dev, const char *newname)
1144{
1145	unsigned char old_assign_type;
1146	char oldname[IFNAMSIZ];
1147	int err = 0;
1148	int ret;
1149	struct net *net;
1150
1151	ASSERT_RTNL();
1152	BUG_ON(!dev_net(dev));
1153
1154	net = dev_net(dev);
1155	if (dev->flags & IFF_UP)
1156		return -EBUSY;
1157
	/* Every exit below that follows this begin has a matching end. */
1158	write_seqcount_begin(&devnet_rename_seq);
1159
	/* Renaming to the current name is a no-op. */
1160	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1161		write_seqcount_end(&devnet_rename_seq);
1162		return 0;
1163	}
1164
	/* Remember the current name so it can be restored on failure. */
1165	memcpy(oldname, dev->name, IFNAMSIZ);
1166
	/* On success this has already written the new name to dev->name. */
1167	err = dev_get_valid_name(net, dev, newname);
1168	if (err < 0) {
1169		write_seqcount_end(&devnet_rename_seq);
1170		return err;
1171	}
1172
1173	if (oldname[0] && !strchr(oldname, '%'))
1174		netdev_info(dev, "renamed from %s\n", oldname);
1175
1176	old_assign_type = dev->name_assign_type;
1177	dev->name_assign_type = NET_NAME_RENAMED;
1178
	/*
	 * Reached on the first pass with the new name in dev->name, and a
	 * second time (via goto) with the old name restored after a
	 * notifier failure.
	 */
1179rollback:
1180	ret = device_rename(&dev->dev, dev->name);
1181	if (ret) {
1182		memcpy(dev->name, oldname, IFNAMSIZ);
1183		dev->name_assign_type = old_assign_type;
1184		write_seqcount_end(&devnet_rename_seq);
1185		return ret;
1186	}
1187
1188	write_seqcount_end(&devnet_rename_seq);
1189
1190	netdev_adjacent_rename_links(dev, oldname);
1191
	/*
	 * Re-hash under the new name: unlink, wait for an RCU grace
	 * period, then insert into the bucket for dev->name.
	 */
1192	write_lock_bh(&dev_base_lock);
1193	hlist_del_rcu(&dev->name_hlist);
1194	write_unlock_bh(&dev_base_lock);
1195
1196	synchronize_rcu();
1197
1198	write_lock_bh(&dev_base_lock);
1199	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1200	write_unlock_bh(&dev_base_lock);
1201
1202	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1203	ret = notifier_to_errno(ret);
1204
1205	if (ret) {
1206		/* err >= 0 after dev_alloc_name() or stores the first errno */
1207		if (err >= 0) {
			/*
			 * First notifier failure: record the errno, swap
			 * the old and new names (and assign types) and
			 * run the rename path once more to undo the change.
			 */
1208			err = ret;
1209			write_seqcount_begin(&devnet_rename_seq);
1210			memcpy(dev->name, oldname, IFNAMSIZ);
1211			memcpy(oldname, newname, IFNAMSIZ);
1212			dev->name_assign_type = old_assign_type;
1213			old_assign_type = NET_NAME_RENAMED;
1214			goto rollback;
1215		} else {
			/* The notifier failed again during the rollback pass. */
1216			pr_err("%s: name change rollback failed: %d\n",
1217			       dev->name, ret);
1218		}
1219	}
1220
1221	return err;
1222}
1223
1224/**
1225 *	dev_set_alias - change ifalias of a device
1226 *	@dev: device
1227 *	@alias: name up to IFALIASZ
1228 *	@len: limit of bytes to copy from info
1229 *
1230 *	Set ifalias for a device,
1231 */
1232int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233{
1234	char *new_ifalias;
1235
1236	ASSERT_RTNL();
1237
1238	if (len >= IFALIASZ)
1239		return -EINVAL;
1240
1241	if (!len) {
1242		kfree(dev->ifalias);
1243		dev->ifalias = NULL;
 
 
1244		return 0;
1245	}
1246
1247	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248	if (!new_ifalias)
1249		return -ENOMEM;
1250	dev->ifalias = new_ifalias;
1251
1252	strlcpy(dev->ifalias, alias, len+1);
1253	return len;
1254}
1255
1256
1257/**
1258 *	netdev_features_change - device changes features
1259 *	@dev: device to cause notification
1260 *
1261 *	Called to indicate a device has changed features.
1262 */
1263void netdev_features_change(struct net_device *dev)
1264{
1265	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266}
1267EXPORT_SYMBOL(netdev_features_change);
1268
1269/**
1270 *	netdev_state_change - device changes state
1271 *	@dev: device to cause notification
1272 *
1273 *	Called to indicate a device has changed state. This function calls
1274 *	the notifier chains for netdev_chain and sends a NEWLINK message
1275 *	to the routing socket.
1276 */
1277void netdev_state_change(struct net_device *dev)
1278{
1279	if (dev->flags & IFF_UP) {
1280		struct netdev_notifier_change_info change_info;
1281
1282		change_info.flags_changed = 0;
1283		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284					      &change_info.info);
1285		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286	}
1287}
1288EXPORT_SYMBOL(netdev_state_change);
1289
 
 
 
 
 
 
1290/**
1291 * 	netdev_notify_peers - notify network peers about existence of @dev
1292 * 	@dev: network device
 
1293 *
1294 * Generate traffic such that interested network peers are aware of
1295 * @dev, such as by generating a gratuitous ARP. This may be used when
1296 * a device wants to inform the rest of the network about some sort of
1297 * reconfiguration such as a failover event or virtual machine
1298 * migration.
1299 */
1300void netdev_notify_peers(struct net_device *dev)
 
1301{
1302	rtnl_lock();
1303	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304	rtnl_unlock();
 
 
 
 
 
 
 
 
 
 
 
 
1305}
1306EXPORT_SYMBOL(netdev_notify_peers);
1307
1308static int __dev_open(struct net_device *dev)
1309{
1310	const struct net_device_ops *ops = dev->netdev_ops;
1311	int ret;
1312
1313	ASSERT_RTNL();
1314
1315	if (!netif_device_present(dev))
1316		return -ENODEV;
1317
1318	/* Block netpoll from trying to do any rx path servicing.
1319	 * If we don't do this there is a chance ndo_poll_controller
1320	 * or ndo_poll may be running while we open the device
1321	 */
1322	netpoll_poll_disable(dev);
1323
1324	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325	ret = notifier_to_errno(ret);
1326	if (ret)
1327		return ret;
1328
1329	set_bit(__LINK_STATE_START, &dev->state);
1330
1331	if (ops->ndo_validate_addr)
1332		ret = ops->ndo_validate_addr(dev);
1333
1334	if (!ret && ops->ndo_open)
1335		ret = ops->ndo_open(dev);
1336
1337	netpoll_poll_enable(dev);
1338
1339	if (ret)
1340		clear_bit(__LINK_STATE_START, &dev->state);
1341	else {
1342		dev->flags |= IFF_UP;
 
1343		dev_set_rx_mode(dev);
1344		dev_activate(dev);
1345		add_device_randomness(dev->dev_addr, dev->addr_len);
1346	}
1347
1348	return ret;
1349}
1350
1351/**
1352 *	dev_open	- prepare an interface for use.
1353 *	@dev:	device to open
1354 *
1355 *	Takes a device from down to up state. The device's private open
1356 *	function is invoked and then the multicast lists are loaded. Finally
1357 *	the device is moved into the up state and a %NETDEV_UP message is
1358 *	sent to the netdev notifier chain.
1359 *
1360 *	Calling this function on an active interface is a nop. On a failure
1361 *	a negative errno code is returned.
1362 */
1363int dev_open(struct net_device *dev)
1364{
1365	int ret;
1366
1367	if (dev->flags & IFF_UP)
1368		return 0;
1369
1370	ret = __dev_open(dev);
1371	if (ret < 0)
1372		return ret;
1373
1374	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375	call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377	return ret;
1378}
1379EXPORT_SYMBOL(dev_open);
1380
1381static int __dev_close_many(struct list_head *head)
1382{
1383	struct net_device *dev;
1384
1385	ASSERT_RTNL();
1386	might_sleep();
1387
1388	list_for_each_entry(dev, head, close_list) {
1389		/* Temporarily disable netpoll until the interface is down */
1390		netpoll_poll_disable(dev);
1391
1392		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1393
1394		clear_bit(__LINK_STATE_START, &dev->state);
1395
1396		/* Synchronize to scheduled poll. We cannot touch poll list, it
1397		 * can be even on different cpu. So just clear netif_running().
1398		 *
1399		 * dev->stop() will invoke napi_disable() on all of it's
1400		 * napi_struct instances on this device.
1401		 */
1402		smp_mb__after_atomic(); /* Commit netif_running(). */
1403	}
1404
1405	dev_deactivate_many(head);
1406
1407	list_for_each_entry(dev, head, close_list) {
1408		const struct net_device_ops *ops = dev->netdev_ops;
1409
1410		/*
1411		 *	Call the device specific close. This cannot fail.
1412		 *	Only if device is UP
1413		 *
1414		 *	We allow it to be called even after a DETACH hot-plug
1415		 *	event.
1416		 */
1417		if (ops->ndo_stop)
1418			ops->ndo_stop(dev);
1419
1420		dev->flags &= ~IFF_UP;
1421		netpoll_poll_enable(dev);
1422	}
1423
1424	return 0;
1425}
1426
1427static int __dev_close(struct net_device *dev)
1428{
1429	int retval;
1430	LIST_HEAD(single);
1431
1432	list_add(&dev->close_list, &single);
1433	retval = __dev_close_many(&single);
1434	list_del(&single);
1435
1436	return retval;
1437}
1438
1439int dev_close_many(struct list_head *head, bool unlink)
1440{
1441	struct net_device *dev, *tmp;
 
1442
1443	/* Remove the devices that don't need to be closed */
1444	list_for_each_entry_safe(dev, tmp, head, close_list)
1445		if (!(dev->flags & IFF_UP))
1446			list_del_init(&dev->close_list);
1447
1448	__dev_close_many(head);
1449
1450	list_for_each_entry_safe(dev, tmp, head, close_list) {
1451		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452		call_netdevice_notifiers(NETDEV_DOWN, dev);
1453		if (unlink)
1454			list_del_init(&dev->close_list);
1455	}
1456
 
 
1457	return 0;
1458}
1459EXPORT_SYMBOL(dev_close_many);
1460
1461/**
1462 *	dev_close - shutdown an interface.
1463 *	@dev: device to shutdown
1464 *
1465 *	This function moves an active device into down state. A
1466 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468 *	chain.
1469 */
1470int dev_close(struct net_device *dev)
1471{
1472	if (dev->flags & IFF_UP) {
1473		LIST_HEAD(single);
1474
1475		list_add(&dev->close_list, &single);
1476		dev_close_many(&single, true);
1477		list_del(&single);
1478	}
1479	return 0;
1480}
1481EXPORT_SYMBOL(dev_close);
1482
1483
1484/**
1485 *	dev_disable_lro - disable Large Receive Offload on a device
1486 *	@dev: device
1487 *
1488 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1489 *	called under RTNL.  This is needed if received packets may be
1490 *	forwarded to another interface.
1491 */
1492void dev_disable_lro(struct net_device *dev)
1493{
1494	struct net_device *lower_dev;
1495	struct list_head *iter;
 
 
 
 
1496
1497	dev->wanted_features &= ~NETIF_F_LRO;
1498	netdev_update_features(dev);
1499
1500	if (unlikely(dev->features & NETIF_F_LRO))
1501		netdev_WARN(dev, "failed to disable LRO!\n");
1502
1503	netdev_for_each_lower_dev(dev, lower_dev, iter)
1504		dev_disable_lro(lower_dev);
1505}
1506EXPORT_SYMBOL(dev_disable_lro);
1507
1508static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509				   struct net_device *dev)
1510{
1511	struct netdev_notifier_info info;
1512
1513	netdev_notifier_info_init(&info, dev);
1514	return nb->notifier_call(nb, val, &info);
1515}
1516
/*
 * While non-zero, register_netdevice_notifier() does not replay
 * REGISTER/UP events to a newly registered notifier. Starts out as 1.
 * NOTE(review): presumably cleared once boot-time initialisation is
 * done - confirm where this is reset.
 */
1517static int dev_boot_phase = 1;
1518
1519/**
1520 *	register_netdevice_notifier - register a network notifier block
1521 *	@nb: notifier
1522 *
1523 *	Register a notifier to be called when network device events occur.
1524 *	The notifier passed is linked into the kernel structures and must
1525 *	not be reused until it has been unregistered. A negative errno code
1526 *	is returned on a failure.
1527 *
1528 * 	When registered all registration and up events are replayed
1529 *	to the new notifier to allow device to have a race free
1530 *	view of the network device list.
 *
 *	If the notifier rejects a replayed %NETDEV_REGISTER, the devices
 *	already replayed are unwound with synthesized down/unregister
 *	events and the notifier is removed from the chain again.
 *	Takes rtnl_lock() internally.
1531 */
1532
1533int register_netdevice_notifier(struct notifier_block *nb)
1534{
1535	struct net_device *dev;
1536	struct net_device *last;
1537	struct net *net;
1538	int err;
1539
1540	rtnl_lock();
1541	err = raw_notifier_chain_register(&netdev_chain, nb);
1542	if (err)
1543		goto unlock;
	/* No replay during the boot phase; err is 0 here. */
1544	if (dev_boot_phase)
1545		goto unlock;
1546	for_each_net(net) {
1547		for_each_netdev(net, dev) {
1548			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1549			err = notifier_to_errno(err);
1550			if (err)
1551				goto rollback;
1552
1553			if (!(dev->flags & IFF_UP))
1554				continue;
1555
			/* The result of the replayed UP event is ignored. */
1556			call_netdevice_notifier(nb, NETDEV_UP, dev);
1557		}
1558	}
1559
1560unlock:
1561	rtnl_unlock();
1562	return err;
1563
1564rollback:
	/*
	 * Walk the lists again in the same order and undo every device
	 * seen before the one whose REGISTER replay failed.
	 */
1565	last = dev;
1566	for_each_net(net) {
1567		for_each_netdev(net, dev) {
1568			if (dev == last)
1569				goto outroll;
1570
1571			if (dev->flags & IFF_UP) {
1572				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1573							dev);
1574				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1575			}
1576			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);

1577		}
1578	}
1579
1580outroll:
	/* err still holds the errno of the failed replay. */
1581	raw_notifier_chain_unregister(&netdev_chain, nb);
1582	goto unlock;
1583}
1584EXPORT_SYMBOL(register_netdevice_notifier);
1585
1586/**
1587 *	unregister_netdevice_notifier - unregister a network notifier block
1588 *	@nb: notifier
1589 *
1590 *	Unregister a notifier previously registered by
1591 *	register_netdevice_notifier(). The notifier is unlinked into the
1592 *	kernel structures and may then be reused. A negative errno code
1593 *	is returned on a failure.
1594 *
1595 * 	After unregistering unregister and down device events are synthesized
1596 *	for all devices on the device list to the removed notifier to remove
1597 *	the need for special case cleanup code.
1598 */
1599
1600int unregister_netdevice_notifier(struct notifier_block *nb)
1601{
1602	struct net_device *dev;
1603	struct net *net;
1604	int err;
1605
1606	rtnl_lock();
1607	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608	if (err)
1609		goto unlock;
1610
1611	for_each_net(net) {
1612		for_each_netdev(net, dev) {
1613			if (dev->flags & IFF_UP) {
1614				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615							dev);
1616				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617			}
1618			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 
1619		}
1620	}
1621unlock:
1622	rtnl_unlock();
1623	return err;
1624}
1625EXPORT_SYMBOL(unregister_netdevice_notifier);
1626
1627/**
1628 *	call_netdevice_notifiers_info - call all network notifier blocks
1629 *	@val: value passed unmodified to notifier function
1630 *	@dev: net_device pointer passed unmodified to notifier function
1631 *	@info: notifier information data
1632 *
1633 *	Call all network notifier blocks.  Parameters and return value
1634 *	are as for raw_notifier_call_chain().
1635 */
1636
1637static int call_netdevice_notifiers_info(unsigned long val,
1638					 struct net_device *dev,
1639					 struct netdev_notifier_info *info)
1640{
1641	ASSERT_RTNL();
1642	netdev_notifier_info_init(info, dev);
1643	return raw_notifier_call_chain(&netdev_chain, val, info);
1644}
1645
1646/**
1647 *	call_netdevice_notifiers - call all network notifier blocks
1648 *      @val: value passed unmodified to notifier function
1649 *      @dev: net_device pointer passed unmodified to notifier function
1650 *
1651 *	Call all network notifier blocks.  Parameters and return value
1652 *	are as for raw_notifier_call_chain().
1653 */
1654
1655int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1656{
1657	struct netdev_notifier_info info;
1658
1659	return call_netdevice_notifiers_info(val, dev, &info);
1660}
1661EXPORT_SYMBOL(call_netdevice_notifiers);
1662
#ifdef CONFIG_NET_INGRESS
/* Static key counting the users that need ingress processing. */
static struct static_key ingress_needed __read_mostly;

/* Take one reference on the ingress static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on the ingress static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1678
#ifdef CONFIG_NET_EGRESS
/* Static key counting the users that need egress processing. */
static struct static_key egress_needed __read_mostly;

/* Take one reference on the egress static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on the egress static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1694
1695static struct static_key netstamp_needed __read_mostly;
1696#ifdef HAVE_JUMP_LABEL
1697/* We are not allowed to call static_key_slow_dec() from irq context
1698 * If net_disable_timestamp() is called from irq context, defer the
1699 * static_key_slow_dec() calls.
1700 */
1701static atomic_t netstamp_needed_deferred;
1702#endif
1703
1704void net_enable_timestamp(void)
1705{
1706#ifdef HAVE_JUMP_LABEL
1707	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1708
1709	if (deferred) {
1710		while (--deferred)
1711			static_key_slow_dec(&netstamp_needed);
1712		return;
1713	}
1714#endif
 
1715	static_key_slow_inc(&netstamp_needed);
1716}
1717EXPORT_SYMBOL(net_enable_timestamp);
1718
1719void net_disable_timestamp(void)
1720{
1721#ifdef HAVE_JUMP_LABEL
1722	if (in_interrupt()) {
1723		atomic_inc(&netstamp_needed_deferred);
1724		return;
1725	}
1726#endif
1727	static_key_slow_dec(&netstamp_needed);
1728}
1729EXPORT_SYMBOL(net_disable_timestamp);
1730
1731static inline void net_timestamp_set(struct sk_buff *skb)
1732{
1733	skb->tstamp.tv64 = 0;
1734	if (static_key_false(&netstamp_needed))
1735		__net_timestamp(skb);
1736}
1737
/*
 * Stamp SKB only when timestamping is enabled, COND is true and SKB does
 * not carry a timestamp yet. The expansion is a bare if statement (not a
 * do { } while (0) block), so do not use it as the unbraced body of an
 * if that has an else branch. The trailing backslash on the last line
 * also pulls the following source line into the macro, so that line must
 * stay blank.
 */
1738#define net_timestamp_check(COND, SKB)			\
1739	if (static_key_false(&netstamp_needed)) {		\
1740		if ((COND) && !(SKB)->tstamp.tv64)	\
1741			__net_timestamp(SKB);		\
1742	}						\
1743
1744bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1745{
1746	unsigned int len;
1747
1748	if (!(dev->flags & IFF_UP))
1749		return false;
1750
1751	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1752	if (skb->len <= len)
1753		return true;
1754
1755	/* if TSO is enabled, we don't care about the length as the packet
1756	 * could be forwarded without being segmented before
1757	 */
1758	if (skb_is_gso(skb))
1759		return true;
1760
1761	return false;
1762}
1763EXPORT_SYMBOL_GPL(is_skb_forwardable);
1764
1765int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1766{
1767	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1768	    unlikely(!is_skb_forwardable(dev, skb))) {
1769		atomic_long_inc(&dev->rx_dropped);
1770		kfree_skb(skb);
1771		return NET_RX_DROP;
1772	}
1773
1774	skb_scrub_packet(skb, true);
1775	skb->priority = 0;
1776	skb->protocol = eth_type_trans(skb, dev);
1777	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1778
1779	return 0;
1780}
1781EXPORT_SYMBOL_GPL(__dev_forward_skb);
1782
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc = __dev_forward_skb(dev, skb);

	/* A non-zero result means the skb was already dropped and freed. */
	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1806
1807static inline int deliver_skb(struct sk_buff *skb,
1808			      struct packet_type *pt_prev,
1809			      struct net_device *orig_dev)
1810{
1811	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1812		return -ENOMEM;
1813	atomic_inc(&skb->users);
1814	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1815}
1816
1817static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1818					  struct packet_type **pt,
1819					  struct net_device *orig_dev,
1820					  __be16 type,
1821					  struct list_head *ptype_list)
1822{
1823	struct packet_type *ptype, *pt_prev = *pt;
1824
1825	list_for_each_entry_rcu(ptype, ptype_list, list) {
1826		if (ptype->type != type)
1827			continue;
1828		if (pt_prev)
1829			deliver_skb(skb, pt_prev, orig_dev);
1830		pt_prev = ptype;
1831	}
1832	*pt = pt_prev;
1833}
1834
1835static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1836{
1837	if (!ptype->af_packet_priv || !skb->sk)
1838		return false;
1839
1840	if (ptype->id_match)
1841		return ptype->id_match(ptype, skb->sk);
1842	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1843		return true;
1844
1845	return false;
1846}
1847
1848/*
1849 *	Support routine. Sends outgoing frames to any network
1850 *	taps currently in use.
 *
 *	The global ptype_all list is walked first, then the per-device
 *	dev->ptype_all list. The skb is cloned at most once, when the
 *	first interested tap is found, and every tap receives that clone.
1851 */
1852
1853static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1854{
1855	struct packet_type *ptype;
1856	struct sk_buff *skb2 = NULL;
1857	struct packet_type *pt_prev = NULL;
1858	struct list_head *ptype_list = &ptype_all;
1859
1860	rcu_read_lock();
1861again:
1862	list_for_each_entry_rcu(ptype, ptype_list, list) {
1863		/* Never send packets back to the socket
1864		 * they originated from - MvS (miquels@drinkel.ow.org)
1865		 */
1866		if (skb_loop_sk(ptype, skb))
1867			continue;





1868
		/*
		 * A clone already exists: deliver it to the previously
		 * found tap and remember this one for the next round.
		 */
1869		if (pt_prev) {
1870			deliver_skb(skb2, pt_prev, skb->dev);
1871			pt_prev = ptype;
1872			continue;
1873		}
1874
1875		/* need to clone skb, done only once */
1876		skb2 = skb_clone(skb, GFP_ATOMIC);
		/* pt_prev is still NULL here, so nothing is delivered below. */
1877		if (!skb2)
1878			goto out_unlock;
1879
1880		net_timestamp_set(skb2);
1881
1882		/* The network header should have been set correctly by
1883		 * the sender, so the range check below is just
1884		 * protection against buggy protocols.
1885		 */
1886		skb_reset_mac_header(skb2);
1887
1888		if (skb_network_header(skb2) < skb2->data ||
1889		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1890			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1891					     ntohs(skb2->protocol),
1892					     dev->name);
1893			skb_reset_network_header(skb2);
1894		}
1895
1896		skb2->transport_header = skb2->network_header;
1897		skb2->pkt_type = PACKET_OUTGOING;
1898		pt_prev = ptype;
1899	}



1900
	/* After the global list, make a second pass over the device's own. */
1901	if (ptype_list == &ptype_all) {
1902		ptype_list = &dev->ptype_all;
1903		goto again;

1904	}
1905out_unlock:
	/*
	 * The last tap is called directly, without the extra reference
	 * that deliver_skb() takes.
	 */
1906	if (pt_prev)
1907		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1908	rcu_read_unlock();
1909}
1910
1911/**
1912 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1913 * @dev: Network device
1914 * @txq: number of queues available
1915 *
1916 * If real_num_tx_queues is changed the tc mappings may no longer be
1917 * valid. To resolve this verify the tc mapping remains valid and if
1918 * not NULL the mapping. With no priorities mapping to this
1919 * offset/count pair it will no longer be used. In the worst case TC0
1920 * is invalid nothing can be done so disable priority mappings. If is
1921 * expected that drivers will fix this mapping if they can before
1922 * calling netif_set_real_num_tx_queues.
1923 */
1924static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1925{
1926	int i;
1927	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1928
1929	/* If TC0 is invalidated disable TC mapping */
1930	if (tc->offset + tc->count > txq) {
1931		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1932		dev->num_tc = 0;
1933		return;
1934	}
1935
1936	/* Invalidated prio to tc mappings set to TC0 */
1937	for (i = 1; i < TC_BITMASK + 1; i++) {
1938		int q = netdev_get_prio_tc_map(dev, i);
1939
1940		tc = &dev->tc_to_txq[q];
1941		if (tc->offset + tc->count > txq) {
1942			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1943				i, q);
1944			netdev_set_prio_tc_map(dev, i, 0);
1945		}
1946	}
1947}
1948
1949#ifdef CONFIG_XPS
/* Mutex held by the XPS map update paths in this file. */
1950static DEFINE_MUTEX(xps_map_mutex);
/*
 * Dereference an RCU-protected XPS pointer from the update side; lockdep
 * checks that xps_map_mutex is held.
 */
1951#define xmap_dereference(P)		\
1952	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1953
1954static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1955					int cpu, u16 index)
1956{
1957	struct xps_map *map = NULL;
1958	int pos;
1959
1960	if (dev_maps)
1961		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1962
1963	for (pos = 0; map && pos < map->len; pos++) {
1964		if (map->queues[pos] == index) {
1965			if (map->len > 1) {
1966				map->queues[pos] = map->queues[--map->len];
1967			} else {
1968				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1969				kfree_rcu(map, rcu);
1970				map = NULL;
1971			}
1972			break;
1973		}
1974	}
1975
1976	return map;
1977}
1978
1979static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1980{
1981	struct xps_dev_maps *dev_maps;
1982	int cpu, i;
1983	bool active = false;
1984
1985	mutex_lock(&xps_map_mutex);
1986	dev_maps = xmap_dereference(dev->xps_maps);
1987
1988	if (!dev_maps)
1989		goto out_no_maps;
1990
1991	for_each_possible_cpu(cpu) {
1992		for (i = index; i < dev->num_tx_queues; i++) {
1993			if (!remove_xps_queue(dev_maps, cpu, i))
1994				break;
1995		}
1996		if (i == dev->num_tx_queues)
1997			active = true;
1998	}
1999
2000	if (!active) {
2001		RCU_INIT_POINTER(dev->xps_maps, NULL);
2002		kfree_rcu(dev_maps, rcu);
2003	}
2004
2005	for (i = index; i < dev->num_tx_queues; i++)
2006		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2007					     NUMA_NO_NODE);
2008
2009out_no_maps:
2010	mutex_unlock(&xps_map_mutex);
2011}
2012
2013static struct xps_map *expand_xps_map(struct xps_map *map,
2014				      int cpu, u16 index)
2015{
2016	struct xps_map *new_map;
2017	int alloc_len = XPS_MIN_MAP_ALLOC;
2018	int i, pos;
2019
2020	for (pos = 0; map && pos < map->len; pos++) {
2021		if (map->queues[pos] != index)
2022			continue;
2023		return map;
2024	}
2025
2026	/* Need to add queue to this CPU's existing map */
2027	if (map) {
2028		if (pos < map->alloc_len)
2029			return map;
2030
2031		alloc_len = map->alloc_len * 2;
2032	}
2033
2034	/* Need to allocate new map to store queue on this CPU's map */
2035	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2036			       cpu_to_node(cpu));
2037	if (!new_map)
2038		return NULL;
2039
2040	for (i = 0; i < pos; i++)
2041		new_map->queues[i] = map->queues[i];
2042	new_map->alloc_len = alloc_len;
2043	new_map->len = pos;
2044
2045	return new_map;
2046}
2047
2048int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2049			u16 index)
2050{
2051	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2052	struct xps_map *map, *new_map;
2053	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2054	int cpu, numa_node_id = -2;
2055	bool active = false;
2056
2057	mutex_lock(&xps_map_mutex);
2058
2059	dev_maps = xmap_dereference(dev->xps_maps);
2060
2061	/* allocate memory for queue storage */
2062	for_each_online_cpu(cpu) {
2063		if (!cpumask_test_cpu(cpu, mask))
2064			continue;
2065
2066		if (!new_dev_maps)
2067			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2068		if (!new_dev_maps) {
2069			mutex_unlock(&xps_map_mutex);
2070			return -ENOMEM;
2071		}
2072
2073		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2074				 NULL;
2075
2076		map = expand_xps_map(map, cpu, index);
2077		if (!map)
2078			goto error;
2079
2080		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2081	}
2082
2083	if (!new_dev_maps)
2084		goto out_no_new_maps;
2085
2086	for_each_possible_cpu(cpu) {
2087		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2088			/* add queue to CPU maps */
2089			int pos = 0;
2090
2091			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2092			while ((pos < map->len) && (map->queues[pos] != index))
2093				pos++;
2094
2095			if (pos == map->len)
2096				map->queues[map->len++] = index;
2097#ifdef CONFIG_NUMA
2098			if (numa_node_id == -2)
2099				numa_node_id = cpu_to_node(cpu);
2100			else if (numa_node_id != cpu_to_node(cpu))
2101				numa_node_id = -1;
2102#endif
2103		} else if (dev_maps) {
2104			/* fill in the new device map from the old device map */
2105			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2106			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2107		}
2108
2109	}
2110
2111	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2112
2113	/* Cleanup old maps */
2114	if (dev_maps) {
2115		for_each_possible_cpu(cpu) {
2116			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2117			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2118			if (map && map != new_map)
2119				kfree_rcu(map, rcu);
2120		}
2121
2122		kfree_rcu(dev_maps, rcu);
2123	}
2124
2125	dev_maps = new_dev_maps;
2126	active = true;
2127
2128out_no_new_maps:
2129	/* update Tx queue numa node */
2130	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2131				     (numa_node_id >= 0) ? numa_node_id :
2132				     NUMA_NO_NODE);
2133
2134	if (!dev_maps)
2135		goto out_no_maps;
2136
2137	/* removes queue from unused CPUs */
2138	for_each_possible_cpu(cpu) {
2139		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2140			continue;
2141
2142		if (remove_xps_queue(dev_maps, cpu, index))
2143			active = true;
2144	}
2145
2146	/* free map if not active */
2147	if (!active) {
2148		RCU_INIT_POINTER(dev->xps_maps, NULL);
2149		kfree_rcu(dev_maps, rcu);
2150	}
2151
2152out_no_maps:
2153	mutex_unlock(&xps_map_mutex);
2154
2155	return 0;
2156error:
2157	/* remove any maps that we added */
2158	for_each_possible_cpu(cpu) {
2159		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2160		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2161				 NULL;
2162		if (new_map && new_map != map)
2163			kfree(new_map);
2164	}
2165
2166	mutex_unlock(&xps_map_mutex);
2167
2168	kfree(new_dev_maps);
2169	return -ENOMEM;
2170}
2171EXPORT_SYMBOL(netif_set_xps_queue);
2172
2173#endif
2174/*
2175 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2176 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2177 */
2178int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2179{
2180	int rc;
2181
2182	if (txq < 1 || txq > dev->num_tx_queues)
2183		return -EINVAL;
2184
2185	if (dev->reg_state == NETREG_REGISTERED ||
2186	    dev->reg_state == NETREG_UNREGISTERING) {
2187		ASSERT_RTNL();
2188
2189		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2190						  txq);
2191		if (rc)
2192			return rc;
2193
2194		if (dev->num_tc)
2195			netif_setup_tc(dev, txq);
2196
2197		if (txq < dev->real_num_tx_queues) {
2198			qdisc_reset_all_tx_gt(dev, txq);
2199#ifdef CONFIG_XPS
2200			netif_reset_xps_queues_gt(dev, txq);
2201#endif
2202		}
2203	}
2204
2205	dev->real_num_tx_queues = txq;
2206	return 0;
2207}
2208EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2209
2210#ifdef CONFIG_SYSFS
2211/**
2212 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2213 *	@dev: Network device
2214 *	@rxq: Actual number of RX queues
2215 *
2216 *	This must be called either with the rtnl_lock held or before
2217 *	registration of the net device.  Returns 0 on success, or a
2218 *	negative error code.  If called before registration, it always
2219 *	succeeds.
2220 */
2221int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2222{
2223	int rc;
2224
2225	if (rxq < 1 || rxq > dev->num_rx_queues)
2226		return -EINVAL;
2227
2228	if (dev->reg_state == NETREG_REGISTERED) {
2229		ASSERT_RTNL();
2230
2231		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2232						  rxq);
2233		if (rc)
2234			return rc;
2235	}
2236
2237	dev->real_num_rx_queues = rxq;
2238	return 0;
2239}
2240EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2241#endif
2242
2243/**
2244 * netif_get_num_default_rss_queues - default number of RSS queues
2245 *
2246 * This routine should set an upper limit on the number of RSS queues
2247 * used by default by multiqueue devices.
2248 */
2249int netif_get_num_default_rss_queues(void)
2250{
2251	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2252}
2253EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2254
2255static inline void __netif_reschedule(struct Qdisc *q)
2256{
2257	struct softnet_data *sd;
2258	unsigned long flags;
2259
2260	local_irq_save(flags);
2261	sd = this_cpu_ptr(&softnet_data);
2262	q->next_sched = NULL;
2263	*sd->output_queue_tailp = q;
2264	sd->output_queue_tailp = &q->next_sched;
2265	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2266	local_irq_restore(flags);
2267}
2268
2269void __netif_schedule(struct Qdisc *q)
2270{
2271	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2272		__netif_reschedule(q);
2273}
2274EXPORT_SYMBOL(__netif_schedule);
2275
2276struct dev_kfree_skb_cb {
2277	enum skb_free_reason reason;
2278};
2279
2280static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2281{
2282	return (struct dev_kfree_skb_cb *)skb->cb;
2283}
2284
2285void netif_schedule_queue(struct netdev_queue *txq)
2286{
2287	rcu_read_lock();
2288	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2289		struct Qdisc *q = rcu_dereference(txq->qdisc);
2290
2291		__netif_schedule(q);
2292	}
2293	rcu_read_unlock();
2294}
2295EXPORT_SYMBOL(netif_schedule_queue);
2296
2297/**
2298 *	netif_wake_subqueue - allow sending packets on subqueue
2299 *	@dev: network device
2300 *	@queue_index: sub queue index
2301 *
2302 * Resume individual transmit queue of a device with multiple transmit queues.
2303 */
2304void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2305{
2306	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2307
2308	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2309		struct Qdisc *q;
2310
2311		rcu_read_lock();
2312		q = rcu_dereference(txq->qdisc);
2313		__netif_schedule(q);
2314		rcu_read_unlock();
2315	}
2316}
2317EXPORT_SYMBOL(netif_wake_subqueue);
2318
2319void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2320{
2321	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2322		struct Qdisc *q;
2323
2324		rcu_read_lock();
2325		q = rcu_dereference(dev_queue->qdisc);
2326		__netif_schedule(q);
2327		rcu_read_unlock();
2328	}
2329}
2330EXPORT_SYMBOL(netif_tx_wake_queue);
2331
2332void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2333{
2334	unsigned long flags;
2335
2336	if (likely(atomic_read(&skb->users) == 1)) {
2337		smp_rmb();
2338		atomic_set(&skb->users, 0);
2339	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2340		return;
2341	}
2342	get_kfree_skb_cb(skb)->reason = reason;
2343	local_irq_save(flags);
2344	skb->next = __this_cpu_read(softnet_data.completion_queue);
2345	__this_cpu_write(softnet_data.completion_queue, skb);
2346	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2347	local_irq_restore(flags);
2348}
2349EXPORT_SYMBOL(__dev_kfree_skb_irq);
2350
2351void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2352{
2353	if (in_irq() || irqs_disabled())
2354		__dev_kfree_skb_irq(skb, reason);
2355	else
2356		dev_kfree_skb(skb);
2357}
2358EXPORT_SYMBOL(__dev_kfree_skb_any);
2359
2360
2361/**
2362 * netif_device_detach - mark device as removed
2363 * @dev: network device
2364 *
2365 * Mark device as removed from system and therefore no longer available.
2366 */
2367void netif_device_detach(struct net_device *dev)
2368{
2369	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2370	    netif_running(dev)) {
2371		netif_tx_stop_all_queues(dev);
2372	}
2373}
2374EXPORT_SYMBOL(netif_device_detach);
2375
2376/**
2377 * netif_device_attach - mark device as attached
2378 * @dev: network device
2379 *
2380 * Mark device as attached from system and restart if needed.
2381 */
2382void netif_device_attach(struct net_device *dev)
2383{
2384	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2385	    netif_running(dev)) {
2386		netif_tx_wake_all_queues(dev);
2387		__netdev_watchdog_up(dev);
2388	}
2389}
2390EXPORT_SYMBOL(netif_device_attach);
2391
2392/*
2393 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2394 * to be used as a distribution range.
2395 */
2396u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2397		  unsigned int num_tx_queues)
2398{
2399	u32 hash;
2400	u16 qoffset = 0;
2401	u16 qcount = num_tx_queues;
2402
2403	if (skb_rx_queue_recorded(skb)) {
2404		hash = skb_get_rx_queue(skb);
2405		while (unlikely(hash >= num_tx_queues))
2406			hash -= num_tx_queues;
2407		return hash;
2408	}
2409
2410	if (dev->num_tc) {
2411		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2412		qoffset = dev->tc_to_txq[tc].offset;
2413		qcount = dev->tc_to_txq[tc].count;
2414	}
2415
2416	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2417}
2418EXPORT_SYMBOL(__skb_tx_hash);
2419
2420static void skb_warn_bad_offload(const struct sk_buff *skb)
2421{
2422	static const netdev_features_t null_features = 0;
2423	struct net_device *dev = skb->dev;
2424	const char *name = "";
2425
2426	if (!net_ratelimit())
2427		return;
2428
2429	if (dev) {
2430		if (dev->dev.parent)
2431			name = dev_driver_string(dev->dev.parent);
2432		else
2433			name = netdev_name(dev);
2434	}
2435	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2436	     "gso_type=%d ip_summed=%d\n",
2437	     name, dev ? &dev->features : &null_features,
2438	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2439	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2440	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2441}
2442
2443/*
2444 * Invalidate hardware checksum when packet is to be mangled, and
2445 * complete checksum manually on outgoing path.
2446 */
2447int skb_checksum_help(struct sk_buff *skb)
2448{
2449	__wsum csum;
2450	int ret = 0, offset;
2451
2452	if (skb->ip_summed == CHECKSUM_COMPLETE)
2453		goto out_set_summed;
2454
2455	if (unlikely(skb_shinfo(skb)->gso_size)) {
2456		skb_warn_bad_offload(skb);
2457		return -EINVAL;
2458	}
2459
2460	/* Before computing a checksum, we should make sure no frag could
2461	 * be modified by an external entity : checksum could be wrong.
2462	 */
2463	if (skb_has_shared_frag(skb)) {
2464		ret = __skb_linearize(skb);
2465		if (ret)
2466			goto out;
2467	}
2468
2469	offset = skb_checksum_start_offset(skb);
2470	BUG_ON(offset >= skb_headlen(skb));
2471	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2472
2473	offset += skb->csum_offset;
2474	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2475
2476	if (skb_cloned(skb) &&
2477	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2478		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2479		if (ret)
2480			goto out;
2481	}
2482
2483	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2484out_set_summed:
2485	skb->ip_summed = CHECKSUM_NONE;
2486out:
2487	return ret;
2488}
2489EXPORT_SYMBOL(skb_checksum_help);
2490
2491/* skb_csum_offload_check - Driver helper function to determine if a device
2492 * with limited checksum offload capabilities is able to offload the checksum
2493 * for a given packet.
2494 *
2495 * Arguments:
2496 *   skb - sk_buff for the packet in question
2497 *   spec - contains the description of what device can offload
2498 *   csum_encapped - returns true if the checksum being offloaded is
2499 *	      encpasulated. That is it is checksum for the transport header
2500 *	      in the inner headers.
2501 *   checksum_help - when set indicates that helper function should
2502 *	      call skb_checksum_help if offload checks fail
2503 *
2504 * Returns:
2505 *   true: Packet has passed the checksum checks and should be offloadable to
2506 *	   the device (a driver may still need to check for additional
2507 *	   restrictions of its device)
2508 *   false: Checksum is not offloadable. If checksum_help was set then
2509 *	   skb_checksum_help was called to resolve checksum for non-GSO
2510 *	   packets and when IP protocol is not SCTP
2511 */
2512bool __skb_csum_offload_chk(struct sk_buff *skb,
2513			    const struct skb_csum_offl_spec *spec,
2514			    bool *csum_encapped,
2515			    bool csum_help)
2516{
2517	struct iphdr *iph;
2518	struct ipv6hdr *ipv6;
2519	void *nhdr;
2520	int protocol;
2521	u8 ip_proto;
2522
2523	if (skb->protocol == htons(ETH_P_8021Q) ||
2524	    skb->protocol == htons(ETH_P_8021AD)) {
2525		if (!spec->vlan_okay)
2526			goto need_help;
2527	}
2528
2529	/* We check whether the checksum refers to a transport layer checksum in
2530	 * the outermost header or an encapsulated transport layer checksum that
2531	 * corresponds to the inner headers of the skb. If the checksum is for
2532	 * something else in the packet we need help.
2533	 */
2534	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2535		/* Non-encapsulated checksum */
2536		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2537		nhdr = skb_network_header(skb);
2538		*csum_encapped = false;
2539		if (spec->no_not_encapped)
2540			goto need_help;
2541	} else if (skb->encapsulation && spec->encap_okay &&
2542		   skb_checksum_start_offset(skb) ==
2543		   skb_inner_transport_offset(skb)) {
2544		/* Encapsulated checksum */
2545		*csum_encapped = true;
2546		switch (skb->inner_protocol_type) {
2547		case ENCAP_TYPE_ETHER:
2548			protocol = eproto_to_ipproto(skb->inner_protocol);
2549			break;
2550		case ENCAP_TYPE_IPPROTO:
2551			protocol = skb->inner_protocol;
2552			break;
2553		}
2554		nhdr = skb_inner_network_header(skb);
2555	} else {
2556		goto need_help;
2557	}
2558
2559	switch (protocol) {
2560	case IPPROTO_IP:
2561		if (!spec->ipv4_okay)
2562			goto need_help;
2563		iph = nhdr;
2564		ip_proto = iph->protocol;
2565		if (iph->ihl != 5 && !spec->ip_options_okay)
2566			goto need_help;
2567		break;
2568	case IPPROTO_IPV6:
2569		if (!spec->ipv6_okay)
2570			goto need_help;
2571		if (spec->no_encapped_ipv6 && *csum_encapped)
2572			goto need_help;
2573		ipv6 = nhdr;
2574		nhdr += sizeof(*ipv6);
2575		ip_proto = ipv6->nexthdr;
2576		break;
2577	default:
2578		goto need_help;
2579	}
2580
2581ip_proto_again:
2582	switch (ip_proto) {
2583	case IPPROTO_TCP:
2584		if (!spec->tcp_okay ||
2585		    skb->csum_offset != offsetof(struct tcphdr, check))
2586			goto need_help;
2587		break;
2588	case IPPROTO_UDP:
2589		if (!spec->udp_okay ||
2590		    skb->csum_offset != offsetof(struct udphdr, check))
2591			goto need_help;
2592		break;
2593	case IPPROTO_SCTP:
2594		if (!spec->sctp_okay ||
2595		    skb->csum_offset != offsetof(struct sctphdr, checksum))
2596			goto cant_help;
2597		break;
2598	case NEXTHDR_HOP:
2599	case NEXTHDR_ROUTING:
2600	case NEXTHDR_DEST: {
2601		u8 *opthdr = nhdr;
2602
2603		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2604			goto need_help;
2605
2606		ip_proto = opthdr[0];
2607		nhdr += (opthdr[1] + 1) << 3;
2608
2609		goto ip_proto_again;
2610	}
2611	default:
2612		goto need_help;
2613	}
2614
2615	/* Passed the tests for offloading checksum */
2616	return true;
2617
2618need_help:
2619	if (csum_help && !skb_shinfo(skb)->gso_size)
2620		skb_checksum_help(skb);
2621cant_help:
2622	return false;
2623}
2624EXPORT_SYMBOL(__skb_csum_offload_chk);
2625
2626__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2627{
2628	__be16 type = skb->protocol;
2629
2630	/* Tunnel gso handlers can set protocol to ethernet. */
2631	if (type == htons(ETH_P_TEB)) {
2632		struct ethhdr *eth;
2633
2634		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2635			return 0;
2636
2637		eth = (struct ethhdr *)skb_mac_header(skb);
2638		type = eth->h_proto;
2639	}
2640
2641	return __vlan_get_protocol(skb, type, depth);
2642}
2643
2644/**
2645 *	skb_mac_gso_segment - mac layer segmentation handler.
2646 *	@skb: buffer to segment
2647 *	@features: features for the output path (see dev->features)
 
 
 
 
 
2648 */
2649struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2650				    netdev_features_t features)
2651{
2652	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2653	struct packet_offload *ptype;
2654	int vlan_depth = skb->mac_len;
2655	__be16 type = skb_network_protocol(skb, &vlan_depth);
 
2656
2657	if (unlikely(!type))
2658		return ERR_PTR(-EINVAL);
2659
2660	__skb_pull(skb, vlan_depth);
 
2661
2662	rcu_read_lock();
2663	list_for_each_entry_rcu(ptype, &offload_base, list) {
2664		if (ptype->type == type && ptype->callbacks.gso_segment) {
2665			segs = ptype->callbacks.gso_segment(skb, features);
2666			break;
2667		}
2668	}
2669	rcu_read_unlock();
2670
2671	__skb_push(skb, skb->data - skb_mac_header(skb));
2672
2673	return segs;
2674}
2675EXPORT_SYMBOL(skb_mac_gso_segment);
2676
2677
2678/* openvswitch calls this on rx path, so we need a different check.
2679 */
2680static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2681{
2682	if (tx_path)
2683		return skb->ip_summed != CHECKSUM_PARTIAL;
2684	else
2685		return skb->ip_summed == CHECKSUM_NONE;
2686}
2687
2688/**
2689 *	__skb_gso_segment - Perform segmentation on skb.
2690 *	@skb: buffer to segment
2691 *	@features: features for the output path (see dev->features)
2692 *	@tx_path: whether it is called in TX path
2693 *
2694 *	This function segments the given skb and returns a list of segments.
2695 *
2696 *	It may return NULL if the skb requires no segmentation.  This is
2697 *	only possible when GSO is used for verifying header integrity.
2698 *
2699 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2700 */
2701struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2702				  netdev_features_t features, bool tx_path)
2703{
2704	if (unlikely(skb_needs_check(skb, tx_path))) {
2705		int err;
2706
 
2707		skb_warn_bad_offload(skb);
2708
2709		err = skb_cow_head(skb, 0);
2710		if (err < 0)
2711			return ERR_PTR(err);
2712	}
2713
2714	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2715		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2716
2717	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2718	SKB_GSO_CB(skb)->encap_level = 0;
 
 
 
 
 
 
 
 
 
 
 
 
2719
2720	skb_reset_mac_header(skb);
2721	skb_reset_mac_len(skb);
2722
2723	return skb_mac_gso_segment(skb, features);
2724}
2725EXPORT_SYMBOL(__skb_gso_segment);
2726
/* Take action when hardware reception checksum errors are detected:
 * log the device name (rate limited) together with a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2738
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when some page fragment of @skb cannot be DMA'd by @dev
 * (highmem page without NETIF_F_HIGHDMA, or page beyond the parent
 * device's DMA mask), 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	/* every fragment page must end below the device's DMA mask */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2771
2772/* If MPLS offload request, verify we are testing hardware MPLS features
2773 * instead of standard features for the netdev.
2774 */
2775#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2776static netdev_features_t net_mpls_features(struct sk_buff *skb,
2777					   netdev_features_t features,
2778					   __be16 type)
2779{
2780	if (eth_p_mpls(type))
2781		features &= skb->dev->mpls_features;
2782
2783	return features;
2784}
2785#else
2786static netdev_features_t net_mpls_features(struct sk_buff *skb,
2787					   netdev_features_t features,
2788					   __be16 type)
2789{
2790	return features;
 
 
 
 
 
 
 
 
 
 
 
 
2791}
2792#endif
2793
2794static netdev_features_t harmonize_features(struct sk_buff *skb,
2795	netdev_features_t features)
 
 
 
 
 
 
 
2796{
2797	int tmp;
2798	__be16 type;
2799
2800	type = skb_network_protocol(skb, &tmp);
2801	features = net_mpls_features(skb, features, type);
2802
2803	if (skb->ip_summed != CHECKSUM_NONE &&
2804	    !can_checksum_protocol(features, type)) {
2805		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2806	} else if (illegal_highdma(skb->dev, skb)) {
2807		features &= ~NETIF_F_SG;
2808	}
2809
2810	return features;
 
 
 
 
 
 
 
2811}
2812
2813netdev_features_t passthru_features_check(struct sk_buff *skb,
2814					  struct net_device *dev,
2815					  netdev_features_t features)
2816{
2817	return features;
 
 
 
 
 
 
2818}
2819EXPORT_SYMBOL(passthru_features_check);
2820
2821static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2822					     struct net_device *dev,
2823					     netdev_features_t features)
2824{
2825	return vlan_features_check(skb, features);
 
 
 
 
 
 
 
2826}
2827
2828netdev_features_t netif_skb_features(struct sk_buff *skb)
2829{
2830	struct net_device *dev = skb->dev;
2831	netdev_features_t features = dev->features;
2832	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2833
2834	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2835		features &= ~NETIF_F_GSO_MASK;
2836
2837	/* If encapsulation offload request, verify we are testing
2838	 * hardware encapsulation features instead of standard
2839	 * features for the netdev
2840	 */
2841	if (skb->encapsulation)
2842		features &= dev->hw_enc_features;
2843
2844	if (skb_vlan_tagged(skb))
2845		features = netdev_intersect_features(features,
2846						     dev->vlan_features |
2847						     NETIF_F_HW_VLAN_CTAG_TX |
2848						     NETIF_F_HW_VLAN_STAG_TX);
2849
2850	if (dev->netdev_ops->ndo_features_check)
2851		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2852								features);
2853	else
2854		features &= dflt_features_check(skb, dev, features);
2855
2856	return harmonize_features(skb, features);
 
 
 
 
 
 
2857}
2858EXPORT_SYMBOL(netif_skb_features);
2859
2860static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2861		    struct netdev_queue *txq, bool more)
2862{
2863	unsigned int len;
2864	int rc;
2865
2866	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2867		dev_queue_xmit_nit(skb, dev);
2868
2869	len = skb->len;
2870	trace_net_dev_start_xmit(skb, dev);
2871	rc = netdev_start_xmit(skb, dev, txq, more);
2872	trace_net_dev_xmit(skb, rc, dev, len);
2873
2874	return rc;
2875}
2876
2877struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2878				    struct netdev_queue *txq, int *ret)
2879{
2880	struct sk_buff *skb = first;
2881	int rc = NETDEV_TX_OK;
 
2882
2883	while (skb) {
2884		struct sk_buff *next = skb->next;
2885
2886		skb->next = NULL;
2887		rc = xmit_one(skb, dev, txq, next != NULL);
2888		if (unlikely(!dev_xmit_complete(rc))) {
2889			skb->next = next;
2890			goto out;
2891		}
2892
2893		skb = next;
2894		if (netif_xmit_stopped(txq) && skb) {
2895			rc = NETDEV_TX_BUSY;
2896			break;
2897		}
2898	}
2899
2900out:
2901	*ret = rc;
2902	return skb;
2903}
2904
2905static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2906					  netdev_features_t features)
2907{
2908	if (skb_vlan_tag_present(skb) &&
2909	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2910		skb = __vlan_hwaccel_push_inside(skb);
2911	return skb;
2912}
2913
2914static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2915{
2916	netdev_features_t features;
2917
2918	if (skb->next)
2919		return skb;
 
 
 
 
 
 
 
2920
2921	features = netif_skb_features(skb);
2922	skb = validate_xmit_vlan(skb, features);
2923	if (unlikely(!skb))
2924		goto out_null;
2925
2926	if (netif_needs_gso(skb, features)) {
2927		struct sk_buff *segs;
2928
2929		segs = skb_gso_segment(skb, features);
2930		if (IS_ERR(segs)) {
2931			goto out_kfree_skb;
2932		} else if (segs) {
2933			consume_skb(skb);
2934			skb = segs;
2935		}
2936	} else {
2937		if (skb_needs_linearize(skb, features) &&
2938		    __skb_linearize(skb))
2939			goto out_kfree_skb;
2940
2941		/* If packet is not checksummed and device does not
2942		 * support checksumming for this protocol, complete
2943		 * checksumming here.
2944		 */
2945		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2946			if (skb->encapsulation)
2947				skb_set_inner_transport_header(skb,
2948							       skb_checksum_start_offset(skb));
2949			else
2950				skb_set_transport_header(skb,
2951							 skb_checksum_start_offset(skb));
2952			if (!(features & NETIF_F_CSUM_MASK) &&
2953			    skb_checksum_help(skb))
2954				goto out_kfree_skb;
 
2955		}
 
 
 
 
 
 
 
2956	}
2957
2958	return skb;
 
 
 
 
 
2959
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2960out_kfree_skb:
2961	kfree_skb(skb);
2962out_null:
2963	return NULL;
2964}
2965
2966struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
 
 
 
 
 
 
 
2967{
2968	struct sk_buff *next, *head = NULL, *tail;
 
 
2969
2970	for (; skb != NULL; skb = next) {
2971		next = skb->next;
2972		skb->next = NULL;
 
 
 
2973
2974		/* in case skb wont be segmented, point to itself */
2975		skb->prev = skb;
 
 
 
2976
2977		skb = validate_xmit_skb(skb, dev);
2978		if (!skb)
2979			continue;
 
 
2980
2981		if (!head)
2982			head = skb;
2983		else
2984			tail->next = skb;
2985		/* If skb was segmented, skb->prev points to
2986		 * the last segment. If not, it still contains skb.
2987		 */
2988		tail = skb->prev;
 
 
 
2989	}
2990	return head;
2991}
2992
2993static void qdisc_pkt_len_init(struct sk_buff *skb)
2994{
2995	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 
 
 
2996
2997	qdisc_skb_cb(skb)->pkt_len = skb->len;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2998
2999	/* To get more precise estimation of bytes sent on wire,
3000	 * we add to pkt_len the headers size of all segments
3001	 */
3002	if (shinfo->gso_size)  {
3003		unsigned int hdr_len;
3004		u16 gso_segs = shinfo->gso_segs;
3005
3006		/* mac layer + network layer */
3007		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3008
3009		/* + transport layer */
3010		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3011			hdr_len += tcp_hdrlen(skb);
3012		else
3013			hdr_len += sizeof(struct udphdr);
3014
3015		if (shinfo->gso_type & SKB_GSO_DODGY)
3016			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3017						shinfo->gso_size);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3018
3019		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 
 
 
3020	}
 
 
 
3021}
3022
3023static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3024				 struct net_device *dev,
3025				 struct netdev_queue *txq)
3026{
3027	spinlock_t *root_lock = qdisc_lock(q);
3028	bool contended;
3029	int rc;
3030
 
3031	qdisc_calculate_pkt_len(skb, q);
3032	/*
3033	 * Heuristic to force contended enqueues to serialize on a
3034	 * separate lock before trying to get qdisc main lock.
3035	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
3036	 * often and dequeue packets faster.
3037	 */
3038	contended = qdisc_is_running(q);
3039	if (unlikely(contended))
3040		spin_lock(&q->busylock);
3041
3042	spin_lock(root_lock);
3043	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3044		kfree_skb(skb);
3045		rc = NET_XMIT_DROP;
3046	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3047		   qdisc_run_begin(q)) {
3048		/*
3049		 * This is a work-conserving queue; there are no old skbs
3050		 * waiting to be sent out; and the qdisc is not running -
3051		 * xmit the skb directly.
3052		 */
 
 
3053
3054		qdisc_bstats_update(q, skb);
3055
3056		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3057			if (unlikely(contended)) {
3058				spin_unlock(&q->busylock);
3059				contended = false;
3060			}
3061			__qdisc_run(q);
3062		} else
3063			qdisc_run_end(q);
3064
3065		rc = NET_XMIT_SUCCESS;
3066	} else {
 
3067		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3068		if (qdisc_run_begin(q)) {
3069			if (unlikely(contended)) {
3070				spin_unlock(&q->busylock);
3071				contended = false;
3072			}
3073			__qdisc_run(q);
3074		}
3075	}
3076	spin_unlock(root_lock);
3077	if (unlikely(contended))
3078		spin_unlock(&q->busylock);
3079	return rc;
3080}
3081
3082#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3083static void skb_update_prio(struct sk_buff *skb)
3084{
3085	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3086
3087	if (!skb->priority && skb->sk && map) {
3088		unsigned int prioidx =
3089			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3090
3091		if (prioidx < map->priomap_len)
3092			skb->priority = map->priomap[prioidx];
3093	}
3094}
3095#else
3096#define skb_update_prio(skb)
3097#endif
3098
3099DEFINE_PER_CPU(int, xmit_recursion);
3100EXPORT_SYMBOL(xmit_recursion);
3101
3102#define RECURSION_LIMIT 10
3103
3104/**
3105 *	dev_loopback_xmit - loop back @skb
3106 *	@net: network namespace this loopback is happening in
3107 *	@sk:  sk needed to be a netfilter okfn
3108 *	@skb: buffer to transmit
3109 */
3110int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3111{
3112	skb_reset_mac_header(skb);
3113	__skb_pull(skb, skb_network_offset(skb));
3114	skb->pkt_type = PACKET_LOOPBACK;
3115	skb->ip_summed = CHECKSUM_UNNECESSARY;
3116	WARN_ON(!skb_dst(skb));
3117	skb_dst_force(skb);
3118	netif_rx_ni(skb);
3119	return 0;
3120}
3121EXPORT_SYMBOL(dev_loopback_xmit);
3122
#ifdef CONFIG_NET_EGRESS
/*
 * Run the egress classifier chain of @dev on @skb.  Returns @skb when
 * transmission should continue, or NULL when the classifier verdict took
 * the packet away (dropped, stolen, queued or redirected); in that case
 * *@ret holds the NET_XMIT_* code to report.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result cl_res;

	if (!cl)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3165
/*
 * Look up the XPS Tx queue for @skb from the map of the CPU recorded in
 * skb->sender_cpu (stored off by one).  Returns the queue index, or -1
 * when no map applies, the chosen queue is not below
 * dev->real_num_tx_queues, or CONFIG_XPS is disabled.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	map = rcu_dereference(dev_maps->cpu_map[skb->sender_cpu - 1]);
	if (!map)
		goto unlock;

	/* a single-entry map needs no hashing */
	if (map->len == 1)
		queue_index = map->queues[0];
	else
		queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
							   map->len)];

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
3195
3196static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3197{
3198	struct sock *sk = skb->sk;
3199	int queue_index = sk_tx_queue_get(sk);
3200
3201	if (queue_index < 0 || skb->ooo_okay ||
3202	    queue_index >= dev->real_num_tx_queues) {
3203		int new_index = get_xps_queue(dev, skb);
3204		if (new_index < 0)
3205			new_index = skb_tx_hash(dev, skb);
3206
3207		if (queue_index != new_index && sk &&
3208		    sk_fullsock(sk) &&
3209		    rcu_access_pointer(sk->sk_dst_cache))
3210			sk_tx_queue_set(sk, new_index);
3211
3212		queue_index = new_index;
3213	}
3214
3215	return queue_index;
3216}
3217
3218struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3219				    struct sk_buff *skb,
3220				    void *accel_priv)
3221{
3222	int queue_index = 0;
3223
3224#ifdef CONFIG_XPS
3225	u32 sender_cpu = skb->sender_cpu - 1;
3226
3227	if (sender_cpu >= (u32)NR_CPUS)
3228		skb->sender_cpu = raw_smp_processor_id() + 1;
3229#endif
3230
3231	if (dev->real_num_tx_queues != 1) {
3232		const struct net_device_ops *ops = dev->netdev_ops;
3233		if (ops->ndo_select_queue)
3234			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3235							    __netdev_pick_tx);
3236		else
3237			queue_index = __netdev_pick_tx(dev, skb);
3238
3239		if (!accel_priv)
3240			queue_index = netdev_cap_txqueue(dev, queue_index);
3241	}
3242
3243	skb_set_queue_mapping(skb, queue_index);
3244	return netdev_get_tx_queue(dev, queue_index);
3245}
3246
3247/**
3248 *	__dev_queue_xmit - transmit a buffer
3249 *	@skb: buffer to transmit
3250 *	@accel_priv: private data used for L2 forwarding offload
3251 *
3252 *	Queue a buffer for transmission to a network device. The caller must
3253 *	have set the device and priority and built the buffer before calling
3254 *	this function. The function can be called from an interrupt.
3255 *
3256 *	A negative errno code is returned on a failure. A success does not
3257 *	guarantee the frame will be transmitted as it may be dropped due
3258 *	to congestion or traffic shaping.
3259 *
3260 * -----------------------------------------------------------------------------------
3261 *      I notice this method can also return errors from the queue disciplines,
3262 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3263 *      be positive.
3264 *
3265 *      Regardless of the return value, the skb is consumed, so it is currently
3266 *      difficult to retry a send to this method.  (You can bump the ref count
3267 *      before sending to hold a reference for retry if you are careful.)
3268 *
3269 *      When calling this method, interrupts MUST be enabled.  This is because
3270 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3271 *          --BLG
3272 */
3273static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3274{
3275	struct net_device *dev = skb->dev;
3276	struct netdev_queue *txq;
3277	struct Qdisc *q;
	/* rc is still -ENOMEM when validate_xmit_skb() below hands back NULL */
3278	int rc = -ENOMEM;
3279
3280	skb_reset_mac_header(skb);
3281
3282	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3283		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3284
3285	/* Disable soft irqs for various locks below. Also
3286	 * stops preemption for RCU.
3287	 */
3288	rcu_read_lock_bh();
3289
3290	skb_update_prio(skb);
3291
3292	qdisc_pkt_len_init(skb);
3293#ifdef CONFIG_NET_CLS_ACT
3294	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3295# ifdef CONFIG_NET_EGRESS
	/* A NULL return means the egress hook already disposed of the skb
	 * and stored the result in rc.
	 */
3296	if (static_key_false(&egress_needed)) {
3297		skb = sch_handle_egress(skb, &rc, dev);
3298		if (!skb)
3299			goto out;
3300	}
3301# endif
3302#endif
3303	/* If device/qdisc don't need skb->dst, release it right now while
3304	 * its hot in this cpu cache.
3305	 */
3306	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3307		skb_dst_drop(skb);
3308	else
3309		skb_dst_force(skb);
3310
3311#ifdef CONFIG_NET_SWITCHDEV
3312	/* Don't forward if offload device already forwarded */
3313	if (skb->offload_fwd_mark &&
3314	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
3315		consume_skb(skb);
3316		rc = NET_XMIT_SUCCESS;
3317		goto out;
3318	}
3319#endif
3320
3321	txq = netdev_pick_tx(dev, skb, accel_priv);
3322	q = rcu_dereference_bh(txq->qdisc);
3323
3324	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue hook takes the packet from here */
3325	if (q->enqueue) {
3326		rc = __dev_xmit_skb(skb, q, dev, txq);
3327		goto out;
3328	}
3329
3330	/* The device has no queue. Common case for software devices:
3331	   loopback, all the sorts of tunnels...
3332
3333	   Really, it is unlikely that netif_tx_lock protection is necessary
3334	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3335	   counters.)
3336	   However, it is possible, that they rely on protection
3337	   made by us here.
3338
3339	   Check this and shot the lock. It is not prone from deadlocks.
3340	   Either shot noqueue qdisc, it is even simpler 8)
3341	 */
3342	if (dev->flags & IFF_UP) {
3343		int cpu = smp_processor_id(); /* ok because BHs are off */
3344
		/* The queue's xmit lock being owned by this very CPU is
		 * treated as recursion (else branch below).
		 */
3345		if (txq->xmit_lock_owner != cpu) {
3346
3347			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3348				goto recursion_alert;
3349
3350			skb = validate_xmit_skb(skb, dev);
3351			if (!skb)
3352				goto drop;
3353
3354			HARD_TX_LOCK(dev, txq, cpu);
3355
			/* xmit_recursion is raised only around the driver
			 * transmit call.
			 */
3356			if (!netif_xmit_stopped(txq)) {
3357				__this_cpu_inc(xmit_recursion);
3358				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3359				__this_cpu_dec(xmit_recursion);
3360				if (dev_xmit_complete(rc)) {
3361					HARD_TX_UNLOCK(dev, txq);
3362					goto out;
3363				}
3364			}
3365			HARD_TX_UNLOCK(dev, txq);
3366			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3367					     dev->name);
3368		} else {
3369			/* Recursion is detected! It is possible,
3370			 * unfortunately
3371			 */
3372recursion_alert:
3373			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3374					     dev->name);
3375		}
3376	}
3377
	/* Reached when the device is not up, when the transmit above did
	 * not complete, or when recursion was detected.
	 */
3378	rc = -ENETDOWN;
3379drop:
3380	rcu_read_unlock_bh();
3381
	/* skb is NULL here when arriving from the validate_xmit_skb()
	 * failure path.
	 */
3382	atomic_long_inc(&dev->tx_dropped);
3383	kfree_skb_list(skb);
3384	return rc;
3385out:
3386	rcu_read_unlock_bh();
3387	return rc;
3388}
3389
3390int dev_queue_xmit(struct sk_buff *skb)
3391{
3392	return __dev_queue_xmit(skb, NULL);
3393}
3394EXPORT_SYMBOL(dev_queue_xmit);
3395
/*
 * dev_queue_xmit_accel - queue @skb for transmission, handing @accel_priv
 * through to __dev_queue_xmit() unchanged.
 *
 * Returns whatever __dev_queue_xmit() returns.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3401
3402
3403/*=======================================================================
3404			Receiver routines
3405  =======================================================================*/
3406
/* Longest per-CPU input_pkt_queue that enqueue_to_backlog() will still add
 * to; skb_flow_limit() starts working at half of this value.
 */
3407int netdev_max_backlog __read_mostly = 1000;
3408EXPORT_SYMBOL(netdev_max_backlog);
3409
/* Passed to net_timestamp_check() as-is by netif_rx_internal() and
 * netif_receive_skb_internal(), and negated by __netif_receive_skb_core().
 */
3410int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): netdev_budget and weight_p are not referenced in this part
 * of the file; presumably they bound softirq/NAPI polling work - confirm.
 */
3411int netdev_budget __read_mostly = 300;
3412int weight_p __read_mostly = 64;            /* old backlog weight */
3413
3414/* Called with irq disabled */
3415static inline void ____napi_schedule(struct softnet_data *sd,
3416				     struct napi_struct *napi)
3417{
3418	list_add_tail(&napi->poll_list, &sd->poll_list);
3419	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3420}
3421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3422#ifdef CONFIG_RPS
3423
3424/* One global table that all flow-based protocols share. */
3425struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3426EXPORT_SYMBOL(rps_sock_flow_table);
/* Selects the CPU bits of a sock flow table entry; get_rps_cpu() compares
 * the remaining bits of the entry against the flow hash.
 */
3427u32 rps_cpu_mask __read_mostly;
3428EXPORT_SYMBOL(rps_cpu_mask);
3429
/* Static key tested by netif_rx_internal() and netif_receive_skb_internal()
 * before any RPS lookup is attempted.
 */
3430struct static_key rps_needed __read_mostly;
3431
3432static struct rps_dev_flow *
3433set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3434	    struct rps_dev_flow *rflow, u16 next_cpu)
3435{
3436	if (next_cpu < nr_cpu_ids) {
3437#ifdef CONFIG_RFS_ACCEL
3438		struct netdev_rx_queue *rxqueue;
3439		struct rps_dev_flow_table *flow_table;
3440		struct rps_dev_flow *old_rflow;
3441		u32 flow_id;
3442		u16 rxq_index;
3443		int rc;
3444
3445		/* Should we steer this flow to a different hardware queue? */
3446		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3447		    !(dev->features & NETIF_F_NTUPLE))
3448			goto out;
3449		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3450		if (rxq_index == skb_get_rx_queue(skb))
3451			goto out;
3452
3453		rxqueue = dev->_rx + rxq_index;
3454		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3455		if (!flow_table)
3456			goto out;
3457		flow_id = skb_get_hash(skb) & flow_table->mask;
3458		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3459							rxq_index, flow_id);
3460		if (rc < 0)
3461			goto out;
3462		old_rflow = rflow;
3463		rflow = &flow_table->flows[flow_id];
3464		rflow->filter = rc;
3465		if (old_rflow->filter == rflow->filter)
3466			old_rflow->filter = RPS_NO_FILTER;
3467	out:
3468#endif
3469		rflow->last_qtail =
3470			per_cpu(softnet_data, next_cpu).input_queue_head;
3471	}
3472
3473	rflow->cpu = next_cpu;
3474	return rflow;
3475}
3476
3477/*
3478 * get_rps_cpu is called from netif_receive_skb and returns the target
3479 * CPU from the RPS map of the receiving queue for a given skb.
3480 * rcu_read_lock must be held on entry.
3481 */
3482static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3483		       struct rps_dev_flow **rflowp)
3484{
3485	const struct rps_sock_flow_table *sock_flow_table;
3486	struct netdev_rx_queue *rxqueue = dev->_rx;
3487	struct rps_dev_flow_table *flow_table;
3488	struct rps_map *map;
	/* -1 is returned unless an online target CPU is found below */
3489	int cpu = -1;
3490	u32 tcpu;
3491	u32 hash;
3492
	/* Use the state of the RX queue recorded in the skb (queue 0
	 * otherwise); an out-of-range index ends the lookup with -1.
	 */
3493	if (skb_rx_queue_recorded(skb)) {
3494		u16 index = skb_get_rx_queue(skb);
3495
3496		if (unlikely(index >= dev->real_num_rx_queues)) {
3497			WARN_ONCE(dev->real_num_rx_queues > 1,
3498				  "%s received packet on queue %u, but number "
3499				  "of RX queues is %u\n",
3500				  dev->name, index, dev->real_num_rx_queues);
3501			goto done;
3502		}
3503		rxqueue += index;
3504	}
3505
3506	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3507
3508	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3509	map = rcu_dereference(rxqueue->rps_map);
3510	if (!flow_table && !map)
3511		goto done;
3512
3513	skb_reset_network_header(skb);
3514	hash = skb_get_hash(skb);
	/* A zero hash gives up on steering */
3515	if (!hash)
3516		goto done;
3517
3518	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3519	if (flow_table && sock_flow_table) {
3520		struct rps_dev_flow *rflow;
3521		u32 next_cpu;
3522		u32 ident;
3523
3524		/* First check into global flow table if there is a match */
3525		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* Bits outside rps_cpu_mask must equal those of the hash,
		 * otherwise the entry is not a match for this flow.
		 */
3526		if ((ident ^ hash) & ~rps_cpu_mask)
3527			goto try_rps;
3528
3529		next_cpu = ident & rps_cpu_mask;
3530
3531		/* OK, now we know there is a match,
3532		 * we can look at the local (per receive queue) flow table
3533		 */
3534		rflow = &flow_table->flows[hash & flow_table->mask];
3535		tcpu = rflow->cpu;
3536
3537		/*
3538		 * If the desired CPU (where last recvmsg was done) is
3539		 * different from current CPU (one in the rx-queue flow
3540		 * table entry), switch if one of the following holds:
3541		 *   - Current CPU is unset (>= nr_cpu_ids).
3542		 *   - Current CPU is offline.
3543		 *   - The current CPU's queue tail has advanced beyond the
3544		 *     last packet that was enqueued using this table entry.
3545		 *     This guarantees that all previous packets for the flow
3546		 *     have been dequeued, thus preserving in order delivery.
3547		 */
3548		if (unlikely(tcpu != next_cpu) &&
3549		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3550		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3551		      rflow->last_qtail)) >= 0)) {
3552			tcpu = next_cpu;
3553			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3554		}
3555
		/* *rflowp is only written when a flow-table CPU is returned */
3556		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3557			*rflowp = rflow;
3558			cpu = tcpu;
3559			goto done;
3560		}
3561	}
3562
3563try_rps:
3564
	/* Fallback: take a CPU from the queue's RPS map, at the index given
	 * by reciprocal_scale(hash, map->len).
	 */
3565	if (map) {
3566		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3567		if (cpu_online(tcpu)) {
3568			cpu = tcpu;
3569			goto done;
3570		}
3571	}
3572
3573done:
3574	return cpu;
3575}
3576
3577#ifdef CONFIG_RFS_ACCEL
3578
3579/**
3580 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3581 * @dev: Device on which the filter was set
3582 * @rxq_index: RX queue index
3583 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3584 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3585 *
3586 * Drivers that implement ndo_rx_flow_steer() should periodically call
3587 * this function for each installed filter and remove the filters for
3588 * which it returns %true.
3589 */
3590bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3591			 u32 flow_id, u16 filter_id)
3592{
3593	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3594	struct rps_dev_flow_table *flow_table;
3595	struct rps_dev_flow *rflow;
3596	bool expire = true;
3597	unsigned int cpu;
3598
3599	rcu_read_lock();
3600	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3601	if (flow_table && flow_id <= flow_table->mask) {
3602		rflow = &flow_table->flows[flow_id];
3603		cpu = ACCESS_ONCE(rflow->cpu);
3604		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3605		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3606			   rflow->last_qtail) <
3607		     (int)(10 * flow_table->mask)))
3608			expire = false;
3609	}
3610	rcu_read_unlock();
3611	return expire;
3612}
3613EXPORT_SYMBOL(rps_may_expire_flow);
3614
3615#endif /* CONFIG_RFS_ACCEL */
3616
3617/* Called from hardirq (IPI) context */
3618static void rps_trigger_softirq(void *data)
3619{
3620	struct softnet_data *sd = data;
3621
3622	____napi_schedule(sd, &sd->backlog);
3623	sd->received_rps++;
3624}
3625
3626#endif /* CONFIG_RPS */
3627
/*
 * Tell whether @sd belongs to another CPU.
 *
 * If it does (CONFIG_RPS only), @sd is pushed onto the head of this CPU's
 * rps_ipi_list, NET_RX_SOFTIRQ is raised, and 1 is returned.
 * Otherwise nothing is done and 0 is returned.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3648
3649#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): not referenced in this part of the file; presumably the
 * size of the bucket table that skb_flow_limit() indexes - confirm.
 */
3650int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3651#endif
3652
3653static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3654{
3655#ifdef CONFIG_NET_FLOW_LIMIT
3656	struct sd_flow_limit *fl;
3657	struct softnet_data *sd;
3658	unsigned int old_flow, new_flow;
3659
3660	if (qlen < (netdev_max_backlog >> 1))
3661		return false;
3662
3663	sd = this_cpu_ptr(&softnet_data);
3664
3665	rcu_read_lock();
3666	fl = rcu_dereference(sd->flow_limit);
3667	if (fl) {
3668		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3669		old_flow = fl->history[fl->history_head];
3670		fl->history[fl->history_head] = new_flow;
3671
3672		fl->history_head++;
3673		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3674
3675		if (likely(fl->buckets[old_flow]))
3676			fl->buckets[old_flow]--;
3677
3678		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3679			fl->count++;
3680			rcu_read_unlock();
3681			return true;
3682		}
3683	}
3684	rcu_read_unlock();
3685#endif
3686	return false;
3687}
3688
3689/*
3690 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3691 * queue (may be a remote CPU queue).
3692 */
3693static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3694			      unsigned int *qtail)
3695{
3696	struct softnet_data *sd;
3697	unsigned long flags;
3698	unsigned int qlen;
3699
3700	sd = &per_cpu(softnet_data, cpu);
3701
3702	local_irq_save(flags);
3703
3704	rps_lock(sd);
3705	if (!netif_running(skb->dev))
3706		goto drop;
3707	qlen = skb_queue_len(&sd->input_pkt_queue);
3708	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3709		if (qlen) {
3710enqueue:
3711			__skb_queue_tail(&sd->input_pkt_queue, skb);
3712			input_queue_tail_incr_save(sd, qtail);
3713			rps_unlock(sd);
3714			local_irq_restore(flags);
3715			return NET_RX_SUCCESS;
3716		}
3717
3718		/* Schedule NAPI for backlog device
3719		 * We can use non atomic operation since we own the queue lock
3720		 */
3721		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3722			if (!rps_ipi_queued(sd))
3723				____napi_schedule(sd, &sd->backlog);
3724		}
3725		goto enqueue;
3726	}
3727
3728drop:
3729	sd->dropped++;
3730	rps_unlock(sd);
3731
3732	local_irq_restore(flags);
3733
3734	atomic_long_inc(&skb->dev->rx_dropped);
3735	kfree_skb(skb);
3736	return NET_RX_DROP;
3737}
3738
3739static int netif_rx_internal(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3740{
3741	int ret;
3742
 
 
 
 
3743	net_timestamp_check(netdev_tstamp_prequeue, skb);
3744
3745	trace_netif_rx(skb);
3746#ifdef CONFIG_RPS
3747	if (static_key_false(&rps_needed)) {
3748		struct rps_dev_flow voidflow, *rflow = &voidflow;
3749		int cpu;
3750
3751		preempt_disable();
3752		rcu_read_lock();
3753
3754		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3755		if (cpu < 0)
3756			cpu = smp_processor_id();
3757
3758		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3759
3760		rcu_read_unlock();
3761		preempt_enable();
3762	} else
3763#endif
3764	{
3765		unsigned int qtail;
3766		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3767		put_cpu();
3768	}
3769	return ret;
3770}
3771
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet from a device driver and queues it on a per-CPU
 *	backlog for the upper (protocol) levels to process.  The call itself
 *	always succeeds; the buffer may still be dropped for congestion
 *	control or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */
int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);
	ret = netif_rx_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_rx);
3794
/*
 * netif_rx_ni - like netif_rx(), but with preemption disabled around the
 * enqueue; any softirq found pending afterwards is run through
 * do_softirq() before preemption is enabled again.
 *
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	rc = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
3810
3811static void net_tx_action(struct softirq_action *h)
3812{
3813	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3814
3815	if (sd->completion_queue) {
3816		struct sk_buff *clist;
3817
3818		local_irq_disable();
3819		clist = sd->completion_queue;
3820		sd->completion_queue = NULL;
3821		local_irq_enable();
3822
3823		while (clist) {
3824			struct sk_buff *skb = clist;
3825			clist = clist->next;
3826
3827			WARN_ON(atomic_read(&skb->users));
3828			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3829				trace_consume_skb(skb);
3830			else
3831				trace_kfree_skb(skb, net_tx_action);
3832
3833			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3834				__kfree_skb(skb);
3835			else
3836				__kfree_skb_defer(skb);
3837		}
3838
3839		__kfree_skb_flush();
3840	}
3841
3842	if (sd->output_queue) {
3843		struct Qdisc *head;
3844
3845		local_irq_disable();
3846		head = sd->output_queue;
3847		sd->output_queue = NULL;
3848		sd->output_queue_tailp = &sd->output_queue;
3849		local_irq_enable();
3850
3851		while (head) {
3852			struct Qdisc *q = head;
3853			spinlock_t *root_lock;
3854
3855			head = head->next_sched;
3856
3857			root_lock = qdisc_lock(q);
3858			if (spin_trylock(root_lock)) {
3859				smp_mb__before_atomic();
3860				clear_bit(__QDISC_STATE_SCHED,
3861					  &q->state);
3862				qdisc_run(q);
3863				spin_unlock(root_lock);
3864			} else {
3865				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3866					      &q->state)) {
3867					__netif_reschedule(q);
3868				} else {
3869					smp_mb__before_atomic();
3870					clear_bit(__QDISC_STATE_SCHED,
3871						  &q->state);
3872				}
3873			}
3874		}
3875	}
3876}
3877
3878#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3879    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3880/* This hook is defined here for ATM LANE. It is only built when both the
 * bridge and ATM LANE are enabled (built in or as modules). Nothing in this
 * definition assigns it, so it starts out NULL.
 */
3881int (*br_fdb_test_addr_hook)(struct net_device *dev,
3882			     unsigned char *addr) __read_mostly;
3883EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3884#endif
3885
/*
 * sch_handle_ingress - run the ingress classifier chain on a received skb
 * @skb: packet being received
 * @pt_prev: pending packet_type handler; delivered and cleared before
 *	classification when set
 * @ret: receives the result of that pending delivery
 * @orig_dev: device the packet originally arrived on
 *
 * Without CONFIG_NET_CLS_ACT, or when the device has no ingress classifier
 * list, @skb is returned untouched.
 *
 * Returns @skb when receive processing should go on, or NULL when the
 * packet has been disposed of here: dropped (TC_ACT_SHOT), taken over by an
 * action (TC_ACT_STOLEN / TC_ACT_QUEUED) or redirected (TC_ACT_REDIRECT).
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		/* A real drop: count it and free with kfree_skb() */
		qdisc_qstats_cpu_drop(cl->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		/* The packet was taken over, not dropped: release it with
		 * consume_skb() so it is not reported as a drop.
		 */
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
 
3935
3936/**
3937 *	netdev_rx_handler_register - register receive handler
3938 *	@dev: device to register a handler for
3939 *	@rx_handler: receive handler to register
3940 *	@rx_handler_data: data pointer that is used by rx handler
3941 *
3942 *	Register a receive handler for a device. This handler will then be
3943 *	called from __netif_receive_skb. A negative errno code is returned
3944 *	on a failure.
3945 *
3946 *	The caller must hold the rtnl_mutex.
3947 *
3948 *	For a general description of rx_handler, see enum rx_handler_result.
3949 */
3950int netdev_rx_handler_register(struct net_device *dev,
3951			       rx_handler_func_t *rx_handler,
3952			       void *rx_handler_data)
3953{
3954	ASSERT_RTNL();
3955
3956	if (dev->rx_handler)
3957		return -EBUSY;
3958
3959	/* Note: rx_handler_data must be set before rx_handler */
3960	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3961	rcu_assign_pointer(dev->rx_handler, rx_handler);
3962
3963	return 0;
3964}
3965EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3966
3967/**
3968 *	netdev_rx_handler_unregister - unregister receive handler
3969 *	@dev: device to unregister a handler from
3970 *
3971 *	Unregister a receive handler from a device.
3972 *
3973 *	The caller must hold the rtnl_mutex.
3974 */
3975void netdev_rx_handler_unregister(struct net_device *dev)
3976{
3977
3978	ASSERT_RTNL();
3979	RCU_INIT_POINTER(dev->rx_handler, NULL);
3980	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3981	 * section has a guarantee to see a non NULL rx_handler_data
3982	 * as well.
3983	 */
3984	synchronize_net();
3985	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3986}
3987EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3988
3989/*
3990 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3991 * the special handling of PFMEMALLOC skbs.
3992 */
3993static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3994{
3995	switch (skb->protocol) {
3996	case htons(ETH_P_ARP):
3997	case htons(ETH_P_IP):
3998	case htons(ETH_P_IPV6):
3999	case htons(ETH_P_8021Q):
4000	case htons(ETH_P_8021AD):
4001		return true;
4002	default:
4003		return false;
4004	}
4005}
4006
/*
 * Run the netfilter ingress hook on @skb when it is active
 * (CONFIG_NETFILTER_INGRESS only).
 *
 * A pending handler in *@pt_prev is delivered first (result stored in
 * *@ret) and cleared.  Returns the verdict of nf_hook_ingress(), or 0 when
 * the hook is inactive or not compiled in.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return nf_hook_ingress(skb);
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4022
4023static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4024{
	/* pt_prev holds a handler whose delivery is still pending: a handler
	 * is only called once the next one is known, and whichever is left
	 * at the end is invoked directly through pt_prev->func().
	 */
4025	struct packet_type *ptype, *pt_prev;
4026	rx_handler_func_t *rx_handler;
4027	struct net_device *orig_dev;
4028	bool deliver_exact = false;
4029	int ret = NET_RX_DROP;
4030	__be16 type;
4031
4032	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4033
4034	trace_netif_receive_skb(skb);
4035
4036	orig_dev = skb->dev;
4037
4038	skb_reset_network_header(skb);
4039	if (!skb_transport_header_was_set(skb))
4040		skb_reset_transport_header(skb);
4041	skb_reset_mac_len(skb);
4042
4043	pt_prev = NULL;
4044
	/* Re-entered when vlan_do_receive() returns true or the rx_handler
	 * answers RX_HANDLER_ANOTHER.
	 */
4045another_round:
4046	skb->skb_iif = skb->dev->ifindex;
4047
4048	__this_cpu_inc(softnet_data.processed);
4049
4050	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4051	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4052		skb = skb_vlan_untag(skb);
4053		if (unlikely(!skb))
4054			goto out;
4055	}
4056
4057#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip taps and ingress processing */
4058	if (skb->tc_verd & TC_NCLS) {
4059		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4060		goto ncls;
4061	}
4062#endif
4063
	/* pfmemalloc packets bypass the ptype_all taps below */
4064	if (pfmemalloc)
4065		goto skip_taps;
4066
4067	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4068		if (pt_prev)
4069			ret = deliver_skb(skb, pt_prev, orig_dev);
4070		pt_prev = ptype;
4071	}
4072
4073	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4074		if (pt_prev)
4075			ret = deliver_skb(skb, pt_prev, orig_dev);
4076		pt_prev = ptype;
4077	}
4078
4079skip_taps:
4080#ifdef CONFIG_NET_INGRESS
4081	if (static_key_false(&ingress_needed)) {
		/* NULL means the ingress classifier disposed of the skb */
4082		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4083		if (!skb)
4084			goto out;
4085
4086		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4087			goto out;
4088	}
4089#endif
4090#ifdef CONFIG_NET_CLS_ACT
4091	skb->tc_verd = 0;
4092ncls:
4093#endif
4094	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4095		goto drop;
4096
4097	if (skb_vlan_tag_present(skb)) {
4098		if (pt_prev) {
4099			ret = deliver_skb(skb, pt_prev, orig_dev);
4100			pt_prev = NULL;
4101		}
4102		if (vlan_do_receive(&skb))
4103			goto another_round;
4104		else if (unlikely(!skb))
4105			goto out;
4106	}
4107
4108	rx_handler = rcu_dereference(skb->dev->rx_handler);
4109	if (rx_handler) {
4110		if (pt_prev) {
4111			ret = deliver_skb(skb, pt_prev, orig_dev);
4112			pt_prev = NULL;
4113		}
4114		switch (rx_handler(&skb)) {
4115		case RX_HANDLER_CONSUMED:
4116			ret = NET_RX_SUCCESS;
4117			goto out;
4118		case RX_HANDLER_ANOTHER:
4119			goto another_round;
4120		case RX_HANDLER_EXACT:
4121			deliver_exact = true;
			/* fall through */
4122		case RX_HANDLER_PASS:
4123			break;
4124		default:
4125			BUG();
4126		}
4127	}
4128
4129	if (unlikely(skb_vlan_tag_present(skb))) {
4130		if (skb_vlan_tag_get_id(skb))
4131			skb->pkt_type = PACKET_OTHERHOST;
4132		/* Note: we might in the future use prio bits
4133		 * and set skb->priority like in vlan_do_receive()
4134		 * For the time being, just ignore Priority Code Point
4135		 */
4136		skb->vlan_tci = 0;
4137	}
4138
4139	type = skb->protocol;
4140
4141	/* deliver only exact match when indicated */
4142	if (likely(!deliver_exact)) {
4143		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4144				       &ptype_base[ntohs(type) &
4145						   PTYPE_HASH_MASK]);
4146	}
4147
4148	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4149			       &orig_dev->ptype_specific);
4150
	/* skb->dev differs from orig_dev here: also try the handlers
	 * specific to the current device.
	 */
4151	if (unlikely(skb->dev != orig_dev)) {
4152		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4153				       &skb->dev->ptype_specific);
4154	}
4155
4156	if (pt_prev) {
4157		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4158			goto drop;
4159		else
4160			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4161	} else {
		/* The drop is counted in rx_nohandler instead of rx_dropped
		 * when the rx_handler asked for exact delivery.
		 */
4162drop:
4163		if (!deliver_exact)
4164			atomic_long_inc(&skb->dev->rx_dropped);
4165		else
4166			atomic_long_inc(&skb->dev->rx_nohandler);
4167		kfree_skb(skb);
4168		/* Jamal, now you will not able to escape explaining
4169		 * me how you were going to use this. :-)
4170		 */
4171		ret = NET_RX_DROP;
4172	}
4173
4174out:
4175	return ret;
4176}
4177
4178static int __netif_receive_skb(struct sk_buff *skb)
4179{
4180	int ret;
4181
4182	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4183		unsigned long pflags = current->flags;
4184
4185		/*
4186		 * PFMEMALLOC skbs are special, they should
4187		 * - be delivered to SOCK_MEMALLOC sockets only
4188		 * - stay away from userspace
4189		 * - have bounded memory usage
4190		 *
4191		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4192		 * context down to all allocation sites.
4193		 */
4194		current->flags |= PF_MEMALLOC;
4195		ret = __netif_receive_skb_core(skb, true);
4196		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4197	} else
4198		ret = __netif_receive_skb_core(skb, false);
4199
4200	return ret;
4201}
4202
4203static int netif_receive_skb_internal(struct sk_buff *skb)
4204{
4205	int ret;
4206
4207	net_timestamp_check(netdev_tstamp_prequeue, skb);
4208
4209	if (skb_defer_rx_timestamp(skb))
4210		return NET_RX_SUCCESS;
4211
4212	rcu_read_lock();
4213
4214#ifdef CONFIG_RPS
4215	if (static_key_false(&rps_needed)) {
4216		struct rps_dev_flow voidflow, *rflow = &voidflow;
4217		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4218
4219		if (cpu >= 0) {
4220			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4221			rcu_read_unlock();
4222			return ret;
4223		}
4224	}
4225#endif
4226	ret = __netif_receive_skb(skb);
4227	rcu_read_unlock();
4228	return ret;
4229}
4230
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	The call itself always succeeds; the buffer may still be dropped
 *	during processing for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);
	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
4253
4254/* Network device is going away, flush any packets still pending
4255 * Called with irqs disabled.
4256 */
4257static void flush_backlog(void *arg)
4258{
4259	struct net_device *dev = arg;
4260	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4261	struct sk_buff *skb, *tmp;
4262
4263	rps_lock(sd);
4264	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4265		if (skb->dev == dev) {
4266			__skb_unlink(skb, &sd->input_pkt_queue);
4267			kfree_skb(skb);
4268			input_queue_head_incr(sd);
4269		}
4270	}
4271	rps_unlock(sd);
4272
4273	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4274		if (skb->dev == dev) {
4275			__skb_unlink(skb, &sd->process_queue);
4276			kfree_skb(skb);
4277			input_queue_head_incr(sd);
4278		}
4279	}
4280}
4281
4282static int napi_gro_complete(struct sk_buff *skb)
4283{
4284	struct packet_offload *ptype;
4285	__be16 type = skb->protocol;
4286	struct list_head *head = &offload_base;
4287	int err = -ENOENT;
4288
4289	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4290
4291	if (NAPI_GRO_CB(skb)->count == 1) {
4292		skb_shinfo(skb)->gso_size = 0;
4293		goto out;
4294	}
4295
4296	rcu_read_lock();
4297	list_for_each_entry_rcu(ptype, head, list) {
4298		if (ptype->type != type || !ptype->callbacks.gro_complete)
4299			continue;
4300
4301		err = ptype->callbacks.gro_complete(skb, 0);
4302		break;
4303	}
4304	rcu_read_unlock();
4305
4306	if (err) {
4307		WARN_ON(&ptype->list == head);
4308		kfree_skb(skb);
4309		return NET_RX_SUCCESS;
4310	}
4311
4312out:
4313	return netif_receive_skb_internal(skb);
4314}
4315
4316/* napi->gro_list contains packets ordered by age.
4317 * youngest packets at the head of it.
4318 * Complete skbs in reverse order to reduce latencies.
4319 */
4320void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4321{
4322	struct sk_buff *skb, *prev = NULL;
4323
4324	/* scan list and build reverse chain */
4325	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4326		skb->prev = prev;
4327		prev = skb;
4328	}
4329
4330	for (skb = prev; skb; skb = prev) {
 
4331		skb->next = NULL;
4332
4333		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4334			return;
4335
4336		prev = skb->prev;
4337		napi_gro_complete(skb);
4338		napi->gro_count--;
4339	}
4340
 
4341	napi->gro_list = NULL;
4342}
4343EXPORT_SYMBOL(napi_gro_flush);
4344
4345static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4346{
4347	struct sk_buff *p;
4348	unsigned int maclen = skb->dev->hard_header_len;
4349	u32 hash = skb_get_hash_raw(skb);
4350
4351	for (p = napi->gro_list; p; p = p->next) {
4352		unsigned long diffs;
4353
4354		NAPI_GRO_CB(p)->flush = 0;
4355
4356		if (hash != skb_get_hash_raw(p)) {
4357			NAPI_GRO_CB(p)->same_flow = 0;
4358			continue;
4359		}
4360
4361		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4362		diffs |= p->vlan_tci ^ skb->vlan_tci;
4363		diffs |= skb_metadata_dst_cmp(p, skb);
4364		if (maclen == ETH_HLEN)
4365			diffs |= compare_ether_header(skb_mac_header(p),
4366						      skb_mac_header(skb));
4367		else if (!diffs)
4368			diffs = memcmp(skb_mac_header(p),
4369				       skb_mac_header(skb),
4370				       maclen);
4371		NAPI_GRO_CB(p)->same_flow = !diffs;
4372	}
4373}
4374
4375static void skb_gro_reset_offset(struct sk_buff *skb)
4376{
4377	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4378	const skb_frag_t *frag0 = &pinfo->frags[0];
4379
4380	NAPI_GRO_CB(skb)->data_offset = 0;
4381	NAPI_GRO_CB(skb)->frag0 = NULL;
4382	NAPI_GRO_CB(skb)->frag0_len = 0;
4383
4384	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4385	    pinfo->nr_frags &&
4386	    !PageHighMem(skb_frag_page(frag0))) {
4387		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4388		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4389	}
4390}
4391
4392static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4393{
4394	struct skb_shared_info *pinfo = skb_shinfo(skb);
4395
4396	BUG_ON(skb->end - skb->tail < grow);
4397
4398	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4399
4400	skb->data_len -= grow;
4401	skb->tail += grow;
4402
4403	pinfo->frags[0].page_offset += grow;
4404	skb_frag_size_sub(&pinfo->frags[0], grow);
4405
4406	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4407		skb_frag_unref(skb, 0);
4408		memmove(pinfo->frags, pinfo->frags + 1,
4409			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4410	}
4411}
4412
4413static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4414{
4415	struct sk_buff **pp = NULL;
4416	struct packet_offload *ptype;
4417	__be16 type = skb->protocol;
4418	struct list_head *head = &offload_base;
4419	int same_flow;
 
4420	enum gro_result ret;
4421	int grow;
4422
4423	if (!(skb->dev->features & NETIF_F_GRO))
4424		goto normal;
4425
4426	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4427		goto normal;
4428
4429	gro_list_prepare(napi, skb);
4430
4431	rcu_read_lock();
4432	list_for_each_entry_rcu(ptype, head, list) {
4433		if (ptype->type != type || !ptype->callbacks.gro_receive)
4434			continue;
4435
4436		skb_set_network_header(skb, skb_gro_offset(skb));
4437		skb_reset_mac_len(skb);
 
4438		NAPI_GRO_CB(skb)->same_flow = 0;
4439		NAPI_GRO_CB(skb)->flush = 0;
4440		NAPI_GRO_CB(skb)->free = 0;
4441		NAPI_GRO_CB(skb)->encap_mark = 0;
4442		NAPI_GRO_CB(skb)->is_fou = 0;
4443		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4444
4445		/* Setup for GRO checksum validation */
4446		switch (skb->ip_summed) {
4447		case CHECKSUM_COMPLETE:
4448			NAPI_GRO_CB(skb)->csum = skb->csum;
4449			NAPI_GRO_CB(skb)->csum_valid = 1;
4450			NAPI_GRO_CB(skb)->csum_cnt = 0;
4451			break;
4452		case CHECKSUM_UNNECESSARY:
4453			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4454			NAPI_GRO_CB(skb)->csum_valid = 0;
4455			break;
4456		default:
4457			NAPI_GRO_CB(skb)->csum_cnt = 0;
4458			NAPI_GRO_CB(skb)->csum_valid = 0;
4459		}
4460
4461		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4462		break;
4463	}
4464	rcu_read_unlock();
4465
4466	if (&ptype->list == head)
4467		goto normal;
4468
4469	same_flow = NAPI_GRO_CB(skb)->same_flow;
4470	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4471
4472	if (pp) {
4473		struct sk_buff *nskb = *pp;
4474
4475		*pp = nskb->next;
4476		nskb->next = NULL;
4477		napi_gro_complete(nskb);
4478		napi->gro_count--;
4479	}
4480
4481	if (same_flow)
4482		goto ok;
4483
4484	if (NAPI_GRO_CB(skb)->flush)
4485		goto normal;
4486
4487	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4488		struct sk_buff *nskb = napi->gro_list;
4489
4490		/* locate the end of the list to select the 'oldest' flow */
4491		while (nskb->next) {
4492			pp = &nskb->next;
4493			nskb = *pp;
4494		}
4495		*pp = NULL;
4496		nskb->next = NULL;
4497		napi_gro_complete(nskb);
4498	} else {
4499		napi->gro_count++;
4500	}
4501	NAPI_GRO_CB(skb)->count = 1;
4502	NAPI_GRO_CB(skb)->age = jiffies;
4503	NAPI_GRO_CB(skb)->last = skb;
4504	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4505	skb->next = napi->gro_list;
4506	napi->gro_list = skb;
4507	ret = GRO_HELD;
4508
4509pull:
4510	grow = skb_gro_offset(skb) - skb_headlen(skb);
4511	if (grow > 0)
4512		gro_pull_from_frag0(skb, grow);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4513ok:
4514	return ret;
4515
4516normal:
4517	ret = GRO_NORMAL;
4518	goto pull;
4519}
 
4520
4521struct packet_offload *gro_find_receive_by_type(__be16 type)
 
4522{
4523	struct list_head *offload_head = &offload_base;
4524	struct packet_offload *ptype;
4525
4526	list_for_each_entry_rcu(ptype, offload_head, list) {
4527		if (ptype->type != type || !ptype->callbacks.gro_receive)
4528			continue;
4529		return ptype;
4530	}
4531	return NULL;
4532}
4533EXPORT_SYMBOL(gro_find_receive_by_type);
4534
4535struct packet_offload *gro_find_complete_by_type(__be16 type)
4536{
4537	struct list_head *offload_head = &offload_base;
4538	struct packet_offload *ptype;
4539
4540	list_for_each_entry_rcu(ptype, offload_head, list) {
4541		if (ptype->type != type || !ptype->callbacks.gro_complete)
4542			continue;
4543		return ptype;
 
 
 
 
 
 
 
4544	}
4545	return NULL;
 
4546}
4547EXPORT_SYMBOL(gro_find_complete_by_type);
4548
4549static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4550{
4551	switch (ret) {
4552	case GRO_NORMAL:
4553		if (netif_receive_skb_internal(skb))
4554			ret = GRO_DROP;
4555		break;
4556
4557	case GRO_DROP:
4558		kfree_skb(skb);
4559		break;
4560
4561	case GRO_MERGED_FREE:
4562		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4563			skb_dst_drop(skb);
4564			kmem_cache_free(skbuff_head_cache, skb);
4565		} else {
4566			__kfree_skb(skb);
4567		}
4568		break;
4569
4570	case GRO_HELD:
4571	case GRO_MERGED:
4572		break;
4573	}
4574
4575	return ret;
4576}
 
4577
4578gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4579{
4580	skb_mark_napi_id(skb, napi);
4581	trace_napi_gro_receive_entry(skb);
 
 
 
 
 
 
 
 
 
 
4582
 
 
4583	skb_gro_reset_offset(skb);
4584
4585	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4586}
4587EXPORT_SYMBOL(napi_gro_receive);
4588
4589static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4590{
4591	if (unlikely(skb->pfmemalloc)) {
4592		consume_skb(skb);
4593		return;
4594	}
4595	__skb_pull(skb, skb_headlen(skb));
4596	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4597	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4598	skb->vlan_tci = 0;
4599	skb->dev = napi->dev;
4600	skb->skb_iif = 0;
4601	skb->encapsulation = 0;
4602	skb_shinfo(skb)->gso_type = 0;
4603	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4604
4605	napi->skb = skb;
4606}
4607
4608struct sk_buff *napi_get_frags(struct napi_struct *napi)
4609{
4610	struct sk_buff *skb = napi->skb;
4611
4612	if (!skb) {
4613		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4614		if (skb) {
4615			napi->skb = skb;
4616			skb_mark_napi_id(skb, napi);
4617		}
4618	}
4619	return skb;
4620}
4621EXPORT_SYMBOL(napi_get_frags);
4622
4623static gro_result_t napi_frags_finish(struct napi_struct *napi,
4624				      struct sk_buff *skb,
4625				      gro_result_t ret)
4626{
4627	switch (ret) {
4628	case GRO_NORMAL:
4629	case GRO_HELD:
4630		__skb_push(skb, ETH_HLEN);
4631		skb->protocol = eth_type_trans(skb, skb->dev);
4632		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
 
 
 
4633			ret = GRO_DROP;
4634		break;
4635
4636	case GRO_DROP:
4637	case GRO_MERGED_FREE:
4638		napi_reuse_skb(napi, skb);
4639		break;
4640
4641	case GRO_MERGED:
4642		break;
4643	}
4644
4645	return ret;
4646}
 
4647
4648/* Upper GRO stack assumes network header starts at gro_offset=0
4649 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4650 * We copy ethernet header into skb->data to have a common layout.
4651 */
4652static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4653{
4654	struct sk_buff *skb = napi->skb;
4655	const struct ethhdr *eth;
4656	unsigned int hlen = sizeof(*eth);
 
4657
4658	napi->skb = NULL;
4659
4660	skb_reset_mac_header(skb);
4661	skb_gro_reset_offset(skb);
4662
4663	eth = skb_gro_header_fast(skb, 0);
4664	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4665		eth = skb_gro_header_slow(skb, hlen, 0);
 
 
4666		if (unlikely(!eth)) {
4667			napi_reuse_skb(napi, skb);
4668			return NULL;
 
4669		}
4670	} else {
4671		gro_pull_from_frag0(skb, hlen);
4672		NAPI_GRO_CB(skb)->frag0 += hlen;
4673		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4674	}
4675	__skb_pull(skb, hlen);
 
4676
4677	/*
4678	 * This works because the only protocols we care about don't require
4679	 * special handling.
4680	 * We'll fix it up properly in napi_frags_finish()
4681	 */
4682	skb->protocol = eth->h_proto;
4683
 
4684	return skb;
4685}
4686
4687gro_result_t napi_gro_frags(struct napi_struct *napi)
4688{
4689	struct sk_buff *skb = napi_frags_skb(napi);
4690
4691	if (!skb)
4692		return GRO_DROP;
4693
4694	trace_napi_gro_frags_entry(skb);
4695
4696	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4697}
4698EXPORT_SYMBOL(napi_gro_frags);
4699
4700/* Compute the checksum from gro_offset and return the folded value
4701 * after adding in any pseudo checksum.
4702 */
4703__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4704{
4705	__wsum wsum;
4706	__sum16 sum;
4707
4708	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4709
4710	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4711	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4712	if (likely(!sum)) {
4713		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4714		    !skb->csum_complete_sw)
4715			netdev_rx_csum_fault(skb->dev);
4716	}
4717
4718	NAPI_GRO_CB(skb)->csum = wsum;
4719	NAPI_GRO_CB(skb)->csum_valid = 1;
4720
4721	return sum;
4722}
4723EXPORT_SYMBOL(__skb_gro_checksum_complete);
4724
/*
 * net_rps_action_and_irq_enable - send any pending RPS IPIs.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * With CONFIG_RPS the pending list is detached from @sd before irqs are
 * re-enabled, and each online remote cpu on it is then kicked.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (!pending) {
		local_irq_enable();
		return;
	}

	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (pending) {
		struct softnet_data *cur = pending;

		/* fetch the successor before kicking @cur */
		pending = cur->rps_ipi_next;

		if (cpu_online(cur->cpu))
			smp_call_function_single_async(cur->cpu, &cur->csd);
	}
#else
	local_irq_enable();
#endif
}
4752
4753static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4754{
4755#ifdef CONFIG_RPS
4756	return sd->rps_ipi_list != NULL;
4757#else
4758	return false;
4759#endif
4760}
4761
4762static int process_backlog(struct napi_struct *napi, int quota)
4763{
4764	int work = 0;
4765	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4766
 
4767	/* Check if we have pending ipi, its better to send them now,
4768	 * not waiting net_rx_action() end.
4769	 */
4770	if (sd_has_rps_ipi_waiting(sd)) {
4771		local_irq_disable();
4772		net_rps_action_and_irq_enable(sd);
4773	}
4774
4775	napi->weight = weight_p;
4776	local_irq_disable();
4777	while (1) {
4778		struct sk_buff *skb;
 
4779
4780		while ((skb = __skb_dequeue(&sd->process_queue))) {
4781			rcu_read_lock();
4782			local_irq_enable();
4783			__netif_receive_skb(skb);
4784			rcu_read_unlock();
4785			local_irq_disable();
4786			input_queue_head_incr(sd);
4787			if (++work >= quota) {
4788				local_irq_enable();
4789				return work;
4790			}
4791		}
4792
4793		rps_lock(sd);
4794		if (skb_queue_empty(&sd->input_pkt_queue)) {
 
 
 
 
 
4795			/*
4796			 * Inline a custom version of __napi_complete().
4797			 * only current cpu owns and manipulates this napi,
4798			 * and NAPI_STATE_SCHED is the only possible flag set
4799			 * on backlog.
4800			 * We can use a plain write instead of clear_bit(),
4801			 * and we dont need an smp_mb() memory barrier.
4802			 */
 
4803			napi->state = 0;
4804			rps_unlock(sd);
4805
4806			break;
4807		}
4808
4809		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4810					   &sd->process_queue);
4811		rps_unlock(sd);
4812	}
4813	local_irq_enable();
4814
4815	return work;
4816}
4817
4818/**
4819 * __napi_schedule - schedule for receive
4820 * @n: entry to schedule
4821 *
4822 * The entry's receive function will be scheduled to run.
4823 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4824 */
4825void __napi_schedule(struct napi_struct *n)
4826{
4827	unsigned long flags;
4828
4829	local_irq_save(flags);
4830	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4831	local_irq_restore(flags);
4832}
4833EXPORT_SYMBOL(__napi_schedule);
4834
4835/**
4836 * __napi_schedule_irqoff - schedule for receive
4837 * @n: entry to schedule
4838 *
4839 * Variant of __napi_schedule() assuming hard irqs are masked
4840 */
4841void __napi_schedule_irqoff(struct napi_struct *n)
4842{
4843	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4844}
4845EXPORT_SYMBOL(__napi_schedule_irqoff);
4846
4847void __napi_complete(struct napi_struct *n)
4848{
4849	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 
4850
4851	list_del_init(&n->poll_list);
4852	smp_mb__before_atomic();
4853	clear_bit(NAPI_STATE_SCHED, &n->state);
4854}
4855EXPORT_SYMBOL(__napi_complete);
4856
4857void napi_complete_done(struct napi_struct *n, int work_done)
4858{
4859	unsigned long flags;
4860
4861	/*
4862	 * don't let napi dequeue from the cpu poll list
4863	 * just in case its running on a different cpu
4864	 */
4865	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4866		return;
4867
4868	if (n->gro_list) {
4869		unsigned long timeout = 0;
4870
4871		if (work_done)
4872			timeout = n->dev->gro_flush_timeout;
4873
4874		if (timeout)
4875			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4876				      HRTIMER_MODE_REL_PINNED);
4877		else
4878			napi_gro_flush(n, false);
4879	}
4880	if (likely(list_empty(&n->poll_list))) {
4881		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4882	} else {
4883		/* If n->poll_list is not empty, we need to mask irqs */
4884		local_irq_save(flags);
4885		__napi_complete(n);
4886		local_irq_restore(flags);
4887	}
4888}
4889EXPORT_SYMBOL(napi_complete_done);
4890
4891/* must be called under rcu_read_lock(), as we dont take a reference */
4892static struct napi_struct *napi_by_id(unsigned int napi_id)
4893{
4894	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4895	struct napi_struct *napi;
4896
4897	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4898		if (napi->napi_id == napi_id)
4899			return napi;
4900
4901	return NULL;
4902}
4903
#if defined(CONFIG_NET_RX_BUSY_POLL)
#define BUSY_POLL_BUDGET 8
/* Busy-poll the napi instance recorded in sk->sk_napi_id.
 *
 * Polls until the socket's receive queue is non-empty, a reschedule is
 * needed, the busy-loop deadline passes, or the poll reports
 * LL_FLUSH_FAILED; with @nonblock set only a single round is made.
 * Returns true when the receive queue holds data afterwards, false
 * otherwise (including when no napi instance is found).
 */
bool sk_busy_loop(struct sock *sk, int nonblock)
{
	unsigned long end_time = nonblock ? 0 : sk_busy_loop_end_time(sk);
	int (*busy_poll)(struct napi_struct *dev);
	struct napi_struct *napi;
	int rc = false;

	rcu_read_lock();

	napi = napi_by_id(sk->sk_napi_id);
	if (!napi)
		goto out;

	/* Note: ndo_busy_poll method is optional in linux-4.5 */
	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;

	for (;;) {
		rc = 0;
		local_bh_disable();
		if (busy_poll) {
			rc = busy_poll(napi);
		} else if (napi_schedule_prep(napi)) {
			void *have = netpoll_poll_lock(napi);

			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
				rc = napi->poll(napi, BUSY_POLL_BUDGET);
				trace_napi_poll(napi);
				if (rc == BUSY_POLL_BUDGET) {
					/* budget used up: complete and reschedule */
					napi_complete_done(napi, rc);
					napi_schedule(napi);
				}
			}
			netpoll_poll_unlock(have);
		}
		if (rc > 0)
			NET_ADD_STATS_BH(sock_net(sk),
					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
		local_bh_enable();

		if (rc == LL_FLUSH_FAILED)
			break; /* permanent failure */

		cpu_relax();

		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
		    need_resched() || busy_loop_timeout(end_time))
			break;
	}

	rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL(sk_busy_loop);

#endif /* CONFIG_NET_RX_BUSY_POLL */
4960
4961void napi_hash_add(struct napi_struct *napi)
4962{
4963	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
4964	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4965		return;
4966
4967	spin_lock(&napi_hash_lock);
4968
4969	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4970	do {
4971		if (unlikely(++napi_gen_id < NR_CPUS + 1))
4972			napi_gen_id = NR_CPUS + 1;
4973	} while (napi_by_id(napi_gen_id));
4974	napi->napi_id = napi_gen_id;
4975
4976	hlist_add_head_rcu(&napi->napi_hash_node,
4977			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4978
4979	spin_unlock(&napi_hash_lock);
4980}
4981EXPORT_SYMBOL_GPL(napi_hash_add);
4982
4983/* Warning : caller is responsible to make sure rcu grace period
4984 * is respected before freeing memory containing @napi
4985 */
4986bool napi_hash_del(struct napi_struct *napi)
4987{
4988	bool rcu_sync_needed = false;
4989
4990	spin_lock(&napi_hash_lock);
4991
4992	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
4993		rcu_sync_needed = true;
4994		hlist_del_rcu(&napi->napi_hash_node);
4995	}
4996	spin_unlock(&napi_hash_lock);
4997	return rcu_sync_needed;
4998}
4999EXPORT_SYMBOL_GPL(napi_hash_del);
5000
5001static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5002{
5003	struct napi_struct *napi;
5004
5005	napi = container_of(timer, struct napi_struct, timer);
5006	if (napi->gro_list)
5007		napi_schedule(napi);
5008
5009	return HRTIMER_NORESTART;
5010}
 
5011
5012void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5013		    int (*poll)(struct napi_struct *, int), int weight)
5014{
5015	INIT_LIST_HEAD(&napi->poll_list);
5016	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5017	napi->timer.function = napi_watchdog;
5018	napi->gro_count = 0;
5019	napi->gro_list = NULL;
5020	napi->skb = NULL;
5021	napi->poll = poll;
5022	if (weight > NAPI_POLL_WEIGHT)
5023		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5024			    weight, dev->name);
5025	napi->weight = weight;
5026	list_add(&napi->dev_list, &dev->napi_list);
5027	napi->dev = dev;
5028#ifdef CONFIG_NETPOLL
5029	spin_lock_init(&napi->poll_lock);
5030	napi->poll_owner = -1;
5031#endif
5032	set_bit(NAPI_STATE_SCHED, &napi->state);
5033	napi_hash_add(napi);
5034}
5035EXPORT_SYMBOL(netif_napi_add);
5036
5037void napi_disable(struct napi_struct *n)
5038{
5039	might_sleep();
5040	set_bit(NAPI_STATE_DISABLE, &n->state);
5041
5042	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5043		msleep(1);
5044	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5045		msleep(1);
5046
5047	hrtimer_cancel(&n->timer);
5048
5049	clear_bit(NAPI_STATE_DISABLE, &n->state);
5050}
5051EXPORT_SYMBOL(napi_disable);
5052
5053/* Must be called in process context */
5054void netif_napi_del(struct napi_struct *napi)
5055{
5056	might_sleep();
5057	if (napi_hash_del(napi))
5058		synchronize_net();
5059	list_del_init(&napi->dev_list);
5060	napi_free_frags(napi);
5061
5062	kfree_skb_list(napi->gro_list);
 
 
 
 
 
5063	napi->gro_list = NULL;
5064	napi->gro_count = 0;
5065}
5066EXPORT_SYMBOL(netif_napi_del);
5067
5068static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5069{
5070	void *have;
5071	int work, weight;
5072
5073	list_del_init(&n->poll_list);
5074
5075	have = netpoll_poll_lock(n);
5076
5077	weight = n->weight;
5078
5079	/* This NAPI_STATE_SCHED test is for avoiding a race
5080	 * with netpoll's poll_napi().  Only the entity which
5081	 * obtains the lock and sees NAPI_STATE_SCHED set will
5082	 * actually make the ->poll() call.  Therefore we avoid
5083	 * accidentally calling ->poll() when NAPI is not scheduled.
5084	 */
5085	work = 0;
5086	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5087		work = n->poll(n, weight);
5088		trace_napi_poll(n);
5089	}
5090
5091	WARN_ON_ONCE(work > weight);
5092
5093	if (likely(work < weight))
5094		goto out_unlock;
5095
5096	/* Drivers must not modify the NAPI state if they
5097	 * consume the entire weight.  In such cases this code
5098	 * still "owns" the NAPI instance and therefore can
5099	 * move the instance around on the list at-will.
5100	 */
5101	if (unlikely(napi_disable_pending(n))) {
5102		napi_complete(n);
5103		goto out_unlock;
5104	}
5105
5106	if (n->gro_list) {
5107		/* flush too old packets
5108		 * If HZ < 1000, flush all packets.
5109		 */
5110		napi_gro_flush(n, HZ >= 1000);
5111	}
5112
5113	/* Some drivers may have called napi_schedule
5114	 * prior to exhausting their budget.
5115	 */
5116	if (unlikely(!list_empty(&n->poll_list))) {
5117		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5118			     n->dev ? n->dev->name : "backlog");
5119		goto out_unlock;
5120	}
5121
5122	list_add_tail(&n->poll_list, repoll);
5123
5124out_unlock:
5125	netpoll_poll_unlock(have);
5126
5127	return work;
5128}
5129
5130static void net_rx_action(struct softirq_action *h)
5131{
5132	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5133	unsigned long time_limit = jiffies + 2;
5134	int budget = netdev_budget;
5135	LIST_HEAD(list);
5136	LIST_HEAD(repoll);
5137
5138	local_irq_disable();
5139	list_splice_init(&sd->poll_list, &list);
5140	local_irq_enable();
5141
5142	for (;;) {
5143		struct napi_struct *n;
 
5144
5145		if (list_empty(&list)) {
5146			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5147				return;
5148			break;
5149		}
5150
5151		n = list_first_entry(&list, struct napi_struct, poll_list);
5152		budget -= napi_poll(n, &repoll);
5153
5154		/* If softirq window is exhausted then punt.
5155		 * Allow this to run for 2 jiffies since which will allow
5156		 * an average latency of 1.5/HZ.
5157		 */
5158		if (unlikely(budget <= 0 ||
5159			     time_after_eq(jiffies, time_limit))) {
5160			sd->time_squeeze++;
5161			break;
5162		}
5163	}
5164
5165	__kfree_skb_flush();
5166	local_irq_disable();
5167
5168	list_splice_tail_init(&sd->poll_list, &list);
5169	list_splice_tail(&repoll, &list);
5170	list_splice(&list, &sd->poll_list);
5171	if (!list_empty(&sd->poll_list))
5172		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
5173
5174	net_rps_action_and_irq_enable(sd);
5175}
5176
5177struct netdev_adjacent {
5178	struct net_device *dev;
5179
5180	/* upper master flag, there can only be one master device per list */
5181	bool master;
 
 
 
 
 
 
 
 
 
5182
5183	/* counter for the number of times this device was added to us */
5184	u16 ref_nr;
5185
5186	/* private field for the users */
5187	void *private;
5188
5189	struct list_head list;
5190	struct rcu_head rcu;
5191};
5192
5193static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5194						 struct list_head *adj_list)
5195{
5196	struct netdev_adjacent *adj;
 
 
 
 
 
 
 
 
 
5197
5198	list_for_each_entry(adj, adj_list, list) {
5199		if (adj->dev == adj_dev)
5200			return adj;
5201	}
5202	return NULL;
5203}
5204
5205/**
5206 * netdev_has_upper_dev - Check if device is linked to an upper device
5207 * @dev: device
5208 * @upper_dev: upper device to check
5209 *
5210 * Find out if a device is linked to specified upper device and return true
5211 * in case it is. Note that this checks only immediate upper device,
5212 * not through a complete stack of devices. The caller must hold the RTNL lock.
5213 */
5214bool netdev_has_upper_dev(struct net_device *dev,
5215			  struct net_device *upper_dev)
5216{
5217	ASSERT_RTNL();
5218
5219	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5220}
5221EXPORT_SYMBOL(netdev_has_upper_dev);
 
 
 
 
5222
5223/**
5224 * netdev_has_any_upper_dev - Check if device is linked to some device
5225 * @dev: device
5226 *
5227 * Find out if a device is linked to an upper device and return true in case
5228 * it is. The caller must hold the RTNL lock.
5229 */
5230static bool netdev_has_any_upper_dev(struct net_device *dev)
5231{
5232	ASSERT_RTNL();
5233
5234	return !list_empty(&dev->all_adj_list.upper);
 
 
 
5235}
5236
 
 
5237/**
5238 * netdev_master_upper_dev_get - Get master upper device
5239 * @dev: device
 
5240 *
5241 * Find a master upper device and return pointer to it or NULL in case
5242 * it's not there. The caller must hold the RTNL lock.
 
5243 */
5244struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5245{
5246	struct netdev_adjacent *upper;
5247
5248	ASSERT_RTNL();
5249
5250	if (list_empty(&dev->adj_list.upper))
5251		return NULL;
5252
5253	upper = list_first_entry(&dev->adj_list.upper,
5254				 struct netdev_adjacent, list);
5255	if (likely(upper->master))
5256		return upper->dev;
5257	return NULL;
5258}
5259EXPORT_SYMBOL(netdev_master_upper_dev_get);
5260
5261void *netdev_adjacent_get_private(struct list_head *adj_list)
5262{
5263	struct netdev_adjacent *adj;
5264
5265	adj = list_entry(adj_list, struct netdev_adjacent, list);
5266
5267	return adj->private;
5268}
5269EXPORT_SYMBOL(netdev_adjacent_get_private);
5270
5271/**
5272 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5273 * @dev: device
5274 * @iter: list_head ** of the current position
5275 *
5276 * Gets the next device from the dev's upper list, starting from iter
5277 * position. The caller must hold RCU read lock.
5278 */
5279struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5280						 struct list_head **iter)
5281{
5282	struct netdev_adjacent *upper;
 
5283
5284	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 
 
5285
5286	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 
5287
5288	if (&upper->list == &dev->adj_list.upper)
5289		return NULL;
 
 
 
 
5290
5291	*iter = &upper->list;
 
5292
5293	return upper->dev;
 
 
5294}
5295EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5296
5297/**
5298 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5299 * @dev: device
5300 * @iter: list_head ** of the current position
5301 *
5302 * Gets the next device from the dev's upper list, starting from iter
5303 * position. The caller must hold RCU read lock.
5304 */
5305struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5306						     struct list_head **iter)
5307{
5308	struct netdev_adjacent *upper;
 
 
 
 
 
5309
5310	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 
 
5311
5312	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 
5313
5314	if (&upper->list == &dev->all_adj_list.upper)
5315		return NULL;
5316
5317	*iter = &upper->list;
 
 
5318
5319	return upper->dev;
5320}
5321EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
 
 
 
 
 
 
 
 
 
 
 
 
 
5322
5323/**
5324 * netdev_lower_get_next_private - Get the next ->private from the
5325 *				   lower neighbour list
5326 * @dev: device
5327 * @iter: list_head ** of the current position
5328 *
5329 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5330 * list, starting from iter position. The caller must hold either hold the
5331 * RTNL lock or its own locking that guarantees that the neighbour lower
5332 * list will remain unchanged.
5333 */
5334void *netdev_lower_get_next_private(struct net_device *dev,
5335				    struct list_head **iter)
5336{
5337	struct netdev_adjacent *lower;
5338
5339	lower = list_entry(*iter, struct netdev_adjacent, list);
 
 
 
 
5340
5341	if (&lower->list == &dev->adj_list.lower)
5342		return NULL;
5343
5344	*iter = lower->list.next;
5345
5346	return lower->private;
5347}
5348EXPORT_SYMBOL(netdev_lower_get_next_private);
5349
5350/**
5351 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5352 *				       lower neighbour list, RCU
5353 *				       variant
5354 * @dev: device
5355 * @iter: list_head ** of the current position
5356 *
5357 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5358 * list, starting from iter position. The caller must hold RCU read lock.
5359 */
5360void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5361					struct list_head **iter)
5362{
5363	struct netdev_adjacent *lower;
 
 
 
 
 
 
 
 
 
 
5364
5365	WARN_ON_ONCE(!rcu_read_lock_held());
 
5366
5367	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 
 
 
5368
5369	if (&lower->list == &dev->adj_list.lower)
5370		return NULL;
 
 
5371
5372	*iter = &lower->list;
 
 
5373
5374	return lower->private;
5375}
5376EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5377
5378/**
5379 * netdev_lower_get_next - Get the next device from the lower neighbour
5380 *                         list
5381 * @dev: device
5382 * @iter: list_head ** of the current position
5383 *
5384 * Gets the next netdev_adjacent from the dev's lower neighbour
5385 * list, starting from iter position. The caller must hold RTNL lock or
5386 * its own locking that guarantees that the neighbour lower
5387 * list will remain unchanged.
5388 */
5389void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 
5390{
5391	struct netdev_adjacent *lower;
5392
5393	lower = list_entry(*iter, struct netdev_adjacent, list);
5394
5395	if (&lower->list == &dev->adj_list.lower)
5396		return NULL;
5397
5398	*iter = lower->list.next;
5399
5400	return lower->dev;
5401}
5402EXPORT_SYMBOL(netdev_lower_get_next);
5403
5404/**
5405 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5406 *				       lower neighbour list, RCU
5407 *				       variant
5408 * @dev: device
5409 *
5410 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5411 * list. The caller must hold RCU read lock.
5412 */
5413void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5414{
5415	struct netdev_adjacent *lower;
5416
5417	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5418			struct netdev_adjacent, list);
5419	if (lower)
5420		return lower->private;
5421	return NULL;
5422}
5423EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5424
5425/**
5426 * netdev_master_upper_dev_get_rcu - Get master upper device
5427 * @dev: device
5428 *
5429 * Find a master upper device and return pointer to it or NULL in case
5430 * it's not there. The caller must hold the RCU read lock.
5431 */
5432struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5433{
5434	struct netdev_adjacent *upper;
5435
5436	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5437				       struct netdev_adjacent, list);
5438	if (upper && likely(upper->master))
5439		return upper->dev;
5440	return NULL;
5441}
5442EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5443
5444static int netdev_adjacent_sysfs_add(struct net_device *dev,
5445			      struct net_device *adj_dev,
5446			      struct list_head *dev_list)
5447{
5448	char linkname[IFNAMSIZ+7];
5449	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5450		"upper_%s" : "lower_%s", adj_dev->name);
5451	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5452				 linkname);
5453}
5454static void netdev_adjacent_sysfs_del(struct net_device *dev,
5455			       char *name,
5456			       struct list_head *dev_list)
5457{
5458	char linkname[IFNAMSIZ+7];
5459	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5460		"upper_%s" : "lower_%s", name);
5461	sysfs_remove_link(&(dev->dev.kobj), linkname);
5462}
5463
5464static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5465						 struct net_device *adj_dev,
5466						 struct list_head *dev_list)
5467{
5468	return (dev_list == &dev->adj_list.upper ||
5469		dev_list == &dev->adj_list.lower) &&
5470		net_eq(dev_net(dev), dev_net(adj_dev));
 
 
 
 
 
 
 
 
 
 
5471}
5472
5473static int __netdev_adjacent_dev_insert(struct net_device *dev,
5474					struct net_device *adj_dev,
5475					struct list_head *dev_list,
5476					void *private, bool master)
 
5477{
5478	struct netdev_adjacent *adj;
5479	int ret;
5480
5481	adj = __netdev_find_adj(adj_dev, dev_list);
5482
5483	if (adj) {
5484		adj->ref_nr++;
5485		return 0;
5486	}
5487
5488	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5489	if (!adj)
5490		return -ENOMEM;
5491
5492	adj->dev = adj_dev;
5493	adj->master = master;
5494	adj->ref_nr = 1;
5495	adj->private = private;
5496	dev_hold(adj_dev);
5497
5498	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5499		 adj_dev->name, dev->name, adj_dev->name);
5500
5501	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5502		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5503		if (ret)
5504			goto free_adj;
5505	}
5506
5507	/* Ensure that master link is always the first item in list. */
5508	if (master) {
5509		ret = sysfs_create_link(&(dev->dev.kobj),
5510					&(adj_dev->dev.kobj), "master");
5511		if (ret)
5512			goto remove_symlinks;
5513
5514		list_add_rcu(&adj->list, dev_list);
5515	} else {
5516		list_add_tail_rcu(&adj->list, dev_list);
5517	}
5518
5519	return 0;
5520
5521remove_symlinks:
5522	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5523		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5524free_adj:
5525	kfree(adj);
5526	dev_put(adj_dev);
5527
5528	return ret;
5529}
5530
5531static void __netdev_adjacent_dev_remove(struct net_device *dev,
5532					 struct net_device *adj_dev,
5533					 struct list_head *dev_list)
5534{
5535	struct netdev_adjacent *adj;
5536
5537	adj = __netdev_find_adj(adj_dev, dev_list);
5538
5539	if (!adj) {
5540		pr_err("tried to remove device %s from %s\n",
5541		       dev->name, adj_dev->name);
5542		BUG();
5543	}
5544
5545	if (adj->ref_nr > 1) {
5546		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5547			 adj->ref_nr-1);
5548		adj->ref_nr--;
5549		return;
5550	}
5551
5552	if (adj->master)
5553		sysfs_remove_link(&(dev->dev.kobj), "master");
5554
5555	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5556		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5557
5558	list_del_rcu(&adj->list);
5559	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5560		 adj_dev->name, dev->name, adj_dev->name);
5561	dev_put(adj_dev);
5562	kfree_rcu(adj, rcu);
5563}
5564
5565static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5566					    struct net_device *upper_dev,
5567					    struct list_head *up_list,
5568					    struct list_head *down_list,
5569					    void *private, bool master)
5570{
5571	int ret;
5572
5573	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5574					   master);
5575	if (ret)
5576		return ret;
5577
5578	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5579					   false);
5580	if (ret) {
5581		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5582		return ret;
5583	}
5584
5585	return 0;
5586}
5587
5588static int __netdev_adjacent_dev_link(struct net_device *dev,
5589				      struct net_device *upper_dev)
5590{
5591	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5592						&dev->all_adj_list.upper,
5593						&upper_dev->all_adj_list.lower,
5594						NULL, false);
5595}
5596
/*
 * Remove both halves of an adjacency: @upper_dev from @up_list of @dev,
 * then @dev from @down_list of @upper_dev.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upper half first, then the matching lower half */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5605
5606static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5607					 struct net_device *upper_dev)
5608{
5609	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5610					   &dev->all_adj_list.upper,
5611					   &upper_dev->all_adj_list.lower);
5612}
5613
5614static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5615						struct net_device *upper_dev,
5616						void *private, bool master)
5617{
5618	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5619
5620	if (ret)
5621		return ret;
5622
5623	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5624					       &dev->adj_list.upper,
5625					       &upper_dev->adj_list.lower,
5626					       private, master);
5627	if (ret) {
5628		__netdev_adjacent_dev_unlink(dev, upper_dev);
5629		return ret;
5630	}
5631
 
 
 
 
5632	return 0;
5633}
5634
5635static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5636						   struct net_device *upper_dev)
5637{
5638	__netdev_adjacent_dev_unlink(dev, upper_dev);
5639	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5640					   &dev->adj_list.upper,
5641					   &upper_dev->adj_list.lower);
5642}
5643
5644static int __netdev_upper_dev_link(struct net_device *dev,
5645				   struct net_device *upper_dev, bool master,
5646				   void *upper_priv, void *upper_info)
5647{
5648	struct netdev_notifier_changeupper_info changeupper_info;
5649	struct netdev_adjacent *i, *j, *to_i, *to_j;
5650	int ret = 0;
5651
5652	ASSERT_RTNL();
5653
5654	if (dev == upper_dev)
5655		return -EBUSY;
5656
5657	/* To prevent loops, check if dev is not upper device to upper_dev. */
5658	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5659		return -EBUSY;
5660
5661	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5662		return -EEXIST;
5663
5664	if (master && netdev_master_upper_dev_get(dev))
5665		return -EBUSY;
5666
5667	changeupper_info.upper_dev = upper_dev;
5668	changeupper_info.master = master;
5669	changeupper_info.linking = true;
5670	changeupper_info.upper_info = upper_info;
5671
5672	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5673					    &changeupper_info.info);
5674	ret = notifier_to_errno(ret);
5675	if (ret)
5676		return ret;
5677
5678	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5679						   master);
5680	if (ret)
5681		return ret;
5682
5683	/* Now that we linked these devs, make all the upper_dev's
5684	 * all_adj_list.upper visible to every dev's all_adj_list.lower an
5685	 * versa, and don't forget the devices itself. All of these
5686	 * links are non-neighbours.
5687	 */
5688	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5689		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5690			pr_debug("Interlinking %s with %s, non-neighbour\n",
5691				 i->dev->name, j->dev->name);
5692			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5693			if (ret)
5694				goto rollback_mesh;
5695		}
5696	}
5697
5698	/* add dev to every upper_dev's upper device */
5699	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5700		pr_debug("linking %s's upper device %s with %s\n",
5701			 upper_dev->name, i->dev->name, dev->name);
5702		ret = __netdev_adjacent_dev_link(dev, i->dev);
5703		if (ret)
5704			goto rollback_upper_mesh;
5705	}
5706
5707	/* add upper_dev to every dev's lower device */
5708	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5709		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5710			 i->dev->name, upper_dev->name);
5711		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5712		if (ret)
5713			goto rollback_lower_mesh;
5714	}
5715
5716	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5717					    &changeupper_info.info);
5718	ret = notifier_to_errno(ret);
5719	if (ret)
5720		goto rollback_lower_mesh;
5721
5722	return 0;
 
 
 
 
5723
5724rollback_lower_mesh:
5725	to_i = i;
5726	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5727		if (i == to_i)
5728			break;
5729		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5730	}
5731
5732	i = NULL;
 
 
 
 
 
5733
5734rollback_upper_mesh:
5735	to_i = i;
5736	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5737		if (i == to_i)
5738			break;
5739		__netdev_adjacent_dev_unlink(dev, i->dev);
5740	}
5741
5742	i = j = NULL;
 
 
 
 
 
 
5743
5744rollback_mesh:
5745	to_i = i;
5746	to_j = j;
5747	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5748		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5749			if (i == to_i && j == to_j)
5750				break;
5751			__netdev_adjacent_dev_unlink(i->dev, j->dev);
 
 
 
 
 
 
 
 
 
5752		}
5753		if (i == to_i)
5754			break;
5755	}
5756
5757	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5758
5759	return ret;
5760}
5761
5762/**
5763 * netdev_upper_dev_link - Add a link to the upper device
5764 * @dev: device
5765 * @upper_dev: new upper device
5766 *
5767 * Adds a link to device which is upper to this one. The caller must hold
5768 * the RTNL lock. On a failure a negative errno code is returned.
5769 * On success the reference counts are adjusted and the function
5770 * returns zero.
5771 */
5772int netdev_upper_dev_link(struct net_device *dev,
5773			  struct net_device *upper_dev)
5774{
5775	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5776}
5777EXPORT_SYMBOL(netdev_upper_dev_link);
5778
5779/**
5780 * netdev_master_upper_dev_link - Add a master link to the upper device
5781 * @dev: device
5782 * @upper_dev: new upper device
5783 * @upper_priv: upper device private
5784 * @upper_info: upper info to be passed down via notifier
5785 *
5786 * Adds a link to device which is upper to this one. In this case, only
5787 * one master upper device can be linked, although other non-master devices
5788 * might be linked as well. The caller must hold the RTNL lock.
5789 * On a failure a negative errno code is returned. On success the reference
5790 * counts are adjusted and the function returns zero.
5791 */
5792int netdev_master_upper_dev_link(struct net_device *dev,
5793				 struct net_device *upper_dev,
5794				 void *upper_priv, void *upper_info)
5795{
5796	return __netdev_upper_dev_link(dev, upper_dev, true,
5797				       upper_priv, upper_info);
5798}
5799EXPORT_SYMBOL(netdev_master_upper_dev_link);
5800
5801/**
5802 * netdev_upper_dev_unlink - Removes a link to upper device
5803 * @dev: device
5804 * @upper_dev: new upper device
5805 *
5806 * Removes a link to device which is upper to this one. The caller must hold
5807 * the RTNL lock.
5808 */
5809void netdev_upper_dev_unlink(struct net_device *dev,
5810			     struct net_device *upper_dev)
5811{
5812	struct netdev_notifier_changeupper_info changeupper_info;
5813	struct netdev_adjacent *i, *j;
5814	ASSERT_RTNL();
5815
5816	changeupper_info.upper_dev = upper_dev;
5817	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5818	changeupper_info.linking = false;
5819
5820	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5821				      &changeupper_info.info);
5822
5823	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5824
5825	/* Here is the tricky part. We must remove all dev's lower
5826	 * devices from all upper_dev's upper devices and vice
5827	 * versa, to maintain the graph relationship.
5828	 */
5829	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5830		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5831			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5832
5833	/* remove also the devices itself from lower/upper device
5834	 * list
5835	 */
5836	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5837		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5838
5839	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5840		__netdev_adjacent_dev_unlink(dev, i->dev);
5841
5842	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5843				      &changeupper_info.info);
 
 
 
 
 
5844}
5845EXPORT_SYMBOL(netdev_upper_dev_unlink);
5846
5847/**
5848 * netdev_bonding_info_change - Dispatch event about slave change
5849 * @dev: device
5850 * @bonding_info: info to dispatch
5851 *
5852 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5853 * The caller must hold the RTNL lock.
5854 */
5855void netdev_bonding_info_change(struct net_device *dev,
5856				struct netdev_bonding_info *bonding_info)
5857{
5858	struct netdev_notifier_bonding_info	info;
5859
5860	memcpy(&info.bonding_info, bonding_info,
5861	       sizeof(struct netdev_bonding_info));
5862	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5863				      &info.info);
5864}
5865EXPORT_SYMBOL(netdev_bonding_info_change);
5866
5867static void netdev_adjacent_add_links(struct net_device *dev)
5868{
5869	struct netdev_adjacent *iter;
5870
5871	struct net *net = dev_net(dev);
 
 
 
 
 
 
5872
5873	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5874		if (!net_eq(net,dev_net(iter->dev)))
5875			continue;
5876		netdev_adjacent_sysfs_add(iter->dev, dev,
5877					  &iter->dev->adj_list.lower);
5878		netdev_adjacent_sysfs_add(dev, iter->dev,
5879					  &dev->adj_list.upper);
5880	}
5881
5882	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5883		if (!net_eq(net,dev_net(iter->dev)))
5884			continue;
5885		netdev_adjacent_sysfs_add(iter->dev, dev,
5886					  &iter->dev->adj_list.upper);
5887		netdev_adjacent_sysfs_add(dev, iter->dev,
5888					  &dev->adj_list.lower);
5889	}
5890}
5891
5892static void netdev_adjacent_del_links(struct net_device *dev)
5893{
5894	struct netdev_adjacent *iter;
 
 
 
5895
5896	struct net *net = dev_net(dev);
 
 
 
 
5897
5898	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5899		if (!net_eq(net,dev_net(iter->dev)))
5900			continue;
5901		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5902					  &iter->dev->adj_list.lower);
5903		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5904					  &dev->adj_list.upper);
5905	}
5906
5907	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5908		if (!net_eq(net,dev_net(iter->dev)))
5909			continue;
5910		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5911					  &iter->dev->adj_list.upper);
5912		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5913					  &dev->adj_list.lower);
5914	}
5915}
5916
5917void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5918{
5919	struct netdev_adjacent *iter;
5920
5921	struct net *net = dev_net(dev);
5922
5923	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5924		if (!net_eq(net,dev_net(iter->dev)))
5925			continue;
5926		netdev_adjacent_sysfs_del(iter->dev, oldname,
5927					  &iter->dev->adj_list.lower);
5928		netdev_adjacent_sysfs_add(iter->dev, dev,
5929					  &iter->dev->adj_list.lower);
5930	}
5931
5932	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5933		if (!net_eq(net,dev_net(iter->dev)))
5934			continue;
5935		netdev_adjacent_sysfs_del(iter->dev, oldname,
5936					  &iter->dev->adj_list.upper);
5937		netdev_adjacent_sysfs_add(iter->dev, dev,
5938					  &iter->dev->adj_list.upper);
5939	}
 
 
 
 
5940}
5941
5942void *netdev_lower_dev_get_private(struct net_device *dev,
5943				   struct net_device *lower_dev)
5944{
5945	struct netdev_adjacent *lower;
5946
5947	if (!lower_dev)
5948		return NULL;
5949	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5950	if (!lower)
5951		return NULL;
5952
5953	return lower->private;
 
 
 
 
 
 
 
5954}
5955EXPORT_SYMBOL(netdev_lower_dev_get_private);
 
 
5956
5957
5958int dev_get_nest_level(struct net_device *dev,
5959		       bool (*type_check)(const struct net_device *dev))
 
 
 
 
 
 
 
 
 
5960{
5961	struct net_device *lower = NULL;
5962	struct list_head *iter;
5963	int max_nest = -1;
5964	int nest;
5965
5966	ASSERT_RTNL();
5967
5968	netdev_for_each_lower_dev(dev, lower, iter) {
5969		nest = dev_get_nest_level(lower, type_check);
5970		if (max_nest < nest)
5971			max_nest = nest;
5972	}
5973
5974	if (type_check(dev))
5975		max_nest++;
5976
5977	return max_nest;
 
 
5978}
5979EXPORT_SYMBOL(dev_get_nest_level);
5980
5981/**
5982 * netdev_lower_change - Dispatch event about lower device state change
5983 * @lower_dev: device
5984 * @lower_state_info: state to dispatch
5985 *
5986 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
5987 * The caller must hold the RTNL lock.
 
 
5988 */
5989void netdev_lower_state_changed(struct net_device *lower_dev,
5990				void *lower_state_info)
5991{
5992	struct netdev_notifier_changelowerstate_info changelowerstate_info;
5993
5994	ASSERT_RTNL();
5995	changelowerstate_info.lower_state_info = lower_state_info;
5996	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
5997				      &changelowerstate_info.info);
 
 
 
 
 
 
 
 
5998}
5999EXPORT_SYMBOL(netdev_lower_state_changed);
6000
6001static void dev_change_rx_flags(struct net_device *dev, int flags)
6002{
6003	const struct net_device_ops *ops = dev->netdev_ops;
6004
6005	if (ops->ndo_change_rx_flags)
6006		ops->ndo_change_rx_flags(dev, flags);
6007}
6008
6009static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6010{
6011	unsigned int old_flags = dev->flags;
6012	kuid_t uid;
6013	kgid_t gid;
6014
6015	ASSERT_RTNL();
6016
6017	dev->flags |= IFF_PROMISC;
6018	dev->promiscuity += inc;
6019	if (dev->promiscuity == 0) {
6020		/*
6021		 * Avoid overflow.
6022		 * If inc causes overflow, untouch promisc and return error.
6023		 */
6024		if (inc < 0)
6025			dev->flags &= ~IFF_PROMISC;
6026		else {
6027			dev->promiscuity -= inc;
6028			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6029				dev->name);
6030			return -EOVERFLOW;
6031		}
6032	}
6033	if (dev->flags != old_flags) {
6034		pr_info("device %s %s promiscuous mode\n",
6035			dev->name,
6036			dev->flags & IFF_PROMISC ? "entered" : "left");
6037		if (audit_enabled) {
6038			current_uid_gid(&uid, &gid);
6039			audit_log(current->audit_context, GFP_ATOMIC,
6040				AUDIT_ANOM_PROMISCUOUS,
6041				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6042				dev->name, (dev->flags & IFF_PROMISC),
6043				(old_flags & IFF_PROMISC),
6044				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6045				from_kuid(&init_user_ns, uid),
6046				from_kgid(&init_user_ns, gid),
6047				audit_get_sessionid(current));
6048		}
6049
6050		dev_change_rx_flags(dev, IFF_PROMISC);
6051	}
6052	if (notify)
6053		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6054	return 0;
6055}
6056
6057/**
6058 *	dev_set_promiscuity	- update promiscuity count on a device
6059 *	@dev: device
6060 *	@inc: modifier
6061 *
6062 *	Add or remove promiscuity from a device. While the count in the device
6063 *	remains above zero the interface remains promiscuous. Once it hits zero
6064 *	the device reverts back to normal filtering operation. A negative inc
6065 *	value is used to drop promiscuity on the device.
6066 *	Return 0 if successful or a negative errno code on error.
6067 */
6068int dev_set_promiscuity(struct net_device *dev, int inc)
6069{
6070	unsigned int old_flags = dev->flags;
6071	int err;
6072
6073	err = __dev_set_promiscuity(dev, inc, true);
6074	if (err < 0)
6075		return err;
6076	if (dev->flags != old_flags)
6077		dev_set_rx_mode(dev);
6078	return err;
6079}
6080EXPORT_SYMBOL(dev_set_promiscuity);
6081
6082static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 
 
 
 
 
 
 
 
 
 
 
 
 
6083{
6084	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6085
6086	ASSERT_RTNL();
6087
6088	dev->flags |= IFF_ALLMULTI;
6089	dev->allmulti += inc;
6090	if (dev->allmulti == 0) {
6091		/*
6092		 * Avoid overflow.
6093		 * If inc causes overflow, untouch allmulti and return error.
6094		 */
6095		if (inc < 0)
6096			dev->flags &= ~IFF_ALLMULTI;
6097		else {
6098			dev->allmulti -= inc;
6099			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6100				dev->name);
6101			return -EOVERFLOW;
6102		}
6103	}
6104	if (dev->flags ^ old_flags) {
6105		dev_change_rx_flags(dev, IFF_ALLMULTI);
6106		dev_set_rx_mode(dev);
6107		if (notify)
6108			__dev_notify_flags(dev, old_flags,
6109					   dev->gflags ^ old_gflags);
6110	}
6111	return 0;
6112}
6113
6114/**
6115 *	dev_set_allmulti	- update allmulti count on a device
6116 *	@dev: device
6117 *	@inc: modifier
6118 *
6119 *	Add or remove reception of all multicast frames to a device. While the
6120 *	count in the device remains above zero the interface remains listening
6121 *	to all interfaces. Once it hits zero the device reverts back to normal
6122 *	filtering operation. A negative @inc value is used to drop the counter
6123 *	when releasing a resource needing all multicasts.
6124 *	Return 0 if successful or a negative errno code on error.
6125 */
6126
6127int dev_set_allmulti(struct net_device *dev, int inc)
6128{
6129	return __dev_set_allmulti(dev, inc, true);
6130}
6131EXPORT_SYMBOL(dev_set_allmulti);
6132
6133/*
6134 *	Upload unicast and multicast address lists to device and
6135 *	configure RX filtering. When the device doesn't support unicast
6136 *	filtering it is put in promiscuous mode while unicast addresses
6137 *	are present.
6138 */
6139void __dev_set_rx_mode(struct net_device *dev)
6140{
6141	const struct net_device_ops *ops = dev->netdev_ops;
6142
6143	/* dev_open will call this function so the list will stay sane. */
6144	if (!(dev->flags&IFF_UP))
6145		return;
6146
6147	if (!netif_device_present(dev))
6148		return;
6149
6150	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6151		/* Unicast addresses changes may only happen under the rtnl,
6152		 * therefore calling __dev_set_promiscuity here is safe.
6153		 */
6154		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6155			__dev_set_promiscuity(dev, 1, false);
6156			dev->uc_promisc = true;
6157		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6158			__dev_set_promiscuity(dev, -1, false);
6159			dev->uc_promisc = false;
6160		}
6161	}
6162
6163	if (ops->ndo_set_rx_mode)
6164		ops->ndo_set_rx_mode(dev);
6165}
6166
/* Run __dev_set_rx_mode() with the device address lock held (BH variant). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6173
6174/**
6175 *	dev_get_flags - get flags reported to userspace
6176 *	@dev: device
6177 *
6178 *	Get the combination of flag bits exported through APIs to userspace.
6179 */
6180unsigned int dev_get_flags(const struct net_device *dev)
6181{
6182	unsigned int flags;
6183
6184	flags = (dev->flags & ~(IFF_PROMISC |
6185				IFF_ALLMULTI |
6186				IFF_RUNNING |
6187				IFF_LOWER_UP |
6188				IFF_DORMANT)) |
6189		(dev->gflags & (IFF_PROMISC |
6190				IFF_ALLMULTI));
6191
6192	if (netif_running(dev)) {
6193		if (netif_oper_up(dev))
6194			flags |= IFF_RUNNING;
6195		if (netif_carrier_ok(dev))
6196			flags |= IFF_LOWER_UP;
6197		if (netif_dormant(dev))
6198			flags |= IFF_DORMANT;
6199	}
6200
6201	return flags;
6202}
6203EXPORT_SYMBOL(dev_get_flags);
6204
6205int __dev_change_flags(struct net_device *dev, unsigned int flags)
6206{
6207	unsigned int old_flags = dev->flags;
6208	int ret;
6209
6210	ASSERT_RTNL();
6211
6212	/*
6213	 *	Set the flags on our device.
6214	 */
6215
6216	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6217			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6218			       IFF_AUTOMEDIA)) |
6219		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6220				    IFF_ALLMULTI));
6221
6222	/*
6223	 *	Load in the correct multicast list now the flags have changed.
6224	 */
6225
6226	if ((old_flags ^ flags) & IFF_MULTICAST)
6227		dev_change_rx_flags(dev, IFF_MULTICAST);
6228
6229	dev_set_rx_mode(dev);
6230
6231	/*
6232	 *	Have we downed the interface. We handle IFF_UP ourselves
6233	 *	according to user attempts to set it, rather than blindly
6234	 *	setting it.
6235	 */
6236
6237	ret = 0;
6238	if ((old_flags ^ flags) & IFF_UP)
6239		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6240
 
 
 
 
6241	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6242		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6243		unsigned int old_flags = dev->flags;
6244
6245		dev->gflags ^= IFF_PROMISC;
6246
6247		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6248			if (dev->flags != old_flags)
6249				dev_set_rx_mode(dev);
6250	}
6251
6252	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6253	   is important. Some (broken) drivers set IFF_PROMISC, when
6254	   IFF_ALLMULTI is requested not asking us and not reporting.
6255	 */
6256	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6257		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6258
6259		dev->gflags ^= IFF_ALLMULTI;
6260		__dev_set_allmulti(dev, inc, false);
6261	}
6262
6263	return ret;
6264}
6265
6266void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6267			unsigned int gchanges)
6268{
6269	unsigned int changes = dev->flags ^ old_flags;
6270
6271	if (gchanges)
6272		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6273
6274	if (changes & IFF_UP) {
6275		if (dev->flags & IFF_UP)
6276			call_netdevice_notifiers(NETDEV_UP, dev);
6277		else
6278			call_netdevice_notifiers(NETDEV_DOWN, dev);
6279	}
6280
6281	if (dev->flags & IFF_UP &&
6282	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6283		struct netdev_notifier_change_info change_info;
6284
6285		change_info.flags_changed = changes;
6286		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6287					      &change_info.info);
6288	}
6289}
6290
6291/**
6292 *	dev_change_flags - change device settings
6293 *	@dev: device
6294 *	@flags: device state flags
6295 *
6296 *	Change settings on device based state flags. The flags are
6297 *	in the userspace exported format.
6298 */
6299int dev_change_flags(struct net_device *dev, unsigned int flags)
6300{
6301	int ret;
6302	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6303
6304	ret = __dev_change_flags(dev, flags);
6305	if (ret < 0)
6306		return ret;
6307
6308	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6309	__dev_notify_flags(dev, old_flags, changes);
 
 
 
6310	return ret;
6311}
6312EXPORT_SYMBOL(dev_change_flags);
6313
6314static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6315{
6316	const struct net_device_ops *ops = dev->netdev_ops;
6317
6318	if (ops->ndo_change_mtu)
6319		return ops->ndo_change_mtu(dev, new_mtu);
6320
6321	dev->mtu = new_mtu;
6322	return 0;
6323}
6324
6325/**
6326 *	dev_set_mtu - Change maximum transfer unit
6327 *	@dev: device
6328 *	@new_mtu: new transfer unit
6329 *
6330 *	Change the maximum transfer size of the network device.
6331 */
6332int dev_set_mtu(struct net_device *dev, int new_mtu)
6333{
6334	int err, orig_mtu;
 
6335
6336	if (new_mtu == dev->mtu)
6337		return 0;
6338
6339	/*	MTU must be positive.	 */
6340	if (new_mtu < 0)
6341		return -EINVAL;
6342
6343	if (!netif_device_present(dev))
6344		return -ENODEV;
6345
6346	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6347	err = notifier_to_errno(err);
6348	if (err)
6349		return err;
6350
6351	orig_mtu = dev->mtu;
6352	err = __dev_set_mtu(dev, new_mtu);
6353
6354	if (!err) {
6355		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6356		err = notifier_to_errno(err);
6357		if (err) {
6358			/* setting mtu back and notifying everyone again,
6359			 * so that they have a chance to revert changes.
6360			 */
6361			__dev_set_mtu(dev, orig_mtu);
6362			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6363		}
6364	}
6365	return err;
6366}
6367EXPORT_SYMBOL(dev_set_mtu);
6368
6369/**
6370 *	dev_set_group - Change group this device belongs to
6371 *	@dev: device
6372 *	@new_group: group this device should belong to
6373 */
6374void dev_set_group(struct net_device *dev, int new_group)
6375{
6376	dev->group = new_group;
6377}
6378EXPORT_SYMBOL(dev_set_group);
6379
6380/**
6381 *	dev_set_mac_address - Change Media Access Control Address
6382 *	@dev: device
6383 *	@sa: new address
6384 *
6385 *	Change the hardware (MAC) address of the device
6386 */
6387int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6388{
6389	const struct net_device_ops *ops = dev->netdev_ops;
6390	int err;
6391
6392	if (!ops->ndo_set_mac_address)
6393		return -EOPNOTSUPP;
6394	if (sa->sa_family != dev->type)
6395		return -EINVAL;
6396	if (!netif_device_present(dev))
6397		return -ENODEV;
6398	err = ops->ndo_set_mac_address(dev, sa);
6399	if (err)
6400		return err;
6401	dev->addr_assign_type = NET_ADDR_SET;
6402	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6403	add_device_randomness(dev->dev_addr, dev->addr_len);
6404	return 0;
6405}
6406EXPORT_SYMBOL(dev_set_mac_address);
6407
6408/**
6409 *	dev_change_carrier - Change device carrier
6410 *	@dev: device
6411 *	@new_carrier: new value
6412 *
6413 *	Change device carrier
6414 */
6415int dev_change_carrier(struct net_device *dev, bool new_carrier)
6416{
6417	const struct net_device_ops *ops = dev->netdev_ops;
 
6418
6419	if (!ops->ndo_change_carrier)
6420		return -EOPNOTSUPP;
6421	if (!netif_device_present(dev))
6422		return -ENODEV;
6423	return ops->ndo_change_carrier(dev, new_carrier);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6424}
6425EXPORT_SYMBOL(dev_change_carrier);
6426
6427/**
6428 *	dev_get_phys_port_id - Get device physical port ID
6429 *	@dev: device
6430 *	@ppid: port ID
6431 *
6432 *	Get device physical port ID
6433 */
6434int dev_get_phys_port_id(struct net_device *dev,
6435			 struct netdev_phys_item_id *ppid)
6436{
6437	const struct net_device_ops *ops = dev->netdev_ops;
 
 
6438
6439	if (!ops->ndo_get_phys_port_id)
 
 
 
 
 
 
 
 
 
 
6440		return -EOPNOTSUPP;
6441	return ops->ndo_get_phys_port_id(dev, ppid);
6442}
6443EXPORT_SYMBOL(dev_get_phys_port_id);
6444
6445/**
6446 *	dev_get_phys_port_name - Get device physical port name
6447 *	@dev: device
6448 *	@name: port name
6449 *	@len: limit of bytes to copy to name
6450 *
6451 *	Get device physical port name
6452 */
6453int dev_get_phys_port_name(struct net_device *dev,
6454			   char *name, size_t len)
6455{
6456	const struct net_device_ops *ops = dev->netdev_ops;
 
6457
6458	if (!ops->ndo_get_phys_port_name)
 
 
 
 
 
6459		return -EOPNOTSUPP;
6460	return ops->ndo_get_phys_port_name(dev, name, len);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6461}
6462EXPORT_SYMBOL(dev_get_phys_port_name);
 
 
 
 
6463
6464/**
6465 *	dev_change_proto_down - update protocol port state information
6466 *	@dev: device
6467 *	@proto_down: new value
 
6468 *
6469 *	This info can be used by switch drivers to set the phys state of the
6470 *	port.
 
 
6471 */
6472int dev_change_proto_down(struct net_device *dev, bool proto_down)
 
6473{
6474	const struct net_device_ops *ops = dev->netdev_ops;
 
 
6475
6476	if (!ops->ndo_change_proto_down)
6477		return -EOPNOTSUPP;
6478	if (!netif_device_present(dev))
6479		return -ENODEV;
6480	return ops->ndo_change_proto_down(dev, proto_down);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6481}
6482EXPORT_SYMBOL(dev_change_proto_down);
6483
6484/**
6485 *	dev_new_index	-	allocate an ifindex
6486 *	@net: the applicable net namespace
6487 *
6488 *	Returns a suitable unique value for a new device interface
6489 *	number.  The caller must hold the rtnl semaphore or the
6490 *	dev_base_lock to be sure it remains unique.
6491 */
6492static int dev_new_index(struct net *net)
6493{
6494	int ifindex = net->ifindex;
6495	for (;;) {
6496		if (++ifindex <= 0)
6497			ifindex = 1;
6498		if (!__dev_get_by_index(net, ifindex))
6499			return net->ifindex = ifindex;
6500	}
6501}
6502
/* Delayed registration/unregistration state: devices are queued on
 * net_todo_list by net_set_todo() and drained by netdev_run_todo(), which
 * wakes netdev_unregistering_wq after it has finished with each device.
 */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static LIST_HEAD(net_todo_list);
6506
6507static void net_set_todo(struct net_device *dev)
6508{
6509	list_add_tail(&dev->todo_list, &net_todo_list);
6510	dev_net(dev)->dev_unreg_count++;
6511}
6512
6513static void rollback_registered_many(struct list_head *head)
6514{
6515	struct net_device *dev, *tmp;
6516	LIST_HEAD(close_head);
6517
6518	BUG_ON(dev_boot_phase);
6519	ASSERT_RTNL();
6520
6521	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6522		/* Some devices call without registering
6523		 * for initialization unwind. Remove those
6524		 * devices and proceed with the remaining.
6525		 */
6526		if (dev->reg_state == NETREG_UNINITIALIZED) {
6527			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6528				 dev->name, dev);
6529
6530			WARN_ON(1);
6531			list_del(&dev->unreg_list);
6532			continue;
6533		}
6534		dev->dismantle = true;
6535		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6536	}
6537
6538	/* If device is running, close it first. */
6539	list_for_each_entry(dev, head, unreg_list)
6540		list_add_tail(&dev->close_list, &close_head);
6541	dev_close_many(&close_head, true);
6542
6543	list_for_each_entry(dev, head, unreg_list) {
6544		/* And unlink it from device chain. */
6545		unlist_netdevice(dev);
6546
6547		dev->reg_state = NETREG_UNREGISTERING;
6548		on_each_cpu(flush_backlog, dev, 1);
6549	}
6550
6551	synchronize_net();
6552
6553	list_for_each_entry(dev, head, unreg_list) {
6554		struct sk_buff *skb = NULL;
6555
6556		/* Shutdown queueing discipline. */
6557		dev_shutdown(dev);
6558
6559
6560		/* Notify protocols, that we are about to destroy
6561		   this device. They should clean all the things.
6562		*/
6563		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6564
6565		if (!dev->rtnl_link_ops ||
6566		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6567			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6568						     GFP_KERNEL);
6569
6570		/*
6571		 *	Flush the unicast and multicast chains
6572		 */
6573		dev_uc_flush(dev);
6574		dev_mc_flush(dev);
6575
6576		if (dev->netdev_ops->ndo_uninit)
6577			dev->netdev_ops->ndo_uninit(dev);
6578
6579		if (skb)
6580			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6581
6582		/* Notifier chain MUST detach us all upper devices. */
6583		WARN_ON(netdev_has_any_upper_dev(dev));
6584
6585		/* Remove entries from kobject tree */
6586		netdev_unregister_kobject(dev);
6587#ifdef CONFIG_XPS
6588		/* Remove XPS queueing entries */
6589		netif_reset_xps_queues_gt(dev, 0);
6590#endif
6591	}
6592
 
 
 
 
6593	synchronize_net();
6594
6595	list_for_each_entry(dev, head, unreg_list)
6596		dev_put(dev);
6597}
6598
6599static void rollback_registered(struct net_device *dev)
6600{
6601	LIST_HEAD(single);
6602
6603	list_add(&dev->unreg_list, &single);
6604	rollback_registered_many(&single);
6605	list_del(&single);
6606}
6607
6608static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6609	struct net_device *upper, netdev_features_t features)
6610{
6611	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6612	netdev_features_t feature;
6613	int feature_bit;
6614
6615	for_each_netdev_feature(&upper_disables, feature_bit) {
6616		feature = __NETIF_F_BIT(feature_bit);
6617		if (!(upper->wanted_features & feature)
6618		    && (features & feature)) {
6619			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6620				   &feature, upper->name);
6621			features &= ~feature;
6622		}
6623	}
6624
6625	return features;
6626}
6627
6628static void netdev_sync_lower_features(struct net_device *upper,
6629	struct net_device *lower, netdev_features_t features)
6630{
6631	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6632	netdev_features_t feature;
6633	int feature_bit;
6634
6635	for_each_netdev_feature(&upper_disables, feature_bit) {
6636		feature = __NETIF_F_BIT(feature_bit);
6637		if (!(features & feature) && (lower->features & feature)) {
6638			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6639				   &feature, lower->name);
6640			lower->wanted_features &= ~feature;
6641			netdev_update_features(lower);
6642
6643			if (unlikely(lower->features & feature))
6644				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6645					    &feature, lower->name);
6646		}
6647	}
6648}
6649
6650static netdev_features_t netdev_fix_features(struct net_device *dev,
6651	netdev_features_t features)
6652{
6653	/* Fix illegal checksum combinations */
6654	if ((features & NETIF_F_HW_CSUM) &&
6655	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6656		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6657		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6658	}
6659
 
 
 
 
 
 
 
 
6660	/* TSO requires that SG is present as well. */
6661	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6662		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6663		features &= ~NETIF_F_ALL_TSO;
6664	}
6665
6666	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6667					!(features & NETIF_F_IP_CSUM)) {
6668		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6669		features &= ~NETIF_F_TSO;
6670		features &= ~NETIF_F_TSO_ECN;
6671	}
6672
6673	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6674					 !(features & NETIF_F_IPV6_CSUM)) {
6675		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6676		features &= ~NETIF_F_TSO6;
6677	}
6678
6679	/* TSO ECN requires that TSO is present as well. */
6680	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6681		features &= ~NETIF_F_TSO_ECN;
6682
6683	/* Software GSO depends on SG. */
6684	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6685		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6686		features &= ~NETIF_F_GSO;
6687	}
6688
6689	/* UFO needs SG and checksumming */
6690	if (features & NETIF_F_UFO) {
6691		/* maybe split UFO into V4 and V6? */
6692		if (!(features & NETIF_F_HW_CSUM) &&
6693		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6694		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6695			netdev_dbg(dev,
6696				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6697			features &= ~NETIF_F_UFO;
6698		}
6699
6700		if (!(features & NETIF_F_SG)) {
6701			netdev_dbg(dev,
6702				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6703			features &= ~NETIF_F_UFO;
6704		}
6705	}
6706
6707#ifdef CONFIG_NET_RX_BUSY_POLL
6708	if (dev->netdev_ops->ndo_busy_poll)
6709		features |= NETIF_F_BUSY_POLL;
6710	else
6711#endif
6712		features &= ~NETIF_F_BUSY_POLL;
6713
6714	return features;
6715}
6716
6717int __netdev_update_features(struct net_device *dev)
6718{
6719	struct net_device *upper, *lower;
6720	netdev_features_t features;
6721	struct list_head *iter;
6722	int err = -1;
6723
6724	ASSERT_RTNL();
6725
6726	features = netdev_get_wanted_features(dev);
6727
6728	if (dev->netdev_ops->ndo_fix_features)
6729		features = dev->netdev_ops->ndo_fix_features(dev, features);
6730
6731	/* driver might be less strict about feature dependencies */
6732	features = netdev_fix_features(dev, features);
6733
6734	/* some features can't be enabled if they're off an an upper device */
6735	netdev_for_each_upper_dev_rcu(dev, upper, iter)
6736		features = netdev_sync_upper_features(dev, upper, features);
6737
6738	if (dev->features == features)
6739		goto sync_lower;
6740
6741	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6742		&dev->features, &features);
6743
6744	if (dev->netdev_ops->ndo_set_features)
6745		err = dev->netdev_ops->ndo_set_features(dev, features);
6746	else
6747		err = 0;
6748
6749	if (unlikely(err < 0)) {
6750		netdev_err(dev,
6751			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6752			err, &features, &dev->features);
6753		/* return non-0 since some features might have changed and
6754		 * it's better to fire a spurious notification than miss it
6755		 */
6756		return -1;
6757	}
6758
6759sync_lower:
6760	/* some features must be disabled on lower devices when disabled
6761	 * on an upper device (think: bonding master or bridge)
6762	 */
6763	netdev_for_each_lower_dev(dev, lower, iter)
6764		netdev_sync_lower_features(dev, lower, features);
6765
6766	if (!err)
6767		dev->features = features;
6768
6769	return err < 0 ? 0 : 1;
6770}
6771
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and, when
 *	__netdev_update_features() returns non-zero, send the change
 *	notification.  Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	int notify = __netdev_update_features(dev);

	if (!notify)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6786
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and send the change notification
 *	unconditionally, whatever the recalculation reports.  Should be
 *	called instead of netdev_update_features() if also
 *	dev->vlan_features might have changed to allow the changes to be
 *	propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* result deliberately ignored: we always notify */
	(void)__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6803
6804/**
6805 *	netif_stacked_transfer_operstate -	transfer operstate
6806 *	@rootdev: the root or lower level device to transfer state from
6807 *	@dev: the device to transfer operstate to
6808 *
6809 *	Transfer operational state from root to device. This is normally
6810 *	called when a stacking relationship exists between the root
6811 *	device and the device(a leaf device).
6812 */
6813void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6814					struct net_device *dev)
6815{
6816	if (rootdev->operstate == IF_OPER_DORMANT)
6817		netif_dormant_on(dev);
6818	else
6819		netif_dormant_off(dev);
6820
6821	if (netif_carrier_ok(rootdev)) {
6822		if (!netif_carrier_ok(dev))
6823			netif_carrier_on(dev);
6824	} else {
6825		if (netif_carrier_ok(dev))
6826			netif_carrier_off(dev);
6827	}
6828}
6829EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6830
#ifdef CONFIG_SYSFS
/* Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.  kzalloc is
 * tried first, vzalloc is the fallback.  Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;
	size_t bytes;

	BUG_ON(nqueues < 1);

	bytes = nqueues * sizeof(*queues);
	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
6853
6854static void netdev_init_one_queue(struct net_device *dev,
6855				  struct netdev_queue *queue, void *_unused)
6856{
6857	/* Initialize queue lock */
6858	spin_lock_init(&queue->_xmit_lock);
6859	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6860	queue->xmit_lock_owner = -1;
6861	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6862	queue->dev = dev;
6863#ifdef CONFIG_BQL
6864	dql_init(&queue->dql, HZ);
6865#endif
6866}
6867
6868static void netif_free_tx_queues(struct net_device *dev)
6869{
6870	kvfree(dev->_tx);
6871}
6872
6873static int netif_alloc_netdev_queues(struct net_device *dev)
6874{
6875	unsigned int count = dev->num_tx_queues;
6876	struct netdev_queue *tx;
6877	size_t sz = count * sizeof(*tx);
6878
6879	if (count < 1 || count > 0xffff)
6880		return -EINVAL;
6881
6882	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6883	if (!tx) {
6884		tx = vzalloc(sz);
6885		if (!tx)
6886			return -ENOMEM;
6887	}
6888	dev->_tx = tx;
6889
6890	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6891	spin_lock_init(&dev->tx_global_lock);
6892
6893	return 0;
6894}
6895
6896void netif_tx_stop_all_queues(struct net_device *dev)
6897{
6898	unsigned int i;
6899
6900	for (i = 0; i < dev->num_tx_queues; i++) {
6901		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6902		netif_tx_stop_queue(txq);
6903	}
6904}
6905EXPORT_SYMBOL(netif_tx_stop_all_queues);
6906
6907/**
6908 *	register_netdevice	- register a network device
6909 *	@dev: device to register
6910 *
6911 *	Take a completed network device structure and add it to the kernel
6912 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6913 *	chain. 0 is returned on success. A negative errno code is returned
6914 *	on a failure to set up the device, or if the name is a duplicate.
6915 *
6916 *	Callers must hold the rtnl semaphore. You may want
6917 *	register_netdev() instead of this.
6918 *
6919 *	BUGS:
6920 *	The locking appears insufficient to guarantee two parallel registers
6921 *	will not get the same name.
6922 */
6923
6924int register_netdevice(struct net_device *dev)
6925{
6926	int ret;
6927	struct net *net = dev_net(dev);
6928
6929	BUG_ON(dev_boot_phase);
6930	ASSERT_RTNL();
6931
6932	might_sleep();
6933
6934	/* When net_device's are persistent, this will be fatal. */
6935	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6936	BUG_ON(!net);
6937
6938	spin_lock_init(&dev->addr_list_lock);
6939	netdev_set_addr_lockdep_class(dev);
6940
6941	ret = dev_get_valid_name(net, dev, dev->name);
 
 
6942	if (ret < 0)
6943		goto out;
6944
6945	/* Init, if this function is available */
6946	if (dev->netdev_ops->ndo_init) {
6947		ret = dev->netdev_ops->ndo_init(dev);
6948		if (ret) {
6949			if (ret > 0)
6950				ret = -EIO;
6951			goto out;
6952		}
6953	}
6954
6955	if (((dev->hw_features | dev->features) &
6956	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6957	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6958	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6959		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6960		ret = -EINVAL;
6961		goto err_uninit;
6962	}
6963
6964	ret = -EBUSY;
6965	if (!dev->ifindex)
6966		dev->ifindex = dev_new_index(net);
6967	else if (__dev_get_by_index(net, dev->ifindex))
6968		goto err_uninit;
6969
6970	/* Transfer changeable features to wanted_features and enable
6971	 * software offloads (GSO and GRO).
6972	 */
6973	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6974	dev->features |= NETIF_F_SOFT_FEATURES;
6975	dev->wanted_features = dev->features & dev->hw_features;
6976
 
6977	if (!(dev->flags & IFF_LOOPBACK)) {
6978		dev->hw_features |= NETIF_F_NOCACHE_COPY;
 
 
 
 
6979	}
6980
6981	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6982	 */
6983	dev->vlan_features |= NETIF_F_HIGHDMA;
6984
6985	/* Make NETIF_F_SG inheritable to tunnel devices.
6986	 */
6987	dev->hw_enc_features |= NETIF_F_SG;
6988
6989	/* Make NETIF_F_SG inheritable to MPLS.
6990	 */
6991	dev->mpls_features |= NETIF_F_SG;
6992
6993	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6994	ret = notifier_to_errno(ret);
6995	if (ret)
6996		goto err_uninit;
6997
6998	ret = netdev_register_kobject(dev);
6999	if (ret)
7000		goto err_uninit;
7001	dev->reg_state = NETREG_REGISTERED;
7002
7003	__netdev_update_features(dev);
7004
7005	/*
7006	 *	Default initial state at registry is that the
7007	 *	device is present.
7008	 */
7009
7010	set_bit(__LINK_STATE_PRESENT, &dev->state);
7011
7012	linkwatch_init_dev(dev);
7013
7014	dev_init_scheduler(dev);
7015	dev_hold(dev);
7016	list_netdevice(dev);
7017	add_device_randomness(dev->dev_addr, dev->addr_len);
7018
7019	/* If the device has permanent device address, driver should
7020	 * set dev_addr and also addr_assign_type should be set to
7021	 * NET_ADDR_PERM (default value).
7022	 */
7023	if (dev->addr_assign_type == NET_ADDR_PERM)
7024		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7025
7026	/* Notify protocols, that a new device appeared. */
7027	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7028	ret = notifier_to_errno(ret);
7029	if (ret) {
7030		rollback_registered(dev);
7031		dev->reg_state = NETREG_UNREGISTERED;
7032	}
7033	/*
7034	 *	Prevent userspace races by waiting until the network
7035	 *	device is fully setup before sending notifications.
7036	 */
7037	if (!dev->rtnl_link_ops ||
7038	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7039		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7040
7041out:
7042	return ret;
7043
7044err_uninit:
7045	if (dev->netdev_ops->ndo_uninit)
7046		dev->netdev_ops->ndo_uninit(dev);
7047	goto out;
7048}
7049EXPORT_SYMBOL(register_netdevice);
7050
7051/**
7052 *	init_dummy_netdev	- init a dummy network device for NAPI
7053 *	@dev: device to init
7054 *
7055 *	This takes a network device structure and initialize the minimum
7056 *	amount of fields so it can be used to schedule NAPI polls without
7057 *	registering a full blown interface. This is to be used by drivers
7058 *	that need to tie several hardware interfaces to a single NAPI
7059 *	poll scheduler due to HW limitations.
7060 */
7061int init_dummy_netdev(struct net_device *dev)
7062{
7063	/* Clear everything. Note we don't initialize spinlocks
7064	 * are they aren't supposed to be taken by any of the
7065	 * NAPI code and this dummy netdev is supposed to be
7066	 * only ever used for NAPI polls
7067	 */
7068	memset(dev, 0, sizeof(struct net_device));
7069
7070	/* make sure we BUG if trying to hit standard
7071	 * register/unregister code path
7072	 */
7073	dev->reg_state = NETREG_DUMMY;
7074
7075	/* NAPI wants this */
7076	INIT_LIST_HEAD(&dev->napi_list);
7077
7078	/* a dummy interface is started by default */
7079	set_bit(__LINK_STATE_PRESENT, &dev->state);
7080	set_bit(__LINK_STATE_START, &dev->state);
7081
7082	/* Note : We dont allocate pcpu_refcnt for dummy devices,
7083	 * because users of this 'device' dont need to change
7084	 * its refcount.
7085	 */
7086
7087	return 0;
7088}
7089EXPORT_SYMBOL_GPL(init_dummy_netdev);
7090
7091
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl
 *	semaphore and expands the device name if you passed a format string
 *	to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
7115
7116int netdev_refcnt_read(const struct net_device *dev)
7117{
7118	int i, refcnt = 0;
7119
7120	for_each_possible_cpu(i)
7121		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7122	return refcnt;
7123}
7124EXPORT_SYMBOL(netdev_refcnt_read);
7125
7126/**
7127 * netdev_wait_allrefs - wait until all references are gone.
7128 * @dev: target net_device
7129 *
7130 * This is called when unregistering network devices.
7131 *
7132 * Any protocol or device that holds a reference should register
7133 * for netdevice notification, and cleanup and put back the
7134 * reference if they receive an UNREGISTER event.
7135 * We can get stuck here if buggy protocols don't correctly
7136 * call dev_put.
7137 */
7138static void netdev_wait_allrefs(struct net_device *dev)
7139{
7140	unsigned long rebroadcast_time, warning_time;
7141	int refcnt;
7142
7143	linkwatch_forget_dev(dev);
7144
7145	rebroadcast_time = warning_time = jiffies;
7146	refcnt = netdev_refcnt_read(dev);
7147
7148	while (refcnt != 0) {
7149		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7150			rtnl_lock();
7151
7152			/* Rebroadcast unregister notification */
7153			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
 
7154
7155			__rtnl_unlock();
7156			rcu_barrier();
7157			rtnl_lock();
7158
7159			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7160			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7161				     &dev->state)) {
7162				/* We must not have linkwatch events
7163				 * pending on unregister. If this
7164				 * happens, we simply run the queue
7165				 * unscheduled, resulting in a noop
7166				 * for this device.
7167				 */
7168				linkwatch_run_queue();
7169			}
7170
7171			__rtnl_unlock();
7172
7173			rebroadcast_time = jiffies;
7174		}
7175
7176		msleep(250);
7177
7178		refcnt = netdev_refcnt_read(dev);
7179
7180		if (time_after(jiffies, warning_time + 10 * HZ)) {
7181			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7182				 dev->name, refcnt);
7183			warning_time = jiffies;
7184		}
7185	}
7186}
7187
7188/* The sequence is:
7189 *
7190 *	rtnl_lock();
7191 *	...
7192 *	register_netdevice(x1);
7193 *	register_netdevice(x2);
7194 *	...
7195 *	unregister_netdevice(y1);
7196 *	unregister_netdevice(y2);
7197 *      ...
7198 *	rtnl_unlock();
7199 *	free_netdev(y1);
7200 *	free_netdev(y2);
7201 *
7202 * We are invoked by rtnl_unlock().
7203 * This allows us to deal with problems:
7204 * 1) We can delete sysfs objects which invoke hotplug
7205 *    without deadlocking with linkwatch via keventd.
7206 * 2) Since we run with the RTNL semaphore not held, we can sleep
7207 *    safely in order to wait for the netdev refcnt to drop to zero.
7208 *
7209 * We must not return until all unregister events added during
7210 * the interval the lock was held have been completed.
7211 */
7212void netdev_run_todo(void)
7213{
7214	struct list_head list;
7215
7216	/* Snapshot list, allow later requests */
7217	list_replace_init(&net_todo_list, &list);
7218
7219	__rtnl_unlock();
7220
7221
7222	/* Wait for rcu callbacks to finish before next phase */
 
7223	if (!list_empty(&list))
7224		rcu_barrier();
7225
7226	while (!list_empty(&list)) {
7227		struct net_device *dev
7228			= list_first_entry(&list, struct net_device, todo_list);
7229		list_del(&dev->todo_list);
7230
7231		rtnl_lock();
7232		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7233		__rtnl_unlock();
7234
7235		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7236			pr_err("network todo '%s' but state %d\n",
7237			       dev->name, dev->reg_state);
7238			dump_stack();
7239			continue;
7240		}
7241
7242		dev->reg_state = NETREG_UNREGISTERED;
7243
 
 
7244		netdev_wait_allrefs(dev);
7245
7246		/* paranoia */
7247		BUG_ON(netdev_refcnt_read(dev));
7248		BUG_ON(!list_empty(&dev->ptype_all));
7249		BUG_ON(!list_empty(&dev->ptype_specific));
7250		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7251		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7252		WARN_ON(dev->dn_ptr);
7253
7254		if (dev->destructor)
7255			dev->destructor(dev);
7256
7257		/* Report a network device has been unregistered */
7258		rtnl_lock();
7259		dev_net(dev)->dev_unreg_count--;
7260		__rtnl_unlock();
7261		wake_up(&netdev_unregistering_wq);
7262
7263		/* Free network device */
7264		kobject_put(&dev->dev.kobj);
7265	}
7266}
7267
7268/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7269 * all the same fields in the same order as net_device_stats, with only
7270 * the type differing, but rtnl_link_stats64 may have additional fields
7271 * at the end for newer counters.
7272 */
7273void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7274			     const struct net_device_stats *netdev_stats)
7275{
7276#if BITS_PER_LONG == 64
7277	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7278	memcpy(stats64, netdev_stats, sizeof(*stats64));
7279	/* zero out counters that only exist in rtnl_link_stats64 */
7280	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7281	       sizeof(*stats64) - sizeof(*netdev_stats));
7282#else
7283	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7284	const unsigned long *src = (const unsigned long *)netdev_stats;
7285	u64 *dst = (u64 *)stats64;
7286
7287	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
 
7288	for (i = 0; i < n; i++)
7289		dst[i] = src[i];
7290	/* zero out counters that only exist in rtnl_link_stats64 */
7291	memset((char *)stats64 + n * sizeof(u64), 0,
7292	       sizeof(*stats64) - n * sizeof(u64));
7293#endif
7294}
7295EXPORT_SYMBOL(netdev_stats_to_stats64);
7296
7297/**
7298 *	dev_get_stats	- get network device statistics
7299 *	@dev: device to get statistics from
7300 *	@storage: place to store stats
7301 *
7302 *	Get network statistics from device. Return @storage.
7303 *	The device driver may provide its own method by setting
7304 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7305 *	otherwise the internal statistics structure is used.
7306 */
7307struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7308					struct rtnl_link_stats64 *storage)
7309{
7310	const struct net_device_ops *ops = dev->netdev_ops;
7311
7312	if (ops->ndo_get_stats64) {
7313		memset(storage, 0, sizeof(*storage));
7314		ops->ndo_get_stats64(dev, storage);
7315	} else if (ops->ndo_get_stats) {
7316		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7317	} else {
7318		netdev_stats_to_stats64(storage, &dev->stats);
7319	}
7320	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7321	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7322	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7323	return storage;
7324}
7325EXPORT_SYMBOL(dev_get_stats);
7326
/* Return @dev's ingress queue.  With CONFIG_NET_CLS_ACT a missing queue
 * is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully initialised */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
7344
7345static const struct ethtool_ops default_ethtool_ops;
7346
7347void netdev_set_default_ethtool_ops(struct net_device *dev,
7348				    const struct ethtool_ops *ops)
7349{
7350	if (dev->ethtool_ops == &default_ethtool_ops)
7351		dev->ethtool_ops = ops;
7352}
7353EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7354
7355void netdev_freemem(struct net_device *dev)
7356{
7357	char *addr = (char *)dev - dev->padded;
7358
7359	kvfree(addr);
7360}
7361
7362/**
7363 *	alloc_netdev_mqs - allocate network device
7364 *	@sizeof_priv:		size of private data to allocate space for
7365 *	@name:			device name format string
7366 *	@name_assign_type: 	origin of device name
7367 *	@setup:			callback to initialize device
7368 *	@txqs:			the number of TX subqueues to allocate
7369 *	@rxqs:			the number of RX subqueues to allocate
7370 *
7371 *	Allocates a struct net_device with private data area for driver use
7372 *	and performs basic initialization.  Also allocates subqueue structs
7373 *	for each queue on the device.
7374 */
7375struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7376		unsigned char name_assign_type,
7377		void (*setup)(struct net_device *),
7378		unsigned int txqs, unsigned int rxqs)
7379{
7380	struct net_device *dev;
7381	size_t alloc_size;
7382	struct net_device *p;
7383
7384	BUG_ON(strlen(name) >= sizeof(dev->name));
7385
7386	if (txqs < 1) {
7387		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7388		return NULL;
7389	}
7390
7391#ifdef CONFIG_SYSFS
7392	if (rxqs < 1) {
7393		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7394		return NULL;
7395	}
7396#endif
7397
7398	alloc_size = sizeof(struct net_device);
7399	if (sizeof_priv) {
7400		/* ensure 32-byte alignment of private area */
7401		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7402		alloc_size += sizeof_priv;
7403	}
7404	/* ensure 32-byte alignment of whole construct */
7405	alloc_size += NETDEV_ALIGN - 1;
7406
7407	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7408	if (!p)
7409		p = vzalloc(alloc_size);
7410	if (!p)
7411		return NULL;
 
7412
7413	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7414	dev->padded = (char *)dev - (char *)p;
7415
7416	dev->pcpu_refcnt = alloc_percpu(int);
7417	if (!dev->pcpu_refcnt)
7418		goto free_dev;
7419
7420	if (dev_addr_init(dev))
7421		goto free_pcpu;
7422
7423	dev_mc_init(dev);
7424	dev_uc_init(dev);
7425
7426	dev_net_set(dev, &init_net);
7427
7428	dev->gso_max_size = GSO_MAX_SIZE;
7429	dev->gso_max_segs = GSO_MAX_SEGS;
7430	dev->gso_min_segs = 0;
7431
7432	INIT_LIST_HEAD(&dev->napi_list);
7433	INIT_LIST_HEAD(&dev->unreg_list);
7434	INIT_LIST_HEAD(&dev->close_list);
7435	INIT_LIST_HEAD(&dev->link_watch_list);
7436	INIT_LIST_HEAD(&dev->adj_list.upper);
7437	INIT_LIST_HEAD(&dev->adj_list.lower);
7438	INIT_LIST_HEAD(&dev->all_adj_list.upper);
7439	INIT_LIST_HEAD(&dev->all_adj_list.lower);
7440	INIT_LIST_HEAD(&dev->ptype_all);
7441	INIT_LIST_HEAD(&dev->ptype_specific);
7442	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7443	setup(dev);
7444
7445	if (!dev->tx_queue_len) {
7446		dev->priv_flags |= IFF_NO_QUEUE;
7447		dev->tx_queue_len = 1;
7448	}
7449
7450	dev->num_tx_queues = txqs;
7451	dev->real_num_tx_queues = txqs;
7452	if (netif_alloc_netdev_queues(dev))
7453		goto free_all;
7454
7455#ifdef CONFIG_SYSFS
7456	dev->num_rx_queues = rxqs;
7457	dev->real_num_rx_queues = rxqs;
7458	if (netif_alloc_rx_queues(dev))
7459		goto free_all;
7460#endif
7461
7462	strcpy(dev->name, name);
7463	dev->name_assign_type = name_assign_type;
7464	dev->group = INIT_NETDEV_GROUP;
7465	if (!dev->ethtool_ops)
7466		dev->ethtool_ops = &default_ethtool_ops;
7467
7468	nf_hook_ingress_init(dev);
7469
7470	return dev;
7471
7472free_all:
7473	free_netdev(dev);
7474	return NULL;
7475
7476free_pcpu:
7477	free_percpu(dev->pcpu_refcnt);
7478free_dev:
7479	netdev_freemem(dev);
 
 
 
 
 
7480	return NULL;
7481}
7482EXPORT_SYMBOL(alloc_netdev_mqs);
7483
7484/**
7485 *	free_netdev - free network device
7486 *	@dev: device
7487 *
7488 *	This function does the last stage of destroying an allocated device
7489 * 	interface. The reference to the device object is released.
7490 *	If this is the last reference then it will be freed.
7491 *	Must be called in process context.
7492 */
7493void free_netdev(struct net_device *dev)
7494{
7495	struct napi_struct *p, *n;
7496
7497	might_sleep();
7498	netif_free_tx_queues(dev);
7499#ifdef CONFIG_SYSFS
7500	kvfree(dev->_rx);
 
7501#endif
7502
7503	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7504
7505	/* Flush device addresses */
7506	dev_addr_flush(dev);
7507
7508	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7509		netif_napi_del(p);
7510
7511	free_percpu(dev->pcpu_refcnt);
7512	dev->pcpu_refcnt = NULL;
7513
7514	/*  Compatibility with error handling in drivers */
7515	if (dev->reg_state == NETREG_UNINITIALIZED) {
7516		netdev_freemem(dev);
7517		return;
7518	}
7519
7520	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7521	dev->reg_state = NETREG_RELEASED;
7522
7523	/* will free via device release */
7524	put_device(&dev->dev);
7525}
7526EXPORT_SYMBOL(free_netdev);
7527
/**
 *	synchronize_net - wait for in-flight packet receive processing
 *
 *	Waits for an RCU grace period so that packets currently being
 *	received are done. Later packets are not blocked from starting.
 *	The expedited grace period is used while the rtnl lock is held.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}
	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7543
7544/**
7545 *	unregister_netdevice_queue - remove device from the kernel
7546 *	@dev: device
7547 *	@head: list
7548 *
7549 *	This function shuts down a device interface and removes it
7550 *	from the kernel tables.
7551 *	If head not NULL, device is queued to be unregistered later.
7552 *
7553 *	Callers must hold the rtnl semaphore.  You may want
7554 *	unregister_netdev() instead of this.
7555 */
7556
7557void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7558{
7559	ASSERT_RTNL();
7560
7561	if (head) {
7562		list_move_tail(&dev->unreg_list, head);
7563	} else {
7564		rollback_registered(dev);
7565		/* Finish processing unregister after unlock */
7566		net_set_todo(dev);
7567	}
7568}
7569EXPORT_SYMBOL(unregister_netdevice_queue);
7570
7571/**
7572 *	unregister_netdevice_many - unregister many devices
7573 *	@head: list of devices
7574 *
7575 *  Note: As most callers use a stack allocated list_head,
7576 *  we force a list_del() to make sure stack wont be corrupted later.
7577 */
7578void unregister_netdevice_many(struct list_head *head)
7579{
7580	struct net_device *dev;
7581
7582	if (!list_empty(head)) {
7583		rollback_registered_many(head);
7584		list_for_each_entry(dev, head, unreg_list)
7585			net_set_todo(dev);
7586		list_del(head);
7587	}
7588}
7589EXPORT_SYMBOL(unregister_netdevice_many);
7590
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and drops the semaphore again.  In general
 *	this is the variant to use rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7609
7610/**
7611 *	dev_change_net_namespace - move device to different nethost namespace
7612 *	@dev: device
7613 *	@net: network namespace
7614 *	@pat: If not NULL name pattern to try if the current device name
7615 *	      is already taken in the destination network namespace.
7616 *
7617 *	This function shuts down a device interface and moves it
7618 *	to a new network namespace. On success 0 is returned, on
7619 *	a failure a netagive errno code is returned.
7620 *
7621 *	Callers must hold the rtnl semaphore.
7622 */
7623
7624int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7625{
7626	int err;
7627
7628	ASSERT_RTNL();
7629
7630	/* Don't allow namespace local devices to be moved. */
7631	err = -EINVAL;
7632	if (dev->features & NETIF_F_NETNS_LOCAL)
7633		goto out;
7634
7635	/* Ensure the device has been registrered */
 
7636	if (dev->reg_state != NETREG_REGISTERED)
7637		goto out;
7638
7639	/* Get out if there is nothing todo */
7640	err = 0;
7641	if (net_eq(dev_net(dev), net))
7642		goto out;
7643
7644	/* Pick the destination device name, and ensure
7645	 * we can use it in the destination network namespace.
7646	 */
7647	err = -EEXIST;
7648	if (__dev_get_by_name(net, dev->name)) {
7649		/* We get here if we can't use the current device name */
7650		if (!pat)
7651			goto out;
7652		if (dev_get_valid_name(net, dev, pat) < 0)
7653			goto out;
7654	}
7655
7656	/*
7657	 * And now a mini version of register_netdevice unregister_netdevice.
7658	 */
7659
7660	/* If device is running close it first. */
7661	dev_close(dev);
7662
7663	/* And unlink it from device chain */
7664	err = -ENODEV;
7665	unlist_netdevice(dev);
7666
7667	synchronize_net();
7668
7669	/* Shutdown queueing discipline. */
7670	dev_shutdown(dev);
7671
7672	/* Notify protocols, that we are about to destroy
7673	   this device. They should clean all the things.
7674
7675	   Note that dev->reg_state stays at NETREG_REGISTERED.
7676	   This is wanted because this way 8021q and macvlan know
7677	   the device is just moving and can keep their slaves up.
7678	*/
7679	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7680	rcu_barrier();
7681	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7682	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7683
7684	/*
7685	 *	Flush the unicast and multicast chains
7686	 */
7687	dev_uc_flush(dev);
7688	dev_mc_flush(dev);
7689
7690	/* Send a netdev-removed uevent to the old namespace */
7691	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7692	netdev_adjacent_del_links(dev);
7693
7694	/* Actually switch the network namespace */
7695	dev_net_set(dev, net);
7696
7697	/* If there is an ifindex conflict assign a new one */
7698	if (__dev_get_by_index(net, dev->ifindex))
 
7699		dev->ifindex = dev_new_index(net);
7700
7701	/* Send a netdev-add uevent to the new namespace */
7702	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7703	netdev_adjacent_add_links(dev);
7704
7705	/* Fixup kobjects */
7706	err = device_rename(&dev->dev, dev->name);
7707	WARN_ON(err);
7708
7709	/* Add the device back in the hashes */
7710	list_netdevice(dev);
7711
7712	/* Notify protocols, that a new device appeared. */
7713	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7714
7715	/*
7716	 *	Prevent userspace races by waiting until the network
7717	 *	device is fully setup before sending notifications.
7718	 */
7719	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7720
7721	synchronize_net();
7722	err = 0;
7723out:
7724	return err;
7725}
7726EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7727
7728static int dev_cpu_callback(struct notifier_block *nfb,
7729			    unsigned long action,
7730			    void *ocpu)
7731{
7732	struct sk_buff **list_skb;
7733	struct sk_buff *skb;
7734	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7735	struct softnet_data *sd, *oldsd;
7736
7737	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7738		return NOTIFY_OK;
7739
7740	local_irq_disable();
7741	cpu = smp_processor_id();
7742	sd = &per_cpu(softnet_data, cpu);
7743	oldsd = &per_cpu(softnet_data, oldcpu);
7744
7745	/* Find end of our completion_queue. */
7746	list_skb = &sd->completion_queue;
7747	while (*list_skb)
7748		list_skb = &(*list_skb)->next;
7749	/* Append completion queue from offline CPU. */
7750	*list_skb = oldsd->completion_queue;
7751	oldsd->completion_queue = NULL;
7752
7753	/* Append output queue from offline CPU. */
7754	if (oldsd->output_queue) {
7755		*sd->output_queue_tailp = oldsd->output_queue;
7756		sd->output_queue_tailp = oldsd->output_queue_tailp;
7757		oldsd->output_queue = NULL;
7758		oldsd->output_queue_tailp = &oldsd->output_queue;
7759	}
7760	/* Append NAPI poll list from offline CPU, with one exception :
7761	 * process_backlog() must be called by cpu owning percpu backlog.
7762	 * We properly handle process_queue & input_pkt_queue later.
7763	 */
7764	while (!list_empty(&oldsd->poll_list)) {
7765		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7766							    struct napi_struct,
7767							    poll_list);
7768
7769		list_del_init(&napi->poll_list);
7770		if (napi->poll == process_backlog)
7771			napi->state = 0;
7772		else
7773			____napi_schedule(sd, napi);
7774	}
7775
7776	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7777	local_irq_enable();
7778
7779	/* Process offline CPU's input_pkt_queue */
7780	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7781		netif_rx_ni(skb);
7782		input_queue_head_incr(oldsd);
7783	}
7784	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7785		netif_rx_ni(skb);
7786		input_queue_head_incr(oldsd);
7787	}
7788
7789	return NOTIFY_OK;
7790}
7791
7792
7793/**
7794 *	netdev_increment_features - increment feature set by one
7795 *	@all: current feature set
7796 *	@one: new feature set
7797 *	@mask: mask feature set
7798 *
7799 *	Computes a new feature set after adding a device with feature set
7800 *	@one to the master device with current feature set @all.  Will not
7801 *	enable anything that is off in @mask. Returns the new feature set.
7802 */
7803netdev_features_t netdev_increment_features(netdev_features_t all,
7804	netdev_features_t one, netdev_features_t mask)
7805{
7806	if (mask & NETIF_F_HW_CSUM)
7807		mask |= NETIF_F_CSUM_MASK;
7808	mask |= NETIF_F_VLAN_CHALLENGED;
7809
7810	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7811	all &= one | ~NETIF_F_ALL_FOR_ALL;
7812
7813	/* If one device supports hw checksumming, set for all. */
7814	if (all & NETIF_F_HW_CSUM)
7815		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7816
7817	return all;
7818}
7819EXPORT_SYMBOL(netdev_increment_features);
7820
7821static struct hlist_head * __net_init netdev_create_hash(void)
7822{
7823	int i;
7824	struct hlist_head *hash;
7825
7826	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7827	if (hash != NULL)
7828		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7829			INIT_HLIST_HEAD(&hash[i]);
7830
7831	return hash;
7832}
7833
7834/* Initialize per network namespace state */
7835static int __net_init netdev_init(struct net *net)
7836{
7837	if (net != &init_net)
7838		INIT_LIST_HEAD(&net->dev_base_head);
7839
7840	net->dev_name_head = netdev_create_hash();
7841	if (net->dev_name_head == NULL)
7842		goto err_name;
7843
7844	net->dev_index_head = netdev_create_hash();
7845	if (net->dev_index_head == NULL)
7846		goto err_idx;
7847
7848	return 0;
7849
7850err_idx:
7851	kfree(net->dev_name_head);
7852err_name:
7853	return -ENOMEM;
7854}
7855
7856/**
7857 *	netdev_drivername - network driver for the device
7858 *	@dev: network device
7859 *
7860 *	Determine network driver for device.
7861 */
7862const char *netdev_drivername(const struct net_device *dev)
7863{
7864	const struct device_driver *driver;
7865	const struct device *parent;
7866	const char *empty = "";
7867
7868	parent = dev->dev.parent;
7869	if (!parent)
7870		return empty;
7871
7872	driver = parent->driver;
7873	if (driver && driver->name)
7874		return driver->name;
7875	return empty;
7876}
7877
7878static void __netdev_printk(const char *level, const struct net_device *dev,
7879			    struct va_format *vaf)
7880{
7881	if (dev && dev->dev.parent) {
7882		dev_printk_emit(level[1] - '0',
7883				dev->dev.parent,
7884				"%s %s %s%s: %pV",
7885				dev_driver_string(dev->dev.parent),
7886				dev_name(dev->dev.parent),
7887				netdev_name(dev), netdev_reg_state(dev),
7888				vaf);
7889	} else if (dev) {
7890		printk("%s%s%s: %pV",
7891		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7892	} else {
7893		printk("%s(NULL net_device): %pV", level, vaf);
7894	}
7895}
 
7896
7897void netdev_printk(const char *level, const struct net_device *dev,
7898		   const char *format, ...)
7899{
7900	struct va_format vaf;
7901	va_list args;
 
7902
7903	va_start(args, format);
7904
7905	vaf.fmt = format;
7906	vaf.va = &args;
7907
7908	__netdev_printk(level, dev, &vaf);
7909
7910	va_end(args);
 
 
7911}
7912EXPORT_SYMBOL(netdev_printk);
7913
7914#define define_netdev_printk_level(func, level)			\
7915void func(const struct net_device *dev, const char *fmt, ...)	\
7916{								\
 
7917	struct va_format vaf;					\
7918	va_list args;						\
7919								\
7920	va_start(args, fmt);					\
7921								\
7922	vaf.fmt = fmt;						\
7923	vaf.va = &args;						\
7924								\
7925	__netdev_printk(level, dev, &vaf);			\
7926								\
7927	va_end(args);						\
 
 
7928}								\
7929EXPORT_SYMBOL(func);
7930
7931define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7932define_netdev_printk_level(netdev_alert, KERN_ALERT);
7933define_netdev_printk_level(netdev_crit, KERN_CRIT);
7934define_netdev_printk_level(netdev_err, KERN_ERR);
7935define_netdev_printk_level(netdev_warn, KERN_WARNING);
7936define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7937define_netdev_printk_level(netdev_info, KERN_INFO);
7938
7939static void __net_exit netdev_exit(struct net *net)
7940{
7941	kfree(net->dev_name_head);
7942	kfree(net->dev_index_head);
7943}
7944
7945static struct pernet_operations __net_initdata netdev_net_ops = {
7946	.init = netdev_init,
7947	.exit = netdev_exit,
7948};
7949
7950static void __net_exit default_device_exit(struct net *net)
7951{
7952	struct net_device *dev, *aux;
7953	/*
7954	 * Push all migratable network devices back to the
7955	 * initial network namespace
7956	 */
7957	rtnl_lock();
7958	for_each_netdev_safe(net, dev, aux) {
7959		int err;
7960		char fb_name[IFNAMSIZ];
7961
7962		/* Ignore unmoveable devices (i.e. loopback) */
7963		if (dev->features & NETIF_F_NETNS_LOCAL)
7964			continue;
7965
7966		/* Leave virtual devices for the generic cleanup */
7967		if (dev->rtnl_link_ops)
7968			continue;
7969
7970		/* Push remaining network devices to init_net */
7971		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7972		err = dev_change_net_namespace(dev, &init_net, fb_name);
7973		if (err) {
7974			pr_emerg("%s: failed to move %s to init_net: %d\n",
7975				 __func__, dev->name, err);
7976			BUG();
7977		}
7978	}
7979	rtnl_unlock();
7980}
7981
7982static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7983{
7984	/* Return with the rtnl_lock held when there are no network
7985	 * devices unregistering in any network namespace in net_list.
7986	 */
7987	struct net *net;
7988	bool unregistering;
7989	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7990
7991	add_wait_queue(&netdev_unregistering_wq, &wait);
7992	for (;;) {
7993		unregistering = false;
7994		rtnl_lock();
7995		list_for_each_entry(net, net_list, exit_list) {
7996			if (net->dev_unreg_count > 0) {
7997				unregistering = true;
7998				break;
7999			}
8000		}
8001		if (!unregistering)
8002			break;
8003		__rtnl_unlock();
8004
8005		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8006	}
8007	remove_wait_queue(&netdev_unregistering_wq, &wait);
8008}
8009
8010static void __net_exit default_device_exit_batch(struct list_head *net_list)
8011{
8012	/* At exit all network devices most be removed from a network
8013	 * namespace.  Do this in the reverse order of registration.
8014	 * Do this across as many network namespaces as possible to
8015	 * improve batching efficiency.
8016	 */
8017	struct net_device *dev;
8018	struct net *net;
8019	LIST_HEAD(dev_kill_list);
8020
8021	/* To prevent network device cleanup code from dereferencing
8022	 * loopback devices or network devices that have been freed
8023	 * wait here for all pending unregistrations to complete,
8024	 * before unregistring the loopback device and allowing the
8025	 * network namespace be freed.
8026	 *
8027	 * The netdev todo list containing all network devices
8028	 * unregistrations that happen in default_device_exit_batch
8029	 * will run in the rtnl_unlock() at the end of
8030	 * default_device_exit_batch.
8031	 */
8032	rtnl_lock_unregistering(net_list);
8033	list_for_each_entry(net, net_list, exit_list) {
8034		for_each_netdev_reverse(net, dev) {
8035			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8036				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8037			else
8038				unregister_netdevice_queue(dev, &dev_kill_list);
8039		}
8040	}
8041	unregister_netdevice_many(&dev_kill_list);
 
8042	rtnl_unlock();
8043}
8044
8045static struct pernet_operations __net_initdata default_device_ops = {
8046	.exit = default_device_exit,
8047	.exit_batch = default_device_exit_batch,
8048};
8049
8050/*
8051 *	Initialize the DEV module. At boot time this walks the device list and
8052 *	unhooks any devices that fail to initialise (normally hardware not
8053 *	present) and leaves us with a valid list of present and active devices.
8054 *
8055 */
8056
8057/*
8058 *       This is called single threaded during boot, so no need
8059 *       to take the rtnl semaphore.
8060 */
8061static int __init net_dev_init(void)
8062{
8063	int i, rc = -ENOMEM;
8064
8065	BUG_ON(!dev_boot_phase);
8066
8067	if (dev_proc_init())
8068		goto out;
8069
8070	if (netdev_kobject_init())
8071		goto out;
8072
8073	INIT_LIST_HEAD(&ptype_all);
8074	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8075		INIT_LIST_HEAD(&ptype_base[i]);
8076
8077	INIT_LIST_HEAD(&offload_base);
8078
8079	if (register_pernet_subsys(&netdev_net_ops))
8080		goto out;
8081
8082	/*
8083	 *	Initialise the packet receive queues.
8084	 */
8085
8086	for_each_possible_cpu(i) {
8087		struct softnet_data *sd = &per_cpu(softnet_data, i);
8088
 
8089		skb_queue_head_init(&sd->input_pkt_queue);
8090		skb_queue_head_init(&sd->process_queue);
 
8091		INIT_LIST_HEAD(&sd->poll_list);
 
8092		sd->output_queue_tailp = &sd->output_queue;
8093#ifdef CONFIG_RPS
8094		sd->csd.func = rps_trigger_softirq;
8095		sd->csd.info = sd;
 
8096		sd->cpu = i;
8097#endif
8098
8099		sd->backlog.poll = process_backlog;
8100		sd->backlog.weight = weight_p;
 
 
8101	}
8102
8103	dev_boot_phase = 0;
8104
8105	/* The loopback device is special if any other network devices
8106	 * is present in a network namespace the loopback device must
8107	 * be present. Since we now dynamically allocate and free the
8108	 * loopback device ensure this invariant is maintained by
8109	 * keeping the loopback device as the first device on the
8110	 * list of network devices.  Ensuring the loopback devices
8111	 * is the first device that appears and the last network device
8112	 * that disappears.
8113	 */
8114	if (register_pernet_device(&loopback_net_ops))
8115		goto out;
8116
8117	if (register_pernet_device(&default_device_ops))
8118		goto out;
8119
8120	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8121	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8122
8123	hotcpu_notifier(dev_cpu_callback, 0);
8124	dst_subsys_init();
 
8125	rc = 0;
8126out:
8127	return rc;
8128}
8129
8130subsys_initcall(net_dev_init);
v3.5.6
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
 
  99#include <linux/rtnetlink.h>
 100#include <linux/proc_fs.h>
 101#include <linux/seq_file.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/net_tstamp.h>
 136#include <linux/static_key.h>
 137#include <net/flow_keys.h>
 
 
 
 
 
 
 138
 139#include "net-sysfs.h"
 140
 141/* Instead of increasing this, you should create a hash table. */
 142#define MAX_GRO_SKBS 8
 143
 144/* This should be increased if a protocol with a bigger head is added. */
 145#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147/*
 148 *	The list of packet types we will receive (as opposed to discard)
 149 *	and the routines to invoke.
 150 *
 151 *	Why 16. Because with 16 the only overlap we get on a hash of the
 152 *	low nibble of the protocol value is RARP/SNAP/X.25.
 153 *
 154 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155 *             sure which should go first, but I bet it won't make much
 156 *             difference if we are running VLANs.  The good news is that
 157 *             this protocol won't be in the list unless compiled in, so
 158 *             the average user (w/out VLANs) will not be adversely affected.
 159 *             --BLG
 160 *
 161 *		0800	IP
 162 *		8100    802.1Q VLAN
 163 *		0001	802.3
 164 *		0002	AX.25
 165 *		0004	802.2
 166 *		8035	RARP
 167 *		0005	SNAP
 168 *		0805	X.25
 169 *		0806	ARP
 170 *		8137	IPX
 171 *		0009	Localtalk
 172 *		86DD	IPv6
 173 */
 174
 175#define PTYPE_HASH_SIZE	(16)
 176#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 177
 178static DEFINE_SPINLOCK(ptype_lock);
 179static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 180static struct list_head ptype_all __read_mostly;	/* Taps */
 
 
 
 
 
 
 
 181
 182/*
 183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 184 * semaphore.
 185 *
 186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 187 *
 188 * Writers must hold the rtnl semaphore while they loop through the
 189 * dev_base_head list, and hold dev_base_lock for writing when they do the
 190 * actual updates.  This allows pure readers to access the list even
 191 * while a writer is preparing to update it.
 192 *
 193 * To put it another way, dev_base_lock is held for writing only to
 194 * protect against pure readers; the rtnl semaphore provides the
 195 * protection against other writers.
 196 *
 197 * See, for example usages, register_netdevice() and
 198 * unregister_netdevice(), which must be called with the rtnl
 199 * semaphore held.
 200 */
 201DEFINE_RWLOCK(dev_base_lock);
 202EXPORT_SYMBOL(dev_base_lock);
 203
 
 
 
 
 
 
 
 
 204static inline void dev_base_seq_inc(struct net *net)
 205{
 206	while (++net->dev_base_seq == 0);
 207}
 208
 209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 210{
 211	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 212
 213	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214}
 215
 216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217{
 218	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219}
 220
 221static inline void rps_lock(struct softnet_data *sd)
 222{
 223#ifdef CONFIG_RPS
 224	spin_lock(&sd->input_pkt_queue.lock);
 225#endif
 226}
 227
 228static inline void rps_unlock(struct softnet_data *sd)
 229{
 230#ifdef CONFIG_RPS
 231	spin_unlock(&sd->input_pkt_queue.lock);
 232#endif
 233}
 234
 235/* Device list insertion */
 236static int list_netdevice(struct net_device *dev)
 237{
 238	struct net *net = dev_net(dev);
 239
 240	ASSERT_RTNL();
 241
 242	write_lock_bh(&dev_base_lock);
 243	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 244	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 245	hlist_add_head_rcu(&dev->index_hlist,
 246			   dev_index_hash(net, dev->ifindex));
 247	write_unlock_bh(&dev_base_lock);
 248
 249	dev_base_seq_inc(net);
 250
 251	return 0;
 252}
 253
 254/* Device list removal
 255 * caller must respect a RCU grace period before freeing/reusing dev
 256 */
 257static void unlist_netdevice(struct net_device *dev)
 258{
 259	ASSERT_RTNL();
 260
 261	/* Unlink dev from the device chain */
 262	write_lock_bh(&dev_base_lock);
 263	list_del_rcu(&dev->dev_list);
 264	hlist_del_rcu(&dev->name_hlist);
 265	hlist_del_rcu(&dev->index_hlist);
 266	write_unlock_bh(&dev_base_lock);
 267
 268	dev_base_seq_inc(dev_net(dev));
 269}
 270
 271/*
 272 *	Our notifier list
 273 */
 274
 275static RAW_NOTIFIER_HEAD(netdev_chain);
 276
 277/*
 278 *	Device drivers call our routines to queue packets here. We empty the
 279 *	queue in the local softnet handler.
 280 */
 281
 282DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 283EXPORT_PER_CPU_SYMBOL(softnet_data);
 284
 285#ifdef CONFIG_LOCKDEP
 286/*
 287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 288 * according to dev->type
 289 */
 290static const unsigned short netdev_lock_type[] =
 291	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 292	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 293	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 294	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 295	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 296	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 297	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 298	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 299	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 300	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 301	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 302	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 303	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 304	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 305	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 306
 307static const char *const netdev_lock_name[] =
 308	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 309	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 310	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 311	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 312	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 313	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 314	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 315	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 316	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 317	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 318	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 319	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 320	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 321	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 322	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 323
 324static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 325static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 326
 327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 328{
 329	int i;
 330
 331	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 332		if (netdev_lock_type[i] == dev_type)
 333			return i;
 334	/* the last key is used by default */
 335	return ARRAY_SIZE(netdev_lock_type) - 1;
 336}
 337
 338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 339						 unsigned short dev_type)
 340{
 341	int i;
 342
 343	i = netdev_lock_pos(dev_type);
 344	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 345				   netdev_lock_name[i]);
 346}
 347
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350	int i;
 351
 352	i = netdev_lock_pos(dev->type);
 353	lockdep_set_class_and_name(&dev->addr_list_lock,
 354				   &netdev_addr_lock_key[i],
 355				   netdev_lock_name[i]);
 356}
 357#else
 358static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 359						 unsigned short dev_type)
 360{
 361}
 362static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 363{
 364}
 365#endif
 366
 367/*******************************************************************************
 368
 369		Protocol management and registration routines
 370
 371*******************************************************************************/
 372
 373/*
 374 *	Add a protocol ID to the list. Now that the input handler is
 375 *	smarter we can dispense with all the messy stuff that used to be
 376 *	here.
 377 *
 378 *	BEWARE!!! Protocol handlers, mangling input packets,
 379 *	MUST BE last in hash buckets and checking protocol handlers
 380 *	MUST start from promiscuous ptype_all chain in net_bh.
 381 *	It is true now, do not change it.
 382 *	Explanation follows: if protocol handler, mangling packet, will
 383 *	be the first on list, it is not able to sense, that packet
 384 *	is cloned and should be copied-on-write, so that it will
 385 *	change it and subsequent readers will get broken packet.
 386 *							--ANK (980803)
 387 */
 388
 389static inline struct list_head *ptype_head(const struct packet_type *pt)
 390{
 391	if (pt->type == htons(ETH_P_ALL))
 392		return &ptype_all;
 393	else
 394		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 
 395}
 396
 397/**
 398 *	dev_add_pack - add packet handler
 399 *	@pt: packet type declaration
 400 *
 401 *	Add a protocol handler to the networking stack. The passed &packet_type
 402 *	is linked into kernel lists and may not be freed until it has been
 403 *	removed from the kernel lists.
 404 *
 405 *	This call does not sleep, therefore it cannot
 406 *	guarantee that all CPUs that are in the middle of receiving packets
 407 *	will see the new packet type (until the next received packet).
 408 */
 409
 410void dev_add_pack(struct packet_type *pt)
 411{
 412	struct list_head *head = ptype_head(pt);
 413
 414	spin_lock(&ptype_lock);
 415	list_add_rcu(&pt->list, head);
 416	spin_unlock(&ptype_lock);
 417}
 418EXPORT_SYMBOL(dev_add_pack);
 419
 420/**
 421 *	__dev_remove_pack	 - remove packet handler
 422 *	@pt: packet type declaration
 423 *
 424 *	Remove a protocol handler that was previously added to the kernel
 425 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 426 *	from the kernel lists, but it is not yet safe to free or reuse it
 427 *	when this function returns.
 428 *
 429 *	The packet type might still be in use by receivers
 430 *	and must not be freed until after all the CPUs have gone
 431 *	through a quiescent state.
 432 */
 433void __dev_remove_pack(struct packet_type *pt)
 434{
 435	struct list_head *head = ptype_head(pt);
 436	struct packet_type *pt1;
 437
 438	spin_lock(&ptype_lock);
 439
 440	list_for_each_entry(pt1, head, list) {
 441		if (pt == pt1) {
 442			list_del_rcu(&pt->list);
 443			goto out;
 444		}
 445	}
 446
 447	pr_warn("dev_remove_pack: %p not found\n", pt);
 448out:
 449	spin_unlock(&ptype_lock);
 450}
 451EXPORT_SYMBOL(__dev_remove_pack);
 452
 453/**
 454 *	dev_remove_pack	 - remove packet handler
 455 *	@pt: packet type declaration
 456 *
 457 *	Remove a protocol handler that was previously added to the kernel
 458 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 459 *	from the kernel lists and can be freed or reused once this function
 460 *	returns.
 461 *
 462 *	This call sleeps to guarantee that no CPU is looking at the packet
 463 *	type after return.
 464 */
 465void dev_remove_pack(struct packet_type *pt)
 466{
 467	__dev_remove_pack(pt);
 468
 469	synchronize_net();
 470}
 471EXPORT_SYMBOL(dev_remove_pack);
 472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 473/******************************************************************************
 474
 475		      Device Boot-time Settings Routines
 476
 477*******************************************************************************/
 478
 479/* Boot time configuration table */
 480static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 481
 482/**
 483 *	netdev_boot_setup_add	- add new setup entry
 484 *	@name: name of the device
 485 *	@map: configured settings for the device
 486 *
 487 *	Adds new setup entry to the dev_boot_setup list.  The function
 488 *	returns 0 on error and 1 on success.  This is a generic routine to
 489 *	all netdevices.
 490 */
 491static int netdev_boot_setup_add(char *name, struct ifmap *map)
 492{
 493	struct netdev_boot_setup *s;
 494	int i;
 495
 496	s = dev_boot_setup;
 497	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 498		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 499			memset(s[i].name, 0, sizeof(s[i].name));
 500			strlcpy(s[i].name, name, IFNAMSIZ);
 501			memcpy(&s[i].map, map, sizeof(s[i].map));
 502			break;
 503		}
 504	}
 505
 506	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 507}
 508
 509/**
 510 *	netdev_boot_setup_check	- check boot time settings
 511 *	@dev: the netdevice
 512 *
 513 * 	Check boot time settings for the device.
 514 *	The found settings are set for the device to be used
 515 *	later in the device probing.
 516 *	Returns 0 if no settings found, 1 if they are.
 517 */
 518int netdev_boot_setup_check(struct net_device *dev)
 519{
 520	struct netdev_boot_setup *s = dev_boot_setup;
 521	int i;
 522
 523	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 524		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 525		    !strcmp(dev->name, s[i].name)) {
 526			dev->irq 	= s[i].map.irq;
 527			dev->base_addr 	= s[i].map.base_addr;
 528			dev->mem_start 	= s[i].map.mem_start;
 529			dev->mem_end 	= s[i].map.mem_end;
 530			return 1;
 531		}
 532	}
 533	return 0;
 534}
 535EXPORT_SYMBOL(netdev_boot_setup_check);
 536
 537
 538/**
 539 *	netdev_boot_base	- get address from boot time settings
 540 *	@prefix: prefix for network device
 541 *	@unit: id for network device
 542 *
 543 * 	Check boot time settings for the base address of device.
 544 *	The found settings are set for the device to be used
 545 *	later in the device probing.
 546 *	Returns 0 if no settings found.
 547 */
 548unsigned long netdev_boot_base(const char *prefix, int unit)
 549{
 550	const struct netdev_boot_setup *s = dev_boot_setup;
 551	char name[IFNAMSIZ];
 552	int i;
 553
 554	sprintf(name, "%s%d", prefix, unit);
 555
 556	/*
 557	 * If device already registered then return base of 1
 558	 * to indicate not to probe for this interface
 559	 */
 560	if (__dev_get_by_name(&init_net, name))
 561		return 1;
 562
 563	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 564		if (!strcmp(name, s[i].name))
 565			return s[i].map.base_addr;
 566	return 0;
 567}
 568
 569/*
 570 * Saves at boot time configured settings for any netdevice.
 571 */
 572int __init netdev_boot_setup(char *str)
 573{
 574	int ints[5];
 575	struct ifmap map;
 576
 577	str = get_options(str, ARRAY_SIZE(ints), ints);
 578	if (!str || !*str)
 579		return 0;
 580
 581	/* Save settings */
 582	memset(&map, 0, sizeof(map));
 583	if (ints[0] > 0)
 584		map.irq = ints[1];
 585	if (ints[0] > 1)
 586		map.base_addr = ints[2];
 587	if (ints[0] > 2)
 588		map.mem_start = ints[3];
 589	if (ints[0] > 3)
 590		map.mem_end = ints[4];
 591
 592	/* Add new entry to the list */
 593	return netdev_boot_setup_add(str, &map);
 594}
 595
 596__setup("netdev=", netdev_boot_setup);
 597
 598/*******************************************************************************
 599
 600			    Device Interface Subroutines
 601
 602*******************************************************************************/
 603
 604/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 605 *	__dev_get_by_name	- find a device by its name
 606 *	@net: the applicable net namespace
 607 *	@name: name to find
 608 *
 609 *	Find an interface by name. Must be called under RTNL semaphore
 610 *	or @dev_base_lock. If the name is found a pointer to the device
 611 *	is returned. If the name is not found then %NULL is returned. The
 612 *	reference counters are not incremented so the caller must be
 613 *	careful with locks.
 614 */
 615
 616struct net_device *__dev_get_by_name(struct net *net, const char *name)
 617{
 618	struct hlist_node *p;
 619	struct net_device *dev;
 620	struct hlist_head *head = dev_name_hash(net, name);
 621
 622	hlist_for_each_entry(dev, p, head, name_hlist)
 623		if (!strncmp(dev->name, name, IFNAMSIZ))
 624			return dev;
 625
 626	return NULL;
 627}
 628EXPORT_SYMBOL(__dev_get_by_name);
 629
 630/**
 631 *	dev_get_by_name_rcu	- find a device by its name
 632 *	@net: the applicable net namespace
 633 *	@name: name to find
 634 *
 635 *	Find an interface by name.
 636 *	If the name is found a pointer to the device is returned.
 637 * 	If the name is not found then %NULL is returned.
 638 *	The reference counters are not incremented so the caller must be
 639 *	careful with locks. The caller must hold RCU lock.
 640 */
 641
 642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 643{
 644	struct hlist_node *p;
 645	struct net_device *dev;
 646	struct hlist_head *head = dev_name_hash(net, name);
 647
 648	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 649		if (!strncmp(dev->name, name, IFNAMSIZ))
 650			return dev;
 651
 652	return NULL;
 653}
 654EXPORT_SYMBOL(dev_get_by_name_rcu);
 655
 656/**
 657 *	dev_get_by_name		- find a device by its name
 658 *	@net: the applicable net namespace
 659 *	@name: name to find
 660 *
 661 *	Find an interface by name. This can be called from any
 662 *	context and does its own locking. The returned handle has
 663 *	the usage count incremented and the caller must use dev_put() to
 664 *	release it when it is no longer needed. %NULL is returned if no
 665 *	matching device is found.
 666 */
 667
 668struct net_device *dev_get_by_name(struct net *net, const char *name)
 669{
 670	struct net_device *dev;
 671
 672	rcu_read_lock();
 673	dev = dev_get_by_name_rcu(net, name);
 674	if (dev)
 675		dev_hold(dev);
 676	rcu_read_unlock();
 677	return dev;
 678}
 679EXPORT_SYMBOL(dev_get_by_name);
 680
 681/**
 682 *	__dev_get_by_index - find a device by its ifindex
 683 *	@net: the applicable net namespace
 684 *	@ifindex: index of device
 685 *
 686 *	Search for an interface by index. Returns %NULL if the device
 687 *	is not found or a pointer to the device. The device has not
 688 *	had its reference counter increased so the caller must be careful
 689 *	about locking. The caller must hold either the RTNL semaphore
 690 *	or @dev_base_lock.
 691 */
 692
 693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 694{
 695	struct hlist_node *p;
 696	struct net_device *dev;
 697	struct hlist_head *head = dev_index_hash(net, ifindex);
 698
 699	hlist_for_each_entry(dev, p, head, index_hlist)
 700		if (dev->ifindex == ifindex)
 701			return dev;
 702
 703	return NULL;
 704}
 705EXPORT_SYMBOL(__dev_get_by_index);
 706
 707/**
 708 *	dev_get_by_index_rcu - find a device by its ifindex
 709 *	@net: the applicable net namespace
 710 *	@ifindex: index of device
 711 *
 712 *	Search for an interface by index. Returns %NULL if the device
 713 *	is not found or a pointer to the device. The device has not
 714 *	had its reference counter increased so the caller must be careful
 715 *	about locking. The caller must hold RCU lock.
 716 */
 717
 718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 719{
 720	struct hlist_node *p;
 721	struct net_device *dev;
 722	struct hlist_head *head = dev_index_hash(net, ifindex);
 723
 724	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 725		if (dev->ifindex == ifindex)
 726			return dev;
 727
 728	return NULL;
 729}
 730EXPORT_SYMBOL(dev_get_by_index_rcu);
 731
 732
 733/**
 734 *	dev_get_by_index - find a device by its ifindex
 735 *	@net: the applicable net namespace
 736 *	@ifindex: index of device
 737 *
 738 *	Search for an interface by index. Returns NULL if the device
 739 *	is not found or a pointer to the device. The device returned has
 740 *	had a reference added and the pointer is safe until the user calls
 741 *	dev_put to indicate they have finished with it.
 742 */
 743
 744struct net_device *dev_get_by_index(struct net *net, int ifindex)
 745{
 746	struct net_device *dev;
 747
 748	rcu_read_lock();
 749	dev = dev_get_by_index_rcu(net, ifindex);
 750	if (dev)
 751		dev_hold(dev);
 752	rcu_read_unlock();
 753	return dev;
 754}
 755EXPORT_SYMBOL(dev_get_by_index);
 756
 757/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 758 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 759 *	@net: the applicable net namespace
 760 *	@type: media type of device
 761 *	@ha: hardware address
 762 *
 763 *	Search for an interface by MAC address. Returns NULL if the device
 764 *	is not found or a pointer to the device.
 765 *	The caller must hold RCU or RTNL.
 766 *	The returned device has not had its ref count increased
 767 *	and the caller must therefore be careful about locking
 768 *
 769 */
 770
 771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 772				       const char *ha)
 773{
 774	struct net_device *dev;
 775
 776	for_each_netdev_rcu(net, dev)
 777		if (dev->type == type &&
 778		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 779			return dev;
 780
 781	return NULL;
 782}
 783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 784
 785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786{
 787	struct net_device *dev;
 788
 789	ASSERT_RTNL();
 790	for_each_netdev(net, dev)
 791		if (dev->type == type)
 792			return dev;
 793
 794	return NULL;
 795}
 796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 797
 798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 799{
 800	struct net_device *dev, *ret = NULL;
 801
 802	rcu_read_lock();
 803	for_each_netdev_rcu(net, dev)
 804		if (dev->type == type) {
 805			dev_hold(dev);
 806			ret = dev;
 807			break;
 808		}
 809	rcu_read_unlock();
 810	return ret;
 811}
 812EXPORT_SYMBOL(dev_getfirstbyhwtype);
 813
 814/**
 815 *	dev_get_by_flags_rcu - find any device with given flags
 816 *	@net: the applicable net namespace
 817 *	@if_flags: IFF_* values
 818 *	@mask: bitmask of bits in if_flags to check
 819 *
 820 *	Search for any interface with the given flags. Returns NULL if a device
 821 *	is not found or a pointer to the device. Must be called inside
 822 *	rcu_read_lock(), and result refcount is unchanged.
 823 */
 824
 825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 826				    unsigned short mask)
 827{
 828	struct net_device *dev, *ret;
 829
 
 
 830	ret = NULL;
 831	for_each_netdev_rcu(net, dev) {
 832		if (((dev->flags ^ if_flags) & mask) == 0) {
 833			ret = dev;
 834			break;
 835		}
 836	}
 837	return ret;
 838}
 839EXPORT_SYMBOL(dev_get_by_flags_rcu);
 840
 841/**
 842 *	dev_valid_name - check if name is okay for network device
 843 *	@name: name string
 844 *
 845 *	Network device names need to be valid file names
 846 *	to allow sysfs to work.  We also disallow any kind of
 847 *	whitespace.
 848 */
 849bool dev_valid_name(const char *name)
 850{
 851	if (*name == '\0')
 852		return false;
 853	if (strlen(name) >= IFNAMSIZ)
 854		return false;
 855	if (!strcmp(name, ".") || !strcmp(name, ".."))
 856		return false;
 857
 858	while (*name) {
 859		if (*name == '/' || isspace(*name))
 860			return false;
 861		name++;
 862	}
 863	return true;
 864}
 865EXPORT_SYMBOL(dev_valid_name);
 866
 867/**
 868 *	__dev_alloc_name - allocate a name for a device
 869 *	@net: network namespace to allocate the device name in
 870 *	@name: name format string
 871 *	@buf:  scratch buffer and result name string
 872 *
 873 *	Passed a format string - eg "lt%d" it will try and find a suitable
 874 *	id. It scans list of devices to build up a free map, then chooses
 875 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 876 *	while allocating the name and adding the device in order to avoid
 877 *	duplicates.
 878 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 879 *	Returns the number of the unit assigned or a negative errno code.
 880 */
 881
 882static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 883{
 884	int i = 0;
 885	const char *p;
 886	const int max_netdevices = 8*PAGE_SIZE;
 887	unsigned long *inuse;
 888	struct net_device *d;
 889
 890	p = strnchr(name, IFNAMSIZ-1, '%');
 891	if (p) {
 892		/*
 893		 * Verify the string as this thing may have come from
 894		 * the user.  There must be either one "%d" and no other "%"
 895		 * characters.
 896		 */
 897		if (p[1] != 'd' || strchr(p + 2, '%'))
 898			return -EINVAL;
 899
 900		/* Use one page as a bit array of possible slots */
 901		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 902		if (!inuse)
 903			return -ENOMEM;
 904
 905		for_each_netdev(net, d) {
 906			if (!sscanf(d->name, name, &i))
 907				continue;
 908			if (i < 0 || i >= max_netdevices)
 909				continue;
 910
 911			/*  avoid cases where sscanf is not exact inverse of printf */
 912			snprintf(buf, IFNAMSIZ, name, i);
 913			if (!strncmp(buf, d->name, IFNAMSIZ))
 914				set_bit(i, inuse);
 915		}
 916
 917		i = find_first_zero_bit(inuse, max_netdevices);
 918		free_page((unsigned long) inuse);
 919	}
 920
 921	if (buf != name)
 922		snprintf(buf, IFNAMSIZ, name, i);
 923	if (!__dev_get_by_name(net, buf))
 924		return i;
 925
 926	/* It is possible to run out of possible slots
 927	 * when the name is long and there isn't enough space left
 928	 * for the digits, or if all bits are used.
 929	 */
 930	return -ENFILE;
 931}
 932
 933/**
 934 *	dev_alloc_name - allocate a name for a device
 935 *	@dev: device
 936 *	@name: name format string
 937 *
 938 *	Passed a format string - eg "lt%d" it will try and find a suitable
 939 *	id. It scans list of devices to build up a free map, then chooses
 940 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 941 *	while allocating the name and adding the device in order to avoid
 942 *	duplicates.
 943 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 944 *	Returns the number of the unit assigned or a negative errno code.
 945 */
 946
 947int dev_alloc_name(struct net_device *dev, const char *name)
 948{
 949	char buf[IFNAMSIZ];
 950	struct net *net;
 951	int ret;
 952
 953	BUG_ON(!dev_net(dev));
 954	net = dev_net(dev);
 955	ret = __dev_alloc_name(net, name, buf);
 956	if (ret >= 0)
 957		strlcpy(dev->name, buf, IFNAMSIZ);
 958	return ret;
 959}
 960EXPORT_SYMBOL(dev_alloc_name);
 961
 962static int dev_get_valid_name(struct net_device *dev, const char *name)
 
 
 963{
 964	struct net *net;
 
 
 
 
 
 
 
 965
 966	BUG_ON(!dev_net(dev));
 967	net = dev_net(dev);
 
 
 
 968
 969	if (!dev_valid_name(name))
 970		return -EINVAL;
 971
 972	if (strchr(name, '%'))
 973		return dev_alloc_name(dev, name);
 974	else if (__dev_get_by_name(net, name))
 975		return -EEXIST;
 976	else if (dev->name != name)
 977		strlcpy(dev->name, name, IFNAMSIZ);
 978
 979	return 0;
 980}
 981
 982/**
 983 *	dev_change_name - change name of a device
 984 *	@dev: device
 985 *	@newname: name (or format string) must be at least IFNAMSIZ
 986 *
 987 *	Change the name of a device. Format strings such as "eth%d"
 988 *	can be passed for wildcarding.
 989 */
 990int dev_change_name(struct net_device *dev, const char *newname)
 991{
 
 992	char oldname[IFNAMSIZ];
 993	int err = 0;
 994	int ret;
 995	struct net *net;
 996
 997	ASSERT_RTNL();
 998	BUG_ON(!dev_net(dev));
 999
1000	net = dev_net(dev);
1001	if (dev->flags & IFF_UP)
1002		return -EBUSY;
1003
1004	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 
 
 
1005		return 0;
 
1006
1007	memcpy(oldname, dev->name, IFNAMSIZ);
1008
1009	err = dev_get_valid_name(dev, newname);
1010	if (err < 0)
 
1011		return err;
 
 
 
 
 
 
 
1012
1013rollback:
1014	ret = device_rename(&dev->dev, dev->name);
1015	if (ret) {
1016		memcpy(dev->name, oldname, IFNAMSIZ);
 
 
1017		return ret;
1018	}
1019
 
 
 
 
1020	write_lock_bh(&dev_base_lock);
1021	hlist_del_rcu(&dev->name_hlist);
1022	write_unlock_bh(&dev_base_lock);
1023
1024	synchronize_rcu();
1025
1026	write_lock_bh(&dev_base_lock);
1027	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028	write_unlock_bh(&dev_base_lock);
1029
1030	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031	ret = notifier_to_errno(ret);
1032
1033	if (ret) {
1034		/* err >= 0 after dev_alloc_name() or stores the first errno */
1035		if (err >= 0) {
1036			err = ret;
 
1037			memcpy(dev->name, oldname, IFNAMSIZ);
 
 
 
1038			goto rollback;
1039		} else {
1040			pr_err("%s: name change rollback failed: %d\n",
1041			       dev->name, ret);
1042		}
1043	}
1044
1045	return err;
1046}
1047
1048/**
1049 *	dev_set_alias - change ifalias of a device
1050 *	@dev: device
1051 *	@alias: name up to IFALIASZ
1052 *	@len: limit of bytes to copy from info
1053 *
1054 *	Set ifalias for a device.
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058	char *new_ifalias;
1059
1060	ASSERT_RTNL();
1061
1062	if (len >= IFALIASZ)
1063		return -EINVAL;
1064
1065	if (!len) {
1066		if (dev->ifalias) {
1067			kfree(dev->ifalias);
1068			dev->ifalias = NULL;
1069		}
1070		return 0;
1071	}
1072
1073	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074	if (!new_ifalias)
1075		return -ENOMEM;
1076	dev->ifalias = new_ifalias;
1077
1078	strlcpy(dev->ifalias, alias, len+1);
1079	return len;
1080}
1081
1082
1083/**
1084 *	netdev_features_change - device changes features
1085 *	@dev: device to cause notification
1086 *
1087 *	Called to indicate a device has changed features.
1088 */
1089void netdev_features_change(struct net_device *dev)
1090{
1091	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092}
1093EXPORT_SYMBOL(netdev_features_change);
1094
1095/**
1096 *	netdev_state_change - device changes state
1097 *	@dev: device to cause notification
1098 *
1099 *	Called to indicate a device has changed state. This function calls
1100 *	the notifier chains for netdev_chain and sends a NEWLINK message
1101 *	to the routing socket.
1102 */
1103void netdev_state_change(struct net_device *dev)
1104{
1105	if (dev->flags & IFF_UP) {
1106		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 
 
 
 
1108	}
1109}
1110EXPORT_SYMBOL(netdev_state_change);
1111
/* Send @event for @dev down the netdevice notifier chain and return its result. */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int verdict = call_netdevice_notifiers(event, dev);

	return verdict;
}
1116EXPORT_SYMBOL(netdev_bonding_change);
1117
1118/**
1119 *	dev_load 	- load a network module
1120 *	@net: the applicable net namespace
1121 *	@name: name of interface
1122 *
1123 *	If a network interface is not present and the process has suitable
1124 *	privileges this function loads the module. If module loading is not
1125 *	available in this kernel then it becomes a nop.
 
 
1126 */
1127
1128void dev_load(struct net *net, const char *name)
1129{
1130	struct net_device *dev;
1131	int no_module;
1132
1133	rcu_read_lock();
1134	dev = dev_get_by_name_rcu(net, name);
1135	rcu_read_unlock();
1136
1137	no_module = !dev;
1138	if (no_module && capable(CAP_NET_ADMIN))
1139		no_module = request_module("netdev-%s", name);
1140	if (no_module && capable(CAP_SYS_MODULE)) {
1141		if (!request_module("%s", name))
1142			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1143				name);
1144	}
1145}
1146EXPORT_SYMBOL(dev_load);
1147
1148static int __dev_open(struct net_device *dev)
1149{
1150	const struct net_device_ops *ops = dev->netdev_ops;
1151	int ret;
1152
1153	ASSERT_RTNL();
1154
1155	if (!netif_device_present(dev))
1156		return -ENODEV;
1157
 
 
 
 
 
 
1158	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1159	ret = notifier_to_errno(ret);
1160	if (ret)
1161		return ret;
1162
1163	set_bit(__LINK_STATE_START, &dev->state);
1164
1165	if (ops->ndo_validate_addr)
1166		ret = ops->ndo_validate_addr(dev);
1167
1168	if (!ret && ops->ndo_open)
1169		ret = ops->ndo_open(dev);
1170
 
 
1171	if (ret)
1172		clear_bit(__LINK_STATE_START, &dev->state);
1173	else {
1174		dev->flags |= IFF_UP;
1175		net_dmaengine_get();
1176		dev_set_rx_mode(dev);
1177		dev_activate(dev);
1178		add_device_randomness(dev->dev_addr, dev->addr_len);
1179	}
1180
1181	return ret;
1182}
1183
1184/**
1185 *	dev_open	- prepare an interface for use.
1186 *	@dev:	device to open
1187 *
1188 *	Takes a device from down to up state. The device's private open
1189 *	function is invoked and then the multicast lists are loaded. Finally
1190 *	the device is moved into the up state and a %NETDEV_UP message is
1191 *	sent to the netdev notifier chain.
1192 *
1193 *	Calling this function on an active interface is a nop. On a failure
1194 *	a negative errno code is returned.
1195 */
1196int dev_open(struct net_device *dev)
1197{
1198	int ret;
1199
1200	if (dev->flags & IFF_UP)
1201		return 0;
1202
1203	ret = __dev_open(dev);
1204	if (ret < 0)
1205		return ret;
1206
1207	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208	call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210	return ret;
1211}
1212EXPORT_SYMBOL(dev_open);
1213
1214static int __dev_close_many(struct list_head *head)
1215{
1216	struct net_device *dev;
1217
1218	ASSERT_RTNL();
1219	might_sleep();
1220
1221	list_for_each_entry(dev, head, unreg_list) {
 
 
 
1222		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1223
1224		clear_bit(__LINK_STATE_START, &dev->state);
1225
1226		/* Synchronize to scheduled poll. We cannot touch poll list, it
1227		 * can be even on different cpu. So just clear netif_running().
1228		 *
1229		 * dev->stop() will invoke napi_disable() on all of it's
1230		 * napi_struct instances on this device.
1231		 */
1232		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1233	}
1234
1235	dev_deactivate_many(head);
1236
1237	list_for_each_entry(dev, head, unreg_list) {
1238		const struct net_device_ops *ops = dev->netdev_ops;
1239
1240		/*
1241		 *	Call the device specific close. This cannot fail.
1242		 *	Only if device is UP
1243		 *
1244		 *	We allow it to be called even after a DETACH hot-plug
1245		 *	event.
1246		 */
1247		if (ops->ndo_stop)
1248			ops->ndo_stop(dev);
1249
1250		dev->flags &= ~IFF_UP;
1251		net_dmaengine_put();
1252	}
1253
1254	return 0;
1255}
1256
1257static int __dev_close(struct net_device *dev)
1258{
1259	int retval;
1260	LIST_HEAD(single);
1261
1262	list_add(&dev->unreg_list, &single);
1263	retval = __dev_close_many(&single);
1264	list_del(&single);
 
1265	return retval;
1266}
1267
1268static int dev_close_many(struct list_head *head)
1269{
1270	struct net_device *dev, *tmp;
1271	LIST_HEAD(tmp_list);
1272
1273	list_for_each_entry_safe(dev, tmp, head, unreg_list)
 
1274		if (!(dev->flags & IFF_UP))
1275			list_move(&dev->unreg_list, &tmp_list);
1276
1277	__dev_close_many(head);
1278
1279	list_for_each_entry(dev, head, unreg_list) {
1280		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281		call_netdevice_notifiers(NETDEV_DOWN, dev);
 
 
1282	}
1283
1284	/* rollback_registered_many needs the complete original list */
1285	list_splice(&tmp_list, head);
1286	return 0;
1287}
 
1288
1289/**
1290 *	dev_close - shutdown an interface.
1291 *	@dev: device to shutdown
1292 *
1293 *	This function moves an active device into down state. A
1294 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296 *	chain.
1297 */
1298int dev_close(struct net_device *dev)
1299{
1300	if (dev->flags & IFF_UP) {
1301		LIST_HEAD(single);
1302
1303		list_add(&dev->unreg_list, &single);
1304		dev_close_many(&single);
1305		list_del(&single);
1306	}
1307	return 0;
1308}
1309EXPORT_SYMBOL(dev_close);
1310
1311
1312/**
1313 *	dev_disable_lro - disable Large Receive Offload on a device
1314 *	@dev: device
1315 *
1316 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1317 *	called under RTNL.  This is needed if received packets may be
1318 *	forwarded to another interface.
1319 */
1320void dev_disable_lro(struct net_device *dev)
1321{
1322	/*
1323	 * If we're trying to disable lro on a vlan device
1324	 * use the underlying physical device instead
1325	 */
1326	if (is_vlan_dev(dev))
1327		dev = vlan_dev_real_dev(dev);
1328
1329	dev->wanted_features &= ~NETIF_F_LRO;
1330	netdev_update_features(dev);
1331
1332	if (unlikely(dev->features & NETIF_F_LRO))
1333		netdev_WARN(dev, "failed to disable LRO!\n");
 
 
 
1334}
1335EXPORT_SYMBOL(dev_disable_lro);
1336
 
 
 
 
 
 
 
 
1337
1338static int dev_boot_phase = 1;
1339
1340/**
1341 *	register_netdevice_notifier - register a network notifier block
1342 *	@nb: notifier
1343 *
1344 *	Register a notifier to be called when network device events occur.
1345 *	The notifier passed is linked into the kernel structures and must
1346 *	not be reused until it has been unregistered. A negative errno code
1347 *	is returned on a failure.
1348 *
1349 * 	When registered all registration and up events are replayed
1350 *	to the new notifier to allow device to have a race free
1351 *	view of the network device list.
1352 */
1353
1354int register_netdevice_notifier(struct notifier_block *nb)
1355{
1356	struct net_device *dev;
1357	struct net_device *last;
1358	struct net *net;
1359	int err;
1360
1361	rtnl_lock();
1362	err = raw_notifier_chain_register(&netdev_chain, nb);
1363	if (err)
1364		goto unlock;
1365	if (dev_boot_phase)
1366		goto unlock;
1367	for_each_net(net) {
1368		for_each_netdev(net, dev) {
1369			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370			err = notifier_to_errno(err);
1371			if (err)
1372				goto rollback;
1373
1374			if (!(dev->flags & IFF_UP))
1375				continue;
1376
1377			nb->notifier_call(nb, NETDEV_UP, dev);
1378		}
1379	}
1380
1381unlock:
1382	rtnl_unlock();
1383	return err;
1384
1385rollback:
1386	last = dev;
1387	for_each_net(net) {
1388		for_each_netdev(net, dev) {
1389			if (dev == last)
1390				goto outroll;
1391
1392			if (dev->flags & IFF_UP) {
1393				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394				nb->notifier_call(nb, NETDEV_DOWN, dev);
 
1395			}
1396			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1397			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1398		}
1399	}
1400
1401outroll:
1402	raw_notifier_chain_unregister(&netdev_chain, nb);
1403	goto unlock;
1404}
1405EXPORT_SYMBOL(register_netdevice_notifier);
1406
1407/**
1408 *	unregister_netdevice_notifier - unregister a network notifier block
1409 *	@nb: notifier
1410 *
1411 *	Unregister a notifier previously registered by
1412 *	register_netdevice_notifier(). The notifier is unlinked from the
1413 *	kernel structures and may then be reused. A negative errno code
1414 *	is returned on a failure.
1415 *
1416 * 	After unregistering unregister and down device events are synthesized
1417 *	for all devices on the device list to the removed notifier to remove
1418 *	the need for special case cleanup code.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423	struct net_device *dev;
1424	struct net *net;
1425	int err;
1426
1427	rtnl_lock();
1428	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1429	if (err)
1430		goto unlock;
1431
1432	for_each_net(net) {
1433		for_each_netdev(net, dev) {
1434			if (dev->flags & IFF_UP) {
1435				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1436				nb->notifier_call(nb, NETDEV_DOWN, dev);
 
1437			}
1438			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1439			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1440		}
1441	}
1442unlock:
1443	rtnl_unlock();
1444	return err;
1445}
1446EXPORT_SYMBOL(unregister_netdevice_notifier);
1447
1448/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1449 *	call_netdevice_notifiers - call all network notifier blocks
1450 *      @val: value passed unmodified to notifier function
1451 *      @dev: net_device pointer passed unmodified to notifier function
1452 *
1453 *	Call all network notifier blocks.  Parameters and return value
1454 *	are as for raw_notifier_call_chain().
1455 */
1456
1457int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1458{
1459	ASSERT_RTNL();
1460	return raw_notifier_call_chain(&netdev_chain, val, dev);
 
1461}
1462EXPORT_SYMBOL(call_netdevice_notifiers);
1463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1464static struct static_key netstamp_needed __read_mostly;
1465#ifdef HAVE_JUMP_LABEL
1466/* We are not allowed to call static_key_slow_dec() from irq context
1467 * If net_disable_timestamp() is called from irq context, defer the
1468 * static_key_slow_dec() calls.
1469 */
1470static atomic_t netstamp_needed_deferred;
1471#endif
1472
1473void net_enable_timestamp(void)
1474{
1475#ifdef HAVE_JUMP_LABEL
1476	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1477
1478	if (deferred) {
1479		while (--deferred)
1480			static_key_slow_dec(&netstamp_needed);
1481		return;
1482	}
1483#endif
1484	WARN_ON(in_interrupt());
1485	static_key_slow_inc(&netstamp_needed);
1486}
1487EXPORT_SYMBOL(net_enable_timestamp);
1488
1489void net_disable_timestamp(void)
1490{
1491#ifdef HAVE_JUMP_LABEL
1492	if (in_interrupt()) {
1493		atomic_inc(&netstamp_needed_deferred);
1494		return;
1495	}
1496#endif
1497	static_key_slow_dec(&netstamp_needed);
1498}
1499EXPORT_SYMBOL(net_disable_timestamp);
1500
1501static inline void net_timestamp_set(struct sk_buff *skb)
1502{
1503	skb->tstamp.tv64 = 0;
1504	if (static_key_false(&netstamp_needed))
1505		__net_timestamp(skb);
1506}
1507
/*
 * net_timestamp_check - timestamp SKB when the netstamp_needed key is
 * enabled, COND holds and SKB does not carry a timestamp yet.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a
 * single statement: a bare "if" expansion would capture an "else" that
 * follows the macro invocation.  Invoke it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1513
1514static int net_hwtstamp_validate(struct ifreq *ifr)
1515{
1516	struct hwtstamp_config cfg;
1517	enum hwtstamp_tx_types tx_type;
1518	enum hwtstamp_rx_filters rx_filter;
1519	int tx_type_valid = 0;
1520	int rx_filter_valid = 0;
1521
1522	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1523		return -EFAULT;
1524
1525	if (cfg.flags) /* reserved for future extensions */
1526		return -EINVAL;
1527
1528	tx_type = cfg.tx_type;
1529	rx_filter = cfg.rx_filter;
1530
1531	switch (tx_type) {
1532	case HWTSTAMP_TX_OFF:
1533	case HWTSTAMP_TX_ON:
1534	case HWTSTAMP_TX_ONESTEP_SYNC:
1535		tx_type_valid = 1;
1536		break;
1537	}
1538
1539	switch (rx_filter) {
1540	case HWTSTAMP_FILTER_NONE:
1541	case HWTSTAMP_FILTER_ALL:
1542	case HWTSTAMP_FILTER_SOME:
1543	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1544	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1545	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1546	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1547	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1548	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1549	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1550	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1551	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1552	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1553	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1554	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1555		rx_filter_valid = 1;
1556		break;
1557	}
1558
1559	if (!tx_type_valid || !rx_filter_valid)
1560		return -ERANGE;
1561
1562	return 0;
1563}
1564
1565static inline bool is_skb_forwardable(struct net_device *dev,
1566				      struct sk_buff *skb)
1567{
1568	unsigned int len;
1569
1570	if (!(dev->flags & IFF_UP))
1571		return false;
1572
1573	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1574	if (skb->len <= len)
1575		return true;
1576
1577	/* if TSO is enabled, we don't care about the length as the packet
1578	 * could be forwarded without being segmented before
1579	 */
1580	if (skb_is_gso(skb))
1581		return true;
1582
1583	return false;
1584}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1585
1586/**
1587 * dev_forward_skb - loopback an skb to another netif
1588 *
1589 * @dev: destination network device
1590 * @skb: buffer to forward
1591 *
1592 * return values:
1593 *	NET_RX_SUCCESS	(no congestion)
1594 *	NET_RX_DROP     (packet was dropped, but freed)
1595 *
1596 * dev_forward_skb can be used for injecting an skb from the
1597 * start_xmit function of one device into the receive queue
1598 * of another device.
1599 *
1600 * The receiving device may be in another namespace, so
1601 * we have to clear all information in the skb that could
1602 * impact namespace isolation.
1603 */
1604int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1605{
1606	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1607		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1608			atomic_long_inc(&dev->rx_dropped);
1609			kfree_skb(skb);
1610			return NET_RX_DROP;
1611		}
1612	}
1613
1614	skb_orphan(skb);
1615	nf_reset(skb);
1616
1617	if (unlikely(!is_skb_forwardable(dev, skb))) {
1618		atomic_long_inc(&dev->rx_dropped);
1619		kfree_skb(skb);
1620		return NET_RX_DROP;
1621	}
1622	skb->skb_iif = 0;
1623	skb->dev = dev;
1624	skb_dst_drop(skb);
1625	skb->tstamp.tv64 = 0;
1626	skb->pkt_type = PACKET_HOST;
1627	skb->protocol = eth_type_trans(skb, dev);
1628	skb->mark = 0;
1629	secpath_reset(skb);
1630	nf_reset(skb);
1631	return netif_rx(skb);
1632}
1633EXPORT_SYMBOL_GPL(dev_forward_skb);
1634
1635static inline int deliver_skb(struct sk_buff *skb,
1636			      struct packet_type *pt_prev,
1637			      struct net_device *orig_dev)
1638{
 
 
1639	atomic_inc(&skb->users);
1640	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1641}
1642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1643static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1644{
1645	if (ptype->af_packet_priv == NULL)
1646		return false;
1647
1648	if (ptype->id_match)
1649		return ptype->id_match(ptype, skb->sk);
1650	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1651		return true;
1652
1653	return false;
1654}
1655
1656/*
1657 *	Support routine. Sends outgoing frames to any network
1658 *	taps currently in use.
1659 */
1660
1661static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1662{
1663	struct packet_type *ptype;
1664	struct sk_buff *skb2 = NULL;
1665	struct packet_type *pt_prev = NULL;
 
1666
1667	rcu_read_lock();
1668	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 
1669		/* Never send packets back to the socket
1670		 * they originated from - MvS (miquels@drinkel.ow.org)
1671		 */
1672		if ((ptype->dev == dev || !ptype->dev) &&
1673		    (!skb_loop_sk(ptype, skb))) {
1674			if (pt_prev) {
1675				deliver_skb(skb2, pt_prev, skb->dev);
1676				pt_prev = ptype;
1677				continue;
1678			}
1679
1680			skb2 = skb_clone(skb, GFP_ATOMIC);
1681			if (!skb2)
1682				break;
 
 
1683
1684			net_timestamp_set(skb2);
 
 
 
 
 
 
 
 
 
 
 
1685
1686			/* skb->nh should be correctly
1687			   set by sender, so that the second statement is
1688			   just protection against buggy protocols.
1689			 */
1690			skb_reset_mac_header(skb2);
 
 
1691
1692			if (skb_network_header(skb2) < skb2->data ||
1693			    skb2->network_header > skb2->tail) {
1694				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1695						     ntohs(skb2->protocol),
1696						     dev->name);
1697				skb_reset_network_header(skb2);
1698			}
1699
1700			skb2->transport_header = skb2->network_header;
1701			skb2->pkt_type = PACKET_OUTGOING;
1702			pt_prev = ptype;
1703		}
1704	}
 
1705	if (pt_prev)
1706		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1707	rcu_read_unlock();
1708}
1709
1710/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 
1711 * @dev: Network device
1712 * @txq: number of queues available
1713 *
1714 * If real_num_tx_queues is changed the tc mappings may no longer be
1715 * valid. To resolve this verify the tc mapping remains valid and if
1716 * not NULL the mapping. With no priorities mapping to this
1717 * offset/count pair it will no longer be used. In the worst case TC0
1718 * is invalid nothing can be done so disable priority mappings. If is
1719 * expected that drivers will fix this mapping if they can before
1720 * calling netif_set_real_num_tx_queues.
1721 */
1722static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1723{
1724	int i;
1725	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1726
1727	/* If TC0 is invalidated disable TC mapping */
1728	if (tc->offset + tc->count > txq) {
1729		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1730		dev->num_tc = 0;
1731		return;
1732	}
1733
1734	/* Invalidated prio to tc mappings set to TC0 */
1735	for (i = 1; i < TC_BITMASK + 1; i++) {
1736		int q = netdev_get_prio_tc_map(dev, i);
1737
1738		tc = &dev->tc_to_txq[q];
1739		if (tc->offset + tc->count > txq) {
1740			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1741				i, q);
1742			netdev_set_prio_tc_map(dev, i, 0);
1743		}
1744	}
1745}
1746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1747/*
1748 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1749 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1750 */
1751int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1752{
1753	int rc;
1754
1755	if (txq < 1 || txq > dev->num_tx_queues)
1756		return -EINVAL;
1757
1758	if (dev->reg_state == NETREG_REGISTERED ||
1759	    dev->reg_state == NETREG_UNREGISTERING) {
1760		ASSERT_RTNL();
1761
1762		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1763						  txq);
1764		if (rc)
1765			return rc;
1766
1767		if (dev->num_tc)
1768			netif_setup_tc(dev, txq);
1769
1770		if (txq < dev->real_num_tx_queues)
1771			qdisc_reset_all_tx_gt(dev, txq);
 
 
 
 
1772	}
1773
1774	dev->real_num_tx_queues = txq;
1775	return 0;
1776}
1777EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1778
1779#ifdef CONFIG_RPS
1780/**
1781 *	netif_set_real_num_rx_queues - set actual number of RX queues used
1782 *	@dev: Network device
1783 *	@rxq: Actual number of RX queues
1784 *
1785 *	This must be called either with the rtnl_lock held or before
1786 *	registration of the net device.  Returns 0 on success, or a
1787 *	negative error code.  If called before registration, it always
1788 *	succeeds.
1789 */
1790int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1791{
1792	int rc;
1793
1794	if (rxq < 1 || rxq > dev->num_rx_queues)
1795		return -EINVAL;
1796
1797	if (dev->reg_state == NETREG_REGISTERED) {
1798		ASSERT_RTNL();
1799
1800		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1801						  rxq);
1802		if (rc)
1803			return rc;
1804	}
1805
1806	dev->real_num_rx_queues = rxq;
1807	return 0;
1808}
1809EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1810#endif
1811
 
 
 
 
 
 
 
 
 
 
 
 
1812static inline void __netif_reschedule(struct Qdisc *q)
1813{
1814	struct softnet_data *sd;
1815	unsigned long flags;
1816
1817	local_irq_save(flags);
1818	sd = &__get_cpu_var(softnet_data);
1819	q->next_sched = NULL;
1820	*sd->output_queue_tailp = q;
1821	sd->output_queue_tailp = &q->next_sched;
1822	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1823	local_irq_restore(flags);
1824}
1825
1826void __netif_schedule(struct Qdisc *q)
1827{
1828	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1829		__netif_reschedule(q);
1830}
1831EXPORT_SYMBOL(__netif_schedule);
1832
1833void dev_kfree_skb_irq(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
1834{
1835	if (atomic_dec_and_test(&skb->users)) {
1836		struct softnet_data *sd;
1837		unsigned long flags;
1838
1839		local_irq_save(flags);
1840		sd = &__get_cpu_var(softnet_data);
1841		skb->next = sd->completion_queue;
1842		sd->completion_queue = skb;
1843		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1844		local_irq_restore(flags);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1845	}
 
 
 
 
 
 
1846}
1847EXPORT_SYMBOL(dev_kfree_skb_irq);
1848
1849void dev_kfree_skb_any(struct sk_buff *skb)
1850{
1851	if (in_irq() || irqs_disabled())
1852		dev_kfree_skb_irq(skb);
1853	else
1854		dev_kfree_skb(skb);
1855}
1856EXPORT_SYMBOL(dev_kfree_skb_any);
1857
1858
1859/**
1860 * netif_device_detach - mark device as removed
1861 * @dev: network device
1862 *
1863 * Mark device as removed from system and therefore no longer available.
1864 */
1865void netif_device_detach(struct net_device *dev)
1866{
1867	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1868	    netif_running(dev)) {
1869		netif_tx_stop_all_queues(dev);
1870	}
1871}
1872EXPORT_SYMBOL(netif_device_detach);
1873
1874/**
1875 * netif_device_attach - mark device as attached
1876 * @dev: network device
1877 *
1878 * Mark device as attached from system and restart if needed.
1879 */
1880void netif_device_attach(struct net_device *dev)
1881{
1882	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1883	    netif_running(dev)) {
1884		netif_tx_wake_all_queues(dev);
1885		__netdev_watchdog_up(dev);
1886	}
1887}
1888EXPORT_SYMBOL(netif_device_attach);
1889
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892	static const netdev_features_t null_features = 0;
1893	struct net_device *dev = skb->dev;
1894	const char *driver = "";
1895
1896	if (dev && dev->dev.parent)
1897		driver = dev_driver_string(dev->dev.parent);
1898
 
 
 
 
 
 
1899	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900	     "gso_type=%d ip_summed=%d\n",
1901	     driver, dev ? &dev->features : &null_features,
1902	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904	     skb_shinfo(skb)->gso_type, skb->ip_summed);
1905}
1906
1907/*
1908 * Invalidate hardware checksum when packet is to be mangled, and
1909 * complete checksum manually on outgoing path.
1910 */
1911int skb_checksum_help(struct sk_buff *skb)
1912{
1913	__wsum csum;
1914	int ret = 0, offset;
1915
1916	if (skb->ip_summed == CHECKSUM_COMPLETE)
1917		goto out_set_summed;
1918
1919	if (unlikely(skb_shinfo(skb)->gso_size)) {
1920		skb_warn_bad_offload(skb);
1921		return -EINVAL;
1922	}
1923
 
 
 
 
 
 
 
 
 
1924	offset = skb_checksum_start_offset(skb);
1925	BUG_ON(offset >= skb_headlen(skb));
1926	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1927
1928	offset += skb->csum_offset;
1929	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1930
1931	if (skb_cloned(skb) &&
1932	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1933		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1934		if (ret)
1935			goto out;
1936	}
1937
1938	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1939out_set_summed:
1940	skb->ip_summed = CHECKSUM_NONE;
1941out:
1942	return ret;
1943}
1944EXPORT_SYMBOL(skb_checksum_help);
1945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	On failure an ERR_PTR() is returned: -EINVAL when a VLAN header
 *	cannot be pulled, the pskb_expand_head() or gso_send_check() error,
 *	or -EPROTONOSUPPORT when no matching handler with a gso_segment
 *	callback is found.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb,
	netdev_features_t features)
{
	/* Result if no protocol handler below matches. */
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int vlan_depth = ETH_HLEN;
	int err;

	/* Walk nested 802.1Q headers to find the encapsulated protocol. */
	while (type == htons(ETH_P_8021Q)) {
		struct vlan_hdr *vh;

		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
			return ERR_PTR(-EINVAL);

		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
		type = vh->h_vlan_encapsulated_proto;
		vlan_depth += VLAN_HLEN;
	}

	/* Record the MAC header length and advance data past it. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		skb_warn_bad_offload(skb);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		/* First device-independent handler for this type that can segment. */
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				/* With err == 0 this is the NULL result. */
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
2011
2012/* Take action when hardware reception checksum errors are detected. */
2013#ifdef CONFIG_BUG
2014void netdev_rx_csum_fault(struct net_device *dev)
2015{
2016	if (net_ratelimit()) {
2017		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2018		dump_stack();
2019	}
2020}
2021EXPORT_SYMBOL(netdev_rx_csum_fault);
2022#endif
2023
2024/* Actually, we should eliminate this check as soon as we know, that:
2025 * 1. IOMMU is present and allows to map all the memory.
2026 * 2. No high memory really exists on this machine.
2027 */
2028
/*
 * Returns 1 if @skb has a page fragment that @dev cannot DMA from, 0
 * otherwise.  Always 0 when CONFIG_HIGHMEM is not set.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int idx;

	/* Without NETIF_F_HIGHDMA no fragment may sit in a highmem page. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	/* Each fragment page must end within the parent's DMA mask. */
	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
		dma_addr_t page_addr = page_to_phys(skb_frag_page(frag));

		if (!parent->dma_mask)
			return 1;
		if (page_addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2056
/*
 * Private data kept in skb->cb while a segment list hangs off skb->next.
 * dev_gso_segment() stores the skb's original destructor here,
 * dev_gso_skb_destructor() calls it after freeing the segments, and
 * dev_hard_start_xmit() puts it back once all segments have been sent.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the skb's control buffer as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2062
2063static void dev_gso_skb_destructor(struct sk_buff *skb)
 
 
 
2064{
2065	struct dev_gso_cb *cb;
2066
2067	do {
2068		struct sk_buff *nskb = skb->next;
2069
2070		skb->next = nskb->next;
2071		nskb->next = NULL;
2072		kfree_skb(nskb);
2073	} while (skb->next);
2074
2075	cb = DEV_GSO_CB(skb);
2076	if (cb->destructor)
2077		cb->destructor(skb);
2078}
 
2079
2080/**
2081 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2082 *	@skb: buffer to segment
2083 *	@features: device features as applicable to this skb
2084 *
2085 *	This function segments the given skb and stores the list of segments
2086 *	in skb->next.
2087 */
2088static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2089{
2090	struct sk_buff *segs;
 
2091
2092	segs = skb_gso_segment(skb, features);
 
2093
2094	/* Verifying header integrity only. */
2095	if (!segs)
2096		return 0;
 
 
 
2097
2098	if (IS_ERR(segs))
2099		return PTR_ERR(segs);
2100
2101	skb->next = segs;
2102	DEV_GSO_CB(skb)->destructor = skb->destructor;
2103	skb->destructor = dev_gso_skb_destructor;
2104
2105	return 0;
2106}
2107
2108static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
 
 
2109{
2110	return ((features & NETIF_F_GEN_CSUM) ||
2111		((features & NETIF_F_V4_CSUM) &&
2112		 protocol == htons(ETH_P_IP)) ||
2113		((features & NETIF_F_V6_CSUM) &&
2114		 protocol == htons(ETH_P_IPV6)) ||
2115		((features & NETIF_F_FCOE_CRC) &&
2116		 protocol == htons(ETH_P_FCOE)));
2117}
 
2118
2119static netdev_features_t harmonize_features(struct sk_buff *skb,
2120	__be16 protocol, netdev_features_t features)
 
2121{
2122	if (!can_checksum_protocol(features, protocol)) {
2123		features &= ~NETIF_F_ALL_CSUM;
2124		features &= ~NETIF_F_SG;
2125	} else if (illegal_highdma(skb->dev, skb)) {
2126		features &= ~NETIF_F_SG;
2127	}
2128
2129	return features;
2130}
2131
2132netdev_features_t netif_skb_features(struct sk_buff *skb)
2133{
2134	__be16 protocol = skb->protocol;
2135	netdev_features_t features = skb->dev->features;
 
2136
2137	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2138		features &= ~NETIF_F_GSO_MASK;
2139
2140	if (protocol == htons(ETH_P_8021Q)) {
2141		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142		protocol = veh->h_vlan_encapsulated_proto;
2143	} else if (!vlan_tx_tag_present(skb)) {
2144		return harmonize_features(skb, protocol, features);
2145	}
2146
2147	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
 
 
 
 
 
 
 
 
 
 
2148
2149	if (protocol != htons(ETH_P_8021Q)) {
2150		return harmonize_features(skb, protocol, features);
2151	} else {
2152		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154		return harmonize_features(skb, protocol, features);
2155	}
2156}
2157EXPORT_SYMBOL(netif_skb_features);
2158
2159/*
2160 * Returns true if either:
2161 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2162 *	2. skb is fragmented and the device does not support SG, or if
2163 *	   at least one of fragments is in highmem and device does not
2164 *	   support DMA from it.
2165 */
2166static inline int skb_needs_linearize(struct sk_buff *skb,
2167				      int features)
2168{
2169	return skb_is_nonlinear(skb) &&
2170			((skb_has_frag_list(skb) &&
2171				!(features & NETIF_F_FRAGLIST)) ||
2172			(skb_shinfo(skb)->nr_frags &&
2173				!(features & NETIF_F_SG)));
2174}
2175
/*
 * dev_hard_start_xmit - hand an skb, or a pending list of GSO segments,
 * to the driver's ndo_start_xmit.
 * @skb: buffer to send; a non-NULL skb->next means segments are pending
 * @dev: device to transmit on
 * @txq: transmit queue being used
 *
 * Returns the ndo_start_xmit result, NETDEV_TX_BUSY when the queue is
 * stopped while segments remain, or NETDEV_TX_OK when the skb was
 * dropped here before reaching the driver.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	/* Common case: no segment list left over from an earlier attempt. */
	if (likely(!skb->next)) {
		netdev_features_t features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		/* Give registered taps a copy of the outgoing frame. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		features = netif_skb_features(skb);

		/* No hardware VLAN tagging: put the tag into the packet. */
		if (vlan_tx_tag_present(skb) &&
		    !(features & NETIF_F_HW_VLAN_TX)) {
			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
			/* NOTE(review): presumably __vlan_put_tag() frees the
			 * skb on failure - confirm.
			 */
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were produced: send them one by one. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				skb_set_transport_header(skb,
					skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Read the length first; it is used after the driver call. */
		skb_len = skb->len;
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		/* Detach the first pending segment from the list. */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		skb_len = nskb->len;
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			/* Codes outside NETDEV_TX_MASK: drop the remainder. */
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Otherwise put the segment back at the list head. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_xmit_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/* All segments gone: restore the destructor saved by dev_gso_segment(). */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
2275
/* Seed passed to jhash_1word() by __skb_tx_hash() and get_xps_queue(). */
static u32 hashrnd __read_mostly;
2277
2278/*
2279 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2280 * to be used as a distribution range.
2281 */
2282u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2283		  unsigned int num_tx_queues)
2284{
2285	u32 hash;
2286	u16 qoffset = 0;
2287	u16 qcount = num_tx_queues;
2288
2289	if (skb_rx_queue_recorded(skb)) {
2290		hash = skb_get_rx_queue(skb);
2291		while (unlikely(hash >= num_tx_queues))
2292			hash -= num_tx_queues;
2293		return hash;
2294	}
2295
2296	if (dev->num_tc) {
2297		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2298		qoffset = dev->tc_to_txq[tc].offset;
2299		qcount = dev->tc_to_txq[tc].count;
2300	}
2301
2302	if (skb->sk && skb->sk->sk_hash)
2303		hash = skb->sk->sk_hash;
2304	else
2305		hash = (__force u16) skb->protocol;
2306	hash = jhash_1word(hash, hashrnd);
2307
2308	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2309}
2310EXPORT_SYMBOL(__skb_tx_hash);
2311
2312static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2313{
2314	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2315		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2316				     dev->name, queue_index,
2317				     dev->real_num_tx_queues);
2318		return 0;
2319	}
2320	return queue_index;
2321}
2322
/*
 * Pick a tx queue from the XPS map of the current CPU.  Returns -1 when
 * CONFIG_XPS is off, when the device or CPU has no map, or when the
 * chosen queue is not below dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int picked = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		picked = cpu_map->queues[0];
	} else {
		u32 hash;

		/* Several candidates: spread by socket hash or by
		 * protocol ^ rxhash.
		 */
		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		picked = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
	}

	if (unlikely(picked >= dev->real_num_tx_queues))
		picked = -1;

unlock:
	rcu_read_unlock();

	return picked;
#else
	return -1;
#endif
}
 
 
 
 
 
 
 
 
 
 
2360
2361static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2362					struct sk_buff *skb)
2363{
2364	int queue_index;
2365	const struct net_device_ops *ops = dev->netdev_ops;
2366
2367	if (dev->real_num_tx_queues == 1)
2368		queue_index = 0;
2369	else if (ops->ndo_select_queue) {
2370		queue_index = ops->ndo_select_queue(dev, skb);
2371		queue_index = dev_cap_txqueue(dev, queue_index);
2372	} else {
2373		struct sock *sk = skb->sk;
2374		queue_index = sk_tx_queue_get(sk);
2375
2376		if (queue_index < 0 || skb->ooo_okay ||
2377		    queue_index >= dev->real_num_tx_queues) {
2378			int old_index = queue_index;
2379
2380			queue_index = get_xps_queue(dev, skb);
2381			if (queue_index < 0)
2382				queue_index = skb_tx_hash(dev, skb);
2383
2384			if (queue_index != old_index && sk) {
2385				struct dst_entry *dst =
2386				    rcu_dereference_check(sk->sk_dst_cache, 1);
2387
2388				if (dst && skb_dst(skb) == dst)
2389					sk_tx_queue_set(sk, queue_index);
2390			}
2391		}
2392	}
2393
2394	skb_set_queue_mapping(skb, queue_index);
2395	return netdev_get_tx_queue(dev, queue_index);
2396}
2397
/*
 * Send @skb through qdisc @q of @txq on @dev, holding the qdisc root lock.
 *
 * Returns NET_XMIT_DROP if the qdisc is deactivated (the skb is freed),
 * NET_XMIT_SUCCESS if the skb bypassed the queue, and otherwise the
 * enqueue result masked with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Release busylock before running the qdisc. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		/* Regular path: enqueue, then run the qdisc if it is idle. */
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* busylock is still held if it was not released above. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2460
2461#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2462static void skb_update_prio(struct sk_buff *skb)
2463{
2464	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2465
2466	if (!skb->priority && skb->sk && map) {
2467		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
 
2468
2469		if (prioidx < map->priomap_len)
2470			skb->priority = map->priomap[prioidx];
2471	}
2472}
2473#else
2474#define skb_update_prio(skb)
2475#endif
2476
/*
 * Per-CPU counter that dev_queue_xmit() increments around its call to
 * dev_hard_start_xmit() for devices without a queue.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
/* dev_queue_xmit() takes its recursion_alert path above this value. */
#define RECURSION_LIMIT 10
2479
2480/**
2481 *	dev_queue_xmit - transmit a buffer
 
 
2482 *	@skb: buffer to transmit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2483 *
2484 *	Queue a buffer for transmission to a network device. The caller must
2485 *	have set the device and priority and built the buffer before calling
2486 *	this function. The function can be called from an interrupt.
2487 *
2488 *	A negative errno code is returned on a failure. A success does not
2489 *	guarantee the frame will be transmitted as it may be dropped due
2490 *	to congestion or traffic shaping.
2491 *
2492 * -----------------------------------------------------------------------------------
2493 *      I notice this method can also return errors from the queue disciplines,
2494 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2495 *      be positive.
2496 *
2497 *      Regardless of the return value, the skb is consumed, so it is currently
2498 *      difficult to retry a send to this method.  (You can bump the ref count
2499 *      before sending to hold a reference for retry if you are careful.)
2500 *
2501 *      When calling this method, interrupts MUST be enabled.  This is because
2502 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2503 *          --BLG
2504 */
2505int dev_queue_xmit(struct sk_buff *skb)
2506{
2507	struct net_device *dev = skb->dev;
2508	struct netdev_queue *txq;
2509	struct Qdisc *q;
2510	int rc = -ENOMEM;
2511
 
 
 
 
 
2512	/* Disable soft irqs for various locks below. Also
2513	 * stops preemption for RCU.
2514	 */
2515	rcu_read_lock_bh();
2516
2517	skb_update_prio(skb);
2518
2519	txq = dev_pick_tx(dev, skb);
2520	q = rcu_dereference_bh(txq->qdisc);
2521
2522#ifdef CONFIG_NET_CLS_ACT
2523	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
 
 
 
 
 
 
 
2524#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2525	trace_net_dev_queue(skb);
2526	if (q->enqueue) {
2527		rc = __dev_xmit_skb(skb, q, dev, txq);
2528		goto out;
2529	}
2530
2531	/* The device has no queue. Common case for software devices:
2532	   loopback, all the sorts of tunnels...
2533
2534	   Really, it is unlikely that netif_tx_lock protection is necessary
2535	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2536	   counters.)
2537	   However, it is possible, that they rely on protection
2538	   made by us here.
2539
2540	   Check this and shot the lock. It is not prone from deadlocks.
2541	   Either shot noqueue qdisc, it is even simpler 8)
2542	 */
2543	if (dev->flags & IFF_UP) {
2544		int cpu = smp_processor_id(); /* ok because BHs are off */
2545
2546		if (txq->xmit_lock_owner != cpu) {
2547
2548			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2549				goto recursion_alert;
2550
 
 
 
 
2551			HARD_TX_LOCK(dev, txq, cpu);
2552
2553			if (!netif_xmit_stopped(txq)) {
2554				__this_cpu_inc(xmit_recursion);
2555				rc = dev_hard_start_xmit(skb, dev, txq);
2556				__this_cpu_dec(xmit_recursion);
2557				if (dev_xmit_complete(rc)) {
2558					HARD_TX_UNLOCK(dev, txq);
2559					goto out;
2560				}
2561			}
2562			HARD_TX_UNLOCK(dev, txq);
2563			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2564					     dev->name);
2565		} else {
2566			/* Recursion is detected! It is possible,
2567			 * unfortunately
2568			 */
2569recursion_alert:
2570			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2571					     dev->name);
2572		}
2573	}
2574
2575	rc = -ENETDOWN;
 
2576	rcu_read_unlock_bh();
2577
2578	kfree_skb(skb);
 
2579	return rc;
2580out:
2581	rcu_read_unlock_bh();
2582	return rc;
2583}
 
 
 
 
 
2584EXPORT_SYMBOL(dev_queue_xmit);
2585
 
 
 
 
 
 
2586
2587/*=======================================================================
2588			Receiver routines
2589  =======================================================================*/
2590
2591int netdev_max_backlog __read_mostly = 1000;
 
 
2592int netdev_tstamp_prequeue __read_mostly = 1;
2593int netdev_budget __read_mostly = 300;
2594int weight_p __read_mostly = 64;            /* old backlog weight */
2595
2596/* Called with irq disabled */
2597static inline void ____napi_schedule(struct softnet_data *sd,
2598				     struct napi_struct *napi)
2599{
2600	list_add_tail(&napi->poll_list, &sd->poll_list);
2601	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2602}
2603
2604/*
2605 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2606 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2607 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2608 * if hash is a canonical 4-tuple hash over transport ports.
2609 */
2610void __skb_get_rxhash(struct sk_buff *skb)
2611{
2612	struct flow_keys keys;
2613	u32 hash;
2614
2615	if (!skb_flow_dissect(skb, &keys))
2616		return;
2617
2618	if (keys.ports) {
2619		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2620			swap(keys.port16[0], keys.port16[1]);
2621		skb->l4_rxhash = 1;
2622	}
2623
2624	/* get a consistent hash (same value on both flow directions) */
2625	if ((__force u32)keys.dst < (__force u32)keys.src)
2626		swap(keys.dst, keys.src);
2627
2628	hash = jhash_3words((__force u32)keys.dst,
2629			    (__force u32)keys.src,
2630			    (__force u32)keys.ports, hashrnd);
2631	if (!hash)
2632		hash = 1;
2633
2634	skb->rxhash = hash;
2635}
2636EXPORT_SYMBOL(__skb_get_rxhash);
2637
2638#ifdef CONFIG_RPS
2639
2640/* One global table that all flow-based protocols share. */
2641struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2642EXPORT_SYMBOL(rps_sock_flow_table);
 
 
2643
2644struct static_key rps_needed __read_mostly;
2645
2646static struct rps_dev_flow *
2647set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2648	    struct rps_dev_flow *rflow, u16 next_cpu)
2649{
2650	if (next_cpu != RPS_NO_CPU) {
2651#ifdef CONFIG_RFS_ACCEL
2652		struct netdev_rx_queue *rxqueue;
2653		struct rps_dev_flow_table *flow_table;
2654		struct rps_dev_flow *old_rflow;
2655		u32 flow_id;
2656		u16 rxq_index;
2657		int rc;
2658
2659		/* Should we steer this flow to a different hardware queue? */
2660		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2661		    !(dev->features & NETIF_F_NTUPLE))
2662			goto out;
2663		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2664		if (rxq_index == skb_get_rx_queue(skb))
2665			goto out;
2666
2667		rxqueue = dev->_rx + rxq_index;
2668		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669		if (!flow_table)
2670			goto out;
2671		flow_id = skb->rxhash & flow_table->mask;
2672		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2673							rxq_index, flow_id);
2674		if (rc < 0)
2675			goto out;
2676		old_rflow = rflow;
2677		rflow = &flow_table->flows[flow_id];
2678		rflow->filter = rc;
2679		if (old_rflow->filter == rflow->filter)
2680			old_rflow->filter = RPS_NO_FILTER;
2681	out:
2682#endif
2683		rflow->last_qtail =
2684			per_cpu(softnet_data, next_cpu).input_queue_head;
2685	}
2686
2687	rflow->cpu = next_cpu;
2688	return rflow;
2689}
2690
2691/*
2692 * get_rps_cpu is called from netif_receive_skb and returns the target
2693 * CPU from the RPS map of the receiving queue for a given skb.
2694 * rcu_read_lock must be held on entry.
2695 */
2696static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2697		       struct rps_dev_flow **rflowp)
2698{
2699	struct netdev_rx_queue *rxqueue;
 
 
2700	struct rps_map *map;
2701	struct rps_dev_flow_table *flow_table;
2702	struct rps_sock_flow_table *sock_flow_table;
2703	int cpu = -1;
2704	u16 tcpu;
 
2705
2706	if (skb_rx_queue_recorded(skb)) {
2707		u16 index = skb_get_rx_queue(skb);
 
2708		if (unlikely(index >= dev->real_num_rx_queues)) {
2709			WARN_ONCE(dev->real_num_rx_queues > 1,
2710				  "%s received packet on queue %u, but number "
2711				  "of RX queues is %u\n",
2712				  dev->name, index, dev->real_num_rx_queues);
2713			goto done;
2714		}
2715		rxqueue = dev->_rx + index;
2716	} else
2717		rxqueue = dev->_rx;
 
2718
 
2719	map = rcu_dereference(rxqueue->rps_map);
2720	if (map) {
2721		if (map->len == 1 &&
2722		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2723			tcpu = map->cpus[0];
2724			if (cpu_online(tcpu))
2725				cpu = tcpu;
2726			goto done;
2727		}
2728	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2729		goto done;
2730	}
2731
2732	skb_reset_network_header(skb);
2733	if (!skb_get_rxhash(skb))
 
2734		goto done;
2735
2736	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2738	if (flow_table && sock_flow_table) {
2739		u16 next_cpu;
2740		struct rps_dev_flow *rflow;
 
 
 
 
 
 
 
 
 
2741
2742		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
 
 
 
2743		tcpu = rflow->cpu;
2744
2745		next_cpu = sock_flow_table->ents[skb->rxhash &
2746		    sock_flow_table->mask];
2747
2748		/*
2749		 * If the desired CPU (where last recvmsg was done) is
2750		 * different from current CPU (one in the rx-queue flow
2751		 * table entry), switch if one of the following holds:
2752		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2753		 *   - Current CPU is offline.
2754		 *   - The current CPU's queue tail has advanced beyond the
2755		 *     last packet that was enqueued using this table entry.
2756		 *     This guarantees that all previous packets for the flow
2757		 *     have been dequeued, thus preserving in order delivery.
2758		 */
2759		if (unlikely(tcpu != next_cpu) &&
2760		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2761		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2762		      rflow->last_qtail)) >= 0))
 
2763			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 
2764
2765		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2766			*rflowp = rflow;
2767			cpu = tcpu;
2768			goto done;
2769		}
2770	}
2771
 
 
2772	if (map) {
2773		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2774
2775		if (cpu_online(tcpu)) {
2776			cpu = tcpu;
2777			goto done;
2778		}
2779	}
2780
2781done:
2782	return cpu;
2783}
2784
2785#ifdef CONFIG_RFS_ACCEL
2786
2787/**
2788 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2789 * @dev: Device on which the filter was set
2790 * @rxq_index: RX queue index
2791 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2792 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2793 *
2794 * Drivers that implement ndo_rx_flow_steer() should periodically call
2795 * this function for each installed filter and remove the filters for
2796 * which it returns %true.
2797 */
2798bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2799			 u32 flow_id, u16 filter_id)
2800{
2801	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2802	struct rps_dev_flow_table *flow_table;
2803	struct rps_dev_flow *rflow;
2804	bool expire = true;
2805	int cpu;
2806
2807	rcu_read_lock();
2808	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2809	if (flow_table && flow_id <= flow_table->mask) {
2810		rflow = &flow_table->flows[flow_id];
2811		cpu = ACCESS_ONCE(rflow->cpu);
2812		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2813		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2814			   rflow->last_qtail) <
2815		     (int)(10 * flow_table->mask)))
2816			expire = false;
2817	}
2818	rcu_read_unlock();
2819	return expire;
2820}
2821EXPORT_SYMBOL(rps_may_expire_flow);
2822
2823#endif /* CONFIG_RFS_ACCEL */
2824
2825/* Called from hardirq (IPI) context */
2826static void rps_trigger_softirq(void *data)
2827{
2828	struct softnet_data *sd = data;
2829
2830	____napi_schedule(sd, &sd->backlog);
2831	sd->received_rps++;
2832}
2833
2834#endif /* CONFIG_RPS */
2835
/*
 * Tell whether @sd belongs to another CPU.
 *
 * If it does, push it onto this CPU's rps_ipi_list, raise the RX softirq
 * and return 1. If @sd is the local softnet_data (or RPS is not built
 * in) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2856
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2857/*
2858 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2859 * queue (may be a remote CPU queue).
2860 */
2861static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2862			      unsigned int *qtail)
2863{
2864	struct softnet_data *sd;
2865	unsigned long flags;
 
2866
2867	sd = &per_cpu(softnet_data, cpu);
2868
2869	local_irq_save(flags);
2870
2871	rps_lock(sd);
2872	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2873		if (skb_queue_len(&sd->input_pkt_queue)) {
 
 
 
2874enqueue:
2875			__skb_queue_tail(&sd->input_pkt_queue, skb);
2876			input_queue_tail_incr_save(sd, qtail);
2877			rps_unlock(sd);
2878			local_irq_restore(flags);
2879			return NET_RX_SUCCESS;
2880		}
2881
2882		/* Schedule NAPI for backlog device
2883		 * We can use non atomic operation since we own the queue lock
2884		 */
2885		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2886			if (!rps_ipi_queued(sd))
2887				____napi_schedule(sd, &sd->backlog);
2888		}
2889		goto enqueue;
2890	}
2891
 
2892	sd->dropped++;
2893	rps_unlock(sd);
2894
2895	local_irq_restore(flags);
2896
2897	atomic_long_inc(&skb->dev->rx_dropped);
2898	kfree_skb(skb);
2899	return NET_RX_DROP;
2900}
2901
2902/**
2903 *	netif_rx	-	post buffer to the network code
2904 *	@skb: buffer to post
2905 *
2906 *	This function receives a packet from a device driver and queues it for
2907 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2908 *	may be dropped during processing for congestion control or by the
2909 *	protocol layers.
2910 *
2911 *	return values:
2912 *	NET_RX_SUCCESS	(no congestion)
2913 *	NET_RX_DROP     (packet was dropped)
2914 *
2915 */
2916
2917int netif_rx(struct sk_buff *skb)
2918{
2919	int ret;
2920
2921	/* if netpoll wants it, pretend we never saw it */
2922	if (netpoll_rx(skb))
2923		return NET_RX_DROP;
2924
2925	net_timestamp_check(netdev_tstamp_prequeue, skb);
2926
2927	trace_netif_rx(skb);
2928#ifdef CONFIG_RPS
2929	if (static_key_false(&rps_needed)) {
2930		struct rps_dev_flow voidflow, *rflow = &voidflow;
2931		int cpu;
2932
2933		preempt_disable();
2934		rcu_read_lock();
2935
2936		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2937		if (cpu < 0)
2938			cpu = smp_processor_id();
2939
2940		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2941
2942		rcu_read_unlock();
2943		preempt_enable();
2944	} else
2945#endif
2946	{
2947		unsigned int qtail;
2948		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2949		put_cpu();
2950	}
2951	return ret;
2952}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2953EXPORT_SYMBOL(netif_rx);
2954
/*
 * netif_rx_ni - netif_rx() followed by a run of any pending softirqs,
 * all with preemption disabled.
 *
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2968
2969static void net_tx_action(struct softirq_action *h)
2970{
2971	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2972
2973	if (sd->completion_queue) {
2974		struct sk_buff *clist;
2975
2976		local_irq_disable();
2977		clist = sd->completion_queue;
2978		sd->completion_queue = NULL;
2979		local_irq_enable();
2980
2981		while (clist) {
2982			struct sk_buff *skb = clist;
2983			clist = clist->next;
2984
2985			WARN_ON(atomic_read(&skb->users));
2986			trace_kfree_skb(skb, net_tx_action);
2987			__kfree_skb(skb);
 
 
 
 
 
 
 
2988		}
 
 
2989	}
2990
2991	if (sd->output_queue) {
2992		struct Qdisc *head;
2993
2994		local_irq_disable();
2995		head = sd->output_queue;
2996		sd->output_queue = NULL;
2997		sd->output_queue_tailp = &sd->output_queue;
2998		local_irq_enable();
2999
3000		while (head) {
3001			struct Qdisc *q = head;
3002			spinlock_t *root_lock;
3003
3004			head = head->next_sched;
3005
3006			root_lock = qdisc_lock(q);
3007			if (spin_trylock(root_lock)) {
3008				smp_mb__before_clear_bit();
3009				clear_bit(__QDISC_STATE_SCHED,
3010					  &q->state);
3011				qdisc_run(q);
3012				spin_unlock(root_lock);
3013			} else {
3014				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3015					      &q->state)) {
3016					__netif_reschedule(q);
3017				} else {
3018					smp_mb__before_clear_bit();
3019					clear_bit(__QDISC_STATE_SCHED,
3020						  &q->state);
3021				}
3022			}
3023		}
3024	}
3025}
3026
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * Hook defined here on behalf of ATM LANE. It only exists when both the
 * bridge and ATM LANE are configured (built in or modular).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3034
 
 
 
 
3035#ifdef CONFIG_NET_CLS_ACT
3036/* TODO: Maybe we should just force sch_ingress to be compiled in
3037 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3038 * a compare and 2 stores extra right now if we dont have it on
3039 * but have CONFIG_NET_CLS_ACT
3040 * NOTE: This doesn't stop any functionality; if you dont have
3041 * the ingress scheduler, you just can't add policies on ingress.
3042 *
3043 */
3044static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3045{
3046	struct net_device *dev = skb->dev;
3047	u32 ttl = G_TC_RTTL(skb->tc_verd);
3048	int result = TC_ACT_OK;
3049	struct Qdisc *q;
3050
3051	if (unlikely(MAX_RED_LOOP < ttl++)) {
3052		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3053				     skb->skb_iif, dev->ifindex);
3054		return TC_ACT_SHOT;
3055	}
3056
3057	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3058	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3059
3060	q = rxq->qdisc;
3061	if (q != &noop_qdisc) {
3062		spin_lock(qdisc_lock(q));
3063		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3064			result = qdisc_enqueue_root(skb, q);
3065		spin_unlock(qdisc_lock(q));
3066	}
3067
3068	return result;
3069}
3070
3071static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3072					 struct packet_type **pt_prev,
3073					 int *ret, struct net_device *orig_dev)
3074{
3075	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3076
3077	if (!rxq || rxq->qdisc == &noop_qdisc)
3078		goto out;
3079
 
 
 
 
 
 
 
3080	if (*pt_prev) {
3081		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3082		*pt_prev = NULL;
3083	}
3084
3085	switch (ing_filter(skb, rxq)) {
 
 
 
 
 
 
 
 
3086	case TC_ACT_SHOT:
 
3087	case TC_ACT_STOLEN:
 
3088		kfree_skb(skb);
3089		return NULL;
 
 
 
 
 
 
 
 
 
 
3090	}
3091
3092out:
3093	skb->tc_verd = 0;
3094	return skb;
3095}
3096#endif
3097
3098/**
3099 *	netdev_rx_handler_register - register receive handler
3100 *	@dev: device to register a handler for
3101 *	@rx_handler: receive handler to register
3102 *	@rx_handler_data: data pointer that is used by rx handler
3103 *
3104 *	Register a receive hander for a device. This handler will then be
3105 *	called from __netif_receive_skb. A negative errno code is returned
3106 *	on a failure.
3107 *
3108 *	The caller must hold the rtnl_mutex.
3109 *
3110 *	For a general description of rx_handler, see enum rx_handler_result.
3111 */
3112int netdev_rx_handler_register(struct net_device *dev,
3113			       rx_handler_func_t *rx_handler,
3114			       void *rx_handler_data)
3115{
3116	ASSERT_RTNL();
3117
3118	if (dev->rx_handler)
3119		return -EBUSY;
3120
 
3121	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3122	rcu_assign_pointer(dev->rx_handler, rx_handler);
3123
3124	return 0;
3125}
3126EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3127
3128/**
3129 *	netdev_rx_handler_unregister - unregister receive handler
3130 *	@dev: device to unregister a handler from
3131 *
3132 *	Unregister a receive hander from a device.
3133 *
3134 *	The caller must hold the rtnl_mutex.
3135 */
3136void netdev_rx_handler_unregister(struct net_device *dev)
3137{
3138
3139	ASSERT_RTNL();
3140	RCU_INIT_POINTER(dev->rx_handler, NULL);
 
 
 
 
 
3141	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3142}
3143EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3144
3145static int __netif_receive_skb(struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3146{
3147	struct packet_type *ptype, *pt_prev;
3148	rx_handler_func_t *rx_handler;
3149	struct net_device *orig_dev;
3150	struct net_device *null_or_dev;
3151	bool deliver_exact = false;
3152	int ret = NET_RX_DROP;
3153	__be16 type;
3154
3155	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3156
3157	trace_netif_receive_skb(skb);
3158
3159	/* if we've gotten here through NAPI, check netpoll */
3160	if (netpoll_receive_skb(skb))
3161		return NET_RX_DROP;
3162
3163	if (!skb->skb_iif)
3164		skb->skb_iif = skb->dev->ifindex;
3165	orig_dev = skb->dev;
3166
3167	skb_reset_network_header(skb);
3168	skb_reset_transport_header(skb);
 
3169	skb_reset_mac_len(skb);
3170
3171	pt_prev = NULL;
3172
3173	rcu_read_lock();
3174
3175another_round:
 
3176
3177	__this_cpu_inc(softnet_data.processed);
3178
3179	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3180		skb = vlan_untag(skb);
 
3181		if (unlikely(!skb))
3182			goto out;
3183	}
3184
3185#ifdef CONFIG_NET_CLS_ACT
3186	if (skb->tc_verd & TC_NCLS) {
3187		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3188		goto ncls;
3189	}
3190#endif
3191
 
 
 
3192	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3193		if (!ptype->dev || ptype->dev == skb->dev) {
3194			if (pt_prev)
3195				ret = deliver_skb(skb, pt_prev, orig_dev);
3196			pt_prev = ptype;
3197		}
 
 
 
 
3198	}
3199
 
 
 
 
 
 
 
 
 
 
 
3200#ifdef CONFIG_NET_CLS_ACT
3201	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3202	if (!skb)
3203		goto out;
3204ncls:
3205#endif
 
 
3206
3207	rx_handler = rcu_dereference(skb->dev->rx_handler);
3208	if (vlan_tx_tag_present(skb)) {
3209		if (pt_prev) {
3210			ret = deliver_skb(skb, pt_prev, orig_dev);
3211			pt_prev = NULL;
3212		}
3213		if (vlan_do_receive(&skb, !rx_handler))
3214			goto another_round;
3215		else if (unlikely(!skb))
3216			goto out;
3217	}
3218
 
3219	if (rx_handler) {
3220		if (pt_prev) {
3221			ret = deliver_skb(skb, pt_prev, orig_dev);
3222			pt_prev = NULL;
3223		}
3224		switch (rx_handler(&skb)) {
3225		case RX_HANDLER_CONSUMED:
 
3226			goto out;
3227		case RX_HANDLER_ANOTHER:
3228			goto another_round;
3229		case RX_HANDLER_EXACT:
3230			deliver_exact = true;
3231		case RX_HANDLER_PASS:
3232			break;
3233		default:
3234			BUG();
3235		}
3236	}
3237
 
 
 
 
 
 
 
 
 
 
 
 
3238	/* deliver only exact match when indicated */
3239	null_or_dev = deliver_exact ? skb->dev : NULL;
 
 
 
 
3240
3241	type = skb->protocol;
3242	list_for_each_entry_rcu(ptype,
3243			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3244		if (ptype->type == type &&
3245		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3246		     ptype->dev == orig_dev)) {
3247			if (pt_prev)
3248				ret = deliver_skb(skb, pt_prev, orig_dev);
3249			pt_prev = ptype;
3250		}
3251	}
3252
3253	if (pt_prev) {
3254		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 
 
 
3255	} else {
3256		atomic_long_inc(&skb->dev->rx_dropped);
 
 
 
 
3257		kfree_skb(skb);
3258		/* Jamal, now you will not able to escape explaining
3259		 * me how you were going to use this. :-)
3260		 */
3261		ret = NET_RX_DROP;
3262	}
3263
3264out:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3265	rcu_read_unlock();
3266	return ret;
3267}
3268
3269/**
3270 *	netif_receive_skb - process receive buffer from network
3271 *	@skb: buffer to process
3272 *
3273 *	netif_receive_skb() is the main receive data processing function.
3274 *	It always succeeds. The buffer may be dropped during processing
3275 *	for congestion control or by the protocol layers.
3276 *
3277 *	This function may only be called from softirq context and interrupts
3278 *	should be enabled.
3279 *
3280 *	Return values (usually ignored):
3281 *	NET_RX_SUCCESS: no congestion
3282 *	NET_RX_DROP: packet was dropped
3283 */
3284int netif_receive_skb(struct sk_buff *skb)
3285{
3286	net_timestamp_check(netdev_tstamp_prequeue, skb);
3287
3288	if (skb_defer_rx_timestamp(skb))
3289		return NET_RX_SUCCESS;
3290
3291#ifdef CONFIG_RPS
3292	if (static_key_false(&rps_needed)) {
3293		struct rps_dev_flow voidflow, *rflow = &voidflow;
3294		int cpu, ret;
3295
3296		rcu_read_lock();
3297
3298		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3299
3300		if (cpu >= 0) {
3301			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3302			rcu_read_unlock();
3303			return ret;
3304		}
3305		rcu_read_unlock();
3306	}
3307#endif
3308	return __netif_receive_skb(skb);
3309}
3310EXPORT_SYMBOL(netif_receive_skb);
3311
3312/* Network device is going away, flush any packets still pending
3313 * Called with irqs disabled.
3314 */
3315static void flush_backlog(void *arg)
3316{
3317	struct net_device *dev = arg;
3318	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3319	struct sk_buff *skb, *tmp;
3320
3321	rps_lock(sd);
3322	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3323		if (skb->dev == dev) {
3324			__skb_unlink(skb, &sd->input_pkt_queue);
3325			kfree_skb(skb);
3326			input_queue_head_incr(sd);
3327		}
3328	}
3329	rps_unlock(sd);
3330
3331	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3332		if (skb->dev == dev) {
3333			__skb_unlink(skb, &sd->process_queue);
3334			kfree_skb(skb);
3335			input_queue_head_incr(sd);
3336		}
3337	}
3338}
3339
3340static int napi_gro_complete(struct sk_buff *skb)
3341{
3342	struct packet_type *ptype;
3343	__be16 type = skb->protocol;
3344	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3345	int err = -ENOENT;
3346
 
 
3347	if (NAPI_GRO_CB(skb)->count == 1) {
3348		skb_shinfo(skb)->gso_size = 0;
3349		goto out;
3350	}
3351
3352	rcu_read_lock();
3353	list_for_each_entry_rcu(ptype, head, list) {
3354		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3355			continue;
3356
3357		err = ptype->gro_complete(skb);
3358		break;
3359	}
3360	rcu_read_unlock();
3361
3362	if (err) {
3363		WARN_ON(&ptype->list == head);
3364		kfree_skb(skb);
3365		return NET_RX_SUCCESS;
3366	}
3367
3368out:
3369	return netif_receive_skb(skb);
3370}
3371
3372inline void napi_gro_flush(struct napi_struct *napi)
 
 
 
 
3373{
3374	struct sk_buff *skb, *next;
 
 
 
 
 
 
3375
3376	for (skb = napi->gro_list; skb; skb = next) {
3377		next = skb->next;
3378		skb->next = NULL;
 
 
 
 
 
3379		napi_gro_complete(skb);
 
3380	}
3381
3382	napi->gro_count = 0;
3383	napi->gro_list = NULL;
3384}
3385EXPORT_SYMBOL(napi_gro_flush);
3386
3387enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3388{
3389	struct sk_buff **pp = NULL;
3390	struct packet_type *ptype;
3391	__be16 type = skb->protocol;
3392	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3393	int same_flow;
3394	int mac_len;
3395	enum gro_result ret;
 
3396
3397	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3398		goto normal;
3399
3400	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3401		goto normal;
3402
 
 
3403	rcu_read_lock();
3404	list_for_each_entry_rcu(ptype, head, list) {
3405		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3406			continue;
3407
3408		skb_set_network_header(skb, skb_gro_offset(skb));
3409		mac_len = skb->network_header - skb->mac_header;
3410		skb->mac_len = mac_len;
3411		NAPI_GRO_CB(skb)->same_flow = 0;
3412		NAPI_GRO_CB(skb)->flush = 0;
3413		NAPI_GRO_CB(skb)->free = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3414
3415		pp = ptype->gro_receive(&napi->gro_list, skb);
3416		break;
3417	}
3418	rcu_read_unlock();
3419
3420	if (&ptype->list == head)
3421		goto normal;
3422
3423	same_flow = NAPI_GRO_CB(skb)->same_flow;
3424	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3425
3426	if (pp) {
3427		struct sk_buff *nskb = *pp;
3428
3429		*pp = nskb->next;
3430		nskb->next = NULL;
3431		napi_gro_complete(nskb);
3432		napi->gro_count--;
3433	}
3434
3435	if (same_flow)
3436		goto ok;
3437
3438	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3439		goto normal;
3440
3441	napi->gro_count++;
 
 
 
 
 
 
 
 
 
 
 
 
 
3442	NAPI_GRO_CB(skb)->count = 1;
 
 
3443	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3444	skb->next = napi->gro_list;
3445	napi->gro_list = skb;
3446	ret = GRO_HELD;
3447
3448pull:
3449	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3450		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3451
3452		BUG_ON(skb->end - skb->tail < grow);
3453
3454		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3455
3456		skb->tail += grow;
3457		skb->data_len -= grow;
3458
3459		skb_shinfo(skb)->frags[0].page_offset += grow;
3460		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3461
3462		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3463			skb_frag_unref(skb, 0);
3464			memmove(skb_shinfo(skb)->frags,
3465				skb_shinfo(skb)->frags + 1,
3466				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3467		}
3468	}
3469
3470ok:
3471	return ret;
3472
3473normal:
3474	ret = GRO_NORMAL;
3475	goto pull;
3476}
3477EXPORT_SYMBOL(dev_gro_receive);
3478
3479static inline gro_result_t
3480__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3481{
3482	struct sk_buff *p;
3483	unsigned int maclen = skb->dev->hard_header_len;
 
 
 
 
 
 
 
 
 
3484
3485	for (p = napi->gro_list; p; p = p->next) {
3486		unsigned long diffs;
 
 
3487
3488		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3489		diffs |= p->vlan_tci ^ skb->vlan_tci;
3490		if (maclen == ETH_HLEN)
3491			diffs |= compare_ether_header(skb_mac_header(p),
3492						      skb_gro_mac_header(skb));
3493		else if (!diffs)
3494			diffs = memcmp(skb_mac_header(p),
3495				       skb_gro_mac_header(skb),
3496				       maclen);
3497		NAPI_GRO_CB(p)->same_flow = !diffs;
3498		NAPI_GRO_CB(p)->flush = 0;
3499	}
3500
3501	return dev_gro_receive(napi, skb);
3502}
 
3503
3504gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3505{
3506	switch (ret) {
3507	case GRO_NORMAL:
3508		if (netif_receive_skb(skb))
3509			ret = GRO_DROP;
3510		break;
3511
3512	case GRO_DROP:
3513		kfree_skb(skb);
3514		break;
3515
3516	case GRO_MERGED_FREE:
3517		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 
3518			kmem_cache_free(skbuff_head_cache, skb);
3519		else
3520			__kfree_skb(skb);
 
3521		break;
3522
3523	case GRO_HELD:
3524	case GRO_MERGED:
3525		break;
3526	}
3527
3528	return ret;
3529}
3530EXPORT_SYMBOL(napi_skb_finish);
3531
3532void skb_gro_reset_offset(struct sk_buff *skb)
3533{
3534	NAPI_GRO_CB(skb)->data_offset = 0;
3535	NAPI_GRO_CB(skb)->frag0 = NULL;
3536	NAPI_GRO_CB(skb)->frag0_len = 0;
3537
3538	if (skb->mac_header == skb->tail &&
3539	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3540		NAPI_GRO_CB(skb)->frag0 =
3541			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3542		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3543	}
3544}
3545EXPORT_SYMBOL(skb_gro_reset_offset);
3546
3547gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3548{
3549	skb_gro_reset_offset(skb);
3550
3551	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3552}
3553EXPORT_SYMBOL(napi_gro_receive);
3554
3555static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3556{
 
 
 
 
3557	__skb_pull(skb, skb_headlen(skb));
3558	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3559	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3560	skb->vlan_tci = 0;
3561	skb->dev = napi->dev;
3562	skb->skb_iif = 0;
 
 
 
3563
3564	napi->skb = skb;
3565}
3566
3567struct sk_buff *napi_get_frags(struct napi_struct *napi)
3568{
3569	struct sk_buff *skb = napi->skb;
3570
3571	if (!skb) {
3572		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3573		if (skb)
3574			napi->skb = skb;
 
 
3575	}
3576	return skb;
3577}
3578EXPORT_SYMBOL(napi_get_frags);
3579
3580gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3581			       gro_result_t ret)
 
3582{
3583	switch (ret) {
3584	case GRO_NORMAL:
3585	case GRO_HELD:
 
3586		skb->protocol = eth_type_trans(skb, skb->dev);
3587
3588		if (ret == GRO_HELD)
3589			skb_gro_pull(skb, -ETH_HLEN);
3590		else if (netif_receive_skb(skb))
3591			ret = GRO_DROP;
3592		break;
3593
3594	case GRO_DROP:
3595	case GRO_MERGED_FREE:
3596		napi_reuse_skb(napi, skb);
3597		break;
3598
3599	case GRO_MERGED:
3600		break;
3601	}
3602
3603	return ret;
3604}
3605EXPORT_SYMBOL(napi_frags_finish);
3606
 
 
 
 
3607static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3608{
3609	struct sk_buff *skb = napi->skb;
3610	struct ethhdr *eth;
3611	unsigned int hlen;
3612	unsigned int off;
3613
3614	napi->skb = NULL;
3615
3616	skb_reset_mac_header(skb);
3617	skb_gro_reset_offset(skb);
3618
3619	off = skb_gro_offset(skb);
3620	hlen = off + sizeof(*eth);
3621	eth = skb_gro_header_fast(skb, off);
3622	if (skb_gro_header_hard(skb, hlen)) {
3623		eth = skb_gro_header_slow(skb, hlen, off);
3624		if (unlikely(!eth)) {
3625			napi_reuse_skb(napi, skb);
3626			skb = NULL;
3627			goto out;
3628		}
 
 
 
 
3629	}
3630
3631	skb_gro_pull(skb, sizeof(*eth));
3632
3633	/*
3634	 * This works because the only protocols we care about don't require
3635	 * special handling.  We'll fix it up properly at the end.
 
3636	 */
3637	skb->protocol = eth->h_proto;
3638
3639out:
3640	return skb;
3641}
3642
3643gro_result_t napi_gro_frags(struct napi_struct *napi)
3644{
3645	struct sk_buff *skb = napi_frags_skb(napi);
3646
3647	if (!skb)
3648		return GRO_DROP;
3649
3650	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
 
 
3651}
3652EXPORT_SYMBOL(napi_gro_frags);
3653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;
	struct softnet_data *following;

	if (!pending) {
		local_irq_enable();
		return;
	}

	/* Take ownership of the whole list before re-enabling interrupts. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	for (; pending; pending = following) {
		/* Fetch the link before the IPI for this entry is issued. */
		following = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			__smp_call_function_single(pending->cpu,
						   &pending->csd, 0);
	}
#else
	local_irq_enable();
#endif
}
3681
 
 
 
 
 
 
 
 
 
3682static int process_backlog(struct napi_struct *napi, int quota)
3683{
3684	int work = 0;
3685	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3686
3687#ifdef CONFIG_RPS
3688	/* Check if we have pending ipi, its better to send them now,
3689	 * not waiting net_rx_action() end.
3690	 */
3691	if (sd->rps_ipi_list) {
3692		local_irq_disable();
3693		net_rps_action_and_irq_enable(sd);
3694	}
3695#endif
3696	napi->weight = weight_p;
3697	local_irq_disable();
3698	while (work < quota) {
3699		struct sk_buff *skb;
3700		unsigned int qlen;
3701
3702		while ((skb = __skb_dequeue(&sd->process_queue))) {
 
3703			local_irq_enable();
3704			__netif_receive_skb(skb);
 
3705			local_irq_disable();
3706			input_queue_head_incr(sd);
3707			if (++work >= quota) {
3708				local_irq_enable();
3709				return work;
3710			}
3711		}
3712
3713		rps_lock(sd);
3714		qlen = skb_queue_len(&sd->input_pkt_queue);
3715		if (qlen)
3716			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3717						   &sd->process_queue);
3718
3719		if (qlen < quota - work) {
3720			/*
3721			 * Inline a custom version of __napi_complete().
3722			 * only current cpu owns and manipulates this napi,
3723			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3724			 * we can use a plain write instead of clear_bit(),
 
3725			 * and we dont need an smp_mb() memory barrier.
3726			 */
3727			list_del(&napi->poll_list);
3728			napi->state = 0;
 
3729
3730			quota = work + qlen;
3731		}
 
 
 
3732		rps_unlock(sd);
3733	}
3734	local_irq_enable();
3735
3736	return work;
3737}
3738
3739/**
3740 * __napi_schedule - schedule for receive
3741 * @n: entry to schedule
3742 *
3743 * The entry's receive function will be scheduled to run
 
3744 */
3745void __napi_schedule(struct napi_struct *n)
3746{
3747	unsigned long flags;
3748
3749	local_irq_save(flags);
3750	____napi_schedule(&__get_cpu_var(softnet_data), n);
3751	local_irq_restore(flags);
3752}
3753EXPORT_SYMBOL(__napi_schedule);
3754
 
 
 
 
 
 
 
 
 
 
 
 
3755void __napi_complete(struct napi_struct *n)
3756{
3757	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3758	BUG_ON(n->gro_list);
3759
3760	list_del(&n->poll_list);
3761	smp_mb__before_clear_bit();
3762	clear_bit(NAPI_STATE_SCHED, &n->state);
3763}
3764EXPORT_SYMBOL(__napi_complete);
3765
3766void napi_complete(struct napi_struct *n)
3767{
3768	unsigned long flags;
3769
3770	/*
3771	 * don't let napi dequeue from the cpu poll list
3772	 * just in case its running on a different cpu
3773	 */
3774	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3775		return;
3776
3777	napi_gro_flush(n);
3778	local_irq_save(flags);
3779	__napi_complete(n);
3780	local_irq_restore(flags);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3781}
3782EXPORT_SYMBOL(napi_complete);
3783
3784void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3785		    int (*poll)(struct napi_struct *, int), int weight)
3786{
3787	INIT_LIST_HEAD(&napi->poll_list);
 
 
3788	napi->gro_count = 0;
3789	napi->gro_list = NULL;
3790	napi->skb = NULL;
3791	napi->poll = poll;
 
 
 
3792	napi->weight = weight;
3793	list_add(&napi->dev_list, &dev->napi_list);
3794	napi->dev = dev;
3795#ifdef CONFIG_NETPOLL
3796	spin_lock_init(&napi->poll_lock);
3797	napi->poll_owner = -1;
3798#endif
3799	set_bit(NAPI_STATE_SCHED, &napi->state);
 
3800}
3801EXPORT_SYMBOL(netif_napi_add);
3802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3803void netif_napi_del(struct napi_struct *napi)
3804{
3805	struct sk_buff *skb, *next;
3806
 
3807	list_del_init(&napi->dev_list);
3808	napi_free_frags(napi);
3809
3810	for (skb = napi->gro_list; skb; skb = next) {
3811		next = skb->next;
3812		skb->next = NULL;
3813		kfree_skb(skb);
3814	}
3815
3816	napi->gro_list = NULL;
3817	napi->gro_count = 0;
3818}
3819EXPORT_SYMBOL(netif_napi_del);
3820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3821static void net_rx_action(struct softirq_action *h)
3822{
3823	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3824	unsigned long time_limit = jiffies + 2;
3825	int budget = netdev_budget;
3826	void *have;
 
3827
3828	local_irq_disable();
 
 
3829
3830	while (!list_empty(&sd->poll_list)) {
3831		struct napi_struct *n;
3832		int work, weight;
3833
3834		/* If softirq window is exhuasted then punt.
 
 
 
 
 
 
 
 
 
3835		 * Allow this to run for 2 jiffies since which will allow
3836		 * an average latency of 1.5/HZ.
3837		 */
3838		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3839			goto softnet_break;
 
 
 
 
3840
3841		local_irq_enable();
 
3842
3843		/* Even though interrupts have been re-enabled, this
3844		 * access is safe because interrupts can only add new
3845		 * entries to the tail of this list, and only ->poll()
3846		 * calls can remove this head entry from the list.
3847		 */
3848		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3849
3850		have = netpoll_poll_lock(n);
 
3851
3852		weight = n->weight;
 
3853
3854		/* This NAPI_STATE_SCHED test is for avoiding a race
3855		 * with netpoll's poll_napi().  Only the entity which
3856		 * obtains the lock and sees NAPI_STATE_SCHED set will
3857		 * actually make the ->poll() call.  Therefore we avoid
3858		 * accidentally calling ->poll() when NAPI is not scheduled.
3859		 */
3860		work = 0;
3861		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3862			work = n->poll(n, weight);
3863			trace_napi_poll(n);
3864		}
3865
3866		WARN_ON_ONCE(work > weight);
 
3867
3868		budget -= work;
 
3869
3870		local_irq_disable();
 
 
3871
3872		/* Drivers must not modify the NAPI state if they
3873		 * consume the entire weight.  In such cases this code
3874		 * still "owns" the NAPI instance and therefore can
3875		 * move the instance around on the list at-will.
3876		 */
3877		if (unlikely(work == weight)) {
3878			if (unlikely(napi_disable_pending(n))) {
3879				local_irq_enable();
3880				napi_complete(n);
3881				local_irq_disable();
3882			} else
3883				list_move_tail(&n->poll_list, &sd->poll_list);
3884		}
3885
3886		netpoll_poll_unlock(have);
 
 
3887	}
3888out:
3889	net_rps_action_and_irq_enable(sd);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3890
3891#ifdef CONFIG_NET_DMA
3892	/*
3893	 * There may not be any more sk_buffs coming right now, so push
3894	 * any pending DMA copies to hardware
3895	 */
3896	dma_issue_pending_all();
3897#endif
3898
3899	return;
 
 
 
 
 
 
 
 
 
3900
3901softnet_break:
3902	sd->time_squeeze++;
3903	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3904	goto out;
3905}
3906
3907static gifconf_func_t *gifconf_list[NPROTO];
3908
3909/**
3910 *	register_gifconf	-	register a SIOCGIF handler
3911 *	@family: Address family
3912 *	@gifconf: Function handler
3913 *
3914 *	Register protocol dependent address dumping routines. The handler
3915 *	that is passed must not be freed or reused until it has been replaced
3916 *	by another handler.
3917 */
3918int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3919{
3920	if (family >= NPROTO)
3921		return -EINVAL;
3922	gifconf_list[family] = gifconf;
3923	return 0;
 
 
 
 
 
 
 
 
3924}
3925EXPORT_SYMBOL(register_gifconf);
 
 
 
 
3926
 
3927
3928/*
3929 *	Map an interface index to its name (SIOCGIFNAME)
3930 */
3931
3932/*
3933 *	We need this ioctl for efficient implementation of the
3934 *	if_indextoname() function required by the IPv6 API.  Without
3935 *	it, we would have to search all the interfaces to find a
3936 *	match.  --pb
 
 
3937 */
3938
3939static int dev_ifname(struct net *net, struct ifreq __user *arg)
3940{
3941	struct net_device *dev;
3942	struct ifreq ifr;
3943
3944	/*
3945	 *	Fetch the caller's info block.
3946	 */
3947
3948	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3949		return -EFAULT;
3950
3951	rcu_read_lock();
3952	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3953	if (!dev) {
3954		rcu_read_unlock();
3955		return -ENODEV;
3956	}
3957
3958	strcpy(ifr.ifr_name, dev->name);
3959	rcu_read_unlock();
3960
3961	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3962		return -EFAULT;
3963	return 0;
3964}
 
3965
3966/*
3967 *	Perform a SIOCGIFCONF call. This structure will change
3968 *	size eventually, and there is nothing I can do about it.
3969 *	Thus we will need a 'compatibility mode'.
 
 
 
3970 */
3971
3972static int dev_ifconf(struct net *net, char __user *arg)
3973{
3974	struct ifconf ifc;
3975	struct net_device *dev;
3976	char __user *pos;
3977	int len;
3978	int total;
3979	int i;
3980
3981	/*
3982	 *	Fetch the caller's info block.
3983	 */
3984
3985	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3986		return -EFAULT;
3987
3988	pos = ifc.ifc_buf;
3989	len = ifc.ifc_len;
3990
3991	/*
3992	 *	Loop over the interfaces, and write an info block for each.
3993	 */
3994
3995	total = 0;
3996	for_each_netdev(net, dev) {
3997		for (i = 0; i < NPROTO; i++) {
3998			if (gifconf_list[i]) {
3999				int done;
4000				if (!pos)
4001					done = gifconf_list[i](dev, NULL, 0);
4002				else
4003					done = gifconf_list[i](dev, pos + total,
4004							       len - total);
4005				if (done < 0)
4006					return -EFAULT;
4007				total += done;
4008			}
4009		}
4010	}
4011
4012	/*
4013	 *	All done.  Write the updated control block back to the caller.
4014	 */
4015	ifc.ifc_len = total;
 
 
 
 
 
 
 
 
 
 
 
4016
4017	/*
4018	 * 	Both BSD and Solaris return 0 here, so we do too.
4019	 */
4020	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4021}
4022
4023#ifdef CONFIG_PROC_FS
 
4024
/*
 * A /proc/net/dev position packs a name-hash bucket number above
 * BUCKET_SPACE bits of in-bucket offset.
 */
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)

/* Bucket number held in position x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* In-bucket offset held in position x. */
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
/* Build a position from bucket b and offset o. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
4030
4031static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
 
 
 
 
 
 
 
 
 
 
 
4032{
4033	struct net *net = seq_file_net(seq);
4034	struct net_device *dev;
4035	struct hlist_node *p;
4036	struct hlist_head *h;
4037	unsigned int count = 0, offset = get_offset(*pos);
4038
4039	h = &net->dev_name_head[get_bucket(*pos)];
4040	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4041		if (++count == offset)
4042			return dev;
4043	}
4044
4045	return NULL;
4046}
4047
4048static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4049{
4050	struct net_device *dev;
4051	unsigned int bucket;
4052
4053	do {
4054		dev = dev_from_same_bucket(seq, pos);
4055		if (dev)
4056			return dev;
4057
4058		bucket = get_bucket(*pos) + 1;
4059		*pos = set_bucket_offset(bucket, 1);
4060	} while (bucket < NETDEV_HASHENTRIES);
4061
4062	return NULL;
4063}
 
4064
4065/*
4066 *	This is invoked by the /proc filesystem handler to display a device
4067 *	in detail.
 
 
 
 
 
 
 
4068 */
4069void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4070	__acquires(RCU)
4071{
4072	rcu_read_lock();
4073	if (!*pos)
4074		return SEQ_START_TOKEN;
4075
4076	if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4077		return NULL;
4078
4079	return dev_from_bucket(seq, pos);
 
 
4080}
 
4081
4082void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
 
 
 
 
 
 
 
4083{
4084	++*pos;
4085	return dev_from_bucket(seq, pos);
 
 
 
 
 
4086}
 
4087
4088void dev_seq_stop(struct seq_file *seq, void *v)
4089	__releases(RCU)
 
 
 
 
 
 
4090{
4091	rcu_read_unlock();
 
 
 
 
 
 
4092}
 
4093
4094static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
 
4095{
4096	struct rtnl_link_stats64 temp;
4097	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
 
 
4098
4099	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4100		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4101		   dev->name, stats->rx_bytes, stats->rx_packets,
4102		   stats->rx_errors,
4103		   stats->rx_dropped + stats->rx_missed_errors,
4104		   stats->rx_fifo_errors,
4105		   stats->rx_length_errors + stats->rx_over_errors +
4106		    stats->rx_crc_errors + stats->rx_frame_errors,
4107		   stats->rx_compressed, stats->multicast,
4108		   stats->tx_bytes, stats->tx_packets,
4109		   stats->tx_errors, stats->tx_dropped,
4110		   stats->tx_fifo_errors, stats->collisions,
4111		   stats->tx_carrier_errors +
4112		    stats->tx_aborted_errors +
4113		    stats->tx_window_errors +
4114		    stats->tx_heartbeat_errors,
4115		   stats->tx_compressed);
4116}
4117
4118/*
4119 *	Called from the PROCfs module. This now uses the new arbitrary sized
4120 *	/proc/net interface to create /proc/net/dev
4121 */
4122static int dev_seq_show(struct seq_file *seq, void *v)
4123{
4124	if (v == SEQ_START_TOKEN)
4125		seq_puts(seq, "Inter-|   Receive                            "
4126			      "                    |  Transmit\n"
4127			      " face |bytes    packets errs drop fifo frame "
4128			      "compressed multicast|bytes    packets errs "
4129			      "drop fifo colls carrier compressed\n");
4130	else
4131		dev_seq_printf_stats(seq, v);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4132	return 0;
 
 
 
 
 
 
 
 
 
4133}
4134
4135static struct softnet_data *softnet_get_online(loff_t *pos)
 
 
4136{
4137	struct softnet_data *sd = NULL;
 
 
4138
4139	while (*pos < nr_cpu_ids)
4140		if (cpu_online(*pos)) {
4141			sd = &per_cpu(softnet_data, *pos);
4142			break;
4143		} else
4144			++*pos;
4145	return sd;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4146}
4147
4148static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
 
4149{
4150	return softnet_get_online(pos);
 
 
 
4151}
4152
4153static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
 
4154{
4155	++*pos;
4156	return softnet_get_online(pos);
4157}
4158
/* seq_file ->stop handler: intentionally empty, nothing is done here. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
	return;
}
4162
4163static int softnet_seq_show(struct seq_file *seq, void *v)
 
 
4164{
4165	struct softnet_data *sd = v;
 
 
 
 
 
 
 
 
 
 
 
 
4166
4167	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4168		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4169		   0, 0, 0, 0, /* was fastroute */
4170		   sd->cpu_collision, sd->received_rps);
4171	return 0;
4172}
4173
4174static const struct seq_operations dev_seq_ops = {
4175	.start = dev_seq_start,
4176	.next  = dev_seq_next,
4177	.stop  = dev_seq_stop,
4178	.show  = dev_seq_show,
4179};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4180
4181static int dev_seq_open(struct inode *inode, struct file *file)
4182{
4183	return seq_open_net(inode, file, &dev_seq_ops,
4184			    sizeof(struct seq_net_private));
4185}
4186
4187static const struct file_operations dev_seq_fops = {
4188	.owner	 = THIS_MODULE,
4189	.open    = dev_seq_open,
4190	.read    = seq_read,
4191	.llseek  = seq_lseek,
4192	.release = seq_release_net,
4193};
4194
4195static const struct seq_operations softnet_seq_ops = {
4196	.start = softnet_seq_start,
4197	.next  = softnet_seq_next,
4198	.stop  = softnet_seq_stop,
4199	.show  = softnet_seq_show,
4200};
4201
4202static int softnet_seq_open(struct inode *inode, struct file *file)
4203{
4204	return seq_open(file, &softnet_seq_ops);
4205}
 
 
 
4206
4207static const struct file_operations softnet_seq_fops = {
4208	.owner	 = THIS_MODULE,
4209	.open    = softnet_seq_open,
4210	.read    = seq_read,
4211	.llseek  = seq_lseek,
4212	.release = seq_release,
4213};
4214
4215static void *ptype_get_idx(loff_t pos)
4216{
4217	struct packet_type *pt = NULL;
4218	loff_t i = 0;
4219	int t;
4220
4221	list_for_each_entry_rcu(pt, &ptype_all, list) {
4222		if (i == pos)
4223			return pt;
4224		++i;
4225	}
4226
4227	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4228		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4229			if (i == pos)
4230				return pt;
4231			++i;
4232		}
 
 
4233	}
4234	return NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4235}
 
4236
4237static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4238	__acquires(RCU)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4239{
4240	rcu_read_lock();
4241	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4242}
 
4243
4244static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
 
 
 
 
 
 
 
4245{
4246	struct packet_type *pt;
4247	struct list_head *nxt;
4248	int hash;
4249
4250	++*pos;
4251	if (v == SEQ_START_TOKEN)
4252		return ptype_get_idx(0);
4253
4254	pt = v;
4255	nxt = pt->list.next;
4256	if (pt->type == htons(ETH_P_ALL)) {
4257		if (nxt != &ptype_all)
4258			goto found;
4259		hash = 0;
4260		nxt = ptype_base[0].next;
4261	} else
4262		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 
 
 
 
 
 
 
 
 
 
 
 
4263
4264	while (nxt == &ptype_base[hash]) {
4265		if (++hash >= PTYPE_HASH_SIZE)
4266			return NULL;
4267		nxt = ptype_base[hash].next;
4268	}
4269found:
4270	return list_entry(nxt, struct packet_type, list);
4271}
 
4272
4273static void ptype_seq_stop(struct seq_file *seq, void *v)
4274	__releases(RCU)
 
 
 
 
 
 
 
 
4275{
4276	rcu_read_unlock();
 
 
 
 
 
4277}
 
4278
4279static int ptype_seq_show(struct seq_file *seq, void *v)
4280{
4281	struct packet_type *pt = v;
4282
4283	if (v == SEQ_START_TOKEN)
4284		seq_puts(seq, "Type Device      Function\n");
4285	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4286		if (pt->type == htons(ETH_P_ALL))
4287			seq_puts(seq, "ALL ");
4288		else
4289			seq_printf(seq, "%04x", ntohs(pt->type));
4290
4291		seq_printf(seq, " %-8s %pF\n",
4292			   pt->dev ? pt->dev->name : "", pt->func);
 
 
 
 
 
4293	}
4294
4295	return 0;
 
 
 
 
 
 
 
4296}
4297
4298static const struct seq_operations ptype_seq_ops = {
4299	.start = ptype_seq_start,
4300	.next  = ptype_seq_next,
4301	.stop  = ptype_seq_stop,
4302	.show  = ptype_seq_show,
4303};
4304
4305static int ptype_seq_open(struct inode *inode, struct file *file)
4306{
4307	return seq_open_net(inode, file, &ptype_seq_ops,
4308			sizeof(struct seq_net_private));
4309}
4310
4311static const struct file_operations ptype_seq_fops = {
4312	.owner	 = THIS_MODULE,
4313	.open    = ptype_seq_open,
4314	.read    = seq_read,
4315	.llseek  = seq_lseek,
4316	.release = seq_release_net,
4317};
 
4318
 
 
 
 
 
 
 
 
 
4319
4320static int __net_init dev_proc_net_init(struct net *net)
4321{
4322	int rc = -ENOMEM;
 
 
4323
4324	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4325		goto out;
4326	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4327		goto out_dev;
4328	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4329		goto out_softnet;
 
 
4330
4331	if (wext_proc_init(net))
4332		goto out_ptype;
4333	rc = 0;
4334out:
4335	return rc;
4336out_ptype:
4337	proc_net_remove(net, "ptype");
4338out_softnet:
4339	proc_net_remove(net, "softnet_stat");
4340out_dev:
4341	proc_net_remove(net, "dev");
4342	goto out;
4343}
4344
4345static void __net_exit dev_proc_net_exit(struct net *net)
 
4346{
4347	wext_proc_exit(net);
4348
4349	proc_net_remove(net, "ptype");
4350	proc_net_remove(net, "softnet_stat");
4351	proc_net_remove(net, "dev");
4352}
 
4353
4354static struct pernet_operations __net_initdata dev_proc_ops = {
4355	.init = dev_proc_net_init,
4356	.exit = dev_proc_net_exit,
4357};
4358
4359static int __init dev_proc_init(void)
4360{
4361	return register_pernet_subsys(&dev_proc_ops);
4362}
4363#else
4364#define dev_proc_init() 0
4365#endif	/* CONFIG_PROC_FS */
4366
4367
4368/**
4369 *	netdev_set_master	-	set up master pointer
4370 *	@slave: slave device
4371 *	@master: new master device
4372 *
4373 *	Changes the master device of the slave. Pass %NULL to break the
4374 *	bonding. The caller must hold the RTNL semaphore. On a failure
4375 *	a negative errno code is returned. On success the reference counts
4376 *	are adjusted and the function returns zero.
4377 */
4378int netdev_set_master(struct net_device *slave, struct net_device *master)
4379{
4380	struct net_device *old = slave->master;
 
 
 
4381
4382	ASSERT_RTNL();
4383
4384	if (master) {
4385		if (old)
4386			return -EBUSY;
4387		dev_hold(master);
4388	}
4389
4390	slave->master = master;
 
4391
4392	if (old)
4393		dev_put(old);
4394	return 0;
4395}
4396EXPORT_SYMBOL(netdev_set_master);
4397
4398/**
4399 *	netdev_set_bond_master	-	set up bonding master/slave pair
4400 *	@slave: slave device
4401 *	@master: new master device
4402 *
4403 *	Changes the master device of the slave. Pass %NULL to break the
4404 *	bonding. The caller must hold the RTNL semaphore. On a failure
4405 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4406 *	to the routing socket and the function returns zero.
4407 */
4408int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
 
4409{
4410	int err;
4411
4412	ASSERT_RTNL();
4413
4414	err = netdev_set_master(slave, master);
4415	if (err)
4416		return err;
4417	if (master)
4418		slave->flags |= IFF_SLAVE;
4419	else
4420		slave->flags &= ~IFF_SLAVE;
4421
4422	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4423	return 0;
4424}
4425EXPORT_SYMBOL(netdev_set_bond_master);
4426
4427static void dev_change_rx_flags(struct net_device *dev, int flags)
4428{
4429	const struct net_device_ops *ops = dev->netdev_ops;
4430
4431	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4432		ops->ndo_change_rx_flags(dev, flags);
4433}
4434
4435static int __dev_set_promiscuity(struct net_device *dev, int inc)
4436{
4437	unsigned int old_flags = dev->flags;
4438	uid_t uid;
4439	gid_t gid;
4440
4441	ASSERT_RTNL();
4442
4443	dev->flags |= IFF_PROMISC;
4444	dev->promiscuity += inc;
4445	if (dev->promiscuity == 0) {
4446		/*
4447		 * Avoid overflow.
4448		 * If inc causes overflow, untouch promisc and return error.
4449		 */
4450		if (inc < 0)
4451			dev->flags &= ~IFF_PROMISC;
4452		else {
4453			dev->promiscuity -= inc;
4454			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4455				dev->name);
4456			return -EOVERFLOW;
4457		}
4458	}
4459	if (dev->flags != old_flags) {
4460		pr_info("device %s %s promiscuous mode\n",
4461			dev->name,
4462			dev->flags & IFF_PROMISC ? "entered" : "left");
4463		if (audit_enabled) {
4464			current_uid_gid(&uid, &gid);
4465			audit_log(current->audit_context, GFP_ATOMIC,
4466				AUDIT_ANOM_PROMISCUOUS,
4467				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4468				dev->name, (dev->flags & IFF_PROMISC),
4469				(old_flags & IFF_PROMISC),
4470				audit_get_loginuid(current),
4471				uid, gid,
 
4472				audit_get_sessionid(current));
4473		}
4474
4475		dev_change_rx_flags(dev, IFF_PROMISC);
4476	}
 
 
4477	return 0;
4478}
4479
4480/**
4481 *	dev_set_promiscuity	- update promiscuity count on a device
4482 *	@dev: device
4483 *	@inc: modifier
4484 *
4485 *	Add or remove promiscuity from a device. While the count in the device
4486 *	remains above zero the interface remains promiscuous. Once it hits zero
4487 *	the device reverts back to normal filtering operation. A negative inc
4488 *	value is used to drop promiscuity on the device.
4489 *	Return 0 if successful or a negative errno code on error.
4490 */
4491int dev_set_promiscuity(struct net_device *dev, int inc)
4492{
4493	unsigned int old_flags = dev->flags;
4494	int err;
4495
4496	err = __dev_set_promiscuity(dev, inc);
4497	if (err < 0)
4498		return err;
4499	if (dev->flags != old_flags)
4500		dev_set_rx_mode(dev);
4501	return err;
4502}
4503EXPORT_SYMBOL(dev_set_promiscuity);
4504
4505/**
4506 *	dev_set_allmulti	- update allmulti count on a device
4507 *	@dev: device
4508 *	@inc: modifier
4509 *
4510 *	Add or remove reception of all multicast frames to a device. While the
4511 *	count in the device remains above zero the interface remains listening
4512 *	to all interfaces. Once it hits zero the device reverts back to normal
4513 *	filtering operation. A negative @inc value is used to drop the counter
4514 *	when releasing a resource needing all multicasts.
4515 *	Return 0 if successful or a negative errno code on error.
4516 */
4517
4518int dev_set_allmulti(struct net_device *dev, int inc)
4519{
4520	unsigned int old_flags = dev->flags;
4521
4522	ASSERT_RTNL();
4523
4524	dev->flags |= IFF_ALLMULTI;
4525	dev->allmulti += inc;
4526	if (dev->allmulti == 0) {
4527		/*
4528		 * Avoid overflow.
4529		 * If inc causes overflow, untouch allmulti and return error.
4530		 */
4531		if (inc < 0)
4532			dev->flags &= ~IFF_ALLMULTI;
4533		else {
4534			dev->allmulti -= inc;
4535			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4536				dev->name);
4537			return -EOVERFLOW;
4538		}
4539	}
4540	if (dev->flags ^ old_flags) {
4541		dev_change_rx_flags(dev, IFF_ALLMULTI);
4542		dev_set_rx_mode(dev);
 
 
 
4543	}
4544	return 0;
4545}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4546EXPORT_SYMBOL(dev_set_allmulti);
4547
4548/*
4549 *	Upload unicast and multicast address lists to device and
4550 *	configure RX filtering. When the device doesn't support unicast
4551 *	filtering it is put in promiscuous mode while unicast addresses
4552 *	are present.
4553 */
4554void __dev_set_rx_mode(struct net_device *dev)
4555{
4556	const struct net_device_ops *ops = dev->netdev_ops;
4557
4558	/* dev_open will call this function so the list will stay sane. */
4559	if (!(dev->flags&IFF_UP))
4560		return;
4561
4562	if (!netif_device_present(dev))
4563		return;
4564
4565	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4566		/* Unicast addresses changes may only happen under the rtnl,
4567		 * therefore calling __dev_set_promiscuity here is safe.
4568		 */
4569		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4570			__dev_set_promiscuity(dev, 1);
4571			dev->uc_promisc = true;
4572		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4573			__dev_set_promiscuity(dev, -1);
4574			dev->uc_promisc = false;
4575		}
4576	}
4577
4578	if (ops->ndo_set_rx_mode)
4579		ops->ndo_set_rx_mode(dev);
4580}
4581
/* Run __dev_set_rx_mode() on @dev with the address list lock (bh) held. */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4588
4589/**
4590 *	dev_get_flags - get flags reported to userspace
4591 *	@dev: device
4592 *
4593 *	Get the combination of flag bits exported through APIs to userspace.
4594 */
4595unsigned int dev_get_flags(const struct net_device *dev)
4596{
4597	unsigned int flags;
4598
4599	flags = (dev->flags & ~(IFF_PROMISC |
4600				IFF_ALLMULTI |
4601				IFF_RUNNING |
4602				IFF_LOWER_UP |
4603				IFF_DORMANT)) |
4604		(dev->gflags & (IFF_PROMISC |
4605				IFF_ALLMULTI));
4606
4607	if (netif_running(dev)) {
4608		if (netif_oper_up(dev))
4609			flags |= IFF_RUNNING;
4610		if (netif_carrier_ok(dev))
4611			flags |= IFF_LOWER_UP;
4612		if (netif_dormant(dev))
4613			flags |= IFF_DORMANT;
4614	}
4615
4616	return flags;
4617}
4618EXPORT_SYMBOL(dev_get_flags);
4619
4620int __dev_change_flags(struct net_device *dev, unsigned int flags)
4621{
4622	unsigned int old_flags = dev->flags;
4623	int ret;
4624
4625	ASSERT_RTNL();
4626
4627	/*
4628	 *	Set the flags on our device.
4629	 */
4630
4631	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4632			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4633			       IFF_AUTOMEDIA)) |
4634		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4635				    IFF_ALLMULTI));
4636
4637	/*
4638	 *	Load in the correct multicast list now the flags have changed.
4639	 */
4640
4641	if ((old_flags ^ flags) & IFF_MULTICAST)
4642		dev_change_rx_flags(dev, IFF_MULTICAST);
4643
4644	dev_set_rx_mode(dev);
4645
4646	/*
4647	 *	Have we downed the interface. We handle IFF_UP ourselves
4648	 *	according to user attempts to set it, rather than blindly
4649	 *	setting it.
4650	 */
4651
4652	ret = 0;
4653	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4654		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4655
4656		if (!ret)
4657			dev_set_rx_mode(dev);
4658	}
4659
4660	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4661		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 
4662
4663		dev->gflags ^= IFF_PROMISC;
4664		dev_set_promiscuity(dev, inc);
 
 
 
4665	}
4666
4667	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4668	   is important. Some (broken) drivers set IFF_PROMISC, when
4669	   IFF_ALLMULTI is requested not asking us and not reporting.
4670	 */
4671	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4672		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4673
4674		dev->gflags ^= IFF_ALLMULTI;
4675		dev_set_allmulti(dev, inc);
4676	}
4677
4678	return ret;
4679}
4680
4681void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
 
4682{
4683	unsigned int changes = dev->flags ^ old_flags;
4684
 
 
 
4685	if (changes & IFF_UP) {
4686		if (dev->flags & IFF_UP)
4687			call_netdevice_notifiers(NETDEV_UP, dev);
4688		else
4689			call_netdevice_notifiers(NETDEV_DOWN, dev);
4690	}
4691
4692	if (dev->flags & IFF_UP &&
4693	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4694		call_netdevice_notifiers(NETDEV_CHANGE, dev);
 
 
 
 
 
4695}
4696
4697/**
4698 *	dev_change_flags - change device settings
4699 *	@dev: device
4700 *	@flags: device state flags
4701 *
4702 *	Change settings on device based state flags. The flags are
4703 *	in the userspace exported format.
4704 */
4705int dev_change_flags(struct net_device *dev, unsigned int flags)
4706{
4707	int ret;
4708	unsigned int changes, old_flags = dev->flags;
4709
4710	ret = __dev_change_flags(dev, flags);
4711	if (ret < 0)
4712		return ret;
4713
4714	changes = old_flags ^ dev->flags;
4715	if (changes)
4716		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4717
4718	__dev_notify_flags(dev, old_flags);
4719	return ret;
4720}
4721EXPORT_SYMBOL(dev_change_flags);
4722
 
 
 
 
 
 
 
 
 
 
 
4723/**
4724 *	dev_set_mtu - Change maximum transfer unit
4725 *	@dev: device
4726 *	@new_mtu: new transfer unit
4727 *
4728 *	Change the maximum transfer size of the network device.
4729 */
4730int dev_set_mtu(struct net_device *dev, int new_mtu)
4731{
4732	const struct net_device_ops *ops = dev->netdev_ops;
4733	int err;
4734
4735	if (new_mtu == dev->mtu)
4736		return 0;
4737
4738	/*	MTU must be positive.	 */
4739	if (new_mtu < 0)
4740		return -EINVAL;
4741
4742	if (!netif_device_present(dev))
4743		return -ENODEV;
4744
4745	err = 0;
4746	if (ops->ndo_change_mtu)
4747		err = ops->ndo_change_mtu(dev, new_mtu);
4748	else
4749		dev->mtu = new_mtu;
 
 
4750
4751	if (!err && dev->flags & IFF_UP)
4752		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
 
 
 
 
 
 
 
 
4753	return err;
4754}
4755EXPORT_SYMBOL(dev_set_mtu);
4756
4757/**
4758 *	dev_set_group - Change group this device belongs to
4759 *	@dev: device
4760 *	@new_group: group this device should belong to
4761 */
4762void dev_set_group(struct net_device *dev, int new_group)
4763{
4764	dev->group = new_group;
4765}
4766EXPORT_SYMBOL(dev_set_group);
4767
4768/**
4769 *	dev_set_mac_address - Change Media Access Control Address
4770 *	@dev: device
4771 *	@sa: new address
4772 *
4773 *	Change the hardware (MAC) address of the device
4774 */
4775int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4776{
4777	const struct net_device_ops *ops = dev->netdev_ops;
4778	int err;
4779
4780	if (!ops->ndo_set_mac_address)
4781		return -EOPNOTSUPP;
4782	if (sa->sa_family != dev->type)
4783		return -EINVAL;
4784	if (!netif_device_present(dev))
4785		return -ENODEV;
4786	err = ops->ndo_set_mac_address(dev, sa);
4787	if (!err)
4788		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 
 
4789	add_device_randomness(dev->dev_addr, dev->addr_len);
4790	return err;
4791}
4792EXPORT_SYMBOL(dev_set_mac_address);
4793
4794/*
4795 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
 
 
 
 
4796 */
4797static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4798{
4799	int err;
4800	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4801
4802	if (!dev)
 
 
4803		return -ENODEV;
4804
4805	switch (cmd) {
4806	case SIOCGIFFLAGS:	/* Get interface flags */
4807		ifr->ifr_flags = (short) dev_get_flags(dev);
4808		return 0;
4809
4810	case SIOCGIFMETRIC:	/* Get the metric on the interface
4811				   (currently unused) */
4812		ifr->ifr_metric = 0;
4813		return 0;
4814
4815	case SIOCGIFMTU:	/* Get the MTU of a device */
4816		ifr->ifr_mtu = dev->mtu;
4817		return 0;
4818
4819	case SIOCGIFHWADDR:
4820		if (!dev->addr_len)
4821			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4822		else
4823			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4824			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4825		ifr->ifr_hwaddr.sa_family = dev->type;
4826		return 0;
4827
4828	case SIOCGIFSLAVE:
4829		err = -EINVAL;
4830		break;
4831
4832	case SIOCGIFMAP:
4833		ifr->ifr_map.mem_start = dev->mem_start;
4834		ifr->ifr_map.mem_end   = dev->mem_end;
4835		ifr->ifr_map.base_addr = dev->base_addr;
4836		ifr->ifr_map.irq       = dev->irq;
4837		ifr->ifr_map.dma       = dev->dma;
4838		ifr->ifr_map.port      = dev->if_port;
4839		return 0;
4840
4841	case SIOCGIFINDEX:
4842		ifr->ifr_ifindex = dev->ifindex;
4843		return 0;
4844
4845	case SIOCGIFTXQLEN:
4846		ifr->ifr_qlen = dev->tx_queue_len;
4847		return 0;
4848
4849	default:
4850		/* dev_ioctl() should ensure this case
4851		 * is never reached
4852		 */
4853		WARN_ON(1);
4854		err = -ENOTTY;
4855		break;
4856
4857	}
4858	return err;
4859}
 
4860
4861/*
4862 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
 
 
 
 
4863 */
4864static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 
4865{
4866	int err;
4867	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4868	const struct net_device_ops *ops;
4869
4870	if (!dev)
4871		return -ENODEV;
4872
4873	ops = dev->netdev_ops;
4874
4875	switch (cmd) {
4876	case SIOCSIFFLAGS:	/* Set interface flags */
4877		return dev_change_flags(dev, ifr->ifr_flags);
4878
4879	case SIOCSIFMETRIC:	/* Set the metric on the interface
4880				   (currently unused) */
4881		return -EOPNOTSUPP;
 
 
 
4882
4883	case SIOCSIFMTU:	/* Set the MTU of a device */
4884		return dev_set_mtu(dev, ifr->ifr_mtu);
4885
4886	case SIOCSIFHWADDR:
4887		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4888
4889	case SIOCSIFHWBROADCAST:
4890		if (ifr->ifr_hwaddr.sa_family != dev->type)
4891			return -EINVAL;
4892		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4893		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4894		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4895		return 0;
4896
4897	case SIOCSIFMAP:
4898		if (ops->ndo_set_config) {
4899			if (!netif_device_present(dev))
4900				return -ENODEV;
4901			return ops->ndo_set_config(dev, &ifr->ifr_map);
4902		}
4903		return -EOPNOTSUPP;
4904
4905	case SIOCADDMULTI:
4906		if (!ops->ndo_set_rx_mode ||
4907		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4908			return -EINVAL;
4909		if (!netif_device_present(dev))
4910			return -ENODEV;
4911		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4912
4913	case SIOCDELMULTI:
4914		if (!ops->ndo_set_rx_mode ||
4915		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4916			return -EINVAL;
4917		if (!netif_device_present(dev))
4918			return -ENODEV;
4919		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4920
4921	case SIOCSIFTXQLEN:
4922		if (ifr->ifr_qlen < 0)
4923			return -EINVAL;
4924		dev->tx_queue_len = ifr->ifr_qlen;
4925		return 0;
4926
4927	case SIOCSIFNAME:
4928		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4929		return dev_change_name(dev, ifr->ifr_newname);
4930
4931	case SIOCSHWTSTAMP:
4932		err = net_hwtstamp_validate(ifr);
4933		if (err)
4934			return err;
4935		/* fall through */
4936
4937	/*
4938	 *	Unknown or private ioctl
4939	 */
4940	default:
4941		if ((cmd >= SIOCDEVPRIVATE &&
4942		    cmd <= SIOCDEVPRIVATE + 15) ||
4943		    cmd == SIOCBONDENSLAVE ||
4944		    cmd == SIOCBONDRELEASE ||
4945		    cmd == SIOCBONDSETHWADDR ||
4946		    cmd == SIOCBONDSLAVEINFOQUERY ||
4947		    cmd == SIOCBONDINFOQUERY ||
4948		    cmd == SIOCBONDCHANGEACTIVE ||
4949		    cmd == SIOCGMIIPHY ||
4950		    cmd == SIOCGMIIREG ||
4951		    cmd == SIOCSMIIREG ||
4952		    cmd == SIOCBRADDIF ||
4953		    cmd == SIOCBRDELIF ||
4954		    cmd == SIOCSHWTSTAMP ||
4955		    cmd == SIOCWANDEV) {
4956			err = -EOPNOTSUPP;
4957			if (ops->ndo_do_ioctl) {
4958				if (netif_device_present(dev))
4959					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4960				else
4961					err = -ENODEV;
4962			}
4963		} else
4964			err = -EINVAL;
4965
4966	}
4967	return err;
4968}
4969
4970/*
4971 *	This function handles all "interface"-type I/O control requests. The actual
4972 *	'doing' part of this is dev_ifsioc above.
4973 */
4974
4975/**
4976 *	dev_ioctl	-	network device ioctl
4977 *	@net: the applicable net namespace
4978 *	@cmd: command to issue
4979 *	@arg: pointer to a struct ifreq in user space
4980 *
4981 *	Issue ioctl functions to devices. This is normally called by the
4982 *	user space syscall interfaces but can sometimes be useful for
4983 *	other purposes. The return value is the return from the syscall if
4984 *	positive or a negative errno code on error.
4985 */
4986
4987int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4988{
4989	struct ifreq ifr;
4990	int ret;
4991	char *colon;
4992
4993	/* One special case: SIOCGIFCONF takes ifconf argument
4994	   and requires shared lock, because it sleeps writing
4995	   to user space.
4996	 */
4997
4998	if (cmd == SIOCGIFCONF) {
4999		rtnl_lock();
5000		ret = dev_ifconf(net, (char __user *) arg);
5001		rtnl_unlock();
5002		return ret;
5003	}
5004	if (cmd == SIOCGIFNAME)
5005		return dev_ifname(net, (struct ifreq __user *)arg);
5006
5007	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5008		return -EFAULT;
5009
5010	ifr.ifr_name[IFNAMSIZ-1] = 0;
5011
5012	colon = strchr(ifr.ifr_name, ':');
5013	if (colon)
5014		*colon = 0;
5015
5016	/*
5017	 *	See which interface the caller is talking about.
5018	 */
5019
5020	switch (cmd) {
5021	/*
5022	 *	These ioctl calls:
5023	 *	- can be done by all.
5024	 *	- atomic and do not require locking.
5025	 *	- return a value
5026	 */
5027	case SIOCGIFFLAGS:
5028	case SIOCGIFMETRIC:
5029	case SIOCGIFMTU:
5030	case SIOCGIFHWADDR:
5031	case SIOCGIFSLAVE:
5032	case SIOCGIFMAP:
5033	case SIOCGIFINDEX:
5034	case SIOCGIFTXQLEN:
5035		dev_load(net, ifr.ifr_name);
5036		rcu_read_lock();
5037		ret = dev_ifsioc_locked(net, &ifr, cmd);
5038		rcu_read_unlock();
5039		if (!ret) {
5040			if (colon)
5041				*colon = ':';
5042			if (copy_to_user(arg, &ifr,
5043					 sizeof(struct ifreq)))
5044				ret = -EFAULT;
5045		}
5046		return ret;
5047
5048	case SIOCETHTOOL:
5049		dev_load(net, ifr.ifr_name);
5050		rtnl_lock();
5051		ret = dev_ethtool(net, &ifr);
5052		rtnl_unlock();
5053		if (!ret) {
5054			if (colon)
5055				*colon = ':';
5056			if (copy_to_user(arg, &ifr,
5057					 sizeof(struct ifreq)))
5058				ret = -EFAULT;
5059		}
5060		return ret;
5061
5062	/*
5063	 *	These ioctl calls:
5064	 *	- require superuser power.
5065	 *	- require strict serialization.
5066	 *	- return a value
5067	 */
5068	case SIOCGMIIPHY:
5069	case SIOCGMIIREG:
5070	case SIOCSIFNAME:
5071		if (!capable(CAP_NET_ADMIN))
5072			return -EPERM;
5073		dev_load(net, ifr.ifr_name);
5074		rtnl_lock();
5075		ret = dev_ifsioc(net, &ifr, cmd);
5076		rtnl_unlock();
5077		if (!ret) {
5078			if (colon)
5079				*colon = ':';
5080			if (copy_to_user(arg, &ifr,
5081					 sizeof(struct ifreq)))
5082				ret = -EFAULT;
5083		}
5084		return ret;
5085
5086	/*
5087	 *	These ioctl calls:
5088	 *	- require superuser power.
5089	 *	- require strict serialization.
5090	 *	- do not return a value
5091	 */
5092	case SIOCSIFFLAGS:
5093	case SIOCSIFMETRIC:
5094	case SIOCSIFMTU:
5095	case SIOCSIFMAP:
5096	case SIOCSIFHWADDR:
5097	case SIOCSIFSLAVE:
5098	case SIOCADDMULTI:
5099	case SIOCDELMULTI:
5100	case SIOCSIFHWBROADCAST:
5101	case SIOCSIFTXQLEN:
5102	case SIOCSMIIREG:
5103	case SIOCBONDENSLAVE:
5104	case SIOCBONDRELEASE:
5105	case SIOCBONDSETHWADDR:
5106	case SIOCBONDCHANGEACTIVE:
5107	case SIOCBRADDIF:
5108	case SIOCBRDELIF:
5109	case SIOCSHWTSTAMP:
5110		if (!capable(CAP_NET_ADMIN))
5111			return -EPERM;
5112		/* fall through */
5113	case SIOCBONDSLAVEINFOQUERY:
5114	case SIOCBONDINFOQUERY:
5115		dev_load(net, ifr.ifr_name);
5116		rtnl_lock();
5117		ret = dev_ifsioc(net, &ifr, cmd);
5118		rtnl_unlock();
5119		return ret;
5120
5121	case SIOCGIFMEM:
5122		/* Get the per device memory space. We can add this but
5123		 * currently do not support it */
5124	case SIOCSIFMEM:
5125		/* Set the per device memory buffer space.
5126		 * Not applicable in our case */
5127	case SIOCSIFLINK:
5128		return -ENOTTY;
5129
5130	/*
5131	 *	Unknown or private ioctl.
5132	 */
5133	default:
5134		if (cmd == SIOCWANDEV ||
5135		    (cmd >= SIOCDEVPRIVATE &&
5136		     cmd <= SIOCDEVPRIVATE + 15)) {
5137			dev_load(net, ifr.ifr_name);
5138			rtnl_lock();
5139			ret = dev_ifsioc(net, &ifr, cmd);
5140			rtnl_unlock();
5141			if (!ret && copy_to_user(arg, &ifr,
5142						 sizeof(struct ifreq)))
5143				ret = -EFAULT;
5144			return ret;
5145		}
5146		/* Take care of Wireless Extensions */
5147		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5148			return wext_handle_ioctl(net, &ifr, cmd, arg);
5149		return -ENOTTY;
5150	}
5151}
5152
5153
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Advances a function-static counter, restarting at 1 whenever it
 *	would become zero or negative, until it reaches a value for which
 *	__dev_get_by_index() finds no device in @net, and returns that
 *	value.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5172
5173/* Delayed registration/unregisteration */
5174static LIST_HEAD(net_todo_list);
 
5175
5176static void net_set_todo(struct net_device *dev)
5177{
5178	list_add_tail(&dev->todo_list, &net_todo_list);
 
5179}
5180
5181static void rollback_registered_many(struct list_head *head)
5182{
5183	struct net_device *dev, *tmp;
 
5184
5185	BUG_ON(dev_boot_phase);
5186	ASSERT_RTNL();
5187
5188	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5189		/* Some devices call without registering
5190		 * for initialization unwind. Remove those
5191		 * devices and proceed with the remaining.
5192		 */
5193		if (dev->reg_state == NETREG_UNINITIALIZED) {
5194			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5195				 dev->name, dev);
5196
5197			WARN_ON(1);
5198			list_del(&dev->unreg_list);
5199			continue;
5200		}
5201		dev->dismantle = true;
5202		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5203	}
5204
5205	/* If device is running, close it first. */
5206	dev_close_many(head);
 
 
5207
5208	list_for_each_entry(dev, head, unreg_list) {
5209		/* And unlink it from device chain. */
5210		unlist_netdevice(dev);
5211
5212		dev->reg_state = NETREG_UNREGISTERING;
 
5213	}
5214
5215	synchronize_net();
5216
5217	list_for_each_entry(dev, head, unreg_list) {
 
 
5218		/* Shutdown queueing discipline. */
5219		dev_shutdown(dev);
5220
5221
5222		/* Notify protocols, that we are about to destroy
5223		   this device. They should clean all the things.
5224		*/
5225		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5226
5227		if (!dev->rtnl_link_ops ||
5228		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5229			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
5230
5231		/*
5232		 *	Flush the unicast and multicast chains
5233		 */
5234		dev_uc_flush(dev);
5235		dev_mc_flush(dev);
5236
5237		if (dev->netdev_ops->ndo_uninit)
5238			dev->netdev_ops->ndo_uninit(dev);
5239
5240		/* Notifier chain MUST detach us from master device. */
5241		WARN_ON(dev->master);
 
 
 
5242
5243		/* Remove entries from kobject tree */
5244		netdev_unregister_kobject(dev);
 
 
 
 
5245	}
5246
5247	/* Process any work delayed until the end of the batch */
5248	dev = list_first_entry(head, struct net_device, unreg_list);
5249	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5250
5251	synchronize_net();
5252
5253	list_for_each_entry(dev, head, unreg_list)
5254		dev_put(dev);
5255}
5256
5257static void rollback_registered(struct net_device *dev)
5258{
5259	LIST_HEAD(single);
5260
5261	list_add(&dev->unreg_list, &single);
5262	rollback_registered_many(&single);
5263	list_del(&single);
5264}
5265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5266static netdev_features_t netdev_fix_features(struct net_device *dev,
5267	netdev_features_t features)
5268{
5269	/* Fix illegal checksum combinations */
5270	if ((features & NETIF_F_HW_CSUM) &&
5271	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5272		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5273		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5274	}
5275
5276	/* Fix illegal SG+CSUM combinations. */
5277	if ((features & NETIF_F_SG) &&
5278	    !(features & NETIF_F_ALL_CSUM)) {
5279		netdev_dbg(dev,
5280			"Dropping NETIF_F_SG since no checksum feature.\n");
5281		features &= ~NETIF_F_SG;
5282	}
5283
5284	/* TSO requires that SG is present as well. */
5285	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5286		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5287		features &= ~NETIF_F_ALL_TSO;
5288	}
5289
 
 
 
 
 
 
 
 
 
 
 
 
 
5290	/* TSO ECN requires that TSO is present as well. */
5291	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5292		features &= ~NETIF_F_TSO_ECN;
5293
5294	/* Software GSO depends on SG. */
5295	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5296		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5297		features &= ~NETIF_F_GSO;
5298	}
5299
5300	/* UFO needs SG and checksumming */
5301	if (features & NETIF_F_UFO) {
5302		/* maybe split UFO into V4 and V6? */
5303		if (!((features & NETIF_F_GEN_CSUM) ||
5304		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5305			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5306			netdev_dbg(dev,
5307				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5308			features &= ~NETIF_F_UFO;
5309		}
5310
5311		if (!(features & NETIF_F_SG)) {
5312			netdev_dbg(dev,
5313				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5314			features &= ~NETIF_F_UFO;
5315		}
5316	}
5317
 
 
 
 
 
 
 
5318	return features;
5319}
5320
5321int __netdev_update_features(struct net_device *dev)
5322{
 
5323	netdev_features_t features;
5324	int err = 0;
 
5325
5326	ASSERT_RTNL();
5327
5328	features = netdev_get_wanted_features(dev);
5329
5330	if (dev->netdev_ops->ndo_fix_features)
5331		features = dev->netdev_ops->ndo_fix_features(dev, features);
5332
5333	/* driver might be less strict about feature dependencies */
5334	features = netdev_fix_features(dev, features);
5335
 
 
 
 
5336	if (dev->features == features)
5337		return 0;
5338
5339	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5340		&dev->features, &features);
5341
5342	if (dev->netdev_ops->ndo_set_features)
5343		err = dev->netdev_ops->ndo_set_features(dev, features);
 
 
5344
5345	if (unlikely(err < 0)) {
5346		netdev_err(dev,
5347			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5348			err, &features, &dev->features);
 
 
 
5349		return -1;
5350	}
5351
 
 
 
 
 
 
 
5352	if (!err)
5353		dev->features = features;
5354
5355	return 1;
5356}
5357
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Runs __netdev_update_features() and calls netdev_features_change()
 *	whenever that returns a non-zero value.  Should be called after
 *	driver or hardware dependent conditions might have changed that
 *	influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5372
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Like netdev_update_features(), except that
 *	netdev_features_change() is called unconditionally, so the
 *	notification goes out even if dev->features did not change.  Use
 *	this when dev->vlan_features might also have changed, to allow the
 *	changes to be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* result intentionally ignored: we notify in every case */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5389
5390/**
5391 *	netif_stacked_transfer_operstate -	transfer operstate
5392 *	@rootdev: the root or lower level device to transfer state from
5393 *	@dev: the device to transfer operstate to
5394 *
5395 *	Transfer operational state from root to device. This is normally
5396 *	called when a stacking relationship exists between the root
5397 *	device and the device(a leaf device).
5398 */
5399void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5400					struct net_device *dev)
5401{
5402	if (rootdev->operstate == IF_OPER_DORMANT)
5403		netif_dormant_on(dev);
5404	else
5405		netif_dormant_off(dev);
5406
5407	if (netif_carrier_ok(rootdev)) {
5408		if (!netif_carrier_ok(dev))
5409			netif_carrier_on(dev);
5410	} else {
5411		if (netif_carrier_ok(dev))
5412			netif_carrier_off(dev);
5413	}
5414}
5415EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5416
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues receive queues, store
 * it in dev->_rx and point every queue back at @dev.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.  A queue
 * count below one is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nr < 1);

	queues = kcalloc(nr, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues\n", nr);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (idx = 0; idx < nr; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
5437
5438static void netdev_init_one_queue(struct net_device *dev,
5439				  struct netdev_queue *queue, void *_unused)
5440{
5441	/* Initialize queue lock */
5442	spin_lock_init(&queue->_xmit_lock);
5443	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5444	queue->xmit_lock_owner = -1;
5445	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5446	queue->dev = dev;
5447#ifdef CONFIG_BQL
5448	dql_init(&queue->dql, HZ);
5449#endif
5450}
5451
 
 
 
 
 
5452static int netif_alloc_netdev_queues(struct net_device *dev)
5453{
5454	unsigned int count = dev->num_tx_queues;
5455	struct netdev_queue *tx;
 
5456
5457	BUG_ON(count < 1);
 
5458
5459	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5460	if (!tx) {
5461		pr_err("netdev: Unable to allocate %u tx queues\n", count);
5462		return -ENOMEM;
 
5463	}
5464	dev->_tx = tx;
5465
5466	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5467	spin_lock_init(&dev->tx_global_lock);
5468
5469	return 0;
5470}
5471
 
 
 
 
 
 
 
 
 
 
 
5472/**
5473 *	register_netdevice	- register a network device
5474 *	@dev: device to register
5475 *
5476 *	Take a completed network device structure and add it to the kernel
5477 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5478 *	chain. 0 is returned on success. A negative errno code is returned
5479 *	on a failure to set up the device, or if the name is a duplicate.
5480 *
5481 *	Callers must hold the rtnl semaphore. You may want
5482 *	register_netdev() instead of this.
5483 *
5484 *	BUGS:
5485 *	The locking appears insufficient to guarantee two parallel registers
5486 *	will not get the same name.
5487 */
5488
5489int register_netdevice(struct net_device *dev)
5490{
5491	int ret;
5492	struct net *net = dev_net(dev);
5493
5494	BUG_ON(dev_boot_phase);
5495	ASSERT_RTNL();
5496
5497	might_sleep();
5498
5499	/* When net_device's are persistent, this will be fatal. */
5500	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5501	BUG_ON(!net);
5502
5503	spin_lock_init(&dev->addr_list_lock);
5504	netdev_set_addr_lockdep_class(dev);
5505
5506	dev->iflink = -1;
5507
5508	ret = dev_get_valid_name(dev, dev->name);
5509	if (ret < 0)
5510		goto out;
5511
5512	/* Init, if this function is available */
5513	if (dev->netdev_ops->ndo_init) {
5514		ret = dev->netdev_ops->ndo_init(dev);
5515		if (ret) {
5516			if (ret > 0)
5517				ret = -EIO;
5518			goto out;
5519		}
5520	}
5521
5522	dev->ifindex = dev_new_index(net);
5523	if (dev->iflink == -1)
5524		dev->iflink = dev->ifindex;
 
 
 
 
 
 
 
 
 
 
 
5525
5526	/* Transfer changeable features to wanted_features and enable
5527	 * software offloads (GSO and GRO).
5528	 */
5529	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5530	dev->features |= NETIF_F_SOFT_FEATURES;
5531	dev->wanted_features = dev->features & dev->hw_features;
5532
5533	/* Turn on no cache copy if HW is doing checksum */
5534	if (!(dev->flags & IFF_LOOPBACK)) {
5535		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5536		if (dev->features & NETIF_F_ALL_CSUM) {
5537			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5538			dev->features |= NETIF_F_NOCACHE_COPY;
5539		}
5540	}
5541
5542	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5543	 */
5544	dev->vlan_features |= NETIF_F_HIGHDMA;
5545
 
 
 
 
 
 
 
 
5546	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5547	ret = notifier_to_errno(ret);
5548	if (ret)
5549		goto err_uninit;
5550
5551	ret = netdev_register_kobject(dev);
5552	if (ret)
5553		goto err_uninit;
5554	dev->reg_state = NETREG_REGISTERED;
5555
5556	__netdev_update_features(dev);
5557
5558	/*
5559	 *	Default initial state at registry is that the
5560	 *	device is present.
5561	 */
5562
5563	set_bit(__LINK_STATE_PRESENT, &dev->state);
5564
 
 
5565	dev_init_scheduler(dev);
5566	dev_hold(dev);
5567	list_netdevice(dev);
5568	add_device_randomness(dev->dev_addr, dev->addr_len);
5569
 
 
 
 
 
 
 
5570	/* Notify protocols, that a new device appeared. */
5571	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5572	ret = notifier_to_errno(ret);
5573	if (ret) {
5574		rollback_registered(dev);
5575		dev->reg_state = NETREG_UNREGISTERED;
5576	}
5577	/*
5578	 *	Prevent userspace races by waiting until the network
5579	 *	device is fully setup before sending notifications.
5580	 */
5581	if (!dev->rtnl_link_ops ||
5582	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5583		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5584
5585out:
5586	return ret;
5587
5588err_uninit:
5589	if (dev->netdev_ops->ndo_uninit)
5590		dev->netdev_ops->ndo_uninit(dev);
5591	goto out;
5592}
5593EXPORT_SYMBOL(register_netdevice);
5594
5595/**
5596 *	init_dummy_netdev	- init a dummy network device for NAPI
5597 *	@dev: device to init
5598 *
5599 *	This takes a network device structure and initialize the minimum
5600 *	amount of fields so it can be used to schedule NAPI polls without
5601 *	registering a full blown interface. This is to be used by drivers
5602 *	that need to tie several hardware interfaces to a single NAPI
5603 *	poll scheduler due to HW limitations.
5604 */
5605int init_dummy_netdev(struct net_device *dev)
5606{
5607	/* Clear everything. Note we don't initialize spinlocks
5608	 * are they aren't supposed to be taken by any of the
5609	 * NAPI code and this dummy netdev is supposed to be
5610	 * only ever used for NAPI polls
5611	 */
5612	memset(dev, 0, sizeof(struct net_device));
5613
5614	/* make sure we BUG if trying to hit standard
5615	 * register/unregister code path
5616	 */
5617	dev->reg_state = NETREG_DUMMY;
5618
5619	/* NAPI wants this */
5620	INIT_LIST_HEAD(&dev->napi_list);
5621
5622	/* a dummy interface is started by default */
5623	set_bit(__LINK_STATE_PRESENT, &dev->state);
5624	set_bit(__LINK_STATE_START, &dev->state);
5625
5626	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5627	 * because users of this 'device' dont need to change
5628	 * its refcount.
5629	 */
5630
5631	return 0;
5632}
5633EXPORT_SYMBOL_GPL(init_dummy_netdev);
5634
5635
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock(),
 *	for callers that do not already hold the rtnl semaphore.  A format
 *	string passed as the name to alloc_netdev is expanded as part of
 *	the registration.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
5659
5660int netdev_refcnt_read(const struct net_device *dev)
5661{
5662	int i, refcnt = 0;
5663
5664	for_each_possible_cpu(i)
5665		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5666	return refcnt;
5667}
5668EXPORT_SYMBOL(netdev_refcnt_read);
5669
5670/*
5671 * netdev_wait_allrefs - wait until all references are gone.
 
5672 *
5673 * This is called when unregistering network devices.
5674 *
5675 * Any protocol or device that holds a reference should register
5676 * for netdevice notification, and cleanup and put back the
5677 * reference if they receive an UNREGISTER event.
5678 * We can get stuck here if buggy protocols don't correctly
5679 * call dev_put.
5680 */
5681static void netdev_wait_allrefs(struct net_device *dev)
5682{
5683	unsigned long rebroadcast_time, warning_time;
5684	int refcnt;
5685
5686	linkwatch_forget_dev(dev);
5687
5688	rebroadcast_time = warning_time = jiffies;
5689	refcnt = netdev_refcnt_read(dev);
5690
5691	while (refcnt != 0) {
5692		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5693			rtnl_lock();
5694
5695			/* Rebroadcast unregister notification */
5696			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5697			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5698			 * should have already handle it the first time */
5699
 
 
 
 
 
5700			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5701				     &dev->state)) {
5702				/* We must not have linkwatch events
5703				 * pending on unregister. If this
5704				 * happens, we simply run the queue
5705				 * unscheduled, resulting in a noop
5706				 * for this device.
5707				 */
5708				linkwatch_run_queue();
5709			}
5710
5711			__rtnl_unlock();
5712
5713			rebroadcast_time = jiffies;
5714		}
5715
5716		msleep(250);
5717
5718		refcnt = netdev_refcnt_read(dev);
5719
5720		if (time_after(jiffies, warning_time + 10 * HZ)) {
5721			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5722				 dev->name, refcnt);
5723			warning_time = jiffies;
5724		}
5725	}
5726}
5727
5728/* The sequence is:
5729 *
5730 *	rtnl_lock();
5731 *	...
5732 *	register_netdevice(x1);
5733 *	register_netdevice(x2);
5734 *	...
5735 *	unregister_netdevice(y1);
5736 *	unregister_netdevice(y2);
5737 *      ...
5738 *	rtnl_unlock();
5739 *	free_netdev(y1);
5740 *	free_netdev(y2);
5741 *
5742 * We are invoked by rtnl_unlock().
5743 * This allows us to deal with problems:
5744 * 1) We can delete sysfs objects which invoke hotplug
5745 *    without deadlocking with linkwatch via keventd.
5746 * 2) Since we run with the RTNL semaphore not held, we can sleep
5747 *    safely in order to wait for the netdev refcnt to drop to zero.
5748 *
5749 * We must not return until all unregister events added during
5750 * the interval the lock was held have been completed.
5751 */
5752void netdev_run_todo(void)
5753{
5754	struct list_head list;
5755
5756	/* Snapshot list, allow later requests */
5757	list_replace_init(&net_todo_list, &list);
5758
5759	__rtnl_unlock();
5760
5761	/* Wait for rcu callbacks to finish before attempting to drain
5762	 * the device list.  This usually avoids a 250ms wait.
5763	 */
5764	if (!list_empty(&list))
5765		rcu_barrier();
5766
5767	while (!list_empty(&list)) {
5768		struct net_device *dev
5769			= list_first_entry(&list, struct net_device, todo_list);
5770		list_del(&dev->todo_list);
5771
 
 
 
 
5772		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5773			pr_err("network todo '%s' but state %d\n",
5774			       dev->name, dev->reg_state);
5775			dump_stack();
5776			continue;
5777		}
5778
5779		dev->reg_state = NETREG_UNREGISTERED;
5780
5781		on_each_cpu(flush_backlog, dev, 1);
5782
5783		netdev_wait_allrefs(dev);
5784
5785		/* paranoia */
5786		BUG_ON(netdev_refcnt_read(dev));
 
 
5787		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5788		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5789		WARN_ON(dev->dn_ptr);
5790
5791		if (dev->destructor)
5792			dev->destructor(dev);
5793
 
 
 
 
 
 
5794		/* Free network device */
5795		kobject_put(&dev->dev.kobj);
5796	}
5797}
5798
5799/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5800 * fields in the same order, with only the type differing.
 
 
5801 */
5802void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5803			     const struct net_device_stats *netdev_stats)
5804{
5805#if BITS_PER_LONG == 64
5806	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5807	memcpy(stats64, netdev_stats, sizeof(*stats64));
 
 
 
5808#else
5809	size_t i, n = sizeof(*stats64) / sizeof(u64);
5810	const unsigned long *src = (const unsigned long *)netdev_stats;
5811	u64 *dst = (u64 *)stats64;
5812
5813	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5814		     sizeof(*stats64) / sizeof(u64));
5815	for (i = 0; i < n; i++)
5816		dst[i] = src[i];
 
 
 
5817#endif
5818}
5819EXPORT_SYMBOL(netdev_stats_to_stats64);
5820
5821/**
5822 *	dev_get_stats	- get network device statistics
5823 *	@dev: device to get statistics from
5824 *	@storage: place to store stats
5825 *
5826 *	Get network statistics from device. Return @storage.
5827 *	The device driver may provide its own method by setting
5828 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5829 *	otherwise the internal statistics structure is used.
5830 */
5831struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5832					struct rtnl_link_stats64 *storage)
5833{
5834	const struct net_device_ops *ops = dev->netdev_ops;
5835
5836	if (ops->ndo_get_stats64) {
5837		memset(storage, 0, sizeof(*storage));
5838		ops->ndo_get_stats64(dev, storage);
5839	} else if (ops->ndo_get_stats) {
5840		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5841	} else {
5842		netdev_stats_to_stats64(storage, &dev->stats);
5843	}
5844	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 
 
5845	return storage;
5846}
5847EXPORT_SYMBOL(dev_get_stats);
5848
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with the noop qdisc and published
 * through dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5867/**
5868 *	alloc_netdev_mqs - allocate network device
5869 *	@sizeof_priv:	size of private data to allocate space for
5870 *	@name:		device name format string
5871 *	@setup:		callback to initialize device
5872 *	@txqs:		the number of TX subqueues to allocate
5873 *	@rxqs:		the number of RX subqueues to allocate
 
5874 *
5875 *	Allocates a struct net_device with private data area for driver use
5876 *	and performs basic initialization.  Also allocates subquue structs
5877 *	for each queue on the device.
5878 */
5879struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
5880		void (*setup)(struct net_device *),
5881		unsigned int txqs, unsigned int rxqs)
5882{
5883	struct net_device *dev;
5884	size_t alloc_size;
5885	struct net_device *p;
5886
5887	BUG_ON(strlen(name) >= sizeof(dev->name));
5888
5889	if (txqs < 1) {
5890		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5891		return NULL;
5892	}
5893
5894#ifdef CONFIG_RPS
5895	if (rxqs < 1) {
5896		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5897		return NULL;
5898	}
5899#endif
5900
5901	alloc_size = sizeof(struct net_device);
5902	if (sizeof_priv) {
5903		/* ensure 32-byte alignment of private area */
5904		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5905		alloc_size += sizeof_priv;
5906	}
5907	/* ensure 32-byte alignment of whole construct */
5908	alloc_size += NETDEV_ALIGN - 1;
5909
5910	p = kzalloc(alloc_size, GFP_KERNEL);
5911	if (!p) {
5912		pr_err("alloc_netdev: Unable to allocate device\n");
 
5913		return NULL;
5914	}
5915
5916	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5917	dev->padded = (char *)dev - (char *)p;
5918
5919	dev->pcpu_refcnt = alloc_percpu(int);
5920	if (!dev->pcpu_refcnt)
5921		goto free_p;
5922
5923	if (dev_addr_init(dev))
5924		goto free_pcpu;
5925
5926	dev_mc_init(dev);
5927	dev_uc_init(dev);
5928
5929	dev_net_set(dev, &init_net);
5930
5931	dev->gso_max_size = GSO_MAX_SIZE;
5932	dev->gso_max_segs = GSO_MAX_SEGS;
 
5933
5934	INIT_LIST_HEAD(&dev->napi_list);
5935	INIT_LIST_HEAD(&dev->unreg_list);
 
5936	INIT_LIST_HEAD(&dev->link_watch_list);
5937	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 
 
 
 
 
 
5938	setup(dev);
5939
 
 
 
 
 
5940	dev->num_tx_queues = txqs;
5941	dev->real_num_tx_queues = txqs;
5942	if (netif_alloc_netdev_queues(dev))
5943		goto free_all;
5944
5945#ifdef CONFIG_RPS
5946	dev->num_rx_queues = rxqs;
5947	dev->real_num_rx_queues = rxqs;
5948	if (netif_alloc_rx_queues(dev))
5949		goto free_all;
5950#endif
5951
5952	strcpy(dev->name, name);
 
5953	dev->group = INIT_NETDEV_GROUP;
 
 
 
 
 
5954	return dev;
5955
5956free_all:
5957	free_netdev(dev);
5958	return NULL;
5959
5960free_pcpu:
5961	free_percpu(dev->pcpu_refcnt);
5962	kfree(dev->_tx);
5963#ifdef CONFIG_RPS
5964	kfree(dev->_rx);
5965#endif
5966
5967free_p:
5968	kfree(p);
5969	return NULL;
5970}
5971EXPORT_SYMBOL(alloc_netdev_mqs);
5972
5973/**
5974 *	free_netdev - free network device
5975 *	@dev: device
5976 *
5977 *	This function does the last stage of destroying an allocated device
5978 * 	interface. The reference to the device object is released.
5979 *	If this is the last reference then it will be freed.
 
5980 */
5981void free_netdev(struct net_device *dev)
5982{
5983	struct napi_struct *p, *n;
5984
5985	release_net(dev_net(dev));
5986
5987	kfree(dev->_tx);
5988#ifdef CONFIG_RPS
5989	kfree(dev->_rx);
5990#endif
5991
5992	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5993
5994	/* Flush device addresses */
5995	dev_addr_flush(dev);
5996
5997	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5998		netif_napi_del(p);
5999
6000	free_percpu(dev->pcpu_refcnt);
6001	dev->pcpu_refcnt = NULL;
6002
6003	/*  Compatibility with error handling in drivers */
6004	if (dev->reg_state == NETREG_UNINITIALIZED) {
6005		kfree((char *)dev - dev->padded);
6006		return;
6007	}
6008
6009	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6010	dev->reg_state = NETREG_RELEASED;
6011
6012	/* will free via device release */
6013	put_device(&dev->dev);
6014}
6015EXPORT_SYMBOL(free_netdev);
6016
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep.  The expedited RCU grace period is used when the rtnl
 *	lock is held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6032
6033/**
6034 *	unregister_netdevice_queue - remove device from the kernel
6035 *	@dev: device
6036 *	@head: list
6037 *
6038 *	This function shuts down a device interface and removes it
6039 *	from the kernel tables.
6040 *	If head not NULL, device is queued to be unregistered later.
6041 *
6042 *	Callers must hold the rtnl semaphore.  You may want
6043 *	unregister_netdev() instead of this.
6044 */
6045
6046void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6047{
6048	ASSERT_RTNL();
6049
6050	if (head) {
6051		list_move_tail(&dev->unreg_list, head);
6052	} else {
6053		rollback_registered(dev);
6054		/* Finish processing unregister after unlock */
6055		net_set_todo(dev);
6056	}
6057}
6058EXPORT_SYMBOL(unregister_netdevice_queue);
6059
6060/**
6061 *	unregister_netdevice_many - unregister many devices
6062 *	@head: list of devices
 
 
 
6063 */
6064void unregister_netdevice_many(struct list_head *head)
6065{
6066	struct net_device *dev;
6067
6068	if (!list_empty(head)) {
6069		rollback_registered_many(head);
6070		list_for_each_entry(dev, head, unreg_list)
6071			net_set_todo(dev);
 
6072	}
6073}
6074EXPORT_SYMBOL(unregister_netdevice_many);
6075
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  Prefer
 *	this over calling unregister_netdevice() directly.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6094
6095/**
6096 *	dev_change_net_namespace - move device to different nethost namespace
6097 *	@dev: device
6098 *	@net: network namespace
6099 *	@pat: If not NULL name pattern to try if the current device name
6100 *	      is already taken in the destination network namespace.
6101 *
6102 *	This function shuts down a device interface and moves it
6103 *	to a new network namespace. On success 0 is returned, on
6104 *	a failure a netagive errno code is returned.
6105 *
6106 *	Callers must hold the rtnl semaphore.
6107 */
6108
6109int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6110{
6111	int err;
6112
6113	ASSERT_RTNL();
6114
6115	/* Don't allow namespace local devices to be moved. */
6116	err = -EINVAL;
6117	if (dev->features & NETIF_F_NETNS_LOCAL)
6118		goto out;
6119
6120	/* Ensure the device has been registrered */
6121	err = -EINVAL;
6122	if (dev->reg_state != NETREG_REGISTERED)
6123		goto out;
6124
6125	/* Get out if there is nothing todo */
6126	err = 0;
6127	if (net_eq(dev_net(dev), net))
6128		goto out;
6129
6130	/* Pick the destination device name, and ensure
6131	 * we can use it in the destination network namespace.
6132	 */
6133	err = -EEXIST;
6134	if (__dev_get_by_name(net, dev->name)) {
6135		/* We get here if we can't use the current device name */
6136		if (!pat)
6137			goto out;
6138		if (dev_get_valid_name(dev, pat) < 0)
6139			goto out;
6140	}
6141
6142	/*
6143	 * And now a mini version of register_netdevice unregister_netdevice.
6144	 */
6145
6146	/* If device is running close it first. */
6147	dev_close(dev);
6148
6149	/* And unlink it from device chain */
6150	err = -ENODEV;
6151	unlist_netdevice(dev);
6152
6153	synchronize_net();
6154
6155	/* Shutdown queueing discipline. */
6156	dev_shutdown(dev);
6157
6158	/* Notify protocols, that we are about to destroy
6159	   this device. They should clean all the things.
6160
6161	   Note that dev->reg_state stays at NETREG_REGISTERED.
6162	   This is wanted because this way 8021q and macvlan know
6163	   the device is just moving and can keep their slaves up.
6164	*/
6165	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6166	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6167	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
6168
6169	/*
6170	 *	Flush the unicast and multicast chains
6171	 */
6172	dev_uc_flush(dev);
6173	dev_mc_flush(dev);
6174
 
 
 
 
6175	/* Actually switch the network namespace */
6176	dev_net_set(dev, net);
6177
6178	/* If there is an ifindex conflict assign a new one */
6179	if (__dev_get_by_index(net, dev->ifindex)) {
6180		int iflink = (dev->iflink == dev->ifindex);
6181		dev->ifindex = dev_new_index(net);
6182		if (iflink)
6183			dev->iflink = dev->ifindex;
6184	}
 
6185
6186	/* Fixup kobjects */
6187	err = device_rename(&dev->dev, dev->name);
6188	WARN_ON(err);
6189
6190	/* Add the device back in the hashes */
6191	list_netdevice(dev);
6192
6193	/* Notify protocols, that a new device appeared. */
6194	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6195
6196	/*
6197	 *	Prevent userspace races by waiting until the network
6198	 *	device is fully setup before sending notifications.
6199	 */
6200	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6201
6202	synchronize_net();
6203	err = 0;
6204out:
6205	return err;
6206}
6207EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6208
6209static int dev_cpu_callback(struct notifier_block *nfb,
6210			    unsigned long action,
6211			    void *ocpu)
6212{
6213	struct sk_buff **list_skb;
6214	struct sk_buff *skb;
6215	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6216	struct softnet_data *sd, *oldsd;
6217
6218	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6219		return NOTIFY_OK;
6220
6221	local_irq_disable();
6222	cpu = smp_processor_id();
6223	sd = &per_cpu(softnet_data, cpu);
6224	oldsd = &per_cpu(softnet_data, oldcpu);
6225
6226	/* Find end of our completion_queue. */
6227	list_skb = &sd->completion_queue;
6228	while (*list_skb)
6229		list_skb = &(*list_skb)->next;
6230	/* Append completion queue from offline CPU. */
6231	*list_skb = oldsd->completion_queue;
6232	oldsd->completion_queue = NULL;
6233
6234	/* Append output queue from offline CPU. */
6235	if (oldsd->output_queue) {
6236		*sd->output_queue_tailp = oldsd->output_queue;
6237		sd->output_queue_tailp = oldsd->output_queue_tailp;
6238		oldsd->output_queue = NULL;
6239		oldsd->output_queue_tailp = &oldsd->output_queue;
6240	}
6241	/* Append NAPI poll list from offline CPU. */
6242	if (!list_empty(&oldsd->poll_list)) {
6243		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6244		raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 
 
 
 
 
 
 
 
 
6245	}
6246
6247	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6248	local_irq_enable();
6249
6250	/* Process offline CPU's input_pkt_queue */
6251	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6252		netif_rx(skb);
6253		input_queue_head_incr(oldsd);
6254	}
6255	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6256		netif_rx(skb);
6257		input_queue_head_incr(oldsd);
6258	}
6259
6260	return NOTIFY_OK;
6261}
6262
6263
6264/**
6265 *	netdev_increment_features - increment feature set by one
6266 *	@all: current feature set
6267 *	@one: new feature set
6268 *	@mask: mask feature set
6269 *
6270 *	Computes a new feature set after adding a device with feature set
6271 *	@one to the master device with current feature set @all.  Will not
6272 *	enable anything that is off in @mask. Returns the new feature set.
6273 */
6274netdev_features_t netdev_increment_features(netdev_features_t all,
6275	netdev_features_t one, netdev_features_t mask)
6276{
6277	if (mask & NETIF_F_GEN_CSUM)
6278		mask |= NETIF_F_ALL_CSUM;
6279	mask |= NETIF_F_VLAN_CHALLENGED;
6280
6281	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6282	all &= one | ~NETIF_F_ALL_FOR_ALL;
6283
6284	/* If one device supports hw checksumming, set for all. */
6285	if (all & NETIF_F_GEN_CSUM)
6286		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6287
6288	return all;
6289}
6290EXPORT_SYMBOL(netdev_increment_features);
6291
6292static struct hlist_head *netdev_create_hash(void)
6293{
6294	int i;
6295	struct hlist_head *hash;
6296
6297	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6298	if (hash != NULL)
6299		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6300			INIT_HLIST_HEAD(&hash[i]);
6301
6302	return hash;
6303}
6304
6305/* Initialize per network namespace state */
6306static int __net_init netdev_init(struct net *net)
6307{
6308	if (net != &init_net)
6309		INIT_LIST_HEAD(&net->dev_base_head);
6310
6311	net->dev_name_head = netdev_create_hash();
6312	if (net->dev_name_head == NULL)
6313		goto err_name;
6314
6315	net->dev_index_head = netdev_create_hash();
6316	if (net->dev_index_head == NULL)
6317		goto err_idx;
6318
6319	return 0;
6320
6321err_idx:
6322	kfree(net->dev_name_head);
6323err_name:
6324	return -ENOMEM;
6325}
6326
6327/**
6328 *	netdev_drivername - network driver for the device
6329 *	@dev: network device
6330 *
6331 *	Determine network driver for device.
6332 */
6333const char *netdev_drivername(const struct net_device *dev)
6334{
6335	const struct device_driver *driver;
6336	const struct device *parent;
6337	const char *empty = "";
6338
6339	parent = dev->dev.parent;
6340	if (!parent)
6341		return empty;
6342
6343	driver = parent->driver;
6344	if (driver && driver->name)
6345		return driver->name;
6346	return empty;
6347}
6348
6349int __netdev_printk(const char *level, const struct net_device *dev,
6350			   struct va_format *vaf)
6351{
6352	int r;
6353
6354	if (dev && dev->dev.parent)
6355		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6356			       netdev_name(dev), vaf);
6357	else if (dev)
6358		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6359	else
6360		r = printk("%s(NULL net_device): %pV", level, vaf);
6361
6362	return r;
 
 
 
6363}
6364EXPORT_SYMBOL(__netdev_printk);
6365
6366int netdev_printk(const char *level, const struct net_device *dev,
6367		  const char *format, ...)
6368{
6369	struct va_format vaf;
6370	va_list args;
6371	int r;
6372
6373	va_start(args, format);
6374
6375	vaf.fmt = format;
6376	vaf.va = &args;
6377
6378	r = __netdev_printk(level, dev, &vaf);
 
6379	va_end(args);
6380
6381	return r;
6382}
6383EXPORT_SYMBOL(netdev_printk);
6384
6385#define define_netdev_printk_level(func, level)			\
6386int func(const struct net_device *dev, const char *fmt, ...)	\
6387{								\
6388	int r;							\
6389	struct va_format vaf;					\
6390	va_list args;						\
6391								\
6392	va_start(args, fmt);					\
6393								\
6394	vaf.fmt = fmt;						\
6395	vaf.va = &args;						\
6396								\
6397	r = __netdev_printk(level, dev, &vaf);			\
 
6398	va_end(args);						\
6399								\
6400	return r;						\
6401}								\
6402EXPORT_SYMBOL(func);
6403
6404define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6405define_netdev_printk_level(netdev_alert, KERN_ALERT);
6406define_netdev_printk_level(netdev_crit, KERN_CRIT);
6407define_netdev_printk_level(netdev_err, KERN_ERR);
6408define_netdev_printk_level(netdev_warn, KERN_WARNING);
6409define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6410define_netdev_printk_level(netdev_info, KERN_INFO);
6411
6412static void __net_exit netdev_exit(struct net *net)
6413{
6414	kfree(net->dev_name_head);
6415	kfree(net->dev_index_head);
6416}
6417
6418static struct pernet_operations __net_initdata netdev_net_ops = {
6419	.init = netdev_init,
6420	.exit = netdev_exit,
6421};
6422
6423static void __net_exit default_device_exit(struct net *net)
6424{
6425	struct net_device *dev, *aux;
6426	/*
6427	 * Push all migratable network devices back to the
6428	 * initial network namespace
6429	 */
6430	rtnl_lock();
6431	for_each_netdev_safe(net, dev, aux) {
6432		int err;
6433		char fb_name[IFNAMSIZ];
6434
6435		/* Ignore unmoveable devices (i.e. loopback) */
6436		if (dev->features & NETIF_F_NETNS_LOCAL)
6437			continue;
6438
6439		/* Leave virtual devices for the generic cleanup */
6440		if (dev->rtnl_link_ops)
6441			continue;
6442
6443		/* Push remaining network devices to init_net */
6444		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6445		err = dev_change_net_namespace(dev, &init_net, fb_name);
6446		if (err) {
6447			pr_emerg("%s: failed to move %s to init_net: %d\n",
6448				 __func__, dev->name, err);
6449			BUG();
6450		}
6451	}
6452	rtnl_unlock();
6453}
6454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6455static void __net_exit default_device_exit_batch(struct list_head *net_list)
6456{
6457	/* At exit all network devices most be removed from a network
6458	 * namespace.  Do this in the reverse order of registration.
6459	 * Do this across as many network namespaces as possible to
6460	 * improve batching efficiency.
6461	 */
6462	struct net_device *dev;
6463	struct net *net;
6464	LIST_HEAD(dev_kill_list);
6465
6466	rtnl_lock();
 
 
 
 
 
 
 
 
 
 
 
6467	list_for_each_entry(net, net_list, exit_list) {
6468		for_each_netdev_reverse(net, dev) {
6469			if (dev->rtnl_link_ops)
6470				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6471			else
6472				unregister_netdevice_queue(dev, &dev_kill_list);
6473		}
6474	}
6475	unregister_netdevice_many(&dev_kill_list);
6476	list_del(&dev_kill_list);
6477	rtnl_unlock();
6478}
6479
6480static struct pernet_operations __net_initdata default_device_ops = {
6481	.exit = default_device_exit,
6482	.exit_batch = default_device_exit_batch,
6483};
6484
6485/*
6486 *	Initialize the DEV module. At boot time this walks the device list and
6487 *	unhooks any devices that fail to initialise (normally hardware not
6488 *	present) and leaves us with a valid list of present and active devices.
6489 *
6490 */
6491
6492/*
6493 *       This is called single threaded during boot, so no need
6494 *       to take the rtnl semaphore.
6495 */
6496static int __init net_dev_init(void)
6497{
6498	int i, rc = -ENOMEM;
6499
6500	BUG_ON(!dev_boot_phase);
6501
6502	if (dev_proc_init())
6503		goto out;
6504
6505	if (netdev_kobject_init())
6506		goto out;
6507
6508	INIT_LIST_HEAD(&ptype_all);
6509	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6510		INIT_LIST_HEAD(&ptype_base[i]);
6511
 
 
6512	if (register_pernet_subsys(&netdev_net_ops))
6513		goto out;
6514
6515	/*
6516	 *	Initialise the packet receive queues.
6517	 */
6518
6519	for_each_possible_cpu(i) {
6520		struct softnet_data *sd = &per_cpu(softnet_data, i);
6521
6522		memset(sd, 0, sizeof(*sd));
6523		skb_queue_head_init(&sd->input_pkt_queue);
6524		skb_queue_head_init(&sd->process_queue);
6525		sd->completion_queue = NULL;
6526		INIT_LIST_HEAD(&sd->poll_list);
6527		sd->output_queue = NULL;
6528		sd->output_queue_tailp = &sd->output_queue;
6529#ifdef CONFIG_RPS
6530		sd->csd.func = rps_trigger_softirq;
6531		sd->csd.info = sd;
6532		sd->csd.flags = 0;
6533		sd->cpu = i;
6534#endif
6535
6536		sd->backlog.poll = process_backlog;
6537		sd->backlog.weight = weight_p;
6538		sd->backlog.gro_list = NULL;
6539		sd->backlog.gro_count = 0;
6540	}
6541
6542	dev_boot_phase = 0;
6543
6544	/* The loopback device is special if any other network devices
6545	 * is present in a network namespace the loopback device must
6546	 * be present. Since we now dynamically allocate and free the
6547	 * loopback device ensure this invariant is maintained by
6548	 * keeping the loopback device as the first device on the
6549	 * list of network devices.  Ensuring the loopback devices
6550	 * is the first device that appears and the last network device
6551	 * that disappears.
6552	 */
6553	if (register_pernet_device(&loopback_net_ops))
6554		goto out;
6555
6556	if (register_pernet_device(&default_device_ops))
6557		goto out;
6558
6559	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6560	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6561
6562	hotcpu_notifier(dev_cpu_callback, 0);
6563	dst_init();
6564	dev_mcast_init();
6565	rc = 0;
6566out:
6567	return rc;
6568}
6569
6570subsys_initcall(net_dev_init);
6571
6572static int __init initialize_hashrnd(void)
6573{
6574	get_random_bytes(&hashrnd, sizeof(hashrnd));
6575	return 0;
6576}
6577
6578late_initcall_sync(initialize_hashrnd);
6579