   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
 
  85#include <linux/mutex.h>
 
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
 
 
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 
 105#include <net/pkt_sched.h>
 
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136
 137#include "net-sysfs.h"
 138
 139/* Instead of increasing this, you should create a hash table. */
 140#define MAX_GRO_SKBS 8
 141
 142/* This should be increased if a protocol with a bigger head is added. */
 143#define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145/*
 146 *	The list of packet types we will receive (as opposed to discard)
 147 *	and the routines to invoke.
 148 *
 149 *	Why 16. Because with 16 the only overlap we get on a hash of the
 150 *	low nibble of the protocol value is RARP/SNAP/X.25.
 151 *
 152 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153 *             sure which should go first, but I bet it won't make much
 154 *             difference if we are running VLANs.  The good news is that
 155 *             this protocol won't be in the list unless compiled in, so
 156 *             the average user (w/out VLANs) will not be adversely affected.
 157 *             --BLG
 158 *
 159 *		0800	IP
 160 *		8100    802.1Q VLAN
 161 *		0001	802.3
 162 *		0002	AX.25
 163 *		0004	802.2
 164 *		8035	RARP
 165 *		0005	SNAP
 166 *		0805	X.25
 167 *		0806	ARP
 168 *		8137	IPX
 169 *		0009	Localtalk
 170 *		86DD	IPv6
 171 */
 172
 173#define PTYPE_HASH_SIZE	(16)
 174#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
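/*
 * Worked example of the bucket arithmetic above (editorial sketch, not
 * part of the original file): the bucket index is the low nibble of the
 * host-order protocol value, so ETH_P_IP (0x0800) hashes to
 * 0x0800 & PTYPE_HASH_MASK = 0, while RARP (0x8035), SNAP (0x0005) and
 * X.25 (0x0805) all hash to bucket 5 -- exactly the RARP/SNAP/X.25
 * collision the comment block mentions.
 */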
 175
 176static DEFINE_SPINLOCK(ptype_lock);
 177static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 178static struct list_head ptype_all __read_mostly;	/* Taps */
 179
 180/*
 181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182 * semaphore.
 183 *
 184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185 *
 186 * Writers must hold the rtnl semaphore while they loop through the
 187 * dev_base_head list, and hold dev_base_lock for writing when they do the
 188 * actual updates.  This allows pure readers to access the list even
 189 * while a writer is preparing to update it.
 190 *
 191 * To put it another way, dev_base_lock is held for writing only to
 192 * protect against pure readers; the rtnl semaphore provides the
 193 * protection against other writers.
 194 *
 195 * See, for example usages, register_netdevice() and
 196 * unregister_netdevice(), which must be called with the rtnl
 197 * semaphore held.
 198 */
 199DEFINE_RWLOCK(dev_base_lock);
 200EXPORT_SYMBOL(dev_base_lock);
 201
 202static inline void dev_base_seq_inc(struct net *net)
 203{
 204	while (++net->dev_base_seq == 0);
 
 205}
 206
 207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208{
 209	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 
 210	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 211}
 212
 213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214{
 215	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 216}
 217
 218static inline void rps_lock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221	spin_lock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225static inline void rps_unlock(struct softnet_data *sd)
 226{
 227#ifdef CONFIG_RPS
 228	spin_unlock(&sd->input_pkt_queue.lock);
 229#endif
 230}
 231
 232/* Device list insertion */
 233static int list_netdevice(struct net_device *dev)
 234{
 235	struct net *net = dev_net(dev);
 236
 237	ASSERT_RTNL();
 238
 239	write_lock_bh(&dev_base_lock);
 240	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 241	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 242	hlist_add_head_rcu(&dev->index_hlist,
 243			   dev_index_hash(net, dev->ifindex));
 244	write_unlock_bh(&dev_base_lock);
 245
 246	dev_base_seq_inc(net);
 247
 248	return 0;
 249}
 250
 251/* Device list removal
 252 * caller must respect a RCU grace period before freeing/reusing dev
 253 */
 254static void unlist_netdevice(struct net_device *dev)
 255{
 256	ASSERT_RTNL();
 257
 258	/* Unlink dev from the device chain */
 259	write_lock_bh(&dev_base_lock);
 260	list_del_rcu(&dev->dev_list);
 261	hlist_del_rcu(&dev->name_hlist);
 262	hlist_del_rcu(&dev->index_hlist);
 263	write_unlock_bh(&dev_base_lock);
 264
 265	dev_base_seq_inc(dev_net(dev));
 266}
 267
 268/*
 269 *	Our notifier list
 270 */
 271
 272static RAW_NOTIFIER_HEAD(netdev_chain);
 273
 274/*
 275 *	Device drivers call our routines to queue packets here. We empty the
 276 *	queue in the local softnet handler.
 277 */
 278
 279DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 280EXPORT_PER_CPU_SYMBOL(softnet_data);
 281
 282#ifdef CONFIG_LOCKDEP
 283/*
 284 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 285 * according to dev->type
 286 */
 287static const unsigned short netdev_lock_type[] =
 288	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 289	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 290	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 291	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 292	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 293	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 294	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 295	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 296	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 297	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 298	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 299	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 300	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 301	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 302	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 303	 ARPHRD_VOID, ARPHRD_NONE};
 304
 305static const char *const netdev_lock_name[] =
 306	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 307	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 308	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 309	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 310	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 311	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 312	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 313	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 314	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 315	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 316	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 317	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 318	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 319	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 320	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 321	 "_xmit_VOID", "_xmit_NONE"};
 322
 323static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 324static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 325
 326static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 327{
 328	int i;
 329
 330	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 331		if (netdev_lock_type[i] == dev_type)
 332			return i;
 333	/* the last key is used by default */
 334	return ARRAY_SIZE(netdev_lock_type) - 1;
 335}
 336
 337static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338						 unsigned short dev_type)
 339{
 340	int i;
 341
 342	i = netdev_lock_pos(dev_type);
 343	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 344				   netdev_lock_name[i]);
 345}
 346
 347static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 348{
 349	int i;
 350
 351	i = netdev_lock_pos(dev->type);
 352	lockdep_set_class_and_name(&dev->addr_list_lock,
 353				   &netdev_addr_lock_key[i],
 354				   netdev_lock_name[i]);
 355}
 356#else
 357static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 358						 unsigned short dev_type)
 359{
 360}
 
 361static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 362{
 363}
 364#endif
 365
 366/*******************************************************************************
 367
 368		Protocol management and registration routines
 369
 370*******************************************************************************/
 371
 372/*
 373 *	Add a protocol ID to the list. Now that the input handler is
 374 *	smarter we can dispense with all the messy stuff that used to be
 375 *	here.
 376 *
 377 *	BEWARE!!! Protocol handlers, mangling input packets,
 378 *	MUST BE last in hash buckets and checking protocol handlers
 379 *	MUST start from promiscuous ptype_all chain in net_bh.
 380 *	It is true now, do not change it.
 381 *	Explanation follows: if protocol handler, mangling packet, will
 382 *	be the first on list, it is not able to sense, that packet
 383 *	is cloned and should be copied-on-write, so that it will
 384 *	change it and subsequent readers will get broken packet.
 385 *							--ANK (980803)
 386 */
 387
 388static inline struct list_head *ptype_head(const struct packet_type *pt)
 389{
 390	if (pt->type == htons(ETH_P_ALL))
 391		return &ptype_all;
 392	else
 393		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 
 394}
 395
 396/**
 397 *	dev_add_pack - add packet handler
 398 *	@pt: packet type declaration
 399 *
 400 *	Add a protocol handler to the networking stack. The passed &packet_type
 401 *	is linked into kernel lists and may not be freed until it has been
 402 *	removed from the kernel lists.
 403 *
 404 *	This call does not sleep therefore it can not
 405 *	guarantee all CPU's that are in middle of receiving packets
 406 *	will see the new packet type (until the next received packet).
 407 */
 408
 409void dev_add_pack(struct packet_type *pt)
 410{
 411	struct list_head *head = ptype_head(pt);
 412
 413	spin_lock(&ptype_lock);
 414	list_add_rcu(&pt->list, head);
 415	spin_unlock(&ptype_lock);
 416}
 417EXPORT_SYMBOL(dev_add_pack);
 418
 419/**
 420 *	__dev_remove_pack	 - remove packet handler
 421 *	@pt: packet type declaration
 422 *
 423 *	Remove a protocol handler that was previously added to the kernel
 424 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 425 *	from the kernel lists and can be freed or reused once this function
 426 *	returns.
 427 *
 428 *      The packet type might still be in use by receivers
 429 *	and must not be freed until after all the CPU's have gone
 430 *	through a quiescent state.
 431 */
 432void __dev_remove_pack(struct packet_type *pt)
 433{
 434	struct list_head *head = ptype_head(pt);
 435	struct packet_type *pt1;
 436
 437	spin_lock(&ptype_lock);
 438
 439	list_for_each_entry(pt1, head, list) {
 440		if (pt == pt1) {
 441			list_del_rcu(&pt->list);
 442			goto out;
 443		}
 444	}
 445
 446	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 447out:
 448	spin_unlock(&ptype_lock);
 449}
 450EXPORT_SYMBOL(__dev_remove_pack);
 451
 452/**
 453 *	dev_remove_pack	 - remove packet handler
 454 *	@pt: packet type declaration
 455 *
 456 *	Remove a protocol handler that was previously added to the kernel
 457 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 458 *	from the kernel lists and can be freed or reused once this function
 459 *	returns.
 460 *
 461 *	This call sleeps to guarantee that no CPU is looking at the packet
 462 *	type after return.
 463 */
 464void dev_remove_pack(struct packet_type *pt)
 465{
 466	__dev_remove_pack(pt);
 467
 468	synchronize_net();
 469}
 470EXPORT_SYMBOL(dev_remove_pack);
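/*
 * Illustrative sketch (editorial addition, not part of the original
 * dev.c): how a protocol module would typically pair dev_add_pack()
 * with dev_remove_pack().  The handler and packet_type names are
 * hypothetical; a real handler must consume or free the skb it is
 * given.
 */
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* ... look at the packet ... */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_packet_type __maybe_unused = {
	.type = cpu_to_be16(ETH_P_IP),	/* or ETH_P_ALL for a tap */
	.func = example_rcv,
};

/*
 * A module would call dev_add_pack(&example_packet_type) from its init
 * function and dev_remove_pack(&example_packet_type) from its exit
 * function; the latter sleeps in synchronize_net() so that no CPU can
 * still be looking at the handler afterwards.
 */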
 471
 472/******************************************************************************
 473
 474		      Device Boot-time Settings Routines
 475
 476*******************************************************************************/
 477
 478/* Boot time configuration table */
 479static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 480
 481/**
 482 *	netdev_boot_setup_add	- add new setup entry
 483 *	@name: name of the device
 484 *	@map: configured settings for the device
 485 *
 486 *	Adds new setup entry to the dev_boot_setup list.  The function
  487 *	returns 0 on error and 1 on success.  This is a generic routine for
 488 *	all netdevices.
 489 */
 490static int netdev_boot_setup_add(char *name, struct ifmap *map)
 491{
 492	struct netdev_boot_setup *s;
 493	int i;
 494
 495	s = dev_boot_setup;
 496	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 497		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 498			memset(s[i].name, 0, sizeof(s[i].name));
 499			strlcpy(s[i].name, name, IFNAMSIZ);
 500			memcpy(&s[i].map, map, sizeof(s[i].map));
 501			break;
 502		}
 503	}
 504
 505	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 506}
 507
 508/**
 509 *	netdev_boot_setup_check	- check boot time settings
 510 *	@dev: the netdevice
 511 *
 512 * 	Check boot time settings for the device.
 513 *	The found settings are set for the device to be used
 514 *	later in the device probing.
 515 *	Returns 0 if no settings found, 1 if they are.
 516 */
 517int netdev_boot_setup_check(struct net_device *dev)
 518{
 519	struct netdev_boot_setup *s = dev_boot_setup;
 520	int i;
 521
 522	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 523		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 524		    !strcmp(dev->name, s[i].name)) {
 525			dev->irq 	= s[i].map.irq;
 526			dev->base_addr 	= s[i].map.base_addr;
 527			dev->mem_start 	= s[i].map.mem_start;
 528			dev->mem_end 	= s[i].map.mem_end;
 529			return 1;
 530		}
 531	}
 532	return 0;
 533}
 534EXPORT_SYMBOL(netdev_boot_setup_check);
 535
 536
 537/**
 538 *	netdev_boot_base	- get address from boot time settings
 539 *	@prefix: prefix for network device
 540 *	@unit: id for network device
 541 *
 542 * 	Check boot time settings for the base address of device.
 543 *	The found settings are set for the device to be used
 544 *	later in the device probing.
 545 *	Returns 0 if no settings found.
 546 */
 547unsigned long netdev_boot_base(const char *prefix, int unit)
 548{
 549	const struct netdev_boot_setup *s = dev_boot_setup;
 550	char name[IFNAMSIZ];
 551	int i;
 552
 553	sprintf(name, "%s%d", prefix, unit);
 554
 555	/*
 556	 * If device already registered then return base of 1
 557	 * to indicate not to probe for this interface
 558	 */
 559	if (__dev_get_by_name(&init_net, name))
 560		return 1;
 561
 562	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 563		if (!strcmp(name, s[i].name))
 564			return s[i].map.base_addr;
 565	return 0;
 566}
 567
 568/*
 569 * Saves at boot time configured settings for any netdevice.
 570 */
 571int __init netdev_boot_setup(char *str)
 572{
 573	int ints[5];
 574	struct ifmap map;
 575
 576	str = get_options(str, ARRAY_SIZE(ints), ints);
 577	if (!str || !*str)
 578		return 0;
 579
 580	/* Save settings */
 581	memset(&map, 0, sizeof(map));
 582	if (ints[0] > 0)
 583		map.irq = ints[1];
 584	if (ints[0] > 1)
 585		map.base_addr = ints[2];
 586	if (ints[0] > 2)
 587		map.mem_start = ints[3];
 588	if (ints[0] > 3)
 589		map.mem_end = ints[4];
 590
 591	/* Add new entry to the list */
 592	return netdev_boot_setup_add(str, &map);
 593}
 594
 595__setup("netdev=", netdev_boot_setup);
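/*
 * Editorial example of the boot parameter handled above (not part of
 * the original file): get_options() consumes up to four integers in the
 * order irq, base_addr, mem_start, mem_end, and the remainder of the
 * string is taken as the interface name, e.g. on the kernel command
 * line (values are hypothetical):
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth0
 *
 * Trailing integers may be omitted; fields not supplied stay zero in
 * the saved ifmap.
 */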
 596
 597/*******************************************************************************
 598
 599			    Device Interface Subroutines
 600
 601*******************************************************************************/
 602
 603/**
 604 *	__dev_get_by_name	- find a device by its name
 605 *	@net: the applicable net namespace
 606 *	@name: name to find
 607 *
 608 *	Find an interface by name. Must be called under RTNL semaphore
 609 *	or @dev_base_lock. If the name is found a pointer to the device
 610 *	is returned. If the name is not found then %NULL is returned. The
 611 *	reference counters are not incremented so the caller must be
 612 *	careful with locks.
 613 */
 614
 615struct net_device *__dev_get_by_name(struct net *net, const char *name)
 616{
 617	struct hlist_node *p;
 618	struct net_device *dev;
 619	struct hlist_head *head = dev_name_hash(net, name);
 620
 621	hlist_for_each_entry(dev, p, head, name_hlist)
 622		if (!strncmp(dev->name, name, IFNAMSIZ))
 623			return dev;
 624
 625	return NULL;
 
 626}
 627EXPORT_SYMBOL(__dev_get_by_name);
 628
 629/**
 630 *	dev_get_by_name_rcu	- find a device by its name
 631 *	@net: the applicable net namespace
 632 *	@name: name to find
 633 *
 634 *	Find an interface by name.
 635 *	If the name is found a pointer to the device is returned.
 636 * 	If the name is not found then %NULL is returned.
 637 *	The reference counters are not incremented so the caller must be
 638 *	careful with locks. The caller must hold RCU lock.
 639 */
 640
 641struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 642{
 643	struct hlist_node *p;
 644	struct net_device *dev;
 645	struct hlist_head *head = dev_name_hash(net, name);
 646
 647	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 648		if (!strncmp(dev->name, name, IFNAMSIZ))
 649			return dev;
 650
 651	return NULL;
 652}
 653EXPORT_SYMBOL(dev_get_by_name_rcu);
 654
 655/**
 656 *	dev_get_by_name		- find a device by its name
 657 *	@net: the applicable net namespace
 658 *	@name: name to find
 659 *
 660 *	Find an interface by name. This can be called from any
 661 *	context and does its own locking. The returned handle has
 662 *	the usage count incremented and the caller must use dev_put() to
 663 *	release it when it is no longer needed. %NULL is returned if no
 664 *	matching device is found.
 665 */
 666
 667struct net_device *dev_get_by_name(struct net *net, const char *name)
 668{
 669	struct net_device *dev;
 670
 671	rcu_read_lock();
 672	dev = dev_get_by_name_rcu(net, name);
 673	if (dev)
 674		dev_hold(dev);
 675	rcu_read_unlock();
 676	return dev;
 677}
 678EXPORT_SYMBOL(dev_get_by_name);
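/*
 * Illustrative sketch (editorial addition): the usual hold/put pattern
 * around dev_get_by_name().  The interface name is hypothetical.
 */
static void __maybe_unused example_use_by_name(struct net *net)
{
	struct net_device *dev = dev_get_by_name(net, "eth0");

	if (!dev)
		return;
	/* ... the reference taken above keeps dev alive here ... */
	dev_put(dev);
}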
 679
 680/**
 681 *	__dev_get_by_index - find a device by its ifindex
 682 *	@net: the applicable net namespace
 683 *	@ifindex: index of device
 684 *
 685 *	Search for an interface by index. Returns %NULL if the device
 686 *	is not found or a pointer to the device. The device has not
 687 *	had its reference counter increased so the caller must be careful
 688 *	about locking. The caller must hold either the RTNL semaphore
 689 *	or @dev_base_lock.
 690 */
 691
 692struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 693{
 694	struct hlist_node *p;
 695	struct net_device *dev;
 696	struct hlist_head *head = dev_index_hash(net, ifindex);
 697
 698	hlist_for_each_entry(dev, p, head, index_hlist)
 699		if (dev->ifindex == ifindex)
 700			return dev;
 701
 702	return NULL;
 703}
 704EXPORT_SYMBOL(__dev_get_by_index);
 705
 706/**
 707 *	dev_get_by_index_rcu - find a device by its ifindex
 708 *	@net: the applicable net namespace
 709 *	@ifindex: index of device
 710 *
 711 *	Search for an interface by index. Returns %NULL if the device
 712 *	is not found or a pointer to the device. The device has not
 713 *	had its reference counter increased so the caller must be careful
 714 *	about locking. The caller must hold RCU lock.
 715 */
 716
 717struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 718{
 719	struct hlist_node *p;
 720	struct net_device *dev;
 721	struct hlist_head *head = dev_index_hash(net, ifindex);
 722
 723	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 724		if (dev->ifindex == ifindex)
 725			return dev;
 726
 727	return NULL;
 728}
 729EXPORT_SYMBOL(dev_get_by_index_rcu);
 730
 731
 732/**
 733 *	dev_get_by_index - find a device by its ifindex
 734 *	@net: the applicable net namespace
 735 *	@ifindex: index of device
 736 *
 737 *	Search for an interface by index. Returns NULL if the device
 738 *	is not found or a pointer to the device. The device returned has
 739 *	had a reference added and the pointer is safe until the user calls
 740 *	dev_put to indicate they have finished with it.
 741 */
 742
 743struct net_device *dev_get_by_index(struct net *net, int ifindex)
 744{
 745	struct net_device *dev;
 746
 747	rcu_read_lock();
 748	dev = dev_get_by_index_rcu(net, ifindex);
 749	if (dev)
 750		dev_hold(dev);
 751	rcu_read_unlock();
 752	return dev;
 753}
 754EXPORT_SYMBOL(dev_get_by_index);
 755
 756/**
 757 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 758 *	@net: the applicable net namespace
 759 *	@type: media type of device
 760 *	@ha: hardware address
 761 *
 762 *	Search for an interface by MAC address. Returns NULL if the device
 763 *	is not found or a pointer to the device.
 764 *	The caller must hold RCU or RTNL.
 765 *	The returned device has not had its ref count increased
 766 *	and the caller must therefore be careful about locking
 767 *
 768 */
 769
 770struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 771				       const char *ha)
 772{
 773	struct net_device *dev;
 774
 775	for_each_netdev_rcu(net, dev)
 776		if (dev->type == type &&
 777		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 778			return dev;
 779
 780	return NULL;
 781}
 782EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 783
 784struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 785{
 786	struct net_device *dev;
 787
 788	ASSERT_RTNL();
 789	for_each_netdev(net, dev)
 790		if (dev->type == type)
 791			return dev;
 792
 793	return NULL;
 794}
 795EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 796
 797struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 798{
 799	struct net_device *dev, *ret = NULL;
 800
 801	rcu_read_lock();
 802	for_each_netdev_rcu(net, dev)
 803		if (dev->type == type) {
 804			dev_hold(dev);
 805			ret = dev;
 806			break;
 807		}
 808	rcu_read_unlock();
 809	return ret;
 810}
 811EXPORT_SYMBOL(dev_getfirstbyhwtype);
 812
 813/**
 814 *	dev_get_by_flags_rcu - find any device with given flags
 815 *	@net: the applicable net namespace
 816 *	@if_flags: IFF_* values
 817 *	@mask: bitmask of bits in if_flags to check
 818 *
 819 *	Search for any interface with the given flags. Returns NULL if a device
 820 *	is not found or a pointer to the device. Must be called inside
 821 *	rcu_read_lock(), and result refcount is unchanged.
 822 */
 823
 824struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 825				    unsigned short mask)
 826{
 827	struct net_device *dev, *ret;
 828
 
 
 829	ret = NULL;
 830	for_each_netdev_rcu(net, dev) {
 831		if (((dev->flags ^ if_flags) & mask) == 0) {
 832			ret = dev;
 833			break;
 834		}
 835	}
 836	return ret;
 837}
 838EXPORT_SYMBOL(dev_get_by_flags_rcu);
 839
 840/**
 841 *	dev_valid_name - check if name is okay for network device
 842 *	@name: name string
 843 *
 844 *	Network device names need to be valid file names to
 845 *	allow sysfs to work.  We also disallow any kind of
 846 *	whitespace.
 847 */
 848int dev_valid_name(const char *name)
 849{
 850	if (*name == '\0')
 851		return 0;
 852	if (strlen(name) >= IFNAMSIZ)
 853		return 0;
 854	if (!strcmp(name, ".") || !strcmp(name, ".."))
 855		return 0;
 856
 857	while (*name) {
 858		if (*name == '/' || isspace(*name))
 859			return 0;
 860		name++;
 861	}
 862	return 1;
 863}
 864EXPORT_SYMBOL(dev_valid_name);
 865
 866/**
 867 *	__dev_alloc_name - allocate a name for a device
 868 *	@net: network namespace to allocate the device name in
 869 *	@name: name format string
 870 *	@buf:  scratch buffer and result name string
 871 *
 872 *	Passed a format string - eg "lt%d" it will try and find a suitable
 873 *	id. It scans list of devices to build up a free map, then chooses
 874 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 875 *	while allocating the name and adding the device in order to avoid
 876 *	duplicates.
 877 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 878 *	Returns the number of the unit assigned or a negative errno code.
 879 */
 880
 881static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 882{
 883	int i = 0;
 884	const char *p;
 885	const int max_netdevices = 8*PAGE_SIZE;
 886	unsigned long *inuse;
 887	struct net_device *d;
 888
 889	p = strnchr(name, IFNAMSIZ-1, '%');
 890	if (p) {
 891		/*
 892		 * Verify the string as this thing may have come from
 893		 * the user.  There must be either one "%d" and no other "%"
 894		 * characters.
 895		 */
 896		if (p[1] != 'd' || strchr(p + 2, '%'))
 897			return -EINVAL;
 898
 899		/* Use one page as a bit array of possible slots */
 900		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 901		if (!inuse)
 902			return -ENOMEM;
 903
 904		for_each_netdev(net, d) {
 905			if (!sscanf(d->name, name, &i))
 906				continue;
 907			if (i < 0 || i >= max_netdevices)
 908				continue;
 909
 910			/*  avoid cases where sscanf is not exact inverse of printf */
 911			snprintf(buf, IFNAMSIZ, name, i);
 912			if (!strncmp(buf, d->name, IFNAMSIZ))
 913				set_bit(i, inuse);
 914		}
 915
 916		i = find_first_zero_bit(inuse, max_netdevices);
 917		free_page((unsigned long) inuse);
 918	}
 919
 920	if (buf != name)
 921		snprintf(buf, IFNAMSIZ, name, i);
 922	if (!__dev_get_by_name(net, buf))
 923		return i;
 924
 925	/* It is possible to run out of possible slots
 926	 * when the name is long and there isn't enough space left
 927	 * for the digits, or if all bits are used.
 928	 */
 929	return -ENFILE;
 930}
 931
 932/**
 933 *	dev_alloc_name - allocate a name for a device
 934 *	@dev: device
 935 *	@name: name format string
 936 *
 937 *	Passed a format string - eg "lt%d" it will try and find a suitable
 938 *	id. It scans list of devices to build up a free map, then chooses
 939 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 940 *	while allocating the name and adding the device in order to avoid
 941 *	duplicates.
 942 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 943 *	Returns the number of the unit assigned or a negative errno code.
 944 */
 945
 946int dev_alloc_name(struct net_device *dev, const char *name)
 947{
 948	char buf[IFNAMSIZ];
 949	struct net *net;
 950	int ret;
 951
 952	BUG_ON(!dev_net(dev));
 953	net = dev_net(dev);
 954	ret = __dev_alloc_name(net, name, buf);
 955	if (ret >= 0)
 956		strlcpy(dev->name, buf, IFNAMSIZ);
 957	return ret;
 958}
 959EXPORT_SYMBOL(dev_alloc_name);
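/*
 * Illustrative sketch (editorial addition): asking for the next free
 * "foo%d"-style name before registering a device.  As the comment above
 * says, this must run with the RTNL (or dev_base lock) held; the format
 * string is hypothetical.
 */
static int __maybe_unused example_pick_name(struct net_device *dev)
{
	int unit = dev_alloc_name(dev, "foo%d");	/* fills dev->name, e.g. "foo0" */

	if (unit < 0)
		return unit;	/* -EINVAL or -ENFILE */
	return 0;
}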
 960
 961static int dev_get_valid_name(struct net_device *dev, const char *name)
 
 962{
 963	struct net *net;
 964
 965	BUG_ON(!dev_net(dev));
 966	net = dev_net(dev);
 967
 968	if (!dev_valid_name(name))
 969		return -EINVAL;
 970
 971	if (strchr(name, '%'))
 972		return dev_alloc_name(dev, name);
 973	else if (__dev_get_by_name(net, name))
 974		return -EEXIST;
 975	else if (dev->name != name)
 976		strlcpy(dev->name, name, IFNAMSIZ);
 977
 978	return 0;
 979}
 980
 981/**
 982 *	dev_change_name - change name of a device
 983 *	@dev: device
 984 *	@newname: name (or format string) must be at least IFNAMSIZ
 985 *
 986 *	Change name of a device, can pass format strings "eth%d"
 987 *	for wildcarding.
 988 */
 989int dev_change_name(struct net_device *dev, const char *newname)
 990{
 
 991	char oldname[IFNAMSIZ];
 992	int err = 0;
 993	int ret;
 994	struct net *net;
 995
 996	ASSERT_RTNL();
 997	BUG_ON(!dev_net(dev));
 998
 999	net = dev_net(dev);
1000	if (dev->flags & IFF_UP)
1001		return -EBUSY;
1002
1003	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1004		return 0;
 
1005
1006	memcpy(oldname, dev->name, IFNAMSIZ);
1007
1008	err = dev_get_valid_name(dev, newname);
1009	if (err < 0)
 
1010		return err;
1011
1012rollback:
1013	ret = device_rename(&dev->dev, dev->name);
1014	if (ret) {
1015		memcpy(dev->name, oldname, IFNAMSIZ);
 
 
1016		return ret;
1017	}
1018
1019	write_lock_bh(&dev_base_lock);
1020	hlist_del_rcu(&dev->name_hlist);
1021	write_unlock_bh(&dev_base_lock);
1022
1023	synchronize_rcu();
1024
1025	write_lock_bh(&dev_base_lock);
1026	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1027	write_unlock_bh(&dev_base_lock);
1028
1029	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1030	ret = notifier_to_errno(ret);
1031
1032	if (ret) {
1033		/* err >= 0 after dev_alloc_name() or stores the first errno */
1034		if (err >= 0) {
1035			err = ret;
 
1036			memcpy(dev->name, oldname, IFNAMSIZ);
1037			goto rollback;
1038		} else {
1039			printk(KERN_ERR
1040			       "%s: name change rollback failed: %d.\n",
1041			       dev->name, ret);
1042		}
1043	}
1044
1045	return err;
1046}
1047
1048/**
1049 *	dev_set_alias - change ifalias of a device
1050 *	@dev: device
1051 *	@alias: name up to IFALIASZ
1052 *	@len: limit of bytes to copy from info
1053 *
1054 *	Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058	ASSERT_RTNL();
1059
1060	if (len >= IFALIASZ)
1061		return -EINVAL;
1062
1063	if (!len) {
1064		if (dev->ifalias) {
1065			kfree(dev->ifalias);
1066			dev->ifalias = NULL;
1067		}
1068		return 0;
 
1069	}
1070
1071	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072	if (!dev->ifalias)
1073		return -ENOMEM;
1074
1075	strlcpy(dev->ifalias, alias, len+1);
1076	return len;
1077}
1078
1079
1080/**
1081 *	netdev_features_change - device changes features
1082 *	@dev: device to cause notification
1083 *
1084 *	Called to indicate a device has changed features.
1085 */
1086void netdev_features_change(struct net_device *dev)
1087{
1088	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1089}
1090EXPORT_SYMBOL(netdev_features_change);
1091
1092/**
1093 *	netdev_state_change - device changes state
1094 *	@dev: device to cause notification
1095 *
1096 *	Called to indicate a device has changed state. This function calls
1097 *	the notifier chains for netdev_chain and sends a NEWLINK message
1098 *	to the routing socket.
1099 */
1100void netdev_state_change(struct net_device *dev)
1101{
1102	if (dev->flags & IFF_UP) {
1103		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105	}
1106}
1107EXPORT_SYMBOL(netdev_state_change);
1108
1109int netdev_bonding_change(struct net_device *dev, unsigned long event)
1110{
1111	return call_netdevice_notifiers(event, dev);
1112}
1113EXPORT_SYMBOL(netdev_bonding_change);
1114
1115/**
1116 *	dev_load 	- load a network module
1117 *	@net: the applicable net namespace
1118 *	@name: name of interface
1119 *
1120 *	If a network interface is not present and the process has suitable
1121 *	privileges this function loads the module. If module loading is not
1122 *	available in this kernel then it becomes a nop.
 
 
1123 */
1124
1125void dev_load(struct net *net, const char *name)
1126{
1127	struct net_device *dev;
1128	int no_module;
1129
1130	rcu_read_lock();
1131	dev = dev_get_by_name_rcu(net, name);
1132	rcu_read_unlock();
1133
1134	no_module = !dev;
1135	if (no_module && capable(CAP_NET_ADMIN))
1136		no_module = request_module("netdev-%s", name);
1137	if (no_module && capable(CAP_SYS_MODULE)) {
1138		if (!request_module("%s", name))
1139			pr_err("Loading kernel module for a network device "
1140"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1141"instead\n", name);
1142	}
1143}
1144EXPORT_SYMBOL(dev_load);
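/*
 * Editorial note (not in the original file): for the CAP_NET_ADMIN path
 * above to succeed, the module that provides the interface is expected
 * to advertise a "netdev-<name>" alias.  A hypothetical driver whose
 * device is literally called foo0 could declare:
 *
 *	MODULE_ALIAS_NETDEV("foo0");
 *
 * which expands to MODULE_ALIAS("netdev-foo0").  The CAP_SYS_MODULE
 * fallback in dev_load() above is kept only for compatibility and
 * prints a deprecation error when it is used.
 */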
1145
1146static int __dev_open(struct net_device *dev)
1147{
1148	const struct net_device_ops *ops = dev->netdev_ops;
1149	int ret;
1150
1151	ASSERT_RTNL();
1152
1153	if (!netif_device_present(dev))
1154		return -ENODEV;
1155
1156	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157	ret = notifier_to_errno(ret);
1158	if (ret)
1159		return ret;
1160
1161	set_bit(__LINK_STATE_START, &dev->state);
1162
1163	if (ops->ndo_validate_addr)
1164		ret = ops->ndo_validate_addr(dev);
1165
1166	if (!ret && ops->ndo_open)
1167		ret = ops->ndo_open(dev);
1168
 
 
1169	if (ret)
1170		clear_bit(__LINK_STATE_START, &dev->state);
1171	else {
1172		dev->flags |= IFF_UP;
1173		net_dmaengine_get();
1174		dev_set_rx_mode(dev);
1175		dev_activate(dev);
 
1176	}
1177
1178	return ret;
1179}
1180
1181/**
1182 *	dev_open	- prepare an interface for use.
1183 *	@dev:	device to open
 
1184 *
1185 *	Takes a device from down to up state. The device's private open
1186 *	function is invoked and then the multicast lists are loaded. Finally
1187 *	the device is moved into the up state and a %NETDEV_UP message is
1188 *	sent to the netdev notifier chain.
1189 *
1190 *	Calling this function on an active interface is a nop. On a failure
1191 *	a negative errno code is returned.
1192 */
1193int dev_open(struct net_device *dev)
1194{
1195	int ret;
1196
1197	if (dev->flags & IFF_UP)
1198		return 0;
1199
1200	ret = __dev_open(dev);
1201	if (ret < 0)
1202		return ret;
1203
1204	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205	call_netdevice_notifiers(NETDEV_UP, dev);
1206
1207	return ret;
1208}
1209EXPORT_SYMBOL(dev_open);
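/*
 * Illustrative sketch (editorial addition): bringing an interface up
 * from kernel code.  dev_open() must be called under the RTNL; the
 * interface name is hypothetical.
 */
static int __maybe_unused example_bring_up(struct net *net)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, "eth0");
	if (dev)
		err = dev_open(dev);
	rtnl_unlock();
	return err;
}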
1210
1211static int __dev_close_many(struct list_head *head)
1212{
1213	struct net_device *dev;
1214
1215	ASSERT_RTNL();
1216	might_sleep();
1217
1218	list_for_each_entry(dev, head, unreg_list) {
1219		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1220
1221		clear_bit(__LINK_STATE_START, &dev->state);
1222
1223		/* Synchronize to scheduled poll. We cannot touch poll list, it
1224		 * can be even on different cpu. So just clear netif_running().
1225		 *
1226		 * dev->stop() will invoke napi_disable() on all of its
1227		 * napi_struct instances on this device.
1228		 */
1229		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230	}
1231
1232	dev_deactivate_many(head);
1233
1234	list_for_each_entry(dev, head, unreg_list) {
1235		const struct net_device_ops *ops = dev->netdev_ops;
1236
1237		/*
1238		 *	Call the device specific close. This cannot fail.
1239		 *	Only if device is UP
1240		 *
1241		 *	We allow it to be called even after a DETACH hot-plug
1242		 *	event.
1243		 */
1244		if (ops->ndo_stop)
1245			ops->ndo_stop(dev);
1246
1247		dev->flags &= ~IFF_UP;
1248		net_dmaengine_put();
1249	}
1250
1251	return 0;
1252}
1253
1254static int __dev_close(struct net_device *dev)
1255{
1256	int retval;
1257	LIST_HEAD(single);
1258
1259	list_add(&dev->unreg_list, &single);
1260	retval = __dev_close_many(&single);
1261	list_del(&single);
1262	return retval;
1263}
1264
1265static int dev_close_many(struct list_head *head)
1266{
1267	struct net_device *dev, *tmp;
1268	LIST_HEAD(tmp_list);
1269
1270	list_for_each_entry_safe(dev, tmp, head, unreg_list)
 
1271		if (!(dev->flags & IFF_UP))
1272			list_move(&dev->unreg_list, &tmp_list);
1273
1274	__dev_close_many(head);
1275
1276	list_for_each_entry(dev, head, unreg_list) {
1277		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278		call_netdevice_notifiers(NETDEV_DOWN, dev);
 
 
1279	}
1280
1281	/* rollback_registered_many needs the complete original list */
1282	list_splice(&tmp_list, head);
1283	return 0;
1284}
 
1285
1286/**
1287 *	dev_close - shutdown an interface.
1288 *	@dev: device to shutdown
1289 *
1290 *	This function moves an active device into down state. A
1291 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293 *	chain.
1294 */
1295int dev_close(struct net_device *dev)
1296{
1297	if (dev->flags & IFF_UP) {
1298		LIST_HEAD(single);
1299
1300		list_add(&dev->unreg_list, &single);
1301		dev_close_many(&single);
1302		list_del(&single);
1303	}
1304	return 0;
1305}
1306EXPORT_SYMBOL(dev_close);
1307
1308
1309/**
1310 *	dev_disable_lro - disable Large Receive Offload on a device
1311 *	@dev: device
1312 *
1313 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1314 *	called under RTNL.  This is needed if received packets may be
1315 *	forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319	u32 flags;
1320
1321	/*
1322	 * If we're trying to disable lro on a vlan device
1323	 * use the underlying physical device instead
1324	 */
1325	if (is_vlan_dev(dev))
1326		dev = vlan_dev_real_dev(dev);
1327
1328	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1329		flags = dev->ethtool_ops->get_flags(dev);
1330	else
1331		flags = ethtool_op_get_flags(dev);
1332
1333	if (!(flags & ETH_FLAG_LRO))
1334		return;
1335
1336	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1337	if (unlikely(dev->features & NETIF_F_LRO))
1338		netdev_WARN(dev, "failed to disable LRO!\n");
1339}
1340EXPORT_SYMBOL(dev_disable_lro);
1341
1342
1343static int dev_boot_phase = 1;
1344
1345/**
1346 *	register_netdevice_notifier - register a network notifier block
1347 *	@nb: notifier
1348 *
1349 *	Register a notifier to be called when network device events occur.
1350 *	The notifier passed is linked into the kernel structures and must
1351 *	not be reused until it has been unregistered. A negative errno code
1352 *	is returned on a failure.
1353 *
1354 * 	When registered all registration and up events are replayed
1355 *	to the new notifier to allow device to have a race free
1356 *	view of the network device list.
1357 */
1358
1359int register_netdevice_notifier(struct notifier_block *nb)
1360{
1361	struct net_device *dev;
1362	struct net_device *last;
1363	struct net *net;
1364	int err;
1365
 
 
1366	rtnl_lock();
1367	err = raw_notifier_chain_register(&netdev_chain, nb);
1368	if (err)
1369		goto unlock;
1370	if (dev_boot_phase)
1371		goto unlock;
1372	for_each_net(net) {
1373		for_each_netdev(net, dev) {
1374			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1375			err = notifier_to_errno(err);
1376			if (err)
1377				goto rollback;
1378
1379			if (!(dev->flags & IFF_UP))
1380				continue;
1381
1382			nb->notifier_call(nb, NETDEV_UP, dev);
1383		}
1384	}
1385
1386unlock:
1387	rtnl_unlock();
 
1388	return err;
1389
1390rollback:
1391	last = dev;
1392	for_each_net(net) {
1393		for_each_netdev(net, dev) {
1394			if (dev == last)
1395				break;
1396
1397			if (dev->flags & IFF_UP) {
1398				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1399				nb->notifier_call(nb, NETDEV_DOWN, dev);
1400			}
1401			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1402			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1403		}
1404	}
1405
1406	raw_notifier_chain_unregister(&netdev_chain, nb);
1407	goto unlock;
1408}
1409EXPORT_SYMBOL(register_netdevice_notifier);
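/*
 * Illustrative sketch (editorial addition): a minimal netdevice
 * notifier.  In this kernel version the opaque pointer handed to the
 * callback is the struct net_device itself, and the chain runs under
 * the RTNL.  All names here are hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		pr_info("example: %s is up\n", dev->name);
		break;
	case NETDEV_UNREGISTER:
		pr_info("example: %s is going away\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb __maybe_unused = {
	.notifier_call = example_netdev_event,
};

/* registered with register_netdevice_notifier(&example_netdev_nb) and
 * torn down with unregister_netdevice_notifier(&example_netdev_nb) */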
1410
1411/**
1412 *	unregister_netdevice_notifier - unregister a network notifier block
1413 *	@nb: notifier
1414 *
1415 *	Unregister a notifier previously registered by
1416 *	register_netdevice_notifier(). The notifier is unlinked from the
1417 *	kernel structures and may then be reused. A negative errno code
1418 *	is returned on a failure.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
 
1423	int err;
1424
 
 
1425	rtnl_lock();
1426	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1427	rtnl_unlock();
 
1428	return err;
1429}
1430EXPORT_SYMBOL(unregister_netdevice_notifier);
1431
1432/**
1433 *	call_netdevice_notifiers - call all network notifier blocks
1434 *      @val: value passed unmodified to notifier function
1435 *      @dev: net_device pointer passed unmodified to notifier function
1436 *
1437 *	Call all network notifier blocks.  Parameters and return value
1438 *	are as for raw_notifier_call_chain().
1439 */
1440
1441int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1442{
1443	ASSERT_RTNL();
1444	return raw_notifier_call_chain(&netdev_chain, val, dev);
1445}
1446EXPORT_SYMBOL(call_netdevice_notifiers);
1447
1448/* When > 0 there are consumers of rx skb time stamps */
1449static atomic_t netstamp_needed = ATOMIC_INIT(0);
1450
1451void net_enable_timestamp(void)
1452{
1453	atomic_inc(&netstamp_needed);
1454}
1455EXPORT_SYMBOL(net_enable_timestamp);
1456
1457void net_disable_timestamp(void)
1458{
1459	atomic_dec(&netstamp_needed);
1460}
1461EXPORT_SYMBOL(net_disable_timestamp);
1462
1463static inline void net_timestamp_set(struct sk_buff *skb)
1464{
1465	if (atomic_read(&netstamp_needed))
 
1466		__net_timestamp(skb);
1467	else
1468		skb->tstamp.tv64 = 0;
1469}
1470
1471static inline void net_timestamp_check(struct sk_buff *skb)
1472{
1473	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1474		__net_timestamp(skb);
1475}
1476
1477static inline bool is_skb_forwardable(struct net_device *dev,
1478				      struct sk_buff *skb)
1479{
1480	unsigned int len;
1481
1482	if (!(dev->flags & IFF_UP))
1483		return false;
1484
1485	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1486	if (skb->len <= len)
1487		return true;
1488
1489	/* if TSO is enabled, we don't care about the length as the packet
1490	 * could be forwarded without being segmented before
1491	 */
1492	if (skb_is_gso(skb))
1493		return true;
1494
1495	return false;
1496}
1497
1498/**
1499 * dev_forward_skb - loopback an skb to another netif
1500 *
1501 * @dev: destination network device
1502 * @skb: buffer to forward
1503 *
1504 * return values:
1505 *	NET_RX_SUCCESS	(no congestion)
1506 *	NET_RX_DROP     (packet was dropped, but freed)
1507 *
1508 * dev_forward_skb can be used for injecting an skb from the
1509 * start_xmit function of one device into the receive queue
1510 * of another device.
1511 *
1512 * The receiving device may be in another namespace, so
1513 * we have to clear all information in the skb that could
1514 * impact namespace isolation.
1515 */
1516int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1517{
1518	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1519		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1520			atomic_long_inc(&dev->rx_dropped);
1521			kfree_skb(skb);
1522			return NET_RX_DROP;
1523		}
1524	}
1525
1526	skb_orphan(skb);
1527	nf_reset(skb);
1528
1529	if (unlikely(!is_skb_forwardable(dev, skb))) {
1530		atomic_long_inc(&dev->rx_dropped);
1531		kfree_skb(skb);
1532		return NET_RX_DROP;
1533	}
1534	skb_set_dev(skb, dev);
1535	skb->tstamp.tv64 = 0;
1536	skb->pkt_type = PACKET_HOST;
1537	skb->protocol = eth_type_trans(skb, dev);
1538	return netif_rx(skb);
1539}
1540EXPORT_SYMBOL_GPL(dev_forward_skb);
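/*
 * Illustrative sketch (editorial addition): how a veth-like virtual
 * driver might hand a transmitted skb to its peer's receive path.
 * "example_priv" and its "peer" field are hypothetical.
 */
struct example_priv {
	struct net_device *peer;
};

static netdev_tx_t __maybe_unused example_xmit(struct sk_buff *skb,
					       struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	/* dev_forward_skb() orphans the skb, scrubs namespace state and
	 * feeds it to netif_rx() on the peer; it frees the skb itself on
	 * error, so the caller never has to. */
	if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}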
1541
1542static inline int deliver_skb(struct sk_buff *skb,
1543			      struct packet_type *pt_prev,
1544			      struct net_device *orig_dev)
1545{
1546	atomic_inc(&skb->users);
 
 
1547	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1548}
1549
1550/*
1551 *	Support routine. Sends outgoing frames to any network
1552 *	taps currently in use.
1553 */
1554
1555static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1556{
1557	struct packet_type *ptype;
1558	struct sk_buff *skb2 = NULL;
1559	struct packet_type *pt_prev = NULL;
 
1560
1561	rcu_read_lock();
1562	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1563		/* Never send packets back to the socket
1564		 * they originated from - MvS (miquels@drinkel.ow.org)
1565		 */
1566		if ((ptype->dev == dev || !ptype->dev) &&
1567		    (ptype->af_packet_priv == NULL ||
1568		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1569			if (pt_prev) {
1570				deliver_skb(skb2, pt_prev, skb->dev);
1571				pt_prev = ptype;
1572				continue;
1573			}
1574
1575			skb2 = skb_clone(skb, GFP_ATOMIC);
1576			if (!skb2)
1577				break;
 
 
1578
1579			net_timestamp_set(skb2);
1580
1581			/* skb->nh should be correctly
1582			   set by sender, so that the second statement is
1583			   just protection against buggy protocols.
1584			 */
1585			skb_reset_mac_header(skb2);
 
 
1586
1587			if (skb_network_header(skb2) < skb2->data ||
1588			    skb2->network_header > skb2->tail) {
1589				if (net_ratelimit())
1590					printk(KERN_CRIT "protocol %04x is "
1591					       "buggy, dev %s\n",
1592					       ntohs(skb2->protocol),
1593					       dev->name);
1594				skb_reset_network_header(skb2);
1595			}
1596
1597			skb2->transport_header = skb2->network_header;
1598			skb2->pkt_type = PACKET_OUTGOING;
1599			pt_prev = ptype;
1600		}
1601	}
1602	if (pt_prev)
1603		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1604	rcu_read_unlock();
1605}
 
1606
1607/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 
1608 * @dev: Network device
1609 * @txq: number of queues available
1610 *
1611 * If real_num_tx_queues is changed the tc mappings may no longer be
1612 * valid. To resolve this, verify that the tc mapping remains valid and, if
1613 * not, clear the mapping. With no priorities mapping to this
1614 * offset/count pair it will no longer be used. In the worst case, if TC0
1615 * is invalid, nothing can be done, so priority mappings are disabled. It is
1616 * expected that drivers will fix this mapping if they can before
1617 * calling netif_set_real_num_tx_queues.
1618 */
1619static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1620{
1621	int i;
1622	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1623
1624	/* If TC0 is invalidated disable TC mapping */
1625	if (tc->offset + tc->count > txq) {
1626		pr_warning("Number of in use tx queues changed "
1627			   "invalidating tc mappings. Priority "
1628			   "traffic classification disabled!\n");
1629		dev->num_tc = 0;
1630		return;
1631	}
1632
1633	/* Invalidated prio to tc mappings set to TC0 */
1634	for (i = 1; i < TC_BITMASK + 1; i++) {
1635		int q = netdev_get_prio_tc_map(dev, i);
1636
1637		tc = &dev->tc_to_txq[q];
1638		if (tc->offset + tc->count > txq) {
1639			pr_warning("Number of in use tx queues "
1640				   "changed. Priority %i to tc "
1641				   "mapping %i is no longer valid "
1642				   "setting map to 0\n",
1643				   i, q);
1644			netdev_set_prio_tc_map(dev, i, 0);
1645		}
1646	}
1647}
1648
1649/*
1650 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1651 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1652 */
1653int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1654{
 
1655	int rc;
1656
 
 
1657	if (txq < 1 || txq > dev->num_tx_queues)
1658		return -EINVAL;
1659
1660	if (dev->reg_state == NETREG_REGISTERED ||
1661	    dev->reg_state == NETREG_UNREGISTERING) {
1662		ASSERT_RTNL();
1663
1664		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1665						  txq);
1666		if (rc)
1667			return rc;
1668
1669		if (dev->num_tc)
1670			netif_setup_tc(dev, txq);
1671
1672		if (txq < dev->real_num_tx_queues)
1673			qdisc_reset_all_tx_gt(dev, txq);
1674	}
1675
1676	dev->real_num_tx_queues = txq;
1677	return 0;
1678}
1679EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1680
1681#ifdef CONFIG_RPS
1682/**
1683 *	netif_set_real_num_rx_queues - set actual number of RX queues used
1684 *	@dev: Network device
1685 *	@rxq: Actual number of RX queues
1686 *
1687 *	This must be called either with the rtnl_lock held or before
1688 *	registration of the net device.  Returns 0 on success, or a
1689 *	negative error code.  If called before registration, it always
1690 *	succeeds.
1691 */
1692int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1693{
1694	int rc;
1695
1696	if (rxq < 1 || rxq > dev->num_rx_queues)
1697		return -EINVAL;
1698
1699	if (dev->reg_state == NETREG_REGISTERED) {
1700		ASSERT_RTNL();
1701
1702		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1703						  rxq);
1704		if (rc)
1705			return rc;
1706	}
1707
1708	dev->real_num_rx_queues = rxq;
1709	return 0;
1710}
1711EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1712#endif
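/*
 * Illustrative sketch (editorial addition): a multiqueue driver
 * shrinking its active queue counts after probing the hardware, e.g.
 * from an ethtool "set channels"-style path.  The function name and
 * parameter are hypothetical.
 */
static int __maybe_unused example_set_queue_count(struct net_device *dev,
						  unsigned int n)
{
	int err;

	ASSERT_RTNL();	/* required once the device is registered */

	err = netif_set_real_num_tx_queues(dev, n);
	if (err)
		return err;
	/* the !CONFIG_RPS build provides a stub that simply returns 0 */
	return netif_set_real_num_rx_queues(dev, n);
}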
1713
1714static inline void __netif_reschedule(struct Qdisc *q)
1715{
1716	struct softnet_data *sd;
1717	unsigned long flags;
1718
1719	local_irq_save(flags);
1720	sd = &__get_cpu_var(softnet_data);
1721	q->next_sched = NULL;
1722	*sd->output_queue_tailp = q;
1723	sd->output_queue_tailp = &q->next_sched;
1724	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1725	local_irq_restore(flags);
1726}
1727
1728void __netif_schedule(struct Qdisc *q)
1729{
1730	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1731		__netif_reschedule(q);
1732}
1733EXPORT_SYMBOL(__netif_schedule);
1734
1735void dev_kfree_skb_irq(struct sk_buff *skb)
1736{
1737	if (atomic_dec_and_test(&skb->users)) {
1738		struct softnet_data *sd;
1739		unsigned long flags;
1740
1741		local_irq_save(flags);
1742		sd = &__get_cpu_var(softnet_data);
1743		skb->next = sd->completion_queue;
1744		sd->completion_queue = skb;
1745		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1746		local_irq_restore(flags);
1747	}
1748}
1749EXPORT_SYMBOL(dev_kfree_skb_irq);
1750
1751void dev_kfree_skb_any(struct sk_buff *skb)
1752{
1753	if (in_irq() || irqs_disabled())
1754		dev_kfree_skb_irq(skb);
1755	else
1756		dev_kfree_skb(skb);
1757}
1758EXPORT_SYMBOL(dev_kfree_skb_any);
1759
1760
1761/**
1762 * netif_device_detach - mark device as removed
1763 * @dev: network device
1764 *
1765 * Mark device as removed from system and therefore no longer available.
1766 */
1767void netif_device_detach(struct net_device *dev)
1768{
1769	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1770	    netif_running(dev)) {
1771		netif_tx_stop_all_queues(dev);
1772	}
1773}
1774EXPORT_SYMBOL(netif_device_detach);
1775
1776/**
1777 * netif_device_attach - mark device as attached
1778 * @dev: network device
1779 *
1780 * Mark device as attached to the system and restart if needed.
1781 */
1782void netif_device_attach(struct net_device *dev)
1783{
1784	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1785	    netif_running(dev)) {
1786		netif_tx_wake_all_queues(dev);
1787		__netdev_watchdog_up(dev);
1788	}
1789}
1790EXPORT_SYMBOL(netif_device_attach);
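/*
 * Illustrative sketch only: drivers typically pair detach/attach in their
 * suspend/resume handlers so the stack stops queueing packets while the
 * hardware is powered down.  foo_suspend(), foo_power_down() and priv are
 * hypothetical.
 *
 *	static int foo_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *ndev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(ndev);
 *		foo_power_down(ndev);
 *		return 0;
 *	}
 */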
1791
1792/**
1793 * skb_set_dev - assign a new device to a buffer
1794 * @skb: buffer for the new device
1795 * @dev: network device
1796 *
1797 * If an skb is owned by a device already, we have to reset
1798 * all data private to the namespace a device belongs to
1799 * before assigning it a new device.
1800 */
1801#ifdef CONFIG_NET_NS
1802void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
 
1803{
1804	skb_dst_drop(skb);
1805	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1806		secpath_reset(skb);
1807		nf_reset(skb);
1808		skb_init_secmark(skb);
1809		skb->mark = 0;
1810		skb->priority = 0;
1811		skb->nf_trace = 0;
1812		skb->ipvs_property = 0;
1813#ifdef CONFIG_NET_SCHED
1814		skb->tc_index = 0;
1815#endif
1816	}
1817	skb->dev = dev;
 
 
 
1818}
1819EXPORT_SYMBOL(skb_set_dev);
1820#endif /* CONFIG_NET_NS */
1821
1822/*
1823 * Invalidate the hardware checksum when a packet is to be mangled, and
1824 * complete the checksum manually on the outgoing path.
1825 */
1826int skb_checksum_help(struct sk_buff *skb)
1827{
1828	__wsum csum;
1829	int ret = 0, offset;
1830
1831	if (skb->ip_summed == CHECKSUM_COMPLETE)
1832		goto out_set_summed;
1833
1834	if (unlikely(skb_shinfo(skb)->gso_size)) {
1835		/* Let GSO fix up the checksum. */
1836		goto out_set_summed;
1837	}
1838
1839	offset = skb_checksum_start_offset(skb);
1840	BUG_ON(offset >= skb_headlen(skb));
1841	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1842
1843	offset += skb->csum_offset;
1844	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1845
1846	if (skb_cloned(skb) &&
1847	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1848		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1849		if (ret)
1850			goto out;
1851	}
1852
1853	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1854out_set_summed:
1855	skb->ip_summed = CHECKSUM_NONE;
 
1856out:
1857	return ret;
1858}
1859EXPORT_SYMBOL(skb_checksum_help);
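/*
 * Illustrative sketch only: a driver whose hardware cannot checksum a
 * particular packet may fall back to a software checksum before handing
 * the skb to its DMA engine.  hw_can_csum() is a hypothetical predicate.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */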
1860
1861/**
1862 *	skb_gso_segment - Perform segmentation on skb.
1863 *	@skb: buffer to segment
1864 *	@features: features for the output path (see dev->features)
1865 *
1866 *	This function segments the given skb and returns a list of segments.
1867 *
1868 *	It may return NULL if the skb requires no segmentation.  This is
1869 *	only possible when GSO is used for verifying header integrity.
1870 */
1871struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
 
1872{
1873	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1874	struct packet_type *ptype;
1875	__be16 type = skb->protocol;
1876	int vlan_depth = ETH_HLEN;
1877	int err;
1878
1879	while (type == htons(ETH_P_8021Q)) {
1880		struct vlan_hdr *vh;
1881
1882		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1883			return ERR_PTR(-EINVAL);
1884
1885		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1886		type = vh->h_vlan_encapsulated_proto;
1887		vlan_depth += VLAN_HLEN;
 
 
 
1888	}
 
1889
1890	skb_reset_mac_header(skb);
1891	skb->mac_len = skb->network_header - skb->mac_header;
1892	__skb_pull(skb, skb->mac_len);
 
 
1893
1894	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1895		struct net_device *dev = skb->dev;
1896		struct ethtool_drvinfo info = {};
1897
1898		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1899			dev->ethtool_ops->get_drvinfo(dev, &info);
 
 
 
 
 
1900
1901		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1902		     info.driver, dev ? dev->features : 0L,
1903		     skb->sk ? skb->sk->sk_route_caps : 0L,
1904		     skb->len, skb->data_len, skb->ip_summed);
1905
1906		if (skb_header_cloned(skb) &&
1907		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1908			return ERR_PTR(err);
1909	}
1910
1911	rcu_read_lock();
1912	list_for_each_entry_rcu(ptype,
1913			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1914		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1915			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1916				err = ptype->gso_send_check(skb);
1917				segs = ERR_PTR(err);
1918				if (err || skb_gso_ok(skb, features))
1919					break;
1920				__skb_push(skb, (skb->data -
1921						 skb_network_header(skb)));
1922			}
1923			segs = ptype->gso_segment(skb, features);
1924			break;
1925		}
1926	}
1927	rcu_read_unlock();
1928
1929	__skb_push(skb, skb->data - skb_mac_header(skb));
1930
1931	return segs;
1932}
1933EXPORT_SYMBOL(skb_gso_segment);
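/*
 * Illustrative sketch only: callers that segment in software walk the
 * returned singly linked list through skb->next, much as the dev_gso path
 * below does.  xmit_one() is a hypothetical transmit helper.
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		goto drop;
 *	while (segs) {
 *		struct sk_buff *next = segs->next;
 *
 *		segs->next = NULL;
 *		xmit_one(segs);
 *		segs = next;
 *	}
 */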
1934
1935/* Take action when hardware reception checksum errors are detected. */
1936#ifdef CONFIG_BUG
1937void netdev_rx_csum_fault(struct net_device *dev)
1938{
1939	if (net_ratelimit()) {
1940		printk(KERN_ERR "%s: hw csum failure.\n",
1941			dev ? dev->name : "<unknown>");
1942		dump_stack();
1943	}
1944}
1945EXPORT_SYMBOL(netdev_rx_csum_fault);
1946#endif
1947
1948/* Actually, we should eliminate this check as soon as we know that:
1949 * 1. IOMMU is present and allows mapping all the memory.
1950 * 2. No high memory really exists on this machine.
1951 */
1952
1953static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1954{
1955#ifdef CONFIG_HIGHMEM
1956	int i;
1957	if (!(dev->features & NETIF_F_HIGHDMA)) {
1958		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1959			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1960				return 1;
1961	}
1962
1963	if (PCI_DMA_BUS_IS_PHYS) {
1964		struct device *pdev = dev->dev.parent;
1965
1966		if (!pdev)
1967			return 0;
1968		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1969			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1970			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
 
1971				return 1;
1972		}
1973	}
1974#endif
1975	return 0;
1976}
1977
1978struct dev_gso_cb {
1979	void (*destructor)(struct sk_buff *skb);
1980};
1981
1982#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1983
1984static void dev_gso_skb_destructor(struct sk_buff *skb)
1985{
1986	struct dev_gso_cb *cb;
 
1987
1988	do {
1989		struct sk_buff *nskb = skb->next;
1990
1991		skb->next = nskb->next;
1992		nskb->next = NULL;
1993		kfree_skb(nskb);
1994	} while (skb->next);
1995
1996	cb = DEV_GSO_CB(skb);
1997	if (cb->destructor)
1998		cb->destructor(skb);
1999}
2000
2001/**
2002 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2003 *	@skb: buffer to segment
2004 *	@features: device features as applicable to this skb
2005 *
2006 *	This function segments the given skb and stores the list of segments
2007 *	in skb->next.
2008 */
2009static int dev_gso_segment(struct sk_buff *skb, int features)
2010{
2011	struct sk_buff *segs;
2012
2013	segs = skb_gso_segment(skb, features);
2014
2015	/* Verifying header integrity only. */
2016	if (!segs)
2017		return 0;
 
2018
2019	if (IS_ERR(segs))
2020		return PTR_ERR(segs);
2021
2022	skb->next = segs;
2023	DEV_GSO_CB(skb)->destructor = skb->destructor;
2024	skb->destructor = dev_gso_skb_destructor;
 
 
 
2025
2026	return 0;
2027}
2028
2029/*
2030 * Try to orphan skb early, right before transmission by the device.
2031 * We cannot orphan the skb if a tx timestamp is requested or the sk reference
2032 * is needed at the driver level for other reasons, e.g. see net/can/raw.c
2033 */
2034static inline void skb_orphan_try(struct sk_buff *skb)
2035{
2036	struct sock *sk = skb->sk;
2037
2038	if (sk && !skb_shinfo(skb)->tx_flags) {
2039		/* skb_tx_hash() won't be able to get sk.
2040		 * We copy sk_hash into skb->rxhash.
2041		 */
2042		if (!skb->rxhash)
2043			skb->rxhash = sk->sk_hash;
2044		skb_orphan(skb);
2045	}
2046}
 
2047
2048static bool can_checksum_protocol(unsigned long features, __be16 protocol)
 
 
2049{
2050	return ((features & NETIF_F_GEN_CSUM) ||
2051		((features & NETIF_F_V4_CSUM) &&
2052		 protocol == htons(ETH_P_IP)) ||
2053		((features & NETIF_F_V6_CSUM) &&
2054		 protocol == htons(ETH_P_IPV6)) ||
2055		((features & NETIF_F_FCOE_CRC) &&
2056		 protocol == htons(ETH_P_FCOE)));
2057}
2058
2059static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
 
 
2060{
2061	if (!can_checksum_protocol(features, protocol)) {
2062		features &= ~NETIF_F_ALL_CSUM;
2063		features &= ~NETIF_F_SG;
2064	} else if (illegal_highdma(skb->dev, skb)) {
2065		features &= ~NETIF_F_SG;
2066	}
2067
2068	return features;
2069}
2070
2071u32 netif_skb_features(struct sk_buff *skb)
2072{
2073	__be16 protocol = skb->protocol;
2074	u32 features = skb->dev->features;
2075
2076	if (protocol == htons(ETH_P_8021Q)) {
2077		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2078		protocol = veh->h_vlan_encapsulated_proto;
2079	} else if (!vlan_tx_tag_present(skb)) {
2080		return harmonize_features(skb, protocol, features);
2081	}
2082
2083	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2084
2085	if (protocol != htons(ETH_P_8021Q)) {
2086		return harmonize_features(skb, protocol, features);
2087	} else {
2088		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2089				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2090		return harmonize_features(skb, protocol, features);
2091	}
2092}
2093EXPORT_SYMBOL(netif_skb_features);
2094
2095/*
2096 * Returns true if either:
2097 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2098 *	2. skb is fragmented and the device does not support SG, or if
2099 *	   at least one of fragments is in highmem and device does not
2100 *	   support DMA from it.
2101 */
2102static inline int skb_needs_linearize(struct sk_buff *skb,
2103				      int features)
2104{
2105	return skb_is_nonlinear(skb) &&
2106			((skb_has_frag_list(skb) &&
2107				!(features & NETIF_F_FRAGLIST)) ||
2108			(skb_shinfo(skb)->nr_frags &&
2109				!(features & NETIF_F_SG)));
2110}
2111
2112int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2113			struct netdev_queue *txq)
2114{
2115	const struct net_device_ops *ops = dev->netdev_ops;
2116	int rc = NETDEV_TX_OK;
2117	unsigned int skb_len;
2118
2119	if (likely(!skb->next)) {
2120		u32 features;
2121
2122		/*
2123		 * If device doesn't need skb->dst, release it right now while
2124		 * it's hot in this CPU's cache.
2125		 */
2126		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2127			skb_dst_drop(skb);
2128
2129		if (!list_empty(&ptype_all))
2130			dev_queue_xmit_nit(skb, dev);
 
 
 
 
2131
2132		skb_orphan_try(skb);
 
 
 
2133
2134		features = netif_skb_features(skb);
2135
2136		if (vlan_tx_tag_present(skb) &&
2137		    !(features & NETIF_F_HW_VLAN_TX)) {
2138			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2139			if (unlikely(!skb))
2140				goto out;
 
2141
2142			skb->vlan_tci = 0;
2143		}
 
2144
2145		if (netif_needs_gso(skb, features)) {
2146			if (unlikely(dev_gso_segment(skb, features)))
2147				goto out_kfree_skb;
2148			if (skb->next)
2149				goto gso;
2150		} else {
2151			if (skb_needs_linearize(skb, features) &&
2152			    __skb_linearize(skb))
2153				goto out_kfree_skb;
2154
2155			/* If packet is not checksummed and device does not
2156			 * support checksumming for this protocol, complete
2157			 * checksumming here.
2158			 */
2159			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2160				skb_set_transport_header(skb,
2161					skb_checksum_start_offset(skb));
2162				if (!(features & NETIF_F_ALL_CSUM) &&
2163				     skb_checksum_help(skb))
2164					goto out_kfree_skb;
2165			}
2166		}
2167
2168		skb_len = skb->len;
2169		rc = ops->ndo_start_xmit(skb, dev);
2170		trace_net_dev_xmit(skb, rc, dev, skb_len);
2171		if (rc == NETDEV_TX_OK)
2172			txq_trans_update(txq);
2173		return rc;
2174	}
2175
2176gso:
2177	do {
2178		struct sk_buff *nskb = skb->next;
2179
2180		skb->next = nskb->next;
2181		nskb->next = NULL;
2182
2183		/*
2184		 * If device doesn't need nskb->dst, release it right now while
2185		 * it's hot in this CPU's cache.
2186		 */
2187		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2188			skb_dst_drop(nskb);
2189
2190		skb_len = nskb->len;
2191		rc = ops->ndo_start_xmit(nskb, dev);
2192		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2193		if (unlikely(rc != NETDEV_TX_OK)) {
2194			if (rc & ~NETDEV_TX_MASK)
2195				goto out_kfree_gso_skb;
2196			nskb->next = skb->next;
2197			skb->next = nskb;
2198			return rc;
2199		}
2200		txq_trans_update(txq);
2201		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2202			return NETDEV_TX_BUSY;
2203	} while (skb->next);
2204
2205out_kfree_gso_skb:
2206	if (likely(skb->next == NULL))
2207		skb->destructor = DEV_GSO_CB(skb)->destructor;
2208out_kfree_skb:
2209	kfree_skb(skb);
2210out:
2211	return rc;
 
2212}
2213
2214static u32 hashrnd __read_mostly;
2215
2216/*
2217 * Returns a Tx hash based on the given packet descriptor and the number of
2218 * Tx queues to be used as a distribution range.
2219 */
2220u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2221		  unsigned int num_tx_queues)
2222{
2223	u32 hash;
2224	u16 qoffset = 0;
2225	u16 qcount = num_tx_queues;
2226
2227	if (skb_rx_queue_recorded(skb)) {
2228		hash = skb_get_rx_queue(skb);
2229		while (unlikely(hash >= num_tx_queues))
2230			hash -= num_tx_queues;
2231		return hash;
2232	}
2233
2234	if (dev->num_tc) {
2235		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2236		qoffset = dev->tc_to_txq[tc].offset;
2237		qcount = dev->tc_to_txq[tc].count;
2238	}
2239
2240	if (skb->sk && skb->sk->sk_hash)
2241		hash = skb->sk->sk_hash;
2242	else
2243		hash = (__force u16) skb->protocol ^ skb->rxhash;
2244	hash = jhash_1word(hash, hashrnd);
2245
2246	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2247}
2248EXPORT_SYMBOL(__skb_tx_hash);
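/*
 * Worked example of the mapping above: with qcount == 8 and a hash of
 * 0x80000000, ((u64)hash * qcount) >> 32 == 4, so the flow lands on queue
 * qoffset + 4.  The multiply-and-shift spreads 32-bit hash values evenly
 * over [qoffset, qoffset + qcount) without a modulo operation.
 */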
2249
2250static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2251{
2252	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2253		if (net_ratelimit()) {
2254			pr_warning("%s selects TX queue %d, but "
2255				"real number of TX queues is %d\n",
2256				dev->name, queue_index, dev->real_num_tx_queues);
2257		}
2258		return 0;
2259	}
2260	return queue_index;
2261}
 
2262
2263static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2264{
2265#ifdef CONFIG_XPS
2266	struct xps_dev_maps *dev_maps;
2267	struct xps_map *map;
2268	int queue_index = -1;
2269
2270	rcu_read_lock();
2271	dev_maps = rcu_dereference(dev->xps_maps);
2272	if (dev_maps) {
2273		map = rcu_dereference(
2274		    dev_maps->cpu_map[raw_smp_processor_id()]);
2275		if (map) {
2276			if (map->len == 1)
2277				queue_index = map->queues[0];
2278			else {
2279				u32 hash;
2280				if (skb->sk && skb->sk->sk_hash)
2281					hash = skb->sk->sk_hash;
2282				else
2283					hash = (__force u16) skb->protocol ^
2284					    skb->rxhash;
2285				hash = jhash_1word(hash, hashrnd);
2286				queue_index = map->queues[
2287				    ((u64)hash * map->len) >> 32];
2288			}
2289			if (unlikely(queue_index >= dev->real_num_tx_queues))
2290				queue_index = -1;
2291		}
2292	}
2293	rcu_read_unlock();
2294
2295	return queue_index;
2296#else
2297	return -1;
2298#endif
2299}
2300
2301static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2302					struct sk_buff *skb)
2303{
2304	int queue_index;
2305	const struct net_device_ops *ops = dev->netdev_ops;
2306
2307	if (dev->real_num_tx_queues == 1)
2308		queue_index = 0;
2309	else if (ops->ndo_select_queue) {
2310		queue_index = ops->ndo_select_queue(dev, skb);
2311		queue_index = dev_cap_txqueue(dev, queue_index);
2312	} else {
2313		struct sock *sk = skb->sk;
2314		queue_index = sk_tx_queue_get(sk);
2315
2316		if (queue_index < 0 || skb->ooo_okay ||
2317		    queue_index >= dev->real_num_tx_queues) {
2318			int old_index = queue_index;
2319
2320			queue_index = get_xps_queue(dev, skb);
2321			if (queue_index < 0)
2322				queue_index = skb_tx_hash(dev, skb);
2323
2324			if (queue_index != old_index && sk) {
2325				struct dst_entry *dst =
2326				    rcu_dereference_check(sk->sk_dst_cache, 1);
2327
2328				if (dst && skb_dst(skb) == dst)
2329					sk_tx_queue_set(sk, queue_index);
2330			}
2331		}
2332	}
2333
2334	skb_set_queue_mapping(skb, queue_index);
2335	return netdev_get_tx_queue(dev, queue_index);
2336}
2337
2338static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2339				 struct net_device *dev,
2340				 struct netdev_queue *txq)
2341{
2342	spinlock_t *root_lock = qdisc_lock(q);
 
2343	bool contended;
2344	int rc;
2345
2346	qdisc_skb_cb(skb)->pkt_len = skb->len;
2347	qdisc_calculate_pkt_len(skb, q);
2348	/*
2349	 * Heuristic to force contended enqueues to serialize on a
2350	 * separate lock before trying to get the qdisc main lock.
2351	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2352	 * and dequeue packets faster.
2353	 */
2354	contended = qdisc_is_running(q);
2355	if (unlikely(contended))
2356		spin_lock(&q->busylock);
2357
2358	spin_lock(root_lock);
2359	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2360		kfree_skb(skb);
2361		rc = NET_XMIT_DROP;
2362	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2363		   qdisc_run_begin(q)) {
2364		/*
2365		 * This is a work-conserving queue; there are no old skbs
2366		 * waiting to be sent out; and the qdisc is not running -
2367		 * xmit the skb directly.
2368		 */
2369		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2370			skb_dst_force(skb);
2371
2372		qdisc_bstats_update(q, skb);
2373
2374		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2375			if (unlikely(contended)) {
2376				spin_unlock(&q->busylock);
2377				contended = false;
2378			}
2379			__qdisc_run(q);
2380		} else
2381			qdisc_run_end(q);
2382
 
2383		rc = NET_XMIT_SUCCESS;
2384	} else {
2385		skb_dst_force(skb);
2386		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2387		if (qdisc_run_begin(q)) {
2388			if (unlikely(contended)) {
2389				spin_unlock(&q->busylock);
2390				contended = false;
2391			}
2392			__qdisc_run(q);
 
2393		}
2394	}
2395	spin_unlock(root_lock);
 
 
2396	if (unlikely(contended))
2397		spin_unlock(&q->busylock);
2398	return rc;
2399}
2400
2401static DEFINE_PER_CPU(int, xmit_recursion);
2402#define RECURSION_LIMIT 10
2403
2404/**
2405 *	dev_queue_xmit - transmit a buffer
2406 *	@skb: buffer to transmit
2407 *
2408 *	Queue a buffer for transmission to a network device. The caller must
2409 *	have set the device and priority and built the buffer before calling
2410 *	this function. The function can be called from an interrupt.
2411 *
2412 *	A negative errno code is returned on a failure. A success does not
2413 *	guarantee the frame will be transmitted as it may be dropped due
2414 *	to congestion or traffic shaping.
2415 *
2416 * -----------------------------------------------------------------------------------
2417 *      I notice this method can also return errors from the queue disciplines,
2418 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2419 *      be positive.
2420 *
2421 *      Regardless of the return value, the skb is consumed, so it is currently
2422 *      difficult to retry a send to this method.  (You can bump the ref count
2423 *      before sending to hold a reference for retry if you are careful.)
2424 *
2425 *      When calling this method, interrupts MUST be enabled.  This is because
2426 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2427 *          --BLG
2428 */
2429int dev_queue_xmit(struct sk_buff *skb)
2430{
2431	struct net_device *dev = skb->dev;
2432	struct netdev_queue *txq;
2433	struct Qdisc *q;
2434	int rc = -ENOMEM;
2435
2436	/* Disable soft irqs for various locks below. Also
2437	 * stops preemption for RCU.
2438	 */
2439	rcu_read_lock_bh();
2440
2441	txq = dev_pick_tx(dev, skb);
2442	q = rcu_dereference_bh(txq->qdisc);
2443
 
2444#ifdef CONFIG_NET_CLS_ACT
2445	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2446#endif
2447	trace_net_dev_queue(skb);
2448	if (q->enqueue) {
2449		rc = __dev_xmit_skb(skb, q, dev, txq);
2450		goto out;
2451	}
2452
2453	/* The device has no queue. Common case for software devices:
2454	   loopback, all the sorts of tunnels...
2455
2456	   Really, it is unlikely that netif_tx_lock protection is necessary
2457	   here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
2458	   counters.)
2459	   However, it is possible that they rely on the protection
2460	   we provide here.
2461
2462	   Check this and shoot the lock. It is not prone to deadlocks.
2463	   Or shoot the noqueue qdisc; it is even simpler 8)
2464	 */
2465	if (dev->flags & IFF_UP) {
2466		int cpu = smp_processor_id(); /* ok because BHs are off */
2467
2468		if (txq->xmit_lock_owner != cpu) {
2469
2470			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2471				goto recursion_alert;
2472
 
 
 
 
2473			HARD_TX_LOCK(dev, txq, cpu);
2474
2475			if (!netif_tx_queue_stopped(txq)) {
2476				__this_cpu_inc(xmit_recursion);
2477				rc = dev_hard_start_xmit(skb, dev, txq);
2478				__this_cpu_dec(xmit_recursion);
2479				if (dev_xmit_complete(rc)) {
2480					HARD_TX_UNLOCK(dev, txq);
2481					goto out;
2482				}
2483			}
2484			HARD_TX_UNLOCK(dev, txq);
2485			if (net_ratelimit())
2486				printk(KERN_CRIT "Virtual device %s asks to "
2487				       "queue packet!\n", dev->name);
2488		} else {
2489			/* Recursion is detected! It is possible,
2490			 * unfortunately
2491			 */
2492recursion_alert:
2493			if (net_ratelimit())
2494				printk(KERN_CRIT "Dead loop on virtual device "
2495				       "%s, fix it urgently!\n", dev->name);
2496		}
2497	}
2498
2499	rc = -ENETDOWN;
2500	rcu_read_unlock_bh();
2501
2502	kfree_skb(skb);
 
2503	return rc;
2504out:
2505	rcu_read_unlock_bh();
2506	return rc;
2507}
 
 
 
 
 
2508EXPORT_SYMBOL(dev_queue_xmit);
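/*
 * Illustrative sketch only: a caller that builds its own frame sets
 * skb->dev (and optionally skb->priority) before handing it over; the skb
 * is consumed whatever the return value.  "stats" is hypothetical.
 *
 *	skb->dev = dev;
 *	if (dev_queue_xmit(skb) < 0)
 *		stats->tx_errors++;
 */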
2509
 
 
 
 
 
2510
2511/*=======================================================================
2512			Receiver routines
2513  =======================================================================*/
 
 
 
 
2514
2515int netdev_max_backlog __read_mostly = 1000;
2516int netdev_tstamp_prequeue __read_mostly = 1;
2517int netdev_budget __read_mostly = 300;
2518int weight_p __read_mostly = 64;            /* old backlog weight */
2519
2520/* Called with irq disabled */
2521static inline void ____napi_schedule(struct softnet_data *sd,
2522				     struct napi_struct *napi)
2523{
2524	list_add_tail(&napi->poll_list, &sd->poll_list);
2525	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2526}
2527
2528/*
2529 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2530 * and src/dst port numbers. Returns a non-zero hash number on success
2531 * and 0 on failure.
2532 */
2533__u32 __skb_get_rxhash(struct sk_buff *skb)
2534{
2535	int nhoff, hash = 0, poff;
2536	const struct ipv6hdr *ip6;
2537	const struct iphdr *ip;
2538	u8 ip_proto;
2539	u32 addr1, addr2, ihl;
2540	union {
2541		u32 v32;
2542		u16 v16[2];
2543	} ports;
2544
2545	nhoff = skb_network_offset(skb);
2546
2547	switch (skb->protocol) {
2548	case __constant_htons(ETH_P_IP):
2549		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2550			goto done;
 
 
2551
2552		ip = (const struct iphdr *) (skb->data + nhoff);
2553		if (ip_is_fragment(ip))
2554			ip_proto = 0;
2555		else
2556			ip_proto = ip->protocol;
2557		addr1 = (__force u32) ip->saddr;
2558		addr2 = (__force u32) ip->daddr;
2559		ihl = ip->ihl;
2560		break;
2561	case __constant_htons(ETH_P_IPV6):
2562		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2563			goto done;
2564
2565		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2566		ip_proto = ip6->nexthdr;
2567		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2568		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2569		ihl = (40 >> 2);
2570		break;
2571	default:
2572		goto done;
2573	}
2574
2575	ports.v32 = 0;
2576	poff = proto_ports_offset(ip_proto);
2577	if (poff >= 0) {
2578		nhoff += ihl * 4 + poff;
2579		if (pskb_may_pull(skb, nhoff + 4)) {
2580			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2581			if (ports.v16[1] < ports.v16[0])
2582				swap(ports.v16[0], ports.v16[1]);
2583		}
2584	}
 
2585
2586	/* get a consistent hash (same value on both flow directions) */
2587	if (addr2 < addr1)
2588		swap(addr1, addr2);
2589
2590	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2591	if (!hash)
2592		hash = 1;
2593
2594done:
2595	return hash;
 
 
 
 
2596}
2597EXPORT_SYMBOL(__skb_get_rxhash);
2598
2599#ifdef CONFIG_RPS
2600
2601/* One global table that all flow-based protocols share. */
2602struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2603EXPORT_SYMBOL(rps_sock_flow_table);
2604
2605static struct rps_dev_flow *
2606set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2607	    struct rps_dev_flow *rflow, u16 next_cpu)
2608{
2609	u16 tcpu;
2610
2611	tcpu = rflow->cpu = next_cpu;
2612	if (tcpu != RPS_NO_CPU) {
2613#ifdef CONFIG_RFS_ACCEL
2614		struct netdev_rx_queue *rxqueue;
2615		struct rps_dev_flow_table *flow_table;
2616		struct rps_dev_flow *old_rflow;
2617		u32 flow_id;
2618		u16 rxq_index;
2619		int rc;
2620
2621		/* Should we steer this flow to a different hardware queue? */
2622		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2623		    !(dev->features & NETIF_F_NTUPLE))
2624			goto out;
2625		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2626		if (rxq_index == skb_get_rx_queue(skb))
2627			goto out;
2628
2629		rxqueue = dev->_rx + rxq_index;
2630		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2631		if (!flow_table)
2632			goto out;
2633		flow_id = skb->rxhash & flow_table->mask;
2634		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2635							rxq_index, flow_id);
2636		if (rc < 0)
2637			goto out;
2638		old_rflow = rflow;
2639		rflow = &flow_table->flows[flow_id];
2640		rflow->cpu = next_cpu;
2641		rflow->filter = rc;
2642		if (old_rflow->filter == rflow->filter)
2643			old_rflow->filter = RPS_NO_FILTER;
2644	out:
2645#endif
2646		rflow->last_qtail =
2647			per_cpu(softnet_data, tcpu).input_queue_head;
2648	}
2649
 
2650	return rflow;
2651}
2652
2653/*
2654 * get_rps_cpu is called from netif_receive_skb and returns the target
2655 * CPU from the RPS map of the receiving queue for a given skb.
2656 * rcu_read_lock must be held on entry.
2657 */
2658static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2659		       struct rps_dev_flow **rflowp)
2660{
2661	struct netdev_rx_queue *rxqueue;
2662	struct rps_map *map;
2663	struct rps_dev_flow_table *flow_table;
2664	struct rps_sock_flow_table *sock_flow_table;
2665	int cpu = -1;
2666	u16 tcpu;
 
2667
2668	if (skb_rx_queue_recorded(skb)) {
2669		u16 index = skb_get_rx_queue(skb);
 
2670		if (unlikely(index >= dev->real_num_rx_queues)) {
2671			WARN_ONCE(dev->real_num_rx_queues > 1,
2672				  "%s received packet on queue %u, but number "
2673				  "of RX queues is %u\n",
2674				  dev->name, index, dev->real_num_rx_queues);
2675			goto done;
2676		}
2677		rxqueue = dev->_rx + index;
2678	} else
2679		rxqueue = dev->_rx;
 
2680
 
2681	map = rcu_dereference(rxqueue->rps_map);
2682	if (map) {
2683		if (map->len == 1 &&
2684		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2685			tcpu = map->cpus[0];
2686			if (cpu_online(tcpu))
2687				cpu = tcpu;
2688			goto done;
2689		}
2690	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2691		goto done;
2692	}
2693
2694	skb_reset_network_header(skb);
2695	if (!skb_get_rxhash(skb))
 
2696		goto done;
2697
2698	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2699	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2700	if (flow_table && sock_flow_table) {
2701		u16 next_cpu;
2702		struct rps_dev_flow *rflow;
 
 
2703
2704		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2705		tcpu = rflow->cpu;
 
 
 
 
2706
2707		next_cpu = sock_flow_table->ents[skb->rxhash &
2708		    sock_flow_table->mask];
 
 
 
2709
2710		/*
2711		 * If the desired CPU (where last recvmsg was done) is
2712		 * different from current CPU (one in the rx-queue flow
2713		 * table entry), switch if one of the following holds:
2714		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2715		 *   - Current CPU is offline.
2716		 *   - The current CPU's queue tail has advanced beyond the
2717		 *     last packet that was enqueued using this table entry.
2718		 *     This guarantees that all previous packets for the flow
2719		 *     have been dequeued, thus preserving in order delivery.
2720		 */
2721		if (unlikely(tcpu != next_cpu) &&
2722		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2723		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2724		      rflow->last_qtail)) >= 0))
 
2725			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 
2726
2727		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2728			*rflowp = rflow;
2729			cpu = tcpu;
2730			goto done;
2731		}
2732	}
2733
2734	if (map) {
2735		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2736
 
 
2737		if (cpu_online(tcpu)) {
2738			cpu = tcpu;
2739			goto done;
2740		}
2741	}
2742
2743done:
2744	return cpu;
2745}
2746
2747#ifdef CONFIG_RFS_ACCEL
2748
2749/**
2750 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2751 * @dev: Device on which the filter was set
2752 * @rxq_index: RX queue index
2753 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2754 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2755 *
2756 * Drivers that implement ndo_rx_flow_steer() should periodically call
2757 * this function for each installed filter and remove the filters for
2758 * which it returns %true.
2759 */
2760bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2761			 u32 flow_id, u16 filter_id)
2762{
2763	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2764	struct rps_dev_flow_table *flow_table;
2765	struct rps_dev_flow *rflow;
2766	bool expire = true;
2767	int cpu;
2768
2769	rcu_read_lock();
2770	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2771	if (flow_table && flow_id <= flow_table->mask) {
2772		rflow = &flow_table->flows[flow_id];
2773		cpu = ACCESS_ONCE(rflow->cpu);
2774		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2775		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2776			   rflow->last_qtail) <
2777		     (int)(10 * flow_table->mask)))
2778			expire = false;
2779	}
2780	rcu_read_unlock();
2781	return expire;
2782}
2783EXPORT_SYMBOL(rps_may_expire_flow);
2784
2785#endif /* CONFIG_RFS_ACCEL */
2786
2787/* Called from hardirq (IPI) context */
2788static void rps_trigger_softirq(void *data)
2789{
2790	struct softnet_data *sd = data;
2791
2792	____napi_schedule(sd, &sd->backlog);
2793	sd->received_rps++;
2794}
2795
2796#endif /* CONFIG_RPS */
2797
2798/*
2799 * Check if this softnet_data structure belongs to another CPU.
2800 * If yes, queue it to our IPI list and return 1.
2801 * If no, return 0.
2802 */
2803static int rps_ipi_queued(struct softnet_data *sd)
2804{
2805#ifdef CONFIG_RPS
2806	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2807
2808	if (sd != mysd) {
2809		sd->rps_ipi_next = mysd->rps_ipi_list;
2810		mysd->rps_ipi_list = sd;
2811
2812		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2813		return 1;
2814	}
2815#endif /* CONFIG_RPS */
2816	return 0;
2817}
2818
2819/*
2820 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2821 * queue (may be a remote CPU queue).
2822 */
2823static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2824			      unsigned int *qtail)
2825{
2826	struct softnet_data *sd;
2827	unsigned long flags;
 
2828
2829	sd = &per_cpu(softnet_data, cpu);
2830
2831	local_irq_save(flags);
2832
2833	rps_lock(sd);
2834	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2835		if (skb_queue_len(&sd->input_pkt_queue)) {
 
 
 
2836enqueue:
2837			__skb_queue_tail(&sd->input_pkt_queue, skb);
2838			input_queue_tail_incr_save(sd, qtail);
2839			rps_unlock(sd);
2840			local_irq_restore(flags);
2841			return NET_RX_SUCCESS;
2842		}
2843
2844		/* Schedule NAPI for the backlog device.
2845		 * We can use a non-atomic operation since we own the queue lock.
2846		 */
2847		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2848			if (!rps_ipi_queued(sd))
2849				____napi_schedule(sd, &sd->backlog);
2850		}
2851		goto enqueue;
2852	}
2853
 
2854	sd->dropped++;
2855	rps_unlock(sd);
2856
2857	local_irq_restore(flags);
2858
2859	atomic_long_inc(&skb->dev->rx_dropped);
2860	kfree_skb(skb);
2861	return NET_RX_DROP;
2862}
2863
2864/**
2865 *	netif_rx	-	post buffer to the network code
2866 *	@skb: buffer to post
2867 *
2868 *	This function receives a packet from a device driver and queues it for
2869 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2870 *	may be dropped during processing for congestion control or by the
2871 *	protocol layers.
2872 *
2873 *	return values:
2874 *	NET_RX_SUCCESS	(no congestion)
2875 *	NET_RX_DROP     (packet was dropped)
2876 *
2877 */
2878
2879int netif_rx(struct sk_buff *skb)
2880{
2881	int ret;
2882
2883	/* if netpoll wants it, pretend we never saw it */
2884	if (netpoll_rx(skb))
2885		return NET_RX_DROP;
2886
2887	if (netdev_tstamp_prequeue)
2888		net_timestamp_check(skb);
2889
2890	trace_netif_rx(skb);
 
2891#ifdef CONFIG_RPS
2892	{
2893		struct rps_dev_flow voidflow, *rflow = &voidflow;
2894		int cpu;
2895
2896		preempt_disable();
2897		rcu_read_lock();
2898
2899		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2900		if (cpu < 0)
2901			cpu = smp_processor_id();
2902
2903		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2904
2905		rcu_read_unlock();
2906		preempt_enable();
2907	}
2908#else
2909	{
2910		unsigned int qtail;
 
2911		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2912		put_cpu();
2913	}
2914#endif
2915	return ret;
2916}
2917EXPORT_SYMBOL(netif_rx);
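/*
 * Illustrative sketch only: a non-NAPI driver's interrupt handler copies
 * the frame into an skb, sets the protocol and hands it to netif_rx(),
 * which queues it on a per-CPU backlog for later softirq processing.
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *	dev->stats.rx_packets++;
 */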
2918
2919int netif_rx_ni(struct sk_buff *skb)
2920{
2921	int err;
2922
 
 
2923	preempt_disable();
2924	err = netif_rx(skb);
2925	if (local_softirq_pending())
2926		do_softirq();
2927	preempt_enable();
 
2928
2929	return err;
2930}
2931EXPORT_SYMBOL(netif_rx_ni);
2932
2933static void net_tx_action(struct softirq_action *h)
2934{
2935	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2936
2937	if (sd->completion_queue) {
2938		struct sk_buff *clist;
2939
2940		local_irq_disable();
2941		clist = sd->completion_queue;
2942		sd->completion_queue = NULL;
2943		local_irq_enable();
2944
2945		while (clist) {
2946			struct sk_buff *skb = clist;
 
2947			clist = clist->next;
2948
2949			WARN_ON(atomic_read(&skb->users));
2950			trace_kfree_skb(skb, net_tx_action);
2951			__kfree_skb(skb);
2952		}
 
 
2953	}
2954
2955	if (sd->output_queue) {
2956		struct Qdisc *head;
2957
2958		local_irq_disable();
2959		head = sd->output_queue;
2960		sd->output_queue = NULL;
2961		sd->output_queue_tailp = &sd->output_queue;
2962		local_irq_enable();
2963
2964		while (head) {
2965			struct Qdisc *q = head;
2966			spinlock_t *root_lock;
2967
2968			head = head->next_sched;
2969
2970			root_lock = qdisc_lock(q);
2971			if (spin_trylock(root_lock)) {
2972				smp_mb__before_clear_bit();
2973				clear_bit(__QDISC_STATE_SCHED,
2974					  &q->state);
2975				qdisc_run(q);
2976				spin_unlock(root_lock);
2977			} else {
2978				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2979					      &q->state)) {
2980					__netif_reschedule(q);
2981				} else {
2982					smp_mb__before_clear_bit();
2983					clear_bit(__QDISC_STATE_SCHED,
2984						  &q->state);
2985				}
2986			}
2987		}
2988	}
 
 
2989}
2990
2991#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2992    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2993/* This hook is defined here for ATM LANE */
2994int (*br_fdb_test_addr_hook)(struct net_device *dev,
2995			     unsigned char *addr) __read_mostly;
2996EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2997#endif
2998
2999#ifdef CONFIG_NET_CLS_ACT
3000/* TODO: Maybe we should just force sch_ingress to be compiled in
3001 * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
3002 * instructions (a compare and two extra stores) when the ingress
3003 * scheduler is not enabled but CONFIG_NET_CLS_ACT is.
3004 * NOTE: This doesn't stop any functionality; if you don't have
3005 * the ingress scheduler, you just can't add policies on ingress.
3006 *
3007 */
3008static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3009{
3010	struct net_device *dev = skb->dev;
3011	u32 ttl = G_TC_RTTL(skb->tc_verd);
3012	int result = TC_ACT_OK;
3013	struct Qdisc *q;
3014
3015	if (unlikely(MAX_RED_LOOP < ttl++)) {
3016		if (net_ratelimit())
3017			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3018			       skb->skb_iif, dev->ifindex);
3019		return TC_ACT_SHOT;
3020	}
3021
3022	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3023	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3024
3025	q = rxq->qdisc;
3026	if (q != &noop_qdisc) {
3027		spin_lock(qdisc_lock(q));
3028		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3029			result = qdisc_enqueue_root(skb, q);
3030		spin_unlock(qdisc_lock(q));
3031	}
3032
3033	return result;
3034}
3035
3036static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3037					 struct packet_type **pt_prev,
3038					 int *ret, struct net_device *orig_dev)
3039{
3040	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
 
 
3041
3042	if (!rxq || rxq->qdisc == &noop_qdisc)
3043		goto out;
 
 
 
 
 
3044
3045	if (*pt_prev) {
3046		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3047		*pt_prev = NULL;
3048	}
3049
3050	switch (ing_filter(skb, rxq)) {
3051	case TC_ACT_SHOT:
3052	case TC_ACT_STOLEN:
3053		kfree_skb(skb);
3054		return NULL;
3055	}
3056
3057out:
3058	skb->tc_verd = 0;
3059	return skb;
3060}
3061#endif
3062
3063/**
3064 *	netdev_rx_handler_register - register receive handler
3065 *	@dev: device to register a handler for
3066 *	@rx_handler: receive handler to register
3067 *	@rx_handler_data: data pointer that is used by rx handler
3068 *
3069 *	Register a receive handler for a device. This handler will then be
3070 *	called from __netif_receive_skb. A negative errno code is returned
3071 *	on a failure.
3072 *
3073 *	The caller must hold the rtnl_mutex.
3074 *
3075 *	For a general description of rx_handler, see enum rx_handler_result.
3076 */
3077int netdev_rx_handler_register(struct net_device *dev,
3078			       rx_handler_func_t *rx_handler,
3079			       void *rx_handler_data)
3080{
3081	ASSERT_RTNL();
3082
3083	if (dev->rx_handler)
3084		return -EBUSY;
3085
 
 
 
 
3086	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3087	rcu_assign_pointer(dev->rx_handler, rx_handler);
3088
3089	return 0;
3090}
3091EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
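/*
 * Illustrative sketch only: stacked devices such as the bridge or macvlan
 * attach themselves to a lower device roughly like this, under RTNL.
 * my_handle_frame() and "port" are hypothetical caller-side names.
 *
 *	err = netdev_rx_handler_register(lower_dev, my_handle_frame, port);
 *	if (err)
 *		goto unwind;
 */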
3092
3093/**
3094 *	netdev_rx_handler_unregister - unregister receive handler
3095 *	@dev: device to unregister a handler from
3096 *
3097 *	Unregister a receive handler from a device.
3098 *
3099 *	The caller must hold the rtnl_mutex.
3100 */
3101void netdev_rx_handler_unregister(struct net_device *dev)
3102{
3103
3104	ASSERT_RTNL();
3105	rcu_assign_pointer(dev->rx_handler, NULL);
3106	rcu_assign_pointer(dev->rx_handler_data, NULL);
 
 
 
 
 
3107}
3108EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3109
3110static int __netif_receive_skb(struct sk_buff *skb)
3111{
3112	struct packet_type *ptype, *pt_prev;
3113	rx_handler_func_t *rx_handler;
 
3114	struct net_device *orig_dev;
3115	struct net_device *null_or_dev;
3116	bool deliver_exact = false;
3117	int ret = NET_RX_DROP;
3118	__be16 type;
3119
3120	if (!netdev_tstamp_prequeue)
3121		net_timestamp_check(skb);
3122
3123	trace_netif_receive_skb(skb);
3124
3125	/* if we've gotten here through NAPI, check netpoll */
3126	if (netpoll_receive_skb(skb))
3127		return NET_RX_DROP;
3128
3129	if (!skb->skb_iif)
3130		skb->skb_iif = skb->dev->ifindex;
3131	orig_dev = skb->dev;
3132
3133	skb_reset_network_header(skb);
3134	skb_reset_transport_header(skb);
 
3135	skb_reset_mac_len(skb);
3136
3137	pt_prev = NULL;
3138
3139	rcu_read_lock();
3140
3141another_round:
 
3142
3143	__this_cpu_inc(softnet_data.processed);
3144
3145	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3146		skb = vlan_untag(skb);
3147		if (unlikely(!skb))
3148			goto out;
 
 
3149	}
3150
3151#ifdef CONFIG_NET_CLS_ACT
3152	if (skb->tc_verd & TC_NCLS) {
3153		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3154		goto ncls;
 
3155	}
3156#endif
 
 
 
 
 
3157
3158	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3159		if (!ptype->dev || ptype->dev == skb->dev) {
3160			if (pt_prev)
3161				ret = deliver_skb(skb, pt_prev, orig_dev);
3162			pt_prev = ptype;
3163		}
3164	}
3165
3166#ifdef CONFIG_NET_CLS_ACT
3167	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3168	if (!skb)
3169		goto out;
3170ncls:
3171#endif
3172
3173	rx_handler = rcu_dereference(skb->dev->rx_handler);
3174	if (rx_handler) {
3175		if (pt_prev) {
3176			ret = deliver_skb(skb, pt_prev, orig_dev);
3177			pt_prev = NULL;
3178		}
3179		switch (rx_handler(&skb)) {
3180		case RX_HANDLER_CONSUMED:
 
3181			goto out;
3182		case RX_HANDLER_ANOTHER:
3183			goto another_round;
3184		case RX_HANDLER_EXACT:
3185			deliver_exact = true;
3186		case RX_HANDLER_PASS:
3187			break;
3188		default:
3189			BUG();
3190		}
3191	}
3192
3193	if (vlan_tx_tag_present(skb)) {
3194		if (pt_prev) {
3195			ret = deliver_skb(skb, pt_prev, orig_dev);
3196			pt_prev = NULL;
3197		}
3198		if (vlan_do_receive(&skb)) {
3199			ret = __netif_receive_skb(skb);
3200			goto out;
3201		} else if (unlikely(!skb))
3202			goto out;
3203	}
3204
 
 
3205	/* deliver only exact match when indicated */
3206	null_or_dev = deliver_exact ? skb->dev : NULL;
 
 
 
 
3207
3208	type = skb->protocol;
3209	list_for_each_entry_rcu(ptype,
3210			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3211		if (ptype->type == type &&
3212		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3213		     ptype->dev == orig_dev)) {
3214			if (pt_prev)
3215				ret = deliver_skb(skb, pt_prev, orig_dev);
3216			pt_prev = ptype;
3217		}
3218	}
3219
3220	if (pt_prev) {
3221		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 
 
3222	} else {
3223		atomic_long_inc(&skb->dev->rx_dropped);
 
 
 
 
3224		kfree_skb(skb);
3225		/* Jamal, now you will not be able to escape explaining
3226		 * to me how you were going to use this. :-)
3227		 */
3228		ret = NET_RX_DROP;
3229	}
3230
3231out:
3232	rcu_read_unlock();
3233	return ret;
3234}
3235
3236/**
3237 *	netif_receive_skb - process receive buffer from network
3238 *	@skb: buffer to process
3239 *
3240 *	netif_receive_skb() is the main receive data processing function.
3241 *	It always succeeds. The buffer may be dropped during processing
3242 *	for congestion control or by the protocol layers.
3243 *
3244 *	This function may only be called from softirq context and interrupts
3245 *	should be enabled.
3246 *
3247 *	Return values (usually ignored):
3248 *	NET_RX_SUCCESS: no congestion
3249 *	NET_RX_DROP: packet was dropped
3250 */
3251int netif_receive_skb(struct sk_buff *skb)
3252{
3253	if (netdev_tstamp_prequeue)
3254		net_timestamp_check(skb);
3255
3256	if (skb_defer_rx_timestamp(skb))
3257		return NET_RX_SUCCESS;
3258
 
3259#ifdef CONFIG_RPS
3260	{
3261		struct rps_dev_flow voidflow, *rflow = &voidflow;
3262		int cpu, ret;
3263
3264		rcu_read_lock();
3265
3266		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3267
3268		if (cpu >= 0) {
3269			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3270			rcu_read_unlock();
3271		} else {
3272			rcu_read_unlock();
3273			ret = __netif_receive_skb(skb);
3274		}
3275
3276		return ret;
3277	}
3278#else
3279	return __netif_receive_skb(skb);
3280#endif
3281}
3282EXPORT_SYMBOL(netif_receive_skb);
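/*
 * Illustrative sketch only: NAPI drivers that do not use GRO call this
 * from their poll routine, in softirq context with interrupts enabled.
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	netif_receive_skb(skb);
 */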
3283
3284/* Network device is going away; flush any packets still pending.
3285 * Called with irqs disabled.
3286 */
3287static void flush_backlog(void *arg)
3288{
3289	struct net_device *dev = arg;
3290	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3291	struct sk_buff *skb, *tmp;
 
 
 
 
3292
 
3293	rps_lock(sd);
3294	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3295		if (skb->dev == dev) {
3296			__skb_unlink(skb, &sd->input_pkt_queue);
3297			kfree_skb(skb);
3298			input_queue_head_incr(sd);
3299		}
3300	}
3301	rps_unlock(sd);
 
3302
3303	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3304		if (skb->dev == dev) {
3305			__skb_unlink(skb, &sd->process_queue);
3306			kfree_skb(skb);
3307			input_queue_head_incr(sd);
3308		}
3309	}
 
3310}
3311
3312static int napi_gro_complete(struct sk_buff *skb)
3313{
3314	struct packet_type *ptype;
3315	__be16 type = skb->protocol;
3316	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3317	int err = -ENOENT;
3318
 
 
3319	if (NAPI_GRO_CB(skb)->count == 1) {
3320		skb_shinfo(skb)->gso_size = 0;
3321		goto out;
3322	}
3323
3324	rcu_read_lock();
3325	list_for_each_entry_rcu(ptype, head, list) {
3326		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3327			continue;
3328
3329		err = ptype->gro_complete(skb);
 
 
3330		break;
3331	}
3332	rcu_read_unlock();
3333
3334	if (err) {
3335		WARN_ON(&ptype->list == head);
3336		kfree_skb(skb);
3337		return NET_RX_SUCCESS;
3338	}
3339
3340out:
3341	return netif_receive_skb(skb);
 
3342}
3343
3344inline void napi_gro_flush(struct napi_struct *napi)
 
3345{
3346	struct sk_buff *skb, *next;
 
3347
3348	for (skb = napi->gro_list; skb; skb = next) {
3349		next = skb->next;
3350		skb->next = NULL;
3351		napi_gro_complete(skb);
 
 
3352	}
3353
3354	napi->gro_count = 0;
3355	napi->gro_list = NULL;
3356}
3357EXPORT_SYMBOL(napi_gro_flush);
3358
3359enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 
3360{
3361	struct sk_buff **pp = NULL;
3362	struct packet_type *ptype;
3363	__be16 type = skb->protocol;
3364	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3365	int same_flow;
3366	int mac_len;
3367	enum gro_result ret;
 
 
3368
3369	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3370		goto normal;
3371
3372	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3373		goto normal;
3374
3375	rcu_read_lock();
3376	list_for_each_entry_rcu(ptype, head, list) {
3377		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3378			continue;
3379
3380		skb_set_network_header(skb, skb_gro_offset(skb));
3381		mac_len = skb->network_header - skb->mac_header;
3382		skb->mac_len = mac_len;
3383		NAPI_GRO_CB(skb)->same_flow = 0;
3384		NAPI_GRO_CB(skb)->flush = 0;
3385		NAPI_GRO_CB(skb)->free = 0;
3386
3387		pp = ptype->gro_receive(&napi->gro_list, skb);
 
 
3388		break;
3389	}
3390	rcu_read_unlock();
3391
3392	if (&ptype->list == head)
3393		goto normal;
3394
 
 
 
 
 
3395	same_flow = NAPI_GRO_CB(skb)->same_flow;
3396	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3397
3398	if (pp) {
3399		struct sk_buff *nskb = *pp;
3400
3401		*pp = nskb->next;
3402		nskb->next = NULL;
3403		napi_gro_complete(nskb);
3404		napi->gro_count--;
3405	}
3406
3407	if (same_flow)
3408		goto ok;
3409
3410	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3411		goto normal;
3412
3413	napi->gro_count++;
 
 
 
 
3414	NAPI_GRO_CB(skb)->count = 1;
 
 
3415	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3416	skb->next = napi->gro_list;
3417	napi->gro_list = skb;
3418	ret = GRO_HELD;
3419
3420pull:
3421	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3422		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3423
3424		BUG_ON(skb->end - skb->tail < grow);
3425
3426		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3427
3428		skb->tail += grow;
3429		skb->data_len -= grow;
3430
3431		skb_shinfo(skb)->frags[0].page_offset += grow;
3432		skb_shinfo(skb)->frags[0].size -= grow;
3433
3434		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3435			put_page(skb_shinfo(skb)->frags[0].page);
3436			memmove(skb_shinfo(skb)->frags,
3437				skb_shinfo(skb)->frags + 1,
3438				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3439		}
3440	}
3441
3442ok:
3443	return ret;
3444
3445normal:
3446	ret = GRO_NORMAL;
3447	goto pull;
3448}
3449EXPORT_SYMBOL(dev_gro_receive);
3450
3451static inline gro_result_t
3452__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3453{
3454	struct sk_buff *p;
 
3455
3456	for (p = napi->gro_list; p; p = p->next) {
3457		unsigned long diffs;
 
 
 
 
 
 
3458
3459		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3460		diffs |= p->vlan_tci ^ skb->vlan_tci;
3461		diffs |= compare_ether_header(skb_mac_header(p),
3462					      skb_gro_mac_header(skb));
3463		NAPI_GRO_CB(p)->same_flow = !diffs;
3464		NAPI_GRO_CB(p)->flush = 0;
 
 
 
3465	}
 
 
 
3466
3467	return dev_gro_receive(napi, skb);
 
 
 
 
3468}
3469
3470gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 
3471{
3472	switch (ret) {
3473	case GRO_NORMAL:
3474		if (netif_receive_skb(skb))
3475			ret = GRO_DROP;
3476		break;
3477
3478	case GRO_DROP:
3479	case GRO_MERGED_FREE:
3480		kfree_skb(skb);
3481		break;
3482
 
 
 
 
 
 
 
3483	case GRO_HELD:
3484	case GRO_MERGED:
 
3485		break;
3486	}
3487
3488	return ret;
3489}
3490EXPORT_SYMBOL(napi_skb_finish);
3491
3492void skb_gro_reset_offset(struct sk_buff *skb)
3493{
3494	NAPI_GRO_CB(skb)->data_offset = 0;
3495	NAPI_GRO_CB(skb)->frag0 = NULL;
3496	NAPI_GRO_CB(skb)->frag0_len = 0;
3497
3498	if (skb->mac_header == skb->tail &&
3499	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3500		NAPI_GRO_CB(skb)->frag0 =
3501			page_address(skb_shinfo(skb)->frags[0].page) +
3502			skb_shinfo(skb)->frags[0].page_offset;
3503		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3504	}
3505}
3506EXPORT_SYMBOL(skb_gro_reset_offset);
3507
3508gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3509{
3510	skb_gro_reset_offset(skb);
3511
3512	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
 
 
 
3513}
3514EXPORT_SYMBOL(napi_gro_receive);
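/*
 * Illustrative sketch only: a GRO-capable NAPI driver feeds received
 * frames through napi_gro_receive() instead of netif_receive_skb().
 * priv->napi is the driver's napi_struct (hypothetical name).
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(&priv->napi, skb);
 */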
3515
3516static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3517{
 
 
 
 
3518	__skb_pull(skb, skb_headlen(skb));
3519	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3520	skb->vlan_tci = 0;
 
3521	skb->dev = napi->dev;
3522	skb->skb_iif = 0;
3523
3524	napi->skb = skb;
3525}
3526
3527struct sk_buff *napi_get_frags(struct napi_struct *napi)
3528{
3529	struct sk_buff *skb = napi->skb;
3530
3531	if (!skb) {
3532		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3533		if (skb)
3534			napi->skb = skb;
 
 
3535	}
3536	return skb;
3537}
3538EXPORT_SYMBOL(napi_get_frags);
3539
3540gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3541			       gro_result_t ret)
 
3542{
3543	switch (ret) {
3544	case GRO_NORMAL:
3545	case GRO_HELD:
 
3546		skb->protocol = eth_type_trans(skb, skb->dev);
3547
3548		if (ret == GRO_HELD)
3549			skb_gro_pull(skb, -ETH_HLEN);
3550		else if (netif_receive_skb(skb))
3551			ret = GRO_DROP;
3552		break;
3553
3554	case GRO_DROP:
3555	case GRO_MERGED_FREE:
3556		napi_reuse_skb(napi, skb);
3557		break;
3558
 
 
 
 
 
 
 
3559	case GRO_MERGED:
 
3560		break;
3561	}
3562
3563	return ret;
3564}
3565EXPORT_SYMBOL(napi_frags_finish);
3566
3567struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 
 
 
 
3568{
3569	struct sk_buff *skb = napi->skb;
3570	struct ethhdr *eth;
3571	unsigned int hlen;
3572	unsigned int off;
3573
3574	napi->skb = NULL;
3575
3576	skb_reset_mac_header(skb);
3577	skb_gro_reset_offset(skb);
3578
3579	off = skb_gro_offset(skb);
3580	hlen = off + sizeof(*eth);
3581	eth = skb_gro_header_fast(skb, off);
3582	if (skb_gro_header_hard(skb, hlen)) {
3583		eth = skb_gro_header_slow(skb, hlen, off);
3584		if (unlikely(!eth)) {
 
 
3585			napi_reuse_skb(napi, skb);
3586			skb = NULL;
3587			goto out;
3588		}
 
 
 
 
 
3589	}
3590
3591	skb_gro_pull(skb, sizeof(*eth));
3592
3593	/*
3594	 * This works because the only protocols we care about don't require
3595	 * special handling.  We'll fix it up properly at the end.
 
3596	 */
3597	skb->protocol = eth->h_proto;
3598
3599out:
3600	return skb;
3601}
3602EXPORT_SYMBOL(napi_frags_skb);
3603
3604gro_result_t napi_gro_frags(struct napi_struct *napi)
3605{
 
3606	struct sk_buff *skb = napi_frags_skb(napi);
3607
3608	if (!skb)
3609		return GRO_DROP;
3610
3611	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
 
 
 
 
 
3612}
3613EXPORT_SYMBOL(napi_gro_frags);
3614
3615/*
3616 * net_rps_action sends any pending IPI's for rps.
3617 * Note: called with local irq disabled, but exits with local irq enabled.
3618 */
3619static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3620{
3621#ifdef CONFIG_RPS
3622	struct softnet_data *remsd = sd->rps_ipi_list;
3623
3624	if (remsd) {
3625		sd->rps_ipi_list = NULL;
3626
3627		local_irq_enable();
3628
3629		/* Send pending IPI's to kick RPS processing on remote cpus. */
3630		while (remsd) {
3631			struct softnet_data *next = remsd->rps_ipi_next;
3632
3633			if (cpu_online(remsd->cpu))
3634				__smp_call_function_single(remsd->cpu,
3635							   &remsd->csd, 0);
3636			remsd = next;
3637		}
3638	} else
3639#endif
3640		local_irq_enable();
3641}
3642
3643static int process_backlog(struct napi_struct *napi, int quota)
3644{
3645	int work = 0;
3646	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 
 
3647
3648#ifdef CONFIG_RPS
3649	/* Check if we have pending IPIs; it's better to send them now
3650	 * rather than waiting for net_rx_action() to end.
3651	 */
3652	if (sd->rps_ipi_list) {
3653		local_irq_disable();
3654		net_rps_action_and_irq_enable(sd);
3655	}
3656#endif
3657	napi->weight = weight_p;
3658	local_irq_disable();
3659	while (work < quota) {
3660		struct sk_buff *skb;
3661		unsigned int qlen;
3662
3663		while ((skb = __skb_dequeue(&sd->process_queue))) {
3664			local_irq_enable();
3665			__netif_receive_skb(skb);
3666			local_irq_disable();
3667			input_queue_head_incr(sd);
3668			if (++work >= quota) {
3669				local_irq_enable();
3670				return work;
3671			}
3672		}
3673
 
3674		rps_lock(sd);
3675		qlen = skb_queue_len(&sd->input_pkt_queue);
3676		if (qlen)
3677			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3678						   &sd->process_queue);
3679
3680		if (qlen < quota - work) {
3681			/*
3682			 * Inline a custom version of __napi_complete().
3683			 * Only the current cpu owns and manipulates this napi,
3684			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3685			 * so we can use a plain write instead of clear_bit()
3686			 * and we don't need an smp_mb() memory barrier.
3687			 */
3688			list_del(&napi->poll_list);
3689			napi->state = 0;
3690
3691			quota = work + qlen;
 
 
3692		}
3693		rps_unlock(sd);
 
3694	}
3695	local_irq_enable();
3696
3697	return work;
3698}
3699
3700/**
3701 * __napi_schedule - schedule for receive
3702 * @n: entry to schedule
3703 *
3704 * The entry's receive function will be scheduled to run
 
3705 */
3706void __napi_schedule(struct napi_struct *n)
3707{
3708	unsigned long flags;
3709
3710	local_irq_save(flags);
3711	____napi_schedule(&__get_cpu_var(softnet_data), n);
3712	local_irq_restore(flags);
3713}
3714EXPORT_SYMBOL(__napi_schedule);
3715
3716void __napi_complete(struct napi_struct *n)
3717{
3718	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3719	BUG_ON(n->gro_list);
3720
3721	list_del(&n->poll_list);
3722	smp_mb__before_clear_bit();
3723	clear_bit(NAPI_STATE_SCHED, &n->state);
3724}
3725EXPORT_SYMBOL(__napi_complete);
3726
3727void napi_complete(struct napi_struct *n)
3728{
3729	unsigned long flags;
 
3730
3731	/*
3732	 * don't let napi dequeue from the cpu poll list
3733	 * just in case it's running on a different cpu
3734	 */
3735	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3736		return;
3737
3738	napi_gro_flush(n);
3739	local_irq_save(flags);
3740	__napi_complete(n);
3741	local_irq_restore(flags);
3742}
3743EXPORT_SYMBOL(napi_complete);
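/*
 * A minimal sketch of the usual driver-side contract for the NAPI helpers
 * above; everything prefixed with foo_ is hypothetical.
 *
 *	static irqreturn_t foo_intr(int irq, void *data)
 *	{
 *		struct foo_priv *priv = data;
 *
 *		foo_disable_rx_irq(priv);
 *		napi_schedule(&priv->napi);	// ends up in __napi_schedule()
 *		return IRQ_HANDLED;
 *	}
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
 *		int work = foo_clean_rx(priv, budget);
 *
 *		if (work < budget) {
 *			napi_complete(napi);	// flushes GRO, clears SCHED
 *			foo_enable_rx_irq(priv);
 *		}
 *		return work;
 *	}
 */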
3744
3745void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3746		    int (*poll)(struct napi_struct *, int), int weight)
3747{
3748	INIT_LIST_HEAD(&napi->poll_list);
3749	napi->gro_count = 0;
3750	napi->gro_list = NULL;
 
3751	napi->skb = NULL;
 
 
3752	napi->poll = poll;
 
 
 
3753	napi->weight = weight;
3754	list_add(&napi->dev_list, &dev->napi_list);
3755	napi->dev = dev;
3756#ifdef CONFIG_NETPOLL
3757	spin_lock_init(&napi->poll_lock);
3758	napi->poll_owner = -1;
3759#endif
3760	set_bit(NAPI_STATE_SCHED, &napi->state);
 
 
 
3761}
3762EXPORT_SYMBOL(netif_napi_add);
3763
3764void netif_napi_del(struct napi_struct *napi)
3765{
3766	struct sk_buff *skb, *next;
3767
3768	list_del_init(&napi->dev_list);
3769	napi_free_frags(napi);
3770
3771	for (skb = napi->gro_list; skb; skb = next) {
3772		next = skb->next;
3773		skb->next = NULL;
3774		kfree_skb(skb);
3775	}
3776
3777	napi->gro_list = NULL;
3778	napi->gro_count = 0;
3779}
3780EXPORT_SYMBOL(netif_napi_del);
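/*
 * A minimal registration sketch, assuming a hypothetical foo_ driver;
 * the weight of 64 is only an example.
 *
 *	netif_napi_add(netdev, &priv->napi, foo_poll, 64);	// at probe
 *	napi_enable(&priv->napi);				// in ndo_open
 *	...
 *	napi_disable(&priv->napi);				// in ndo_stop
 *	netif_napi_del(&priv->napi);				// at remove
 */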
3781
3782static void net_rx_action(struct softirq_action *h)
3783{
3784	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3785	unsigned long time_limit = jiffies + 2;
 
3786	int budget = netdev_budget;
3787	void *have;
 
3788
3789	local_irq_disable();
 
 
3790
3791	while (!list_empty(&sd->poll_list)) {
3792		struct napi_struct *n;
3793		int work, weight;
3794
3795		/* If the softirq window is exhausted then punt.
3796		 * Allow this to run for 2 jiffies, which allows
3797		 * an average latency of 1.5/HZ.
3798		 */
3799		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3800			goto softnet_break;
 
 
 
 
3801
3802		local_irq_enable();
3803
3804		/* Even though interrupts have been re-enabled, this
3805		 * access is safe because interrupts can only add new
3806		 * entries to the tail of this list, and only ->poll()
3807		 * calls can remove this head entry from the list.
3808		 */
3809		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3810
3811		have = netpoll_poll_lock(n);
 
 
 
3812
3813		weight = n->weight;
 
3814
3815		/* This NAPI_STATE_SCHED test is for avoiding a race
3816		 * with netpoll's poll_napi().  Only the entity which
3817		 * obtains the lock and sees NAPI_STATE_SCHED set will
3818		 * actually make the ->poll() call.  Therefore we avoid
3819		 * accidentally calling ->poll() when NAPI is not scheduled.
3820		 */
3821		work = 0;
3822		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3823			work = n->poll(n, weight);
3824			trace_napi_poll(n);
3825		}
3826
3827		WARN_ON_ONCE(work > weight);
 
3828
3829		budget -= work;
 
3830
3831		local_irq_disable();
 
3832
3833		/* Drivers must not modify the NAPI state if they
3834		 * consume the entire weight.  In such cases this code
3835		 * still "owns" the NAPI instance and therefore can
3836		 * move the instance around on the list at-will.
3837		 */
3838		if (unlikely(work == weight)) {
3839			if (unlikely(napi_disable_pending(n))) {
3840				local_irq_enable();
3841				napi_complete(n);
3842				local_irq_disable();
3843			} else
3844				list_move_tail(&n->poll_list, &sd->poll_list);
3845		}
3846
3847		netpoll_poll_unlock(have);
3848	}
3849out:
3850	net_rps_action_and_irq_enable(sd);
3851
3852#ifdef CONFIG_NET_DMA
3853	/*
3854	 * There may not be any more sk_buffs coming right now, so push
3855	 * any pending DMA copies to hardware
3856	 */
3857	dma_issue_pending_all();
3858#endif
3859
3860	return;
 
 
 
3861
3862softnet_break:
3863	sd->time_squeeze++;
3864	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3865	goto out;
3866}
3867
3868static gifconf_func_t *gifconf_list[NPROTO];
3869
3870/**
3871 *	register_gifconf	-	register a SIOCGIF handler
3872 *	@family: Address family
3873 *	@gifconf: Function handler
3874 *
3875 *	Register protocol dependent address dumping routines. The handler
3876 *	that is passed must not be freed or reused until it has been replaced
3877 *	by another handler.
3878 */
3879int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
 
3880{
3881	if (family >= NPROTO)
3882		return -EINVAL;
3883	gifconf_list[family] = gifconf;
3884	return 0;
3885}
3886EXPORT_SYMBOL(register_gifconf);
3887
 
3888
3889/*
3890 *	Map an interface index to its name (SIOCGIFNAME)
3891 */
3892
3893/*
3894 *	We need this ioctl for efficient implementation of the
3895 *	if_indextoname() function required by the IPv6 API.  Without
3896 *	it, we would have to search all the interfaces to find a
3897 *	match.  --pb
3898 */
3899
3900static int dev_ifname(struct net *net, struct ifreq __user *arg)
3901{
3902	struct net_device *dev;
3903	struct ifreq ifr;
3904
3905	/*
3906	 *	Fetch the caller's info block.
3907	 */
3908
3909	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3910		return -EFAULT;
3911
3912	rcu_read_lock();
3913	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3914	if (!dev) {
3915		rcu_read_unlock();
3916		return -ENODEV;
3917	}
 
3918
3919	strcpy(ifr.ifr_name, dev->name);
3920	rcu_read_unlock();
 
3921
3922	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3923		return -EFAULT;
3924	return 0;
3925}
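/*
 * A user-space sketch of the ioctl served above, roughly what an
 * if_indextoname() implementation boils down to (error handling omitted):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_ifindex = ifindex;
 *	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
 *		printf("index %d is %s\n", ifindex, ifr.ifr_name);
 *	close(fd);
 */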
3926
3927/*
3928 *	Perform a SIOCGIFCONF call. This structure will change
3929 *	size eventually, and there is nothing I can do about it.
3930 *	Thus we will need a 'compatibility mode'.
 
 
3931 */
3932
3933static int dev_ifconf(struct net *net, char __user *arg)
3934{
3935	struct ifconf ifc;
3936	struct net_device *dev;
3937	char __user *pos;
3938	int len;
3939	int total;
3940	int i;
3941
3942	/*
3943	 *	Fetch the caller's info block.
3944	 */
3945
3946	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3947		return -EFAULT;
 
3948
3949	pos = ifc.ifc_buf;
3950	len = ifc.ifc_len;
3951
3952	/*
3953	 *	Loop over the interfaces, and write an info block for each.
3954	 */
3955
3956	total = 0;
3957	for_each_netdev(net, dev) {
3958		for (i = 0; i < NPROTO; i++) {
3959			if (gifconf_list[i]) {
3960				int done;
3961				if (!pos)
3962					done = gifconf_list[i](dev, NULL, 0);
3963				else
3964					done = gifconf_list[i](dev, pos + total,
3965							       len - total);
3966				if (done < 0)
3967					return -EFAULT;
3968				total += done;
3969			}
3970		}
3971	}
3972
3973	/*
3974	 *	All done.  Write the updated control block back to the caller.
3975	 */
3976	ifc.ifc_len = total;
3977
3978	/*
3979	 * 	Both BSD and Solaris return 0 here, so we do too.
3980	 */
3981	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3982}
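/*
 * A user-space sketch of the usual two-pass SIOCGIFCONF call: a NULL
 * buffer (the !pos case above) only reports the required length, the
 * second call fills an array of struct ifreq (error handling omitted):
 *
 *	struct ifconf ifc;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifc, 0, sizeof(ifc));
 *	ioctl(fd, SIOCGIFCONF, &ifc);		// ifc.ifc_len = needed size
 *	ifc.ifc_buf = malloc(ifc.ifc_len);
 *	ioctl(fd, SIOCGIFCONF, &ifc);		// fills the ifreq array
 *	...
 *	free(ifc.ifc_buf);
 *	close(fd);
 */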
 
3983
3984#ifdef CONFIG_PROC_FS
3985/*
3986 *	This is invoked by the /proc filesystem handler to display a device
3987 *	in detail.
3988 */
3989void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3990	__acquires(RCU)
3991{
3992	struct net *net = seq_file_net(seq);
3993	loff_t off;
3994	struct net_device *dev;
3995
3996	rcu_read_lock();
3997	if (!*pos)
3998		return SEQ_START_TOKEN;
3999
4000	off = 1;
4001	for_each_netdev_rcu(net, dev)
4002		if (off++ == *pos)
4003			return dev;
4004
4005	return NULL;
 
 
 
4006}
4007
4008void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
4009{
4010	struct net_device *dev = v;
4011
4012	if (v == SEQ_START_TOKEN)
4013		dev = first_net_device_rcu(seq_file_net(seq));
4014	else
4015		dev = next_net_device_rcu(dev);
4016
4017	++*pos;
4018	return dev;
4019}
4020
4021void dev_seq_stop(struct seq_file *seq, void *v)
4022	__releases(RCU)
 
 
4023{
4024	rcu_read_unlock();
4025}
4026
4027static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4028{
4029	struct rtnl_link_stats64 temp;
4030	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
 
4031
4032	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4033		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4034		   dev->name, stats->rx_bytes, stats->rx_packets,
4035		   stats->rx_errors,
4036		   stats->rx_dropped + stats->rx_missed_errors,
4037		   stats->rx_fifo_errors,
4038		   stats->rx_length_errors + stats->rx_over_errors +
4039		    stats->rx_crc_errors + stats->rx_frame_errors,
4040		   stats->rx_compressed, stats->multicast,
4041		   stats->tx_bytes, stats->tx_packets,
4042		   stats->tx_errors, stats->tx_dropped,
4043		   stats->tx_fifo_errors, stats->collisions,
4044		   stats->tx_carrier_errors +
4045		    stats->tx_aborted_errors +
4046		    stats->tx_window_errors +
4047		    stats->tx_heartbeat_errors,
4048		   stats->tx_compressed);
4049}
4050
4051/*
4052 *	Called from the PROCfs module. This now uses the new arbitrary sized
4053 *	/proc/net interface to create /proc/net/dev
4054 */
4055static int dev_seq_show(struct seq_file *seq, void *v)
 
4056{
4057	if (v == SEQ_START_TOKEN)
4058		seq_puts(seq, "Inter-|   Receive                            "
4059			      "                    |  Transmit\n"
4060			      " face |bytes    packets errs drop fifo frame "
4061			      "compressed multicast|bytes    packets errs "
4062			      "drop fifo colls carrier compressed\n");
4063	else
4064		dev_seq_printf_stats(seq, v);
4065	return 0;
 
4066}
 
4067
4068static struct softnet_data *softnet_get_online(loff_t *pos)
4069{
4070	struct softnet_data *sd = NULL;
4071
4072	while (*pos < nr_cpu_ids)
4073		if (cpu_online(*pos)) {
4074			sd = &per_cpu(softnet_data, *pos);
4075			break;
4076		} else
4077			++*pos;
4078	return sd;
 
 
 
4079}
 
4080
4081static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4082{
4083	return softnet_get_online(pos);
4084}
 
4085
4086static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
4087{
4088	++*pos;
4089	return softnet_get_online(pos);
4090}
4091
4092static void softnet_seq_stop(struct seq_file *seq, void *v)
 
 
4093{
4094}
4095
4096static int softnet_seq_show(struct seq_file *seq, void *v)
 
 
 
4097{
4098	struct softnet_data *sd = v;
4099
4100	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4101		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4102		   0, 0, 0, 0, /* was fastroute */
4103		   sd->cpu_collision, sd->received_rps);
4104	return 0;
4105}
 
4106
4107static const struct seq_operations dev_seq_ops = {
4108	.start = dev_seq_start,
4109	.next  = dev_seq_next,
4110	.stop  = dev_seq_stop,
4111	.show  = dev_seq_show,
4112};
4113
4114static int dev_seq_open(struct inode *inode, struct file *file)
4115{
4116	return seq_open_net(inode, file, &dev_seq_ops,
4117			    sizeof(struct seq_net_private));
4118}
 
4119
4120static const struct file_operations dev_seq_fops = {
4121	.owner	 = THIS_MODULE,
4122	.open    = dev_seq_open,
4123	.read    = seq_read,
4124	.llseek  = seq_lseek,
4125	.release = seq_release_net,
4126};
4127
4128static const struct seq_operations softnet_seq_ops = {
4129	.start = softnet_seq_start,
4130	.next  = softnet_seq_next,
4131	.stop  = softnet_seq_stop,
4132	.show  = softnet_seq_show,
4133};
 
 
 
4134
4135static int softnet_seq_open(struct inode *inode, struct file *file)
 
 
 
4136{
4137	return seq_open(file, &softnet_seq_ops);
4138}
4139
4140static const struct file_operations softnet_seq_fops = {
4141	.owner	 = THIS_MODULE,
4142	.open    = softnet_seq_open,
4143	.read    = seq_read,
4144	.llseek  = seq_lseek,
4145	.release = seq_release,
4146};
4147
4148static void *ptype_get_idx(loff_t pos)
 
4149{
4150	struct packet_type *pt = NULL;
4151	loff_t i = 0;
4152	int t;
4153
4154	list_for_each_entry_rcu(pt, &ptype_all, list) {
4155		if (i == pos)
4156			return pt;
4157		++i;
4158	}
4159
4160	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4161		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4162			if (i == pos)
4163				return pt;
4164			++i;
4165		}
 
 
 
4166	}
4167	return NULL;
4168}
 
4169
4170static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4171	__acquires(RCU)
4172{
4173	rcu_read_lock();
4174	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4175}
 
4176
4177static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 
4178{
4179	struct packet_type *pt;
4180	struct list_head *nxt;
4181	int hash;
4182
4183	++*pos;
4184	if (v == SEQ_START_TOKEN)
4185		return ptype_get_idx(0);
4186
4187	pt = v;
4188	nxt = pt->list.next;
4189	if (pt->type == htons(ETH_P_ALL)) {
4190		if (nxt != &ptype_all)
4191			goto found;
4192		hash = 0;
4193		nxt = ptype_base[0].next;
4194	} else
4195		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4196
4197	while (nxt == &ptype_base[hash]) {
4198		if (++hash >= PTYPE_HASH_SIZE)
4199			return NULL;
4200		nxt = ptype_base[hash].next;
4201	}
4202found:
4203	return list_entry(nxt, struct packet_type, list);
4204}
4205
4206static void ptype_seq_stop(struct seq_file *seq, void *v)
4207	__releases(RCU)
 
4208{
4209	rcu_read_unlock();
 
 
4210}
4211
4212static int ptype_seq_show(struct seq_file *seq, void *v)
 
 
 
4213{
4214	struct packet_type *pt = v;
 
4215
4216	if (v == SEQ_START_TOKEN)
4217		seq_puts(seq, "Type Device      Function\n");
4218	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4219		if (pt->type == htons(ETH_P_ALL))
4220			seq_puts(seq, "ALL ");
4221		else
4222			seq_printf(seq, "%04x", ntohs(pt->type));
4223
4224		seq_printf(seq, " %-8s %pF\n",
4225			   pt->dev ? pt->dev->name : "", pt->func);
4226	}
4227
4228	return 0;
4229}
4230
4231static const struct seq_operations ptype_seq_ops = {
4232	.start = ptype_seq_start,
4233	.next  = ptype_seq_next,
4234	.stop  = ptype_seq_stop,
4235	.show  = ptype_seq_show,
4236};
4237
4238static int ptype_seq_open(struct inode *inode, struct file *file)
4239{
4240	return seq_open_net(inode, file, &ptype_seq_ops,
4241			sizeof(struct seq_net_private));
4242}
4243
4244static const struct file_operations ptype_seq_fops = {
4245	.owner	 = THIS_MODULE,
4246	.open    = ptype_seq_open,
4247	.read    = seq_read,
4248	.llseek  = seq_lseek,
4249	.release = seq_release_net,
4250};
4251
4252
4253static int __net_init dev_proc_net_init(struct net *net)
4254{
4255	int rc = -ENOMEM;
 
 
 
4256
4257	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4258		goto out;
4259	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4260		goto out_dev;
4261	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4262		goto out_softnet;
4263
4264	if (wext_proc_init(net))
4265		goto out_ptype;
4266	rc = 0;
4267out:
4268	return rc;
4269out_ptype:
4270	proc_net_remove(net, "ptype");
4271out_softnet:
4272	proc_net_remove(net, "softnet_stat");
4273out_dev:
4274	proc_net_remove(net, "dev");
4275	goto out;
 
4276}
4277
4278static void __net_exit dev_proc_net_exit(struct net *net)
 
4279{
4280	wext_proc_exit(net);
 
4281
4282	proc_net_remove(net, "ptype");
4283	proc_net_remove(net, "softnet_stat");
4284	proc_net_remove(net, "dev");
 
4285}
4286
4287static struct pernet_operations __net_initdata dev_proc_ops = {
4288	.init = dev_proc_net_init,
4289	.exit = dev_proc_net_exit,
4290};
4291
4292static int __init dev_proc_init(void)
4293{
4294	return register_pernet_subsys(&dev_proc_ops);
4295}
4296#else
4297#define dev_proc_init() 0
4298#endif	/* CONFIG_PROC_FS */
4299
4300
4301/**
4302 *	netdev_set_master	-	set up master pointer
4303 *	@slave: slave device
4304 *	@master: new master device
 
4305 *
4306 *	Changes the master device of the slave. Pass %NULL to break the
4307 *	bonding. The caller must hold the RTNL semaphore. On a failure
4308 *	a negative errno code is returned. On success the reference counts
4309 *	are adjusted and the function returns zero.
4310 */
4311int netdev_set_master(struct net_device *slave, struct net_device *master)
 
 
 
4312{
4313	struct net_device *old = slave->master;
4314
4315	ASSERT_RTNL();
 
 
 
 
4316
4317	if (master) {
4318		if (old)
4319			return -EBUSY;
4320		dev_hold(master);
4321	}
 
4322
4323	slave->master = master;
 
 
4324
4325	if (old)
4326		dev_put(old);
4327	return 0;
4328}
4329EXPORT_SYMBOL(netdev_set_master);
4330
4331/**
4332 *	netdev_set_bond_master	-	set up bonding master/slave pair
4333 *	@slave: slave device
4334 *	@master: new master device
4335 *
4336 *	Changes the master device of the slave. Pass %NULL to break the
4337 *	bonding. The caller must hold the RTNL semaphore. On a failure
4338 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4339 *	to the routing socket and the function returns zero.
4340 */
4341int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
 
4342{
4343	int err;
 
 
4344
4345	ASSERT_RTNL();
4346
4347	err = netdev_set_master(slave, master);
4348	if (err)
4349		return err;
4350	if (master)
4351		slave->flags |= IFF_SLAVE;
4352	else
4353		slave->flags &= ~IFF_SLAVE;
4354
4355	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4356	return 0;
4357}
4358EXPORT_SYMBOL(netdev_set_bond_master);
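/*
 * A minimal sketch of how a bonding-style driver would use the helper
 * above (rtnl held by the caller); bond_dev and slave_dev are
 * hypothetical:
 *
 *	err = netdev_set_bond_master(slave_dev, bond_dev);	// enslave
 *	if (err)
 *		return err;
 *	...
 *	netdev_set_bond_master(slave_dev, NULL);		// release
 */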
4359
4360static void dev_change_rx_flags(struct net_device *dev, int flags)
4361{
4362	const struct net_device_ops *ops = dev->netdev_ops;
4363
4364	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4365		ops->ndo_change_rx_flags(dev, flags);
4366}
4367
4368static int __dev_set_promiscuity(struct net_device *dev, int inc)
4369{
4370	unsigned short old_flags = dev->flags;
4371	uid_t uid;
4372	gid_t gid;
4373
4374	ASSERT_RTNL();
4375
4376	dev->flags |= IFF_PROMISC;
4377	dev->promiscuity += inc;
4378	if (dev->promiscuity == 0) {
4379		/*
4380		 * Avoid overflow.
4381		 * If inc would cause an overflow, leave promisc untouched and return an error.
4382		 */
4383		if (inc < 0)
4384			dev->flags &= ~IFF_PROMISC;
4385		else {
4386			dev->promiscuity -= inc;
4387			printk(KERN_WARNING "%s: promiscuity touches roof, "
4388				"set promiscuity failed, promiscuity feature "
4389				"of device might be broken.\n", dev->name);
4390			return -EOVERFLOW;
4391		}
4392	}
4393	if (dev->flags != old_flags) {
4394		printk(KERN_INFO "device %s %s promiscuous mode\n",
4395		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4396							       "left");
4397		if (audit_enabled) {
4398			current_uid_gid(&uid, &gid);
4399			audit_log(current->audit_context, GFP_ATOMIC,
4400				AUDIT_ANOM_PROMISCUOUS,
4401				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4402				dev->name, (dev->flags & IFF_PROMISC),
4403				(old_flags & IFF_PROMISC),
4404				audit_get_loginuid(current),
4405				uid, gid,
4406				audit_get_sessionid(current));
 
4407		}
4408
4409		dev_change_rx_flags(dev, IFF_PROMISC);
4410	}
 
 
4411	return 0;
4412}
4413
4414/**
4415 *	dev_set_promiscuity	- update promiscuity count on a device
4416 *	@dev: device
4417 *	@inc: modifier
4418 *
4419 *	Add or remove promiscuity from a device. While the count in the device
4420 *	remains above zero the interface remains promiscuous. Once it hits zero
4421 *	the device reverts back to normal filtering operation. A negative inc
4422 *	value is used to drop promiscuity on the device.
4423 *	Return 0 if successful or a negative errno code on error.
4424 */
4425int dev_set_promiscuity(struct net_device *dev, int inc)
4426{
4427	unsigned short old_flags = dev->flags;
4428	int err;
4429
4430	err = __dev_set_promiscuity(dev, inc);
4431	if (err < 0)
4432		return err;
4433	if (dev->flags != old_flags)
4434		dev_set_rx_mode(dev);
4435	return err;
4436}
4437EXPORT_SYMBOL(dev_set_promiscuity);
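/*
 * A minimal usage sketch: a capture-style user bumps and later drops the
 * promiscuity count under rtnl instead of touching dev->flags directly.
 *
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	...
 *	dev_set_promiscuity(dev, -1);	// drop our reference again
 *	rtnl_unlock();
 */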
4438
4439/**
4440 *	dev_set_allmulti	- update allmulti count on a device
4441 *	@dev: device
4442 *	@inc: modifier
4443 *
4444 *	Add or remove reception of all multicast frames to a device. While the
4445 *	count in the device remains above zero the interface remains listening
4446 *	to all multicast frames. Once it hits zero the device reverts back to normal
4447 *	filtering operation. A negative @inc value is used to drop the counter
4448 *	when releasing a resource needing all multicasts.
4449 *	Return 0 if successful or a negative errno code on error.
4450 */
4451
4452int dev_set_allmulti(struct net_device *dev, int inc)
4453{
4454	unsigned short old_flags = dev->flags;
4455
4456	ASSERT_RTNL();
4457
4458	dev->flags |= IFF_ALLMULTI;
4459	dev->allmulti += inc;
4460	if (dev->allmulti == 0) {
4461		/*
4462		 * Avoid overflow.
4463		 * If inc would cause an overflow, leave allmulti untouched and return an error.
4464		 */
4465		if (inc < 0)
4466			dev->flags &= ~IFF_ALLMULTI;
4467		else {
4468			dev->allmulti -= inc;
4469			printk(KERN_WARNING "%s: allmulti touches roof, "
4470				"set allmulti failed, allmulti feature of "
4471				"device might be broken.\n", dev->name);
4472			return -EOVERFLOW;
4473		}
4474	}
4475	if (dev->flags ^ old_flags) {
4476		dev_change_rx_flags(dev, IFF_ALLMULTI);
4477		dev_set_rx_mode(dev);
 
 
 
4478	}
4479	return 0;
4480}
4481EXPORT_SYMBOL(dev_set_allmulti);
4482
4483/*
4484 *	Upload unicast and multicast address lists to device and
4485 *	configure RX filtering. When the device doesn't support unicast
4486 *	filtering it is put in promiscuous mode while unicast addresses
4487 *	are present.
4488 */
4489void __dev_set_rx_mode(struct net_device *dev)
4490{
4491	const struct net_device_ops *ops = dev->netdev_ops;
4492
4493	/* dev_open will call this function so the list will stay sane. */
4494	if (!(dev->flags&IFF_UP))
4495		return;
4496
4497	if (!netif_device_present(dev))
4498		return;
4499
4500	if (ops->ndo_set_rx_mode)
4501		ops->ndo_set_rx_mode(dev);
4502	else {
4503		/* Unicast address changes may only happen under the rtnl,
4504		 * therefore calling __dev_set_promiscuity() here is safe.
4505		 */
4506		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4507			__dev_set_promiscuity(dev, 1);
4508			dev->uc_promisc = true;
4509		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4510			__dev_set_promiscuity(dev, -1);
4511			dev->uc_promisc = false;
4512		}
4513
4514		if (ops->ndo_set_multicast_list)
4515			ops->ndo_set_multicast_list(dev);
4516	}
 
 
 
4517}
4518
4519void dev_set_rx_mode(struct net_device *dev)
4520{
4521	netif_addr_lock_bh(dev);
4522	__dev_set_rx_mode(dev);
4523	netif_addr_unlock_bh(dev);
4524}
4525
4526/**
4527 *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4528 *	@dev: device
4529 *	@cmd: memory area for ethtool_ops::get_settings() result
4530 *
4531 *      The cmd arg is initialized properly (cleared and
4532 *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
4533 *
4534 *	Return device's ethtool_ops::get_settings() result value or
4535 *	-EOPNOTSUPP when device doesn't expose
4536 *	ethtool_ops::get_settings() operation.
4537 */
4538int dev_ethtool_get_settings(struct net_device *dev,
4539			     struct ethtool_cmd *cmd)
4540{
4541	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4542		return -EOPNOTSUPP;
4543
4544	memset(cmd, 0, sizeof(struct ethtool_cmd));
4545	cmd->cmd = ETHTOOL_GSET;
4546	return dev->ethtool_ops->get_settings(dev, cmd);
4547}
4548EXPORT_SYMBOL(dev_ethtool_get_settings);
4549
4550/**
4551 *	dev_get_flags - get flags reported to userspace
4552 *	@dev: device
4553 *
4554 *	Get the combination of flag bits exported through APIs to userspace.
4555 */
4556unsigned dev_get_flags(const struct net_device *dev)
4557{
4558	unsigned flags;
4559
4560	flags = (dev->flags & ~(IFF_PROMISC |
4561				IFF_ALLMULTI |
4562				IFF_RUNNING |
4563				IFF_LOWER_UP |
4564				IFF_DORMANT)) |
4565		(dev->gflags & (IFF_PROMISC |
4566				IFF_ALLMULTI));
4567
4568	if (netif_running(dev)) {
4569		if (netif_oper_up(dev))
4570			flags |= IFF_RUNNING;
4571		if (netif_carrier_ok(dev))
4572			flags |= IFF_LOWER_UP;
4573		if (netif_dormant(dev))
4574			flags |= IFF_DORMANT;
4575	}
4576
4577	return flags;
4578}
4579EXPORT_SYMBOL(dev_get_flags);
4580
4581int __dev_change_flags(struct net_device *dev, unsigned int flags)
 
4582{
4583	int old_flags = dev->flags;
4584	int ret;
4585
4586	ASSERT_RTNL();
4587
4588	/*
4589	 *	Set the flags on our device.
4590	 */
4591
4592	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4593			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4594			       IFF_AUTOMEDIA)) |
4595		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4596				    IFF_ALLMULTI));
4597
4598	/*
4599	 *	Load in the correct multicast list now the flags have changed.
4600	 */
4601
4602	if ((old_flags ^ flags) & IFF_MULTICAST)
4603		dev_change_rx_flags(dev, IFF_MULTICAST);
4604
4605	dev_set_rx_mode(dev);
4606
4607	/*
4608	 *	Have we downed the interface? We handle IFF_UP ourselves
4609	 *	according to user attempts to set it, rather than blindly
4610	 *	setting it.
4611	 */
4612
4613	ret = 0;
4614	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4615		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4616
4617		if (!ret)
4618			dev_set_rx_mode(dev);
4619	}
4620
4621	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4622		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 
4623
4624		dev->gflags ^= IFF_PROMISC;
4625		dev_set_promiscuity(dev, inc);
 
 
 
4626	}
4627
4628	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4629	   is important. Some (broken) drivers set IFF_PROMISC when
4630	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4631	 */
4632	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4633		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4634
4635		dev->gflags ^= IFF_ALLMULTI;
4636		dev_set_allmulti(dev, inc);
4637	}
4638
4639	return ret;
4640}
4641
4642void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
 
4643{
4644	unsigned int changes = dev->flags ^ old_flags;
4645
 
 
 
4646	if (changes & IFF_UP) {
4647		if (dev->flags & IFF_UP)
4648			call_netdevice_notifiers(NETDEV_UP, dev);
4649		else
4650			call_netdevice_notifiers(NETDEV_DOWN, dev);
4651	}
4652
4653	if (dev->flags & IFF_UP &&
4654	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4655		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4656}
4657
4658/**
4659 *	dev_change_flags - change device settings
4660 *	@dev: device
4661 *	@flags: device state flags
 
4662 *
4663 *	Change settings on device based state flags. The flags are
4664 *	in the userspace exported format.
4665 */
4666int dev_change_flags(struct net_device *dev, unsigned flags)
 
4667{
4668	int ret, changes;
4669	int old_flags = dev->flags;
4670
4671	ret = __dev_change_flags(dev, flags);
4672	if (ret < 0)
4673		return ret;
4674
4675	changes = old_flags ^ dev->flags;
4676	if (changes)
4677		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4678
4679	__dev_notify_flags(dev, old_flags);
4680	return ret;
4681}
4682EXPORT_SYMBOL(dev_change_flags);
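/*
 * A minimal sketch: bringing an interface up from kernel code goes
 * through dev_change_flags() under rtnl, which also notifies user space
 * via rtnetlink.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */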
4683
4684/**
4685 *	dev_set_mtu - Change maximum transfer unit
4686 *	@dev: device
4687 *	@new_mtu: new transfer unit
 
4688 *
4689 *	Change the maximum transfer size of the network device.
4690 */
4691int dev_set_mtu(struct net_device *dev, int new_mtu)
 
4692{
4693	const struct net_device_ops *ops = dev->netdev_ops;
4694	int err;
4695
4696	if (new_mtu == dev->mtu)
4697		return 0;
4698
4699	/*	MTU must be positive.	 */
4700	if (new_mtu < 0)
4701		return -EINVAL;
4702
4703	if (!netif_device_present(dev))
4704		return -ENODEV;
4705
4706	err = 0;
4707	if (ops->ndo_change_mtu)
4708		err = ops->ndo_change_mtu(dev, new_mtu);
4709	else
4710		dev->mtu = new_mtu;
4711
4712	if (!err && dev->flags & IFF_UP)
4713		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4714	return err;
4715}
4716EXPORT_SYMBOL(dev_set_mtu);
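/*
 * A minimal sketch: callers change the MTU under rtnl, and the
 * NETDEV_CHANGEMTU notifier lets stacked devices react. The value 9000
 * (jumbo frames) is only an example.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 */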
4717
4718/**
4719 *	dev_set_group - Change group this device belongs to
4720 *	@dev: device
4721 *	@new_group: group this device should belong to
4722 */
4723void dev_set_group(struct net_device *dev, int new_group)
4724{
4725	dev->group = new_group;
4726}
4727EXPORT_SYMBOL(dev_set_group);
4728
4729/**
4730 *	dev_set_mac_address - Change Media Access Control Address
4731 *	@dev: device
4732 *	@sa: new address
 
4733 *
4734 *	Change the hardware (MAC) address of the device
4735 */
4736int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 
4737{
4738	const struct net_device_ops *ops = dev->netdev_ops;
4739	int err;
4740
4741	if (!ops->ndo_set_mac_address)
4742		return -EOPNOTSUPP;
4743	if (sa->sa_family != dev->type)
4744		return -EINVAL;
4745	if (!netif_device_present(dev))
4746		return -ENODEV;
 
 
 
4747	err = ops->ndo_set_mac_address(dev, sa);
4748	if (!err)
4749		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4750	return err;
 
 
 
4751}
4752EXPORT_SYMBOL(dev_set_mac_address);
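/*
 * A minimal sketch of changing the hardware address from kernel code;
 * new_mac is a hypothetical buffer of dev->addr_len bytes and sa_family
 * must match dev->type (e.g. ARPHRD_ETHER).
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */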
4753
4754/*
4755 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
 
 
 
 
4756 */
4757static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4758{
4759	int err;
4760	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4761
4762	if (!dev)
 
 
4763		return -ENODEV;
 
 
 
4764
4765	switch (cmd) {
4766	case SIOCGIFFLAGS:	/* Get interface flags */
4767		ifr->ifr_flags = (short) dev_get_flags(dev);
4768		return 0;
4769
4770	case SIOCGIFMETRIC:	/* Get the metric on the interface
4771				   (currently unused) */
4772		ifr->ifr_metric = 0;
4773		return 0;
 
 
4774
4775	case SIOCGIFMTU:	/* Get the MTU of a device */
4776		ifr->ifr_mtu = dev->mtu;
4777		return 0;
 
 
4778
4779	case SIOCGIFHWADDR:
4780		if (!dev->addr_len)
4781			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4782		else
4783			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4784			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4785		ifr->ifr_hwaddr.sa_family = dev->type;
4786		return 0;
4787
4788	case SIOCGIFSLAVE:
4789		err = -EINVAL;
4790		break;
4791
4792	case SIOCGIFMAP:
4793		ifr->ifr_map.mem_start = dev->mem_start;
4794		ifr->ifr_map.mem_end   = dev->mem_end;
4795		ifr->ifr_map.base_addr = dev->base_addr;
4796		ifr->ifr_map.irq       = dev->irq;
4797		ifr->ifr_map.dma       = dev->dma;
4798		ifr->ifr_map.port      = dev->if_port;
4799		return 0;
4800
4801	case SIOCGIFINDEX:
4802		ifr->ifr_ifindex = dev->ifindex;
4803		return 0;
 
 
4804
4805	case SIOCGIFTXQLEN:
4806		ifr->ifr_qlen = dev->tx_queue_len;
4807		return 0;
4808
4809	default:
4810		/* dev_ioctl() should ensure this case
4811		 * is never reached
4812		 */
4813		WARN_ON(1);
4814		err = -ENOTTY;
4815		break;
4816
4817	}
 
4818	return err;
4819}
 
4820
4821/*
4822 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
 
 
 
4823 */
4824static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4825{
4826	int err;
4827	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4828	const struct net_device_ops *ops;
4829
4830	if (!dev)
4831		return -ENODEV;
 
4832
4833	ops = dev->netdev_ops;
 
 
4834
4835	switch (cmd) {
4836	case SIOCSIFFLAGS:	/* Set interface flags */
4837		return dev_change_flags(dev, ifr->ifr_flags);
4838
4839	case SIOCSIFMETRIC:	/* Set the metric on the interface
4840				   (currently unused) */
4841		return -EOPNOTSUPP;
4842
4843	case SIOCSIFMTU:	/* Set the MTU of a device */
4844		return dev_set_mtu(dev, ifr->ifr_mtu);
4845
4846	case SIOCSIFHWADDR:
4847		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4848
4849	case SIOCSIFHWBROADCAST:
4850		if (ifr->ifr_hwaddr.sa_family != dev->type)
4851			return -EINVAL;
4852		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4853		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4854		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4855		return 0;
 
 
 
 
4856
4857	case SIOCSIFMAP:
4858		if (ops->ndo_set_config) {
4859			if (!netif_device_present(dev))
4860				return -ENODEV;
4861			return ops->ndo_set_config(dev, &ifr->ifr_map);
 
 
 
4862		}
4863		return -EOPNOTSUPP;
 
 
4864
4865	case SIOCADDMULTI:
4866		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4867		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4868			return -EINVAL;
4869		if (!netif_device_present(dev))
4870			return -ENODEV;
4871		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4872
4873	case SIOCDELMULTI:
4874		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4875		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4876			return -EINVAL;
4877		if (!netif_device_present(dev))
4878			return -ENODEV;
4879		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4880
4881	case SIOCSIFTXQLEN:
4882		if (ifr->ifr_qlen < 0)
4883			return -EINVAL;
4884		dev->tx_queue_len = ifr->ifr_qlen;
4885		return 0;
4886
4887	case SIOCSIFNAME:
4888		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4889		return dev_change_name(dev, ifr->ifr_newname);
 
4890
4891	/*
4892	 *	Unknown or private ioctl
4893	 */
4894	default:
4895		if ((cmd >= SIOCDEVPRIVATE &&
4896		    cmd <= SIOCDEVPRIVATE + 15) ||
4897		    cmd == SIOCBONDENSLAVE ||
4898		    cmd == SIOCBONDRELEASE ||
4899		    cmd == SIOCBONDSETHWADDR ||
4900		    cmd == SIOCBONDSLAVEINFOQUERY ||
4901		    cmd == SIOCBONDINFOQUERY ||
4902		    cmd == SIOCBONDCHANGEACTIVE ||
4903		    cmd == SIOCGMIIPHY ||
4904		    cmd == SIOCGMIIREG ||
4905		    cmd == SIOCSMIIREG ||
4906		    cmd == SIOCBRADDIF ||
4907		    cmd == SIOCBRDELIF ||
4908		    cmd == SIOCSHWTSTAMP ||
4909		    cmd == SIOCWANDEV) {
4910			err = -EOPNOTSUPP;
4911			if (ops->ndo_do_ioctl) {
4912				if (netif_device_present(dev))
4913					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4914				else
4915					err = -ENODEV;
4916			}
4917		} else
4918			err = -EINVAL;
4919
4920	}
4921	return err;
 
 
 
4922}
4923
4924/*
4925 *	This function handles all "interface"-type I/O control requests. The actual
4926 *	'doing' part of this is dev_ifsioc above.
4927 */
 
 
4928
4929/**
4930 *	dev_ioctl	-	network device ioctl
4931 *	@net: the applicable net namespace
4932 *	@cmd: command to issue
4933 *	@arg: pointer to a struct ifreq in user space
4934 *
4935 *	Issue ioctl functions to devices. This is normally called by the
4936 *	user space syscall interfaces but can sometimes be useful for
4937 *	other purposes. The return value is the return from the syscall if
4938 *	positive or a negative errno code on error.
4939 */
4940
4941int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 
 
4942{
4943	struct ifreq ifr;
4944	int ret;
4945	char *colon;
4946
4947	/* One special case: SIOCGIFCONF takes an ifconf argument
4948	   and requires a shared lock, because it sleeps while writing
4949	   to user space.
4950	 */
4951
4952	if (cmd == SIOCGIFCONF) {
4953		rtnl_lock();
4954		ret = dev_ifconf(net, (char __user *) arg);
4955		rtnl_unlock();
4956		return ret;
4957	}
4958	if (cmd == SIOCGIFNAME)
4959		return dev_ifname(net, (struct ifreq __user *)arg);
4960
4961	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4962		return -EFAULT;
4963
4964	ifr.ifr_name[IFNAMSIZ-1] = 0;
4965
4966	colon = strchr(ifr.ifr_name, ':');
4967	if (colon)
4968		*colon = 0;
4969
4970	/*
4971	 *	See which interface the caller is talking about.
4972	 */
 
 
 
4973
4974	switch (cmd) {
4975	/*
4976	 *	These ioctl calls:
4977	 *	- can be done by all.
4978	 *	- atomic and do not require locking.
4979	 *	- return a value
4980	 */
4981	case SIOCGIFFLAGS:
4982	case SIOCGIFMETRIC:
4983	case SIOCGIFMTU:
4984	case SIOCGIFHWADDR:
4985	case SIOCGIFSLAVE:
4986	case SIOCGIFMAP:
4987	case SIOCGIFINDEX:
4988	case SIOCGIFTXQLEN:
4989		dev_load(net, ifr.ifr_name);
4990		rcu_read_lock();
4991		ret = dev_ifsioc_locked(net, &ifr, cmd);
4992		rcu_read_unlock();
4993		if (!ret) {
4994			if (colon)
4995				*colon = ':';
4996			if (copy_to_user(arg, &ifr,
4997					 sizeof(struct ifreq)))
4998				ret = -EFAULT;
4999		}
5000		return ret;
5001
5002	case SIOCETHTOOL:
5003		dev_load(net, ifr.ifr_name);
5004		rtnl_lock();
5005		ret = dev_ethtool(net, &ifr);
5006		rtnl_unlock();
5007		if (!ret) {
5008			if (colon)
5009				*colon = ':';
5010			if (copy_to_user(arg, &ifr,
5011					 sizeof(struct ifreq)))
5012				ret = -EFAULT;
 
5013		}
5014		return ret;
5015
5016	/*
5017	 *	These ioctl calls:
5018	 *	- require superuser power.
5019	 *	- require strict serialization.
5020	 *	- return a value
5021	 */
5022	case SIOCGMIIPHY:
5023	case SIOCGMIIREG:
5024	case SIOCSIFNAME:
5025		if (!capable(CAP_NET_ADMIN))
5026			return -EPERM;
5027		dev_load(net, ifr.ifr_name);
5028		rtnl_lock();
5029		ret = dev_ifsioc(net, &ifr, cmd);
5030		rtnl_unlock();
5031		if (!ret) {
5032			if (colon)
5033				*colon = ':';
5034			if (copy_to_user(arg, &ifr,
5035					 sizeof(struct ifreq)))
5036				ret = -EFAULT;
5037		}
5038		return ret;
5039
5040	/*
5041	 *	These ioctl calls:
5042	 *	- require superuser power.
5043	 *	- require strict serialization.
5044	 *	- do not return a value
5045	 */
5046	case SIOCSIFFLAGS:
5047	case SIOCSIFMETRIC:
5048	case SIOCSIFMTU:
5049	case SIOCSIFMAP:
5050	case SIOCSIFHWADDR:
5051	case SIOCSIFSLAVE:
5052	case SIOCADDMULTI:
5053	case SIOCDELMULTI:
5054	case SIOCSIFHWBROADCAST:
5055	case SIOCSIFTXQLEN:
5056	case SIOCSMIIREG:
5057	case SIOCBONDENSLAVE:
5058	case SIOCBONDRELEASE:
5059	case SIOCBONDSETHWADDR:
5060	case SIOCBONDCHANGEACTIVE:
5061	case SIOCBRADDIF:
5062	case SIOCBRDELIF:
5063	case SIOCSHWTSTAMP:
5064		if (!capable(CAP_NET_ADMIN))
5065			return -EPERM;
5066		/* fall through */
5067	case SIOCBONDSLAVEINFOQUERY:
5068	case SIOCBONDINFOQUERY:
5069		dev_load(net, ifr.ifr_name);
5070		rtnl_lock();
5071		ret = dev_ifsioc(net, &ifr, cmd);
5072		rtnl_unlock();
5073		return ret;
5074
5075	case SIOCGIFMEM:
5076		/* Get the per device memory space. We can add this but
5077		 * currently do not support it */
5078	case SIOCSIFMEM:
5079		/* Set the per device memory buffer space.
5080		 * Not applicable in our case */
5081	case SIOCSIFLINK:
5082		return -ENOTTY;
5083
5084	/*
5085	 *	Unknown or private ioctl.
5086	 */
5087	default:
5088		if (cmd == SIOCWANDEV ||
5089		    (cmd >= SIOCDEVPRIVATE &&
5090		     cmd <= SIOCDEVPRIVATE + 15)) {
5091			dev_load(net, ifr.ifr_name);
5092			rtnl_lock();
5093			ret = dev_ifsioc(net, &ifr, cmd);
5094			rtnl_unlock();
5095			if (!ret && copy_to_user(arg, &ifr,
5096						 sizeof(struct ifreq)))
5097				ret = -EFAULT;
5098			return ret;
5099		}
5100		/* Take care of Wireless Extensions */
5101		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5102			return wext_handle_ioctl(net, &ifr, cmd, arg);
5103		return -ENOTTY;
5104	}
5105}
5106
5107
5108/**
5109 *	dev_new_index	-	allocate an ifindex
5110 *	@net: the applicable net namespace
5111 *
5112 *	Returns a suitable unique value for a new device interface
5113 *	number.  The caller must hold the rtnl semaphore or the
5114 *	dev_base_lock to be sure it remains unique.
5115 */
5116static int dev_new_index(struct net *net)
5117{
5118	static int ifindex;
 
5119	for (;;) {
5120		if (++ifindex <= 0)
5121			ifindex = 1;
5122		if (!__dev_get_by_index(net, ifindex))
5123			return ifindex;
5124	}
5125}
5126
5127/* Delayed registration/unregisteration */
5128static LIST_HEAD(net_todo_list);
 
5129
5130static void net_set_todo(struct net_device *dev)
5131{
5132	list_add_tail(&dev->todo_list, &net_todo_list);
 
5133}
5134
5135static void rollback_registered_many(struct list_head *head)
5136{
5137	struct net_device *dev, *tmp;
 
5138
5139	BUG_ON(dev_boot_phase);
5140	ASSERT_RTNL();
5141
5142	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5143		/* Some devices call this without ever having been registered,
5144		 * to unwind a failed initialization. Remove those
5145		 * devices and proceed with the remaining.
5146		 */
5147		if (dev->reg_state == NETREG_UNINITIALIZED) {
5148			pr_debug("unregister_netdevice: device %s/%p never "
5149				 "was registered\n", dev->name, dev);
5150
5151			WARN_ON(1);
5152			list_del(&dev->unreg_list);
5153			continue;
5154		}
5155		dev->dismantle = true;
5156		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5157	}
5158
5159	/* If device is running, close it first. */
5160	dev_close_many(head);
 
 
5161
5162	list_for_each_entry(dev, head, unreg_list) {
5163		/* And unlink it from device chain. */
5164		unlist_netdevice(dev);
5165
5166		dev->reg_state = NETREG_UNREGISTERING;
5167	}
 
5168
5169	synchronize_net();
5170
5171	list_for_each_entry(dev, head, unreg_list) {
 
 
5172		/* Shutdown queueing discipline. */
5173		dev_shutdown(dev);
5174
 
5175
5176		/* Notify protocols that we are about to destroy
5177		   this device. They should clean all the things.
5178		*/
5179		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5180
5181		if (!dev->rtnl_link_ops ||
5182		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5183			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
 
5184
5185		/*
5186		 *	Flush the unicast and multicast chains
5187		 */
5188		dev_uc_flush(dev);
5189		dev_mc_flush(dev);
5190
 
 
 
5191		if (dev->netdev_ops->ndo_uninit)
5192			dev->netdev_ops->ndo_uninit(dev);
5193
5194		/* Notifier chain MUST detach us from master device. */
5195		WARN_ON(dev->master);
 
 
 
 
5196
5197		/* Remove entries from kobject tree */
5198		netdev_unregister_kobject(dev);
 
 
 
 
5199	}
5200
5201	/* Process any work delayed until the end of the batch */
5202	dev = list_first_entry(head, struct net_device, unreg_list);
5203	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5204
5205	rcu_barrier();
5206
5207	list_for_each_entry(dev, head, unreg_list)
5208		dev_put(dev);
5209}
5210
5211static void rollback_registered(struct net_device *dev)
5212{
5213	LIST_HEAD(single);
5214
5215	list_add(&dev->unreg_list, &single);
5216	rollback_registered_many(&single);
5217	list_del(&single);
5218}
5219
5220static u32 netdev_fix_features(struct net_device *dev, u32 features)
5221{
5222	/* Fix illegal checksum combinations */
5223	if ((features & NETIF_F_HW_CSUM) &&
5224	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5225		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5226		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5227	}
5228
5229	if ((features & NETIF_F_NO_CSUM) &&
5230	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5231		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5232		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5233	}
5234
5235	/* Fix illegal SG+CSUM combinations. */
5236	if ((features & NETIF_F_SG) &&
5237	    !(features & NETIF_F_ALL_CSUM)) {
5238		netdev_dbg(dev,
5239			"Dropping NETIF_F_SG since no checksum feature.\n");
5240		features &= ~NETIF_F_SG;
5241	}
5242
5243	/* TSO requires that SG is present as well. */
5244	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5245		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5246		features &= ~NETIF_F_ALL_TSO;
5247	}
5248
5249	/* TSO ECN requires that TSO is present as well. */
5250	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5251		features &= ~NETIF_F_TSO_ECN;
5252
5253	/* Software GSO depends on SG. */
5254	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5255		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5256		features &= ~NETIF_F_GSO;
5257	}
5258
5259	/* UFO needs SG and checksumming */
5260	if (features & NETIF_F_UFO) {
5261		/* maybe split UFO into V4 and V6? */
5262		if (!((features & NETIF_F_GEN_CSUM) ||
5263		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5264			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5265			netdev_dbg(dev,
5266				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5267			features &= ~NETIF_F_UFO;
5268		}
5269
5270		if (!(features & NETIF_F_SG)) {
5271			netdev_dbg(dev,
5272				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5273			features &= ~NETIF_F_UFO;
5274		}
5275	}
5276
5277	return features;
5278}
5279
5280int __netdev_update_features(struct net_device *dev)
5281{
5282	u32 features;
5283	int err = 0;
 
 
5284
5285	ASSERT_RTNL();
5286
5287	features = netdev_get_wanted_features(dev);
5288
5289	if (dev->netdev_ops->ndo_fix_features)
5290		features = dev->netdev_ops->ndo_fix_features(dev, features);
5291
5292	/* driver might be less strict about feature dependencies */
5293	features = netdev_fix_features(dev, features);
5294
 
 
 
 
5295	if (dev->features == features)
5296		return 0;
5297
5298	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5299		dev->features, features);
5300
5301	if (dev->netdev_ops->ndo_set_features)
5302		err = dev->netdev_ops->ndo_set_features(dev, features);
 
 
5303
5304	if (unlikely(err < 0)) {
5305		netdev_err(dev,
5306			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5307			err, features, dev->features);
 
 
 
5308		return -1;
5309	}
5310
5311	if (!err)
5312		dev->features = features;
 
5313
5314	return 1;
5315}
5316
5317/**
5318 *	netdev_update_features - recalculate device features
5319 *	@dev: the device to check
5320 *
5321 *	Recalculate dev->features set and send notifications if it
5322 *	has changed. Should be called after driver or hardware dependent
5323 *	conditions might have changed that influence the features.
5324 */
5325void netdev_update_features(struct net_device *dev)
5326{
5327	if (__netdev_update_features(dev))
5328		netdev_features_change(dev);
5329}
5330EXPORT_SYMBOL(netdev_update_features);
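/*
 * A minimal sketch: a driver whose offload capabilities depend on runtime
 * state (firmware mode, MTU, ...) re-runs the feature negotiation after
 * that state changes, with rtnl already held; the priv->csum_capable flag
 * is hypothetical.
 *
 *	priv->csum_capable = fw_supports_checksumming;
 *	netdev_update_features(netdev);		// re-runs ndo_fix_features()
 */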
5331
5332/**
5333 *	netdev_change_features - recalculate device features
5334 *	@dev: the device to check
5335 *
5336 *	Recalculate dev->features set and send notifications even
5337 *	if they have not changed. Should be called instead of
5338 *	netdev_update_features() if also dev->vlan_features might
5339 *	have changed to allow the changes to be propagated to stacked
5340 *	VLAN devices.
5341 */
5342void netdev_change_features(struct net_device *dev)
5343{
5344	__netdev_update_features(dev);
5345	netdev_features_change(dev);
5346}
5347EXPORT_SYMBOL(netdev_change_features);
5348
5349/**
5350 *	netif_stacked_transfer_operstate -	transfer operstate
5351 *	@rootdev: the root or lower level device to transfer state from
5352 *	@dev: the device to transfer operstate to
5353 *
5354 *	Transfer operational state from root to device. This is normally
5355 *	called when a stacking relationship exists between the root
5356 *	device and the device(a leaf device).
5357 */
5358void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5359					struct net_device *dev)
5360{
5361	if (rootdev->operstate == IF_OPER_DORMANT)
5362		netif_dormant_on(dev);
5363	else
5364		netif_dormant_off(dev);
5365
5366	if (netif_carrier_ok(rootdev)) {
5367		if (!netif_carrier_ok(dev))
5368			netif_carrier_on(dev);
5369	} else {
5370		if (netif_carrier_ok(dev))
5371			netif_carrier_off(dev);
5372	}
 
 
5373}
5374EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5375
5376#ifdef CONFIG_RPS
5377static int netif_alloc_rx_queues(struct net_device *dev)
5378{
5379	unsigned int i, count = dev->num_rx_queues;
5380	struct netdev_rx_queue *rx;
 
 
5381
5382	BUG_ON(count < 1);
5383
5384	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5385	if (!rx) {
5386		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5387		return -ENOMEM;
5388	}
5389	dev->_rx = rx;
5390
5391	for (i = 0; i < count; i++)
5392		rx[i].dev = dev;
5393	return 0;
5394}
5395#endif
5396
5397static void netdev_init_one_queue(struct net_device *dev,
5398				  struct netdev_queue *queue, void *_unused)
5399{
5400	/* Initialize queue lock */
5401	spin_lock_init(&queue->_xmit_lock);
5402	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5403	queue->xmit_lock_owner = -1;
5404	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5405	queue->dev = dev;
5406}
5407
5408static int netif_alloc_netdev_queues(struct net_device *dev)
5409{
5410	unsigned int count = dev->num_tx_queues;
5411	struct netdev_queue *tx;
 
5412
5413	BUG_ON(count < 1);
 
5414
5415	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5416	if (!tx) {
5417		pr_err("netdev: Unable to allocate %u tx queues.\n",
5418		       count);
5419		return -ENOMEM;
5420	}
5421	dev->_tx = tx;
5422
5423	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5424	spin_lock_init(&dev->tx_global_lock);
5425
5426	return 0;
5427}
5428
5429/**
5430 *	register_netdevice	- register a network device
5431 *	@dev: device to register
5432 *
5433 *	Take a completed network device structure and add it to the kernel
5434 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5435 *	chain. 0 is returned on success. A negative errno code is returned
5436 *	on a failure to set up the device, or if the name is a duplicate.
5437 *
5438 *	Callers must hold the rtnl semaphore. You may want
5439 *	register_netdev() instead of this.
5440 *
5441 *	BUGS:
5442 *	The locking appears insufficient to guarantee two parallel registers
5443 *	will not get the same name.
5444 */
5445
5446int register_netdevice(struct net_device *dev)
5447{
5448	int ret;
5449	struct net *net = dev_net(dev);
5450
 
 
5451	BUG_ON(dev_boot_phase);
5452	ASSERT_RTNL();
5453
5454	might_sleep();
5455
5456	/* When net_devices are persistent, this will be fatal. */
5457	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5458	BUG_ON(!net);
5459
 
 
 
 
5460	spin_lock_init(&dev->addr_list_lock);
5461	netdev_set_addr_lockdep_class(dev);
5462
5463	dev->iflink = -1;
5464
5465	ret = dev_get_valid_name(dev, dev->name);
5466	if (ret < 0)
5467		goto out;
5468
5469	/* Init, if this function is available */
5470	if (dev->netdev_ops->ndo_init) {
5471		ret = dev->netdev_ops->ndo_init(dev);
5472		if (ret) {
5473			if (ret > 0)
5474				ret = -EIO;
5475			goto out;
5476		}
5477	}
5478
5479	dev->ifindex = dev_new_index(net);
5480	if (dev->iflink == -1)
5481		dev->iflink = dev->ifindex;
5482
5483	/* Transfer changeable features to wanted_features and enable
5484	 * software offloads (GSO and GRO).
5485	 */
5486	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5487	dev->features |= NETIF_F_SOFT_FEATURES;
5488	dev->wanted_features = dev->features & dev->hw_features;
5489
5490	/* Turn on no cache copy if HW is doing checksum */
5491	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5492	if ((dev->features & NETIF_F_ALL_CSUM) &&
5493	    !(dev->features & NETIF_F_NO_CSUM)) {
5494		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5495		dev->features |= NETIF_F_NOCACHE_COPY;
5496	}
5497
5498	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5499	 */
5500	dev->vlan_features |= NETIF_F_HIGHDMA;
5501
5502	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5503	ret = notifier_to_errno(ret);
5504	if (ret)
5505		goto err_uninit;
5506
5507	ret = netdev_register_kobject(dev);
5508	if (ret)
 
5509		goto err_uninit;
 
5510	dev->reg_state = NETREG_REGISTERED;
5511
5512	__netdev_update_features(dev);
5513
5514	/*
5515	 *	Default initial state at registry is that the
5516	 *	device is present.
5517	 */
5518
5519	set_bit(__LINK_STATE_PRESENT, &dev->state);
5520
 
 
5521	dev_init_scheduler(dev);
5522	dev_hold(dev);
5523	list_netdevice(dev);
5524
5525	/* Notify protocols that a new device appeared. */
5526	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5527	ret = notifier_to_errno(ret);
5528	if (ret) {
5529		rollback_registered(dev);
 
 
5530		dev->reg_state = NETREG_UNREGISTERED;
5531	}
5532	/*
5533	 *	Prevent userspace races by waiting until the network
5534	 *	device is fully set up before sending notifications.
5535	 */
5536	if (!dev->rtnl_link_ops ||
5537	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5538		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5539
5540out:
5541	return ret;
5542
5543err_uninit:
5544	if (dev->netdev_ops->ndo_uninit)
5545		dev->netdev_ops->ndo_uninit(dev);
5546	goto out;
5547}
5548EXPORT_SYMBOL(register_netdevice);
5549
5550/**
5551 *	init_dummy_netdev	- init a dummy network device for NAPI
5552 *	@dev: device to init
5553 *
5554 *	This takes a network device structure and initializes the minimum
5555 *	number of fields so it can be used to schedule NAPI polls without
5556 *	registering a full blown interface. This is to be used by drivers
5557 *	that need to tie several hardware interfaces to a single NAPI
5558 *	poll scheduler due to HW limitations.
5559 */
5560int init_dummy_netdev(struct net_device *dev)
5561{
5562	/* Clear everything. Note we don't initialize spinlocks
5563	 * as they aren't supposed to be taken by any of the
5564	 * NAPI code and this dummy netdev is supposed to be
5565	 * only ever used for NAPI polls
5566	 */
5567	memset(dev, 0, sizeof(struct net_device));
5568
5569	/* make sure we BUG if trying to hit standard
5570	 * register/unregister code path
5571	 */
5572	dev->reg_state = NETREG_DUMMY;
5573
5574	/* NAPI wants this */
5575	INIT_LIST_HEAD(&dev->napi_list);
5576
5577	/* a dummy interface is started by default */
5578	set_bit(__LINK_STATE_PRESENT, &dev->state);
5579	set_bit(__LINK_STATE_START, &dev->state);
5580
5581	/* Note : We don't allocate pcpu_refcnt for dummy devices,
5582	 * because users of this 'device' don't need to change
5583	 * its refcount.
5584	 */
5585
5586	return 0;
5587}
5588EXPORT_SYMBOL_GPL(init_dummy_netdev);
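/*
 * Editorial illustration, not part of dev.c: a plausible sketch of how a
 * driver with several hardware channels but no per-channel net_device can
 * host its NAPI context on a dummy netdev initialised with
 * init_dummy_netdev(). All "my_*" identifiers are hypothetical; assumes
 * <linux/netdevice.h>.
 */
struct my_channel {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static int my_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	/* ... process up to @budget packets, counting them in work ... */
	if (work < budget)
		napi_complete(napi);
	return work;
}

static void my_channel_init(struct my_channel *ch)
{
	init_dummy_netdev(&ch->napi_dev);
	netif_napi_add(&ch->napi_dev, &ch->napi, my_poll, 64);
	napi_enable(&ch->napi);
}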
5589
5590
5591/**
5592 *	register_netdev	- register a network device
5593 *	@dev: device to register
5594 *
5595 *	Take a completed network device structure and add it to the kernel
5596 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5597 *	chain. 0 is returned on success. A negative errno code is returned
5598 *	on a failure to set up the device, or if the name is a duplicate.
5599 *
5600 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5601 *	and expands the device name if you passed a format string to
5602 *	alloc_netdev.
5603 */
5604int register_netdev(struct net_device *dev)
5605{
5606	int err;
5607
5608	rtnl_lock();
5609	err = register_netdevice(dev);
5610	rtnl_unlock();
5611	return err;
5612}
5613EXPORT_SYMBOL(register_netdev);
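/*
 * Editorial illustration, not part of dev.c: the usual pairing of
 * alloc_netdev()/register_netdev() on probe with unregister_netdev()/
 * free_netdev() on remove, for a kernel of this vintage (three-argument
 * alloc_netdev()). "my_probe"/"my_remove" and the "myeth%d" name are
 * hypothetical; assumes <linux/etherdevice.h> for ether_setup().
 */
static struct net_device *my_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(0, "myeth%d", ether_setup);
	if (!dev)
		return NULL;

	err = register_netdev(dev);	/* takes and drops the RTNL for us */
	if (err) {
		free_netdev(dev);
		return NULL;
	}
	return dev;
}

static void my_remove(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}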
5614
5615int netdev_refcnt_read(const struct net_device *dev)
5616{
5617	int i, refcnt = 0;
5618
5619	for_each_possible_cpu(i)
5620		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5621	return refcnt;
5622}
5623EXPORT_SYMBOL(netdev_refcnt_read);
5624
5625/*
5626 * netdev_wait_allrefs - wait until all references are gone.
5627 *
5628 * This is called when unregistering network devices.
5629 *
5630 * Any protocol or device that holds a reference should register
5631 * for netdevice notification, and cleanup and put back the
5632 * reference if they receive an UNREGISTER event.
5633 * We can get stuck here if buggy protocols don't correctly
5634 * call dev_put.
5635 */
5636static void netdev_wait_allrefs(struct net_device *dev)
5637{
5638	unsigned long rebroadcast_time, warning_time;
5639	int refcnt;
5640
5641	linkwatch_forget_dev(dev);
5642
5643	rebroadcast_time = warning_time = jiffies;
5644	refcnt = netdev_refcnt_read(dev);
5645
5646	while (refcnt != 0) {
5647		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5648			rtnl_lock();
5649
5650			/* Rebroadcast unregister notification */
5651			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5652			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5653			 * should have already handled it the first time */
5654
5655			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5656				     &dev->state)) {
5657				/* We must not have linkwatch events
5658				 * pending on unregister. If this
5659				 * happens, we simply run the queue
5660				 * unscheduled, resulting in a noop
5661				 * for this device.
5662				 */
5663				linkwatch_run_queue();
5664			}
5665
5666			__rtnl_unlock();
5667
5668			rebroadcast_time = jiffies;
5669		}
5670
5671		msleep(250);
5672
5673		refcnt = netdev_refcnt_read(dev);
5674
5675		if (time_after(jiffies, warning_time + 10 * HZ)) {
5676			printk(KERN_EMERG "unregister_netdevice: "
5677			       "waiting for %s to become free. Usage "
5678			       "count = %d\n",
5679			       dev->name, refcnt);
5680			warning_time = jiffies;
5681		}
5682	}
5683}
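/*
 * Editorial illustration, not part of dev.c: a subsystem that caches a
 * dev_hold()ed pointer and drops it from its netdevice notifier on
 * NETDEV_UNREGISTER, so that netdev_wait_allrefs() above can finish.
 * "my_cached_dev"/"my_netdev_event" are hypothetical; on this kernel the
 * notifier's third argument is the struct net_device pointer itself.
 */
static struct net_device *my_cached_dev;

static int my_netdev_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && dev == my_cached_dev) {
		dev_put(my_cached_dev);
		my_cached_dev = NULL;
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_netdev_notifier = {
	.notifier_call = my_netdev_event,
};
/* registered elsewhere with register_netdevice_notifier(&my_netdev_notifier) */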
5684
5685/* The sequence is:
5686 *
5687 *	rtnl_lock();
5688 *	...
5689 *	register_netdevice(x1);
5690 *	register_netdevice(x2);
5691 *	...
5692 *	unregister_netdevice(y1);
5693 *	unregister_netdevice(y2);
5694 *      ...
5695 *	rtnl_unlock();
5696 *	free_netdev(y1);
5697 *	free_netdev(y2);
5698 *
5699 * We are invoked by rtnl_unlock().
5700 * This allows us to deal with problems:
5701 * 1) We can delete sysfs objects which invoke hotplug
5702 *    without deadlocking with linkwatch via keventd.
5703 * 2) Since we run with the RTNL semaphore not held, we can sleep
5704 *    safely in order to wait for the netdev refcnt to drop to zero.
5705 *
5706 * We must not return until all unregister events added during
5707 * the interval the lock was held have been completed.
5708 */
5709void netdev_run_todo(void)
5710{
5711	struct list_head list;
5712
5713	/* Snapshot list, allow later requests */
5714	list_replace_init(&net_todo_list, &list);
5715
5716	__rtnl_unlock();
5717
5718	while (!list_empty(&list)) {
5719		struct net_device *dev
5720			= list_first_entry(&list, struct net_device, todo_list);
5721		list_del(&dev->todo_list);
5722
5723		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5724			printk(KERN_ERR "network todo '%s' but state %d\n",
5725			       dev->name, dev->reg_state);
5726			dump_stack();
5727			continue;
5728		}
5729
5730		dev->reg_state = NETREG_UNREGISTERED;
5731
5732		on_each_cpu(flush_backlog, dev, 1);
5733
5734		netdev_wait_allrefs(dev);
5735
5736		/* paranoia */
5737		BUG_ON(netdev_refcnt_read(dev));
5738		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5739		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5740		WARN_ON(dev->dn_ptr);
5741
5742		if (dev->destructor)
5743			dev->destructor(dev);
5744
5745		/* Free network device */
5746		kobject_put(&dev->dev.kobj);
5747	}
5748}
5749
5750/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5751 * fields in the same order, with only the type differing.
5752 */
5753static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5754				    const struct net_device_stats *netdev_stats)
5755{
5756#if BITS_PER_LONG == 64
5757	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5758	memcpy(stats64, netdev_stats, sizeof(*stats64));
5759#else
5760	size_t i, n = sizeof(*stats64) / sizeof(u64);
5761	const unsigned long *src = (const unsigned long *)netdev_stats;
5762	u64 *dst = (u64 *)stats64;
5763
5764	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5765		     sizeof(*stats64) / sizeof(u64));
5766	for (i = 0; i < n; i++)
5767		dst[i] = src[i];
5768#endif
5769}
5770
5771/**
5772 *	dev_get_stats	- get network device statistics
5773 *	@dev: device to get statistics from
5774 *	@storage: place to store stats
5775 *
5776 *	Get network statistics from device. Return @storage.
5777 *	The device driver may provide its own method by setting
5778 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5779 *	otherwise the internal statistics structure is used.
5780 */
5781struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5782					struct rtnl_link_stats64 *storage)
5783{
5784	const struct net_device_ops *ops = dev->netdev_ops;
5785
5786	if (ops->ndo_get_stats64) {
5787		memset(storage, 0, sizeof(*storage));
5788		ops->ndo_get_stats64(dev, storage);
5789	} else if (ops->ndo_get_stats) {
5790		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5791	} else {
5792		netdev_stats_to_stats64(storage, &dev->stats);
5793	}
5794	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5795	return storage;
5796}
5797EXPORT_SYMBOL(dev_get_stats);
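/*
 * Editorial illustration, not part of dev.c: reading a device's 64-bit
 * statistics into a caller-provided buffer under RCU, much as
 * /proc/net/dev does. "my_print_rx_packets" is hypothetical.
 */
static void my_print_rx_packets(struct net *net, const char *name)
{
	struct rtnl_link_stats64 storage;
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		pr_info("%s: %llu packets received\n", name,
			(unsigned long long)
			dev_get_stats(dev, &storage)->rx_packets);
	rcu_read_unlock();
}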
5798
5799struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5800{
5801	struct netdev_queue *queue = dev_ingress_queue(dev);
5802
5803#ifdef CONFIG_NET_CLS_ACT
5804	if (queue)
5805		return queue;
5806	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5807	if (!queue)
5808		return NULL;
5809	netdev_init_one_queue(dev, queue, NULL);
5810	queue->qdisc = &noop_qdisc;
5811	queue->qdisc_sleeping = &noop_qdisc;
5812	rcu_assign_pointer(dev->ingress_queue, queue);
5813#endif
5814	return queue;
5815}
5816
5817/**
5818 *	alloc_netdev_mqs - allocate network device
5819 *	@sizeof_priv:	size of private data to allocate space for
5820 *	@name:		device name format string
5821 *	@setup:		callback to initialize device
5822 *	@txqs:		the number of TX subqueues to allocate
5823 *	@rxqs:		the number of RX subqueues to allocate
5824 *
5825 *	Allocates a struct net_device with private data area for driver use
5826 *	and performs basic initialization.  Also allocates subqueue structs
5827 *	for each queue on the device.
5828 */
5829struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5830		void (*setup)(struct net_device *),
5831		unsigned int txqs, unsigned int rxqs)
5832{
5833	struct net_device *dev;
5834	size_t alloc_size;
5835	struct net_device *p;
5836
5837	BUG_ON(strlen(name) >= sizeof(dev->name));
5838
5839	if (txqs < 1) {
5840		pr_err("alloc_netdev: Unable to allocate device "
5841		       "with zero queues.\n");
5842		return NULL;
5843	}
5844
5845#ifdef CONFIG_RPS
5846	if (rxqs < 1) {
5847		pr_err("alloc_netdev: Unable to allocate device "
5848		       "with zero RX queues.\n");
5849		return NULL;
5850	}
5851#endif
5852
5853	alloc_size = sizeof(struct net_device);
5854	if (sizeof_priv) {
5855		/* ensure 32-byte alignment of private area */
5856		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5857		alloc_size += sizeof_priv;
5858	}
5859	/* ensure 32-byte alignment of whole construct */
5860	alloc_size += NETDEV_ALIGN - 1;
5861
5862	p = kzalloc(alloc_size, GFP_KERNEL);
5863	if (!p) {
5864		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5865		return NULL;
5866	}
5867
5868	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5869	dev->padded = (char *)dev - (char *)p;
5870
5871	dev->pcpu_refcnt = alloc_percpu(int);
5872	if (!dev->pcpu_refcnt)
5873		goto free_p;
5874
5875	if (dev_addr_init(dev))
5876		goto free_pcpu;
5877
5878	dev_mc_init(dev);
5879	dev_uc_init(dev);
5880
5881	dev_net_set(dev, &init_net);
5882
5883	dev->gso_max_size = GSO_MAX_SIZE;
5884
5885	INIT_LIST_HEAD(&dev->napi_list);
5886	INIT_LIST_HEAD(&dev->unreg_list);
5887	INIT_LIST_HEAD(&dev->link_watch_list);
5888	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5889	setup(dev);
5890
5891	dev->num_tx_queues = txqs;
5892	dev->real_num_tx_queues = txqs;
5893	if (netif_alloc_netdev_queues(dev))
5894		goto free_all;
5895
5896#ifdef CONFIG_RPS
5897	dev->num_rx_queues = rxqs;
5898	dev->real_num_rx_queues = rxqs;
5899	if (netif_alloc_rx_queues(dev))
5900		goto free_all;
5901#endif
5902
5903	strcpy(dev->name, name);
5904	dev->group = INIT_NETDEV_GROUP;
5905	return dev;
5906
5907free_all:
5908	free_netdev(dev);
5909	return NULL;
5910
5911free_pcpu:
5912	free_percpu(dev->pcpu_refcnt);
5913	kfree(dev->_tx);
5914#ifdef CONFIG_RPS
5915	kfree(dev->_rx);
5916#endif
5917
5918free_p:
5919	kfree(p);
5920	return NULL;
5921}
5922EXPORT_SYMBOL(alloc_netdev_mqs);
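/*
 * Editorial illustration, not part of dev.c: allocating a multiqueue
 * device with driver-private data and distinct TX/RX queue counts.
 * "struct my_priv"/"my_alloc" and the queue counts are hypothetical;
 * assumes <linux/etherdevice.h> for ether_setup().
 */
struct my_priv {
	spinlock_t lock;
};

static struct net_device *my_alloc(void)
{
	struct net_device *dev;
	struct my_priv *priv;

	/* 8 TX queues, 4 RX queues, Ethernet-style setup callback */
	dev = alloc_netdev_mqs(sizeof(struct my_priv), "myeth%d",
			       ether_setup, 8, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	spin_lock_init(&priv->lock);
	return dev;
}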
5923
5924/**
5925 *	free_netdev - free network device
5926 *	@dev: device
5927 *
5928 *	This function does the last stage of destroying an allocated device
5929 * 	interface. The reference to the device object is released.
5930 *	If this is the last reference then it will be freed.
5931 */
5932void free_netdev(struct net_device *dev)
5933{
5934	struct napi_struct *p, *n;
5935
5936	release_net(dev_net(dev));
5937
5938	kfree(dev->_tx);
5939#ifdef CONFIG_RPS
5940	kfree(dev->_rx);
5941#endif
5942
5943	kfree(rcu_dereference_raw(dev->ingress_queue));
5944
5945	/* Flush device addresses */
5946	dev_addr_flush(dev);
5947
5948	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5949		netif_napi_del(p);
5950
5951	free_percpu(dev->pcpu_refcnt);
5952	dev->pcpu_refcnt = NULL;
5953
5954	/*  Compatibility with error handling in drivers */
5955	if (dev->reg_state == NETREG_UNINITIALIZED) {
5956		kfree((char *)dev - dev->padded);
5957		return;
5958	}
5959
5960	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5961	dev->reg_state = NETREG_RELEASED;
5962
5963	/* will free via device release */
5964	put_device(&dev->dev);
5965}
5966EXPORT_SYMBOL(free_netdev);
5967
5968/**
5969 *	synchronize_net -  Synchronize with packet receive processing
5970 *
5971 *	Wait for packets currently being received to be done.
5972 *	Does not block later packets from starting.
5973 */
5974void synchronize_net(void)
5975{
5976	might_sleep();
5977	if (rtnl_is_locked())
5978		synchronize_rcu_expedited();
5979	else
5980		synchronize_rcu();
5981}
5982EXPORT_SYMBOL(synchronize_net);
5983
5984/**
5985 *	unregister_netdevice_queue - remove device from the kernel
5986 *	@dev: device
5987 *	@head: list
5988 *
5989 *	This function shuts down a device interface and removes it
5990 *	from the kernel tables.
5991 *	If @head is not NULL, the device is queued to be unregistered later.
5992 *
5993 *	Callers must hold the rtnl semaphore.  You may want
5994 *	unregister_netdev() instead of this.
5995 */
5996
5997void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5998{
5999	ASSERT_RTNL();
6000
6001	if (head) {
6002		list_move_tail(&dev->unreg_list, head);
6003	} else {
6004		rollback_registered(dev);
6005		/* Finish processing unregister after unlock */
6006		net_set_todo(dev);
6007	}
6008}
6009EXPORT_SYMBOL(unregister_netdevice_queue);
6010
6011/**
6012 *	unregister_netdevice_many - unregister many devices
6013 *	@head: list of devices
6014 */
6015void unregister_netdevice_many(struct list_head *head)
6016{
6017	struct net_device *dev;
6018
6019	if (!list_empty(head)) {
6020		rollback_registered_many(head);
6021		list_for_each_entry(dev, head, unreg_list)
6022			net_set_todo(dev);
6023	}
6024}
6025EXPORT_SYMBOL(unregister_netdevice_many);
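/*
 * Editorial illustration, not part of dev.c: tearing down several devices
 * under a single RTNL hold, which lets rollback_registered_many() batch
 * the expensive RCU grace periods. "my_destroy_group" is hypothetical;
 * queueing onto the kill list does not unlink devices, so plain
 * for_each_netdev() iteration remains safe here.
 */
static void my_destroy_group(struct net *net, int group)
{
	struct net_device *dev;
	LIST_HEAD(kill_list);

	rtnl_lock();
	for_each_netdev(net, dev)
		if (dev->group == group)
			unregister_netdevice_queue(dev, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}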
6026
6027/**
6028 *	unregister_netdev - remove device from the kernel
6029 *	@dev: device
6030 *
6031 *	This function shuts down a device interface and removes it
6032 *	from the kernel tables.
6033 *
6034 *	This is just a wrapper for unregister_netdevice that takes
6035 *	the rtnl semaphore.  In general you want to use this and not
6036 *	unregister_netdevice.
6037 */
6038void unregister_netdev(struct net_device *dev)
6039{
6040	rtnl_lock();
6041	unregister_netdevice(dev);
6042	rtnl_unlock();
6043}
6044EXPORT_SYMBOL(unregister_netdev);
6045
6046/**
6047 *	dev_change_net_namespace - move device to a different network namespace
6048 *	@dev: device
6049 *	@net: network namespace
6050 *	@pat: If not NULL name pattern to try if the current device name
6051 *	      is already taken in the destination network namespace.
6052 *
6053 *	This function shuts down a device interface and moves it
6054 *	to a new network namespace. On success 0 is returned, on
6055 *	a failure a negative errno code is returned.
6056 *
6057 *	Callers must hold the rtnl semaphore.
6058 */
6059
6060int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6061{
6062	int err;
6063
6064	ASSERT_RTNL();
6065
6066	/* Don't allow namespace local devices to be moved. */
6067	err = -EINVAL;
6068	if (dev->features & NETIF_F_NETNS_LOCAL)
6069		goto out;
6070
6071	/* Ensure the device has been registered */
6072	err = -EINVAL;
6073	if (dev->reg_state != NETREG_REGISTERED)
6074		goto out;
6075
6076	/* Get out if there is nothing to do */
6077	err = 0;
6078	if (net_eq(dev_net(dev), net))
6079		goto out;
6080
6081	/* Pick the destination device name, and ensure
6082	 * we can use it in the destination network namespace.
6083	 */
6084	err = -EEXIST;
6085	if (__dev_get_by_name(net, dev->name)) {
6086		/* We get here if we can't use the current device name */
6087		if (!pat)
6088			goto out;
6089		if (dev_get_valid_name(dev, pat) < 0)
6090			goto out;
6091	}
6092
6093	/*
6094	 * And now a mini version of register_netdevice and unregister_netdevice.
6095	 */
6096
6097	/* If device is running close it first. */
6098	dev_close(dev);
6099
6100	/* And unlink it from device chain */
6101	err = -ENODEV;
6102	unlist_netdevice(dev);
6103
6104	synchronize_net();
6105
6106	/* Shutdown queueing discipline. */
6107	dev_shutdown(dev);
6108
6109	/* Notify protocols that we are about to destroy
6110	   this device. They should clean all the things.
6111
6112	   Note that dev->reg_state stays at NETREG_REGISTERED.
6113	   This is wanted because this way 8021q and macvlan know
6114	   the device is just moving and can keep their slaves up.
6115	*/
6116	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6117	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6118
6119	/*
6120	 *	Flush the unicast and multicast chains
6121	 */
6122	dev_uc_flush(dev);
6123	dev_mc_flush(dev);
6124
6125	/* Actually switch the network namespace */
6126	dev_net_set(dev, net);
6127
6128	/* If there is an ifindex conflict assign a new one */
6129	if (__dev_get_by_index(net, dev->ifindex)) {
6130		int iflink = (dev->iflink == dev->ifindex);
6131		dev->ifindex = dev_new_index(net);
6132		if (iflink)
6133			dev->iflink = dev->ifindex;
6134	}
6135
6136	/* Fixup kobjects */
6137	err = device_rename(&dev->dev, dev->name);
6138	WARN_ON(err);
6139
6140	/* Add the device back in the hashes */
6141	list_netdevice(dev);
6142
6143	/* Notify protocols that a new device appeared. */
6144	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6145
6146	/*
6147	 *	Prevent userspace races by waiting until the network
6148	 *	device is fully setup before sending notifications.
6149	 */
6150	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6151
6152	synchronize_net();
6153	err = 0;
6154out:
6155	return err;
6156}
6157EXPORT_SYMBOL_GPL(dev_change_net_namespace);
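/*
 * Editorial illustration, not part of dev.c: moving a device into another
 * namespace under the RTNL, with a "dev%d" fallback pattern in case the
 * current name is already taken there, mirroring the namespace cleanup
 * code below. "my_move_dev" is hypothetical.
 */
static int my_move_dev(struct net_device *dev, struct net *dst_net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, dst_net, "dev%d");
	rtnl_unlock();
	return err;
}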
6158
6159static int dev_cpu_callback(struct notifier_block *nfb,
6160			    unsigned long action,
6161			    void *ocpu)
6162{
6163	struct sk_buff **list_skb;
6164	struct sk_buff *skb;
6165	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6166	struct softnet_data *sd, *oldsd;
6167
6168	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6169		return NOTIFY_OK;
6170
6171	local_irq_disable();
6172	cpu = smp_processor_id();
6173	sd = &per_cpu(softnet_data, cpu);
6174	oldsd = &per_cpu(softnet_data, oldcpu);
6175
6176	/* Find end of our completion_queue. */
6177	list_skb = &sd->completion_queue;
6178	while (*list_skb)
6179		list_skb = &(*list_skb)->next;
6180	/* Append completion queue from offline CPU. */
6181	*list_skb = oldsd->completion_queue;
6182	oldsd->completion_queue = NULL;
6183
6184	/* Append output queue from offline CPU. */
6185	if (oldsd->output_queue) {
6186		*sd->output_queue_tailp = oldsd->output_queue;
6187		sd->output_queue_tailp = oldsd->output_queue_tailp;
6188		oldsd->output_queue = NULL;
6189		oldsd->output_queue_tailp = &oldsd->output_queue;
6190	}
6191	/* Append NAPI poll list from offline CPU. */
6192	if (!list_empty(&oldsd->poll_list)) {
6193		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6194		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6195	}
6196
6197	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6198	local_irq_enable();
6199
6200	/* Process offline CPU's input_pkt_queue */
6201	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6202		netif_rx(skb);
6203		input_queue_head_incr(oldsd);
6204	}
6205	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6206		netif_rx(skb);
6207		input_queue_head_incr(oldsd);
6208	}
6209
6210	return NOTIFY_OK;
6211}
6212
6213
6214/**
6215 *	netdev_increment_features - increment feature set by one
6216 *	@all: current feature set
6217 *	@one: new feature set
6218 *	@mask: mask feature set
6219 *
6220 *	Computes a new feature set after adding a device with feature set
6221 *	@one to the master device with current feature set @all.  Will not
6222 *	enable anything that is off in @mask. Returns the new feature set.
6223 */
6224u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6225{
6226	if (mask & NETIF_F_GEN_CSUM)
6227		mask |= NETIF_F_ALL_CSUM;
6228	mask |= NETIF_F_VLAN_CHALLENGED;
6229
6230	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6231	all &= one | ~NETIF_F_ALL_FOR_ALL;
6232
6233	/* If device needs checksumming, downgrade to it. */
6234	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6235		all &= ~NETIF_F_NO_CSUM;
6236
6237	/* If one device supports hw checksumming, set for all. */
6238	if (all & NETIF_F_GEN_CSUM)
6239		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6240
6241	return all;
6242}
6243EXPORT_SYMBOL(netdev_increment_features);
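/*
 * Editorial illustration, not part of dev.c: how a master device such as a
 * bond or bridge might recompute its feature set by folding each slave's
 * features in with netdev_increment_features(). The array-based slave list
 * and "my_compute_master_features" are hypothetical simplifications.
 */
static u32 my_compute_master_features(struct net_device *master,
				      struct net_device **slaves, int n)
{
	u32 features = master->vlan_features & NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     master->vlan_features);
	return features;
}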
6244
6245static struct hlist_head *netdev_create_hash(void)
6246{
6247	int i;
6248	struct hlist_head *hash;
6249
6250	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6251	if (hash != NULL)
6252		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6253			INIT_HLIST_HEAD(&hash[i]);
6254
6255	return hash;
6256}
6257
6258/* Initialize per network namespace state */
6259static int __net_init netdev_init(struct net *net)
6260{
6261	INIT_LIST_HEAD(&net->dev_base_head);
6262
6263	net->dev_name_head = netdev_create_hash();
6264	if (net->dev_name_head == NULL)
6265		goto err_name;
6266
6267	net->dev_index_head = netdev_create_hash();
6268	if (net->dev_index_head == NULL)
6269		goto err_idx;
6270
6271	return 0;
6272
6273err_idx:
6274	kfree(net->dev_name_head);
6275err_name:
6276	return -ENOMEM;
6277}
6278
6279/**
6280 *	netdev_drivername - network driver for the device
6281 *	@dev: network device
6282 *
6283 *	Determine network driver for device.
6284 */
6285const char *netdev_drivername(const struct net_device *dev)
6286{
6287	const struct device_driver *driver;
6288	const struct device *parent;
6289	const char *empty = "";
6290
6291	parent = dev->dev.parent;
6292	if (!parent)
6293		return empty;
6294
6295	driver = parent->driver;
6296	if (driver && driver->name)
6297		return driver->name;
6298	return empty;
6299}
6300
6301static int __netdev_printk(const char *level, const struct net_device *dev,
6302			   struct va_format *vaf)
6303{
6304	int r;
6305
6306	if (dev && dev->dev.parent)
6307		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6308			       netdev_name(dev), vaf);
6309	else if (dev)
6310		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6311	else
6312		r = printk("%s(NULL net_device): %pV", level, vaf);
6313
6314	return r;
6315}
6316
6317int netdev_printk(const char *level, const struct net_device *dev,
6318		  const char *format, ...)
6319{
6320	struct va_format vaf;
6321	va_list args;
6322	int r;
6323
6324	va_start(args, format);
6325
6326	vaf.fmt = format;
6327	vaf.va = &args;
6328
6329	r = __netdev_printk(level, dev, &vaf);
6330	va_end(args);
6331
6332	return r;
6333}
6334EXPORT_SYMBOL(netdev_printk);
6335
6336#define define_netdev_printk_level(func, level)			\
6337int func(const struct net_device *dev, const char *fmt, ...)	\
6338{								\
6339	int r;							\
6340	struct va_format vaf;					\
6341	va_list args;						\
6342								\
6343	va_start(args, fmt);					\
6344								\
6345	vaf.fmt = fmt;						\
6346	vaf.va = &args;						\
6347								\
6348	r = __netdev_printk(level, dev, &vaf);			\
6349	va_end(args);						\
6350								\
6351	return r;						\
6352}								\
6353EXPORT_SYMBOL(func);
6354
6355define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6356define_netdev_printk_level(netdev_alert, KERN_ALERT);
6357define_netdev_printk_level(netdev_crit, KERN_CRIT);
6358define_netdev_printk_level(netdev_err, KERN_ERR);
6359define_netdev_printk_level(netdev_warn, KERN_WARNING);
6360define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6361define_netdev_printk_level(netdev_info, KERN_INFO);
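/*
 * Editorial illustration, not part of dev.c: the level wrappers generated
 * above are used like printk(), but prefix the message with the driver and
 * device names. "my_link_change" and "speed" are hypothetical.
 */
static void my_link_change(struct net_device *dev, bool up, unsigned int speed)
{
	if (up)
		netdev_info(dev, "link up, %u Mbps\n", speed);
	else
		netdev_warn(dev, "link down\n");
}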
6362
6363static void __net_exit netdev_exit(struct net *net)
6364{
6365	kfree(net->dev_name_head);
6366	kfree(net->dev_index_head);
6367}
6368
6369static struct pernet_operations __net_initdata netdev_net_ops = {
6370	.init = netdev_init,
6371	.exit = netdev_exit,
6372};
6373
6374static void __net_exit default_device_exit(struct net *net)
6375{
6376	struct net_device *dev, *aux;
6377	/*
6378	 * Push all migratable network devices back to the
6379	 * initial network namespace
6380	 */
6381	rtnl_lock();
6382	for_each_netdev_safe(net, dev, aux) {
6383		int err;
6384		char fb_name[IFNAMSIZ];
6385
6386		/* Ignore unmovable devices (e.g. loopback) */
6387		if (dev->features & NETIF_F_NETNS_LOCAL)
6388			continue;
6389
6390		/* Leave virtual devices for the generic cleanup */
6391		if (dev->rtnl_link_ops)
6392			continue;
6393
6394		/* Push remaining network devices to init_net */
6395		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6396		err = dev_change_net_namespace(dev, &init_net, fb_name);
6397		if (err) {
6398			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6399				__func__, dev->name, err);
6400			BUG();
6401		}
6402	}
6403	rtnl_unlock();
6404}
6405
6406static void __net_exit default_device_exit_batch(struct list_head *net_list)
6407{
6408	/* At exit all network devices must be removed from a network
6409	 * namespace.  Do this in the reverse order of registration.
6410	 * Do this across as many network namespaces as possible to
6411	 * improve batching efficiency.
6412	 */
6413	struct net_device *dev;
6414	struct net *net;
6415	LIST_HEAD(dev_kill_list);
6416
6417	rtnl_lock();
6418	list_for_each_entry(net, net_list, exit_list) {
6419		for_each_netdev_reverse(net, dev) {
6420			if (dev->rtnl_link_ops)
6421				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6422			else
6423				unregister_netdevice_queue(dev, &dev_kill_list);
6424		}
6425	}
6426	unregister_netdevice_many(&dev_kill_list);
6427	list_del(&dev_kill_list);
6428	rtnl_unlock();
6429}
6430
6431static struct pernet_operations __net_initdata default_device_ops = {
6432	.exit = default_device_exit,
6433	.exit_batch = default_device_exit_batch,
6434};
6435
6436/*
6437 *	Initialize the DEV module. At boot time this walks the device list and
6438 *	unhooks any devices that fail to initialise (normally hardware not
6439 *	present) and leaves us with a valid list of present and active devices.
6440 *
6441 */
6442
6443/*
6444 *       This is called single threaded during boot, so no need
6445 *       to take the rtnl semaphore.
6446 */
6447static int __init net_dev_init(void)
6448{
6449	int i, rc = -ENOMEM;
6450
6451	BUG_ON(!dev_boot_phase);
6452
6453	if (dev_proc_init())
6454		goto out;
6455
6456	if (netdev_kobject_init())
6457		goto out;
6458
6459	INIT_LIST_HEAD(&ptype_all);
6460	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6461		INIT_LIST_HEAD(&ptype_base[i]);
6462
6463	if (register_pernet_subsys(&netdev_net_ops))
6464		goto out;
6465
6466	/*
6467	 *	Initialise the packet receive queues.
6468	 */
6469
6470	for_each_possible_cpu(i) {
6471		struct softnet_data *sd = &per_cpu(softnet_data, i);
6472
6473		memset(sd, 0, sizeof(*sd));
6474		skb_queue_head_init(&sd->input_pkt_queue);
6475		skb_queue_head_init(&sd->process_queue);
6476		sd->completion_queue = NULL;
6477		INIT_LIST_HEAD(&sd->poll_list);
6478		sd->output_queue = NULL;
6479		sd->output_queue_tailp = &sd->output_queue;
6480#ifdef CONFIG_RPS
6481		sd->csd.func = rps_trigger_softirq;
6482		sd->csd.info = sd;
6483		sd->csd.flags = 0;
6484		sd->cpu = i;
6485#endif
6486
6487		sd->backlog.poll = process_backlog;
6488		sd->backlog.weight = weight_p;
6489		sd->backlog.gro_list = NULL;
6490		sd->backlog.gro_count = 0;
6491	}
6492
6493	dev_boot_phase = 0;
6494
6495	/* The loopback device is special: if any other network device
6496	 * is present in a network namespace, the loopback device must
6497	 * be present too. Since we now dynamically allocate and free the
6498	 * loopback device, ensure this invariant is maintained by
6499	 * keeping the loopback device as the first device on the
6500	 * list of network devices, so that it is the first device
6501	 * that appears and the last network device
6502	 * that disappears.
6503	 */
6504	if (register_pernet_device(&loopback_net_ops))
6505		goto out;
6506
6507	if (register_pernet_device(&default_device_ops))
6508		goto out;
6509
6510	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6511	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6512
6513	hotcpu_notifier(dev_cpu_callback, 0);
6514	dst_init();
6515	dev_mcast_init();
6516	rc = 0;
6517out:
6518	return rc;
6519}
6520
6521subsys_initcall(net_dev_init);
6522
6523static int __init initialize_hashrnd(void)
6524{
6525	get_random_bytes(&hashrnd, sizeof(hashrnd));
6526	return 0;
6527}
6528
6529late_initcall_sync(initialize_hashrnd);
6530
v5.9
    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
   93#include <linux/skbuff.h>
   94#include <linux/bpf.h>
   95#include <linux/bpf_trace.h>
   96#include <net/net_namespace.h>
   97#include <net/sock.h>
   98#include <net/busy_poll.h>
   99#include <linux/rtnetlink.h>
  100#include <linux/stat.h>
  101#include <net/dst.h>
  102#include <net/dst_metadata.h>
  103#include <net/pkt_sched.h>
  104#include <net/pkt_cls.h>
  105#include <net/checksum.h>
  106#include <net/xfrm.h>
  107#include <linux/highmem.h>
  108#include <linux/init.h>
  109#include <linux/module.h>
  110#include <linux/netpoll.h>
  111#include <linux/rcupdate.h>
  112#include <linux/delay.h>
  113#include <net/iw_handler.h>
  114#include <asm/current.h>
  115#include <linux/audit.h>
  116#include <linux/dmaengine.h>
  117#include <linux/err.h>
  118#include <linux/ctype.h>
  119#include <linux/if_arp.h>
  120#include <linux/if_vlan.h>
  121#include <linux/ip.h>
  122#include <net/ip.h>
  123#include <net/mpls.h>
  124#include <linux/ipv6.h>
  125#include <linux/in.h>
  126#include <linux/jhash.h>
  127#include <linux/random.h>
  128#include <trace/events/napi.h>
  129#include <trace/events/net.h>
  130#include <trace/events/skb.h>
  131#include <linux/inetdevice.h>
  132#include <linux/cpu_rmap.h>
  133#include <linux/static_key.h>
  134#include <linux/hashtable.h>
  135#include <linux/vmalloc.h>
  136#include <linux/if_macvlan.h>
  137#include <linux/errqueue.h>
  138#include <linux/hrtimer.h>
  139#include <linux/netfilter_ingress.h>
  140#include <linux/crash_dump.h>
  141#include <linux/sctp.h>
  142#include <net/udp_tunnel.h>
  143#include <linux/net_namespace.h>
  144#include <linux/indirect_call_wrapper.h>
  145#include <net/devlink.h>
  146#include <linux/pm_runtime.h>
  147
  148#include "net-sysfs.h"
  149
  150#define MAX_GRO_SKBS 8
  151
  152/* This should be increased if a protocol with a bigger head is added. */
  153#define GRO_MAX_HEAD (MAX_HEADER + 128)
  154
  155static DEFINE_SPINLOCK(ptype_lock);
  156static DEFINE_SPINLOCK(offload_lock);
  157struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  158struct list_head ptype_all __read_mostly;	/* Taps */
  159static struct list_head offload_base __read_mostly;
  160
  161static int netif_rx_internal(struct sk_buff *skb);
  162static int call_netdevice_notifiers_info(unsigned long val,
  163					 struct netdev_notifier_info *info);
  164static int call_netdevice_notifiers_extack(unsigned long val,
  165					   struct net_device *dev,
  166					   struct netlink_ext_ack *extack);
  167static struct napi_struct *napi_by_id(unsigned int napi_id);
  168
  169/*
  170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  171 * semaphore.
  172 *
  173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  174 *
  175 * Writers must hold the rtnl semaphore while they loop through the
  176 * dev_base_head list, and hold dev_base_lock for writing when they do the
  177 * actual updates.  This allows pure readers to access the list even
  178 * while a writer is preparing to update it.
  179 *
  180 * To put it another way, dev_base_lock is held for writing only to
  181 * protect against pure readers; the rtnl semaphore provides the
  182 * protection against other writers.
  183 *
  184 * See, for example usages, register_netdevice() and
  185 * unregister_netdevice(), which must be called with the rtnl
  186 * semaphore held.
  187 */
  188DEFINE_RWLOCK(dev_base_lock);
  189EXPORT_SYMBOL(dev_base_lock);
  190
  191static DEFINE_MUTEX(ifalias_mutex);
  192
  193/* protects napi_hash addition/deletion and napi_gen_id */
  194static DEFINE_SPINLOCK(napi_hash_lock);
  195
  196static unsigned int napi_gen_id = NR_CPUS;
  197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  198
  199static DECLARE_RWSEM(devnet_rename_sem);
  200
  201static inline void dev_base_seq_inc(struct net *net)
  202{
  203	while (++net->dev_base_seq == 0)
  204		;
  205}
  206
  207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  208{
  209	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  210
  211	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  212}
  213
  214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  215{
  216	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  217}
  218
  219static inline void rps_lock(struct softnet_data *sd)
  220{
  221#ifdef CONFIG_RPS
  222	spin_lock(&sd->input_pkt_queue.lock);
  223#endif
  224}
  225
  226static inline void rps_unlock(struct softnet_data *sd)
  227{
  228#ifdef CONFIG_RPS
  229	spin_unlock(&sd->input_pkt_queue.lock);
  230#endif
  231}
  232
  233static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  234						       const char *name)
  235{
  236	struct netdev_name_node *name_node;
  237
  238	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  239	if (!name_node)
  240		return NULL;
  241	INIT_HLIST_NODE(&name_node->hlist);
  242	name_node->dev = dev;
  243	name_node->name = name;
  244	return name_node;
  245}
  246
  247static struct netdev_name_node *
  248netdev_name_node_head_alloc(struct net_device *dev)
  249{
  250	struct netdev_name_node *name_node;
  251
  252	name_node = netdev_name_node_alloc(dev, dev->name);
  253	if (!name_node)
  254		return NULL;
  255	INIT_LIST_HEAD(&name_node->list);
  256	return name_node;
  257}
  258
  259static void netdev_name_node_free(struct netdev_name_node *name_node)
  260{
  261	kfree(name_node);
  262}
  263
  264static void netdev_name_node_add(struct net *net,
  265				 struct netdev_name_node *name_node)
  266{
  267	hlist_add_head_rcu(&name_node->hlist,
  268			   dev_name_hash(net, name_node->name));
  269}
  270
  271static void netdev_name_node_del(struct netdev_name_node *name_node)
  272{
  273	hlist_del_rcu(&name_node->hlist);
  274}
  275
  276static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  277							const char *name)
  278{
  279	struct hlist_head *head = dev_name_hash(net, name);
  280	struct netdev_name_node *name_node;
  281
  282	hlist_for_each_entry(name_node, head, hlist)
  283		if (!strcmp(name_node->name, name))
  284			return name_node;
  285	return NULL;
  286}
  287
  288static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  289							    const char *name)
  290{
  291	struct hlist_head *head = dev_name_hash(net, name);
  292	struct netdev_name_node *name_node;
  293
  294	hlist_for_each_entry_rcu(name_node, head, hlist)
  295		if (!strcmp(name_node->name, name))
  296			return name_node;
  297	return NULL;
  298}
  299
  300int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  301{
  302	struct netdev_name_node *name_node;
  303	struct net *net = dev_net(dev);
  304
  305	name_node = netdev_name_node_lookup(net, name);
  306	if (name_node)
  307		return -EEXIST;
  308	name_node = netdev_name_node_alloc(dev, name);
  309	if (!name_node)
  310		return -ENOMEM;
  311	netdev_name_node_add(net, name_node);
  312	/* The node that holds dev->name acts as a head of per-device list. */
  313	list_add_tail(&name_node->list, &dev->name_node->list);
  314
  315	return 0;
  316}
  317EXPORT_SYMBOL(netdev_name_node_alt_create);
  318
  319static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  320{
  321	list_del(&name_node->list);
  322	netdev_name_node_del(name_node);
  323	kfree(name_node->name);
  324	netdev_name_node_free(name_node);
  325}
  326
  327int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  328{
  329	struct netdev_name_node *name_node;
  330	struct net *net = dev_net(dev);
  331
  332	name_node = netdev_name_node_lookup(net, name);
  333	if (!name_node)
  334		return -ENOENT;
  335	/* lookup might have found our primary name or a name belonging
  336	 * to another device.
  337	 */
  338	if (name_node == dev->name_node || name_node->dev != dev)
  339		return -EINVAL;
  340
  341	__netdev_name_node_alt_destroy(name_node);
  342
  343	return 0;
  344}
  345EXPORT_SYMBOL(netdev_name_node_alt_destroy);
  346
  347static void netdev_name_node_alt_flush(struct net_device *dev)
  348{
  349	struct netdev_name_node *name_node, *tmp;
  350
  351	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
  352		__netdev_name_node_alt_destroy(name_node);
  353}
  354
  355/* Device list insertion */
  356static void list_netdevice(struct net_device *dev)
  357{
  358	struct net *net = dev_net(dev);
  359
  360	ASSERT_RTNL();
  361
  362	write_lock_bh(&dev_base_lock);
  363	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  364	netdev_name_node_add(net, dev->name_node);
  365	hlist_add_head_rcu(&dev->index_hlist,
  366			   dev_index_hash(net, dev->ifindex));
  367	write_unlock_bh(&dev_base_lock);
  368
  369	dev_base_seq_inc(net);
  370}
  371
  372/* Device list removal
  373 * caller must respect a RCU grace period before freeing/reusing dev
  374 */
  375static void unlist_netdevice(struct net_device *dev)
  376{
  377	ASSERT_RTNL();
  378
  379	/* Unlink dev from the device chain */
  380	write_lock_bh(&dev_base_lock);
  381	list_del_rcu(&dev->dev_list);
  382	netdev_name_node_del(dev->name_node);
  383	hlist_del_rcu(&dev->index_hlist);
  384	write_unlock_bh(&dev_base_lock);
  385
  386	dev_base_seq_inc(dev_net(dev));
  387}
  388
  389/*
  390 *	Our notifier list
  391 */
  392
  393static RAW_NOTIFIER_HEAD(netdev_chain);
  394
  395/*
  396 *	Device drivers call our routines to queue packets here. We empty the
  397 *	queue in the local softnet handler.
  398 */
  399
  400DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  401EXPORT_PER_CPU_SYMBOL(softnet_data);
  402
  403#ifdef CONFIG_LOCKDEP
  404/*
  405 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  406 * according to dev->type
  407 */
  408static const unsigned short netdev_lock_type[] = {
  409	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  410	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  411	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  412	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  413	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  414	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  415	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  416	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  417	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  418	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  419	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  420	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  421	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  422	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  423	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  424
  425static const char *const netdev_lock_name[] = {
  426	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  427	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  428	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  429	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  430	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  431	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  432	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  433	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  434	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  435	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  436	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  437	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  438	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  439	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  440	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  441
  442static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  443static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  444
  445static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  446{
  447	int i;
  448
  449	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  450		if (netdev_lock_type[i] == dev_type)
  451			return i;
  452	/* the last key is used by default */
  453	return ARRAY_SIZE(netdev_lock_type) - 1;
  454}
  455
  456static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  457						 unsigned short dev_type)
  458{
  459	int i;
  460
  461	i = netdev_lock_pos(dev_type);
  462	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  463				   netdev_lock_name[i]);
  464}
  465
  466static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  467{
  468	int i;
  469
  470	i = netdev_lock_pos(dev->type);
  471	lockdep_set_class_and_name(&dev->addr_list_lock,
  472				   &netdev_addr_lock_key[i],
  473				   netdev_lock_name[i]);
  474}
  475#else
  476static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  477						 unsigned short dev_type)
  478{
  479}
  480
  481static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  482{
  483}
  484#endif
  485
  486/*******************************************************************************
  487 *
  488 *		Protocol management and registration routines
  489 *
  490 *******************************************************************************/
  491
  492
  493/*
  494 *	Add a protocol ID to the list. Now that the input handler is
  495 *	smarter we can dispense with all the messy stuff that used to be
  496 *	here.
  497 *
  498 *	BEWARE!!! Protocol handlers, mangling input packets,
  499 *	MUST BE last in hash buckets and checking protocol handlers
  500 *	MUST start from promiscuous ptype_all chain in net_bh.
  501 *	It is true now, do not change it.
  502 *	Explanation follows: if protocol handler, mangling packet, will
  503 *	be the first on list, it is not able to sense, that packet
  504 *	is cloned and should be copied-on-write, so that it will
  505 *	change it and subsequent readers will get broken packet.
  506 *							--ANK (980803)
  507 */
  508
  509static inline struct list_head *ptype_head(const struct packet_type *pt)
  510{
  511	if (pt->type == htons(ETH_P_ALL))
  512		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  513	else
  514		return pt->dev ? &pt->dev->ptype_specific :
  515				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  516}
  517
  518/**
  519 *	dev_add_pack - add packet handler
  520 *	@pt: packet type declaration
  521 *
  522 *	Add a protocol handler to the networking stack. The passed &packet_type
  523 *	is linked into kernel lists and may not be freed until it has been
  524 *	removed from the kernel lists.
  525 *
  526 *	This call does not sleep, therefore it cannot
  527 *	guarantee that all CPUs that are in the middle of receiving packets
  528 *	will see the new packet type (until the next received packet).
  529 */
  530
  531void dev_add_pack(struct packet_type *pt)
  532{
  533	struct list_head *head = ptype_head(pt);
  534
  535	spin_lock(&ptype_lock);
  536	list_add_rcu(&pt->list, head);
  537	spin_unlock(&ptype_lock);
  538}
  539EXPORT_SYMBOL(dev_add_pack);
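/*
 * Editorial illustration, not part of dev.c: a minimal tap registering a
 * handler for every protocol (ETH_P_ALL) via dev_add_pack(). A ptype_all
 * handler receives its own reference to the skb and must release it.
 * "my_tap_rcv"/"my_tap" are hypothetical; assumes <linux/if_ether.h>.
 */
static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* inspect the packet here, then drop our reference */
	kfree_skb(skb);
	return 0;
}

static struct packet_type my_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),
	.func = my_tap_rcv,
};
/* install with dev_add_pack(&my_tap), remove with dev_remove_pack(&my_tap) */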
  540
  541/**
  542 *	__dev_remove_pack	 - remove packet handler
  543 *	@pt: packet type declaration
  544 *
  545 *	Remove a protocol handler that was previously added to the kernel
  546 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  547 *	from the kernel lists and can be freed or reused once this function
  548 *	returns.
  549 *
  550 *      The packet type might still be in use by receivers
  551 *	and must not be freed until after all the CPUs have gone
  552 *	through a quiescent state.
  553 */
  554void __dev_remove_pack(struct packet_type *pt)
  555{
  556	struct list_head *head = ptype_head(pt);
  557	struct packet_type *pt1;
  558
  559	spin_lock(&ptype_lock);
  560
  561	list_for_each_entry(pt1, head, list) {
  562		if (pt == pt1) {
  563			list_del_rcu(&pt->list);
  564			goto out;
  565		}
  566	}
  567
  568	pr_warn("dev_remove_pack: %p not found\n", pt);
  569out:
  570	spin_unlock(&ptype_lock);
  571}
  572EXPORT_SYMBOL(__dev_remove_pack);
  573
  574/**
  575 *	dev_remove_pack	 - remove packet handler
  576 *	@pt: packet type declaration
  577 *
  578 *	Remove a protocol handler that was previously added to the kernel
  579 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  580 *	from the kernel lists and can be freed or reused once this function
  581 *	returns.
  582 *
  583 *	This call sleeps to guarantee that no CPU is looking at the packet
  584 *	type after return.
  585 */
  586void dev_remove_pack(struct packet_type *pt)
  587{
  588	__dev_remove_pack(pt);
  589
  590	synchronize_net();
  591}
  592EXPORT_SYMBOL(dev_remove_pack);
  593
  594
  595/**
  596 *	dev_add_offload - register offload handlers
  597 *	@po: protocol offload declaration
  598 *
  599 *	Add protocol offload handlers to the networking stack. The passed
  600 *	&proto_offload is linked into kernel lists and may not be freed until
  601 *	it has been removed from the kernel lists.
  602 *
  603 *	This call does not sleep, therefore it cannot
  604 *	guarantee that all CPUs that are in the middle of receiving packets
  605 *	will see the new offload handlers (until the next received packet).
  606 */
  607void dev_add_offload(struct packet_offload *po)
  608{
  609	struct packet_offload *elem;
  610
  611	spin_lock(&offload_lock);
  612	list_for_each_entry(elem, &offload_base, list) {
  613		if (po->priority < elem->priority)
  614			break;
  615	}
  616	list_add_rcu(&po->list, elem->list.prev);
  617	spin_unlock(&offload_lock);
  618}
  619EXPORT_SYMBOL(dev_add_offload);
  620
  621/**
  622 *	__dev_remove_offload	 - remove offload handler
  623 *	@po: packet offload declaration
  624 *
  625 *	Remove a protocol offload handler that was previously added to the
  626 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
  627 *	is removed from the kernel lists and can be freed or reused once this
  628 *	function returns.
  629 *
  630 *      The packet type might still be in use by receivers
  631 *	and must not be freed until after all the CPUs have gone
  632 *	through a quiescent state.
  633 */
  634static void __dev_remove_offload(struct packet_offload *po)
  635{
  636	struct list_head *head = &offload_base;
  637	struct packet_offload *po1;
  638
  639	spin_lock(&offload_lock);
  640
  641	list_for_each_entry(po1, head, list) {
  642		if (po == po1) {
  643			list_del_rcu(&po->list);
  644			goto out;
  645		}
  646	}
  647
  648	pr_warn("dev_remove_offload: %p not found\n", po);
  649out:
  650	spin_unlock(&offload_lock);
  651}
  652
  653/**
  654 *	dev_remove_offload	 - remove packet offload handler
  655 *	@po: packet offload declaration
  656 *
  657 *	Remove a packet offload handler that was previously added to the kernel
  658 *	offload handlers by dev_add_offload(). The passed &offload_type is
  659 *	removed from the kernel lists and can be freed or reused once this
  660 *	function returns.
  661 *
  662 *	This call sleeps to guarantee that no CPU is looking at the packet
  663 *	type after return.
  664 */
  665void dev_remove_offload(struct packet_offload *po)
  666{
  667	__dev_remove_offload(po);
  668
  669	synchronize_net();
  670}
  671EXPORT_SYMBOL(dev_remove_offload);
  672
  673/******************************************************************************
  674 *
  675 *		      Device Boot-time Settings Routines
  676 *
  677 ******************************************************************************/
  678
  679/* Boot time configuration table */
  680static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
  681
  682/**
  683 *	netdev_boot_setup_add	- add new setup entry
  684 *	@name: name of the device
  685 *	@map: configured settings for the device
  686 *
  687 *	Adds a new setup entry to the dev_boot_setup list.  The function
  688 *	returns 0 on error and 1 on success.  This is a generic routine for
  689 *	all netdevices.
  690 */
  691static int netdev_boot_setup_add(char *name, struct ifmap *map)
  692{
  693	struct netdev_boot_setup *s;
  694	int i;
  695
  696	s = dev_boot_setup;
  697	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  698		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
  699			memset(s[i].name, 0, sizeof(s[i].name));
  700			strlcpy(s[i].name, name, IFNAMSIZ);
  701			memcpy(&s[i].map, map, sizeof(s[i].map));
  702			break;
  703		}
  704	}
  705
  706	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
  707}
  708
  709/**
  710 * netdev_boot_setup_check	- check boot time settings
  711 * @dev: the netdevice
  712 *
  713 * Check boot time settings for the device.
  714 * The found settings are set for the device to be used
  715 * later in the device probing.
  716 * Returns 0 if no settings found, 1 if they are.
  717 */
  718int netdev_boot_setup_check(struct net_device *dev)
  719{
  720	struct netdev_boot_setup *s = dev_boot_setup;
  721	int i;
  722
  723	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  724		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
  725		    !strcmp(dev->name, s[i].name)) {
  726			dev->irq = s[i].map.irq;
  727			dev->base_addr = s[i].map.base_addr;
  728			dev->mem_start = s[i].map.mem_start;
  729			dev->mem_end = s[i].map.mem_end;
  730			return 1;
  731		}
  732	}
  733	return 0;
  734}
  735EXPORT_SYMBOL(netdev_boot_setup_check);
  736
  737
  738/**
  739 * netdev_boot_base	- get address from boot time settings
  740 * @prefix: prefix for network device
  741 * @unit: id for network device
  742 *
  743 * Check boot time settings for the base address of device.
  744 * The found settings are set for the device to be used
  745 * later in the device probing.
  746 * Returns 0 if no settings found.
  747 */
  748unsigned long netdev_boot_base(const char *prefix, int unit)
  749{
  750	const struct netdev_boot_setup *s = dev_boot_setup;
  751	char name[IFNAMSIZ];
  752	int i;
  753
  754	sprintf(name, "%s%d", prefix, unit);
  755
  756	/*
  757	 * If the device is already registered then return a base of 1
  758	 * to indicate not to probe for this interface
  759	 */
  760	if (__dev_get_by_name(&init_net, name))
  761		return 1;
  762
  763	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
  764		if (!strcmp(name, s[i].name))
  765			return s[i].map.base_addr;
  766	return 0;
  767}
  768
  769/*
  770 * Saves at boot time configured settings for any netdevice.
  771 */
  772int __init netdev_boot_setup(char *str)
  773{
  774	int ints[5];
  775	struct ifmap map;
  776
  777	str = get_options(str, ARRAY_SIZE(ints), ints);
  778	if (!str || !*str)
  779		return 0;
  780
  781	/* Save settings */
  782	memset(&map, 0, sizeof(map));
  783	if (ints[0] > 0)
  784		map.irq = ints[1];
  785	if (ints[0] > 1)
  786		map.base_addr = ints[2];
  787	if (ints[0] > 2)
  788		map.mem_start = ints[3];
  789	if (ints[0] > 3)
  790		map.mem_end = ints[4];
  791
  792	/* Add new entry to the list */
  793	return netdev_boot_setup_add(str, &map);
  794}
  795
  796__setup("netdev=", netdev_boot_setup);
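/*
 * Editorial note, not part of dev.c: the handler above parses the classic
 * boot-time format
 *
 *	netdev=<irq>,<io_base>,<mem_start>,<mem_end>,<name>
 *
 * e.g. "netdev=9,0x300,0,0,eth0", with unused numeric fields left as 0;
 * the saved values are later matched against a probing device by
 * netdev_boot_setup_check(). The exact example values are illustrative.
 */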
  797
  798/*******************************************************************************
  799 *
  800 *			    Device Interface Subroutines
  801 *
  802 *******************************************************************************/
  803
  804/**
  805 *	dev_get_iflink	- get 'iflink' value of an interface
  806 *	@dev: targeted interface
  807 *
  808 *	Indicates the ifindex the interface is linked to.
  809 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  810 */
  811
  812int dev_get_iflink(const struct net_device *dev)
  813{
  814	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  815		return dev->netdev_ops->ndo_get_iflink(dev);
  816
  817	return dev->ifindex;
  818}
  819EXPORT_SYMBOL(dev_get_iflink);
  820
  821/**
  822 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  823 *	@dev: targeted interface
  824 *	@skb: The packet.
  825 *
  826 *	For better visibility of tunnel traffic, OVS needs to retrieve
  827 *	egress tunnel information for a packet. The following API allows
  828 *	the user to get this info.
  829 */
  830int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  831{
  832	struct ip_tunnel_info *info;
  833
  834	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  835		return -EINVAL;
  836
  837	info = skb_tunnel_info_unclone(skb);
  838	if (!info)
  839		return -ENOMEM;
  840	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  841		return -EINVAL;
  842
  843	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  844}
  845EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  846
  847/**
  848 *	__dev_get_by_name	- find a device by its name
  849 *	@net: the applicable net namespace
  850 *	@name: name to find
  851 *
  852 *	Find an interface by name. Must be called under RTNL semaphore
  853 *	or @dev_base_lock. If the name is found a pointer to the device
  854 *	is returned. If the name is not found then %NULL is returned. The
  855 *	reference counters are not incremented so the caller must be
  856 *	careful with locks.
  857 */
  858
  859struct net_device *__dev_get_by_name(struct net *net, const char *name)
  860{
  861	struct netdev_name_node *node_name;
  862
  863	node_name = netdev_name_node_lookup(net, name);
  864	return node_name ? node_name->dev : NULL;
  865}
  866EXPORT_SYMBOL(__dev_get_by_name);
  867
  868/**
  869 * dev_get_by_name_rcu	- find a device by its name
  870 * @net: the applicable net namespace
  871 * @name: name to find
  872 *
  873 * Find an interface by name.
  874 * If the name is found a pointer to the device is returned.
  875 * If the name is not found then %NULL is returned.
  876 * The reference counters are not incremented so the caller must be
  877 * careful with locks. The caller must hold RCU lock.
  878 */
  879
  880struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  881{
  882	struct netdev_name_node *node_name;
  883
  884	node_name = netdev_name_node_lookup_rcu(net, name);
  885	return node_name ? node_name->dev : NULL;
  886}
  887EXPORT_SYMBOL(dev_get_by_name_rcu);
  888
  889/**
  890 *	dev_get_by_name		- find a device by its name
  891 *	@net: the applicable net namespace
  892 *	@name: name to find
  893 *
  894 *	Find an interface by name. This can be called from any
  895 *	context and does its own locking. The returned handle has
  896 *	the usage count incremented and the caller must use dev_put() to
  897 *	release it when it is no longer needed. %NULL is returned if no
  898 *	matching device is found.
  899 */
  900
  901struct net_device *dev_get_by_name(struct net *net, const char *name)
  902{
  903	struct net_device *dev;
  904
  905	rcu_read_lock();
  906	dev = dev_get_by_name_rcu(net, name);
  907	if (dev)
  908		dev_hold(dev);
  909	rcu_read_unlock();
  910	return dev;
  911}
  912EXPORT_SYMBOL(dev_get_by_name);
  913
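/*
 * Minimal usage sketch (illustrative only; the namespace, name and error
 * handling are assumptions of the example, not part of this file):
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (!dev)
 *		return -ENODEV;
 *	...
 *	dev_put(dev);		// drop the reference taken above
 */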
  914/**
  915 *	__dev_get_by_index - find a device by its ifindex
  916 *	@net: the applicable net namespace
  917 *	@ifindex: index of device
  918 *
  919 *	Search for an interface by index. Returns a pointer to the
  920 *	device, or %NULL if the device is not found. The device has not
  921 *	had its reference counter increased so the caller must be careful
  922 *	about locking. The caller must hold either the RTNL semaphore
  923 *	or @dev_base_lock.
  924 */
  925
  926struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  927{
  928	struct net_device *dev;
  929	struct hlist_head *head = dev_index_hash(net, ifindex);
  930
  931	hlist_for_each_entry(dev, head, index_hlist)
  932		if (dev->ifindex == ifindex)
  933			return dev;
  934
  935	return NULL;
  936}
  937EXPORT_SYMBOL(__dev_get_by_index);
  938
  939/**
  940 *	dev_get_by_index_rcu - find a device by its ifindex
  941 *	@net: the applicable net namespace
  942 *	@ifindex: index of device
  943 *
  944 *	Search for an interface by index. Returns a pointer to the
  945 *	device, or %NULL if the device is not found. The device has not
  946 *	had its reference counter increased so the caller must be careful
  947 *	about locking. The caller must hold RCU lock.
  948 */
  949
  950struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  951{
  952	struct net_device *dev;
  953	struct hlist_head *head = dev_index_hash(net, ifindex);
  954
  955	hlist_for_each_entry_rcu(dev, head, index_hlist)
  956		if (dev->ifindex == ifindex)
  957			return dev;
  958
  959	return NULL;
  960}
  961EXPORT_SYMBOL(dev_get_by_index_rcu);
  962
  963
  964/**
  965 *	dev_get_by_index - find a device by its ifindex
  966 *	@net: the applicable net namespace
  967 *	@ifindex: index of device
  968 *
  969 *	Search for an interface by index. Returns a pointer to the
  970 *	device, or NULL if it is not found. The device returned has
  971 *	had a reference added and the pointer is safe until the user calls
  972 *	dev_put to indicate they have finished with it.
  973 */
  974
  975struct net_device *dev_get_by_index(struct net *net, int ifindex)
  976{
  977	struct net_device *dev;
  978
  979	rcu_read_lock();
  980	dev = dev_get_by_index_rcu(net, ifindex);
  981	if (dev)
  982		dev_hold(dev);
  983	rcu_read_unlock();
  984	return dev;
  985}
  986EXPORT_SYMBOL(dev_get_by_index);
  987
  988/**
  989 *	dev_get_by_napi_id - find a device by napi_id
  990 *	@napi_id: ID of the NAPI struct
  991 *
  992 *	Search for an interface by NAPI ID. Returns a pointer to the
  993 *	device, or %NULL if the device is not found. The device has not had
  994 *	its reference counter increased so the caller must be careful
  995 *	about locking. The caller must hold RCU lock.
  996 */
  997
  998struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  999{
 1000	struct napi_struct *napi;
 1001
 1002	WARN_ON_ONCE(!rcu_read_lock_held());
 1003
 1004	if (napi_id < MIN_NAPI_ID)
 1005		return NULL;
 1006
 1007	napi = napi_by_id(napi_id);
 1008
 1009	return napi ? napi->dev : NULL;
 1010}
 1011EXPORT_SYMBOL(dev_get_by_napi_id);
 1012
 1013/**
 1014 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 1015 *	@net: network namespace
 1016 *	@name: a pointer to the buffer where the name will be stored.
 1017 *	@ifindex: the ifindex of the interface to get the name from.
 1018 */
 1019int netdev_get_name(struct net *net, char *name, int ifindex)
 1020{
 1021	struct net_device *dev;
 1022	int ret;
 1023
 1024	down_read(&devnet_rename_sem);
 1025	rcu_read_lock();
 1026
 1027	dev = dev_get_by_index_rcu(net, ifindex);
 1028	if (!dev) {
 1029		ret = -ENODEV;
 1030		goto out;
 1031	}
 1032
 1033	strcpy(name, dev->name);
 1034
 1035	ret = 0;
 1036out:
 1037	rcu_read_unlock();
 1038	up_read(&devnet_rename_sem);
 1039	return ret;
 1040}
 1041
 1042/**
 1043 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 1044 *	@net: the applicable net namespace
 1045 *	@type: media type of device
 1046 *	@ha: hardware address
 1047 *
 1048 *	Search for an interface by MAC address. Returns a pointer to the
 1049 *	device, or NULL if the device is not found.
 1050 *	The caller must hold RCU or RTNL.
 1051 *	The returned device has not had its ref count increased
 1052 *	and the caller must therefore be careful about locking.
 1053 *
 1054 */
 1055
 1056struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 1057				       const char *ha)
 1058{
 1059	struct net_device *dev;
 1060
 1061	for_each_netdev_rcu(net, dev)
 1062		if (dev->type == type &&
 1063		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 1064			return dev;
 1065
 1066	return NULL;
 1067}
 1068EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 1069
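/*
 * Illustrative lookup by hardware address (the address value and the use
 * of init_net are assumptions of the example):
 *
 *	static const u8 mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(&init_net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		dev_hold(dev);	// take a reference if dev must outlive the RCU section
 *	rcu_read_unlock();
 */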
 1070struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1071{
 1072	struct net_device *dev;
 1073
 1074	ASSERT_RTNL();
 1075	for_each_netdev(net, dev)
 1076		if (dev->type == type)
 1077			return dev;
 1078
 1079	return NULL;
 1080}
 1081EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 1082
 1083struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1084{
 1085	struct net_device *dev, *ret = NULL;
 1086
 1087	rcu_read_lock();
 1088	for_each_netdev_rcu(net, dev)
 1089		if (dev->type == type) {
 1090			dev_hold(dev);
 1091			ret = dev;
 1092			break;
 1093		}
 1094	rcu_read_unlock();
 1095	return ret;
 1096}
 1097EXPORT_SYMBOL(dev_getfirstbyhwtype);
 1098
 1099/**
 1100 *	__dev_get_by_flags - find any device with given flags
 1101 *	@net: the applicable net namespace
 1102 *	@if_flags: IFF_* values
 1103 *	@mask: bitmask of bits in if_flags to check
 1104 *
 1105 *	Search for any interface with the given flags. Returns a pointer
 1106 *	to the first matching device, or NULL if none is found. Must be
 1107 *	called inside rtnl_lock(), and the result's refcount is unchanged.
 1108 */
 1109
 1110struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1111				      unsigned short mask)
 1112{
 1113	struct net_device *dev, *ret;
 1114
 1115	ASSERT_RTNL();
 1116
 1117	ret = NULL;
 1118	for_each_netdev(net, dev) {
 1119		if (((dev->flags ^ if_flags) & mask) == 0) {
 1120			ret = dev;
 1121			break;
 1122		}
 1123	}
 1124	return ret;
 1125}
 1126EXPORT_SYMBOL(__dev_get_by_flags);
 1127
 1128/**
 1129 *	dev_valid_name - check if name is okay for network device
 1130 *	@name: name string
 1131 *
 1132 *	Network device names need to be valid file names
 1133 *	to allow sysfs to work.  We also disallow any kind of
 1134 *	whitespace.
 1135 */
 1136bool dev_valid_name(const char *name)
 1137{
 1138	if (*name == '\0')
 1139		return false;
 1140	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1141		return false;
 1142	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1143		return false;
 1144
 1145	while (*name) {
 1146		if (*name == '/' || *name == ':' || isspace(*name))
 1147			return false;
 1148		name++;
 1149	}
 1150	return true;
 1151}
 1152EXPORT_SYMBOL(dev_valid_name);
 1153
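/*
 * For illustration: names such as "eth0" or "br-lan" are accepted, while
 * "", ".", "..", anything containing '/', ':' or whitespace, and names of
 * IFNAMSIZ characters or more are rejected by dev_valid_name().
 */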
 1154/**
 1155 *	__dev_alloc_name - allocate a name for a device
 1156 *	@net: network namespace to allocate the device name in
 1157 *	@name: name format string
 1158 *	@buf:  scratch buffer and result name string
 1159 *
 1160 *	Passed a format string - eg "lt%d" - it will try to find a suitable
 1161 *	id. It scans the list of devices to build up a free map, then chooses
 1162 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1163 *	while allocating the name and adding the device in order to avoid
 1164 *	duplicates.
 1165 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1166 *	Returns the number of the unit assigned or a negative errno code.
 1167 */
 1168
 1169static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 1170{
 1171	int i = 0;
 1172	const char *p;
 1173	const int max_netdevices = 8*PAGE_SIZE;
 1174	unsigned long *inuse;
 1175	struct net_device *d;
 1176
 1177	if (!dev_valid_name(name))
 1178		return -EINVAL;
 1179
 1180	p = strchr(name, '%');
 1181	if (p) {
 1182		/*
 1183		 * Verify the string as this thing may have come from
 1184		 * the user.  There must be either one "%d" and no other "%"
 1185		 * characters.
 1186		 */
 1187		if (p[1] != 'd' || strchr(p + 2, '%'))
 1188			return -EINVAL;
 1189
 1190		/* Use one page as a bit array of possible slots */
 1191		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 1192		if (!inuse)
 1193			return -ENOMEM;
 1194
 1195		for_each_netdev(net, d) {
 1196			if (!sscanf(d->name, name, &i))
 1197				continue;
 1198			if (i < 0 || i >= max_netdevices)
 1199				continue;
 1200
 1201		/* avoid cases where sscanf is not the exact inverse of printf */
 1202			snprintf(buf, IFNAMSIZ, name, i);
 1203			if (!strncmp(buf, d->name, IFNAMSIZ))
 1204				set_bit(i, inuse);
 1205		}
 1206
 1207		i = find_first_zero_bit(inuse, max_netdevices);
 1208		free_page((unsigned long) inuse);
 1209	}
 1210
 1211	snprintf(buf, IFNAMSIZ, name, i);
 1212	if (!__dev_get_by_name(net, buf))
 1213		return i;
 1214
 1215	/* It is possible to run out of possible slots
 1216	 * when the name is long and there isn't enough space left
 1217	 * for the digits, or if all bits are used.
 1218	 */
 1219	return -ENFILE;
 1220}
 1221
 1222static int dev_alloc_name_ns(struct net *net,
 1223			     struct net_device *dev,
 1224			     const char *name)
 1225{
 1226	char buf[IFNAMSIZ];
 1227	int ret;
 1228
 1229	BUG_ON(!net);
 1230	ret = __dev_alloc_name(net, name, buf);
 1231	if (ret >= 0)
 1232		strlcpy(dev->name, buf, IFNAMSIZ);
 1233	return ret;
 1234}
 1235
 1236/**
 1237 *	dev_alloc_name - allocate a name for a device
 1238 *	@dev: device
 1239 *	@name: name format string
 1240 *
 1241 *	Passed a format string - eg "lt%d" - it will try to find a suitable
 1242 *	id. It scans the list of devices to build up a free map, then chooses
 1243 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1244 *	while allocating the name and adding the device in order to avoid
 1245 *	duplicates.
 1246 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1247 *	Returns the number of the unit assigned or a negative errno code.
 1248 */
 1249
 1250int dev_alloc_name(struct net_device *dev, const char *name)
 1251{
 1252	return dev_alloc_name_ns(dev_net(dev), dev, name);
 1253}
 1254EXPORT_SYMBOL(dev_alloc_name);
 1255
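/*
 * Sketch of typical driver usage (the "dummy%d" format and error label
 * are examples, not taken from a real driver):
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto out_free_netdev;
 *	// dev->name now holds the first free instance, e.g. "dummy0"
 */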
 1256static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1257			      const char *name)
 1258{
 1259	BUG_ON(!net);
 1260
 1261	if (!dev_valid_name(name))
 1262		return -EINVAL;
 1263
 1264	if (strchr(name, '%'))
 1265		return dev_alloc_name_ns(net, dev, name);
 1266	else if (__dev_get_by_name(net, name))
 1267		return -EEXIST;
 1268	else if (dev->name != name)
 1269		strlcpy(dev->name, name, IFNAMSIZ);
 1270
 1271	return 0;
 1272}
 1273
 1274/**
 1275 *	dev_change_name - change name of a device
 1276 *	@dev: device
 1277 *	@newname: name (or format string) must be at least IFNAMSIZ
 1278 *
 1279 *	Change the name of a device; format strings such as "eth%d"
 1280 *	can be passed for wildcarding.
 1281 */
 1282int dev_change_name(struct net_device *dev, const char *newname)
 1283{
 1284	unsigned char old_assign_type;
 1285	char oldname[IFNAMSIZ];
 1286	int err = 0;
 1287	int ret;
 1288	struct net *net;
 1289
 1290	ASSERT_RTNL();
 1291	BUG_ON(!dev_net(dev));
 1292
 1293	net = dev_net(dev);
 1294
 1295	/* Some auto-enslaved devices, e.g. failover slaves, are
 1296	 * special, as userspace might rename the device after
 1297	 * the interface has been brought up and running since
 1298	 * the point the kernel initiated auto-enslavement. Allow
 1299	 * live name change even when these slave devices are
 1300	 * up and running.
 1301	 *
 1302	 * Typically, users of these auto-enslaving devices
 1303	 * don't actually care about slave name change, as
 1304	 * they are supposed to operate on master interface
 1305	 * directly.
 1306	 */
 1307	if (dev->flags & IFF_UP &&
 1308	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
 1309		return -EBUSY;
 1310
 1311	down_write(&devnet_rename_sem);
 1312
 1313	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1314		up_write(&devnet_rename_sem);
 1315		return 0;
 1316	}
 1317
 1318	memcpy(oldname, dev->name, IFNAMSIZ);
 1319
 1320	err = dev_get_valid_name(net, dev, newname);
 1321	if (err < 0) {
 1322		up_write(&devnet_rename_sem);
 1323		return err;
 1324	}
 1325
 1326	if (oldname[0] && !strchr(oldname, '%'))
 1327		netdev_info(dev, "renamed from %s\n", oldname);
 1328
 1329	old_assign_type = dev->name_assign_type;
 1330	dev->name_assign_type = NET_NAME_RENAMED;
 1331
 1332rollback:
 1333	ret = device_rename(&dev->dev, dev->name);
 1334	if (ret) {
 1335		memcpy(dev->name, oldname, IFNAMSIZ);
 1336		dev->name_assign_type = old_assign_type;
 1337		up_write(&devnet_rename_sem);
 1338		return ret;
 1339	}
 1340
 1341	up_write(&devnet_rename_sem);
 1342
 1343	netdev_adjacent_rename_links(dev, oldname);
 1344
 1345	write_lock_bh(&dev_base_lock);
 1346	netdev_name_node_del(dev->name_node);
 1347	write_unlock_bh(&dev_base_lock);
 1348
 1349	synchronize_rcu();
 1350
 1351	write_lock_bh(&dev_base_lock);
 1352	netdev_name_node_add(net, dev->name_node);
 1353	write_unlock_bh(&dev_base_lock);
 1354
 1355	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1356	ret = notifier_to_errno(ret);
 1357
 1358	if (ret) {
 1359		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1360		if (err >= 0) {
 1361			err = ret;
 1362			down_write(&devnet_rename_sem);
 1363			memcpy(dev->name, oldname, IFNAMSIZ);
 1364			memcpy(oldname, newname, IFNAMSIZ);
 1365			dev->name_assign_type = old_assign_type;
 1366			old_assign_type = NET_NAME_RENAMED;
 1367			goto rollback;
 1368		} else {
 1369			pr_err("%s: name change rollback failed: %d\n",
 1370			       dev->name, ret);
 1371		}
 1372	}
 1373
 1374	return err;
 1375}
 1376
 1377/**
 1378 *	dev_set_alias - change ifalias of a device
 1379 *	@dev: device
 1380 *	@alias: name up to IFALIASZ
 1381 *	@len: limit of bytes to copy from info
 1382 *
 1383 *	Set ifalias for a device,
 1384 */
 1385int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1386{
 1387	struct dev_ifalias *new_alias = NULL;
 1388
 1389	if (len >= IFALIASZ)
 1390		return -EINVAL;
 1391
 1392	if (len) {
 1393		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1394		if (!new_alias)
 1395			return -ENOMEM;
 1396
 1397		memcpy(new_alias->ifalias, alias, len);
 1398		new_alias->ifalias[len] = 0;
 1399	}
 1400
 1401	mutex_lock(&ifalias_mutex);
 1402	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1403					mutex_is_locked(&ifalias_mutex));
 1404	mutex_unlock(&ifalias_mutex);
 1405
 1406	if (new_alias)
 1407		kfree_rcu(new_alias, rcuhead);
 1408
 1409	return len;
 1410}
 1411EXPORT_SYMBOL(dev_set_alias);
 1412
 1413/**
 1414 *	dev_get_alias - get ifalias of a device
 1415 *	@dev: device
 1416 *	@name: buffer to store name of ifalias
 1417 *	@len: size of buffer
 1418 *
 1419 *	Get the ifalias for a device.  The caller must make sure dev cannot
 1420 *	go away, e.g. by holding the RCU read lock or a reference to the device.
 1421 */
 1422int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1423{
 1424	const struct dev_ifalias *alias;
 1425	int ret = 0;
 1426
 1427	rcu_read_lock();
 1428	alias = rcu_dereference(dev->ifalias);
 1429	if (alias)
 1430		ret = snprintf(name, len, "%s", alias->ifalias);
 1431	rcu_read_unlock();
 1432
 1433	return ret;
 1434}
 1435
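/*
 * Illustrative use of the alias helpers (the alias text is arbitrary;
 * userspace normally reaches these via rtnetlink IFLA_IFALIAS or sysfs):
 *
 *	char buf[IFALIASZ];
 *
 *	dev_set_alias(dev, "uplink", strlen("uplink"));
 *	if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
 *		pr_info("%s alias: %s\n", dev->name, buf);
 */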
 1436/**
 1437 *	netdev_features_change - device changes features
 1438 *	@dev: device to cause notification
 1439 *
 1440 *	Called to indicate a device has changed features.
 1441 */
 1442void netdev_features_change(struct net_device *dev)
 1443{
 1444	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1445}
 1446EXPORT_SYMBOL(netdev_features_change);
 1447
 1448/**
 1449 *	netdev_state_change - device changes state
 1450 *	@dev: device to cause notification
 1451 *
 1452 *	Called to indicate a device has changed state. This function calls
 1453 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1454 *	to the routing socket.
 1455 */
 1456void netdev_state_change(struct net_device *dev)
 1457{
 1458	if (dev->flags & IFF_UP) {
 1459		struct netdev_notifier_change_info change_info = {
 1460			.info.dev = dev,
 1461		};
 1462
 1463		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1464					      &change_info.info);
 1465		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 1466	}
 1467}
 1468EXPORT_SYMBOL(netdev_state_change);
 1469
 1470/**
 1471 * netdev_notify_peers - notify network peers about existence of @dev
 1472 * @dev: network device
 1473 *
 1474 * Generate traffic such that interested network peers are aware of
 1475 * @dev, such as by generating a gratuitous ARP. This may be used when
 1476 * a device wants to inform the rest of the network about some sort of
 1477 * reconfiguration such as a failover event or virtual machine
 1478 * migration.
 1479 */
 1480void netdev_notify_peers(struct net_device *dev)
 1481{
 1482	rtnl_lock();
 1483	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1484	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1485	rtnl_unlock();
 1486}
 1487EXPORT_SYMBOL(netdev_notify_peers);
 1488
 1489static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1490{
 1491	const struct net_device_ops *ops = dev->netdev_ops;
 1492	int ret;
 1493
 1494	ASSERT_RTNL();
 1495
 1496	if (!netif_device_present(dev)) {
 1497		/* may be detached because parent is runtime-suspended */
 1498		if (dev->dev.parent)
 1499			pm_runtime_resume(dev->dev.parent);
 1500		if (!netif_device_present(dev))
 1501			return -ENODEV;
 1502	}
 1503
 1504	/* Block netpoll from trying to do any rx path servicing.
 1505	 * If we don't do this there is a chance ndo_poll_controller
 1506	 * or ndo_poll may be running while we open the device
 1507	 */
 1508	netpoll_poll_disable(dev);
 1509
 1510	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1511	ret = notifier_to_errno(ret);
 1512	if (ret)
 1513		return ret;
 1514
 1515	set_bit(__LINK_STATE_START, &dev->state);
 1516
 1517	if (ops->ndo_validate_addr)
 1518		ret = ops->ndo_validate_addr(dev);
 1519
 1520	if (!ret && ops->ndo_open)
 1521		ret = ops->ndo_open(dev);
 1522
 1523	netpoll_poll_enable(dev);
 1524
 1525	if (ret)
 1526		clear_bit(__LINK_STATE_START, &dev->state);
 1527	else {
 1528		dev->flags |= IFF_UP;
 1529		dev_set_rx_mode(dev);
 1530		dev_activate(dev);
 1531		add_device_randomness(dev->dev_addr, dev->addr_len);
 1532	}
 1533
 1534	return ret;
 1535}
 1536
 1537/**
 1538 *	dev_open	- prepare an interface for use.
 1539 *	@dev: device to open
 1540 *	@extack: netlink extended ack
 1541 *
 1542 *	Takes a device from down to up state. The device's private open
 1543 *	function is invoked and then the multicast lists are loaded. Finally
 1544 *	the device is moved into the up state and a %NETDEV_UP message is
 1545 *	sent to the netdev notifier chain.
 1546 *
 1547 *	Calling this function on an active interface is a nop. On a failure
 1548 *	a negative errno code is returned.
 1549 */
 1550int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1551{
 1552	int ret;
 1553
 1554	if (dev->flags & IFF_UP)
 1555		return 0;
 1556
 1557	ret = __dev_open(dev, extack);
 1558	if (ret < 0)
 1559		return ret;
 1560
 1561	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1562	call_netdevice_notifiers(NETDEV_UP, dev);
 1563
 1564	return ret;
 1565}
 1566EXPORT_SYMBOL(dev_open);
 1567
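/*
 * Sketch of bringing an interface up from kernel code (error handling
 * elided; a NULL extack is acceptable to the notifier helpers used here):
 *
 *	rtnl_lock();
 *	err = dev_open(dev, NULL);
 *	rtnl_unlock();
 */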
 1568static void __dev_close_many(struct list_head *head)
 1569{
 1570	struct net_device *dev;
 1571
 1572	ASSERT_RTNL();
 1573	might_sleep();
 1574
 1575	list_for_each_entry(dev, head, close_list) {
 1576		/* Temporarily disable netpoll until the interface is down */
 1577		netpoll_poll_disable(dev);
 1578
 1579		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1580
 1581		clear_bit(__LINK_STATE_START, &dev->state);
 1582
 1583		/* Synchronize to scheduled poll. We cannot touch the poll list; it
 1584		 * can even be on a different cpu. So just clear netif_running().
 1585		 *
 1586		 * dev->stop() will invoke napi_disable() on all of its
 1587		 * napi_struct instances on this device.
 1588		 */
 1589		smp_mb__after_atomic(); /* Commit netif_running(). */
 1590	}
 1591
 1592	dev_deactivate_many(head);
 1593
 1594	list_for_each_entry(dev, head, close_list) {
 1595		const struct net_device_ops *ops = dev->netdev_ops;
 1596
 1597		/*
 1598		 *	Call the device specific close. This cannot fail.
 1599		 *	It is only called if the device is UP.
 1600		 *
 1601		 *	We allow it to be called even after a DETACH hot-plug
 1602		 *	event.
 1603		 */
 1604		if (ops->ndo_stop)
 1605			ops->ndo_stop(dev);
 1606
 1607		dev->flags &= ~IFF_UP;
 1608		netpoll_poll_enable(dev);
 1609	}
 1610}
 1611
 1612static void __dev_close(struct net_device *dev)
 1613{
 1614	LIST_HEAD(single);
 1615
 1616	list_add(&dev->close_list, &single);
 1617	__dev_close_many(&single);
 1618	list_del(&single);
 1619}
 1620
 1621void dev_close_many(struct list_head *head, bool unlink)
 1622{
 1623	struct net_device *dev, *tmp;
 1624
 1625	/* Remove the devices that don't need to be closed */
 1626	list_for_each_entry_safe(dev, tmp, head, close_list)
 1627		if (!(dev->flags & IFF_UP))
 1628			list_del_init(&dev->close_list);
 1629
 1630	__dev_close_many(head);
 1631
 1632	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1633		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1634		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1635		if (unlink)
 1636			list_del_init(&dev->close_list);
 1637	}
 1638}
 1639EXPORT_SYMBOL(dev_close_many);
 1640
 1641/**
 1642 *	dev_close - shutdown an interface.
 1643 *	@dev: device to shutdown
 1644 *
 1645 *	This function moves an active device into down state. A
 1646 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1647 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1648 *	chain.
 1649 */
 1650void dev_close(struct net_device *dev)
 1651{
 1652	if (dev->flags & IFF_UP) {
 1653		LIST_HEAD(single);
 1654
 1655		list_add(&dev->close_list, &single);
 1656		dev_close_many(&single, true);
 1657		list_del(&single);
 1658	}
 1659}
 1660EXPORT_SYMBOL(dev_close);
 1661
 1662
 1663/**
 1664 *	dev_disable_lro - disable Large Receive Offload on a device
 1665 *	@dev: device
 1666 *
 1667 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1668 *	called under RTNL.  This is needed if received packets may be
 1669 *	forwarded to another interface.
 1670 */
 1671void dev_disable_lro(struct net_device *dev)
 1672{
 1673	struct net_device *lower_dev;
 1674	struct list_head *iter;
 1675
 1676	dev->wanted_features &= ~NETIF_F_LRO;
 1677	netdev_update_features(dev);
 1678
 1679	if (unlikely(dev->features & NETIF_F_LRO))
 1680		netdev_WARN(dev, "failed to disable LRO!\n");
 1681
 1682	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1683		dev_disable_lro(lower_dev);
 1684}
 1685EXPORT_SYMBOL(dev_disable_lro);
 1686
 1687/**
 1688 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1689 *	@dev: device
 1690 *
 1691 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1692 *	called under RTNL.  This is needed if Generic XDP is installed on
 1693 *	the device.
 1694 */
 1695static void dev_disable_gro_hw(struct net_device *dev)
 1696{
 1697	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1698	netdev_update_features(dev);
 1699
 1700	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1701		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1702}
 1703
 1704const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1705{
 1706#define N(val) 						\
 1707	case NETDEV_##val:				\
 1708		return "NETDEV_" __stringify(val);
 1709	switch (cmd) {
 1710	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1711	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1712	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1713	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
 1714	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 1715	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 1716	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1717	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1718	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1719	N(PRE_CHANGEADDR)
 1720	}
 1721#undef N
 1722	return "UNKNOWN_NETDEV_EVENT";
 1723}
 1724EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1725
 1726static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1727				   struct net_device *dev)
 1728{
 1729	struct netdev_notifier_info info = {
 1730		.dev = dev,
 1731	};
 1732
 1733	return nb->notifier_call(nb, val, &info);
 1734}
 1735
 1736static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1737					     struct net_device *dev)
 1738{
 1739	int err;
 1740
 1741	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1742	err = notifier_to_errno(err);
 1743	if (err)
 1744		return err;
 1745
 1746	if (!(dev->flags & IFF_UP))
 1747		return 0;
 1748
 1749	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1750	return 0;
 1751}
 1752
 1753static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1754						struct net_device *dev)
 1755{
 1756	if (dev->flags & IFF_UP) {
 1757		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1758					dev);
 1759		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1760	}
 1761	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1762}
 1763
 1764static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1765						 struct net *net)
 1766{
 1767	struct net_device *dev;
 1768	int err;
 1769
 1770	for_each_netdev(net, dev) {
 1771		err = call_netdevice_register_notifiers(nb, dev);
 1772		if (err)
 1773			goto rollback;
 1774	}
 1775	return 0;
 1776
 1777rollback:
 1778	for_each_netdev_continue_reverse(net, dev)
 1779		call_netdevice_unregister_notifiers(nb, dev);
 1780	return err;
 1781}
 1782
 1783static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1784						    struct net *net)
 1785{
 1786	struct net_device *dev;
 1787
 1788	for_each_netdev(net, dev)
 1789		call_netdevice_unregister_notifiers(nb, dev);
 1790}
 1791
 1792static int dev_boot_phase = 1;
 1793
 1794/**
 1795 * register_netdevice_notifier - register a network notifier block
 1796 * @nb: notifier
 1797 *
 1798 * Register a notifier to be called when network device events occur.
 1799 * The notifier passed is linked into the kernel structures and must
 1800 * not be reused until it has been unregistered. A negative errno code
 1801 * is returned on a failure.
 1802 *
 1803 * When registered, all registration and up events are replayed
 1804 * to the new notifier to allow the caller to have a race-free
 1805 * view of the network device list.
 1806 */
 1807
 1808int register_netdevice_notifier(struct notifier_block *nb)
 1809{
 1810	struct net *net;
 1811	int err;
 1812
 1813	/* Close race with setup_net() and cleanup_net() */
 1814	down_write(&pernet_ops_rwsem);
 1815	rtnl_lock();
 1816	err = raw_notifier_chain_register(&netdev_chain, nb);
 1817	if (err)
 1818		goto unlock;
 1819	if (dev_boot_phase)
 1820		goto unlock;
 1821	for_each_net(net) {
 1822		err = call_netdevice_register_net_notifiers(nb, net);
 1823		if (err)
 1824			goto rollback;
 1825	}
 1826
 1827unlock:
 1828	rtnl_unlock();
 1829	up_write(&pernet_ops_rwsem);
 1830	return err;
 1831
 1832rollback:
 1833	for_each_net_continue_reverse(net)
 1834		call_netdevice_unregister_net_notifiers(nb, net);
 1835
 1836	raw_notifier_chain_unregister(&netdev_chain, nb);
 1837	goto unlock;
 1838}
 1839EXPORT_SYMBOL(register_netdevice_notifier);
 1840
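/*
 * Sketch of a typical notifier user (all names here are hypothetical):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s: %s\n", dev->name, netdev_cmd_to_name(event));
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	// module init:  register_netdevice_notifier(&my_netdev_nb);
 *	// module exit:  unregister_netdevice_notifier(&my_netdev_nb);
 */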
 1841/**
 1842 * unregister_netdevice_notifier - unregister a network notifier block
 1843 * @nb: notifier
 1844 *
 1845 * Unregister a notifier previously registered by
 1846 * register_netdevice_notifier(). The notifier is unlinked from the
 1847 * kernel structures and may then be reused. A negative errno code
 1848 * is returned on a failure.
 1849 *
 1850 * After unregistering unregister and down device events are synthesized
 1851 * for all devices on the device list to the removed notifier to remove
 1852 * the need for special case cleanup code.
 1853 */
 1854
 1855int unregister_netdevice_notifier(struct notifier_block *nb)
 1856{
 1857	struct net *net;
 1858	int err;
 1859
 1860	/* Close race with setup_net() and cleanup_net() */
 1861	down_write(&pernet_ops_rwsem);
 1862	rtnl_lock();
 1863	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1864	if (err)
 1865		goto unlock;
 1866
 1867	for_each_net(net)
 1868		call_netdevice_unregister_net_notifiers(nb, net);
 1869
 1870unlock:
 1871	rtnl_unlock();
 1872	up_write(&pernet_ops_rwsem);
 1873	return err;
 1874}
 1875EXPORT_SYMBOL(unregister_netdevice_notifier);
 1876
 1877static int __register_netdevice_notifier_net(struct net *net,
 1878					     struct notifier_block *nb,
 1879					     bool ignore_call_fail)
 1880{
 1881	int err;
 1882
 1883	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1884	if (err)
 1885		return err;
 1886	if (dev_boot_phase)
 1887		return 0;
 1888
 1889	err = call_netdevice_register_net_notifiers(nb, net);
 1890	if (err && !ignore_call_fail)
 1891		goto chain_unregister;
 1892
 1893	return 0;
 1894
 1895chain_unregister:
 1896	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1897	return err;
 1898}
 1899
 1900static int __unregister_netdevice_notifier_net(struct net *net,
 1901					       struct notifier_block *nb)
 1902{
 1903	int err;
 1904
 1905	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1906	if (err)
 1907		return err;
 1908
 1909	call_netdevice_unregister_net_notifiers(nb, net);
 1910	return 0;
 1911}
 1912
 1913/**
 1914 * register_netdevice_notifier_net - register a per-netns network notifier block
 1915 * @net: network namespace
 1916 * @nb: notifier
 1917 *
 1918 * Register a notifier to be called when network device events occur.
 1919 * The notifier passed is linked into the kernel structures and must
 1920 * not be reused until it has been unregistered. A negative errno code
 1921 * is returned on a failure.
 1922 *
 1923 * When registered, all registration and up events are replayed
 1924 * to the new notifier to allow the caller to have a race-free
 1925 * view of the network device list.
 1926 */
 1927
 1928int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1929{
 1930	int err;
 1931
 1932	rtnl_lock();
 1933	err = __register_netdevice_notifier_net(net, nb, false);
 1934	rtnl_unlock();
 1935	return err;
 1936}
 1937EXPORT_SYMBOL(register_netdevice_notifier_net);
 1938
 1939/**
 1940 * unregister_netdevice_notifier_net - unregister a per-netns
 1941 *                                     network notifier block
 1942 * @net: network namespace
 1943 * @nb: notifier
 1944 *
 1945 * Unregister a notifier previously registered by
 1946 * register_netdevice_notifier_net(). The notifier is unlinked from the
 1947 * kernel structures and may then be reused. A negative errno code
 1948 * is returned on a failure.
 1949 *
 1950 * After unregistering unregister and down device events are synthesized
 1951 * for all devices on the device list to the removed notifier to remove
 1952 * the need for special case cleanup code.
 1953 */
 1954
 1955int unregister_netdevice_notifier_net(struct net *net,
 1956				      struct notifier_block *nb)
 1957{
 1958	int err;
 1959
 1960	rtnl_lock();
 1961	err = __unregister_netdevice_notifier_net(net, nb);
 1962	rtnl_unlock();
 1963	return err;
 1964}
 1965EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1966
 1967int register_netdevice_notifier_dev_net(struct net_device *dev,
 1968					struct notifier_block *nb,
 1969					struct netdev_net_notifier *nn)
 1970{
 1971	int err;
 1972
 1973	rtnl_lock();
 1974	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1975	if (!err) {
 1976		nn->nb = nb;
 1977		list_add(&nn->list, &dev->net_notifier_list);
 1978	}
 1979	rtnl_unlock();
 1980	return err;
 1981}
 1982EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1983
 1984int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 1985					  struct notifier_block *nb,
 1986					  struct netdev_net_notifier *nn)
 1987{
 1988	int err;
 1989
 1990	rtnl_lock();
 1991	list_del(&nn->list);
 1992	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 1993	rtnl_unlock();
 1994	return err;
 1995}
 1996EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 1997
 1998static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 1999					     struct net *net)
 2000{
 2001	struct netdev_net_notifier *nn;
 2002
 2003	list_for_each_entry(nn, &dev->net_notifier_list, list) {
 2004		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
 2005		__register_netdevice_notifier_net(net, nn->nb, true);
 2006	}
 2007}
 2008
 2009/**
 2010 *	call_netdevice_notifiers_info - call all network notifier blocks
 2011 *	@val: value passed unmodified to notifier function
 2012 *	@info: notifier information data
 2013 *
 2014 *	Call all network notifier blocks.  Parameters and return value
 2015 *	are as for raw_notifier_call_chain().
 2016 */
 2017
 2018static int call_netdevice_notifiers_info(unsigned long val,
 2019					 struct netdev_notifier_info *info)
 2020{
 2021	struct net *net = dev_net(info->dev);
 2022	int ret;
 2023
 2024	ASSERT_RTNL();
 2025
 2026	/* Run per-netns notifier block chain first, then run the global one.
 2027	 * Hopefully, one day, the global one is going to be removed after
 2028	 * all notifier block registrators get converted to be per-netns.
 2029	 */
 2030	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 2031	if (ret & NOTIFY_STOP_MASK)
 2032		return ret;
 2033	return raw_notifier_call_chain(&netdev_chain, val, info);
 2034}
 2035
 2036static int call_netdevice_notifiers_extack(unsigned long val,
 2037					   struct net_device *dev,
 2038					   struct netlink_ext_ack *extack)
 2039{
 2040	struct netdev_notifier_info info = {
 2041		.dev = dev,
 2042		.extack = extack,
 2043	};
 2044
 2045	return call_netdevice_notifiers_info(val, &info);
 2046}
 2047
 2048/**
 2049 *	call_netdevice_notifiers - call all network notifier blocks
 2050 *      @val: value passed unmodified to notifier function
 2051 *      @dev: net_device pointer passed unmodified to notifier function
 2052 *
 2053 *	Call all network notifier blocks.  Parameters and return value
 2054 *	are as for raw_notifier_call_chain().
 2055 */
 2056
 2057int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 2058{
 2059	return call_netdevice_notifiers_extack(val, dev, NULL);
 2060}
 2061EXPORT_SYMBOL(call_netdevice_notifiers);
 2062
 2063/**
 2064 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2065 *	@val: value passed unmodified to notifier function
 2066 *	@dev: net_device pointer passed unmodified to notifier function
 2067 *	@arg: additional u32 argument passed to the notifier function
 2068 *
 2069 *	Call all network notifier blocks.  Parameters and return value
 2070 *	are as for raw_notifier_call_chain().
 2071 */
 2072static int call_netdevice_notifiers_mtu(unsigned long val,
 2073					struct net_device *dev, u32 arg)
 2074{
 2075	struct netdev_notifier_info_ext info = {
 2076		.info.dev = dev,
 2077		.ext.mtu = arg,
 2078	};
 2079
 2080	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2081
 2082	return call_netdevice_notifiers_info(val, &info.info);
 2083}
 2084
 2085#ifdef CONFIG_NET_INGRESS
 2086static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 2087
 2088void net_inc_ingress_queue(void)
 2089{
 2090	static_branch_inc(&ingress_needed_key);
 2091}
 2092EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 2093
 2094void net_dec_ingress_queue(void)
 2095{
 2096	static_branch_dec(&ingress_needed_key);
 2097}
 2098EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 2099#endif
 2100
 2101#ifdef CONFIG_NET_EGRESS
 2102static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 2103
 2104void net_inc_egress_queue(void)
 2105{
 2106	static_branch_inc(&egress_needed_key);
 2107}
 2108EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 2109
 2110void net_dec_egress_queue(void)
 2111{
 2112	static_branch_dec(&egress_needed_key);
 2113}
 2114EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 2115#endif
 2116
 2117static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2118#ifdef CONFIG_JUMP_LABEL
 2119static atomic_t netstamp_needed_deferred;
 2120static atomic_t netstamp_wanted;
 2121static void netstamp_clear(struct work_struct *work)
 2122{
 2123	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2124	int wanted;
 2125
 2126	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2127	if (wanted > 0)
 2128		static_branch_enable(&netstamp_needed_key);
 2129	else
 2130		static_branch_disable(&netstamp_needed_key);
 2131}
 2132static DECLARE_WORK(netstamp_work, netstamp_clear);
 2133#endif
 2134
 2135void net_enable_timestamp(void)
 2136{
 2137#ifdef CONFIG_JUMP_LABEL
 2138	int wanted;
 2139
 2140	while (1) {
 2141		wanted = atomic_read(&netstamp_wanted);
 2142		if (wanted <= 0)
 2143			break;
 2144		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
 2145			return;
 2146	}
 2147	atomic_inc(&netstamp_needed_deferred);
 2148	schedule_work(&netstamp_work);
 2149#else
 2150	static_branch_inc(&netstamp_needed_key);
 2151#endif
 2152}
 2153EXPORT_SYMBOL(net_enable_timestamp);
 2154
 2155void net_disable_timestamp(void)
 2156{
 2157#ifdef CONFIG_JUMP_LABEL
 2158	int wanted;
 2159
 2160	while (1) {
 2161		wanted = atomic_read(&netstamp_wanted);
 2162		if (wanted <= 1)
 2163			break;
 2164		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
 2165			return;
 2166	}
 2167	atomic_dec(&netstamp_needed_deferred);
 2168	schedule_work(&netstamp_work);
 2169#else
 2170	static_branch_dec(&netstamp_needed_key);
 2171#endif
 2172}
 2173EXPORT_SYMBOL(net_disable_timestamp);
 2174
 2175static inline void net_timestamp_set(struct sk_buff *skb)
 2176{
 2177	skb->tstamp = 0;
 2178	if (static_branch_unlikely(&netstamp_needed_key))
 2179		__net_timestamp(skb);
 2180}
 2181
 2182#define net_timestamp_check(COND, SKB)				\
 2183	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2184		if ((COND) && !(SKB)->tstamp)			\
 2185			__net_timestamp(SKB);			\
 2186	}							\
 2187
 2188bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 2189{
 2190	unsigned int len;
 2191
 2192	if (!(dev->flags & IFF_UP))
 2193		return false;
 2194
 2195	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
 2196	if (skb->len <= len)
 2197		return true;
 2198
 2199	/* if TSO is enabled, we don't care about the length as the packet
 2200	 * could be forwarded without being segmented before
 2201	 */
 2202	if (skb_is_gso(skb))
 2203		return true;
 2204
 2205	return false;
 2206}
 2207EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2208
 2209int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2210{
 2211	int ret = ____dev_forward_skb(dev, skb);
 2212
 2213	if (likely(!ret)) {
 2214		skb->protocol = eth_type_trans(skb, dev);
 2215		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2216	}
 2217
 2218	return ret;
 2219}
 2220EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2221
 2222/**
 2223 * dev_forward_skb - loopback an skb to another netif
 2224 *
 2225 * @dev: destination network device
 2226 * @skb: buffer to forward
 2227 *
 2228 * return values:
 2229 *	NET_RX_SUCCESS	(no congestion)
 2230 *	NET_RX_DROP     (packet was dropped, but freed)
 2231 *
 2232 * dev_forward_skb can be used for injecting an skb from the
 2233 * start_xmit function of one device into the receive queue
 2234 * of another device.
 2235 *
 2236 * The receiving device may be in another namespace, so
 2237 * we have to clear all information in the skb that could
 2238 * impact namespace isolation.
 2239 */
 2240int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2241{
 2242	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 2243}
 2244EXPORT_SYMBOL_GPL(dev_forward_skb);
 2245
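/*
 * Sketch of the intended use from a virtual device's ndo_start_xmit
 * (my_get_peer() is a hypothetical helper; real drivers such as veth
 * also keep per-CPU statistics):
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */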
 2246static inline int deliver_skb(struct sk_buff *skb,
 2247			      struct packet_type *pt_prev,
 2248			      struct net_device *orig_dev)
 2249{
 2250	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2251		return -ENOMEM;
 2252	refcount_inc(&skb->users);
 2253	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2254}
 2255
 2256static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2257					  struct packet_type **pt,
 2258					  struct net_device *orig_dev,
 2259					  __be16 type,
 2260					  struct list_head *ptype_list)
 2261{
 2262	struct packet_type *ptype, *pt_prev = *pt;
 2263
 2264	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2265		if (ptype->type != type)
 2266			continue;
 2267		if (pt_prev)
 2268			deliver_skb(skb, pt_prev, orig_dev);
 2269		pt_prev = ptype;
 2270	}
 2271	*pt = pt_prev;
 2272}
 2273
 2274static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2275{
 2276	if (!ptype->af_packet_priv || !skb->sk)
 2277		return false;
 2278
 2279	if (ptype->id_match)
 2280		return ptype->id_match(ptype, skb->sk);
 2281	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2282		return true;
 2283
 2284	return false;
 2285}
 2286
 2287/**
 2288 * dev_nit_active - return true if any network interface taps are in use
 2289 *
 2290 * @dev: network device to check for the presence of taps
 2291 */
 2292bool dev_nit_active(struct net_device *dev)
 2293{
 2294	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 2295}
 2296EXPORT_SYMBOL_GPL(dev_nit_active);
 2297
 2298/*
 2299 *	Support routine. Sends outgoing frames to any network
 2300 *	taps currently in use.
 2301 */
 2302
 2303void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 2304{
 2305	struct packet_type *ptype;
 2306	struct sk_buff *skb2 = NULL;
 2307	struct packet_type *pt_prev = NULL;
 2308	struct list_head *ptype_list = &ptype_all;
 2309
 2310	rcu_read_lock();
 2311again:
 2312	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2313		if (ptype->ignore_outgoing)
 2314			continue;
 2315
 2316		/* Never send packets back to the socket
 2317		 * they originated from - MvS (miquels@drinkel.ow.org)
 2318		 */
 2319		if (skb_loop_sk(ptype, skb))
 2320			continue;
 2321
 2322		if (pt_prev) {
 2323			deliver_skb(skb2, pt_prev, skb->dev);
 2324			pt_prev = ptype;
 2325			continue;
 2326		}
 2327
 2328		/* need to clone skb, done only once */
 2329		skb2 = skb_clone(skb, GFP_ATOMIC);
 2330		if (!skb2)
 2331			goto out_unlock;
 2332
 2333		net_timestamp_set(skb2);
 2334
 2335		/* skb->nh should be correctly
 2336		 * set by sender, so that the second statement is
 2337		 * just protection against buggy protocols.
 2338		 */
 2339		skb_reset_mac_header(skb2);
 2340
 2341		if (skb_network_header(skb2) < skb2->data ||
 2342		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 2343			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 2344					     ntohs(skb2->protocol),
 2345					     dev->name);
 2346			skb_reset_network_header(skb2);
 2347		}
 2348
 2349		skb2->transport_header = skb2->network_header;
 2350		skb2->pkt_type = PACKET_OUTGOING;
 2351		pt_prev = ptype;
 2352	}
 2353
 2354	if (ptype_list == &ptype_all) {
 2355		ptype_list = &dev->ptype_all;
 2356		goto again;
 2357	}
 2358out_unlock:
 2359	if (pt_prev) {
 2360		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 2361			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 2362		else
 2363			kfree_skb(skb2);
 2364	}
 2365	rcu_read_unlock();
 2366}
 2367EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2368
 2369/**
 2370 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2371 * @dev: Network device
 2372 * @txq: number of queues available
 2373 *
 2374 * If real_num_tx_queues is changed the tc mappings may no longer be
 2375 * valid. To resolve this verify the tc mapping remains valid and if
 2376 * not, nullify the mapping. With no priorities mapping to this
 2377 * offset/count pair it will no longer be used. In the worst case, if
 2378 * TC0 is invalid, nothing can be done, so disable priority mappings. It is
 2379 * expected that drivers will fix this mapping if they can before
 2380 * calling netif_set_real_num_tx_queues.
 2381 */
 2382static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2383{
 2384	int i;
 2385	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2386
 2387	/* If TC0 is invalidated disable TC mapping */
 2388	if (tc->offset + tc->count > txq) {
 2389		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2390		dev->num_tc = 0;
 2391		return;
 2392	}
 2393
 2394	/* Invalidated prio to tc mappings set to TC0 */
 2395	for (i = 1; i < TC_BITMASK + 1; i++) {
 2396		int q = netdev_get_prio_tc_map(dev, i);
 2397
 2398		tc = &dev->tc_to_txq[q];
 2399		if (tc->offset + tc->count > txq) {
 2400			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2401				i, q);
 2402			netdev_set_prio_tc_map(dev, i, 0);
 2403		}
 2404	}
 2405}
 2406
 2407int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2408{
 2409	if (dev->num_tc) {
 2410		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2411		int i;
 2412
 2413		/* walk through the TCs and see if it falls into any of them */
 2414		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2415			if ((txq - tc->offset) < tc->count)
 2416				return i;
 2417		}
 2418
 2419		/* didn't find it, just return -1 to indicate no match */
 2420		return -1;
 2421	}
 2422
 2423	return 0;
 2424}
 2425EXPORT_SYMBOL(netdev_txq_to_tc);
 2426
 2427#ifdef CONFIG_XPS
 2428struct static_key xps_needed __read_mostly;
 2429EXPORT_SYMBOL(xps_needed);
 2430struct static_key xps_rxqs_needed __read_mostly;
 2431EXPORT_SYMBOL(xps_rxqs_needed);
 2432static DEFINE_MUTEX(xps_map_mutex);
 2433#define xmap_dereference(P)		\
 2434	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2435
 2436static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2437			     int tci, u16 index)
 2438{
 2439	struct xps_map *map = NULL;
 2440	int pos;
 2441
 2442	if (dev_maps)
 2443		map = xmap_dereference(dev_maps->attr_map[tci]);
 2444	if (!map)
 2445		return false;
 2446
 2447	for (pos = map->len; pos--;) {
 2448		if (map->queues[pos] != index)
 2449			continue;
 2450
 2451		if (map->len > 1) {
 2452			map->queues[pos] = map->queues[--map->len];
 2453			break;
 2454		}
 2455
 2456		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2457		kfree_rcu(map, rcu);
 2458		return false;
 2459	}
 2460
 2461	return true;
 2462}
 2463
 2464static bool remove_xps_queue_cpu(struct net_device *dev,
 2465				 struct xps_dev_maps *dev_maps,
 2466				 int cpu, u16 offset, u16 count)
 2467{
 2468	int num_tc = dev->num_tc ? : 1;
 2469	bool active = false;
 2470	int tci;
 2471
 2472	for (tci = cpu * num_tc; num_tc--; tci++) {
 2473		int i, j;
 2474
 2475		for (i = count, j = offset; i--; j++) {
 2476			if (!remove_xps_queue(dev_maps, tci, j))
 2477				break;
 2478		}
 2479
 2480		active |= i < 0;
 2481	}
 2482
 2483	return active;
 2484}
 2485
 2486static void reset_xps_maps(struct net_device *dev,
 2487			   struct xps_dev_maps *dev_maps,
 2488			   bool is_rxqs_map)
 2489{
 2490	if (is_rxqs_map) {
 2491		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2492		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 2493	} else {
 2494		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 2495	}
 2496	static_key_slow_dec_cpuslocked(&xps_needed);
 2497	kfree_rcu(dev_maps, rcu);
 2498}
 2499
 2500static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 2501			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 2502			   u16 offset, u16 count, bool is_rxqs_map)
 2503{
 2504	bool active = false;
 2505	int i, j;
 2506
 2507	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 2508	     j < nr_ids;)
 2509		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 2510					       count);
 2511	if (!active)
 2512		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2513
 2514	if (!is_rxqs_map) {
 2515		for (i = offset + (count - 1); count--; i--) {
 2516			netdev_queue_numa_node_write(
 2517				netdev_get_tx_queue(dev, i),
 2518				NUMA_NO_NODE);
 2519		}
 2520	}
 2521}
 2522
 2523static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2524				   u16 count)
 2525{
 2526	const unsigned long *possible_mask = NULL;
 2527	struct xps_dev_maps *dev_maps;
 2528	unsigned int nr_ids;
 2529
 2530	if (!static_key_false(&xps_needed))
 2531		return;
 2532
 2533	cpus_read_lock();
 2534	mutex_lock(&xps_map_mutex);
 2535
 2536	if (static_key_false(&xps_rxqs_needed)) {
 2537		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2538		if (dev_maps) {
 2539			nr_ids = dev->num_rx_queues;
 2540			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 2541				       offset, count, true);
 2542		}
 2543	}
 2544
 2545	dev_maps = xmap_dereference(dev->xps_cpus_map);
 2546	if (!dev_maps)
 2547		goto out_no_maps;
 2548
 2549	if (num_possible_cpus() > 1)
 2550		possible_mask = cpumask_bits(cpu_possible_mask);
 2551	nr_ids = nr_cpu_ids;
 2552	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 2553		       false);
 2554
 2555out_no_maps:
 2556	mutex_unlock(&xps_map_mutex);
 2557	cpus_read_unlock();
 2558}
 2559
 2560static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2561{
 2562	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2563}
 2564
 2565static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2566				      u16 index, bool is_rxqs_map)
 2567{
 2568	struct xps_map *new_map;
 2569	int alloc_len = XPS_MIN_MAP_ALLOC;
 2570	int i, pos;
 2571
 2572	for (pos = 0; map && pos < map->len; pos++) {
 2573		if (map->queues[pos] != index)
 2574			continue;
 2575		return map;
 2576	}
 2577
 2578	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2579	if (map) {
 2580		if (pos < map->alloc_len)
 2581			return map;
 2582
 2583		alloc_len = map->alloc_len * 2;
 2584	}
 2585
 2586	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2587	 *  map
 2588	 */
 2589	if (is_rxqs_map)
 2590		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2591	else
 2592		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2593				       cpu_to_node(attr_index));
 2594	if (!new_map)
 2595		return NULL;
 2596
 2597	for (i = 0; i < pos; i++)
 2598		new_map->queues[i] = map->queues[i];
 2599	new_map->alloc_len = alloc_len;
 2600	new_map->len = pos;
 2601
 2602	return new_map;
 2603}
 2604
 2605/* Must be called under cpus_read_lock */
 2606int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 2607			  u16 index, bool is_rxqs_map)
 2608{
 2609	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 2610	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
 2611	int i, j, tci, numa_node_id = -2;
 2612	int maps_sz, num_tc = 1, tc = 0;
 2613	struct xps_map *map, *new_map;
 2614	bool active = false;
 2615	unsigned int nr_ids;
 2616
 2617	if (dev->num_tc) {
 2618		/* Do not allow XPS on subordinate device directly */
 2619		num_tc = dev->num_tc;
 2620		if (num_tc < 0)
 2621			return -EINVAL;
 2622
 2623		/* If queue belongs to subordinate dev use its map */
 2624		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 2625
 2626		tc = netdev_txq_to_tc(dev, index);
 2627		if (tc < 0)
 2628			return -EINVAL;
 2629	}
 2630
 2631	mutex_lock(&xps_map_mutex);
 2632	if (is_rxqs_map) {
 2633		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 2634		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2635		nr_ids = dev->num_rx_queues;
 2636	} else {
 2637		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 2638		if (num_possible_cpus() > 1) {
 2639			online_mask = cpumask_bits(cpu_online_mask);
 2640			possible_mask = cpumask_bits(cpu_possible_mask);
 2641		}
 2642		dev_maps = xmap_dereference(dev->xps_cpus_map);
 2643		nr_ids = nr_cpu_ids;
 2644	}
 2645
 2646	if (maps_sz < L1_CACHE_BYTES)
 2647		maps_sz = L1_CACHE_BYTES;
 2648
 2649	/* allocate memory for queue storage */
 2650	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
 2651	     j < nr_ids;) {
 2652		if (!new_dev_maps)
 2653			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 2654		if (!new_dev_maps) {
 2655			mutex_unlock(&xps_map_mutex);
 2656			return -ENOMEM;
 2657		}
 2658
 2659		tci = j * num_tc + tc;
 2660		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 2661				 NULL;
 2662
 2663		map = expand_xps_map(map, j, index, is_rxqs_map);
 2664		if (!map)
 2665			goto error;
 2666
 2667		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2668	}
 2669
 2670	if (!new_dev_maps)
 2671		goto out_no_new_maps;
 2672
 2673	if (!dev_maps) {
 2674		/* Increment static keys at most once per type */
 2675		static_key_slow_inc_cpuslocked(&xps_needed);
 2676		if (is_rxqs_map)
 2677			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 2678	}
 2679
 2680	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2681	     j < nr_ids;) {
 2682		/* copy maps belonging to foreign traffic classes */
 2683		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 2684			/* fill in the new device map from the old device map */
 2685			map = xmap_dereference(dev_maps->attr_map[tci]);
 2686			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2687		}
 2688
2689		/* We need to explicitly update tci as the previous loop
 2690		 * could break out early if dev_maps is NULL.
 2691		 */
 2692		tci = j * num_tc + tc;
 2693
 2694		if (netif_attr_test_mask(j, mask, nr_ids) &&
 2695		    netif_attr_test_online(j, online_mask, nr_ids)) {
 2696			/* add tx-queue to CPU/rx-queue maps */
 2697			int pos = 0;
 2698
 2699			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2700			while ((pos < map->len) && (map->queues[pos] != index))
 2701				pos++;
 2702
 2703			if (pos == map->len)
 2704				map->queues[map->len++] = index;
 2705#ifdef CONFIG_NUMA
 2706			if (!is_rxqs_map) {
 2707				if (numa_node_id == -2)
 2708					numa_node_id = cpu_to_node(j);
 2709				else if (numa_node_id != cpu_to_node(j))
 2710					numa_node_id = -1;
 2711			}
 2712#endif
 2713		} else if (dev_maps) {
 2714			/* fill in the new device map from the old device map */
 2715			map = xmap_dereference(dev_maps->attr_map[tci]);
 2716			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2717		}
 2718
 2719		/* copy maps belonging to foreign traffic classes */
 2720		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 2721			/* fill in the new device map from the old device map */
 2722			map = xmap_dereference(dev_maps->attr_map[tci]);
 2723			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2724		}
 2725	}
 2726
 2727	if (is_rxqs_map)
 2728		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
 2729	else
 2730		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 2731
 2732	/* Cleanup old maps */
 2733	if (!dev_maps)
 2734		goto out_no_old_maps;
 2735
 2736	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2737	     j < nr_ids;) {
 2738		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2739			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2740			map = xmap_dereference(dev_maps->attr_map[tci]);
 2741			if (map && map != new_map)
 2742				kfree_rcu(map, rcu);
 2743		}
 2744	}
 2745
 2746	kfree_rcu(dev_maps, rcu);
 2747
 2748out_no_old_maps:
 2749	dev_maps = new_dev_maps;
 2750	active = true;
 2751
 2752out_no_new_maps:
 2753	if (!is_rxqs_map) {
 2754		/* update Tx queue numa node */
 2755		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
 2756					     (numa_node_id >= 0) ?
 2757					     numa_node_id : NUMA_NO_NODE);
 2758	}
 2759
 2760	if (!dev_maps)
 2761		goto out_no_maps;
 2762
 2763	/* removes tx-queue from unused CPUs/rx-queues */
 2764	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2765	     j < nr_ids;) {
 2766		for (i = tc, tci = j * num_tc; i--; tci++)
 2767			active |= remove_xps_queue(dev_maps, tci, index);
 2768		if (!netif_attr_test_mask(j, mask, nr_ids) ||
 2769		    !netif_attr_test_online(j, online_mask, nr_ids))
 2770			active |= remove_xps_queue(dev_maps, tci, index);
 2771		for (i = num_tc - tc, tci++; --i; tci++)
 2772			active |= remove_xps_queue(dev_maps, tci, index);
 2773	}
 2774
 2775	/* free map if not active */
 2776	if (!active)
 2777		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2778
 2779out_no_maps:
 2780	mutex_unlock(&xps_map_mutex);
 2781
 2782	return 0;
 2783error:
 2784	/* remove any maps that we added */
 2785	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2786	     j < nr_ids;) {
 2787		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2788			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2789			map = dev_maps ?
 2790			      xmap_dereference(dev_maps->attr_map[tci]) :
 2791			      NULL;
 2792			if (new_map && new_map != map)
 2793				kfree(new_map);
 2794		}
 2795	}
 2796
 2797	mutex_unlock(&xps_map_mutex);
 2798
 2799	kfree(new_dev_maps);
 2800	return -ENOMEM;
 2801}
 2802EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2803
 2804int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2805			u16 index)
 2806{
 2807	int ret;
 2808
 2809	cpus_read_lock();
 2810	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 2811	cpus_read_unlock();
 2812
 2813	return ret;
 2814}
 2815EXPORT_SYMBOL(netif_set_xps_queue);
 2816
 2817#endif
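/* Usage sketch (not from any in-tree driver): a multiqueue driver that
 * wants transmits from CPU i steered to TX queue i could install one XPS
 * map per queue at open time:
 *
 *	int cpu;
 *
 *	for (cpu = 0; cpu < min_t(int, num_online_cpus(),
 *				  dev->real_num_tx_queues); cpu++)
 *		netif_set_xps_queue(dev, cpumask_of(cpu), cpu);
 *
 * netif_set_xps_queue() takes cpus_read_lock() itself; a caller already
 * holding it would use __netif_set_xps_queue() directly.
 */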
 2818static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2819{
 2820	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2821
 2822	/* Unbind any subordinate channels */
 2823	while (txq-- != &dev->_tx[0]) {
 2824		if (txq->sb_dev)
 2825			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2826	}
 2827}
 2828
 2829void netdev_reset_tc(struct net_device *dev)
 2830{
 2831#ifdef CONFIG_XPS
 2832	netif_reset_xps_queues_gt(dev, 0);
 2833#endif
 2834	netdev_unbind_all_sb_channels(dev);
 2835
 2836	/* Reset TC configuration of device */
 2837	dev->num_tc = 0;
 2838	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2839	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2840}
 2841EXPORT_SYMBOL(netdev_reset_tc);
 2842
 2843int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2844{
 2845	if (tc >= dev->num_tc)
 2846		return -EINVAL;
 2847
 2848#ifdef CONFIG_XPS
 2849	netif_reset_xps_queues(dev, offset, count);
 2850#endif
 2851	dev->tc_to_txq[tc].count = count;
 2852	dev->tc_to_txq[tc].offset = offset;
 2853	return 0;
 2854}
 2855EXPORT_SYMBOL(netdev_set_tc_queue);
 2856
 2857int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2858{
 2859	if (num_tc > TC_MAX_QUEUE)
 2860		return -EINVAL;
 2861
 2862#ifdef CONFIG_XPS
 2863	netif_reset_xps_queues_gt(dev, 0);
 2864#endif
 2865	netdev_unbind_all_sb_channels(dev);
 2866
 2867	dev->num_tc = num_tc;
 2868	return 0;
 2869}
 2870EXPORT_SYMBOL(netdev_set_num_tc);
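/* A minimal sketch of how the two helpers above combine (queue counts are
 * invented): a driver exposing two traffic classes over eight TX queues
 * would, under rtnl_lock, do
 *
 *	netdev_reset_tc(dev);
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	TC 0 -> queues 0..3
 *	netdev_set_tc_queue(dev, 1, 4, 4);	TC 1 -> queues 4..7
 *
 * netdev_set_tc_queue() rejects tc >= dev->num_tc, so the number of
 * classes must be set first.
 */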
 2871
 2872void netdev_unbind_sb_channel(struct net_device *dev,
 2873			      struct net_device *sb_dev)
 2874{
 2875	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2876
 2877#ifdef CONFIG_XPS
 2878	netif_reset_xps_queues_gt(sb_dev, 0);
 2879#endif
 2880	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2881	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2882
 2883	while (txq-- != &dev->_tx[0]) {
 2884		if (txq->sb_dev == sb_dev)
 2885			txq->sb_dev = NULL;
 2886	}
 2887}
 2888EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2889
 2890int netdev_bind_sb_channel_queue(struct net_device *dev,
 2891				 struct net_device *sb_dev,
 2892				 u8 tc, u16 count, u16 offset)
 2893{
 2894	/* Make certain the sb_dev and dev are already configured */
 2895	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2896		return -EINVAL;
 2897
 2898	/* We cannot hand out queues we don't have */
 2899	if ((offset + count) > dev->real_num_tx_queues)
 2900		return -EINVAL;
 2901
 2902	/* Record the mapping */
 2903	sb_dev->tc_to_txq[tc].count = count;
 2904	sb_dev->tc_to_txq[tc].offset = offset;
 2905
 2906	/* Provide a way for Tx queue to find the tc_to_txq map or
 2907	 * XPS map for itself.
 2908	 */
 2909	while (count--)
 2910		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2911
 2912	return 0;
 2913}
 2914EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2915
 2916int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2917{
 2918	/* Do not use a multiqueue device to represent a subordinate channel */
 2919	if (netif_is_multiqueue(dev))
 2920		return -ENODEV;
 2921
 2922	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2923	 * Channel 0 is meant to be "native" mode and used only to represent
 2924	 * the main root device. We allow writing 0 to reset the device back
 2925	 * to normal mode after being used as a subordinate channel.
 2926	 */
 2927	if (channel > S16_MAX)
 2928		return -EINVAL;
 2929
 2930	dev->num_tc = -channel;
 2931
 2932	return 0;
 2933}
 2934EXPORT_SYMBOL(netdev_set_sb_channel);
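/* Hedged usage sketch (device and queue numbers invented): an upper device
 * being offloaded onto a slice of its lower device's TX queues is first
 * marked as a subordinate channel, then handed a block of queues, assuming
 * the lower device already has at least one traffic class configured:
 *
 *	netdev_set_sb_channel(upper_dev, 1);
 *	netdev_bind_sb_channel_queue(lower_dev, upper_dev, 0, 4, 16);
 *
 * After this, TX queues 16..19 of lower_dev carry upper_dev in their sb_dev
 * pointer, so tc_to_txq and XPS lookups for them resolve through upper_dev.
 */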
 2935
 2936/*
2937 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2938 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 2939 */
 2940int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2941{
 2942	bool disabling;
 2943	int rc;
 2944
 2945	disabling = txq < dev->real_num_tx_queues;
 2946
 2947	if (txq < 1 || txq > dev->num_tx_queues)
 2948		return -EINVAL;
 2949
 2950	if (dev->reg_state == NETREG_REGISTERED ||
 2951	    dev->reg_state == NETREG_UNREGISTERING) {
 2952		ASSERT_RTNL();
 2953
 2954		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2955						  txq);
 2956		if (rc)
 2957			return rc;
 2958
 2959		if (dev->num_tc)
 2960			netif_setup_tc(dev, txq);
 2961
 2962		dev->real_num_tx_queues = txq;
 2963
 2964		if (disabling) {
 2965			synchronize_net();
 2966			qdisc_reset_all_tx_gt(dev, txq);
 2967#ifdef CONFIG_XPS
 2968			netif_reset_xps_queues_gt(dev, txq);
 2969#endif
 2970		}
 2971	} else {
 2972		dev->real_num_tx_queues = txq;
 2973	}
 2974
 
 2975	return 0;
 2976}
 2977EXPORT_SYMBOL(netif_set_real_num_tx_queues);
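/* Sketch of a typical caller (error handling elided, new_txq_count is a
 * placeholder): a driver that just renegotiated its channel count with the
 * hardware shrinks or grows the set of queues in use under RTNL:
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, new_txq_count);
 *	rtnl_unlock();
 *
 * When the count shrinks, the synchronize_net() and qdisc reset above make
 * sure no stale skb remains mapped to a now-disabled queue.
 */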
 2978
 2979#ifdef CONFIG_SYSFS
 2980/**
 2981 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2982 *	@dev: Network device
 2983 *	@rxq: Actual number of RX queues
 2984 *
 2985 *	This must be called either with the rtnl_lock held or before
 2986 *	registration of the net device.  Returns 0 on success, or a
 2987 *	negative error code.  If called before registration, it always
 2988 *	succeeds.
 2989 */
 2990int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2991{
 2992	int rc;
 2993
 2994	if (rxq < 1 || rxq > dev->num_rx_queues)
 2995		return -EINVAL;
 2996
 2997	if (dev->reg_state == NETREG_REGISTERED) {
 2998		ASSERT_RTNL();
 2999
 3000		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 3001						  rxq);
 3002		if (rc)
 3003			return rc;
 3004	}
 3005
 3006	dev->real_num_rx_queues = rxq;
 3007	return 0;
 3008}
 3009EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 3010#endif
 3011
 3012/**
 3013 * netif_get_num_default_rss_queues - default number of RSS queues
 3014 *
 3015 * This routine should set an upper limit on the number of RSS queues
 3016 * used by default by multiqueue devices.
 3017 */
 3018int netif_get_num_default_rss_queues(void)
 3019{
 3020	return is_kdump_kernel() ?
 3021		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 3022}
 3023EXPORT_SYMBOL(netif_get_num_default_rss_queues);
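/* A small sketch combining the helpers above (hw_max_rx_queues stands for
 * whatever limit the device reports): at probe time a driver typically
 * clamps its RX queue count to both the hardware limit and the RSS default:
 *
 *	unsigned int nrxq = min_t(unsigned int, hw_max_rx_queues,
 *				  netif_get_num_default_rss_queues());
 *
 *	err = netif_set_real_num_rx_queues(dev, nrxq);
 *
 * Called before register_netdev(), this cannot fail for a count within
 * [1, dev->num_rx_queues].
 */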
 3024
 3025static void __netif_reschedule(struct Qdisc *q)
 3026{
 3027	struct softnet_data *sd;
 3028	unsigned long flags;
 3029
 3030	local_irq_save(flags);
 3031	sd = this_cpu_ptr(&softnet_data);
 3032	q->next_sched = NULL;
 3033	*sd->output_queue_tailp = q;
 3034	sd->output_queue_tailp = &q->next_sched;
 3035	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3036	local_irq_restore(flags);
 3037}
 3038
 3039void __netif_schedule(struct Qdisc *q)
 3040{
 3041	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3042		__netif_reschedule(q);
 3043}
 3044EXPORT_SYMBOL(__netif_schedule);
 3045
 3046struct dev_kfree_skb_cb {
 3047	enum skb_free_reason reason;
 3048};
 3049
 3050static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3051{
 3052	return (struct dev_kfree_skb_cb *)skb->cb;
 3053}
 
 3054
 3055void netif_schedule_queue(struct netdev_queue *txq)
 3056{
 3057	rcu_read_lock();
 3058	if (!netif_xmit_stopped(txq)) {
 3059		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3060
 3061		__netif_schedule(q);
 3062	}
 3063	rcu_read_unlock();
 3064}
 3065EXPORT_SYMBOL(netif_schedule_queue);
 3066
 3067void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3068{
 3069	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3070		struct Qdisc *q;
 3071
 3072		rcu_read_lock();
 3073		q = rcu_dereference(dev_queue->qdisc);
 3074		__netif_schedule(q);
 3075		rcu_read_unlock();
 3076	}
 3077}
 3078EXPORT_SYMBOL(netif_tx_wake_queue);
 3079
 3080void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 3081{
 3082	unsigned long flags;
 3083
 3084	if (unlikely(!skb))
 3085		return;
 3086
 3087	if (likely(refcount_read(&skb->users) == 1)) {
 3088		smp_rmb();
 3089		refcount_set(&skb->users, 0);
 3090	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 3091		return;
 3092	}
 3093	get_kfree_skb_cb(skb)->reason = reason;
 3094	local_irq_save(flags);
 3095	skb->next = __this_cpu_read(softnet_data.completion_queue);
 3096	__this_cpu_write(softnet_data.completion_queue, skb);
 3097	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3098	local_irq_restore(flags);
 3099}
 3100EXPORT_SYMBOL(__dev_kfree_skb_irq);
 3101
 3102void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 3103{
 3104	if (in_irq() || irqs_disabled())
 3105		__dev_kfree_skb_irq(skb, reason);
 3106	else
 3107		dev_kfree_skb(skb);
 3108}
 3109EXPORT_SYMBOL(__dev_kfree_skb_any);
 3110
 3111
 3112/**
 3113 * netif_device_detach - mark device as removed
 3114 * @dev: network device
 3115 *
 3116 * Mark device as removed from system and therefore no longer available.
 3117 */
 3118void netif_device_detach(struct net_device *dev)
 3119{
 3120	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3121	    netif_running(dev)) {
 3122		netif_tx_stop_all_queues(dev);
 3123	}
 3124}
 3125EXPORT_SYMBOL(netif_device_detach);
 3126
 3127/**
 3128 * netif_device_attach - mark device as attached
 3129 * @dev: network device
 3130 *
 3131 * Mark device as attached from system and restart if needed.
 3132 */
 3133void netif_device_attach(struct net_device *dev)
 3134{
 3135	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3136	    netif_running(dev)) {
 3137		netif_tx_wake_all_queues(dev);
 3138		__netdev_watchdog_up(dev);
 3139	}
 3140}
 3141EXPORT_SYMBOL(netif_device_attach);
 3142
 3143/*
3144 * Returns a Tx hash based on the given packet descriptor and a Tx queue
3145 * count to be used as a distribution range.
3146 */
 3147static u16 skb_tx_hash(const struct net_device *dev,
 3148		       const struct net_device *sb_dev,
 3149		       struct sk_buff *skb)
 3150{
 3151	u32 hash;
 3152	u16 qoffset = 0;
 3153	u16 qcount = dev->real_num_tx_queues;
 3154
 3155	if (dev->num_tc) {
 3156		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3157
 3158		qoffset = sb_dev->tc_to_txq[tc].offset;
 3159		qcount = sb_dev->tc_to_txq[tc].count;
 3160	}
 3161
 3162	if (skb_rx_queue_recorded(skb)) {
 3163		hash = skb_get_rx_queue(skb);
 3164		if (hash >= qoffset)
 3165			hash -= qoffset;
 3166		while (unlikely(hash >= qcount))
 3167			hash -= qcount;
 3168		return hash + qoffset;
 3169	}
 3170
 3171	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3172}
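/* For example, with the skb's traffic class mapping to qoffset = 4 and
 * qcount = 4, a recorded RX queue of 9 folds as 9 - 4 = 5, then 5 - 4 = 1,
 * and the function returns 1 + 4 = 5, i.e. always one of queues 4..7.
 */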
 3173
 3174static void skb_warn_bad_offload(const struct sk_buff *skb)
 3175{
 3176	static const netdev_features_t null_features;
 3177	struct net_device *dev = skb->dev;
 3178	const char *name = "";
 3179
 3180	if (!net_ratelimit())
 3181		return;
 3182
 3183	if (dev) {
 3184		if (dev->dev.parent)
 3185			name = dev_driver_string(dev->dev.parent);
 3186		else
 3187			name = netdev_name(dev);
 3188	}
 3189	skb_dump(KERN_WARNING, skb, false);
 3190	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3191	     name, dev ? &dev->features : &null_features,
 3192	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
3193}
 3194
 3195/*
 3196 * Invalidate hardware checksum when packet is to be mangled, and
 3197 * complete checksum manually on outgoing path.
 3198 */
 3199int skb_checksum_help(struct sk_buff *skb)
 3200{
 3201	__wsum csum;
 3202	int ret = 0, offset;
 3203
 3204	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3205		goto out_set_summed;
 3206
 3207	if (unlikely(skb_shinfo(skb)->gso_size)) {
 3208		skb_warn_bad_offload(skb);
 3209		return -EINVAL;
 3210	}
 3211
 3212	/* Before computing a checksum, we should make sure no frag could
3213	 * be modified by an external entity: the checksum could be wrong.
 3214	 */
 3215	if (skb_has_shared_frag(skb)) {
 3216		ret = __skb_linearize(skb);
 3217		if (ret)
 3218			goto out;
 3219	}
 3220
 3221	offset = skb_checksum_start_offset(skb);
 3222	BUG_ON(offset >= skb_headlen(skb));
 3223	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3224
 3225	offset += skb->csum_offset;
 3226	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
 3227
 3228	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3229	if (ret)
 3230		goto out;
 3231
 3232	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3233out_set_summed:
 3234	skb->ip_summed = CHECKSUM_NONE;
 3235out:
 3236	return ret;
 3237}
 3238EXPORT_SYMBOL(skb_checksum_help);
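/* Typical fallback in a driver xmit path when the hardware cannot checksum
 * a given protocol (a sketch; the drop label is illustrative):
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
 *		goto drop;
 *
 * Most drivers never open-code this: validate_xmit_skb() below already
 * calls skb_csum_hwoffload_help() when the device lacks the feature bits.
 */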
 3239
 3240int skb_crc32c_csum_help(struct sk_buff *skb)
 3241{
 3242	__le32 crc32c_csum;
 3243	int ret = 0, offset, start;
 3244
 3245	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3246		goto out;
 3247
 3248	if (unlikely(skb_is_gso(skb)))
 3249		goto out;
 3250
 3251	/* Before computing a checksum, we should make sure no frag could
3252	 * be modified by an external entity: the checksum could be wrong.
 3253	 */
 3254	if (unlikely(skb_has_shared_frag(skb))) {
 3255		ret = __skb_linearize(skb);
 3256		if (ret)
 3257			goto out;
 3258	}
 3259	start = skb_checksum_start_offset(skb);
 3260	offset = start + offsetof(struct sctphdr, checksum);
 3261	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3262		ret = -EINVAL;
 3263		goto out;
 3264	}
 3265
 3266	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3267	if (ret)
 3268		goto out;
 3269
 3270	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3271						  skb->len - start, ~(__u32)0,
 3272						  crc32c_csum_stub));
 3273	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3274	skb->ip_summed = CHECKSUM_NONE;
 3275	skb->csum_not_inet = 0;
 3276out:
 3277	return ret;
 3278}
 3279
 3280__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3281{
 3282	__be16 type = skb->protocol;
 3283
 3284	/* Tunnel gso handlers can set protocol to ethernet. */
 3285	if (type == htons(ETH_P_TEB)) {
 3286		struct ethhdr *eth;
 3287
 3288		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3289			return 0;
 3290
 3291		eth = (struct ethhdr *)skb->data;
 3292		type = eth->h_proto;
 3293	}
 3294
 3295	return __vlan_get_protocol(skb, type, depth);
 3296}
 3297
 3298/**
 3299 *	skb_mac_gso_segment - mac layer segmentation handler.
 3300 *	@skb: buffer to segment
3301 *	@features: features for the output path (see dev->features)
 3302 */
 3303struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 3304				    netdev_features_t features)
 3305{
 3306	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 3307	struct packet_offload *ptype;
 3308	int vlan_depth = skb->mac_len;
 3309	__be16 type = skb_network_protocol(skb, &vlan_depth);
 
 3310
 3311	if (unlikely(!type))
 3312		return ERR_PTR(-EINVAL);
 3313
 3314	__skb_pull(skb, vlan_depth);
 
 3315
 3316	rcu_read_lock();
 3317	list_for_each_entry_rcu(ptype, &offload_base, list) {
 3318		if (ptype->type == type && ptype->callbacks.gso_segment) {
 3319			segs = ptype->callbacks.gso_segment(skb, features);
 3320			break;
 3321		}
 3322	}
 3323	rcu_read_unlock();
 3324
 3325	__skb_push(skb, skb->data - skb_mac_header(skb));
 3326
 3327	return segs;
 3328}
 3329EXPORT_SYMBOL(skb_mac_gso_segment);
3330
 3331
 3332/* openvswitch calls this on rx path, so we need a different check.
 3333 */
 3334static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 3335{
 3336	if (tx_path)
 3337		return skb->ip_summed != CHECKSUM_PARTIAL &&
 3338		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 3339
 3340	return skb->ip_summed == CHECKSUM_NONE;
3341}
 3342
 3343/**
 3344 *	__skb_gso_segment - Perform segmentation on skb.
 3345 *	@skb: buffer to segment
 3346 *	@features: features for the output path (see dev->features)
 3347 *	@tx_path: whether it is called in TX path
 3348 *
 3349 *	This function segments the given skb and returns a list of segments.
 3350 *
 3351 *	It may return NULL if the skb requires no segmentation.  This is
 3352 *	only possible when GSO is used for verifying header integrity.
 3353 *
 3354 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 3355 */
 3356struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 3357				  netdev_features_t features, bool tx_path)
 3358{
 3359	struct sk_buff *segs;
 3360
 3361	if (unlikely(skb_needs_check(skb, tx_path))) {
 3362		int err;
 3363
 3364		/* We're going to init ->check field in TCP or UDP header */
 3365		err = skb_cow_head(skb, 0);
 3366		if (err < 0)
 3367			return ERR_PTR(err);
 3368	}
 3369
 3370	/* Only report GSO partial support if it will enable us to
 3371	 * support segmentation on this frame without needing additional
 3372	 * work.
 3373	 */
 3374	if (features & NETIF_F_GSO_PARTIAL) {
 3375		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3376		struct net_device *dev = skb->dev;
 3377
 3378		partial_features |= dev->features & dev->gso_partial_features;
 3379		if (!skb_gso_ok(skb, features | partial_features))
3380			features &= ~NETIF_F_GSO_PARTIAL;
3381	}
 3382
 3383	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
 3384		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 3385
 3386	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3387	SKB_GSO_CB(skb)->encap_level = 0;
 3388
 3389	skb_reset_mac_header(skb);
 3390	skb_reset_mac_len(skb);
 3391
 3392	segs = skb_mac_gso_segment(skb, features);
 3393
 3394	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3395		skb_warn_bad_offload(skb);
 3396
 3397	return segs;
 3398}
 3399EXPORT_SYMBOL(__skb_gso_segment);
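/* Minimal sketch of consuming the returned segment list; xmit_one_seg() is
 * a stand-in for whatever the caller does with each packet:
 *
 *	struct sk_buff *segs, *next;
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	if (!segs)
 *		return xmit_one_seg(skb);	no segmentation was needed
 *
 *	consume_skb(skb);
 *	for (skb = segs; skb; skb = next) {
 *		next = skb->next;
 *		skb_mark_not_on_list(skb);
 *		xmit_one_seg(skb);
 *	}
 */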
 3400
 3401/* Take action when hardware reception checksum errors are detected. */
 3402#ifdef CONFIG_BUG
 3403void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3404{
 3405	if (net_ratelimit()) {
 3406		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 3407		skb_dump(KERN_ERR, skb, true);
 3408		dump_stack();
 3409	}
 3410}
 3411EXPORT_SYMBOL(netdev_rx_csum_fault);
 3412#endif
 3413
3414/* XXX: check that highmem exists at all on the given machine. */
3415static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3416{
3417#ifdef CONFIG_HIGHMEM
3418	int i;
3419
3420	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3421		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3422			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3423
 3424			if (PageHighMem(skb_frag_page(frag)))
 3425				return 1;
 3426		}
 3427	}
 3428#endif
 3429	return 0;
 3430}
 3431
 3432/* If MPLS offload request, verify we are testing hardware MPLS features
 3433 * instead of standard features for the netdev.
 3434 */
 3435#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3436static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3437					   netdev_features_t features,
 3438					   __be16 type)
 3439{
 3440	if (eth_p_mpls(type))
 3441		features &= skb->dev->mpls_features;
 3442
3443	return features;
3444}
3445#else
3446static netdev_features_t net_mpls_features(struct sk_buff *skb,
3447					   netdev_features_t features,
3448					   __be16 type)
 3449{
 3450	return features;
 3451}
 3452#endif
 3453
 3454static netdev_features_t harmonize_features(struct sk_buff *skb,
 3455	netdev_features_t features)
 3456{
 3457	__be16 type;
 3458
 3459	type = skb_network_protocol(skb, NULL);
 3460	features = net_mpls_features(skb, features, type);
 3461
 3462	if (skb->ip_summed != CHECKSUM_NONE &&
 3463	    !can_checksum_protocol(features, type)) {
 3464		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3465	}
 3466	if (illegal_highdma(skb->dev, skb))
 3467		features &= ~NETIF_F_SG;
 3468
 3469	return features;
 3470}
 3471
 3472netdev_features_t passthru_features_check(struct sk_buff *skb,
 3473					  struct net_device *dev,
3474					  netdev_features_t features)
3475{
3476	return features;
3477}
 3478EXPORT_SYMBOL(passthru_features_check);
 3479
 3480static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3481					     struct net_device *dev,
 3482					     netdev_features_t features)
 3483{
3484	return vlan_features_check(skb, features);
 3485}
 3486
 3487static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3488					    struct net_device *dev,
 3489					    netdev_features_t features)
 3490{
 3491	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3492
 3493	if (gso_segs > dev->gso_max_segs)
 3494		return features & ~NETIF_F_GSO_MASK;
 3495
 3496	/* Support for GSO partial features requires software
 3497	 * intervention before we can actually process the packets
 3498	 * so we need to strip support for any partial features now
 3499	 * and we can pull them back in after we have partially
 3500	 * segmented the frame.
 3501	 */
 3502	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3503		features &= ~dev->gso_partial_features;
 3504
 3505	/* Make sure to clear the IPv4 ID mangling feature if the
 3506	 * IPv4 header has the potential to be fragmented.
 3507	 */
 3508	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3509		struct iphdr *iph = skb->encapsulation ?
 3510				    inner_ip_hdr(skb) : ip_hdr(skb);
 3511
 3512		if (!(iph->frag_off & htons(IP_DF)))
 3513			features &= ~NETIF_F_TSO_MANGLEID;
 3514	}
 3515
 3516	return features;
 3517}
 3518
 3519netdev_features_t netif_skb_features(struct sk_buff *skb)
 3520{
 3521	struct net_device *dev = skb->dev;
 3522	netdev_features_t features = dev->features;
 3523
 3524	if (skb_is_gso(skb))
3525		features = gso_features_check(skb, dev, features);
 3526
 3527	/* If encapsulation offload request, verify we are testing
 3528	 * hardware encapsulation features instead of standard
 3529	 * features for the netdev
 3530	 */
 3531	if (skb->encapsulation)
 3532		features &= dev->hw_enc_features;
 3533
 3534	if (skb_vlan_tagged(skb))
 3535		features = netdev_intersect_features(features,
 3536						     dev->vlan_features |
 3537						     NETIF_F_HW_VLAN_CTAG_TX |
 3538						     NETIF_F_HW_VLAN_STAG_TX);
 3539
 3540	if (dev->netdev_ops->ndo_features_check)
 3541		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3542								features);
 3543	else
 3544		features &= dflt_features_check(skb, dev, features);
 3545
3546	return harmonize_features(skb, features);
 3547}
 3548EXPORT_SYMBOL(netif_skb_features);
 3549
 3550static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3551		    struct netdev_queue *txq, bool more)
 3552{
 3553	unsigned int len;
 3554	int rc;
 3555
 3556	if (dev_nit_active(dev))
 3557		dev_queue_xmit_nit(skb, dev);
 3558
 3559	len = skb->len;
 3560	trace_net_dev_start_xmit(skb, dev);
 3561	rc = netdev_start_xmit(skb, dev, txq, more);
 3562	trace_net_dev_xmit(skb, rc, dev, len);
 3563
 3564	return rc;
 3565}
 3566
 3567struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3568				    struct netdev_queue *txq, int *ret)
 3569{
 3570	struct sk_buff *skb = first;
 3571	int rc = NETDEV_TX_OK;
 
 3572
 3573	while (skb) {
 3574		struct sk_buff *next = skb->next;
 3575
 3576		skb_mark_not_on_list(skb);
 3577		rc = xmit_one(skb, dev, txq, next != NULL);
 3578		if (unlikely(!dev_xmit_complete(rc))) {
 3579			skb->next = next;
 3580			goto out;
 3581		}
 3582
 3583		skb = next;
 3584		if (netif_tx_queue_stopped(txq) && skb) {
 3585			rc = NETDEV_TX_BUSY;
 3586			break;
 3587		}
 3588	}
 3589
 3590out:
 3591	*ret = rc;
 3592	return skb;
 3593}
 3594
 3595static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3596					  netdev_features_t features)
 3597{
 3598	if (skb_vlan_tag_present(skb) &&
 3599	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3600		skb = __vlan_hwaccel_push_inside(skb);
 3601	return skb;
 3602}
 3603
 3604int skb_csum_hwoffload_help(struct sk_buff *skb,
 3605			    const netdev_features_t features)
 3606{
 3607	if (unlikely(skb->csum_not_inet))
 3608		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3609			skb_crc32c_csum_help(skb);
 3610
 3611	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 3612}
 3613EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3614
 3615static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3616{
3617	netdev_features_t features;
 3618
 3619	features = netif_skb_features(skb);
 3620	skb = validate_xmit_vlan(skb, features);
 3621	if (unlikely(!skb))
 3622		goto out_null;
 3623
 3624	skb = sk_validate_xmit_skb(skb, dev);
 3625	if (unlikely(!skb))
 3626		goto out_null;
 3627
 3628	if (netif_needs_gso(skb, features)) {
 3629		struct sk_buff *segs;
 3630
 3631		segs = skb_gso_segment(skb, features);
 3632		if (IS_ERR(segs)) {
 3633			goto out_kfree_skb;
 3634		} else if (segs) {
 3635			consume_skb(skb);
 3636			skb = segs;
 3637		}
 3638	} else {
 3639		if (skb_needs_linearize(skb, features) &&
 3640		    __skb_linearize(skb))
 3641			goto out_kfree_skb;
 3642
 3643		/* If packet is not checksummed and device does not
 3644		 * support checksumming for this protocol, complete
 3645		 * checksumming here.
 3646		 */
 3647		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3648			if (skb->encapsulation)
 3649				skb_set_inner_transport_header(skb,
 3650							       skb_checksum_start_offset(skb));
 3651			else
 3652				skb_set_transport_header(skb,
 3653							 skb_checksum_start_offset(skb));
 3654			if (skb_csum_hwoffload_help(skb, features))
3655				goto out_kfree_skb;
3656		}
 3657	}
 3658
3659	skb = validate_xmit_xfrm(skb, features, again);
3660
3661	return skb;
3662
 3663out_kfree_skb:
 3664	kfree_skb(skb);
 3665out_null:
 3666	atomic_long_inc(&dev->tx_dropped);
 3667	return NULL;
 3668}
 3669
3670struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3671{
3672	struct sk_buff *next, *head = NULL, *tail;
 3673
 3674	for (; skb != NULL; skb = next) {
 3675		next = skb->next;
3676		skb_mark_not_on_list(skb);
3677
3678		/* in case skb won't be segmented, point to itself */
3679		skb->prev = skb;
 3680
 3681		skb = validate_xmit_skb(skb, dev, again);
 3682		if (!skb)
 3683			continue;
 3684
 3685		if (!head)
 3686			head = skb;
 3687		else
 3688			tail->next = skb;
 3689		/* If skb was segmented, skb->prev points to
 3690		 * the last segment. If not, it still contains skb.
 3691		 */
 3692		tail = skb->prev;
 
 3693	}
 3694	return head;
 3695}
 3696EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3697
 3698static void qdisc_pkt_len_init(struct sk_buff *skb)
 3699{
3700	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3701
3702	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3703
 3704	/* To get more precise estimation of bytes sent on wire,
 3705	 * we add to pkt_len the headers size of all segments
 3706	 */
 3707	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3708		unsigned int hdr_len;
 3709		u16 gso_segs = shinfo->gso_segs;
 3710
 3711		/* mac layer + network layer */
 3712		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3713
 3714		/* + transport layer */
 3715		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3716			const struct tcphdr *th;
 3717			struct tcphdr _tcphdr;
 3718
 3719			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3720						sizeof(_tcphdr), &_tcphdr);
 3721			if (likely(th))
 3722				hdr_len += __tcp_hdrlen(th);
 3723		} else {
 3724			struct udphdr _udphdr;
 3725
 3726			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3727					       sizeof(_udphdr), &_udphdr))
 3728				hdr_len += sizeof(struct udphdr);
3729		}
3730
3731		if (shinfo->gso_type & SKB_GSO_DODGY)
3732			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3733						shinfo->gso_size);
3734
3735		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3736	}
 3737}
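/* Worked example of the estimate above: a TSO skb with skb->len = 65226,
 * a 14 + 20 + 32 = 66 byte MAC/IP/TCP header and gso_size = 1448 carries
 * gso_segs = 45, so pkt_len becomes 65226 + 44 * 66 = 68130, much closer
 * to the bytes that will actually hit the wire than skb->len alone.
 */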
 3738
 3739static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 3740				 struct net_device *dev,
 3741				 struct netdev_queue *txq)
 3742{
 3743	spinlock_t *root_lock = qdisc_lock(q);
 3744	struct sk_buff *to_free = NULL;
 3745	bool contended;
 3746	int rc;
 3747
 
 3748	qdisc_calculate_pkt_len(skb, q);
 3749
 3750	if (q->flags & TCQ_F_NOLOCK) {
 3751		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 3752		qdisc_run(q);
 3753
 3754		if (unlikely(to_free))
 3755			kfree_skb_list(to_free);
 3756		return rc;
 3757	}
 3758
 3759	/*
 3760	 * Heuristic to force contended enqueues to serialize on a
 3761	 * separate lock before trying to get qdisc main lock.
 3762	 * This permits qdisc->running owner to get the lock more
 3763	 * often and dequeue packets faster.
 3764	 */
 3765	contended = qdisc_is_running(q);
 3766	if (unlikely(contended))
 3767		spin_lock(&q->busylock);
 3768
 3769	spin_lock(root_lock);
 3770	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 3771		__qdisc_drop(skb, &to_free);
 3772		rc = NET_XMIT_DROP;
 3773	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 3774		   qdisc_run_begin(q)) {
 3775		/*
 3776		 * This is a work-conserving queue; there are no old skbs
 3777		 * waiting to be sent out; and the qdisc is not running -
 3778		 * xmit the skb directly.
3779		 */
 3780
 3781		qdisc_bstats_update(q, skb);
 3782
 3783		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
 3784			if (unlikely(contended)) {
 3785				spin_unlock(&q->busylock);
 3786				contended = false;
 3787			}
 3788			__qdisc_run(q);
 3789		}
 
 3790
 3791		qdisc_run_end(q);
 3792		rc = NET_XMIT_SUCCESS;
 3793	} else {
 3794		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 
 3795		if (qdisc_run_begin(q)) {
 3796			if (unlikely(contended)) {
 3797				spin_unlock(&q->busylock);
 3798				contended = false;
 3799			}
 3800			__qdisc_run(q);
 3801			qdisc_run_end(q);
 3802		}
 3803	}
 3804	spin_unlock(root_lock);
 3805	if (unlikely(to_free))
 3806		kfree_skb_list(to_free);
 3807	if (unlikely(contended))
 3808		spin_unlock(&q->busylock);
 3809	return rc;
 3810}
 3811
 3812#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3813static void skb_update_prio(struct sk_buff *skb)
 3814{
 3815	const struct netprio_map *map;
 3816	const struct sock *sk;
 3817	unsigned int prioidx;
 3818
 3819	if (skb->priority)
 3820		return;
 3821	map = rcu_dereference_bh(skb->dev->priomap);
 3822	if (!map)
 3823		return;
 3824	sk = skb_to_full_sk(skb);
 3825	if (!sk)
 3826		return;
 3827
 3828	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3829
 3830	if (prioidx < map->priomap_len)
 3831		skb->priority = map->priomap[prioidx];
 3832}
 3833#else
 3834#define skb_update_prio(skb)
 3835#endif
 3836
 3837/**
 3838 *	dev_loopback_xmit - loop back @skb
 3839 *	@net: network namespace this loopback is happening in
 3840 *	@sk:  sk needed to be a netfilter okfn
 3841 *	@skb: buffer to transmit
 3842 */
 3843int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 3844{
 3845	skb_reset_mac_header(skb);
 3846	__skb_pull(skb, skb_network_offset(skb));
 3847	skb->pkt_type = PACKET_LOOPBACK;
 3848	skb->ip_summed = CHECKSUM_UNNECESSARY;
 3849	WARN_ON(!skb_dst(skb));
 3850	skb_dst_force(skb);
 3851	netif_rx_ni(skb);
 3852	return 0;
 3853}
 3854EXPORT_SYMBOL(dev_loopback_xmit);
 3855
 3856#ifdef CONFIG_NET_EGRESS
 3857static struct sk_buff *
 3858sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 3859{
 3860	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 3861	struct tcf_result cl_res;
 3862
 3863	if (!miniq)
 3864		return skb;
 3865
 3866	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 3867	mini_qdisc_bstats_cpu_update(miniq, skb);
 3868
 3869	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 3870	case TC_ACT_OK:
 3871	case TC_ACT_RECLASSIFY:
 3872		skb->tc_index = TC_H_MIN(cl_res.classid);
 3873		break;
 3874	case TC_ACT_SHOT:
 3875		mini_qdisc_qstats_cpu_drop(miniq);
 3876		*ret = NET_XMIT_DROP;
 3877		kfree_skb(skb);
 3878		return NULL;
 3879	case TC_ACT_STOLEN:
 3880	case TC_ACT_QUEUED:
 3881	case TC_ACT_TRAP:
 3882		*ret = NET_XMIT_SUCCESS;
 3883		consume_skb(skb);
 3884		return NULL;
 3885	case TC_ACT_REDIRECT:
 3886		/* No need to push/pop skb's mac_header here on egress! */
 3887		skb_do_redirect(skb);
 3888		*ret = NET_XMIT_SUCCESS;
 3889		return NULL;
 3890	default:
 3891		break;
 3892	}
 3893
 3894	return skb;
 3895}
 3896#endif /* CONFIG_NET_EGRESS */
 3897
 3898#ifdef CONFIG_XPS
 3899static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3900			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3901{
 3902	struct xps_map *map;
 3903	int queue_index = -1;
 3904
 3905	if (dev->num_tc) {
 3906		tci *= dev->num_tc;
 3907		tci += netdev_get_prio_tc_map(dev, skb->priority);
 3908	}
 3909
 3910	map = rcu_dereference(dev_maps->attr_map[tci]);
 3911	if (map) {
 3912		if (map->len == 1)
 3913			queue_index = map->queues[0];
 3914		else
 3915			queue_index = map->queues[reciprocal_scale(
 3916						skb_get_hash(skb), map->len)];
 3917		if (unlikely(queue_index >= dev->real_num_tx_queues))
 3918			queue_index = -1;
 3919	}
 3920	return queue_index;
 3921}
 3922#endif
 3923
 3924static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 3925			 struct sk_buff *skb)
 3926{
 3927#ifdef CONFIG_XPS
 3928	struct xps_dev_maps *dev_maps;
 3929	struct sock *sk = skb->sk;
 3930	int queue_index = -1;
 3931
 3932	if (!static_key_false(&xps_needed))
 3933		return -1;
 3934
 3935	rcu_read_lock();
 3936	if (!static_key_false(&xps_rxqs_needed))
 3937		goto get_cpus_map;
 3938
 3939	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 3940	if (dev_maps) {
 3941		int tci = sk_rx_queue_get(sk);
 3942
 3943		if (tci >= 0 && tci < dev->num_rx_queues)
 3944			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3945							  tci);
 3946	}
 3947
 3948get_cpus_map:
 3949	if (queue_index < 0) {
 3950		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 3951		if (dev_maps) {
 3952			unsigned int tci = skb->sender_cpu - 1;
 3953
 3954			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3955							  tci);
 3956		}
 3957	}
 3958	rcu_read_unlock();
 3959
 3960	return queue_index;
 3961#else
 3962	return -1;
 3963#endif
 3964}
 3965
 3966u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 3967		     struct net_device *sb_dev)
 3968{
 3969	return 0;
 3970}
 3971EXPORT_SYMBOL(dev_pick_tx_zero);
 3972
 3973u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 3974		       struct net_device *sb_dev)
 3975{
 3976	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 3977}
 3978EXPORT_SYMBOL(dev_pick_tx_cpu_id);
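/* Both helpers above are meant to be plugged straight into a driver's
 * netdev ops when the default netdev_pick_tx() policy is not wanted, e.g.
 * (names prefixed with example_ are invented):
 *
 *	static const struct net_device_ops example_netdev_ops = {
 *		.ndo_start_xmit	  = example_start_xmit,
 *		.ndo_select_queue = dev_pick_tx_cpu_id,
 *	};
 */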
 3979
 3980u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 3981		     struct net_device *sb_dev)
 3982{
 3983	struct sock *sk = skb->sk;
 3984	int queue_index = sk_tx_queue_get(sk);
 3985
 3986	sb_dev = sb_dev ? : dev;
 3987
 3988	if (queue_index < 0 || skb->ooo_okay ||
 3989	    queue_index >= dev->real_num_tx_queues) {
 3990		int new_index = get_xps_queue(dev, sb_dev, skb);
 3991
 3992		if (new_index < 0)
 3993			new_index = skb_tx_hash(dev, sb_dev, skb);
 3994
 3995		if (queue_index != new_index && sk &&
 3996		    sk_fullsock(sk) &&
 3997		    rcu_access_pointer(sk->sk_dst_cache))
 3998			sk_tx_queue_set(sk, new_index);
 3999
 4000		queue_index = new_index;
 4001	}
 4002
 4003	return queue_index;
 4004}
 4005EXPORT_SYMBOL(netdev_pick_tx);
 4006
 4007struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4008					 struct sk_buff *skb,
 4009					 struct net_device *sb_dev)
 4010{
 4011	int queue_index = 0;
 4012
 4013#ifdef CONFIG_XPS
 4014	u32 sender_cpu = skb->sender_cpu - 1;
 4015
 4016	if (sender_cpu >= (u32)NR_CPUS)
 4017		skb->sender_cpu = raw_smp_processor_id() + 1;
 4018#endif
 4019
 4020	if (dev->real_num_tx_queues != 1) {
 4021		const struct net_device_ops *ops = dev->netdev_ops;
 4022
 4023		if (ops->ndo_select_queue)
 4024			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 4025		else
 4026			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4027
 4028		queue_index = netdev_cap_txqueue(dev, queue_index);
 4029	}
 4030
 4031	skb_set_queue_mapping(skb, queue_index);
 4032	return netdev_get_tx_queue(dev, queue_index);
 4033}
 4034
 4035/**
 4036 *	__dev_queue_xmit - transmit a buffer
 4037 *	@skb: buffer to transmit
4038 *	@sb_dev: subordinate device used for L2 forwarding offload
 4039 *
 4040 *	Queue a buffer for transmission to a network device. The caller must
 4041 *	have set the device and priority and built the buffer before calling
 4042 *	this function. The function can be called from an interrupt.
 4043 *
 4044 *	A negative errno code is returned on a failure. A success does not
 4045 *	guarantee the frame will be transmitted as it may be dropped due
 4046 *	to congestion or traffic shaping.
 4047 *
 4048 * -----------------------------------------------------------------------------------
 4049 *      I notice this method can also return errors from the queue disciplines,
 4050 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 4051 *      be positive.
 4052 *
 4053 *      Regardless of the return value, the skb is consumed, so it is currently
 4054 *      difficult to retry a send to this method.  (You can bump the ref count
 4055 *      before sending to hold a reference for retry if you are careful.)
 4056 *
 4057 *      When calling this method, interrupts MUST be enabled.  This is because
 4058 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 4059 *          --BLG
 4060 */
 4061static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 4062{
 4063	struct net_device *dev = skb->dev;
 4064	struct netdev_queue *txq;
 4065	struct Qdisc *q;
 4066	int rc = -ENOMEM;
 4067	bool again = false;
 4068
 4069	skb_reset_mac_header(skb);
 4070
 4071	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 4072		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
 4073
 4074	/* Disable soft irqs for various locks below. Also
 4075	 * stops preemption for RCU.
 4076	 */
 4077	rcu_read_lock_bh();
 4078
 4079	skb_update_prio(skb);
 
 4080
 4081	qdisc_pkt_len_init(skb);
 4082#ifdef CONFIG_NET_CLS_ACT
 4083	skb->tc_at_ingress = 0;
 4084# ifdef CONFIG_NET_EGRESS
 4085	if (static_branch_unlikely(&egress_needed_key)) {
 4086		skb = sch_handle_egress(skb, &rc, dev);
 4087		if (!skb)
 4088			goto out;
 4089	}
 4090# endif
 4091#endif
 4092	/* If device/qdisc don't need skb->dst, release it right now while
4093	 * it's hot in this CPU's cache.
 4094	 */
 4095	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 4096		skb_dst_drop(skb);
 4097	else
 4098		skb_dst_force(skb);
 4099
 4100	txq = netdev_core_pick_tx(dev, skb, sb_dev);
 4101	q = rcu_dereference_bh(txq->qdisc);
 4102
 4103	trace_net_dev_queue(skb);
 4104	if (q->enqueue) {
 4105		rc = __dev_xmit_skb(skb, q, dev, txq);
 4106		goto out;
 4107	}
 4108
4109	/* The device has no queue. Common case for software devices:
4110	 * loopback, all sorts of tunnels...
4111	 *
4112	 * Really, it is unlikely that netif_tx_lock protection is necessary
4113	 * here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
4114	 * counters.)
4115	 * However, it is possible that they rely on the protection
4116	 * made by us here.
4117	 *
4118	 * Check this and take the lock: it is not prone to deadlocks.
4119	 * Either way, the noqueue qdisc case is even simpler 8)
4120	 */
 4121	if (dev->flags & IFF_UP) {
 4122		int cpu = smp_processor_id(); /* ok because BHs are off */
 4123
 4124		if (txq->xmit_lock_owner != cpu) {
 4125			if (dev_xmit_recursion())
 
 4126				goto recursion_alert;
 4127
 4128			skb = validate_xmit_skb(skb, dev, &again);
 4129			if (!skb)
 4130				goto out;
 4131
 4132			HARD_TX_LOCK(dev, txq, cpu);
 4133
 4134			if (!netif_xmit_stopped(txq)) {
 4135				dev_xmit_recursion_inc();
 4136				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
 4137				dev_xmit_recursion_dec();
 4138				if (dev_xmit_complete(rc)) {
 4139					HARD_TX_UNLOCK(dev, txq);
 4140					goto out;
 4141				}
 4142			}
 4143			HARD_TX_UNLOCK(dev, txq);
 4144			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
 4145					     dev->name);
 
 4146		} else {
 4147			/* Recursion is detected! It is possible,
 4148			 * unfortunately
 4149			 */
 4150recursion_alert:
 4151			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
 4152					     dev->name);
 
 4153		}
 4154	}
 4155
 4156	rc = -ENETDOWN;
 4157	rcu_read_unlock_bh();
 4158
 4159	atomic_long_inc(&dev->tx_dropped);
 4160	kfree_skb_list(skb);
 4161	return rc;
 4162out:
 4163	rcu_read_unlock_bh();
 4164	return rc;
 4165}
 4166
 4167int dev_queue_xmit(struct sk_buff *skb)
 4168{
 4169	return __dev_queue_xmit(skb, NULL);
 4170}
 4171EXPORT_SYMBOL(dev_queue_xmit);
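/* Hedged sketch of a typical in-kernel sender (tunnel or protocol code);
 * the caller sets up the device, priority and headers before handing the
 * skb over, exactly as the kerneldoc above requires:
 *
 *	skb->dev = dev;
 *	skb->priority = prio;
 *	skb_reset_network_header(skb);
 *	... build the L2 header, e.g. with dev_hard_header() ...
 *	err = dev_queue_xmit(skb);
 *
 * The skb is consumed whatever the outcome; a positive NET_XMIT_* value is
 * still a "consumed" result, per the notes above.
 */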
 4172
 4173int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 4174{
 4175	return __dev_queue_xmit(skb, sb_dev);
 4176}
 4177EXPORT_SYMBOL(dev_queue_xmit_accel);
 4178
 4179int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4180{
 4181	struct net_device *dev = skb->dev;
 4182	struct sk_buff *orig_skb = skb;
 4183	struct netdev_queue *txq;
 4184	int ret = NETDEV_TX_BUSY;
 4185	bool again = false;
 4186
 4187	if (unlikely(!netif_running(dev) ||
 4188		     !netif_carrier_ok(dev)))
 4189		goto drop;
 
 4190
 4191	skb = validate_xmit_skb_list(skb, dev, &again);
 4192	if (skb != orig_skb)
4193		goto drop;
 4194
 4195	skb_set_queue_mapping(skb, queue_id);
4196	txq = skb_get_tx_queue(dev, skb);
 4197
 4198	local_bh_disable();
 4199
 4200	dev_xmit_recursion_inc();
 4201	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4202	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4203		ret = netdev_start_xmit(skb, dev, txq, false);
 4204	HARD_TX_UNLOCK(dev, txq);
 4205	dev_xmit_recursion_dec();
 4206
4207	local_bh_enable();
4208
4209	if (!dev_xmit_complete(ret))
4210		kfree_skb(skb);
 4211
 4212	return ret;
 4213drop:
 4214	atomic_long_inc(&dev->tx_dropped);
 4215	kfree_skb_list(skb);
 4216	return NET_XMIT_DROP;
 4217}
 4218EXPORT_SYMBOL(dev_direct_xmit);
 4219
 4220/*************************************************************************
 4221 *			Receiver routines
 4222 *************************************************************************/
 4223
 4224int netdev_max_backlog __read_mostly = 1000;
 4225EXPORT_SYMBOL(netdev_max_backlog);
 
 4226
 4227int netdev_tstamp_prequeue __read_mostly = 1;
 4228int netdev_budget __read_mostly = 300;
 4229/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 4230unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
 4231int weight_p __read_mostly = 64;           /* old backlog weight */
 4232int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 4233int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 4234int dev_rx_weight __read_mostly = 64;
 4235int dev_tx_weight __read_mostly = 64;
 4236/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
 4237int gro_normal_batch __read_mostly = 8;
 4238
 4239/* Called with irq disabled */
 4240static inline void ____napi_schedule(struct softnet_data *sd,
 4241				     struct napi_struct *napi)
 4242{
 4243	list_add_tail(&napi->poll_list, &sd->poll_list);
 4244	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4245}
 
 4246
 4247#ifdef CONFIG_RPS
 4248
 4249/* One global table that all flow-based protocols share. */
 4250struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 4251EXPORT_SYMBOL(rps_sock_flow_table);
 4252u32 rps_cpu_mask __read_mostly;
 4253EXPORT_SYMBOL(rps_cpu_mask);
 4254
 4255struct static_key_false rps_needed __read_mostly;
 4256EXPORT_SYMBOL(rps_needed);
 4257struct static_key_false rfs_needed __read_mostly;
 4258EXPORT_SYMBOL(rfs_needed);
 4259
 4260static struct rps_dev_flow *
 4261set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4262	    struct rps_dev_flow *rflow, u16 next_cpu)
 4263{
4264	if (next_cpu < nr_cpu_ids) {
 4265#ifdef CONFIG_RFS_ACCEL
 4266		struct netdev_rx_queue *rxqueue;
 4267		struct rps_dev_flow_table *flow_table;
 4268		struct rps_dev_flow *old_rflow;
 4269		u32 flow_id;
 4270		u16 rxq_index;
 4271		int rc;
 4272
 4273		/* Should we steer this flow to a different hardware queue? */
 4274		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 4275		    !(dev->features & NETIF_F_NTUPLE))
 4276			goto out;
 4277		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 4278		if (rxq_index == skb_get_rx_queue(skb))
 4279			goto out;
 4280
 4281		rxqueue = dev->_rx + rxq_index;
 4282		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4283		if (!flow_table)
 4284			goto out;
 4285		flow_id = skb_get_hash(skb) & flow_table->mask;
 4286		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 4287							rxq_index, flow_id);
 4288		if (rc < 0)
 4289			goto out;
 4290		old_rflow = rflow;
 4291		rflow = &flow_table->flows[flow_id];
 
 4292		rflow->filter = rc;
 4293		if (old_rflow->filter == rflow->filter)
 4294			old_rflow->filter = RPS_NO_FILTER;
 4295	out:
 4296#endif
 4297		rflow->last_qtail =
 4298			per_cpu(softnet_data, next_cpu).input_queue_head;
 4299	}
 4300
 4301	rflow->cpu = next_cpu;
 4302	return rflow;
 4303}
 4304
 4305/*
 4306 * get_rps_cpu is called from netif_receive_skb and returns the target
 4307 * CPU from the RPS map of the receiving queue for a given skb.
 4308 * rcu_read_lock must be held on entry.
 4309 */
 4310static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4311		       struct rps_dev_flow **rflowp)
 4312{
 4313	const struct rps_sock_flow_table *sock_flow_table;
 4314	struct netdev_rx_queue *rxqueue = dev->_rx;
 4315	struct rps_dev_flow_table *flow_table;
 4316	struct rps_map *map;
 4317	int cpu = -1;
 4318	u32 tcpu;
 4319	u32 hash;
 4320
 4321	if (skb_rx_queue_recorded(skb)) {
 4322		u16 index = skb_get_rx_queue(skb);
 4323
 4324		if (unlikely(index >= dev->real_num_rx_queues)) {
 4325			WARN_ONCE(dev->real_num_rx_queues > 1,
 4326				  "%s received packet on queue %u, but number "
 4327				  "of RX queues is %u\n",
 4328				  dev->name, index, dev->real_num_rx_queues);
 4329			goto done;
 4330		}
 4331		rxqueue += index;
 4332	}
 4333
 4334	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 4335
 4336	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4337	map = rcu_dereference(rxqueue->rps_map);
4338	if (!flow_table && !map)
4339		goto done;
 4340
 4341	skb_reset_network_header(skb);
 4342	hash = skb_get_hash(skb);
 4343	if (!hash)
 4344		goto done;
 4345
 
 4346	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 4347	if (flow_table && sock_flow_table) {
 
 4348		struct rps_dev_flow *rflow;
 4349		u32 next_cpu;
 4350		u32 ident;
 4351
 4352		/* First check into global flow table if there is a match */
 4353		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
 4354		if ((ident ^ hash) & ~rps_cpu_mask)
 4355			goto try_rps;
 4356
 4357		next_cpu = ident & rps_cpu_mask;
 4358
 4359		/* OK, now we know there is a match,
 4360		 * we can look at the local (per receive queue) flow table
 4361		 */
 4362		rflow = &flow_table->flows[hash & flow_table->mask];
 4363		tcpu = rflow->cpu;
 4364
 4365		/*
 4366		 * If the desired CPU (where last recvmsg was done) is
 4367		 * different from current CPU (one in the rx-queue flow
 4368		 * table entry), switch if one of the following holds:
 4369		 *   - Current CPU is unset (>= nr_cpu_ids).
 4370		 *   - Current CPU is offline.
 4371		 *   - The current CPU's queue tail has advanced beyond the
 4372		 *     last packet that was enqueued using this table entry.
 4373		 *     This guarantees that all previous packets for the flow
 4374		 *     have been dequeued, thus preserving in order delivery.
 4375		 */
 4376		if (unlikely(tcpu != next_cpu) &&
 4377		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4378		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4379		      rflow->last_qtail)) >= 0)) {
 4380			tcpu = next_cpu;
 4381			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4382		}
 4383
 4384		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4385			*rflowp = rflow;
 4386			cpu = tcpu;
 4387			goto done;
 4388		}
 4389	}
 4390
 4391try_rps:
 
 4392
 4393	if (map) {
 4394		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4395		if (cpu_online(tcpu)) {
 4396			cpu = tcpu;
 4397			goto done;
 4398		}
 4399	}
 4400
 4401done:
 4402	return cpu;
 4403}
 4404
 4405#ifdef CONFIG_RFS_ACCEL
 4406
 4407/**
 4408 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4409 * @dev: Device on which the filter was set
 4410 * @rxq_index: RX queue index
 4411 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4412 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4413 *
 4414 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4415 * this function for each installed filter and remove the filters for
 4416 * which it returns %true.
 4417 */
 4418bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4419			 u32 flow_id, u16 filter_id)
 4420{
 4421	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4422	struct rps_dev_flow_table *flow_table;
 4423	struct rps_dev_flow *rflow;
 4424	bool expire = true;
 4425	unsigned int cpu;
 4426
 4427	rcu_read_lock();
 4428	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4429	if (flow_table && flow_id <= flow_table->mask) {
 4430		rflow = &flow_table->flows[flow_id];
 4431		cpu = READ_ONCE(rflow->cpu);
 4432		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4433		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4434			   rflow->last_qtail) <
 4435		     (int)(10 * flow_table->mask)))
 4436			expire = false;
 4437	}
 4438	rcu_read_unlock();
 4439	return expire;
 4440}
 4441EXPORT_SYMBOL(rps_may_expire_flow);
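/* Sketch of the periodic scan described above; the filter table, its
 * fields and example_remove_hw_filter() are driver-specific inventions,
 * and here the array index doubles as the filter ID that was returned
 * from ndo_rx_flow_steer():
 *
 *	for (i = 0; i < priv->n_rfs_filters; i++) {
 *		struct example_rfs_filter *f = &priv->rfs_filters[i];
 *
 *		if (f->in_use &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i))
 *			example_remove_hw_filter(priv, f);
 *	}
 */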
 4442
 4443#endif /* CONFIG_RFS_ACCEL */
 4444
 4445/* Called from hardirq (IPI) context */
 4446static void rps_trigger_softirq(void *data)
 4447{
 4448	struct softnet_data *sd = data;
 4449
 4450	____napi_schedule(sd, &sd->backlog);
 4451	sd->received_rps++;
 4452}
 4453
 4454#endif /* CONFIG_RPS */
 4455
 4456/*
4457 * Check if this softnet_data structure belongs to another CPU.
4458 * If yes, queue it to our IPI list and return 1.
4459 * If no, return 0.
 4460 */
 4461static int rps_ipi_queued(struct softnet_data *sd)
 4462{
 4463#ifdef CONFIG_RPS
 4464	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4465
 4466	if (sd != mysd) {
 4467		sd->rps_ipi_next = mysd->rps_ipi_list;
 4468		mysd->rps_ipi_list = sd;
 4469
 4470		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4471		return 1;
 4472	}
 4473#endif /* CONFIG_RPS */
 4474	return 0;
 4475}
 4476
 4477#ifdef CONFIG_NET_FLOW_LIMIT
 4478int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 4479#endif
 4480
 4481static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4482{
 4483#ifdef CONFIG_NET_FLOW_LIMIT
 4484	struct sd_flow_limit *fl;
 4485	struct softnet_data *sd;
 4486	unsigned int old_flow, new_flow;
 4487
 4488	if (qlen < (netdev_max_backlog >> 1))
 4489		return false;
 4490
 4491	sd = this_cpu_ptr(&softnet_data);
 4492
 4493	rcu_read_lock();
 4494	fl = rcu_dereference(sd->flow_limit);
 4495	if (fl) {
 4496		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4497		old_flow = fl->history[fl->history_head];
 4498		fl->history[fl->history_head] = new_flow;
 4499
 4500		fl->history_head++;
 4501		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4502
 4503		if (likely(fl->buckets[old_flow]))
 4504			fl->buckets[old_flow]--;
 4505
 4506		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4507			fl->count++;
 4508			rcu_read_unlock();
 4509			return true;
 4510		}
 4511	}
 4512	rcu_read_unlock();
 4513#endif
 4514	return false;
 4515}
 4516
 4517/*
 4518 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4519 * queue (may be a remote CPU queue).
 4520 */
 4521static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4522			      unsigned int *qtail)
 4523{
 4524	struct softnet_data *sd;
 4525	unsigned long flags;
 4526	unsigned int qlen;
 4527
 4528	sd = &per_cpu(softnet_data, cpu);
 4529
 4530	local_irq_save(flags);
 4531
 4532	rps_lock(sd);
 4533	if (!netif_running(skb->dev))
 4534		goto drop;
 4535	qlen = skb_queue_len(&sd->input_pkt_queue);
 4536	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 4537		if (qlen) {
 4538enqueue:
 4539			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4540			input_queue_tail_incr_save(sd, qtail);
 4541			rps_unlock(sd);
 4542			local_irq_restore(flags);
 4543			return NET_RX_SUCCESS;
 4544		}
 4545
4546		/* Schedule NAPI for backlog device.
4547		 * We can use a non-atomic operation since we own the queue lock.
 4548		 */
 4549		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
 4550			if (!rps_ipi_queued(sd))
 4551				____napi_schedule(sd, &sd->backlog);
 4552		}
 4553		goto enqueue;
 4554	}
 4555
 4556drop:
 4557	sd->dropped++;
 4558	rps_unlock(sd);
 4559
 4560	local_irq_restore(flags);
 4561
 4562	atomic_long_inc(&skb->dev->rx_dropped);
 4563	kfree_skb(skb);
 4564	return NET_RX_DROP;
 4565}
 4566
 4567static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4568{
 4569	struct net_device *dev = skb->dev;
 4570	struct netdev_rx_queue *rxqueue;
 4571
 4572	rxqueue = dev->_rx;
 4573
 4574	if (skb_rx_queue_recorded(skb)) {
 4575		u16 index = skb_get_rx_queue(skb);
 4576
 4577		if (unlikely(index >= dev->real_num_rx_queues)) {
 4578			WARN_ONCE(dev->real_num_rx_queues > 1,
 4579				  "%s received packet on queue %u, but number "
 4580				  "of RX queues is %u\n",
 4581				  dev->name, index, dev->real_num_rx_queues);
 4582
 4583			return rxqueue; /* Return first rxqueue */
 4584		}
 4585		rxqueue += index;
 4586	}
 4587	return rxqueue;
 4588}
 4589
 4590static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 4591				     struct xdp_buff *xdp,
 4592				     struct bpf_prog *xdp_prog)
 4593{
 4594	struct netdev_rx_queue *rxqueue;
 4595	void *orig_data, *orig_data_end;
 4596	u32 metalen, act = XDP_DROP;
 4597	__be16 orig_eth_type;
 4598	struct ethhdr *eth;
 4599	bool orig_bcast;
 4600	int hlen, off;
 4601	u32 mac_len;
 4602
 4603	/* Reinjected packets coming from act_mirred or similar should
 4604	 * not get XDP generic processing.
 4605	 */
 4606	if (skb_is_redirected(skb))
 4607		return XDP_PASS;
 4608
 4609	/* XDP packets must be linear and must have sufficient headroom
 4610	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that native
 4611	 * XDP also provides, so we need to enforce it here as well.
 4612	 */
 4613	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 4614	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4615		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4616		int troom = skb->tail + skb->data_len - skb->end;
 4617
 4618		/* In case we have to go down this path and also linearize,
 4619		 * let's do the pskb_expand_head() work just once here.
 4620		 */
 4621		if (pskb_expand_head(skb,
 4622				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4623				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 4624			goto do_drop;
 4625		if (skb_linearize(skb))
 4626			goto do_drop;
 4627	}
 4628
 4629	/* The XDP program wants to see the packet starting at the MAC
 4630	 * header.
 4631	 */
 4632	mac_len = skb->data - skb_mac_header(skb);
 4633	hlen = skb_headlen(skb) + mac_len;
 4634	xdp->data = skb->data - mac_len;
 4635	xdp->data_meta = xdp->data;
 4636	xdp->data_end = xdp->data + hlen;
 4637	xdp->data_hard_start = skb->data - skb_headroom(skb);
 4638
 4639	/* SKB "head" area always has tailroom for skb_shared_info */
 4640	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
 4641	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 4642
 4643	orig_data_end = xdp->data_end;
 4644	orig_data = xdp->data;
 4645	eth = (struct ethhdr *)xdp->data;
 4646	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4647	orig_eth_type = eth->h_proto;
 4648
 4649	rxqueue = netif_get_rxqueue(skb);
 4650	xdp->rxq = &rxqueue->xdp_rxq;
 4651
 4652	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4653
 4654	/* check if bpf_xdp_adjust_head was used */
 4655	off = xdp->data - orig_data;
 4656	if (off) {
 4657		if (off > 0)
 4658			__skb_pull(skb, off);
 4659		else if (off < 0)
 4660			__skb_push(skb, -off);
 4661
 4662		skb->mac_header += off;
 4663		skb_reset_network_header(skb);
 4664	}
 4665
 4666	/* check if bpf_xdp_adjust_tail was used */
 4667	off = xdp->data_end - orig_data_end;
 4668	if (off != 0) {
 4669		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4670		skb->len += off; /* positive on grow, negative on shrink */
 4671	}
 4672
 4673	/* check if XDP changed the eth hdr such that the SKB needs an update */
 4674	eth = (struct ethhdr *)xdp->data;
 4675	if ((orig_eth_type != eth->h_proto) ||
 4676	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 4677		__skb_push(skb, ETH_HLEN);
 4678		skb->protocol = eth_type_trans(skb, skb->dev);
 4679	}
 4680
 4681	switch (act) {
 4682	case XDP_REDIRECT:
 4683	case XDP_TX:
 4684		__skb_push(skb, mac_len);
 4685		break;
 4686	case XDP_PASS:
 4687		metalen = xdp->data - xdp->data_meta;
 4688		if (metalen)
 4689			skb_metadata_set(skb, metalen);
 4690		break;
 4691	default:
 4692		bpf_warn_invalid_xdp_action(act);
 4693		fallthrough;
 4694	case XDP_ABORTED:
 4695		trace_xdp_exception(skb->dev, xdp_prog, act);
 4696		fallthrough;
 4697	case XDP_DROP:
 4698	do_drop:
 4699		kfree_skb(skb);
 4700		break;
 4701	}
 4702
 4703	return act;
 4704}
 4705
 4706/* When doing generic XDP we have to bypass the qdisc layer and the
 4707 * network taps in order to match in-driver-XDP behavior.
 4708 */
 4709void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4710{
 4711	struct net_device *dev = skb->dev;
 4712	struct netdev_queue *txq;
 4713	bool free_skb = true;
 4714	int cpu, rc;
 4715
 4716	txq = netdev_core_pick_tx(dev, skb, NULL);
 4717	cpu = smp_processor_id();
 4718	HARD_TX_LOCK(dev, txq, cpu);
 4719	if (!netif_xmit_stopped(txq)) {
 4720		rc = netdev_start_xmit(skb, dev, txq, 0);
 4721		if (dev_xmit_complete(rc))
 4722			free_skb = false;
 4723	}
 4724	HARD_TX_UNLOCK(dev, txq);
 4725	if (free_skb) {
 4726		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4727		kfree_skb(skb);
 4728	}
 4729}
 4730
 4731static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4732
 4733int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4734{
 4735	if (xdp_prog) {
 4736		struct xdp_buff xdp;
 4737		u32 act;
 4738		int err;
 4739
 4740		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4741		if (act != XDP_PASS) {
 4742			switch (act) {
 4743			case XDP_REDIRECT:
 4744				err = xdp_do_generic_redirect(skb->dev, skb,
 4745							      &xdp, xdp_prog);
 4746				if (err)
 4747					goto out_redir;
 4748				break;
 4749			case XDP_TX:
 4750				generic_xdp_tx(skb, xdp_prog);
 4751				break;
 4752			}
 4753			return XDP_DROP;
 4754		}
 4755	}
 4756	return XDP_PASS;
 4757out_redir:
 4758	kfree_skb(skb);
 4759	return XDP_DROP;
 4760}
 4761EXPORT_SYMBOL_GPL(do_xdp_generic);
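
/* Editorial sketch (not part of the original file): a tun/tap-style software
 * device running the generic XDP hook itself before injecting a packet from
 * process context.  foo_inject_skb() is an illustrative assumption; only
 * do_xdp_generic() and netif_rx_ni() are the real API used here.
 */
#if 0
static int foo_inject_skb(struct net_device *dev, struct sk_buff *skb)
{
	struct bpf_prog *xdp_prog;
	int act = XDP_PASS;

	local_bh_disable();
	rcu_read_lock();
	xdp_prog = rcu_dereference(dev->xdp_prog);
	if (xdp_prog)
		act = do_xdp_generic(xdp_prog, skb);
	rcu_read_unlock();
	local_bh_enable();

	/* On anything but XDP_PASS the skb has already been consumed
	 * (freed, transmitted or redirected) by the XDP path.
	 */
	if (act != XDP_PASS)
		return NET_RX_DROP;

	return netif_rx_ni(skb);
}
#endif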
 4762
 4763static int netif_rx_internal(struct sk_buff *skb)
 4764{
 4765	int ret;
 4766
 4767	net_timestamp_check(netdev_tstamp_prequeue, skb);
 4768
 4769	trace_netif_rx(skb);
 4770
 4771#ifdef CONFIG_RPS
 4772	if (static_branch_unlikely(&rps_needed)) {
 4773		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4774		int cpu;
 4775
 4776		preempt_disable();
 4777		rcu_read_lock();
 4778
 4779		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4780		if (cpu < 0)
 4781			cpu = smp_processor_id();
 4782
 4783		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4784
 4785		rcu_read_unlock();
 4786		preempt_enable();
 4787	} else
 4788#endif
 4789	{
 4790		unsigned int qtail;
 4791
 4792		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 4793		put_cpu();
 4794	}
 4795	return ret;
 4796}
 4797
 4798/**
 4799 *	netif_rx	-	post buffer to the network code
 4800 *	@skb: buffer to post
 4801 *
 4802 *	This function receives a packet from a device driver and queues it for
 4803 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 4804 *	may be dropped during processing for congestion control or by the
 4805 *	protocol layers.
 4806 *
 4807 *	return values:
 4808 *	NET_RX_SUCCESS	(no congestion)
 4809 *	NET_RX_DROP     (packet was dropped)
 4810 *
 4811 */
 4812
 4813int netif_rx(struct sk_buff *skb)
 4814{
 4815	int ret;
 4816
 4817	trace_netif_rx_entry(skb);
 4818
 4819	ret = netif_rx_internal(skb);
 4820	trace_netif_rx_exit(ret);
 4821
 4822	return ret;
 4823}
 4824EXPORT_SYMBOL(netif_rx);
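
/* Editorial sketch (not part of the original file): the classic non-NAPI
 * receive path, where a driver hands each frame to the stack from its RX
 * interrupt via netif_rx().  foo_rx_interrupt() and foo_hw_get_frame() are
 * illustrative assumptions, not a real driver API.
 */
#if 0
static irqreturn_t foo_rx_interrupt(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct sk_buff *skb;

	while ((skb = foo_hw_get_frame(dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, dev);
		/* Return value is usually ignored; the packet may still be
		 * dropped later for congestion control.
		 */
		netif_rx(skb);
	}
	return IRQ_HANDLED;
}
#endif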
 4825
 4826int netif_rx_ni(struct sk_buff *skb)
 4827{
 4828	int err;
 4829
 4830	trace_netif_rx_ni_entry(skb);
 4831
 4832	preempt_disable();
 4833	err = netif_rx_internal(skb);
 4834	if (local_softirq_pending())
 4835		do_softirq();
 4836	preempt_enable();
 4837	trace_netif_rx_ni_exit(err);
 4838
 4839	return err;
 4840}
 4841EXPORT_SYMBOL(netif_rx_ni);
 4842
 4843static __latent_entropy void net_tx_action(struct softirq_action *h)
 4844{
 4845	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 4846
 4847	if (sd->completion_queue) {
 4848		struct sk_buff *clist;
 4849
 4850		local_irq_disable();
 4851		clist = sd->completion_queue;
 4852		sd->completion_queue = NULL;
 4853		local_irq_enable();
 4854
 4855		while (clist) {
 4856			struct sk_buff *skb = clist;
 4857
 4858			clist = clist->next;
 4859
 4860			WARN_ON(refcount_read(&skb->users));
 4861			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
 4862				trace_consume_skb(skb);
 4863			else
 4864				trace_kfree_skb(skb, net_tx_action);
 4865
 4866			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 4867				__kfree_skb(skb);
 4868			else
 4869				__kfree_skb_defer(skb);
 4870		}
 4871
 4872		__kfree_skb_flush();
 4873	}
 4874
 4875	if (sd->output_queue) {
 4876		struct Qdisc *head;
 4877
 4878		local_irq_disable();
 4879		head = sd->output_queue;
 4880		sd->output_queue = NULL;
 4881		sd->output_queue_tailp = &sd->output_queue;
 4882		local_irq_enable();
 4883
 4884		while (head) {
 4885			struct Qdisc *q = head;
 4886			spinlock_t *root_lock = NULL;
 4887
 4888			head = head->next_sched;
 4889
 4890			if (!(q->flags & TCQ_F_NOLOCK)) {
 4891				root_lock = qdisc_lock(q);
 4892				spin_lock(root_lock);
 4893			}
 4894			/* We need to make sure head->next_sched is read
 4895			 * before clearing __QDISC_STATE_SCHED
 4896			 */
 4897			smp_mb__before_atomic();
 4898			clear_bit(__QDISC_STATE_SCHED, &q->state);
 4899			qdisc_run(q);
 4900			if (root_lock)
 4901				spin_unlock(root_lock);
 4902		}
 4903	}
 4904
 4905	xfrm_dev_backlog(sd);
 4906}
 4907
 4908#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
 4909/* This hook is defined here for ATM LANE */
 4910int (*br_fdb_test_addr_hook)(struct net_device *dev,
 4911			     unsigned char *addr) __read_mostly;
 4912EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 4913#endif
 4914
 4915static inline struct sk_buff *
 4916sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4917		   struct net_device *orig_dev)
 4918{
 4919#ifdef CONFIG_NET_CLS_ACT
 4920	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
 4921	struct tcf_result cl_res;
 4922
 4923	/* If there's at least one ingress present somewhere (so
 4924	 * we get here via enabled static key), remaining devices
 4925	 * that are not configured with an ingress qdisc will bail
 4926	 * out here.
 4927	 */
 4928	if (!miniq)
 4929		return skb;
 4930
 4931	if (*pt_prev) {
 4932		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4933		*pt_prev = NULL;
 4934	}
 4935
 4936	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4937	skb->tc_at_ingress = 1;
 4938	mini_qdisc_bstats_cpu_update(miniq, skb);
 4939
 4940	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
 4941				     &cl_res, false)) {
 4942	case TC_ACT_OK:
 4943	case TC_ACT_RECLASSIFY:
 4944		skb->tc_index = TC_H_MIN(cl_res.classid);
 4945		break;
 4946	case TC_ACT_SHOT:
 4947		mini_qdisc_qstats_cpu_drop(miniq);
 4948		kfree_skb(skb);
 4949		return NULL;
 4950	case TC_ACT_STOLEN:
 4951	case TC_ACT_QUEUED:
 4952	case TC_ACT_TRAP:
 4953		consume_skb(skb);
 4954		return NULL;
 4955	case TC_ACT_REDIRECT:
 4956		/* skb_mac_header check was done by cls/act_bpf, so
 4957		 * we can safely push the L2 header back before
 4958		 * redirecting to another netdev
 4959		 */
 4960		__skb_push(skb, skb->mac_len);
 4961		skb_do_redirect(skb);
 4962		return NULL;
 4963	case TC_ACT_CONSUMED:
 4964		return NULL;
 4965	default:
 4966		break;
 4967	}
 4968#endif /* CONFIG_NET_CLS_ACT */
 4969	return skb;
 4970}
 4971
 4972/**
 4973 *	netdev_is_rx_handler_busy - check if receive handler is registered
 4974 *	@dev: device to check
 4975 *
 4976 *	Check if a receive handler is already registered for a given device.
 4977 *	Return true if there is one.
 4978 *
 4979 *	The caller must hold the rtnl_mutex.
 4980 */
 4981bool netdev_is_rx_handler_busy(struct net_device *dev)
 4982{
 4983	ASSERT_RTNL();
 4984	return dev && rtnl_dereference(dev->rx_handler);
 4985}
 4986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 4987
 4988/**
 4989 *	netdev_rx_handler_register - register receive handler
 4990 *	@dev: device to register a handler for
 4991 *	@rx_handler: receive handler to register
 4992 *	@rx_handler_data: data pointer that is used by rx handler
 4993 *
 4994 *	Register a receive handler for a device. This handler will then be
 4995 *	called from __netif_receive_skb. A negative errno code is returned
 4996 *	on a failure.
 4997 *
 4998 *	The caller must hold the rtnl_mutex.
 4999 *
 5000 *	For a general description of rx_handler, see enum rx_handler_result.
 5001 */
 5002int netdev_rx_handler_register(struct net_device *dev,
 5003			       rx_handler_func_t *rx_handler,
 5004			       void *rx_handler_data)
 5005{
 5006	if (netdev_is_rx_handler_busy(dev))
 5007		return -EBUSY;
 5008
 5009	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5010		return -EINVAL;
 5011
 5012	/* Note: rx_handler_data must be set before rx_handler */
 5013	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5014	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5015
 5016	return 0;
 5017}
 5018EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
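
/* Editorial sketch (not part of the original file): how a bridge/bond/team
 * style upper device might claim a lower device's receive path with an
 * rx_handler.  foo_handle_frame(), struct foo_port and port->upper_dev are
 * illustrative assumptions; the register/unregister calls and the
 * RX_HANDLER_* return values are the real API.
 */
#if 0
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);

	skb->dev = port->upper_dev;	/* steal the packet for the upper dev */
	return RX_HANDLER_ANOTHER;	/* re-run another_round in the core */
}

static int foo_port_attach(struct foo_port *port, struct net_device *lower)
{
	ASSERT_RTNL();
	/* Fails with -EBUSY if another handler already owns the device. */
	return netdev_rx_handler_register(lower, foo_handle_frame, port);
}

static void foo_port_detach(struct net_device *lower)
{
	netdev_rx_handler_unregister(lower);
}
#endif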
 5019
 5020/**
 5021 *	netdev_rx_handler_unregister - unregister receive handler
 5022 *	@dev: device to unregister a handler from
 5023 *
 5024 *	Unregister a receive handler from a device.
 5025 *
 5026 *	The caller must hold the rtnl_mutex.
 5027 */
 5028void netdev_rx_handler_unregister(struct net_device *dev)
 5029{
 5030
 5031	ASSERT_RTNL();
 5032	RCU_INIT_POINTER(dev->rx_handler, NULL);
 5033	/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
 5034	 * section is guaranteed to see a non-NULL rx_handler_data
 5035	 * as well.
 5036	 */
 5037	synchronize_net();
 5038	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 5039}
 5040EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5041
 5042/*
 5043 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5044 * the special handling of PFMEMALLOC skbs.
 5045 */
 5046static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5047{
 5048	switch (skb->protocol) {
 5049	case htons(ETH_P_ARP):
 5050	case htons(ETH_P_IP):
 5051	case htons(ETH_P_IPV6):
 5052	case htons(ETH_P_8021Q):
 5053	case htons(ETH_P_8021AD):
 5054		return true;
 5055	default:
 5056		return false;
 5057	}
 5058}
 5059
 5060static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5061			     int *ret, struct net_device *orig_dev)
 5062{
 5063	if (nf_hook_ingress_active(skb)) {
 5064		int ingress_retval;
 5065
 5066		if (*pt_prev) {
 5067			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5068			*pt_prev = NULL;
 5069		}
 5070
 5071		rcu_read_lock();
 5072		ingress_retval = nf_hook_ingress(skb);
 5073		rcu_read_unlock();
 5074		return ingress_retval;
 5075	}
 5076	return 0;
 5077}
 5078
 5079static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5080				    struct packet_type **ppt_prev)
 5081{
 5082	struct packet_type *ptype, *pt_prev;
 5083	rx_handler_func_t *rx_handler;
 5084	struct sk_buff *skb = *pskb;
 5085	struct net_device *orig_dev;
 5086	bool deliver_exact = false;
 5087	int ret = NET_RX_DROP;
 5088	__be16 type;
 5089
 5090	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 5091
 5092	trace_netif_receive_skb(skb);
 5093
 5094	orig_dev = skb->dev;
 5095
 5096	skb_reset_network_header(skb);
 5097	if (!skb_transport_header_was_set(skb))
 5098		skb_reset_transport_header(skb);
 5099	skb_reset_mac_len(skb);
 5100
 5101	pt_prev = NULL;
 5102
 5103another_round:
 5104	skb->skb_iif = skb->dev->ifindex;
 5105
 5106	__this_cpu_inc(softnet_data.processed);
 5107
 5108	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5109		int ret2;
 5110
 5111		preempt_disable();
 5112		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 5113		preempt_enable();
 5114
 5115		if (ret2 != XDP_PASS) {
 5116			ret = NET_RX_DROP;
 5117			goto out;
 5118		}
 5119		skb_reset_mac_len(skb);
 5120	}
 5121
 5122	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5123	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5124		skb = skb_vlan_untag(skb);
 5125		if (unlikely(!skb))
 5126			goto out;
 5127	}
 5128
 5129	if (skb_skip_tc_classify(skb))
 5130		goto skip_classify;
 5131
 5132	if (pfmemalloc)
 5133		goto skip_taps;
 5134
 5135	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 5136		if (pt_prev)
 5137			ret = deliver_skb(skb, pt_prev, orig_dev);
 5138		pt_prev = ptype;
 5139	}
 5140
 5141	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5142		if (pt_prev)
 5143			ret = deliver_skb(skb, pt_prev, orig_dev);
 5144		pt_prev = ptype;
 5145	}
 5146
 5147skip_taps:
 5148#ifdef CONFIG_NET_INGRESS
 5149	if (static_branch_unlikely(&ingress_needed_key)) {
 5150		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 5151		if (!skb)
 5152			goto out;
 5153
 5154		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5155			goto out;
 5156	}
 5157#endif
 5158	skb_reset_redirect(skb);
 5159skip_classify:
 5160	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5161		goto drop;
 5162
 5163	if (skb_vlan_tag_present(skb)) {
 5164		if (pt_prev) {
 5165			ret = deliver_skb(skb, pt_prev, orig_dev);
 5166			pt_prev = NULL;
 5167		}
 5168		if (vlan_do_receive(&skb))
 5169			goto another_round;
 5170		else if (unlikely(!skb))
 5171			goto out;
 5172	}
 5173
 5174	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5175	if (rx_handler) {
 5176		if (pt_prev) {
 5177			ret = deliver_skb(skb, pt_prev, orig_dev);
 5178			pt_prev = NULL;
 5179		}
 5180		switch (rx_handler(&skb)) {
 5181		case RX_HANDLER_CONSUMED:
 5182			ret = NET_RX_SUCCESS;
 5183			goto out;
 5184		case RX_HANDLER_ANOTHER:
 5185			goto another_round;
 5186		case RX_HANDLER_EXACT:
 5187			deliver_exact = true;
 5188		case RX_HANDLER_PASS:
 5189			break;
 5190		default:
 5191			BUG();
 5192		}
 5193	}
 5194
 5195	if (unlikely(skb_vlan_tag_present(skb))) {
 5196check_vlan_id:
 5197		if (skb_vlan_tag_get_id(skb)) {
 5198			/* The VLAN id is non-zero and vlan_do_receive() above
 5199			 * couldn't find the vlan device.
 5200			 */
 5201			skb->pkt_type = PACKET_OTHERHOST;
 5202		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5203			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5204			/* Outer header is 802.1P with vlan 0, inner header is
 5205			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5206			 * not find vlan dev for vlan id 0.
 5207			 */
 5208			__vlan_hwaccel_clear_tag(skb);
 5209			skb = skb_vlan_untag(skb);
 5210			if (unlikely(!skb))
 5211				goto out;
 5212			if (vlan_do_receive(&skb))
 5213				/* After stripping off the 802.1P header with vlan 0,
 5214				 * a vlan dev was found for the inner header.
 5215				 */
 5216				goto another_round;
 5217			else if (unlikely(!skb))
 5218				goto out;
 5219			else
 5220				/* We have stripped the outer 802.1P vlan 0 header,
 5221				 * but could not find a vlan dev.
 5222				 * Check the vlan id again to set OTHERHOST.
 5223				 */
 5224				goto check_vlan_id;
 5225		}
 5226		/* Note: we might in the future use prio bits
 5227		 * and set skb->priority like in vlan_do_receive().
 5228		 * For the time being, just ignore the Priority Code Point.
 5229		 */
 5230		__vlan_hwaccel_clear_tag(skb);
 5231	}
 5232
 5233	type = skb->protocol;
 5234
 5235	/* deliver only exact match when indicated */
 5236	if (likely(!deliver_exact)) {
 5237		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5238				       &ptype_base[ntohs(type) &
 5239						   PTYPE_HASH_MASK]);
 5240	}
 5241
 5242	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5243			       &orig_dev->ptype_specific);
 5244
 5245	if (unlikely(skb->dev != orig_dev)) {
 5246		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5247				       &skb->dev->ptype_specific);
 5248	}
 5249
 5250	if (pt_prev) {
 5251		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5252			goto drop;
 5253		*ppt_prev = pt_prev;
 5254	} else {
 5255drop:
 5256		if (!deliver_exact)
 5257			atomic_long_inc(&skb->dev->rx_dropped);
 5258		else
 5259			atomic_long_inc(&skb->dev->rx_nohandler);
 5260		kfree_skb(skb);
 5261		/* Jamal, now you will not be able to escape explaining
 5262		 * to me how you were going to use this. :-)
 5263		 */
 5264		ret = NET_RX_DROP;
 5265	}
 5266
 5267out:
 5268	/* The invariant here is that if *ppt_prev is not NULL
 5269	 * then skb should also be non-NULL.
 5270	 *
 5271	 * Apparently the *ppt_prev assignment above upholds this invariant
 5272	 * due to the skb dereferencing near it.
 5273	 */
 5274	*pskb = skb;
 5275	return ret;
 5276}
 5277
 5278static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5279{
 5280	struct net_device *orig_dev = skb->dev;
 5281	struct packet_type *pt_prev = NULL;
 5282	int ret;
 5283
 5284	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5285	if (pt_prev)
 5286		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5287					 skb->dev, pt_prev, orig_dev);
 5288	return ret;
 5289}
 5290
 5291/**
 5292 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5293 *	@skb: buffer to process
 5294 *
 5295 *	More direct receive version of netif_receive_skb().  It should
 5296 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5297 *	Callers must also handle skbs for which ``(page_is_)pfmemalloc`` is true.
 5298 *
 5299 *	This function may only be called from softirq context and interrupts
 5300 *	should be enabled.
 5301 *
 5302 *	Return values (usually ignored):
 5303 *	NET_RX_SUCCESS: no congestion
 5304 *	NET_RX_DROP: packet was dropped
 5305 */
 5306int netif_receive_skb_core(struct sk_buff *skb)
 5307{
 5308	int ret;
 5309
 5310	rcu_read_lock();
 5311	ret = __netif_receive_skb_one_core(skb, false);
 5312	rcu_read_unlock();
 5313
 5314	return ret;
 5315}
 5316EXPORT_SYMBOL(netif_receive_skb_core);
 5317
 5318static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5319						  struct packet_type *pt_prev,
 5320						  struct net_device *orig_dev)
 5321{
 5322	struct sk_buff *skb, *next;
 5323
 5324	if (!pt_prev)
 5325		return;
 5326	if (list_empty(head))
 5327		return;
 5328	if (pt_prev->list_func != NULL)
 5329		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5330				   ip_list_rcv, head, pt_prev, orig_dev);
 5331	else
 5332		list_for_each_entry_safe(skb, next, head, list) {
 5333			skb_list_del_init(skb);
 5334			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5335		}
 5336}
 5337
 5338static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5339{
 5340	/* Fast-path assumptions:
 5341	 * - There is no RX handler.
 5342	 * - Only one packet_type matches.
 5343	 * If either of these fails, we will end up doing some per-packet
 5344	 * processing in-line, then handling the 'last ptype' for the whole
 5345	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5346	 * because the 'last ptype' must be constant across the sublist, and all
 5347	 * other ptypes are handled per-packet.
 5348	 */
 5349	/* Current (common) ptype of sublist */
 5350	struct packet_type *pt_curr = NULL;
 5351	/* Current (common) orig_dev of sublist */
 5352	struct net_device *od_curr = NULL;
 5353	struct list_head sublist;
 5354	struct sk_buff *skb, *next;
 5355
 5356	INIT_LIST_HEAD(&sublist);
 5357	list_for_each_entry_safe(skb, next, head, list) {
 5358		struct net_device *orig_dev = skb->dev;
 5359		struct packet_type *pt_prev = NULL;
 5360
 5361		skb_list_del_init(skb);
 5362		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5363		if (!pt_prev)
 5364			continue;
 5365		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5366			/* dispatch old sublist */
 5367			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5368			/* start new sublist */
 5369			INIT_LIST_HEAD(&sublist);
 5370			pt_curr = pt_prev;
 5371			od_curr = orig_dev;
 5372		}
 5373		list_add_tail(&skb->list, &sublist);
 5374	}
 5375
 5376	/* dispatch final sublist */
 5377	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5378}
 5379
 5380static int __netif_receive_skb(struct sk_buff *skb)
 5381{
 5382	int ret;
 5383
 5384	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5385		unsigned int noreclaim_flag;
 5386
 5387		/*
 5388		 * PFMEMALLOC skbs are special, they should
 5389		 * - be delivered to SOCK_MEMALLOC sockets only
 5390		 * - stay away from userspace
 5391		 * - have bounded memory usage
 5392		 *
 5393		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5394		 * context down to all allocation sites.
 5395		 */
 5396		noreclaim_flag = memalloc_noreclaim_save();
 5397		ret = __netif_receive_skb_one_core(skb, true);
 5398		memalloc_noreclaim_restore(noreclaim_flag);
 5399	} else
 5400		ret = __netif_receive_skb_one_core(skb, false);
 5401
 5402	return ret;
 5403}
 5404
 5405static void __netif_receive_skb_list(struct list_head *head)
 5406{
 5407	unsigned long noreclaim_flag = 0;
 5408	struct sk_buff *skb, *next;
 5409	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5410
 5411	list_for_each_entry_safe(skb, next, head, list) {
 5412		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5413			struct list_head sublist;
 5414
 5415			/* Handle the previous sublist */
 5416			list_cut_before(&sublist, head, &skb->list);
 5417			if (!list_empty(&sublist))
 5418				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5419			pfmemalloc = !pfmemalloc;
 5420			/* See comments in __netif_receive_skb */
 5421			if (pfmemalloc)
 5422				noreclaim_flag = memalloc_noreclaim_save();
 5423			else
 5424				memalloc_noreclaim_restore(noreclaim_flag);
 5425		}
 5426	}
 5427	/* Handle the remaining sublist */
 5428	if (!list_empty(head))
 5429		__netif_receive_skb_list_core(head, pfmemalloc);
 5430	/* Restore pflags */
 5431	if (pfmemalloc)
 5432		memalloc_noreclaim_restore(noreclaim_flag);
 5433}
 5434
 5435static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5436{
 5437	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5438	struct bpf_prog *new = xdp->prog;
 5439	int ret = 0;
 5440
 5441	if (new) {
 5442		u32 i;
 5443
 5444		/* generic XDP does not work with DEVMAPs that can
 5445		 * have a bpf_prog installed on an entry
 5446		 */
 5447		for (i = 0; i < new->aux->used_map_cnt; i++) {
 5448			if (dev_map_can_have_prog(new->aux->used_maps[i]))
 5449				return -EINVAL;
 5450			if (cpu_map_prog_allowed(new->aux->used_maps[i]))
 5451				return -EINVAL;
 5452		}
 5453	}
 5454
 5455	switch (xdp->command) {
 5456	case XDP_SETUP_PROG:
 5457		rcu_assign_pointer(dev->xdp_prog, new);
 5458		if (old)
 5459			bpf_prog_put(old);
 5460
 5461		if (old && !new) {
 5462			static_branch_dec(&generic_xdp_needed_key);
 5463		} else if (new && !old) {
 5464			static_branch_inc(&generic_xdp_needed_key);
 5465			dev_disable_lro(dev);
 5466			dev_disable_gro_hw(dev);
 5467		}
 5468		break;
 5469
 5470	default:
 5471		ret = -EINVAL;
 5472		break;
 5473	}
 5474
 5475	return ret;
 5476}
 5477
 5478static int netif_receive_skb_internal(struct sk_buff *skb)
 5479{
 5480	int ret;
 5481
 5482	net_timestamp_check(netdev_tstamp_prequeue, skb);
 5483
 5484	if (skb_defer_rx_timestamp(skb))
 5485		return NET_RX_SUCCESS;
 5486
 5487	rcu_read_lock();
 5488#ifdef CONFIG_RPS
 5489	if (static_branch_unlikely(&rps_needed)) {
 5490		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5491		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5492
 5493		if (cpu >= 0) {
 5494			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5495			rcu_read_unlock();
 5496			return ret;
 5497		}
 5498	}
 5499#endif
 5500	ret = __netif_receive_skb(skb);
 5501	rcu_read_unlock();
 5502	return ret;
 5503}
 5504
 5505static void netif_receive_skb_list_internal(struct list_head *head)
 5506{
 5507	struct sk_buff *skb, *next;
 5508	struct list_head sublist;
 5509
 5510	INIT_LIST_HEAD(&sublist);
 5511	list_for_each_entry_safe(skb, next, head, list) {
 5512		net_timestamp_check(netdev_tstamp_prequeue, skb);
 5513		skb_list_del_init(skb);
 5514		if (!skb_defer_rx_timestamp(skb))
 5515			list_add_tail(&skb->list, &sublist);
 5516	}
 5517	list_splice_init(&sublist, head);
 5518
 5519	rcu_read_lock();
 5520#ifdef CONFIG_RPS
 5521	if (static_branch_unlikely(&rps_needed)) {
 5522		list_for_each_entry_safe(skb, next, head, list) {
 5523			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5524			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5525
 5526			if (cpu >= 0) {
 5527				/* Will be handled, remove from list */
 5528				skb_list_del_init(skb);
 5529				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5530			}
 5531		}
 5532	}
 5533#endif
 5534	__netif_receive_skb_list(head);
 5535	rcu_read_unlock();
 5536}
 5537
 5538/**
 5539 *	netif_receive_skb - process receive buffer from network
 5540 *	@skb: buffer to process
 5541 *
 5542 *	netif_receive_skb() is the main receive data processing function.
 5543 *	It always succeeds. The buffer may be dropped during processing
 5544 *	for congestion control or by the protocol layers.
 5545 *
 5546 *	This function may only be called from softirq context and interrupts
 5547 *	should be enabled.
 5548 *
 5549 *	Return values (usually ignored):
 5550 *	NET_RX_SUCCESS: no congestion
 5551 *	NET_RX_DROP: packet was dropped
 5552 */
 5553int netif_receive_skb(struct sk_buff *skb)
 5554{
 5555	int ret;
 5556
 5557	trace_netif_receive_skb_entry(skb);
 5558
 5559	ret = netif_receive_skb_internal(skb);
 5560	trace_netif_receive_skb_exit(ret);
 5561
 5562	return ret;
 5563}
 5564EXPORT_SYMBOL(netif_receive_skb);
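
/* Editorial sketch (not part of the original file): a NAPI poll routine
 * delivering completed frames with netif_receive_skb() from softirq context.
 * foo_priv, foo_hw_next_rx() and foo_refill_rx() are illustrative
 * assumptions, not a real driver API.
 */
#if 0
static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = foo_hw_next_rx(priv);

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, priv->netdev);
		netif_receive_skb(skb);
		work_done++;
	}

	foo_refill_rx(priv);
	if (work_done < budget)
		napi_complete_done(napi, work_done);
	return work_done;
}
#endif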
 5565
 5566/**
 5567 *	netif_receive_skb_list - process many receive buffers from network
 5568 *	@head: list of skbs to process.
 5569 *
 5570 *	Since the return value of netif_receive_skb() is normally ignored, and
 5571 *	wouldn't be meaningful for a list, this function returns void.
 5572 *
 5573 *	This function may only be called from softirq context and interrupts
 5574 *	should be enabled.
 5575 */
 5576void netif_receive_skb_list(struct list_head *head)
 5577{
 5578	struct sk_buff *skb;
 5579
 5580	if (list_empty(head))
 5581		return;
 5582	if (trace_netif_receive_skb_list_entry_enabled()) {
 5583		list_for_each_entry(skb, head, list)
 5584			trace_netif_receive_skb_list_entry(skb);
 5585	}
 5586	netif_receive_skb_list_internal(head);
 5587	trace_netif_receive_skb_list_exit(0);
 5588}
 5589EXPORT_SYMBOL(netif_receive_skb_list);
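
/* Editorial sketch (not part of the original file): batching frames on a
 * list and handing the whole batch to netif_receive_skb_list(), which lets
 * the core amortize per-packet costs.  foo_hw_next_rx() is an illustrative
 * assumption.
 */
#if 0
static int foo_poll_listified(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	struct sk_buff *skb;
	LIST_HEAD(rx_list);
	int work_done = 0;

	while (work_done < budget && (skb = foo_hw_next_rx(priv))) {
		skb->protocol = eth_type_trans(skb, priv->netdev);
		list_add_tail(&skb->list, &rx_list);
		work_done++;
	}

	/* A no-op if the list is empty, see netif_receive_skb_list() above. */
	netif_receive_skb_list(&rx_list);

	if (work_done < budget)
		napi_complete_done(napi, work_done);
	return work_done;
}
#endif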
 5590
 5591static DEFINE_PER_CPU(struct work_struct, flush_works);
 5592
 5593/* Network device is going away, flush any packets still pending */
 5594static void flush_backlog(struct work_struct *work)
 5595{
 5596	struct sk_buff *skb, *tmp;
 5597	struct softnet_data *sd;
 5598
 5599	local_bh_disable();
 5600	sd = this_cpu_ptr(&softnet_data);
 5601
 5602	local_irq_disable();
 5603	rps_lock(sd);
 5604	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5605		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5606			__skb_unlink(skb, &sd->input_pkt_queue);
 5607			dev_kfree_skb_irq(skb);
 5608			input_queue_head_incr(sd);
 5609		}
 5610	}
 5611	rps_unlock(sd);
 5612	local_irq_enable();
 5613
 5614	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5615		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5616			__skb_unlink(skb, &sd->process_queue);
 5617			kfree_skb(skb);
 5618			input_queue_head_incr(sd);
 5619		}
 5620	}
 5621	local_bh_enable();
 5622}
 5623
 5624static void flush_all_backlogs(void)
 5625{
 5626	unsigned int cpu;
 5627
 5628	get_online_cpus();
 5629
 5630	for_each_online_cpu(cpu)
 5631		queue_work_on(cpu, system_highpri_wq,
 5632			      per_cpu_ptr(&flush_works, cpu));
 5633
 5634	for_each_online_cpu(cpu)
 5635		flush_work(per_cpu_ptr(&flush_works, cpu));
 5636
 5637	put_online_cpus();
 5638}
 5639
 5640/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 5641static void gro_normal_list(struct napi_struct *napi)
 5642{
 5643	if (!napi->rx_count)
 5644		return;
 5645	netif_receive_skb_list_internal(&napi->rx_list);
 5646	INIT_LIST_HEAD(&napi->rx_list);
 5647	napi->rx_count = 0;
 5648}
 5649
 5650/* Queue one GRO_NORMAL SKB up for list processing. If the batch size is
 5651 * exceeded, pass the whole batch up to the stack.
 5652 */
 5653static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 5654{
 5655	list_add_tail(&skb->list, &napi->rx_list);
 5656	if (++napi->rx_count >= gro_normal_batch)
 5657		gro_normal_list(napi);
 5658}
 5659
 5660INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 5661INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
 5662static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 5663{
 5664	struct packet_offload *ptype;
 5665	__be16 type = skb->protocol;
 5666	struct list_head *head = &offload_base;
 5667	int err = -ENOENT;
 5668
 5669	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 5670
 5671	if (NAPI_GRO_CB(skb)->count == 1) {
 5672		skb_shinfo(skb)->gso_size = 0;
 5673		goto out;
 5674	}
 5675
 5676	rcu_read_lock();
 5677	list_for_each_entry_rcu(ptype, head, list) {
 5678		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5679			continue;
 5680
 5681		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
 5682					 ipv6_gro_complete, inet_gro_complete,
 5683					 skb, 0);
 5684		break;
 5685	}
 5686	rcu_read_unlock();
 5687
 5688	if (err) {
 5689		WARN_ON(&ptype->list == head);
 5690		kfree_skb(skb);
 5691		return NET_RX_SUCCESS;
 5692	}
 5693
 5694out:
 5695	gro_normal_one(napi, skb);
 5696	return NET_RX_SUCCESS;
 5697}
 5698
 5699static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 5700				   bool flush_old)
 5701{
 5702	struct list_head *head = &napi->gro_hash[index].list;
 5703	struct sk_buff *skb, *p;
 5704
 5705	list_for_each_entry_safe_reverse(skb, p, head, list) {
 5706		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 5707			return;
 5708		skb_list_del_init(skb);
 5709		napi_gro_complete(napi, skb);
 5710		napi->gro_hash[index].count--;
 5711	}
 5712
 5713	if (!napi->gro_hash[index].count)
 5714		__clear_bit(index, &napi->gro_bitmask);
 5715}
 5716
 5717/* napi->gro_hash[].list contains packets ordered by age,
 5718 * with the youngest packets at its head.
 5719 * Complete skbs in reverse order to reduce latencies.
 5720 */
 5721void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 5722{
 5723	unsigned long bitmask = napi->gro_bitmask;
 5724	unsigned int i, base = ~0U;
 5725
 5726	while ((i = ffs(bitmask)) != 0) {
 5727		bitmask >>= i;
 5728		base += i;
 5729		__napi_gro_flush_chain(napi, base, flush_old);
 5730	}
 5731}
 5732EXPORT_SYMBOL(napi_gro_flush);
 5733
 5734static struct list_head *gro_list_prepare(struct napi_struct *napi,
 5735					  struct sk_buff *skb)
 5736{
 5737	unsigned int maclen = skb->dev->hard_header_len;
 5738	u32 hash = skb_get_hash_raw(skb);
 5739	struct list_head *head;
 5740	struct sk_buff *p;
 5741
 5742	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 5743	list_for_each_entry(p, head, list) {
 5744		unsigned long diffs;
 5745
 5746		NAPI_GRO_CB(p)->flush = 0;
 5747
 5748		if (hash != skb_get_hash_raw(p)) {
 5749			NAPI_GRO_CB(p)->same_flow = 0;
 5750			continue;
 5751		}
 5752
 5753		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 5754		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
 5755		if (skb_vlan_tag_present(p))
 5756			diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
 5757		diffs |= skb_metadata_dst_cmp(p, skb);
 5758		diffs |= skb_metadata_differs(p, skb);
 5759		if (maclen == ETH_HLEN)
 5760			diffs |= compare_ether_header(skb_mac_header(p),
 5761						      skb_mac_header(skb));
 5762		else if (!diffs)
 5763			diffs = memcmp(skb_mac_header(p),
 5764				       skb_mac_header(skb),
 5765				       maclen);
 5766		NAPI_GRO_CB(p)->same_flow = !diffs;
 5767	}
 5768
 5769	return head;
 5770}
 5771
 5772static void skb_gro_reset_offset(struct sk_buff *skb)
 5773{
 5774	const struct skb_shared_info *pinfo = skb_shinfo(skb);
 5775	const skb_frag_t *frag0 = &pinfo->frags[0];
 5776
 5777	NAPI_GRO_CB(skb)->data_offset = 0;
 5778	NAPI_GRO_CB(skb)->frag0 = NULL;
 5779	NAPI_GRO_CB(skb)->frag0_len = 0;
 5780
 5781	if (!skb_headlen(skb) && pinfo->nr_frags &&
 5782	    !PageHighMem(skb_frag_page(frag0))) {
 5783		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 5784		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
 5785						    skb_frag_size(frag0),
 5786						    skb->end - skb->tail);
 5787	}
 5788}
 5789
 5790static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 5791{
 5792	struct skb_shared_info *pinfo = skb_shinfo(skb);
 5793
 5794	BUG_ON(skb->end - skb->tail < grow);
 5795
 5796	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
 5797
 5798	skb->data_len -= grow;
 5799	skb->tail += grow;
 5800
 5801	skb_frag_off_add(&pinfo->frags[0], grow);
 5802	skb_frag_size_sub(&pinfo->frags[0], grow);
 5803
 5804	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
 5805		skb_frag_unref(skb, 0);
 5806		memmove(pinfo->frags, pinfo->frags + 1,
 5807			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
 5808	}
 5809}
 5810
 5811static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
 5812{
 5813	struct sk_buff *oldest;
 5814
 5815	oldest = list_last_entry(head, struct sk_buff, list);
 5816
 5817	/* We are called with head length >= MAX_GRO_SKBS, so this is
 5818	 * impossible.
 5819	 */
 5820	if (WARN_ON_ONCE(!oldest))
 5821		return;
 5822
 5823	/* Do not adjust napi->gro_hash[].count, caller is adding a new
 5824	 * SKB to the chain.
 5825	 */
 5826	skb_list_del_init(oldest);
 5827	napi_gro_complete(napi, oldest);
 5828}
 5829
 5830INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 5831							   struct sk_buff *));
 5832INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 5833							   struct sk_buff *));
 5834static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5835{
 5836	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 5837	struct list_head *head = &offload_base;
 5838	struct packet_offload *ptype;
 5839	__be16 type = skb->protocol;
 5840	struct list_head *gro_head;
 5841	struct sk_buff *pp = NULL;
 5842	enum gro_result ret;
 5843	int same_flow;
 5844	int grow;
 5845
 5846	if (netif_elide_gro(skb->dev))
 5847		goto normal;
 5848
 5849	gro_head = gro_list_prepare(napi, skb);
 5850
 5851	rcu_read_lock();
 5852	list_for_each_entry_rcu(ptype, head, list) {
 5853		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5854			continue;
 5855
 5856		skb_set_network_header(skb, skb_gro_offset(skb));
 5857		skb_reset_mac_len(skb);
 5858		NAPI_GRO_CB(skb)->same_flow = 0;
 5859		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
 5860		NAPI_GRO_CB(skb)->free = 0;
 5861		NAPI_GRO_CB(skb)->encap_mark = 0;
 5862		NAPI_GRO_CB(skb)->recursion_counter = 0;
 5863		NAPI_GRO_CB(skb)->is_fou = 0;
 5864		NAPI_GRO_CB(skb)->is_atomic = 1;
 5865		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 5866
 5867		/* Setup for GRO checksum validation */
 5868		switch (skb->ip_summed) {
 5869		case CHECKSUM_COMPLETE:
 5870			NAPI_GRO_CB(skb)->csum = skb->csum;
 5871			NAPI_GRO_CB(skb)->csum_valid = 1;
 5872			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5873			break;
 5874		case CHECKSUM_UNNECESSARY:
 5875			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
 5876			NAPI_GRO_CB(skb)->csum_valid = 0;
 5877			break;
 5878		default:
 5879			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5880			NAPI_GRO_CB(skb)->csum_valid = 0;
 5881		}
 5882
 5883		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
 5884					ipv6_gro_receive, inet_gro_receive,
 5885					gro_head, skb);
 5886		break;
 5887	}
 5888	rcu_read_unlock();
 5889
 5890	if (&ptype->list == head)
 5891		goto normal;
 5892
 5893	if (PTR_ERR(pp) == -EINPROGRESS) {
 5894		ret = GRO_CONSUMED;
 5895		goto ok;
 5896	}
 5897
 5898	same_flow = NAPI_GRO_CB(skb)->same_flow;
 5899	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 5900
 5901	if (pp) {
 5902		skb_list_del_init(pp);
 5903		napi_gro_complete(napi, pp);
 5904		napi->gro_hash[hash].count--;
 5905	}
 5906
 5907	if (same_flow)
 5908		goto ok;
 5909
 5910	if (NAPI_GRO_CB(skb)->flush)
 5911		goto normal;
 5912
 5913	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 5914		gro_flush_oldest(napi, gro_head);
 5915	} else {
 5916		napi->gro_hash[hash].count++;
 5917	}
 5918	NAPI_GRO_CB(skb)->count = 1;
 5919	NAPI_GRO_CB(skb)->age = jiffies;
 5920	NAPI_GRO_CB(skb)->last = skb;
 5921	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 5922	list_add(&skb->list, gro_head);
 5923	ret = GRO_HELD;
 5924
 5925pull:
 5926	grow = skb_gro_offset(skb) - skb_headlen(skb);
 5927	if (grow > 0)
 5928		gro_pull_from_frag0(skb, grow);
 5929ok:
 5930	if (napi->gro_hash[hash].count) {
 5931		if (!test_bit(hash, &napi->gro_bitmask))
 5932			__set_bit(hash, &napi->gro_bitmask);
 5933	} else if (test_bit(hash, &napi->gro_bitmask)) {
 5934		__clear_bit(hash, &napi->gro_bitmask);
 5935	}
 5936
 5937	return ret;
 5938
 5939normal:
 5940	ret = GRO_NORMAL;
 5941	goto pull;
 5942}
 5943
 5944struct packet_offload *gro_find_receive_by_type(__be16 type)
 5945{
 5946	struct list_head *offload_head = &offload_base;
 5947	struct packet_offload *ptype;
 5948
 5949	list_for_each_entry_rcu(ptype, offload_head, list) {
 5950		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5951			continue;
 5952		return ptype;
 5953	}
 5954	return NULL;
 5955}
 5956EXPORT_SYMBOL(gro_find_receive_by_type);
 5957
 5958struct packet_offload *gro_find_complete_by_type(__be16 type)
 5959{
 5960	struct list_head *offload_head = &offload_base;
 5961	struct packet_offload *ptype;
 5962
 5963	list_for_each_entry_rcu(ptype, offload_head, list) {
 5964		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5965			continue;
 5966		return ptype;
 5967	}
 5968	return NULL;
 5969}
 5970EXPORT_SYMBOL(gro_find_complete_by_type);
 5971
 5972static void napi_skb_free_stolen_head(struct sk_buff *skb)
 5973{
 5974	skb_dst_drop(skb);
 5975	skb_ext_put(skb);
 5976	kmem_cache_free(skbuff_head_cache, skb);
 5977}
 5978
 5979static gro_result_t napi_skb_finish(struct napi_struct *napi,
 5980				    struct sk_buff *skb,
 5981				    gro_result_t ret)
 5982{
 5983	switch (ret) {
 5984	case GRO_NORMAL:
 5985		gro_normal_one(napi, skb);
 5986		break;
 5987
 5988	case GRO_DROP:
 5989		kfree_skb(skb);
 5990		break;
 5991
 5992	case GRO_MERGED_FREE:
 5993		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 5994			napi_skb_free_stolen_head(skb);
 5995		else
 5996			__kfree_skb(skb);
 5997		break;
 5998
 5999	case GRO_HELD:
 6000	case GRO_MERGED:
 6001	case GRO_CONSUMED:
 6002		break;
 6003	}
 6004
 6005	return ret;
 6006}
 6007
 6008gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 6009{
 6010	gro_result_t ret;
 6011
 6012	skb_mark_napi_id(skb, napi);
 6013	trace_napi_gro_receive_entry(skb);
 6014
 6015	skb_gro_reset_offset(skb);
 6016
 6017	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
 6018	trace_napi_gro_receive_exit(ret);
 6019
 6020	return ret;
 6021}
 6022EXPORT_SYMBOL(napi_gro_receive);
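
/* Editorial sketch (not part of the original file): the usual way a NAPI
 * driver feeds received frames into GRO.  The skb is handed over
 * unconditionally; GRO merges it, holds it, or batches it up to the stack.
 * foo_hw_next_rx() and priv->rx_queue_index are illustrative assumptions.
 */
#if 0
static void foo_rx_clean(struct foo_priv *priv, int budget, int *work_done)
{
	struct sk_buff *skb;

	while (*work_done < budget && (skb = foo_hw_next_rx(priv))) {
		skb_record_rx_queue(skb, priv->rx_queue_index);
		skb->protocol = eth_type_trans(skb, priv->netdev);
		napi_gro_receive(&priv->napi, skb);
		(*work_done)++;
	}
}
#endif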
 6023
 6024static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 6025{
 6026	if (unlikely(skb->pfmemalloc)) {
 6027		consume_skb(skb);
 6028		return;
 6029	}
 6030	__skb_pull(skb, skb_headlen(skb));
 6031	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
 6032	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
 6033	__vlan_hwaccel_clear_tag(skb);
 6034	skb->dev = napi->dev;
 6035	skb->skb_iif = 0;
 6036
 6037	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
 6038	skb->pkt_type = PACKET_HOST;
 6039
 6040	skb->encapsulation = 0;
 6041	skb_shinfo(skb)->gso_type = 0;
 6042	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 6043	skb_ext_reset(skb);
 6044
 6045	napi->skb = skb;
 6046}
 6047
 6048struct sk_buff *napi_get_frags(struct napi_struct *napi)
 6049{
 6050	struct sk_buff *skb = napi->skb;
 6051
 6052	if (!skb) {
 6053		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 6054		if (skb) {
 6055			napi->skb = skb;
 6056			skb_mark_napi_id(skb, napi);
 6057		}
 6058	}
 6059	return skb;
 6060}
 6061EXPORT_SYMBOL(napi_get_frags);
 6062
 6063static gro_result_t napi_frags_finish(struct napi_struct *napi,
 6064				      struct sk_buff *skb,
 6065				      gro_result_t ret)
 6066{
 6067	switch (ret) {
 6068	case GRO_NORMAL:
 6069	case GRO_HELD:
 6070		__skb_push(skb, ETH_HLEN);
 6071		skb->protocol = eth_type_trans(skb, skb->dev);
 6072		if (ret == GRO_NORMAL)
 6073			gro_normal_one(napi, skb);
 6074		break;
 6075
 6076	case GRO_DROP:
 6077		napi_reuse_skb(napi, skb);
 6078		break;
 6079
 6080	case GRO_MERGED_FREE:
 6081		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 6082			napi_skb_free_stolen_head(skb);
 6083		else
 6084			napi_reuse_skb(napi, skb);
 6085		break;
 6086
 6087	case GRO_MERGED:
 6088	case GRO_CONSUMED:
 6089		break;
 6090	}
 6091
 6092	return ret;
 6093}
 6094
 6095/* The upper GRO stack assumes the network header starts at gro_offset=0.
 6096 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
 6097 * we copy the ethernet header into skb->data to have a common layout.
 6098 */
 6099static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 6100{
 6101	struct sk_buff *skb = napi->skb;
 6102	const struct ethhdr *eth;
 6103	unsigned int hlen = sizeof(*eth);
 6104
 6105	napi->skb = NULL;
 6106
 6107	skb_reset_mac_header(skb);
 6108	skb_gro_reset_offset(skb);
 6109
 6110	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 6111		eth = skb_gro_header_slow(skb, hlen, 0);
 6112		if (unlikely(!eth)) {
 6113			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
 6114					     __func__, napi->dev->name);
 6115			napi_reuse_skb(napi, skb);
 6116			return NULL;
 6117		}
 6118	} else {
 6119		eth = (const struct ethhdr *)skb->data;
 6120		gro_pull_from_frag0(skb, hlen);
 6121		NAPI_GRO_CB(skb)->frag0 += hlen;
 6122		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 6123	}
 6124	__skb_pull(skb, hlen);
 6125
 6126	/*
 6127	 * This works because the only protocols we care about don't require
 6128	 * special handling.
 6129	 * We'll fix it up properly in napi_frags_finish()
 6130	 */
 6131	skb->protocol = eth->h_proto;
 6132
 6133	return skb;
 6134}
 6135
 6136gro_result_t napi_gro_frags(struct napi_struct *napi)
 6137{
 6138	gro_result_t ret;
 6139	struct sk_buff *skb = napi_frags_skb(napi);
 6140
 6141	if (!skb)
 6142		return GRO_DROP;
 6143
 6144	trace_napi_gro_frags_entry(skb);
 6145
 6146	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 6147	trace_napi_gro_frags_exit(ret);
 6148
 6149	return ret;
 6150}
 6151EXPORT_SYMBOL(napi_gro_frags);
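
/* Editorial sketch (not part of the original file): the frag-based GRO entry
 * point.  The driver borrows napi->skb via napi_get_frags(), attaches the
 * received page fragment, and hands it back with napi_gro_frags(), which
 * extracts the ethernet header from frag0 itself.  struct foo_rx_desc and
 * its fields are illustrative assumptions.
 */
#if 0
static void foo_rx_frags_one(struct foo_priv *priv, struct foo_rx_desc *desc)
{
	struct napi_struct *napi = &priv->napi;
	struct sk_buff *skb;

	skb = napi_get_frags(napi);
	if (!skb)
		return;		/* allocation failure; caller recycles the page */

	skb_add_rx_frag(skb, 0, desc->page, desc->page_offset,
			desc->len, PAGE_SIZE);

	/* No eth_type_trans() here: napi_frags_skb() above pulls the
	 * ethernet header and sets skb->protocol for us.
	 */
	napi_gro_frags(napi);
}
#endif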
 6152
 6153/* Compute the checksum from gro_offset and return the folded value
 6154 * after adding in any pseudo checksum.
 6155 */
 6156__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 6157{
 6158	__wsum wsum;
 6159	__sum16 sum;
 6160
 6161	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
 6162
 6163	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
 6164	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
 6165	/* See comments in __skb_checksum_complete(). */
 6166	if (likely(!sum)) {
 6167		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
 6168		    !skb->csum_complete_sw)
 6169			netdev_rx_csum_fault(skb->dev, skb);
 6170	}
 6171
 6172	NAPI_GRO_CB(skb)->csum = wsum;
 6173	NAPI_GRO_CB(skb)->csum_valid = 1;
 6174
 6175	return sum;
 6176}
 6177EXPORT_SYMBOL(__skb_gro_checksum_complete);
 6178
 6179static void net_rps_send_ipi(struct softnet_data *remsd)
 6180{
 6181#ifdef CONFIG_RPS
 6182	while (remsd) {
 6183		struct softnet_data *next = remsd->rps_ipi_next;
 6184
 6185		if (cpu_online(remsd->cpu))
 6186			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 6187		remsd = next;
 6188	}
 6189#endif
 6190}
 6191
 6192/*
 6193 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 6194 * Note: called with local irq disabled, but exits with local irq enabled.
 6195 */
 6196static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 6197{
 6198#ifdef CONFIG_RPS
 6199	struct softnet_data *remsd = sd->rps_ipi_list;
 6200
 6201	if (remsd) {
 6202		sd->rps_ipi_list = NULL;
 6203
 6204		local_irq_enable();
 6205
 6206		/* Send pending IPI's to kick RPS processing on remote cpus. */
 6207		net_rps_send_ipi(remsd);
 6208	} else
 6209#endif
 6210		local_irq_enable();
 6211}
 6212
 6213static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 6214{
 6215#ifdef CONFIG_RPS
 6216	return sd->rps_ipi_list != NULL;
 6217#else
 6218	return false;
 6219#endif
 6220}
 6221
 6222static int process_backlog(struct napi_struct *napi, int quota)
 6223{
 6224	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 6225	bool again = true;
 6226	int work = 0;
 6227
 6228	/* Check if we have pending IPIs; it's better to send them now
 6229	 * rather than waiting for net_rx_action() to end.
 6230	 */
 6231	if (sd_has_rps_ipi_waiting(sd)) {
 6232		local_irq_disable();
 6233		net_rps_action_and_irq_enable(sd);
 6234	}
 6235
 6236	napi->weight = dev_rx_weight;
 6237	while (again) {
 6238		struct sk_buff *skb;
 6239
 6240		while ((skb = __skb_dequeue(&sd->process_queue))) {
 6241			rcu_read_lock();
 6242			__netif_receive_skb(skb);
 6243			rcu_read_unlock();
 6244			input_queue_head_incr(sd);
 6245			if (++work >= quota)
 6246				return work;
 6247
 6248		}
 6249
 6250		local_irq_disable();
 6251		rps_lock(sd);
 6252		if (skb_queue_empty(&sd->input_pkt_queue)) {
 6253			/*
 6254			 * Inline a custom version of __napi_complete().
 6255			 * Only the current cpu owns and manipulates this napi,
 6256			 * and NAPI_STATE_SCHED is the only possible flag set
 6257			 * on backlog.
 6258			 * We can use a plain write instead of clear_bit(),
 6259			 * and we don't need an smp_mb() memory barrier.
 6260			 */
 6261			napi->state = 0;
 6262			again = false;
 6263		} else {
 6264			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 6265						   &sd->process_queue);
 6266		}
 6267		rps_unlock(sd);
 6268		local_irq_enable();
 6269	}
 6270
 6271	return work;
 6272}
 6273
 6274/**
 6275 * __napi_schedule - schedule for receive
 6276 * @n: entry to schedule
 6277 *
 6278 * The entry's receive function will be scheduled to run.
 6279 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6280 */
 6281void __napi_schedule(struct napi_struct *n)
 6282{
 6283	unsigned long flags;
 6284
 6285	local_irq_save(flags);
 6286	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6287	local_irq_restore(flags);
 6288}
 6289EXPORT_SYMBOL(__napi_schedule);
 6290
 6291/**
 6292 *	napi_schedule_prep - check if napi can be scheduled
 6293 *	@n: napi context
 6294 *
 6295 * Test if NAPI routine is already running, and if not mark
 6296 * it as running.  This is used as a condition variable
 6297 * to ensure only one NAPI poll instance runs.  We also make
 6298 * sure there is no pending NAPI disable.
 6299 */
 6300bool napi_schedule_prep(struct napi_struct *n)
 6301{
 6302	unsigned long val, new;
 6303
 6304	do {
 6305		val = READ_ONCE(n->state);
 6306		if (unlikely(val & NAPIF_STATE_DISABLE))
 6307			return false;
 6308		new = val | NAPIF_STATE_SCHED;
 6309
 6310		/* Sets STATE_MISSED bit if STATE_SCHED was already set
 6311		 * This was suggested by Alexander Duyck, as the compiler
 6312		 * emits better code than:
 6313		 * if (val & NAPIF_STATE_SCHED)
 6314		 *     new |= NAPIF_STATE_MISSED;
 6315		 */
 6316		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 6317						   NAPIF_STATE_MISSED;
 6318	} while (cmpxchg(&n->state, val, new) != val);
 6319
 6320	return !(val & NAPIF_STATE_SCHED);
 6321}
 6322EXPORT_SYMBOL(napi_schedule_prep);
 6323
 6324/**
 6325 * __napi_schedule_irqoff - schedule for receive
 6326 * @n: entry to schedule
 6327 *
 6328 * Variant of __napi_schedule() assuming hard irqs are masked
 6329 */
 6330void __napi_schedule_irqoff(struct napi_struct *n)
 6331{
 6332	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6333}
 6334EXPORT_SYMBOL(__napi_schedule_irqoff);
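
/* Editorial sketch (not part of the original file): the canonical interrupt
 * handler built on napi_schedule_prep()/__napi_schedule(): mask further
 * device interrupts and defer the work to the poll routine.
 * foo_disable_irqs() is an illustrative assumption.
 */
#if 0
static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
	struct foo_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		foo_disable_irqs(priv);		/* stop the interrupt storm */
		__napi_schedule(&priv->napi);	/* foo_poll() runs from softirq */
	}
	return IRQ_HANDLED;
}
#endif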
 6335
 6336bool napi_complete_done(struct napi_struct *n, int work_done)
 6337{
 6338	unsigned long flags, val, new, timeout = 0;
 6339	bool ret = true;
 6340
 6341	/*
 6342	 * 1) Don't let napi dequeue from the cpu poll list
 6343	 *    just in case it's running on a different cpu.
 6344	 * 2) If we are busy polling, do nothing here, we have
 6345	 *    the guarantee we will be called later.
 6346	 */
 6347	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 6348				 NAPIF_STATE_IN_BUSY_POLL)))
 6349		return false;
 6350
 6351	if (work_done) {
 6352		if (n->gro_bitmask)
 6353			timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6354		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
 6355	}
 6356	if (n->defer_hard_irqs_count > 0) {
 6357		n->defer_hard_irqs_count--;
 6358		timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6359		if (timeout)
 6360			ret = false;
 6361	}
 6362	if (n->gro_bitmask) {
 6363		/* When the NAPI instance uses a timeout and keeps postponing
 6364		 * it, we need to somehow bound the time packets are kept in
 6365		 * the GRO layer.
 6366		 */
 6367		napi_gro_flush(n, !!timeout);
 6368	}
 6369
 6370	gro_normal_list(n);
 6371
 6372	if (unlikely(!list_empty(&n->poll_list))) {
 6373		/* If n->poll_list is not empty, we need to mask irqs */
 6374		local_irq_save(flags);
 6375		list_del_init(&n->poll_list);
 6376		local_irq_restore(flags);
 6377	}
 6378
 6379	do {
 6380		val = READ_ONCE(n->state);
 6381
 6382		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 6383
 6384		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
 6385
 6386		/* If STATE_MISSED was set, leave STATE_SCHED set,
 6387		 * because we will call napi->poll() one more time.
 6388		 * This C code was suggested by Alexander Duyck to help gcc.
 6389		 */
 6390		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
 6391						    NAPIF_STATE_SCHED;
 6392	} while (cmpxchg(&n->state, val, new) != val);
 6393
 6394	if (unlikely(val & NAPIF_STATE_MISSED)) {
 6395		__napi_schedule(n);
 6396		return false;
 6397	}
 6398
 6399	if (timeout)
 6400		hrtimer_start(&n->timer, ns_to_ktime(timeout),
 6401			      HRTIMER_MODE_REL_PINNED);
 6402	return ret;
 6403}
 6404EXPORT_SYMBOL(napi_complete_done);
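
/* Editorial sketch (not part of the original file): the completion half of
 * the interrupt/poll pattern.  When the budget is not exhausted,
 * napi_complete_done() is called and device interrupts are re-enabled only
 * if it returns true (no deferral, no pending reschedule).
 * foo_rx_clean_budgeted() and foo_enable_irqs() are illustrative assumptions.
 */
#if 0
static int foo_poll_and_rearm(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	int work_done = foo_rx_clean_budgeted(priv, budget);

	if (work_done < budget && napi_complete_done(napi, work_done))
		foo_enable_irqs(priv);	/* NAPI really finished; rearm the IRQ */

	return work_done;
}
#endif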
 6405
 6406/* must be called under rcu_read_lock(), as we don't take a reference */
 6407static struct napi_struct *napi_by_id(unsigned int napi_id)
 6408{
 6409	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6410	struct napi_struct *napi;
 6411
 6412	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6413		if (napi->napi_id == napi_id)
 6414			return napi;
 6415
 6416	return NULL;
 6417}
 6418
 6419#if defined(CONFIG_NET_RX_BUSY_POLL)
 6420
 6421#define BUSY_POLL_BUDGET 8
 6422
 6423static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 6424{
 6425	int rc;
 6426
 6427	/* Busy polling means there is a high chance device driver hard irq
 6428	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6429	 * set in napi_schedule_prep().
 6430	 * Since we are about to call napi->poll() once more, we can safely
 6431	 * clear NAPI_STATE_MISSED.
 6432	 *
 6433	 * Note: x86 could use a single "lock and ..." instruction
 6434	 * to perform these two clear_bit()
 6435	 */
 6436	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6437	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6438
 6439	local_bh_disable();
 6440
 6441	/* All we really want here is to re-enable device interrupts.
 6442	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6443	 */
 6444	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 6445	/* We can't gro_normal_list() here, because napi->poll() might have
 6446	 * rearmed the napi (napi_complete_done()) in which case it could
 6447	 * already be running on another CPU.
 6448	 */
 6449	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
 6450	netpoll_poll_unlock(have_poll_lock);
 6451	if (rc == BUSY_POLL_BUDGET) {
 6452		/* As the whole budget was spent, we still own the napi so can
 6453		 * safely handle the rx_list.
 6454		 */
 6455		gro_normal_list(napi);
 6456		__napi_schedule(napi);
 6457	}
 6458	local_bh_enable();
 6459}
 6460
 6461void napi_busy_loop(unsigned int napi_id,
 6462		    bool (*loop_end)(void *, unsigned long),
 6463		    void *loop_end_arg)
 6464{
 6465	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6466	int (*napi_poll)(struct napi_struct *napi, int budget);
 6467	void *have_poll_lock = NULL;
 6468	struct napi_struct *napi;
 6469
 6470restart:
 6471	napi_poll = NULL;
 6472
 6473	rcu_read_lock();
 6474
 6475	napi = napi_by_id(napi_id);
 6476	if (!napi)
 6477		goto out;
 6478
 6479	preempt_disable();
 6480	for (;;) {
 6481		int work = 0;
 6482
 6483		local_bh_disable();
 6484		if (!napi_poll) {
 6485			unsigned long val = READ_ONCE(napi->state);
 6486
 6487			/* If multiple threads are competing for this napi,
 6488			 * we avoid dirtying napi->state as much as we can.
 6489			 */
 6490			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6491				   NAPIF_STATE_IN_BUSY_POLL))
 6492				goto count;
 6493			if (cmpxchg(&napi->state, val,
 6494				    val | NAPIF_STATE_IN_BUSY_POLL |
 6495					  NAPIF_STATE_SCHED) != val)
 6496				goto count;
 6497			have_poll_lock = netpoll_poll_lock(napi);
 6498			napi_poll = napi->poll;
 6499		}
 6500		work = napi_poll(napi, BUSY_POLL_BUDGET);
 6501		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 6502		gro_normal_list(napi);
 6503count:
 6504		if (work > 0)
 6505			__NET_ADD_STATS(dev_net(napi->dev),
 6506					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6507		local_bh_enable();
 6508
 6509		if (!loop_end || loop_end(loop_end_arg, start_time))
 6510			break;
 6511
 6512		if (unlikely(need_resched())) {
 6513			if (napi_poll)
 6514				busy_poll_stop(napi, have_poll_lock);
 6515			preempt_enable();
 6516			rcu_read_unlock();
 6517			cond_resched();
 6518			if (loop_end(loop_end_arg, start_time))
 6519				return;
 6520			goto restart;
 6521		}
 6522		cpu_relax();
 6523	}
 6524	if (napi_poll)
 6525		busy_poll_stop(napi, have_poll_lock);
 6526	preempt_enable();
 6527out:
 6528	rcu_read_unlock();
 6529}
 6530EXPORT_SYMBOL(napi_busy_loop);
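
/* Illustrative sketch (not part of the original file): a minimal loop_end
 * callback for napi_busy_loop(). busy_loop_timeout() is assumed to be the
 * helper from <net/busy_poll.h>; my_ctx and my_data_ready() are hypothetical.
 */
#if 0
static bool my_loop_end(void *arg, unsigned long start_time)
{
	struct my_ctx *ctx = arg;

	/* Stop busy polling once data has arrived or the sysctl-controlled
	 * busy-poll window has elapsed.
	 */
	return my_data_ready(ctx) || busy_loop_timeout(start_time);
}

static void my_busy_poll(struct my_ctx *ctx, unsigned int napi_id)
{
	napi_busy_loop(napi_id, my_loop_end, ctx);
}
#endif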
 6531
 6532#endif /* CONFIG_NET_RX_BUSY_POLL */
 6533
 6534static void napi_hash_add(struct napi_struct *napi)
 6535{
 6536	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
 6537	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
 6538		return;
 6539
 6540	spin_lock(&napi_hash_lock);
 6541
 6542	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6543	do {
 6544		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6545			napi_gen_id = MIN_NAPI_ID;
 6546	} while (napi_by_id(napi_gen_id));
 6547	napi->napi_id = napi_gen_id;
 6548
 6549	hlist_add_head_rcu(&napi->napi_hash_node,
 6550			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6551
 6552	spin_unlock(&napi_hash_lock);
 6553}
 6554
 6555/* Warning: the caller is responsible for making sure an RCU grace period
 6556 * has elapsed before freeing the memory containing @napi.
 6557 */
 6558bool napi_hash_del(struct napi_struct *napi)
 6559{
 6560	bool rcu_sync_needed = false;
 6561
 6562	spin_lock(&napi_hash_lock);
 6563
 6564	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
 6565		rcu_sync_needed = true;
 6566		hlist_del_rcu(&napi->napi_hash_node);
 6567	}
 6568	spin_unlock(&napi_hash_lock);
 6569	return rcu_sync_needed;
 6570}
 6571EXPORT_SYMBOL_GPL(napi_hash_del);
 6572
 6573static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6574{
 6575	struct napi_struct *napi;
 6576
 6577	napi = container_of(timer, struct napi_struct, timer);
 6578
 6579	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6580	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6581	 */
 6582	if (!napi_disable_pending(napi) &&
 6583	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 6584		__napi_schedule_irqoff(napi);
 6585
 6586	return HRTIMER_NORESTART;
 6587}
 6588
 6589static void init_gro_hash(struct napi_struct *napi)
 6590{
 6591	int i;
 6592
 6593	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6594		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6595		napi->gro_hash[i].count = 0;
 6596	}
 6597	napi->gro_bitmask = 0;
 6598}
 6599
 6600void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 6601		    int (*poll)(struct napi_struct *, int), int weight)
 6602{
 6603	INIT_LIST_HEAD(&napi->poll_list);
 6604	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6605	napi->timer.function = napi_watchdog;
 6606	init_gro_hash(napi);
 6607	napi->skb = NULL;
 6608	INIT_LIST_HEAD(&napi->rx_list);
 6609	napi->rx_count = 0;
 6610	napi->poll = poll;
 6611	if (weight > NAPI_POLL_WEIGHT)
 6612		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6613				weight);
 6614	napi->weight = weight;
 6615	napi->dev = dev;
 6616#ifdef CONFIG_NETPOLL
 6617	napi->poll_owner = -1;
 6618#endif
 6619	set_bit(NAPI_STATE_SCHED, &napi->state);
 6620	set_bit(NAPI_STATE_NPSVC, &napi->state);
 6621	list_add_rcu(&napi->dev_list, &dev->napi_list);
 6622	napi_hash_add(napi);
 6623}
 6624EXPORT_SYMBOL(netif_napi_add);
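
/* Illustrative sketch (not part of the original file): the usual driver-side
 * pairing for netif_napi_add(). my_priv, my_setup_napi() and my_poll() are
 * hypothetical; NAPI_POLL_WEIGHT is the customary weight.
 */
#if 0
static void my_setup_napi(struct net_device *netdev, struct my_priv *priv)
{
	/* Registered while the interface is still down (e.g. in probe). */
	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
}

static int my_open(struct net_device *netdev)
{
	struct my_priv *priv = netdev_priv(netdev);

	/* Enabled in ndo_open, before the device interrupt is enabled. */
	napi_enable(&priv->napi);
	return 0;
}
#endif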
 6625
 6626void napi_disable(struct napi_struct *n)
 6627{
 6628	might_sleep();
 6629	set_bit(NAPI_STATE_DISABLE, &n->state);
 6630
 6631	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
 6632		msleep(1);
 6633	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
 6634		msleep(1);
 6635
 6636	hrtimer_cancel(&n->timer);
 6637
 6638	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6639}
 6640EXPORT_SYMBOL(napi_disable);
 6641
 6642static void flush_gro_hash(struct napi_struct *napi)
 6643{
 6644	int i;
 6645
 6646	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6647		struct sk_buff *skb, *n;
 6648
 6649		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6650			kfree_skb(skb);
 6651		napi->gro_hash[i].count = 0;
 6652	}
 6653}
 6654
 6655/* Must be called in process context */
 6656void netif_napi_del(struct napi_struct *napi)
 6657{
 6658	might_sleep();
 6659	if (napi_hash_del(napi))
 6660		synchronize_net();
 6661	list_del_init(&napi->dev_list);
 6662	napi_free_frags(napi);
 6663
 6664	flush_gro_hash(napi);
 6665	napi->gro_bitmask = 0;
 6666}
 6667EXPORT_SYMBOL(netif_napi_del);
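
/* Illustrative sketch (not part of the original file): the matching teardown
 * order. napi_disable() may sleep, so both calls run in process context and
 * napi_disable() precedes netif_napi_del(); my_priv is hypothetical.
 */
#if 0
static int my_stop(struct net_device *netdev)
{
	struct my_priv *priv = netdev_priv(netdev);

	/* Quiesce polling before tearing down RX resources. */
	napi_disable(&priv->napi);
	return 0;
}

static void my_teardown(struct my_priv *priv)
{
	/* Pairs with netif_napi_add() in the setup path. */
	netif_napi_del(&priv->napi);
}
#endif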
 6668
 6669static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6670{
 6671	void *have;
 6672	int work, weight;
 6673
 6674	list_del_init(&n->poll_list);
 6675
 6676	have = netpoll_poll_lock(n);
 6677
 6678	weight = n->weight;
 6679
 6680	/* This NAPI_STATE_SCHED test is for avoiding a race
 6681	 * with netpoll's poll_napi().  Only the entity which
 6682	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6683	 * actually make the ->poll() call.  Therefore we avoid
 6684	 * accidentally calling ->poll() when NAPI is not scheduled.
 6685	 */
 6686	work = 0;
 6687	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 6688		work = n->poll(n, weight);
 6689		trace_napi_poll(n, work, weight);
 6690	}
 6691
 6692	if (unlikely(work > weight))
 6693		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
 6694			    n->poll, work, weight);
 6695
 6696	if (likely(work < weight))
 6697		goto out_unlock;
 6698
 6699	/* Drivers must not modify the NAPI state if they
 6700	 * consume the entire weight.  In such cases this code
 6701	 * still "owns" the NAPI instance and therefore can
 6702	 * move the instance around on the list at-will.
 6703	 */
 6704	if (unlikely(napi_disable_pending(n))) {
 6705		napi_complete(n);
 6706		goto out_unlock;
 6707	}
 6708
 6709	if (n->gro_bitmask) {
 6710		/* Flush packets that are too old.
 6711		 * If HZ < 1000, flush all packets.
 6712		 */
 6713		napi_gro_flush(n, HZ >= 1000);
 6714	}
 6715
 6716	gro_normal_list(n);
 6717
 6718	/* Some drivers may have called napi_schedule
 6719	 * prior to exhausting their budget.
 6720	 */
 6721	if (unlikely(!list_empty(&n->poll_list))) {
 6722		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6723			     n->dev ? n->dev->name : "backlog");
 6724		goto out_unlock;
 6725	}
 6726
 6727	list_add_tail(&n->poll_list, repoll);
 6728
 6729out_unlock:
 6730	netpoll_poll_unlock(have);
 6731
 6732	return work;
 6733}
 6734
 6735static __latent_entropy void net_rx_action(struct softirq_action *h)
 6736{
 6737	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 6738	unsigned long time_limit = jiffies +
 6739		usecs_to_jiffies(netdev_budget_usecs);
 6740	int budget = netdev_budget;
 6741	LIST_HEAD(list);
 6742	LIST_HEAD(repoll);
 6743
 6744	local_irq_disable();
 6745	list_splice_init(&sd->poll_list, &list);
 6746	local_irq_enable();
 6747
 6748	for (;;) {
 6749		struct napi_struct *n;
 6750
 6751		if (list_empty(&list)) {
 6752			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 6753				goto out;
 6754			break;
 6755		}
 6756
 6757		n = list_first_entry(&list, struct napi_struct, poll_list);
 6758		budget -= napi_poll(n, &repoll);
 6759
 6760		/* If the softirq window is exhausted then punt.
 6761		 * Allow this to run for 2 jiffies, which allows
 6762		 * an average latency of 1.5/HZ.
 6763		 */
 6764		if (unlikely(budget <= 0 ||
 6765			     time_after_eq(jiffies, time_limit))) {
 6766			sd->time_squeeze++;
 6767			break;
 6768		}
 6769	}
 6770
 6771	local_irq_disable();
 6772
 6773	list_splice_tail_init(&sd->poll_list, &list);
 6774	list_splice_tail(&repoll, &list);
 6775	list_splice(&list, &sd->poll_list);
 6776	if (!list_empty(&sd->poll_list))
 6777		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 6778
 6779	net_rps_action_and_irq_enable(sd);
 6780out:
 6781	__kfree_skb_flush();
 6782}
 6783
 6784struct netdev_adjacent {
 6785	struct net_device *dev;
 6786
 6787	/* upper master flag, there can only be one master device per list */
 6788	bool master;
 6789
 6790	/* lookup ignore flag */
 6791	bool ignore;
 6792
 6793	/* counter for the number of times this device was added to us */
 6794	u16 ref_nr;
 6795
 6796	/* private field for the users */
 6797	void *private;
 6798
 6799	struct list_head list;
 6800	struct rcu_head rcu;
 6801};
 6802
 6803static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6804						 struct list_head *adj_list)
 6805{
 6806	struct netdev_adjacent *adj;
 6807
 6808	list_for_each_entry(adj, adj_list, list) {
 6809		if (adj->dev == adj_dev)
 6810			return adj;
 6811	}
 6812	return NULL;
 6813}
 6814
 6815static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 6816				    struct netdev_nested_priv *priv)
 6817{
 6818	struct net_device *dev = (struct net_device *)priv->data;
 6819
 6820	return upper_dev == dev;
 6821}
 6822
 6823/**
 6824 * netdev_has_upper_dev - Check if device is linked to an upper device
 6825 * @dev: device
 6826 * @upper_dev: upper device to check
 6827 *
 6828 * Find out if a device is linked to the specified upper device and return
 6829 * true in case it is. Note that this checks only the immediate upper device,
 6830 * not the complete stack of devices. The caller must hold the RTNL lock.
 6831 */
 6832bool netdev_has_upper_dev(struct net_device *dev,
 6833			  struct net_device *upper_dev)
 6834{
 6835	struct netdev_nested_priv priv = {
 6836		.data = (void *)upper_dev,
 6837	};
 6838
 6839	ASSERT_RTNL();
 6840
 6841	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6842					     &priv);
 6843}
 6844EXPORT_SYMBOL(netdev_has_upper_dev);
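
/* Illustrative sketch (not part of the original file): checking, under RTNL,
 * whether a port is already stacked under a particular master. port_dev and
 * bond_dev are hypothetical.
 */
#if 0
static void my_check_stacking(struct net_device *port_dev,
			      struct net_device *bond_dev)
{
	ASSERT_RTNL();
	if (netdev_has_upper_dev(port_dev, bond_dev))
		netdev_info(port_dev, "already linked under %s\n",
			    bond_dev->name);
}
#endif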
 6845
 6846/**
 6847 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 6848 * @dev: device
 6849 * @upper_dev: upper device to check
 6850 *
 6851 * Find out if a device is linked to the specified upper device and return
 6852 * true in case it is. Note that this checks the entire upper device chain.
 6853 * The caller must hold the RCU read lock.
 6854 */
 6855
 6856bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6857				  struct net_device *upper_dev)
 6858{
 6859	struct netdev_nested_priv priv = {
 6860		.data = (void *)upper_dev,
 6861	};
 6862
 6863	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6864					       &priv);
 6865}
 6866EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6867
 6868/**
 6869 * netdev_has_any_upper_dev - Check if device is linked to some device
 6870 * @dev: device
 6871 *
 6872 * Find out if a device is linked to an upper device and return true in case
 6873 * it is. The caller must hold the RTNL lock.
 6874 */
 6875bool netdev_has_any_upper_dev(struct net_device *dev)
 6876{
 6877	ASSERT_RTNL();
 6878
 6879	return !list_empty(&dev->adj_list.upper);
 6880}
 6881EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6882
 6883/**
 6884 * netdev_master_upper_dev_get - Get master upper device
 6885 * @dev: device
 6886 *
 6887 * Find a master upper device and return pointer to it or NULL in case
 6888 * it's not there. The caller must hold the RTNL lock.
 6889 */
 6890struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6891{
 6892	struct netdev_adjacent *upper;
 6893
 6894	ASSERT_RTNL();
 6895
 6896	if (list_empty(&dev->adj_list.upper))
 6897		return NULL;
 6898
 6899	upper = list_first_entry(&dev->adj_list.upper,
 6900				 struct netdev_adjacent, list);
 6901	if (likely(upper->master))
 6902		return upper->dev;
 6903	return NULL;
 6904}
 6905EXPORT_SYMBOL(netdev_master_upper_dev_get);
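
/* Illustrative sketch (not part of the original file): fetching the master
 * (e.g. a bond or bridge) of a port under RTNL. port_dev is hypothetical.
 */
#if 0
static void my_report_master(struct net_device *port_dev)
{
	struct net_device *master;

	ASSERT_RTNL();
	master = netdev_master_upper_dev_get(port_dev);
	if (master)
		netdev_dbg(port_dev, "enslaved to %s\n", master->name);
}
#endif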
 6906
 6907static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6908{
 6909	struct netdev_adjacent *upper;
 6910
 6911	ASSERT_RTNL();
 6912
 6913	if (list_empty(&dev->adj_list.upper))
 6914		return NULL;
 6915
 6916	upper = list_first_entry(&dev->adj_list.upper,
 6917				 struct netdev_adjacent, list);
 6918	if (likely(upper->master) && !upper->ignore)
 6919		return upper->dev;
 6920	return NULL;
 6921}
 6922
 6923/**
 6924 * netdev_has_any_lower_dev - Check if device is linked to some device
 6925 * @dev: device
 6926 *
 6927 * Find out if a device is linked to a lower device and return true in case
 6928 * it is. The caller must hold the RTNL lock.
 6929 */
 6930static bool netdev_has_any_lower_dev(struct net_device *dev)
 6931{
 6932	ASSERT_RTNL();
 6933
 6934	return !list_empty(&dev->adj_list.lower);
 6935}
 6936
 6937void *netdev_adjacent_get_private(struct list_head *adj_list)
 6938{
 6939	struct netdev_adjacent *adj;
 6940
 6941	adj = list_entry(adj_list, struct netdev_adjacent, list);
 6942
 6943	return adj->private;
 6944}
 6945EXPORT_SYMBOL(netdev_adjacent_get_private);
 6946
 6947/**
 6948 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 6949 * @dev: device
 6950 * @iter: list_head ** of the current position
 6951 *
 6952 * Gets the next device from the dev's upper list, starting from iter
 6953 * position. The caller must hold RCU read lock.
 6954 */
 6955struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 6956						 struct list_head **iter)
 6957{
 6958	struct netdev_adjacent *upper;
 6959
 6960	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6961
 6962	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6963
 6964	if (&upper->list == &dev->adj_list.upper)
 6965		return NULL;
 6966
 6967	*iter = &upper->list;
 6968
 6969	return upper->dev;
 6970}
 6971EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
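
/* Illustrative sketch (not part of the original file): walking the immediate
 * upper devices with the iterator above. In-tree callers usually go through
 * the netdev_for_each_upper_dev_rcu() wrapper macro; dev is hypothetical.
 */
#if 0
static void my_dump_uppers(struct net_device *dev)
{
	struct net_device *upper;
	struct list_head *iter;

	rcu_read_lock();
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		pr_debug("%s has upper %s\n", dev->name, upper->name);
	rcu_read_unlock();
}
#endif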
 6972
 6973static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 6974						  struct list_head **iter,
 6975						  bool *ignore)
 6976{
 6977	struct netdev_adjacent *upper;
 6978
 6979	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 6980
 6981	if (&upper->list == &dev->adj_list.upper)
 6982		return NULL;
 6983
 6984	*iter = &upper->list;
 6985	*ignore = upper->ignore;
 6986
 6987	return upper->dev;
 6988}
 6989
 6990static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 6991						    struct list_head **iter)
 6992{
 6993	struct netdev_adjacent *upper;
 6994
 6995	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6996
 6997	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6998
 6999	if (&upper->list == &dev->adj_list.upper)
 7000		return NULL;
 7001
 7002	*iter = &upper->list;
 7003
 7004	return upper->dev;
 7005}
 7006
 7007static int __netdev_walk_all_upper_dev(struct net_device *dev,
 7008				       int (*fn)(struct net_device *dev,
 7009					 struct netdev_nested_priv *priv),
 7010				       struct netdev_nested_priv *priv)
 7011{
 7012	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7013	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7014	int ret, cur = 0;
 7015	bool ignore;
 7016
 7017	now = dev;
 7018	iter = &dev->adj_list.upper;
 7019
 7020	while (1) {
 7021		if (now != dev) {
 7022			ret = fn(now, priv);
 7023			if (ret)
 7024				return ret;
 7025		}
 7026
 7027		next = NULL;
 7028		while (1) {
 7029			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7030			if (!udev)
 7031				break;
 7032			if (ignore)
 7033				continue;
 7034
 7035			next = udev;
 7036			niter = &udev->adj_list.upper;
 7037			dev_stack[cur] = now;
 7038			iter_stack[cur++] = iter;
 7039			break;
 7040		}
 7041
 7042		if (!next) {
 7043			if (!cur)
 7044				return 0;
 7045			next = dev_stack[--cur];
 7046			niter = iter_stack[cur];
 7047		}
 7048
 7049		now = next;
 7050		iter = niter;
 7051	}
 7052
 7053	return 0;
 7054}
 7055
 7056int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7057				  int (*fn)(struct net_device *dev,
 7058					    struct netdev_nested_priv *priv),
 7059				  struct netdev_nested_priv *priv)
 7060{
 7061	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7062	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7063	int ret, cur = 0;
 7064
 7065	now = dev;
 7066	iter = &dev->adj_list.upper;
 7067
 7068	while (1) {
 7069		if (now != dev) {
 7070			ret = fn(now, priv);
 7071			if (ret)
 7072				return ret;
 7073		}
 7074
 7075		next = NULL;
 7076		while (1) {
 7077			udev = netdev_next_upper_dev_rcu(now, &iter);
 7078			if (!udev)
 7079				break;
 7080
 7081			next = udev;
 7082			niter = &udev->adj_list.upper;
 7083			dev_stack[cur] = now;
 7084			iter_stack[cur++] = iter;
 7085			break;
 7086		}
 7087
 7088		if (!next) {
 7089			if (!cur)
 7090				return 0;
 7091			next = dev_stack[--cur];
 7092			niter = iter_stack[cur];
 7093		}
 7094
 7095		now = next;
 7096		iter = niter;
 7097	}
 7098
 7099	return 0;
 7100}
 7101EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
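
/* Illustrative sketch (not part of the original file): a callback for
 * netdev_walk_all_upper_dev_rcu() that counts every device stacked above a
 * given netdev. count_upper() and my_count_uppers() are hypothetical.
 */
#if 0
static int count_upper(struct net_device *upper,
		       struct netdev_nested_priv *priv)
{
	(*(int *)priv->data)++;
	return 0;	/* keep walking; a non-zero value stops the walk */
}

static int my_count_uppers(struct net_device *dev)
{
	struct netdev_nested_priv priv;
	int n = 0;

	priv.flags = 0;
	priv.data = &n;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, count_upper, &priv);
	rcu_read_unlock();

	return n;
}
#endif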
 7102
 7103static bool __netdev_has_upper_dev(struct net_device *dev,
 7104				   struct net_device *upper_dev)
 7105{
 7106	struct netdev_nested_priv priv = {
 7107		.flags = 0,
 7108		.data = (void *)upper_dev,
 7109	};
 7110
 7111	ASSERT_RTNL();
 7112
 7113	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7114					   &priv);
 7115}
 7116
 7117/**
 7118 * netdev_lower_get_next_private - Get the next ->private from the
 7119 *				   lower neighbour list
 7120 * @dev: device
 7121 * @iter: list_head ** of the current position
 7122 *
 7123 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7124 * list, starting from iter position. The caller must either hold the
 7125 * RTNL lock or its own locking that guarantees that the neighbour lower
 7126 * list will remain unchanged.
 7127 */
 7128void *netdev_lower_get_next_private(struct net_device *dev,
 7129				    struct list_head **iter)
 7130{
 7131	struct netdev_adjacent *lower;
 7132
 7133	lower = list_entry(*iter, struct netdev_adjacent, list);
 7134
 7135	if (&lower->list == &dev->adj_list.lower)
 7136		return NULL;
 7137
 7138	*iter = lower->list.next;
 7139
 7140	return lower->private;
 7141}
 7142EXPORT_SYMBOL(netdev_lower_get_next_private);
 7143
 7144/**
 7145 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7146 *				       lower neighbour list, RCU
 7147 *				       variant
 7148 * @dev: device
 7149 * @iter: list_head ** of the current position
 7150 *
 7151 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7152 * list, starting from iter position. The caller must hold RCU read lock.
 7153 */
 7154void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7155					struct list_head **iter)
 7156{
 7157	struct netdev_adjacent *lower;
 7158
 7159	WARN_ON_ONCE(!rcu_read_lock_held());
 7160
 7161	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7162
 7163	if (&lower->list == &dev->adj_list.lower)
 7164		return NULL;
 7165
 7166	*iter = &lower->list;
 7167
 7168	return lower->private;
 7169}
 7170EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7171
 7172/**
 7173 * netdev_lower_get_next - Get the next device from the lower neighbour
 7174 *                         list
 7175 * @dev: device
 7176 * @iter: list_head ** of the current position
 7177 *
 7178 * Gets the next netdev_adjacent from the dev's lower neighbour
 7179 * list, starting from iter position. The caller must hold the RTNL lock or
 7180 * its own locking that guarantees that the neighbour lower
 7181 * list will remain unchanged.
 7182 */
 7183void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7184{
 7185	struct netdev_adjacent *lower;
 7186
 7187	lower = list_entry(*iter, struct netdev_adjacent, list);
 7188
 7189	if (&lower->list == &dev->adj_list.lower)
 7190		return NULL;
 7191
 7192	*iter = lower->list.next;
 7193
 7194	return lower->dev;
 7195}
 7196EXPORT_SYMBOL(netdev_lower_get_next);
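
/* Illustrative sketch (not part of the original file): iterating the direct
 * lower devices under RTNL. In-tree users typically rely on the
 * netdev_for_each_lower_dev() wrapper macro; dev is hypothetical.
 */
#if 0
static void my_dump_lowers(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;

	ASSERT_RTNL();
	netdev_for_each_lower_dev(dev, lower, iter)
		pr_debug("%s has lower %s\n", dev->name, lower->name);
}
#endif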
 7197
 7198static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7199						struct list_head **iter)
 7200{
 7201	struct netdev_adjacent *lower;
 7202
 7203	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7204
 7205	if (&lower->list == &dev->adj_list.lower)
 7206		return NULL;
 7207
 7208	*iter = &lower->list;
 7209
 7210	return lower->dev;
 7211}
 7212
 7213static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7214						  struct list_head **iter,
 7215						  bool *ignore)
 7216{
 7217	struct netdev_adjacent *lower;
 7218
 7219	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7220
 7221	if (&lower->list == &dev->adj_list.lower)
 7222		return NULL;
 7223
 7224	*iter = &lower->list;
 7225	*ignore = lower->ignore;
 7226
 7227	return lower->dev;
 7228}
 7229
 7230int netdev_walk_all_lower_dev(struct net_device *dev,
 7231			      int (*fn)(struct net_device *dev,
 7232					struct netdev_nested_priv *priv),
 7233			      struct netdev_nested_priv *priv)
 7234{
 7235	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7236	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7237	int ret, cur = 0;
 7238
 7239	now = dev;
 7240	iter = &dev->adj_list.lower;
 7241
 7242	while (1) {
 7243		if (now != dev) {
 7244			ret = fn(now, priv);
 7245			if (ret)
 7246				return ret;
 7247		}
 7248
 7249		next = NULL;
 7250		while (1) {
 7251			ldev = netdev_next_lower_dev(now, &iter);
 7252			if (!ldev)
 7253				break;
 7254
 7255			next = ldev;
 7256			niter = &ldev->adj_list.lower;
 7257			dev_stack[cur] = now;
 7258			iter_stack[cur++] = iter;
 7259			break;
 7260		}
 7261
 7262		if (!next) {
 7263			if (!cur)
 7264				return 0;
 7265			next = dev_stack[--cur];
 7266			niter = iter_stack[cur];
 7267		}
 7268
 7269		now = next;
 7270		iter = niter;
 7271	}
 7272
 7273	return 0;
 7274}
 7275EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7276
 7277static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7278				       int (*fn)(struct net_device *dev,
 7279					 struct netdev_nested_priv *priv),
 7280				       struct netdev_nested_priv *priv)
 7281{
 7282	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7283	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7284	int ret, cur = 0;
 7285	bool ignore;
 7286
 7287	now = dev;
 7288	iter = &dev->adj_list.lower;
 7289
 7290	while (1) {
 7291		if (now != dev) {
 7292			ret = fn(now, priv);
 7293			if (ret)
 7294				return ret;
 7295		}
 7296
 7297		next = NULL;
 7298		while (1) {
 7299			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7300			if (!ldev)
 7301				break;
 7302			if (ignore)
 7303				continue;
 7304
 7305			next = ldev;
 7306			niter = &ldev->adj_list.lower;
 7307			dev_stack[cur] = now;
 7308			iter_stack[cur++] = iter;
 7309			break;
 7310		}
 7311
 7312		if (!next) {
 7313			if (!cur)
 7314				return 0;
 7315			next = dev_stack[--cur];
 7316			niter = iter_stack[cur];
 7317		}
 7318
 7319		now = next;
 7320		iter = niter;
 7321	}
 7322
 7323	return 0;
 7324}
 7325
 7326struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7327					     struct list_head **iter)
 7328{
 7329	struct netdev_adjacent *lower;
 7330
 7331	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7332	if (&lower->list == &dev->adj_list.lower)
 7333		return NULL;
 7334
 7335	*iter = &lower->list;
 7336
 7337	return lower->dev;
 7338}
 7339EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7340
 7341static u8 __netdev_upper_depth(struct net_device *dev)
 7342{
 7343	struct net_device *udev;
 7344	struct list_head *iter;
 7345	u8 max_depth = 0;
 7346	bool ignore;
 7347
 7348	for (iter = &dev->adj_list.upper,
 7349	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7350	     udev;
 7351	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7352		if (ignore)
 7353			continue;
 7354		if (max_depth < udev->upper_level)
 7355			max_depth = udev->upper_level;
 7356	}
 7357
 7358	return max_depth;
 7359}
 7360
 7361static u8 __netdev_lower_depth(struct net_device *dev)
 7362{
 7363	struct net_device *ldev;
 7364	struct list_head *iter;
 7365	u8 max_depth = 0;
 7366	bool ignore;
 7367
 7368	for (iter = &dev->adj_list.lower,
 7369	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7370	     ldev;
 7371	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7372		if (ignore)
 7373			continue;
 7374		if (max_depth < ldev->lower_level)
 7375			max_depth = ldev->lower_level;
 7376	}
 7377
 7378	return max_depth;
 7379}
 7380
 7381static int __netdev_update_upper_level(struct net_device *dev,
 7382				       struct netdev_nested_priv *__unused)
 7383{
 7384	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7385	return 0;
 7386}
 7387
 7388static int __netdev_update_lower_level(struct net_device *dev,
 7389				       struct netdev_nested_priv *priv)
 7390{
 7391	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7392
 7393#ifdef CONFIG_LOCKDEP
 7394	if (!priv)
 7395		return 0;
 7396
 7397	if (priv->flags & NESTED_SYNC_IMM)
 7398		dev->nested_level = dev->lower_level - 1;
 7399	if (priv->flags & NESTED_SYNC_TODO)
 7400		net_unlink_todo(dev);
 7401#endif
 7402	return 0;
 7403}
 7404
 7405int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7406				  int (*fn)(struct net_device *dev,
 7407					    struct netdev_nested_priv *priv),
 7408				  struct netdev_nested_priv *priv)
 7409{
 7410	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7411	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7412	int ret, cur = 0;
 7413
 7414	now = dev;
 7415	iter = &dev->adj_list.lower;
 7416
 7417	while (1) {
 7418		if (now != dev) {
 7419			ret = fn(now, priv);
 7420			if (ret)
 7421				return ret;
 7422		}
 7423
 7424		next = NULL;
 7425		while (1) {
 7426			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7427			if (!ldev)
 7428				break;
 7429
 7430			next = ldev;
 7431			niter = &ldev->adj_list.lower;
 7432			dev_stack[cur] = now;
 7433			iter_stack[cur++] = iter;
 7434			break;
 7435		}
 7436
 7437		if (!next) {
 7438			if (!cur)
 7439				return 0;
 7440			next = dev_stack[--cur];
 7441			niter = iter_stack[cur];
 7442		}
 7443
 7444		now = next;
 7445		iter = niter;
 7446	}
 7447
 7448	return 0;
 7449}
 7450EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7451
 7452/**
 7453 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7454 *				       lower neighbour list, RCU
 7455 *				       variant
 7456 * @dev: device
 7457 *
 7458 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7459 * list. The caller must hold RCU read lock.
 7460 */
 7461void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7462{
 7463	struct netdev_adjacent *lower;
 7464
 7465	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7466			struct netdev_adjacent, list);
 7467	if (lower)
 7468		return lower->private;
 7469	return NULL;
 7470}
 7471EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7472
 7473/**
 7474 * netdev_master_upper_dev_get_rcu - Get master upper device
 7475 * @dev: device
 7476 *
 7477 * Find a master upper device and return pointer to it or NULL in case
 7478 * it's not there. The caller must hold the RCU read lock.
 7479 */
 7480struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7481{
 7482	struct netdev_adjacent *upper;
 7483
 7484	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7485				       struct netdev_adjacent, list);
 7486	if (upper && likely(upper->master))
 7487		return upper->dev;
 7488	return NULL;
 7489}
 7490EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7491
 7492static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7493			      struct net_device *adj_dev,
 7494			      struct list_head *dev_list)
 7495{
 7496	char linkname[IFNAMSIZ+7];
 7497
 7498	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7499		"upper_%s" : "lower_%s", adj_dev->name);
 7500	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7501				 linkname);
 7502}
 7503static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7504			       char *name,
 7505			       struct list_head *dev_list)
 7506{
 7507	char linkname[IFNAMSIZ+7];
 7508
 7509	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7510		"upper_%s" : "lower_%s", name);
 7511	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7512}
 7513
 7514static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7515						 struct net_device *adj_dev,
 7516						 struct list_head *dev_list)
 7517{
 7518	return (dev_list == &dev->adj_list.upper ||
 7519		dev_list == &dev->adj_list.lower) &&
 7520		net_eq(dev_net(dev), dev_net(adj_dev));
 7521}
 7522
 7523static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7524					struct net_device *adj_dev,
 7525					struct list_head *dev_list,
 7526					void *private, bool master)
 7527{
 7528	struct netdev_adjacent *adj;
 7529	int ret;
 7530
 7531	adj = __netdev_find_adj(adj_dev, dev_list);
 7532
 7533	if (adj) {
 7534		adj->ref_nr += 1;
 7535		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7536			 dev->name, adj_dev->name, adj->ref_nr);
 7537
 7538		return 0;
 7539	}
 7540
 7541	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7542	if (!adj)
 7543		return -ENOMEM;
 7544
 7545	adj->dev = adj_dev;
 7546	adj->master = master;
 7547	adj->ref_nr = 1;
 7548	adj->private = private;
 7549	adj->ignore = false;
 7550	dev_hold(adj_dev);
 7551
 7552	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7553		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7554
 7555	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7556		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7557		if (ret)
 7558			goto free_adj;
 7559	}
 7560
 7561	/* Ensure that the master link is always the first item in the list. */
 7562	if (master) {
 7563		ret = sysfs_create_link(&(dev->dev.kobj),
 7564					&(adj_dev->dev.kobj), "master");
 7565		if (ret)
 7566			goto remove_symlinks;
 7567
 7568		list_add_rcu(&adj->list, dev_list);
 7569	} else {
 7570		list_add_tail_rcu(&adj->list, dev_list);
 7571	}
 7572
 7573	return 0;
 7574
 7575remove_symlinks:
 7576	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7577		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7578free_adj:
 7579	kfree(adj);
 7580	dev_put(adj_dev);
 7581
 7582	return ret;
 7583}
 7584
 7585static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7586					 struct net_device *adj_dev,
 7587					 u16 ref_nr,
 7588					 struct list_head *dev_list)
 7589{
 7590	struct netdev_adjacent *adj;
 7591
 7592	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7593		 dev->name, adj_dev->name, ref_nr);
 7594
 7595	adj = __netdev_find_adj(adj_dev, dev_list);
 7596
 7597	if (!adj) {
 7598		pr_err("Adjacency does not exist for device %s from %s\n",
 7599		       dev->name, adj_dev->name);
 7600		WARN_ON(1);
 7601		return;
 7602	}
 7603
 7604	if (adj->ref_nr > ref_nr) {
 7605		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7606			 dev->name, adj_dev->name, ref_nr,
 7607			 adj->ref_nr - ref_nr);
 7608		adj->ref_nr -= ref_nr;
 7609		return;
 7610	}
 7611
 7612	if (adj->master)
 7613		sysfs_remove_link(&(dev->dev.kobj), "master");
 7614
 7615	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7616		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7617
 7618	list_del_rcu(&adj->list);
 7619	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7620		 adj_dev->name, dev->name, adj_dev->name);
 7621	dev_put(adj_dev);
 7622	kfree_rcu(adj, rcu);
 7623}
 7624
 7625static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7626					    struct net_device *upper_dev,
 7627					    struct list_head *up_list,
 7628					    struct list_head *down_list,
 7629					    void *private, bool master)
 7630{
 7631	int ret;
 7632
 7633	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7634					   private, master);
 7635	if (ret)
 7636		return ret;
 7637
 7638	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7639					   private, false);
 7640	if (ret) {
 7641		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7642		return ret;
 7643	}
 7644
 7645	return 0;
 7646}
 7647
 7648static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7649					       struct net_device *upper_dev,
 7650					       u16 ref_nr,
 7651					       struct list_head *up_list,
 7652					       struct list_head *down_list)
 7653{
 7654	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7655	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7656}
 7657
 7658static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7659						struct net_device *upper_dev,
 7660						void *private, bool master)
 7661{
 7662	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7663						&dev->adj_list.upper,
 7664						&upper_dev->adj_list.lower,
 7665						private, master);
 7666}
 7667
 7668static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7669						   struct net_device *upper_dev)
 7670{
 7671	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7672					   &dev->adj_list.upper,
 7673					   &upper_dev->adj_list.lower);
 7674}
 7675
 7676static int __netdev_upper_dev_link(struct net_device *dev,
 7677				   struct net_device *upper_dev, bool master,
 7678				   void *upper_priv, void *upper_info,
 7679				   struct netdev_nested_priv *priv,
 7680				   struct netlink_ext_ack *extack)
 7681{
 7682	struct netdev_notifier_changeupper_info changeupper_info = {
 7683		.info = {
 7684			.dev = dev,
 7685			.extack = extack,
 7686		},
 7687		.upper_dev = upper_dev,
 7688		.master = master,
 7689		.linking = true,
 7690		.upper_info = upper_info,
 7691	};
 7692	struct net_device *master_dev;
 7693	int ret = 0;
 7694
 7695	ASSERT_RTNL();
 7696
 7697	if (dev == upper_dev)
 7698		return -EBUSY;
 7699
 7700	/* To prevent loops, check that dev is not an upper device of upper_dev. */
 7701	if (__netdev_has_upper_dev(upper_dev, dev))
 7702		return -EBUSY;
 7703
 7704	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7705		return -EMLINK;
 7706
 7707	if (!master) {
 7708		if (__netdev_has_upper_dev(dev, upper_dev))
 7709			return -EEXIST;
 7710	} else {
 7711		master_dev = __netdev_master_upper_dev_get(dev);
 7712		if (master_dev)
 7713			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7714	}
 7715
 7716	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7717					    &changeupper_info.info);
 7718	ret = notifier_to_errno(ret);
 7719	if (ret)
 7720		return ret;
 7721
 7722	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7723						   master);
 7724	if (ret)
 7725		return ret;
 7726
 7727	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7728					    &changeupper_info.info);
 7729	ret = notifier_to_errno(ret);
 7730	if (ret)
 7731		goto rollback;
 7732
 7733	__netdev_update_upper_level(dev, NULL);
 7734	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7735
 7736	__netdev_update_lower_level(upper_dev, priv);
 7737	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7738				    priv);
 7739
 7740	return 0;
 7741
 7742rollback:
 7743	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7744
 7745	return ret;
 7746}
 7747
 7748/**
 7749 * netdev_upper_dev_link - Add a link to the upper device
 7750 * @dev: device
 7751 * @upper_dev: new upper device
 7752 * @extack: netlink extended ack
 7753 *
 7754 * Adds a link to a device which is upper to this one. The caller must hold
 7755 * the RTNL lock. On a failure a negative errno code is returned.
 7756 * On success the reference counts are adjusted and the function
 7757 * returns zero.
 7758 */
 7759int netdev_upper_dev_link(struct net_device *dev,
 7760			  struct net_device *upper_dev,
 7761			  struct netlink_ext_ack *extack)
 7762{
 7763	struct netdev_nested_priv priv = {
 7764		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7765		.data = NULL,
 7766	};
 7767
 7768	return __netdev_upper_dev_link(dev, upper_dev, false,
 7769				       NULL, NULL, &priv, extack);
 7770}
 7771EXPORT_SYMBOL(netdev_upper_dev_link);
 7772
 7773/**
 7774 * netdev_master_upper_dev_link - Add a master link to the upper device
 7775 * @dev: device
 7776 * @upper_dev: new upper device
 7777 * @upper_priv: upper device private
 7778 * @upper_info: upper info to be passed down via notifier
 7779 * @extack: netlink extended ack
 7780 *
 7781 * Adds a link to a device which is upper to this one. In this case, only
 7782 * one master upper device can be linked, although other non-master devices
 7783 * might be linked as well. The caller must hold the RTNL lock.
 7784 * On a failure a negative errno code is returned. On success the reference
 7785 * counts are adjusted and the function returns zero.
 7786 */
 7787int netdev_master_upper_dev_link(struct net_device *dev,
 7788				 struct net_device *upper_dev,
 7789				 void *upper_priv, void *upper_info,
 7790				 struct netlink_ext_ack *extack)
 7791{
 7792	struct netdev_nested_priv priv = {
 7793		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7794		.data = NULL,
 7795	};
 7796
 7797	return __netdev_upper_dev_link(dev, upper_dev, true,
 7798				       upper_priv, upper_info, &priv, extack);
 7799}
 7800EXPORT_SYMBOL(netdev_master_upper_dev_link);
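
/* Illustrative sketch (not part of the original file): how a bonding/team
 * style master typically links a port, with the matching unlink in the
 * teardown path. master_dev, port_dev, my_enslave() and my_release() are
 * hypothetical.
 */
#if 0
static int my_enslave(struct net_device *master_dev,
		      struct net_device *port_dev,
		      struct netlink_ext_ack *extack)
{
	int err;

	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(port_dev, master_dev,
					   NULL, NULL, extack);
	if (err)
		return err;
	/* ... driver-specific port setup ... */
	return 0;
}

static void my_release(struct net_device *master_dev,
		       struct net_device *port_dev)
{
	netdev_upper_dev_unlink(port_dev, master_dev);
}
#endif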
 7801
 7802static void __netdev_upper_dev_unlink(struct net_device *dev,
 7803				      struct net_device *upper_dev,
 7804				      struct netdev_nested_priv *priv)
 7805{
 7806	struct netdev_notifier_changeupper_info changeupper_info = {
 7807		.info = {
 7808			.dev = dev,
 7809		},
 7810		.upper_dev = upper_dev,
 7811		.linking = false,
 7812	};
 7813
 7814	ASSERT_RTNL();
 7815
 7816	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7817
 7818	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7819				      &changeupper_info.info);
 7820
 7821	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7822
 7823	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7824				      &changeupper_info.info);
 7825
 7826	__netdev_update_upper_level(dev, NULL);
 7827	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7828
 7829	__netdev_update_lower_level(upper_dev, priv);
 7830	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7831				    priv);
 7832}
 7833
 7834/**
 7835 * netdev_upper_dev_unlink - Removes a link to upper device
 7836 * @dev: device
 7837 * @upper_dev: upper device to remove the link to
 7838 *
 7839 * Removes a link to a device which is upper to this one. The caller must hold
 7840 * the RTNL lock.
 7841 */
 7842void netdev_upper_dev_unlink(struct net_device *dev,
 7843			     struct net_device *upper_dev)
 7844{
 7845	struct netdev_nested_priv priv = {
 7846		.flags = NESTED_SYNC_TODO,
 7847		.data = NULL,
 7848	};
 7849
 7850	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 7851}
 7852EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7853
 7854static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7855				      struct net_device *lower_dev,
 7856				      bool val)
 7857{
 7858	struct netdev_adjacent *adj;
 7859
 7860	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7861	if (adj)
 7862		adj->ignore = val;
 7863
 7864	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7865	if (adj)
 7866		adj->ignore = val;
 7867}
 7868
 7869static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7870					struct net_device *lower_dev)
 7871{
 7872	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7873}
 7874
 7875static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7876				       struct net_device *lower_dev)
 7877{
 7878	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7879}
 7880
 7881int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7882				   struct net_device *new_dev,
 7883				   struct net_device *dev,
 7884				   struct netlink_ext_ack *extack)
 7885{
 7886	struct netdev_nested_priv priv = {
 7887		.flags = 0,
 7888		.data = NULL,
 7889	};
 7890	int err;
 7891
 7892	if (!new_dev)
 7893		return 0;
 7894
 7895	if (old_dev && new_dev != old_dev)
 7896		netdev_adjacent_dev_disable(dev, old_dev);
 7897	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 7898				      extack);
 7899	if (err) {
 7900		if (old_dev && new_dev != old_dev)
 7901			netdev_adjacent_dev_enable(dev, old_dev);
 7902		return err;
 7903	}
 7904
 7905	return 0;
 7906}
 7907EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7908
 7909void netdev_adjacent_change_commit(struct net_device *old_dev,
 7910				   struct net_device *new_dev,
 7911				   struct net_device *dev)
 7912{
 7913	struct netdev_nested_priv priv = {
 7914		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7915		.data = NULL,
 7916	};
 7917
 7918	if (!new_dev || !old_dev)
 7919		return;
 7920
 7921	if (new_dev == old_dev)
 7922		return;
 7923
 7924	netdev_adjacent_dev_enable(dev, old_dev);
 7925	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 7926}
 7927EXPORT_SYMBOL(netdev_adjacent_change_commit);
 7928
 7929void netdev_adjacent_change_abort(struct net_device *old_dev,
 7930				  struct net_device *new_dev,
 7931				  struct net_device *dev)
 7932{
 7933	struct netdev_nested_priv priv = {
 7934		.flags = 0,
 7935		.data = NULL,
 7936	};
 7937
 7938	if (!new_dev)
 7939		return;
 7940
 7941	if (old_dev && new_dev != old_dev)
 7942		netdev_adjacent_dev_enable(dev, old_dev);
 7943
 7944	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 7945}
 7946EXPORT_SYMBOL(netdev_adjacent_change_abort);
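
/* Illustrative sketch (not part of the original file): the intended
 * prepare/commit/abort sequence when swapping the linked device under @dev.
 * old_dev, new_dev, extack and my_do_switch() are hypothetical; RTNL is
 * assumed to be held.
 */
#if 0
static int my_swap_active_port(struct net_device *dev,
			       struct net_device *old_dev,
			       struct net_device *new_dev,
			       struct netlink_ext_ack *extack)
{
	int err;

	ASSERT_RTNL();
	err = netdev_adjacent_change_prepare(old_dev, new_dev, dev, extack);
	if (err)
		return err;

	err = my_do_switch(dev, new_dev);	/* hypothetical */
	if (err) {
		netdev_adjacent_change_abort(old_dev, new_dev, dev);
		return err;
	}

	netdev_adjacent_change_commit(old_dev, new_dev, dev);
	return 0;
}
#endif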
 7947
 7948/**
 7949 * netdev_bonding_info_change - Dispatch event about slave change
 7950 * @dev: device
 7951 * @bonding_info: info to dispatch
 7952 *
 7953 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 7954 * The caller must hold the RTNL lock.
 7955 */
 7956void netdev_bonding_info_change(struct net_device *dev,
 7957				struct netdev_bonding_info *bonding_info)
 7958{
 7959	struct netdev_notifier_bonding_info info = {
 7960		.info.dev = dev,
 7961	};
 7962
 7963	memcpy(&info.bonding_info, bonding_info,
 7964	       sizeof(struct netdev_bonding_info));
 7965	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 7966				      &info.info);
 7967}
 7968EXPORT_SYMBOL(netdev_bonding_info_change);
 7969
 7970/**
 7971 * netdev_get_xmit_slave - Get the xmit slave of master device
 7972 * @dev: device
 7973 * @skb: The packet
 7974 * @all_slaves: assume all the slaves are active
 7975 *
 7976 * The reference counters are not incremented so the caller must be
 7977 * careful with locks. The caller must hold the RCU read lock.
 7978 * %NULL is returned if no slave is found.
 7979 */
 7980
 7981struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 7982					 struct sk_buff *skb,
 7983					 bool all_slaves)
 7984{
 7985	const struct net_device_ops *ops = dev->netdev_ops;
 7986
 7987	if (!ops->ndo_get_xmit_slave)
 7988		return NULL;
 7989	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 7990}
 7991EXPORT_SYMBOL(netdev_get_xmit_slave);
 7992
 7993static void netdev_adjacent_add_links(struct net_device *dev)
 7994{
 7995	struct netdev_adjacent *iter;
 7996
 7997	struct net *net = dev_net(dev);
 7998
 7999	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8000		if (!net_eq(net, dev_net(iter->dev)))
 8001			continue;
 8002		netdev_adjacent_sysfs_add(iter->dev, dev,
 8003					  &iter->dev->adj_list.lower);
 8004		netdev_adjacent_sysfs_add(dev, iter->dev,
 8005					  &dev->adj_list.upper);
 8006	}
 8007
 8008	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8009		if (!net_eq(net, dev_net(iter->dev)))
 8010			continue;
 8011		netdev_adjacent_sysfs_add(iter->dev, dev,
 8012					  &iter->dev->adj_list.upper);
 8013		netdev_adjacent_sysfs_add(dev, iter->dev,
 8014					  &dev->adj_list.lower);
 8015	}
 8016}
 8017
 8018static void netdev_adjacent_del_links(struct net_device *dev)
 8019{
 8020	struct netdev_adjacent *iter;
 8021
 8022	struct net *net = dev_net(dev);
 8023
 8024	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8025		if (!net_eq(net, dev_net(iter->dev)))
 8026			continue;
 8027		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8028					  &iter->dev->adj_list.lower);
 8029		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8030					  &dev->adj_list.upper);
 8031	}
 8032
 8033	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8034		if (!net_eq(net, dev_net(iter->dev)))
 8035			continue;
 8036		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8037					  &iter->dev->adj_list.upper);
 8038		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8039					  &dev->adj_list.lower);
 8040	}
 8041}
 8042
 8043void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8044{
 8045	struct netdev_adjacent *iter;
 8046
 8047	struct net *net = dev_net(dev);
 8048
 8049	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8050		if (!net_eq(net, dev_net(iter->dev)))
 8051			continue;
 8052		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8053					  &iter->dev->adj_list.lower);
 8054		netdev_adjacent_sysfs_add(iter->dev, dev,
 8055					  &iter->dev->adj_list.lower);
 8056	}
 8057
 8058	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8059		if (!net_eq(net, dev_net(iter->dev)))
 8060			continue;
 8061		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8062					  &iter->dev->adj_list.upper);
 8063		netdev_adjacent_sysfs_add(iter->dev, dev,
 8064					  &iter->dev->adj_list.upper);
 8065	}
 8066}
 8067
 8068void *netdev_lower_dev_get_private(struct net_device *dev,
 8069				   struct net_device *lower_dev)
 8070{
 8071	struct netdev_adjacent *lower;
 8072
 8073	if (!lower_dev)
 8074		return NULL;
 8075	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8076	if (!lower)
 8077		return NULL;
 8078
 8079	return lower->private;
 8080}
 8081EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8082
 8083
 8084/**
 8085 * netdev_lower_change - Dispatch event about lower device state change
 8086 * @lower_dev: device
 8087 * @lower_state_info: state to dispatch
 8088 *
 8089 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8090 * The caller must hold the RTNL lock.
 8091 */
 8092void netdev_lower_state_changed(struct net_device *lower_dev,
 8093				void *lower_state_info)
 8094{
 8095	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8096		.info.dev = lower_dev,
 8097	};
 8098
 8099	ASSERT_RTNL();
 8100	changelowerstate_info.lower_state_info = lower_state_info;
 8101	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8102				      &changelowerstate_info.info);
 8103}
 8104EXPORT_SYMBOL(netdev_lower_state_changed);
 8105
 8106static void dev_change_rx_flags(struct net_device *dev, int flags)
 8107{
 8108	const struct net_device_ops *ops = dev->netdev_ops;
 8109
 8110	if (ops->ndo_change_rx_flags)
 8111		ops->ndo_change_rx_flags(dev, flags);
 8112}
 8113
 8114static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8115{
 8116	unsigned int old_flags = dev->flags;
 8117	kuid_t uid;
 8118	kgid_t gid;
 8119
 8120	ASSERT_RTNL();
 8121
 8122	dev->flags |= IFF_PROMISC;
 8123	dev->promiscuity += inc;
 8124	if (dev->promiscuity == 0) {
 8125		/*
 8126		 * Avoid overflow.
 8127		 * If inc causes overflow, untouch promisc and return error.
 8128		 */
 8129		if (inc < 0)
 8130			dev->flags &= ~IFF_PROMISC;
 8131		else {
 8132			dev->promiscuity -= inc;
 8133			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
 8134				dev->name);
 8135			return -EOVERFLOW;
 8136		}
 8137	}
 8138	if (dev->flags != old_flags) {
 8139		pr_info("device %s %s promiscuous mode\n",
 8140			dev->name,
 8141			dev->flags & IFF_PROMISC ? "entered" : "left");
 8142		if (audit_enabled) {
 8143			current_uid_gid(&uid, &gid);
 8144			audit_log(audit_context(), GFP_ATOMIC,
 8145				  AUDIT_ANOM_PROMISCUOUS,
 8146				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8147				  dev->name, (dev->flags & IFF_PROMISC),
 8148				  (old_flags & IFF_PROMISC),
 8149				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8150				  from_kuid(&init_user_ns, uid),
 8151				  from_kgid(&init_user_ns, gid),
 8152				  audit_get_sessionid(current));
 8153		}
 8154
 8155		dev_change_rx_flags(dev, IFF_PROMISC);
 8156	}
 8157	if (notify)
 8158		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
 8159	return 0;
 8160}
 8161
 8162/**
 8163 *	dev_set_promiscuity	- update promiscuity count on a device
 8164 *	@dev: device
 8165 *	@inc: modifier
 8166 *
 8167 *	Add or remove promiscuity from a device. While the count in the device
 8168 *	remains above zero the interface remains promiscuous. Once it hits zero
 8169 *	the device reverts back to normal filtering operation. A negative inc
 8170 *	value is used to drop promiscuity on the device.
 8171 *	Return 0 if successful or a negative errno code on error.
 8172 */
 8173int dev_set_promiscuity(struct net_device *dev, int inc)
 8174{
 8175	unsigned int old_flags = dev->flags;
 8176	int err;
 8177
 8178	err = __dev_set_promiscuity(dev, inc, true);
 8179	if (err < 0)
 8180		return err;
 8181	if (dev->flags != old_flags)
 8182		dev_set_rx_mode(dev);
 8183	return err;
 8184}
 8185EXPORT_SYMBOL(dev_set_promiscuity);
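
/* Illustrative sketch (not part of the original file): the counted nature of
 * promiscuity. Every +1 must eventually be matched by a -1; RTNL is assumed
 * to be held and my_start_sniffing()/my_stop_sniffing() are hypothetical.
 */
#if 0
static int my_start_sniffing(struct net_device *dev)
{
	/* Bumps the counter; the device stays promiscuous while it is > 0. */
	return dev_set_promiscuity(dev, 1);
}

static void my_stop_sniffing(struct net_device *dev)
{
	/* Drops the counter; normal filtering resumes once it hits zero. */
	dev_set_promiscuity(dev, -1);
}
#endif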
 8186
 8187static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 8188{
 8189	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8190
 8191	ASSERT_RTNL();
 8192
 8193	dev->flags |= IFF_ALLMULTI;
 8194	dev->allmulti += inc;
 8195	if (dev->allmulti == 0) {
 8196		/*
 8197		 * Avoid overflow.
 8198		 * If inc causes overflow, untouch allmulti and return error.
 8199		 */
 8200		if (inc < 0)
 8201			dev->flags &= ~IFF_ALLMULTI;
 8202		else {
 8203			dev->allmulti -= inc;
 8204			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
 8205				dev->name);
 8206			return -EOVERFLOW;
 8207		}
 8208	}
 8209	if (dev->flags ^ old_flags) {
 8210		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8211		dev_set_rx_mode(dev);
 8212		if (notify)
 8213			__dev_notify_flags(dev, old_flags,
 8214					   dev->gflags ^ old_gflags);
 8215	}
 8216	return 0;
 8217}
 8218
 8219/**
 8220 *	dev_set_allmulti	- update allmulti count on a device
 8221 *	@dev: device
 8222 *	@inc: modifier
 8223 *
 8224 *	Add or remove reception of all multicast frames to a device. While the
 8225 *	count in the device remains above zero the interface remains listening
 8226 *	to all multicast frames. Once it hits zero the device reverts back to
 8227 *	normal filtering operation. A negative @inc value is used to drop the counter
 8228 *	when releasing a resource needing all multicasts.
 8229 *	Return 0 if successful or a negative errno code on error.
 8230 */
 8231
 8232int dev_set_allmulti(struct net_device *dev, int inc)
 8233{
 8234	return __dev_set_allmulti(dev, inc, true);
 8235}
 8236EXPORT_SYMBOL(dev_set_allmulti);
 8237
 8238/*
 8239 *	Upload unicast and multicast address lists to device and
 8240 *	configure RX filtering. When the device doesn't support unicast
 8241 *	filtering it is put in promiscuous mode while unicast addresses
 8242 *	are present.
 8243 */
 8244void __dev_set_rx_mode(struct net_device *dev)
 8245{
 8246	const struct net_device_ops *ops = dev->netdev_ops;
 8247
 8248	/* dev_open will call this function so the list will stay sane. */
 8249	if (!(dev->flags&IFF_UP))
 8250		return;
 8251
 8252	if (!netif_device_present(dev))
 8253		return;
 8254
 8255	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8256		/* Unicast address changes may only happen under the rtnl,
 8257		 * therefore calling __dev_set_promiscuity here is safe.
 8258		 */
 8259		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8260			__dev_set_promiscuity(dev, 1, false);
 8261			dev->uc_promisc = true;
 8262		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8263			__dev_set_promiscuity(dev, -1, false);
 8264			dev->uc_promisc = false;
 8265		}
 8266	}
 8267
 8268	if (ops->ndo_set_rx_mode)
 8269		ops->ndo_set_rx_mode(dev);
 8270}
 8271
 8272void dev_set_rx_mode(struct net_device *dev)
 8273{
 8274	netif_addr_lock_bh(dev);
 8275	__dev_set_rx_mode(dev);
 8276	netif_addr_unlock_bh(dev);
 8277}
 8278
 8279/**
 8280 *	dev_get_flags - get flags reported to userspace
 8281 *	@dev: device
 8282 *
 8283 *	Get the combination of flag bits exported through APIs to userspace.
 8284 */
 8285unsigned int dev_get_flags(const struct net_device *dev)
 8286{
 8287	unsigned int flags;
 8288
 8289	flags = (dev->flags & ~(IFF_PROMISC |
 8290				IFF_ALLMULTI |
 8291				IFF_RUNNING |
 8292				IFF_LOWER_UP |
 8293				IFF_DORMANT)) |
 8294		(dev->gflags & (IFF_PROMISC |
 8295				IFF_ALLMULTI));
 8296
 8297	if (netif_running(dev)) {
 8298		if (netif_oper_up(dev))
 8299			flags |= IFF_RUNNING;
 8300		if (netif_carrier_ok(dev))
 8301			flags |= IFF_LOWER_UP;
 8302		if (netif_dormant(dev))
 8303			flags |= IFF_DORMANT;
 8304	}
 8305
 8306	return flags;
 8307}
 8308EXPORT_SYMBOL(dev_get_flags);
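
/* Illustrative sketch (not part of the original file): reading the
 * userspace-visible flag combination, e.g. to test operational state.
 * my_link_is_operational() is hypothetical.
 */
#if 0
static bool my_link_is_operational(const struct net_device *dev)
{
	unsigned int flags = dev_get_flags(dev);

	/* IFF_RUNNING is synthesized from the oper state by dev_get_flags(). */
	return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}
#endif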
 8309
 8310int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8311		       struct netlink_ext_ack *extack)
 8312{
 8313	unsigned int old_flags = dev->flags;
 8314	int ret;
 8315
 8316	ASSERT_RTNL();
 8317
 8318	/*
 8319	 *	Set the flags on our device.
 8320	 */
 8321
 8322	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8323			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8324			       IFF_AUTOMEDIA)) |
 8325		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8326				    IFF_ALLMULTI));
 8327
 8328	/*
 8329	 *	Load in the correct multicast list now the flags have changed.
 8330	 */
 8331
 8332	if ((old_flags ^ flags) & IFF_MULTICAST)
 8333		dev_change_rx_flags(dev, IFF_MULTICAST);
 8334
 8335	dev_set_rx_mode(dev);
 8336
 8337	/*
 8338	 *	Have we downed the interface? We handle IFF_UP ourselves
 8339	 *	according to user attempts to set it, rather than blindly
 8340	 *	setting it.
 8341	 */
 8342
 8343	ret = 0;
 8344	if ((old_flags ^ flags) & IFF_UP) {
 8345		if (old_flags & IFF_UP)
 8346			__dev_close(dev);
 8347		else
 8348			ret = __dev_open(dev, extack);
 8349	}
 8350
 8351	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 8352		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 8353		unsigned int old_flags = dev->flags;
 8354
 8355		dev->gflags ^= IFF_PROMISC;
 8356
 8357		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 8358			if (dev->flags != old_flags)
 8359				dev_set_rx_mode(dev);
 8360	}
 8361
 8362	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 8363	 * is important. Some (broken) drivers set IFF_PROMISC when
 8364	 * IFF_ALLMULTI is requested, without asking us and without reporting it.
 8365	 */
 8366	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 8367		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 8368
 8369		dev->gflags ^= IFF_ALLMULTI;
 8370		__dev_set_allmulti(dev, inc, false);
 8371	}
 8372
 8373	return ret;
 8374}
 8375
 8376void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8377			unsigned int gchanges)
 8378{
 8379	unsigned int changes = dev->flags ^ old_flags;
 8380
 8381	if (gchanges)
 8382		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
 8383
 8384	if (changes & IFF_UP) {
 8385		if (dev->flags & IFF_UP)
 8386			call_netdevice_notifiers(NETDEV_UP, dev);
 8387		else
 8388			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8389	}
 8390
 8391	if (dev->flags & IFF_UP &&
 8392	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8393		struct netdev_notifier_change_info change_info = {
 8394			.info = {
 8395				.dev = dev,
 8396			},
 8397			.flags_changed = changes,
 8398		};
 8399
 8400		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 8401	}
 8402}
 8403
 8404/**
 8405 *	dev_change_flags - change device settings
 8406 *	@dev: device
 8407 *	@flags: device state flags
 8408 *	@extack: netlink extended ack
 8409 *
 8410 *	Change settings on a device based on its state flags. The flags are
 8411 *	in the userspace-exported format.
 8412 */
 8413int dev_change_flags(struct net_device *dev, unsigned int flags,
 8414		     struct netlink_ext_ack *extack)
 8415{
 8416	int ret;
 8417	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8418
 8419	ret = __dev_change_flags(dev, flags, extack);
 8420	if (ret < 0)
 8421		return ret;
 8422
 8423	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8424	__dev_notify_flags(dev, old_flags, changes);
 8425	return ret;
 8426}
 8427EXPORT_SYMBOL(dev_change_flags);
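/* Hedged usage sketch (not from the original source): bringing an interface
 * up while preserving its other flags, under the rtnl lock that
 * __dev_change_flags() asserts; a NULL extack is assumed acceptable here,
 * callers with netlink context would pass theirs through instead:
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP, NULL);
 *	rtnl_unlock();
 */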
 8428
 8429int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8430{
 8431	const struct net_device_ops *ops = dev->netdev_ops;
 8432
 8433	if (ops->ndo_change_mtu)
 8434		return ops->ndo_change_mtu(dev, new_mtu);
 8435
 8436	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8437	WRITE_ONCE(dev->mtu, new_mtu);
 8438	return 0;
 8439}
 8440EXPORT_SYMBOL(__dev_set_mtu);
 8441
 8442int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8443		     struct netlink_ext_ack *extack)
 8444{
 8445	/* MTU must be positive, and in range */
 8446	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8447		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8448		return -EINVAL;
 8449	}
 8450
 8451	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8452		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8453		return -EINVAL;
 8454	}
 8455	return 0;
 8456}
 8457
 8458/**
 8459 *	dev_set_mtu_ext - Change maximum transfer unit
 8460 *	@dev: device
 8461 *	@new_mtu: new transfer unit
 8462 *	@extack: netlink extended ack
 8463 *
 8464 *	Change the maximum transfer size of the network device.
 8465 */
 8466int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8467		    struct netlink_ext_ack *extack)
 8468{
 8469	int err, orig_mtu;
 8470
 8471	if (new_mtu == dev->mtu)
 8472		return 0;
 8473
 8474	err = dev_validate_mtu(dev, new_mtu, extack);
 8475	if (err)
 8476		return err;
 8477
 8478	if (!netif_device_present(dev))
 8479		return -ENODEV;
 8480
 8481	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8482	err = notifier_to_errno(err);
 8483	if (err)
 8484		return err;
 8485
 8486	orig_mtu = dev->mtu;
 8487	err = __dev_set_mtu(dev, new_mtu);
 8488
 8489	if (!err) {
 8490		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8491						   orig_mtu);
 8492		err = notifier_to_errno(err);
 8493		if (err) {
 8494			/* setting mtu back and notifying everyone again,
 8495			 * so that they have a chance to revert changes.
 8496			 */
 8497			__dev_set_mtu(dev, orig_mtu);
 8498			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8499						     new_mtu);
 8500		}
 8501	}
 8502	return err;
 8503}
 8504
 8505int dev_set_mtu(struct net_device *dev, int new_mtu)
 8506{
 8507	struct netlink_ext_ack extack;
 8508	int err;
 8509
 8510	memset(&extack, 0, sizeof(extack));
 8511	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8512	if (err && extack._msg)
 8513		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8514	return err;
 8515}
 8516EXPORT_SYMBOL(dev_set_mtu);
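/* Illustrative example (assumes the caller holds rtnl, as the MTU notifiers
 * above expect): switching to a hypothetical jumbo MTU and relying on
 * dev_set_mtu() to ratelimit any extack message on failure:
 *
 *	err = dev_set_mtu(dev, 9000);
 *	if (err)
 *		netdev_dbg(dev, "MTU change failed: %d\n", err);
 */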
 8517
 8518/**
 8519 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8520 *	@dev: device
 8521 *	@new_len: new tx queue length
 8522 */
 8523int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8524{
 8525	unsigned int orig_len = dev->tx_queue_len;
 8526	int res;
 8527
 8528	if (new_len != (unsigned int)new_len)
 8529		return -ERANGE;
 8530
 8531	if (new_len != orig_len) {
 8532		dev->tx_queue_len = new_len;
 8533		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8534		res = notifier_to_errno(res);
 8535		if (res)
 8536			goto err_rollback;
 8537		res = dev_qdisc_change_tx_queue_len(dev);
 8538		if (res)
 8539			goto err_rollback;
 8540	}
 8541
 8542	return 0;
 8543
 8544err_rollback:
 8545	netdev_err(dev, "refused to change device tx_queue_len\n");
 8546	dev->tx_queue_len = orig_len;
 8547	return res;
 8548}
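/* Sketch (illustrative only): because of the rollback above, a caller only
 * needs to check the return value; on error dev->tx_queue_len has already
 * been restored to its previous value:
 *
 *	if (dev_change_tx_queue_len(dev, 4096))
 *		;	// queue length left unchanged
 */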
 8549
 8550/**
 8551 *	dev_set_group - Change group this device belongs to
 8552 *	@dev: device
 8553 *	@new_group: group this device should belong to
 8554 */
 8555void dev_set_group(struct net_device *dev, int new_group)
 8556{
 8557	dev->group = new_group;
 8558}
 8559EXPORT_SYMBOL(dev_set_group);
 8560
 8561/**
 8562 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8563 *	@dev: device
 8564 *	@addr: new address
 8565 *	@extack: netlink extended ack
 8566 */
 8567int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8568			      struct netlink_ext_ack *extack)
 8569{
 8570	struct netdev_notifier_pre_changeaddr_info info = {
 8571		.info.dev = dev,
 8572		.info.extack = extack,
 8573		.dev_addr = addr,
 8574	};
 8575	int rc;
 8576
 8577	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8578	return notifier_to_errno(rc);
 8579}
 8580EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8581
 8582/**
 8583 *	dev_set_mac_address - Change Media Access Control Address
 8584 *	@dev: device
 8585 *	@sa: new address
 8586 *	@extack: netlink extended ack
 8587 *
 8588 *	Change the hardware (MAC) address of the device
 8589 */
 8590int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8591			struct netlink_ext_ack *extack)
 8592{
 8593	const struct net_device_ops *ops = dev->netdev_ops;
 8594	int err;
 8595
 8596	if (!ops->ndo_set_mac_address)
 8597		return -EOPNOTSUPP;
 8598	if (sa->sa_family != dev->type)
 8599		return -EINVAL;
 8600	if (!netif_device_present(dev))
 8601		return -ENODEV;
 8602	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8603	if (err)
 8604		return err;
 8605	err = ops->ndo_set_mac_address(dev, sa);
 8606	if (err)
 8607		return err;
 8608	dev->addr_assign_type = NET_ADDR_SET;
 8609	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8610	add_device_randomness(dev->dev_addr, dev->addr_len);
 8611	return 0;
 8612}
 8613EXPORT_SYMBOL(dev_set_mac_address);
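/* Hedged example (illustrative only): setting a MAC address from a buffer,
 * where new_mac is a hypothetical u8 array of dev->addr_len bytes and the
 * device is assumed to be Ethernet-like (sa_family must equal dev->type):
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa, NULL);
 */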
 8614
 8615/**
 8616 *	dev_change_carrier - Change device carrier
 8617 *	@dev: device
 8618 *	@new_carrier: new value
 8619 *
 8620 *	Change device carrier
 8621 */
 8622int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8623{
 8624	const struct net_device_ops *ops = dev->netdev_ops;
 8625
 8626	if (!ops->ndo_change_carrier)
 8627		return -EOPNOTSUPP;
 8628	if (!netif_device_present(dev))
 8629		return -ENODEV;
 8630	return ops->ndo_change_carrier(dev, new_carrier);
 8631}
 8632EXPORT_SYMBOL(dev_change_carrier);
 8633
 8634/**
 8635 *	dev_get_phys_port_id - Get device physical port ID
 8636 *	@dev: device
 8637 *	@ppid: port ID
 8638 *
 8639 *	Get device physical port ID
 8640 */
 8641int dev_get_phys_port_id(struct net_device *dev,
 8642			 struct netdev_phys_item_id *ppid)
 8643{
 8644	const struct net_device_ops *ops = dev->netdev_ops;
 8645
 8646	if (!ops->ndo_get_phys_port_id)
 8647		return -EOPNOTSUPP;
 8648	return ops->ndo_get_phys_port_id(dev, ppid);
 8649}
 8650EXPORT_SYMBOL(dev_get_phys_port_id);
 8651
 8652/**
 8653 *	dev_get_phys_port_name - Get device physical port name
 8654 *	@dev: device
 8655 *	@name: port name
 8656 *	@len: limit of bytes to copy to name
 8657 *
 8658 *	Get device physical port name
 8659 */
 8660int dev_get_phys_port_name(struct net_device *dev,
 8661			   char *name, size_t len)
 8662{
 8663	const struct net_device_ops *ops = dev->netdev_ops;
 8664	int err;
 8665
 8666	if (ops->ndo_get_phys_port_name) {
 8667		err = ops->ndo_get_phys_port_name(dev, name, len);
 8668		if (err != -EOPNOTSUPP)
 8669			return err;
 8670	}
 8671	return devlink_compat_phys_port_name_get(dev, name, len);
 8672}
 8673EXPORT_SYMBOL(dev_get_phys_port_name);
 8674
 8675/**
 8676 *	dev_get_port_parent_id - Get the device's port parent identifier
 8677 *	@dev: network device
 8678 *	@ppid: pointer to a storage for the port's parent identifier
 8679 *	@recurse: allow/disallow recursion to lower devices
 8680 *
 8681 *	Get the device's port parent identifier
 8682 */
 8683int dev_get_port_parent_id(struct net_device *dev,
 8684			   struct netdev_phys_item_id *ppid,
 8685			   bool recurse)
 8686{
 8687	const struct net_device_ops *ops = dev->netdev_ops;
 8688	struct netdev_phys_item_id first = { };
 8689	struct net_device *lower_dev;
 8690	struct list_head *iter;
 8691	int err;
 8692
 8693	if (ops->ndo_get_port_parent_id) {
 8694		err = ops->ndo_get_port_parent_id(dev, ppid);
 8695		if (err != -EOPNOTSUPP)
 8696			return err;
 8697	}
 8698
 8699	err = devlink_compat_switch_id_get(dev, ppid);
 8700	if (!err || err != -EOPNOTSUPP)
 8701		return err;
 8702
 8703	if (!recurse)
 8704		return -EOPNOTSUPP;
 8705
 8706	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8707		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
 8708		if (err)
 8709			break;
 8710		if (!first.id_len)
 8711			first = *ppid;
 8712		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8713			return -EOPNOTSUPP;
 8714	}
 8715
 8716	return err;
 8717}
 8718EXPORT_SYMBOL(dev_get_port_parent_id);
 8719
 8720/**
 8721 *	netdev_port_same_parent_id - Indicate if two network devices have
 8722 *	the same port parent identifier
 8723 *	@a: first network device
 8724 *	@b: second network device
 8725 */
 8726bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8727{
 8728	struct netdev_phys_item_id a_id = { };
 8729	struct netdev_phys_item_id b_id = { };
 8730
 8731	if (dev_get_port_parent_id(a, &a_id, true) ||
 8732	    dev_get_port_parent_id(b, &b_id, true))
 8733		return false;
 8734
 8735	return netdev_phys_item_id_same(&a_id, &b_id);
 8736}
 8737EXPORT_SYMBOL(netdev_port_same_parent_id);
 8738
 8739/**
 8740 *	dev_change_proto_down - update protocol port state information
 8741 *	@dev: device
 8742 *	@proto_down: new value
 8743 *
 8744 *	This info can be used by switch drivers to set the phys state of the
 8745 *	port.
 8746 */
 8747int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8748{
 8749	const struct net_device_ops *ops = dev->netdev_ops;
 8750
 8751	if (!ops->ndo_change_proto_down)
 8752		return -EOPNOTSUPP;
 8753	if (!netif_device_present(dev))
 8754		return -ENODEV;
 8755	return ops->ndo_change_proto_down(dev, proto_down);
 8756}
 8757EXPORT_SYMBOL(dev_change_proto_down);
 8758
 8759/**
 8760 *	dev_change_proto_down_generic - generic implementation for
 8761 * 	ndo_change_proto_down that sets carrier according to
 8762 * 	proto_down.
 8763 *
 8764 *	@dev: device
 8765 *	@proto_down: new value
 8766 */
 8767int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 8768{
 8769	if (proto_down)
 8770		netif_carrier_off(dev);
 8771	else
 8772		netif_carrier_on(dev);
 8773	dev->proto_down = proto_down;
 8774	return 0;
 8775}
 8776EXPORT_SYMBOL(dev_change_proto_down_generic);
 8777
 8778/**
 8779 *	dev_change_proto_down_reason - proto down reason
 8780 *
 8781 *	@dev: device
 8782 *	@mask: proto down mask
 8783 *	@value: proto down value
 8784 */
 8785void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 8786				  u32 value)
 8787{
 8788	int b;
 8789
 8790	if (!mask) {
 8791		dev->proto_down_reason = value;
 8792	} else {
 8793		for_each_set_bit(b, &mask, 32) {
 8794			if (value & (1 << b))
 8795				dev->proto_down_reason |= BIT(b);
 8796			else
 8797				dev->proto_down_reason &= ~BIT(b);
 8798		}
 8799	}
 8800}
 8801EXPORT_SYMBOL(dev_change_proto_down_reason);
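/* Worked example (derived from the loop above): with mask = 0x6 and
 * value = 0x2, only bits 1 and 2 of dev->proto_down_reason are touched:
 * bit 1 is set, bit 2 is cleared, and every other reason bit keeps its
 * previous value.  A zero mask instead replaces the whole word with @value.
 */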
 8802
 8803struct bpf_xdp_link {
 8804	struct bpf_link link;
 8805	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
 8806	int flags;
 8807};
 8808
 8809static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 8810{
 8811	if (flags & XDP_FLAGS_HW_MODE)
 8812		return XDP_MODE_HW;
 8813	if (flags & XDP_FLAGS_DRV_MODE)
 8814		return XDP_MODE_DRV;
 8815	if (flags & XDP_FLAGS_SKB_MODE)
 8816		return XDP_MODE_SKB;
 8817	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 8818}
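/* Precedence sketch (derived from the checks above): HW_MODE wins over
 * DRV_MODE, which wins over SKB_MODE; with no mode flag at all the device
 * gets native (DRV) mode when its driver implements ndo_bpf and otherwise
 * falls back to generic (SKB) mode:
 *
 *	dev_xdp_mode(dev, XDP_FLAGS_HW_MODE)	-> XDP_MODE_HW
 *	dev_xdp_mode(dev, 0)			-> XDP_MODE_DRV or XDP_MODE_SKB
 */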
 8819
 8820static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 8821{
 8822	switch (mode) {
 8823	case XDP_MODE_SKB:
 8824		return generic_xdp_install;
 8825	case XDP_MODE_DRV:
 8826	case XDP_MODE_HW:
 8827		return dev->netdev_ops->ndo_bpf;
 8828	default:
 8829		return NULL;
 8830	}
 8831}
 8832
 8833static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
 8834					 enum bpf_xdp_mode mode)
 8835{
 8836	return dev->xdp_state[mode].link;
 8837}
 8838
 8839static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 8840				     enum bpf_xdp_mode mode)
 8841{
 8842	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 8843
 8844	if (link)
 8845		return link->link.prog;
 8846	return dev->xdp_state[mode].prog;
 8847}
 8848
 8849u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 8850{
 8851	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 8852
 8853	return prog ? prog->aux->id : 0;
 8854}
 8855
 8856static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
 8857			     struct bpf_xdp_link *link)
 8858{
 8859	dev->xdp_state[mode].link = link;
 8860	dev->xdp_state[mode].prog = NULL;
 8861}
 8862
 8863static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 8864			     struct bpf_prog *prog)
 8865{
 8866	dev->xdp_state[mode].link = NULL;
 8867	dev->xdp_state[mode].prog = prog;
 8868}
 8869
 8870static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 8871			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
 8872			   u32 flags, struct bpf_prog *prog)
 8873{
 8874	struct netdev_bpf xdp;
 8875	int err;
 8876
 8877	memset(&xdp, 0, sizeof(xdp));
 8878	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 8879	xdp.extack = extack;
 8880	xdp.flags = flags;
 8881	xdp.prog = prog;
 8882
 8883	/* Drivers assume refcnt is already incremented (i.e., prog pointer is
 8884	 * "moved" into driver), so they don't increment it on their own, but
 8885	 * they do decrement refcnt when program is detached or replaced.
 8886	 * Given net_device also owns link/prog, we need to bump refcnt here
 8887	 * to prevent drivers from underflowing it.
 8888	 */
 8889	if (prog)
 8890		bpf_prog_inc(prog);
 8891	err = bpf_op(dev, &xdp);
 8892	if (err) {
 8893		if (prog)
 8894			bpf_prog_put(prog);
 8895		return err;
 8896	}
 8897
 8898	if (mode != XDP_MODE_HW)
 8899		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
 8900
 8901	return 0;
 8902}
 8903
 8904static void dev_xdp_uninstall(struct net_device *dev)
 8905{
 8906	struct bpf_xdp_link *link;
 8907	struct bpf_prog *prog;
 8908	enum bpf_xdp_mode mode;
 8909	bpf_op_t bpf_op;
 8910
 8911	ASSERT_RTNL();
 8912
 8913	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
 8914		prog = dev_xdp_prog(dev, mode);
 8915		if (!prog)
 8916			continue;
 8917
 8918		bpf_op = dev_xdp_bpf_op(dev, mode);
 8919		if (!bpf_op)
 8920			continue;
 8921
 8922		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 8923
 8924		/* auto-detach link from net device */
 8925		link = dev_xdp_link(dev, mode);
 8926		if (link)
 8927			link->dev = NULL;
 8928		else
 8929			bpf_prog_put(prog);
 8930
 8931		dev_xdp_set_link(dev, mode, NULL);
 8932	}
 8933}
 8934
 8935static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
 8936			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
 8937			  struct bpf_prog *old_prog, u32 flags)
 8938{
 8939	struct bpf_prog *cur_prog;
 8940	enum bpf_xdp_mode mode;
 8941	bpf_op_t bpf_op;
 8942	int err;
 8943
 8944	ASSERT_RTNL();
 8945
 8946	/* either link or prog attachment, never both */
 8947	if (link && (new_prog || old_prog))
 8948		return -EINVAL;
 8949	/* link supports only XDP mode flags */
 8950	if (link && (flags & ~XDP_FLAGS_MODES)) {
 8951		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
 8952		return -EINVAL;
 8953	}
 8954	/* just one XDP mode bit should be set, zero defaults to SKB mode */
 8955	if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
 8956		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
 8957		return -EINVAL;
 8958	}
 8959	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
 8960	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
 8961		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
 8962		return -EINVAL;
 8963	}
 8964
 8965	mode = dev_xdp_mode(dev, flags);
 8966	/* can't replace attached link */
 8967	if (dev_xdp_link(dev, mode)) {
 8968		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
 8969		return -EBUSY;
 8970	}
 8971
 8972	cur_prog = dev_xdp_prog(dev, mode);
 8973	/* can't replace attached prog with link */
 8974	if (link && cur_prog) {
 8975		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
 8976		return -EBUSY;
 8977	}
 8978	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
 8979		NL_SET_ERR_MSG(extack, "Active program does not match expected");
 8980		return -EEXIST;
 8981	}
 8982
 8983	/* put effective new program into new_prog */
 8984	if (link)
 8985		new_prog = link->link.prog;
 8986
 8987	if (new_prog) {
 8988		bool offload = mode == XDP_MODE_HW;
 8989		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
 8990					       ? XDP_MODE_DRV : XDP_MODE_SKB;
 8991
 8992		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
 8993			NL_SET_ERR_MSG(extack, "XDP program already attached");
 8994			return -EBUSY;
 8995		}
 8996		if (!offload && dev_xdp_prog(dev, other_mode)) {
 8997			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
 8998			return -EEXIST;
 8999		}
 9000		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
 9001			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
 9002			return -EINVAL;
 9003		}
 9004		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 9005			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 9006			return -EINVAL;
 9007		}
 9008		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
 9009			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
 9010			return -EINVAL;
 9011		}
 9012	}
 9013
 9014	/* don't call drivers if the effective program didn't change */
 9015	if (new_prog != cur_prog) {
 9016		bpf_op = dev_xdp_bpf_op(dev, mode);
 9017		if (!bpf_op) {
 9018			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
 9019			return -EOPNOTSUPP;
 9020		}
 9021
 9022		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
 9023		if (err)
 9024			return err;
 9025	}
 9026
 9027	if (link)
 9028		dev_xdp_set_link(dev, mode, link);
 9029	else
 9030		dev_xdp_set_prog(dev, mode, new_prog);
 9031	if (cur_prog)
 9032		bpf_prog_put(cur_prog);
 9033
 9034	return 0;
 9035}
 9036
 9037static int dev_xdp_attach_link(struct net_device *dev,
 9038			       struct netlink_ext_ack *extack,
 9039			       struct bpf_xdp_link *link)
 9040{
 9041	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
 9042}
 9043
 9044static int dev_xdp_detach_link(struct net_device *dev,
 9045			       struct netlink_ext_ack *extack,
 9046			       struct bpf_xdp_link *link)
 9047{
 9048	enum bpf_xdp_mode mode;
 9049	bpf_op_t bpf_op;
 9050
 9051	ASSERT_RTNL();
 9052
 9053	mode = dev_xdp_mode(dev, link->flags);
 9054	if (dev_xdp_link(dev, mode) != link)
 9055		return -EINVAL;
 9056
 9057	bpf_op = dev_xdp_bpf_op(dev, mode);
 9058	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9059	dev_xdp_set_link(dev, mode, NULL);
 9060	return 0;
 9061}
 9062
 9063static void bpf_xdp_link_release(struct bpf_link *link)
 9064{
 9065	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9066
 9067	rtnl_lock();
 9068
 9069	/* if racing with net_device's tear down, xdp_link->dev might be
 9070	 * already NULL, in which case link was already auto-detached
 9071	 */
 9072	if (xdp_link->dev) {
 9073		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
 9074		xdp_link->dev = NULL;
 9075	}
 9076
 9077	rtnl_unlock();
 9078}
 9079
 9080static int bpf_xdp_link_detach(struct bpf_link *link)
 9081{
 9082	bpf_xdp_link_release(link);
 9083	return 0;
 9084}
 9085
 9086static void bpf_xdp_link_dealloc(struct bpf_link *link)
 9087{
 9088	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9089
 9090	kfree(xdp_link);
 9091}
 9092
 9093static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9094				     struct seq_file *seq)
 9095{
 9096	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9097	u32 ifindex = 0;
 9098
 9099	rtnl_lock();
 9100	if (xdp_link->dev)
 9101		ifindex = xdp_link->dev->ifindex;
 9102	rtnl_unlock();
 9103
 9104	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9105}
 9106
 9107static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9108				       struct bpf_link_info *info)
 9109{
 9110	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9111	u32 ifindex = 0;
 9112
 9113	rtnl_lock();
 9114	if (xdp_link->dev)
 9115		ifindex = xdp_link->dev->ifindex;
 9116	rtnl_unlock();
 9117
 9118	info->xdp.ifindex = ifindex;
 9119	return 0;
 9120}
 9121
 9122static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 9123			       struct bpf_prog *old_prog)
 9124{
 9125	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9126	enum bpf_xdp_mode mode;
 9127	bpf_op_t bpf_op;
 9128	int err = 0;
 9129
 9130	rtnl_lock();
 9131
 9132	/* link might have been auto-released already, so fail */
 9133	if (!xdp_link->dev) {
 9134		err = -ENOLINK;
 9135		goto out_unlock;
 9136	}
 9137
 9138	if (old_prog && link->prog != old_prog) {
 9139		err = -EPERM;
 9140		goto out_unlock;
 9141	}
 9142	old_prog = link->prog;
 9143	if (old_prog == new_prog) {
 9144		/* no-op, don't disturb drivers */
 9145		bpf_prog_put(new_prog);
 9146		goto out_unlock;
 9147	}
 9148
 9149	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
 9150	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
 9151	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
 9152			      xdp_link->flags, new_prog);
 9153	if (err)
 9154		goto out_unlock;
 9155
 9156	old_prog = xchg(&link->prog, new_prog);
 9157	bpf_prog_put(old_prog);
 9158
 9159out_unlock:
 9160	rtnl_unlock();
 9161	return err;
 9162}
 9163
 9164static const struct bpf_link_ops bpf_xdp_link_lops = {
 9165	.release = bpf_xdp_link_release,
 9166	.dealloc = bpf_xdp_link_dealloc,
 9167	.detach = bpf_xdp_link_detach,
 9168	.show_fdinfo = bpf_xdp_link_show_fdinfo,
 9169	.fill_link_info = bpf_xdp_link_fill_link_info,
 9170	.update_prog = bpf_xdp_link_update,
 9171};
 9172
 9173int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 9174{
 9175	struct net *net = current->nsproxy->net_ns;
 9176	struct bpf_link_primer link_primer;
 9177	struct bpf_xdp_link *link;
 9178	struct net_device *dev;
 9179	int err, fd;
 9180
 9181	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
 9182	if (!dev)
 9183		return -EINVAL;
 9184
 9185	link = kzalloc(sizeof(*link), GFP_USER);
 9186	if (!link) {
 9187		err = -ENOMEM;
 9188		goto out_put_dev;
 9189	}
 9190
 9191	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
 9192	link->dev = dev;
 9193	link->flags = attr->link_create.flags;
 9194
 9195	err = bpf_link_prime(&link->link, &link_primer);
 9196	if (err) {
 9197		kfree(link);
 9198		goto out_put_dev;
 9199	}
 9200
 9201	rtnl_lock();
 9202	err = dev_xdp_attach_link(dev, NULL, link);
 9203	rtnl_unlock();
 9204
 9205	if (err) {
 9206		bpf_link_cleanup(&link_primer);
 9207		goto out_put_dev;
 9208	}
 9209
 9210	fd = bpf_link_settle(&link_primer);
 9211	/* link itself doesn't hold dev's refcnt, so as not to complicate shutdown */
 9212	dev_put(dev);
 9213	return fd;
 9214
 9215out_put_dev:
 9216	dev_put(dev);
 9217	return err;
 9218}
 9219
 9220/**
 9221 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 9222 *	@dev: device
 9223 *	@extack: netlink extended ack
 9224 *	@fd: new program fd or negative value to clear
 9225 *	@expected_fd: old program fd that userspace expects to replace or clear
 9226 *	@flags: xdp-related flags
 9227 *
 9228 *	Set or clear a bpf program for a device
 9229 */
 9230int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 9231		      int fd, int expected_fd, u32 flags)
 9232{
 9233	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 9234	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 9235	int err;
 9236
 9237	ASSERT_RTNL();
 9238
 9239	if (fd >= 0) {
 9240		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 9241						 mode != XDP_MODE_SKB);
 9242		if (IS_ERR(new_prog))
 9243			return PTR_ERR(new_prog);
 9244	}
 9245
 9246	if (expected_fd >= 0) {
 9247		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
 9248						 mode != XDP_MODE_SKB);
 9249		if (IS_ERR(old_prog)) {
 9250			err = PTR_ERR(old_prog);
 9251			old_prog = NULL;
 9252			goto err_out;
 9253		}
 9254	}
 9255
 9256	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
 9257
 9258err_out:
 9259	if (err && new_prog)
 9260		bpf_prog_put(new_prog);
 9261	if (old_prog)
 9262		bpf_prog_put(old_prog);
 9263	return err;
 9264}
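/* Usage sketch (illustrative only): detaching whatever XDP program is
 * currently installed in native mode, with rtnl already held as the
 * ASSERT_RTNL() above requires; passing -1 for both fds means "clear, with
 * no expectation about the old program":
 *
 *	err = dev_change_xdp_fd(dev, NULL, -1, -1, XDP_FLAGS_DRV_MODE);
 */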
 9265
 9266/**
 9267 *	dev_new_index	-	allocate an ifindex
 9268 *	@net: the applicable net namespace
 9269 *
 9270 *	Returns a suitable unique value for a new device interface
 9271 *	number.  The caller must hold the rtnl semaphore or the
 9272 *	dev_base_lock to be sure it remains unique.
 9273 */
 9274static int dev_new_index(struct net *net)
 9275{
 9276	int ifindex = net->ifindex;
 9277
 9278	for (;;) {
 9279		if (++ifindex <= 0)
 9280			ifindex = 1;
 9281		if (!__dev_get_by_index(net, ifindex))
 9282			return net->ifindex = ifindex;
 9283	}
 9284}
 9285
 9286/* Delayed registration/unregisteration */
 9287static LIST_HEAD(net_todo_list);
 9288DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 9289
 9290static void net_set_todo(struct net_device *dev)
 9291{
 9292	list_add_tail(&dev->todo_list, &net_todo_list);
 9293	dev_net(dev)->dev_unreg_count++;
 9294}
 9295
 9296static void rollback_registered_many(struct list_head *head)
 9297{
 9298	struct net_device *dev, *tmp;
 9299	LIST_HEAD(close_head);
 9300
 9301	BUG_ON(dev_boot_phase);
 9302	ASSERT_RTNL();
 9303
 9304	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 9305		/* Some devices call without registering
 9306		 * for initialization unwind. Remove those
 9307		 * devices and proceed with the remaining.
 9308		 */
 9309		if (dev->reg_state == NETREG_UNINITIALIZED) {
 9310			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 9311				 dev->name, dev);
 9312
 9313			WARN_ON(1);
 9314			list_del(&dev->unreg_list);
 9315			continue;
 9316		}
 9317		dev->dismantle = true;
 9318		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 9319	}
 9320
 9321	/* If device is running, close it first. */
 9322	list_for_each_entry(dev, head, unreg_list)
 9323		list_add_tail(&dev->close_list, &close_head);
 9324	dev_close_many(&close_head, true);
 9325
 9326	list_for_each_entry(dev, head, unreg_list) {
 9327		/* And unlink it from device chain. */
 9328		unlist_netdevice(dev);
 9329
 9330		dev->reg_state = NETREG_UNREGISTERING;
 9331	}
 9332	flush_all_backlogs();
 9333
 9334	synchronize_net();
 9335
 9336	list_for_each_entry(dev, head, unreg_list) {
 9337		struct sk_buff *skb = NULL;
 9338
 9339		/* Shutdown queueing discipline. */
 9340		dev_shutdown(dev);
 9341
 9342		dev_xdp_uninstall(dev);
 9343
 9344		/* Notify protocols that we are about to destroy
 9345		 * this device. They should clean up all of their state.
 9346		 */
 9347		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 9348
 9349		if (!dev->rtnl_link_ops ||
 9350		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 9351			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 9352						     GFP_KERNEL, NULL, 0);
 9353
 9354		/*
 9355		 *	Flush the unicast and multicast chains
 9356		 */
 9357		dev_uc_flush(dev);
 9358		dev_mc_flush(dev);
 9359
 9360		netdev_name_node_alt_flush(dev);
 9361		netdev_name_node_free(dev->name_node);
 9362
 9363		if (dev->netdev_ops->ndo_uninit)
 9364			dev->netdev_ops->ndo_uninit(dev);
 9365
 9366		if (skb)
 9367			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 9368
 9369		/* Notifier chain MUST detach all upper devices from us. */
 9370		WARN_ON(netdev_has_any_upper_dev(dev));
 9371		WARN_ON(netdev_has_any_lower_dev(dev));
 9372
 9373		/* Remove entries from kobject tree */
 9374		netdev_unregister_kobject(dev);
 9375#ifdef CONFIG_XPS
 9376		/* Remove XPS queueing entries */
 9377		netif_reset_xps_queues_gt(dev, 0);
 9378#endif
 9379	}
 9380
 9381	synchronize_net();
 9382
 9383	list_for_each_entry(dev, head, unreg_list)
 9384		dev_put(dev);
 9385}
 9386
 9387static void rollback_registered(struct net_device *dev)
 9388{
 9389	LIST_HEAD(single);
 9390
 9391	list_add(&dev->unreg_list, &single);
 9392	rollback_registered_many(&single);
 9393	list_del(&single);
 9394}
 9395
 9396static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9397	struct net_device *upper, netdev_features_t features)
 9398{
 9399	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9400	netdev_features_t feature;
 9401	int feature_bit;
 9402
 9403	for_each_netdev_feature(upper_disables, feature_bit) {
 9404		feature = __NETIF_F_BIT(feature_bit);
 9405		if (!(upper->wanted_features & feature)
 9406		    && (features & feature)) {
 9407			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9408				   &feature, upper->name);
 9409			features &= ~feature;
 9410		}
 9411	}
 9412
 9413	return features;
 9414}
 9415
 9416static void netdev_sync_lower_features(struct net_device *upper,
 9417	struct net_device *lower, netdev_features_t features)
 9418{
 9419	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9420	netdev_features_t feature;
 9421	int feature_bit;
 9422
 9423	for_each_netdev_feature(upper_disables, feature_bit) {
 9424		feature = __NETIF_F_BIT(feature_bit);
 9425		if (!(features & feature) && (lower->features & feature)) {
 9426			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9427				   &feature, lower->name);
 9428			lower->wanted_features &= ~feature;
 9429			__netdev_update_features(lower);
 9430
 9431			if (unlikely(lower->features & feature))
 9432				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9433					    &feature, lower->name);
 9434			else
 9435				netdev_features_change(lower);
 9436		}
 9437	}
 9438}
 9439
 9440static netdev_features_t netdev_fix_features(struct net_device *dev,
 9441	netdev_features_t features)
 9442{
 9443	/* Fix illegal checksum combinations */
 9444	if ((features & NETIF_F_HW_CSUM) &&
 9445	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 9446		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 9447		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 9448	}
 9449
 9450	/* TSO requires that SG is present as well. */
 9451	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 9452		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 9453		features &= ~NETIF_F_ALL_TSO;
 9454	}
 9455
 9456	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 9457					!(features & NETIF_F_IP_CSUM)) {
 9458		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 9459		features &= ~NETIF_F_TSO;
 9460		features &= ~NETIF_F_TSO_ECN;
 9461	}
 9462
 9463	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 9464					 !(features & NETIF_F_IPV6_CSUM)) {
 9465		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 9466		features &= ~NETIF_F_TSO6;
 9467	}
 9468
 9469	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 9470	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 9471		features &= ~NETIF_F_TSO_MANGLEID;
 9472
 9473	/* TSO ECN requires that TSO is present as well. */
 9474	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 9475		features &= ~NETIF_F_TSO_ECN;
 9476
 9477	/* Software GSO depends on SG. */
 9478	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 9479		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 9480		features &= ~NETIF_F_GSO;
 9481	}
 9482
 9483	/* GSO partial features require GSO partial be set */
 9484	if ((features & dev->gso_partial_features) &&
 9485	    !(features & NETIF_F_GSO_PARTIAL)) {
 9486		netdev_dbg(dev,
 9487			   "Dropping partially supported GSO features since no GSO partial.\n");
 9488		features &= ~dev->gso_partial_features;
 9489	}
 9490
 9491	if (!(features & NETIF_F_RXCSUM)) {
 9492		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 9493		 * successfully merged by hardware must also have the
 9494		 * checksum verified by hardware.  If the user does not
 9495		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 9496		 */
 9497		if (features & NETIF_F_GRO_HW) {
 9498			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 9499			features &= ~NETIF_F_GRO_HW;
 9500		}
 9501	}
 9502
 9503	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 9504	if (features & NETIF_F_RXFCS) {
 9505		if (features & NETIF_F_LRO) {
 9506			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 9507			features &= ~NETIF_F_LRO;
 9508		}
 9509
 9510		if (features & NETIF_F_GRO_HW) {
 9511			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 9512			features &= ~NETIF_F_GRO_HW;
 9513		}
 9514	}
 9515
 9516	return features;
 9517}
 9518
 9519int __netdev_update_features(struct net_device *dev)
 9520{
 9521	struct net_device *upper, *lower;
 9522	netdev_features_t features;
 9523	struct list_head *iter;
 9524	int err = -1;
 9525
 9526	ASSERT_RTNL();
 9527
 9528	features = netdev_get_wanted_features(dev);
 9529
 9530	if (dev->netdev_ops->ndo_fix_features)
 9531		features = dev->netdev_ops->ndo_fix_features(dev, features);
 9532
 9533	/* driver might be less strict about feature dependencies */
 9534	features = netdev_fix_features(dev, features);
 9535
 9536	/* some features can't be enabled if they're off on an upper device */
 9537	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 9538		features = netdev_sync_upper_features(dev, upper, features);
 9539
 9540	if (dev->features == features)
 9541		goto sync_lower;
 9542
 9543	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 9544		&dev->features, &features);
 9545
 9546	if (dev->netdev_ops->ndo_set_features)
 9547		err = dev->netdev_ops->ndo_set_features(dev, features);
 9548	else
 9549		err = 0;
 9550
 9551	if (unlikely(err < 0)) {
 9552		netdev_err(dev,
 9553			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 9554			err, &features, &dev->features);
 9555		/* return non-0 since some features might have changed and
 9556		 * it's better to fire a spurious notification than miss it
 9557		 */
 9558		return -1;
 9559	}
 9560
 9561sync_lower:
 9562	/* some features must be disabled on lower devices when disabled
 9563	 * on an upper device (think: bonding master or bridge)
 9564	 */
 9565	netdev_for_each_lower_dev(dev, lower, iter)
 9566		netdev_sync_lower_features(dev, lower, features);
 9567
 9568	if (!err) {
 9569		netdev_features_t diff = features ^ dev->features;
 9570
 9571		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9572			/* udp_tunnel_{get,drop}_rx_info both need
 9573			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 9574			 * device, or they won't do anything.
 9575			 * Thus we need to update dev->features
 9576			 * *before* calling udp_tunnel_get_rx_info,
 9577			 * but *after* calling udp_tunnel_drop_rx_info.
 9578			 */
 9579			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9580				dev->features = features;
 9581				udp_tunnel_get_rx_info(dev);
 9582			} else {
 9583				udp_tunnel_drop_rx_info(dev);
 9584			}
 9585		}
 9586
 9587		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9588			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9589				dev->features = features;
 9590				err |= vlan_get_rx_ctag_filter_info(dev);
 9591			} else {
 9592				vlan_drop_rx_ctag_filter_info(dev);
 9593			}
 9594		}
 9595
 9596		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
 9597			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
 9598				dev->features = features;
 9599				err |= vlan_get_rx_stag_filter_info(dev);
 9600			} else {
 9601				vlan_drop_rx_stag_filter_info(dev);
 9602			}
 9603		}
 9604
 9605		dev->features = features;
 9606	}
 9607
 9608	return err < 0 ? 0 : 1;
 9609}
 9610
 9611/**
 9612 *	netdev_update_features - recalculate device features
 9613 *	@dev: the device to check
 9614 *
 9615 *	Recalculate dev->features set and send notifications if it
 9616 *	has changed. Should be called after driver or hardware dependent
 9617 *	conditions might have changed that influence the features.
 9618 */
 9619void netdev_update_features(struct net_device *dev)
 9620{
 9621	if (__netdev_update_features(dev))
 9622		netdev_features_change(dev);
 9623}
 9624EXPORT_SYMBOL(netdev_update_features);
 9625
 9626/**
 9627 *	netdev_change_features - recalculate device features
 9628 *	@dev: the device to check
 9629 *
 9630 *	Recalculate dev->features set and send notifications even
 9631 *	if they have not changed. Should be called instead of
 9632 *	netdev_update_features() if also dev->vlan_features might
 9633 *	have changed to allow the changes to be propagated to stacked
 9634 *	VLAN devices.
 9635 */
 9636void netdev_change_features(struct net_device *dev)
 9637{
 9638	__netdev_update_features(dev);
 9639	netdev_features_change(dev);
 9640}
 9641EXPORT_SYMBOL(netdev_change_features);
 9642
 9643/**
 9644 *	netif_stacked_transfer_operstate -	transfer operstate
 9645 *	@rootdev: the root or lower level device to transfer state from
 9646 *	@dev: the device to transfer operstate to
 9647 *
 9648 *	Transfer operational state from root to device. This is normally
 9649 *	called when a stacking relationship exists between the root
 9650 *	device and the device (a leaf device).
 9651 */
 9652void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 9653					struct net_device *dev)
 9654{
 9655	if (rootdev->operstate == IF_OPER_DORMANT)
 9656		netif_dormant_on(dev);
 9657	else
 9658		netif_dormant_off(dev);
 9659
 9660	if (rootdev->operstate == IF_OPER_TESTING)
 9661		netif_testing_on(dev);
 9662	else
 9663		netif_testing_off(dev);
 9664
 9665	if (netif_carrier_ok(rootdev))
 9666		netif_carrier_on(dev);
 9667	else
 9668		netif_carrier_off(dev);
 9669}
 9670EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 9671
 9672static int netif_alloc_rx_queues(struct net_device *dev)
 9673{
 9674	unsigned int i, count = dev->num_rx_queues;
 9675	struct netdev_rx_queue *rx;
 9676	size_t sz = count * sizeof(*rx);
 9677	int err = 0;
 9678
 9679	BUG_ON(count < 1);
 9680
 9681	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9682	if (!rx)
 9683		return -ENOMEM;
 9684
 9685	dev->_rx = rx;
 9686
 9687	for (i = 0; i < count; i++) {
 9688		rx[i].dev = dev;
 9689
 9690		/* XDP RX-queue setup */
 9691		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
 9692		if (err < 0)
 9693			goto err_rxq_info;
 9694	}
 9695	return 0;
 9696
 9697err_rxq_info:
 9698	/* Rollback successful reg's and free other resources */
 9699	while (i--)
 9700		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 9701	kvfree(dev->_rx);
 9702	dev->_rx = NULL;
 9703	return err;
 9704}
 9705
 9706static void netif_free_rx_queues(struct net_device *dev)
 9707{
 9708	unsigned int i, count = dev->num_rx_queues;
 9709
 9710	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 9711	if (!dev->_rx)
 9712		return;
 9713
 9714	for (i = 0; i < count; i++)
 9715		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 9716
 9717	kvfree(dev->_rx);
 9718}
 9719
 9720static void netdev_init_one_queue(struct net_device *dev,
 9721				  struct netdev_queue *queue, void *_unused)
 9722{
 9723	/* Initialize queue lock */
 9724	spin_lock_init(&queue->_xmit_lock);
 9725	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
 9726	queue->xmit_lock_owner = -1;
 9727	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 9728	queue->dev = dev;
 9729#ifdef CONFIG_BQL
 9730	dql_init(&queue->dql, HZ);
 9731#endif
 9732}
 9733
 9734static void netif_free_tx_queues(struct net_device *dev)
 9735{
 9736	kvfree(dev->_tx);
 9737}
 9738
 9739static int netif_alloc_netdev_queues(struct net_device *dev)
 9740{
 9741	unsigned int count = dev->num_tx_queues;
 9742	struct netdev_queue *tx;
 9743	size_t sz = count * sizeof(*tx);
 9744
 9745	if (count < 1 || count > 0xffff)
 9746		return -EINVAL;
 9747
 9748	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9749	if (!tx)
 9750		return -ENOMEM;
 9751
 9752	dev->_tx = tx;
 9753
 9754	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 9755	spin_lock_init(&dev->tx_global_lock);
 9756
 9757	return 0;
 9758}
 9759
 9760void netif_tx_stop_all_queues(struct net_device *dev)
 9761{
 9762	unsigned int i;
 9763
 9764	for (i = 0; i < dev->num_tx_queues; i++) {
 9765		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 9766
 9767		netif_tx_stop_queue(txq);
 9768	}
 9769}
 9770EXPORT_SYMBOL(netif_tx_stop_all_queues);
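/* Illustrative note (not from the original source): a driver's ndo_stop()
 * path would typically stop all TX queues before tearing down its rings:
 *
 *	netif_tx_stop_all_queues(dev);
 *	// ... then free DMA rings, disable interrupts, etc.
 */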
 9771
 9772/**
 9773 *	register_netdevice	- register a network device
 9774 *	@dev: device to register
 9775 *
 9776 *	Take a completed network device structure and add it to the kernel
 9777 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 9778 *	chain. 0 is returned on success. A negative errno code is returned
 9779 *	on a failure to set up the device, or if the name is a duplicate.
 9780 *
 9781 *	Callers must hold the rtnl semaphore. You may want
 9782 *	register_netdev() instead of this.
 9783 *
 9784 *	BUGS:
 9785 *	The locking appears insufficient to guarantee two parallel registers
 9786 *	will not get the same name.
 9787 */
 9788
 9789int register_netdevice(struct net_device *dev)
 9790{
 9791	int ret;
 9792	struct net *net = dev_net(dev);
 9793
 9794	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
 9795		     NETDEV_FEATURE_COUNT);
 9796	BUG_ON(dev_boot_phase);
 9797	ASSERT_RTNL();
 9798
 9799	might_sleep();
 9800
 9801	/* When net_device's are persistent, this will be fatal. */
 9802	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 9803	BUG_ON(!net);
 9804
 9805	ret = ethtool_check_ops(dev->ethtool_ops);
 9806	if (ret)
 9807		return ret;
 9808
 9809	spin_lock_init(&dev->addr_list_lock);
 9810	netdev_set_addr_lockdep_class(dev);
 9811
 9812	ret = dev_get_valid_name(net, dev, dev->name);
 9813	if (ret < 0)
 9814		goto out;
 9815
 9816	ret = -ENOMEM;
 9817	dev->name_node = netdev_name_node_head_alloc(dev);
 9818	if (!dev->name_node)
 9819		goto out;
 9820
 9821	/* Init, if this function is available */
 9822	if (dev->netdev_ops->ndo_init) {
 9823		ret = dev->netdev_ops->ndo_init(dev);
 9824		if (ret) {
 9825			if (ret > 0)
 9826				ret = -EIO;
 9827			goto err_free_name;
 9828		}
 9829	}
 9830
 9831	if (((dev->hw_features | dev->features) &
 9832	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
 9833	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
 9834	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
 9835		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
 9836		ret = -EINVAL;
 9837		goto err_uninit;
 9838	}
 9839
 9840	ret = -EBUSY;
 9841	if (!dev->ifindex)
 9842		dev->ifindex = dev_new_index(net);
 9843	else if (__dev_get_by_index(net, dev->ifindex))
 9844		goto err_uninit;
 9845
 9846	/* Transfer changeable features to wanted_features and enable
 9847	 * software offloads (GSO and GRO).
 9848	 */
 9849	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
 9850	dev->features |= NETIF_F_SOFT_FEATURES;
 9851
 9852	if (dev->netdev_ops->ndo_udp_tunnel_add) {
 9853		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9854		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9855	}
 9856
 9857	dev->wanted_features = dev->features & dev->hw_features;
 9858
 9859	if (!(dev->flags & IFF_LOOPBACK))
 9860		dev->hw_features |= NETIF_F_NOCACHE_COPY;
 9861
 9862	/* If IPv4 TCP segmentation offload is supported we should also
 9863	 * allow the device to enable segmenting the frame with the option
 9864	 * of ignoring a static IP ID value.  This doesn't enable the
 9865	 * feature itself but allows the user to enable it later.
 9866	 */
 9867	if (dev->hw_features & NETIF_F_TSO)
 9868		dev->hw_features |= NETIF_F_TSO_MANGLEID;
 9869	if (dev->vlan_features & NETIF_F_TSO)
 9870		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
 9871	if (dev->mpls_features & NETIF_F_TSO)
 9872		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
 9873	if (dev->hw_enc_features & NETIF_F_TSO)
 9874		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 9875
 9876	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
 9877	 */
 9878	dev->vlan_features |= NETIF_F_HIGHDMA;
 9879
 9880	/* Make NETIF_F_SG inheritable to tunnel devices.
 9881	 */
 9882	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
 9883
 9884	/* Make NETIF_F_SG inheritable to MPLS.
 9885	 */
 9886	dev->mpls_features |= NETIF_F_SG;
 9887
 9888	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 9889	ret = notifier_to_errno(ret);
 9890	if (ret)
 9891		goto err_uninit;
 9892
 9893	ret = netdev_register_kobject(dev);
 9894	if (ret) {
 9895		dev->reg_state = NETREG_UNREGISTERED;
 9896		goto err_uninit;
 9897	}
 9898	dev->reg_state = NETREG_REGISTERED;
 9899
 9900	__netdev_update_features(dev);
 9901
 9902	/*
 9903	 *	Default initial state at registration is that the
 9904	 *	device is present.
 9905	 */
 9906
 9907	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9908
 9909	linkwatch_init_dev(dev);
 9910
 9911	dev_init_scheduler(dev);
 9912	dev_hold(dev);
 9913	list_netdevice(dev);
 9914	add_device_randomness(dev->dev_addr, dev->addr_len);
 9915
 9916	/* If the device has permanent device address, driver should
 9917	 * set dev_addr and also addr_assign_type should be set to
 9918	 * NET_ADDR_PERM (default value).
 9919	 */
 9920	if (dev->addr_assign_type == NET_ADDR_PERM)
 9921		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 9922
 9923	/* Notify protocols, that a new device appeared. */
 9924	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
 9925	ret = notifier_to_errno(ret);
 9926	if (ret) {
 9927		rollback_registered(dev);
 9928		rcu_barrier();
 9929
 9930		dev->reg_state = NETREG_UNREGISTERED;
 9931		/* We should put the kobject that we hold in
 9932		 * netdev_unregister_kobject(), otherwise
 9933		 * the net device cannot be freed when
 9934		 * the driver calls free_netdev(), because the
 9935		 * kobject is still being held.
 9936		 */
 9937		kobject_put(&dev->dev.kobj);
 9938	}
 9939	/*
 9940	 *	Prevent userspace races by waiting until the network
 9941	 *	device is fully setup before sending notifications.
 9942	 */
 9943	if (!dev->rtnl_link_ops ||
 9944	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 9945		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
 9946
 9947out:
 9948	return ret;
 9949
 9950err_uninit:
 9951	if (dev->netdev_ops->ndo_uninit)
 9952		dev->netdev_ops->ndo_uninit(dev);
 9953	if (dev->priv_destructor)
 9954		dev->priv_destructor(dev);
 9955err_free_name:
 9956	netdev_name_node_free(dev->name_node);
 9957	goto out;
 9958}
 9959EXPORT_SYMBOL(register_netdevice);
 9960
 9961/**
 9962 *	init_dummy_netdev	- init a dummy network device for NAPI
 9963 *	@dev: device to init
 9964 *
 9965 *	This takes a network device structure and initializes the minimum
 9966 *	number of fields so it can be used to schedule NAPI polls without
 9967 *	registering a full-blown interface. This is to be used by drivers
 9968 *	that need to tie several hardware interfaces to a single NAPI
 9969 *	poll scheduler due to HW limitations.
 9970 */
 9971int init_dummy_netdev(struct net_device *dev)
 9972{
 9973	/* Clear everything. Note we don't initialize spinlocks
 9974	 * as they aren't supposed to be taken by any of the
 9975	 * NAPI code and this dummy netdev is supposed to be
 9976	 * only ever used for NAPI polls
 9977	 */
 9978	memset(dev, 0, sizeof(struct net_device));
 9979
 9980	/* make sure we BUG if trying to hit standard
 9981	 * register/unregister code path
 9982	 */
 9983	dev->reg_state = NETREG_DUMMY;
 9984
 9985	/* NAPI wants this */
 9986	INIT_LIST_HEAD(&dev->napi_list);
 9987
 9988	/* a dummy interface is started by default */
 9989	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9990	set_bit(__LINK_STATE_START, &dev->state);
 9991
 9992	/* napi_busy_loop stats accounting wants this */
 9993	dev_net_set(dev, &init_net);
 9994
 9995	/* Note: We don't allocate pcpu_refcnt for dummy devices,
 9996	 * because users of this 'device' don't need to change
 9997	 * its refcount.
 9998	 */
 9999
10000	return 0;
10001}
10002EXPORT_SYMBOL_GPL(init_dummy_netdev);
10003
10004
10005/**
10006 *	register_netdev	- register a network device
10007 *	@dev: device to register
10008 *
10009 *	Take a completed network device structure and add it to the kernel
10010 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10011 *	chain. 0 is returned on success. A negative errno code is returned
10012 *	on a failure to set up the device, or if the name is a duplicate.
10013 *
10014 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10015 *	and expands the device name if you passed a format string to
10016 *	alloc_netdev.
10017 */
10018int register_netdev(struct net_device *dev)
10019{
10020	int err;
10021
10022	if (rtnl_lock_killable())
10023		return -EINTR;
10024	err = register_netdevice(dev);
10025	rtnl_unlock();
10026	return err;
10027}
10028EXPORT_SYMBOL(register_netdev);
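/* Hedged example (hypothetical driver probe, illustrative only): allocate,
 * fill in the ops, then register; on failure the caller still owns the
 * device and must free it, since registration never completed.  my_priv and
 * my_netdev_ops are assumed driver-side names:
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err)
 *		free_netdev(dev);
 */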
10029
10030int netdev_refcnt_read(const struct net_device *dev)
10031{
10032	int i, refcnt = 0;
10033
10034	for_each_possible_cpu(i)
10035		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10036	return refcnt;
10037}
10038EXPORT_SYMBOL(netdev_refcnt_read);
10039
10040/**
10041 * netdev_wait_allrefs - wait until all references are gone.
10042 * @dev: target net_device
10043 *
10044 * This is called when unregistering network devices.
10045 *
10046 * Any protocol or device that holds a reference should register
10047 * for netdevice notification, and cleanup and put back the
10048 * reference if they receive an UNREGISTER event.
10049 * We can get stuck here if buggy protocols don't correctly
10050 * call dev_put.
10051 */
10052static void netdev_wait_allrefs(struct net_device *dev)
10053{
10054	unsigned long rebroadcast_time, warning_time;
10055	int refcnt;
10056
10057	linkwatch_forget_dev(dev);
10058
10059	rebroadcast_time = warning_time = jiffies;
10060	refcnt = netdev_refcnt_read(dev);
10061
10062	while (refcnt != 0) {
10063		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10064			rtnl_lock();
10065
10066			/* Rebroadcast unregister notification */
10067			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10068
10069			__rtnl_unlock();
10070			rcu_barrier();
10071			rtnl_lock();
10072
10073			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10074				     &dev->state)) {
10075				/* We must not have linkwatch events
10076				 * pending on unregister. If this
10077				 * happens, we simply run the queue
10078				 * unscheduled, resulting in a noop
10079				 * for this device.
10080				 */
10081				linkwatch_run_queue();
10082			}
10083
10084			__rtnl_unlock();
10085
10086			rebroadcast_time = jiffies;
10087		}
10088
10089		msleep(250);
10090
10091		refcnt = netdev_refcnt_read(dev);
10092
10093		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
10094			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10095				 dev->name, refcnt);
10096			warning_time = jiffies;
10097		}
10098	}
10099}
10100
10101/* The sequence is:
10102 *
10103 *	rtnl_lock();
10104 *	...
10105 *	register_netdevice(x1);
10106 *	register_netdevice(x2);
10107 *	...
10108 *	unregister_netdevice(y1);
10109 *	unregister_netdevice(y2);
10110 *      ...
10111 *	rtnl_unlock();
10112 *	free_netdev(y1);
10113 *	free_netdev(y2);
10114 *
10115 * We are invoked by rtnl_unlock().
10116 * This allows us to deal with problems:
10117 * 1) We can delete sysfs objects which invoke hotplug
10118 *    without deadlocking with linkwatch via keventd.
10119 * 2) Since we run with the RTNL semaphore not held, we can sleep
10120 *    safely in order to wait for the netdev refcnt to drop to zero.
10121 *
10122 * We must not return until all unregister events added during
10123 * the interval the lock was held have been completed.
10124 */
10125void netdev_run_todo(void)
10126{
10127	struct list_head list;
10128#ifdef CONFIG_LOCKDEP
10129	struct list_head unlink_list;
10130
10131	list_replace_init(&net_unlink_list, &unlink_list);
10132
10133	while (!list_empty(&unlink_list)) {
10134		struct net_device *dev = list_first_entry(&unlink_list,
10135							  struct net_device,
10136							  unlink_list);
10137		list_del(&dev->unlink_list);
10138		dev->nested_level = dev->lower_level - 1;
10139	}
10140#endif
10141
10142	/* Snapshot list, allow later requests */
10143	list_replace_init(&net_todo_list, &list);
10144
10145	__rtnl_unlock();
10146
10147
10148	/* Wait for rcu callbacks to finish before next phase */
10149	if (!list_empty(&list))
10150		rcu_barrier();
10151
10152	while (!list_empty(&list)) {
10153		struct net_device *dev
10154			= list_first_entry(&list, struct net_device, todo_list);
10155		list_del(&dev->todo_list);
10156
10157		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10158			pr_err("network todo '%s' but state %d\n",
10159			       dev->name, dev->reg_state);
10160			dump_stack();
10161			continue;
10162		}
10163
10164		dev->reg_state = NETREG_UNREGISTERED;
10165
10166		netdev_wait_allrefs(dev);
10167
10168		/* paranoia */
10169		BUG_ON(netdev_refcnt_read(dev));
10170		BUG_ON(!list_empty(&dev->ptype_all));
10171		BUG_ON(!list_empty(&dev->ptype_specific));
10172		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10173		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10174#if IS_ENABLED(CONFIG_DECNET)
10175		WARN_ON(dev->dn_ptr);
10176#endif
10177		if (dev->priv_destructor)
10178			dev->priv_destructor(dev);
10179		if (dev->needs_free_netdev)
10180			free_netdev(dev);
10181
10182		/* Report a network device has been unregistered */
10183		rtnl_lock();
10184		dev_net(dev)->dev_unreg_count--;
10185		__rtnl_unlock();
10186		wake_up(&netdev_unregistering_wq);
10187
10188		/* Free network device */
10189		kobject_put(&dev->dev.kobj);
10190	}
10191}
10192
10193/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10194 * all the same fields in the same order as net_device_stats, with only
10195 * the type differing, but rtnl_link_stats64 may have additional fields
10196 * at the end for newer counters.
10197 */
10198void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10199			     const struct net_device_stats *netdev_stats)
10200{
10201#if BITS_PER_LONG == 64
10202	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10203	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10204	/* zero out counters that only exist in rtnl_link_stats64 */
10205	memset((char *)stats64 + sizeof(*netdev_stats), 0,
10206	       sizeof(*stats64) - sizeof(*netdev_stats));
10207#else
10208	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10209	const unsigned long *src = (const unsigned long *)netdev_stats;
10210	u64 *dst = (u64 *)stats64;
10211
10212	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10213	for (i = 0; i < n; i++)
10214		dst[i] = src[i];
10215	/* zero out counters that only exist in rtnl_link_stats64 */
10216	memset((char *)stats64 + n * sizeof(u64), 0,
10217	       sizeof(*stats64) - n * sizeof(u64));
10218#endif
10219}
10220EXPORT_SYMBOL(netdev_stats_to_stats64);
10221
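/* Illustrative sketch (not part of dev.c): a driver that still keeps its
 * counters in the legacy dev->stats structure can use the helper above as
 * the first step of a 64-bit stats callback and then fold in any wider
 * counters it maintains itself.  struct foo_priv, its tx_dropped counter
 * and foo_get_stats64() are hypothetical names.
 */
struct foo_priv {
	atomic64_t tx_dropped;
};

static void foo_get_stats64(struct net_device *dev,
			    struct rtnl_link_stats64 *stats)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* copies the unsigned long counters and zeroes the 64-bit-only ones */
	netdev_stats_to_stats64(stats, &dev->stats);
	stats->tx_dropped += atomic64_read(&priv->tx_dropped);
}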
10222/**
10223 *	dev_get_stats	- get network device statistics
10224 *	@dev: device to get statistics from
10225 *	@storage: place to store stats
10226 *
10227 *	Get network statistics from device. Return @storage.
10228 *	The device driver may provide its own method by setting
10229 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10230 *	otherwise the internal statistics structure is used.
10231 */
10232struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10233					struct rtnl_link_stats64 *storage)
10234{
10235	const struct net_device_ops *ops = dev->netdev_ops;
10236
10237	if (ops->ndo_get_stats64) {
10238		memset(storage, 0, sizeof(*storage));
10239		ops->ndo_get_stats64(dev, storage);
10240	} else if (ops->ndo_get_stats) {
10241		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10242	} else {
10243		netdev_stats_to_stats64(storage, &dev->stats);
10244	}
10245	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10246	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10247	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10248	return storage;
10249}
10250EXPORT_SYMBOL(dev_get_stats);
10251
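/* Illustrative sketch (not part of dev.c): dev_get_stats() prefers
 * ndo_get_stats64, then ndo_get_stats, then dev->stats.  A driver keeping
 * 64-bit counters in its private area only needs the first of these; the
 * caller's buffer is zeroed before the callback runs, so untouched fields
 * read as zero.  All bar_* names are hypothetical.
 */
struct bar_counters {
	u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
};

static void bar_get_stats64(struct net_device *dev,
			    struct rtnl_link_stats64 *stats)
{
	const struct bar_counters *c = netdev_priv(dev);

	stats->rx_packets = c->rx_packets;
	stats->rx_bytes   = c->rx_bytes;
	stats->tx_packets = c->tx_packets;
	stats->tx_bytes   = c->tx_bytes;
}

static const struct net_device_ops bar_netdev_ops = {
	.ndo_get_stats64 = bar_get_stats64,
};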
10252struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10253{
10254	struct netdev_queue *queue = dev_ingress_queue(dev);
10255
10256#ifdef CONFIG_NET_CLS_ACT
10257	if (queue)
10258		return queue;
10259	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10260	if (!queue)
10261		return NULL;
10262	netdev_init_one_queue(dev, queue, NULL);
10263	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10264	queue->qdisc_sleeping = &noop_qdisc;
10265	rcu_assign_pointer(dev->ingress_queue, queue);
10266#endif
10267	return queue;
10268}
10269
10270static const struct ethtool_ops default_ethtool_ops;
10271
10272void netdev_set_default_ethtool_ops(struct net_device *dev,
10273				    const struct ethtool_ops *ops)
10274{
10275	if (dev->ethtool_ops == &default_ethtool_ops)
10276		dev->ethtool_ops = ops;
10277}
10278EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10279
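/* Illustrative sketch (not part of dev.c): an intermediate layer (a bus or
 * library driver) can install fallback ethtool ops with the helper above;
 * because it only replaces the core default, a driver that already set
 * dev->ethtool_ops keeps its own ops.  The foo_* names are hypothetical.
 */
static const struct ethtool_ops foo_default_ethtool_ops = {
	.get_link = ethtool_op_get_link,
};

static void foo_setup_ethtool(struct net_device *dev)
{
	/* only takes effect while dev->ethtool_ops is still the core default */
	netdev_set_default_ethtool_ops(dev, &foo_default_ethtool_ops);
}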
10280void netdev_freemem(struct net_device *dev)
10281{
10282	char *addr = (char *)dev - dev->padded;
10283
10284	kvfree(addr);
10285}
10286
10287/**
10288 * alloc_netdev_mqs - allocate network device
10289 * @sizeof_priv: size of private data to allocate space for
10290 * @name: device name format string
10291 * @name_assign_type: origin of device name
10292 * @setup: callback to initialize device
10293 * @txqs: the number of TX subqueues to allocate
10294 * @rxqs: the number of RX subqueues to allocate
10295 *
10296 * Allocates a struct net_device with private data area for driver use
10297 * and performs basic initialization.  Also allocates subqueue structs
10298 * for each queue on the device.
10299 */
10300struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10301		unsigned char name_assign_type,
10302		void (*setup)(struct net_device *),
10303		unsigned int txqs, unsigned int rxqs)
10304{
10305	struct net_device *dev;
10306	unsigned int alloc_size;
10307	struct net_device *p;
10308
10309	BUG_ON(strlen(name) >= sizeof(dev->name));
10310
10311	if (txqs < 1) {
10312		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10313		return NULL;
10314	}
10315
10316	if (rxqs < 1) {
10317		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10318		return NULL;
10319	}
10320
10321	alloc_size = sizeof(struct net_device);
10322	if (sizeof_priv) {
10323		/* ensure 32-byte alignment of private area */
10324		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10325		alloc_size += sizeof_priv;
10326	}
10327	/* ensure 32-byte alignment of whole construct */
10328	alloc_size += NETDEV_ALIGN - 1;
10329
10330	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10331	if (!p)
10332		return NULL;
10333
10334	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10335	dev->padded = (char *)dev - (char *)p;
10336
10337	dev->pcpu_refcnt = alloc_percpu(int);
10338	if (!dev->pcpu_refcnt)
10339		goto free_dev;
10340
10341	if (dev_addr_init(dev))
10342		goto free_pcpu;
10343
10344	dev_mc_init(dev);
10345	dev_uc_init(dev);
10346
10347	dev_net_set(dev, &init_net);
10348
10349	dev->gso_max_size = GSO_MAX_SIZE;
10350	dev->gso_max_segs = GSO_MAX_SEGS;
10351	dev->upper_level = 1;
10352	dev->lower_level = 1;
10353#ifdef CONFIG_LOCKDEP
10354	dev->nested_level = 0;
10355	INIT_LIST_HEAD(&dev->unlink_list);
10356#endif
10357
10358	INIT_LIST_HEAD(&dev->napi_list);
10359	INIT_LIST_HEAD(&dev->unreg_list);
10360	INIT_LIST_HEAD(&dev->close_list);
10361	INIT_LIST_HEAD(&dev->link_watch_list);
10362	INIT_LIST_HEAD(&dev->adj_list.upper);
10363	INIT_LIST_HEAD(&dev->adj_list.lower);
10364	INIT_LIST_HEAD(&dev->ptype_all);
10365	INIT_LIST_HEAD(&dev->ptype_specific);
10366	INIT_LIST_HEAD(&dev->net_notifier_list);
10367#ifdef CONFIG_NET_SCHED
10368	hash_init(dev->qdisc_hash);
10369#endif
10370	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10371	setup(dev);
10372
10373	if (!dev->tx_queue_len) {
10374		dev->priv_flags |= IFF_NO_QUEUE;
10375		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10376	}
10377
10378	dev->num_tx_queues = txqs;
10379	dev->real_num_tx_queues = txqs;
10380	if (netif_alloc_netdev_queues(dev))
10381		goto free_all;
10382
10383	dev->num_rx_queues = rxqs;
10384	dev->real_num_rx_queues = rxqs;
10385	if (netif_alloc_rx_queues(dev))
10386		goto free_all;
10387
10388	strcpy(dev->name, name);
10389	dev->name_assign_type = name_assign_type;
10390	dev->group = INIT_NETDEV_GROUP;
10391	if (!dev->ethtool_ops)
10392		dev->ethtool_ops = &default_ethtool_ops;
10393
10394	nf_hook_ingress_init(dev);
10395
10396	return dev;
10397
10398free_all:
10399	free_netdev(dev);
10400	return NULL;
10401
10402free_pcpu:
10403	free_percpu(dev->pcpu_refcnt);
10404free_dev:
10405	netdev_freemem(dev);
10406	return NULL;
10407}
10408EXPORT_SYMBOL(alloc_netdev_mqs);
10409
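/* Illustrative sketch (not part of dev.c): drivers normally reach
 * alloc_netdev_mqs() through a wrapper such as alloc_etherdev_mqs() or the
 * alloc_netdev() macro, then pair it with register_netdev()/free_netdev().
 * foo_probe() and struct foo_eth_priv are hypothetical.
 */
struct foo_eth_priv {
	int link_up;
};

static int foo_probe(void)
{
	struct net_device *dev;
	int err;

	/* private area for struct foo_eth_priv, 4 TX and 4 RX queues */
	dev = alloc_etherdev_mqs(sizeof(struct foo_eth_priv), 4, 4);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err) {
		/* reg_state is still NETREG_UNINITIALIZED on this path,
		 * so free_netdev() releases the memory directly.
		 */
		free_netdev(dev);
		return err;
	}
	return 0;
}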
10410/**
10411 * free_netdev - free network device
10412 * @dev: device
10413 *
10414 * This function does the last stage of destroying an allocated device
10415 * interface. The reference to the device object is released. If this
10416 * is the last reference then it will be freed. Must be called in process
10417 * context.
10418 */
10419void free_netdev(struct net_device *dev)
10420{
10421	struct napi_struct *p, *n;
10422
10423	might_sleep();
10424	netif_free_tx_queues(dev);
10425	netif_free_rx_queues(dev);
10426
10427	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10428
10429	/* Flush device addresses */
10430	dev_addr_flush(dev);
10431
10432	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10433		netif_napi_del(p);
10434
10435	free_percpu(dev->pcpu_refcnt);
10436	dev->pcpu_refcnt = NULL;
10437	free_percpu(dev->xdp_bulkq);
10438	dev->xdp_bulkq = NULL;
10439
10440	/*  Compatibility with error handling in drivers */
10441	if (dev->reg_state == NETREG_UNINITIALIZED) {
10442		netdev_freemem(dev);
10443		return;
10444	}
10445
10446	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10447	dev->reg_state = NETREG_RELEASED;
10448
10449	/* will free via device release */
10450	put_device(&dev->dev);
10451}
10452EXPORT_SYMBOL(free_netdev);
10453
10454/**
10455 *	synchronize_net -  Synchronize with packet receive processing
10456 *
10457 *	Wait for packets currently being received to be done.
10458 *	Does not block later packets from starting.
10459 */
10460void synchronize_net(void)
10461{
10462	might_sleep();
10463	if (rtnl_is_locked())
10464		synchronize_rcu_expedited();
10465	else
10466		synchronize_rcu();
10467}
10468EXPORT_SYMBOL(synchronize_net);
10469
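/* Illustrative sketch (not part of dev.c): a common pattern is to unhook a
 * receive path (clear an RCU pointer, remove a handler) and then call
 * synchronize_net() before freeing state that in-flight receivers may still
 * be reading.  struct foo_rx_state, struct foo_rx_priv and foo_disable_rx()
 * are hypothetical; the caller is assumed to hold the RTNL.
 */
struct foo_rx_state {
	int dummy;
};

struct foo_rx_priv {
	struct foo_rx_state __rcu *rx_state;
};

static void foo_disable_rx(struct foo_rx_priv *priv)
{
	struct foo_rx_state *old;

	ASSERT_RTNL();
	old = rtnl_dereference(priv->rx_state);

	/* stop new packets from seeing the state ... */
	RCU_INIT_POINTER(priv->rx_state, NULL);

	/* ... wait for receivers that already dereferenced it ... */
	synchronize_net();

	/* ... and only then free it */
	kfree(old);
}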
10470/**
10471 *	unregister_netdevice_queue - remove device from the kernel
10472 *	@dev: device
10473 *	@head: list
10474 *
10475 *	This function shuts down a device interface and removes it
10476 *	from the kernel tables.
10477 *	If @head is not NULL, the device is queued to be unregistered later.
10478 *
10479 *	Callers must hold the rtnl semaphore.  You may want
10480 *	unregister_netdev() instead of this.
10481 */
10482
10483void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10484{
10485	ASSERT_RTNL();
10486
10487	if (head) {
10488		list_move_tail(&dev->unreg_list, head);
10489	} else {
10490		rollback_registered(dev);
10491		/* Finish processing unregister after unlock */
10492		net_set_todo(dev);
10493	}
10494}
10495EXPORT_SYMBOL(unregister_netdevice_queue);
10496
10497/**
10498 *	unregister_netdevice_many - unregister many devices
10499 *	@head: list of devices
10500 *
10501 *  Note: As most callers use a stack-allocated list_head,
10502 *  we force a list_del() to make sure the stack won't be corrupted later.
10503 */
10504void unregister_netdevice_many(struct list_head *head)
10505{
10506	struct net_device *dev;
10507
10508	if (!list_empty(head)) {
10509		rollback_registered_many(head);
10510		list_for_each_entry(dev, head, unreg_list)
10511			net_set_todo(dev);
10512		list_del(head);
10513	}
10514}
10515EXPORT_SYMBOL(unregister_netdevice_many);
10516
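/* Illustrative sketch (not part of dev.c): teardown paths that remove many
 * devices batch them on a stack list with unregister_netdevice_queue() and
 * flush the list once, the same pattern default_device_exit_batch() uses
 * further down.  foo_link_ops and foo_destroy_all() are hypothetical.
 */
static struct rtnl_link_ops foo_link_ops;

static void foo_destroy_all(struct net *net)
{
	struct net_device *dev, *aux;
	LIST_HEAD(kill_list);

	ASSERT_RTNL();
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &foo_link_ops)
			unregister_netdevice_queue(dev, &kill_list);

	/* also does the list_del() that keeps the stack list valid */
	unregister_netdevice_many(&kill_list);
}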
10517/**
10518 *	unregister_netdev - remove device from the kernel
10519 *	@dev: device
10520 *
10521 *	This function shuts down a device interface and removes it
10522 *	from the kernel tables.
10523 *
10524 *	This is just a wrapper for unregister_netdevice that takes
10525 *	the rtnl semaphore.  In general you want to use this and not
10526 *	unregister_netdevice.
10527 */
10528void unregister_netdev(struct net_device *dev)
10529{
10530	rtnl_lock();
10531	unregister_netdevice(dev);
10532	rtnl_unlock();
10533}
10534EXPORT_SYMBOL(unregister_netdev);
10535
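/* Illustrative sketch (not part of dev.c): the typical driver remove path
 * simply pairs register_netdev() with unregister_netdev() and frees the
 * device afterwards.  foo_remove() is a hypothetical name.
 */
static void foo_remove(struct net_device *dev)
{
	/* takes and releases the RTNL, then waits in netdev_run_todo() */
	unregister_netdev(dev);

	/* final free once all references are gone (only needed when the
	 * driver did not set dev->needs_free_netdev)
	 */
	free_netdev(dev);
}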
10536/**
10537 *	dev_change_net_namespace - move device to a different network namespace
10538 *	@dev: device
10539 *	@net: network namespace
10540 *	@pat: If not NULL name pattern to try if the current device name
10541 *	      is already taken in the destination network namespace.
10542 *
10543 *	This function shuts down a device interface and moves it
10544 *	to a new network namespace. On success 0 is returned, on
10545 *	a failure a negative errno code is returned.
10546 *
10547 *	Callers must hold the rtnl semaphore.
10548 */
10549
10550int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10551{
10552	struct net *net_old = dev_net(dev);
10553	int err, new_nsid, new_ifindex;
10554
10555	ASSERT_RTNL();
10556
10557	/* Don't allow namespace local devices to be moved. */
10558	err = -EINVAL;
10559	if (dev->features & NETIF_F_NETNS_LOCAL)
10560		goto out;
10561
10562	/* Ensure the device has been registered */
10563	if (dev->reg_state != NETREG_REGISTERED)
10564		goto out;
10565
10566	/* Get out if there is nothing to do */
10567	err = 0;
10568	if (net_eq(net_old, net))
10569		goto out;
10570
10571	/* Pick the destination device name, and ensure
10572	 * we can use it in the destination network namespace.
10573	 */
10574	err = -EEXIST;
10575	if (__dev_get_by_name(net, dev->name)) {
10576		/* We get here if we can't use the current device name */
10577		if (!pat)
10578			goto out;
10579		err = dev_get_valid_name(net, dev, pat);
10580		if (err < 0)
10581			goto out;
10582	}
10583
10584	/*
10585	 * And now a mini version of register_netdevice()/unregister_netdevice().
10586	 */
10587
10588	/* If device is running close it first. */
10589	dev_close(dev);
10590
10591	/* And unlink it from device chain */
10592	unlist_netdevice(dev);
10593
10594	synchronize_net();
10595
10596	/* Shutdown queueing discipline. */
10597	dev_shutdown(dev);
10598
10599	/* Notify protocols that we are about to destroy
10600	 * this device. They should clean all the things.
10601	 *
10602	 * Note that dev->reg_state stays at NETREG_REGISTERED.
10603	 * This is wanted because this way 8021q and macvlan know
10604	 * the device is just moving and can keep their slaves up.
10605	 */
10606	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10607	rcu_barrier();
10608
10609	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10610	/* If there is an ifindex conflict assign a new one */
10611	if (__dev_get_by_index(net, dev->ifindex))
10612		new_ifindex = dev_new_index(net);
10613	else
10614		new_ifindex = dev->ifindex;
10615
10616	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10617			    new_ifindex);
10618
10619	/*
10620	 *	Flush the unicast and multicast chains
10621	 */
10622	dev_uc_flush(dev);
10623	dev_mc_flush(dev);
10624
10625	/* Send a netdev-removed uevent to the old namespace */
10626	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10627	netdev_adjacent_del_links(dev);
10628
10629	/* Move per-net netdevice notifiers that are following the netdevice */
10630	move_netdevice_notifiers_dev_net(dev, net);
10631
10632	/* Actually switch the network namespace */
10633	dev_net_set(dev, net);
10634	dev->ifindex = new_ifindex;
10635
10636	/* Send a netdev-add uevent to the new namespace */
10637	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10638	netdev_adjacent_add_links(dev);
10639
10640	/* Fixup kobjects */
10641	err = device_rename(&dev->dev, dev->name);
10642	WARN_ON(err);
10643
10644	/* Adapt owner in case owning user namespace of target network
10645	 * namespace is different from the original one.
10646	 */
10647	err = netdev_change_owner(dev, net_old, net);
10648	WARN_ON(err);
10649
10650	/* Add the device back in the hashes */
10651	list_netdevice(dev);
10652
10653	/* Notify protocols that a new device appeared. */
10654	call_netdevice_notifiers(NETDEV_REGISTER, dev);
10655
10656	/*
10657	 *	Prevent userspace races by waiting until the network
10658	 *	device is fully set up before sending notifications.
10659	 */
10660	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10661
10662	synchronize_net();
10663	err = 0;
10664out:
10665	return err;
10666}
10667EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10668
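/* Illustrative sketch (not part of dev.c): callers hold the RTNL and pass a
 * fallback name pattern for the case where the current name is already
 * taken in the target namespace, much like the "dev%d" fallback used by
 * default_device_exit() below.  foo_move_to_ns() is a hypothetical helper.
 */
static int foo_move_to_ns(struct net_device *dev, struct net *target)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target, "eth%d");
	rtnl_unlock();

	return err;
}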
10669static int dev_cpu_dead(unsigned int oldcpu)
10670{
10671	struct sk_buff **list_skb;
10672	struct sk_buff *skb;
10673	unsigned int cpu;
10674	struct softnet_data *sd, *oldsd, *remsd = NULL;
10675
10676	local_irq_disable();
10677	cpu = smp_processor_id();
10678	sd = &per_cpu(softnet_data, cpu);
10679	oldsd = &per_cpu(softnet_data, oldcpu);
10680
10681	/* Find end of our completion_queue. */
10682	list_skb = &sd->completion_queue;
10683	while (*list_skb)
10684		list_skb = &(*list_skb)->next;
10685	/* Append completion queue from offline CPU. */
10686	*list_skb = oldsd->completion_queue;
10687	oldsd->completion_queue = NULL;
10688
10689	/* Append output queue from offline CPU. */
10690	if (oldsd->output_queue) {
10691		*sd->output_queue_tailp = oldsd->output_queue;
10692		sd->output_queue_tailp = oldsd->output_queue_tailp;
10693		oldsd->output_queue = NULL;
10694		oldsd->output_queue_tailp = &oldsd->output_queue;
10695	}
10696	/* Append NAPI poll list from offline CPU, with one exception:
10697	 * process_backlog() must be called by the CPU owning the percpu backlog.
10698	 * We properly handle process_queue & input_pkt_queue later.
10699	 */
10700	while (!list_empty(&oldsd->poll_list)) {
10701		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10702							    struct napi_struct,
10703							    poll_list);
10704
10705		list_del_init(&napi->poll_list);
10706		if (napi->poll == process_backlog)
10707			napi->state = 0;
10708		else
10709			____napi_schedule(sd, napi);
10710	}
10711
10712	raise_softirq_irqoff(NET_TX_SOFTIRQ);
10713	local_irq_enable();
10714
10715#ifdef CONFIG_RPS
10716	remsd = oldsd->rps_ipi_list;
10717	oldsd->rps_ipi_list = NULL;
10718#endif
10719	/* send out pending IPIs on offline CPU */
10720	net_rps_send_ipi(remsd);
10721
10722	/* Process offline CPU's input_pkt_queue */
10723	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10724		netif_rx_ni(skb);
10725		input_queue_head_incr(oldsd);
10726	}
10727	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10728		netif_rx_ni(skb);
10729		input_queue_head_incr(oldsd);
10730	}
10731
10732	return 0;
10733}
10734
10735/**
10736 *	netdev_increment_features - increment feature set by one
10737 *	@all: current feature set
10738 *	@one: new feature set
10739 *	@mask: mask feature set
10740 *
10741 *	Computes a new feature set after adding a device with feature set
10742 *	@one to the master device with current feature set @all.  Will not
10743 *	enable anything that is off in @mask. Returns the new feature set.
10744 */
10745netdev_features_t netdev_increment_features(netdev_features_t all,
10746	netdev_features_t one, netdev_features_t mask)
10747{
10748	if (mask & NETIF_F_HW_CSUM)
10749		mask |= NETIF_F_CSUM_MASK;
10750	mask |= NETIF_F_VLAN_CHALLENGED;
10751
10752	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10753	all &= one | ~NETIF_F_ALL_FOR_ALL;
10754
10755	/* If one device supports hw checksumming, set for all. */
10756	if (all & NETIF_F_HW_CSUM)
10757		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10758
10759	return all;
10760}
10761EXPORT_SYMBOL(netdev_increment_features);
10762
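/* Illustrative sketch (not part of dev.c): a master device (bridge- or
 * bond-like) typically folds each lower device's features into its own
 * with the helper above, in the spirit of br_features_recompute().
 * foo_recompute_features() is a hypothetical name.
 */
static netdev_features_t foo_recompute_features(struct net_device *master)
{
	netdev_features_t mask = master->features;
	netdev_features_t features = mask & ~NETIF_F_ONE_FOR_ALL;
	struct net_device *lower;
	struct list_head *iter;

	netdev_for_each_lower_dev(master, lower, iter)
		features = netdev_increment_features(features,
						     lower->features, mask);

	return features;
}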
10763static struct hlist_head * __net_init netdev_create_hash(void)
10764{
10765	int i;
10766	struct hlist_head *hash;
10767
10768	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10769	if (hash != NULL)
10770		for (i = 0; i < NETDEV_HASHENTRIES; i++)
10771			INIT_HLIST_HEAD(&hash[i]);
10772
10773	return hash;
10774}
10775
10776/* Initialize per network namespace state */
10777static int __net_init netdev_init(struct net *net)
10778{
10779	BUILD_BUG_ON(GRO_HASH_BUCKETS >
10780		     8 * sizeof_field(struct napi_struct, gro_bitmask));
10781
10782	if (net != &init_net)
10783		INIT_LIST_HEAD(&net->dev_base_head);
10784
10785	net->dev_name_head = netdev_create_hash();
10786	if (net->dev_name_head == NULL)
10787		goto err_name;
10788
10789	net->dev_index_head = netdev_create_hash();
10790	if (net->dev_index_head == NULL)
10791		goto err_idx;
10792
10793	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
10794
10795	return 0;
10796
10797err_idx:
10798	kfree(net->dev_name_head);
10799err_name:
10800	return -ENOMEM;
10801}
10802
10803/**
10804 *	netdev_drivername - network driver for the device
10805 *	@dev: network device
10806 *
10807 *	Determine network driver for device.
10808 */
10809const char *netdev_drivername(const struct net_device *dev)
10810{
10811	const struct device_driver *driver;
10812	const struct device *parent;
10813	const char *empty = "";
10814
10815	parent = dev->dev.parent;
10816	if (!parent)
10817		return empty;
10818
10819	driver = parent->driver;
10820	if (driver && driver->name)
10821		return driver->name;
10822	return empty;
10823}
10824
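/* Illustrative sketch (not part of dev.c): the helper is mainly useful in
 * diagnostics that should name the responsible driver, e.g. a timeout
 * warning in the spirit of the qdisc watchdog.  foo_report_tx_timeout() is
 * a hypothetical name.
 */
static void foo_report_tx_timeout(struct net_device *dev, unsigned int queue)
{
	netdev_warn(dev, "transmit queue %u timed out (driver: %s)\n",
		    queue, netdev_drivername(dev));
}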
10825static void __netdev_printk(const char *level, const struct net_device *dev,
10826			    struct va_format *vaf)
10827{
10828	if (dev && dev->dev.parent) {
10829		dev_printk_emit(level[1] - '0',
10830				dev->dev.parent,
10831				"%s %s %s%s: %pV",
10832				dev_driver_string(dev->dev.parent),
10833				dev_name(dev->dev.parent),
10834				netdev_name(dev), netdev_reg_state(dev),
10835				vaf);
10836	} else if (dev) {
10837		printk("%s%s%s: %pV",
10838		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
10839	} else {
10840		printk("%s(NULL net_device): %pV", level, vaf);
10841	}
10842}
10843
10844void netdev_printk(const char *level, const struct net_device *dev,
10845		   const char *format, ...)
10846{
10847	struct va_format vaf;
10848	va_list args;
10849
10850	va_start(args, format);
10851
10852	vaf.fmt = format;
10853	vaf.va = &args;
10854
10855	__netdev_printk(level, dev, &vaf);
10856
10857	va_end(args);
10858}
10859EXPORT_SYMBOL(netdev_printk);
10860
10861#define define_netdev_printk_level(func, level)			\
10862void func(const struct net_device *dev, const char *fmt, ...)	\
10863{								\
10864	struct va_format vaf;					\
10865	va_list args;						\
10866								\
10867	va_start(args, fmt);					\
10868								\
10869	vaf.fmt = fmt;						\
10870	vaf.va = &args;						\
10871								\
10872	__netdev_printk(level, dev, &vaf);			\
10873								\
10874	va_end(args);						\
10875}								\
10876EXPORT_SYMBOL(func);
10877
10878define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10879define_netdev_printk_level(netdev_alert, KERN_ALERT);
10880define_netdev_printk_level(netdev_crit, KERN_CRIT);
10881define_netdev_printk_level(netdev_err, KERN_ERR);
10882define_netdev_printk_level(netdev_warn, KERN_WARNING);
10883define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10884define_netdev_printk_level(netdev_info, KERN_INFO);
10885
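/* Illustrative sketch (not part of dev.c): the generated helpers are used
 * like dev_err()/dev_info(), prefixing the message with the driver, the
 * parent device and the interface name.  foo_link_change() is a
 * hypothetical example.
 */
static void foo_link_change(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_notice(dev, "link is down\n");
}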
10886static void __net_exit netdev_exit(struct net *net)
10887{
10888	kfree(net->dev_name_head);
10889	kfree(net->dev_index_head);
10890	if (net != &init_net)
10891		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10892}
10893
10894static struct pernet_operations __net_initdata netdev_net_ops = {
10895	.init = netdev_init,
10896	.exit = netdev_exit,
10897};
10898
10899static void __net_exit default_device_exit(struct net *net)
10900{
10901	struct net_device *dev, *aux;
10902	/*
10903	 * Push all migratable network devices back to the
10904	 * initial network namespace
10905	 */
10906	rtnl_lock();
10907	for_each_netdev_safe(net, dev, aux) {
10908		int err;
10909		char fb_name[IFNAMSIZ];
10910
10911		/* Ignore unmoveable devices (i.e. loopback) */
10912		if (dev->features & NETIF_F_NETNS_LOCAL)
10913			continue;
10914
10915		/* Leave virtual devices for the generic cleanup */
10916		if (dev->rtnl_link_ops)
10917			continue;
10918
10919		/* Push remaining network devices to init_net */
10920		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10921		if (__dev_get_by_name(&init_net, fb_name))
10922			snprintf(fb_name, IFNAMSIZ, "dev%%d");
10923		err = dev_change_net_namespace(dev, &init_net, fb_name);
10924		if (err) {
10925			pr_emerg("%s: failed to move %s to init_net: %d\n",
10926				 __func__, dev->name, err);
10927			BUG();
10928		}
10929	}
10930	rtnl_unlock();
10931}
10932
10933static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10934{
10935	/* Return with the rtnl_lock held when there are no network
10936	 * devices unregistering in any network namespace in net_list.
10937	 */
10938	struct net *net;
10939	bool unregistering;
10940	DEFINE_WAIT_FUNC(wait, woken_wake_function);
10941
10942	add_wait_queue(&netdev_unregistering_wq, &wait);
10943	for (;;) {
10944		unregistering = false;
10945		rtnl_lock();
10946		list_for_each_entry(net, net_list, exit_list) {
10947			if (net->dev_unreg_count > 0) {
10948				unregistering = true;
10949				break;
10950			}
10951		}
10952		if (!unregistering)
10953			break;
10954		__rtnl_unlock();
10955
10956		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10957	}
10958	remove_wait_queue(&netdev_unregistering_wq, &wait);
10959}
10960
10961static void __net_exit default_device_exit_batch(struct list_head *net_list)
10962{
10963	/* At exit all network devices must be removed from a network
10964	 * namespace.  Do this in the reverse order of registration.
10965	 * Do this across as many network namespaces as possible to
10966	 * improve batching efficiency.
10967	 */
10968	struct net_device *dev;
10969	struct net *net;
10970	LIST_HEAD(dev_kill_list);
10971
10972	/* To prevent network device cleanup code from dereferencing
10973	 * loopback devices or network devices that have been freed
10974	 * wait here for all pending unregistrations to complete,
10975	 * before unregistering the loopback device and allowing the
10976	 * network namespace to be freed.
10977	 *
10978	 * The netdev todo list containing all network device
10979	 * unregistrations that happen in default_device_exit_batch
10980	 * will run in the rtnl_unlock() at the end of
10981	 * default_device_exit_batch.
10982	 */
10983	rtnl_lock_unregistering(net_list);
10984	list_for_each_entry(net, net_list, exit_list) {
10985		for_each_netdev_reverse(net, dev) {
10986			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10987				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10988			else
10989				unregister_netdevice_queue(dev, &dev_kill_list);
10990		}
10991	}
10992	unregister_netdevice_many(&dev_kill_list);
10993	rtnl_unlock();
10994}
10995
10996static struct pernet_operations __net_initdata default_device_ops = {
10997	.exit = default_device_exit,
10998	.exit_batch = default_device_exit_batch,
10999};
11000
11001/*
11002 *	Initialize the DEV module. At boot time this walks the device list and
11003 *	unhooks any devices that fail to initialise (normally hardware not
11004 *	present) and leaves us with a valid list of present and active devices.
11005 *
11006 */
11007
11008/*
11009 *       This is called single threaded during boot, so no need
11010 *       to take the rtnl semaphore.
11011 */
11012static int __init net_dev_init(void)
11013{
11014	int i, rc = -ENOMEM;
11015
11016	BUG_ON(!dev_boot_phase);
11017
11018	if (dev_proc_init())
11019		goto out;
11020
11021	if (netdev_kobject_init())
11022		goto out;
11023
11024	INIT_LIST_HEAD(&ptype_all);
11025	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11026		INIT_LIST_HEAD(&ptype_base[i]);
11027
11028	INIT_LIST_HEAD(&offload_base);
11029
11030	if (register_pernet_subsys(&netdev_net_ops))
11031		goto out;
11032
11033	/*
11034	 *	Initialise the packet receive queues.
11035	 */
11036
11037	for_each_possible_cpu(i) {
11038		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11039		struct softnet_data *sd = &per_cpu(softnet_data, i);
11040
11041		INIT_WORK(flush, flush_backlog);
11042
11043		skb_queue_head_init(&sd->input_pkt_queue);
11044		skb_queue_head_init(&sd->process_queue);
11045#ifdef CONFIG_XFRM_OFFLOAD
11046		skb_queue_head_init(&sd->xfrm_backlog);
11047#endif
11048		INIT_LIST_HEAD(&sd->poll_list);
11049		sd->output_queue_tailp = &sd->output_queue;
11050#ifdef CONFIG_RPS
11051		sd->csd.func = rps_trigger_softirq;
11052		sd->csd.info = sd;
11053		sd->cpu = i;
11054#endif
11055
11056		init_gro_hash(&sd->backlog);
11057		sd->backlog.poll = process_backlog;
11058		sd->backlog.weight = weight_p;
11059	}
11060
11061	dev_boot_phase = 0;
11062
11063	/* The loopback device is special: if any other network device
11064	 * is present in a network namespace, the loopback device must
11065	 * be present too.  Since we now dynamically allocate and free
11066	 * the loopback device, ensure this invariant is maintained by
11067	 * keeping the loopback device as the first device on the
11068	 * list of network devices, so that the loopback device is
11069	 * the first device that appears and the last network device
11070	 * that disappears.
11071	 */
11072	if (register_pernet_device(&loopback_net_ops))
11073		goto out;
11074
11075	if (register_pernet_device(&default_device_ops))
11076		goto out;
11077
11078	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11079	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11080
11081	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11082				       NULL, dev_cpu_dead);
11083	WARN_ON(rc < 0);
11084	rc = 0;
11085out:
11086	return rc;
11087}
11088
11089subsys_initcall(net_dev_init);