Linux Audio

Check our new training course

Loading...
v4.17
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/sched/mm.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <linux/bpf.h>
  99#include <linux/bpf_trace.h>
 100#include <net/net_namespace.h>
 101#include <net/sock.h>
 102#include <net/busy_poll.h>
 103#include <linux/rtnetlink.h>
 104#include <linux/stat.h>
 105#include <net/dst.h>
 106#include <net/dst_metadata.h>
 107#include <net/pkt_sched.h>
 108#include <net/pkt_cls.h>
 109#include <net/checksum.h>
 110#include <net/xfrm.h>
 111#include <linux/highmem.h>
 112#include <linux/init.h>
 113#include <linux/module.h>
 114#include <linux/netpoll.h>
 115#include <linux/rcupdate.h>
 116#include <linux/delay.h>
 117#include <net/iw_handler.h>
 118#include <asm/current.h>
 119#include <linux/audit.h>
 120#include <linux/dmaengine.h>
 121#include <linux/err.h>
 122#include <linux/ctype.h>
 123#include <linux/if_arp.h>
 124#include <linux/if_vlan.h>
 125#include <linux/ip.h>
 126#include <net/ip.h>
 127#include <net/mpls.h>
 128#include <linux/ipv6.h>
 129#include <linux/in.h>
 130#include <linux/jhash.h>
 131#include <linux/random.h>
 132#include <trace/events/napi.h>
 133#include <trace/events/net.h>
 134#include <trace/events/skb.h>
 135#include <linux/pci.h>
 136#include <linux/inetdevice.h>
 137#include <linux/cpu_rmap.h>
 138#include <linux/static_key.h>
 139#include <linux/hashtable.h>
 140#include <linux/vmalloc.h>
 141#include <linux/if_macvlan.h>
 142#include <linux/errqueue.h>
 143#include <linux/hrtimer.h>
 144#include <linux/netfilter_ingress.h>
 145#include <linux/crash_dump.h>
 146#include <linux/sctp.h>
 147#include <net/udp_tunnel.h>
 148#include <linux/net_namespace.h>
 149
 150#include "net-sysfs.h"
 151
 152/* Instead of increasing this, you should create a hash table. */
 153#define MAX_GRO_SKBS 8
 154
 155/* This should be increased if a protocol with a bigger head is added. */
 156#define GRO_MAX_HEAD (MAX_HEADER + 128)
 157
 158static DEFINE_SPINLOCK(ptype_lock);
 159static DEFINE_SPINLOCK(offload_lock);
 160struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 161struct list_head ptype_all __read_mostly;	/* Taps */
 162static struct list_head offload_base __read_mostly;
 163
 164static int netif_rx_internal(struct sk_buff *skb);
 165static int call_netdevice_notifiers_info(unsigned long val,
 
 166					 struct netdev_notifier_info *info);
 167static struct napi_struct *napi_by_id(unsigned int napi_id);
 168
 169/*
 170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 171 * semaphore.
 172 *
 173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 174 *
 175 * Writers must hold the rtnl semaphore while they loop through the
 176 * dev_base_head list, and hold dev_base_lock for writing when they do the
 177 * actual updates.  This allows pure readers to access the list even
 178 * while a writer is preparing to update it.
 179 *
 180 * To put it another way, dev_base_lock is held for writing only to
 181 * protect against pure readers; the rtnl semaphore provides the
 182 * protection against other writers.
 183 *
 184 * See, for example usages, register_netdevice() and
 185 * unregister_netdevice(), which must be called with the rtnl
 186 * semaphore held.
 187 */
 188DEFINE_RWLOCK(dev_base_lock);
 189EXPORT_SYMBOL(dev_base_lock);
 190
 191static DEFINE_MUTEX(ifalias_mutex);
 192
 193/* protects napi_hash addition/deletion and napi_gen_id */
 194static DEFINE_SPINLOCK(napi_hash_lock);
 195
 196static unsigned int napi_gen_id = NR_CPUS;
 197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 198
 199static seqcount_t devnet_rename_seq;
 200
 201static inline void dev_base_seq_inc(struct net *net)
 202{
 203	while (++net->dev_base_seq == 0)
 204		;
 205}
 206
 207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208{
 209	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 210
 211	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 212}
 213
 214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 215{
 216	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 217}
 218
 219static inline void rps_lock(struct softnet_data *sd)
 220{
 221#ifdef CONFIG_RPS
 222	spin_lock(&sd->input_pkt_queue.lock);
 223#endif
 224}
 225
 226static inline void rps_unlock(struct softnet_data *sd)
 227{
 228#ifdef CONFIG_RPS
 229	spin_unlock(&sd->input_pkt_queue.lock);
 230#endif
 231}
 232
 233/* Device list insertion */
 234static void list_netdevice(struct net_device *dev)
 235{
 236	struct net *net = dev_net(dev);
 237
 238	ASSERT_RTNL();
 239
 240	write_lock_bh(&dev_base_lock);
 241	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 242	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 243	hlist_add_head_rcu(&dev->index_hlist,
 244			   dev_index_hash(net, dev->ifindex));
 245	write_unlock_bh(&dev_base_lock);
 246
 247	dev_base_seq_inc(net);
 248}
 249
 250/* Device list removal
 251 * caller must respect a RCU grace period before freeing/reusing dev
 252 */
 253static void unlist_netdevice(struct net_device *dev)
 254{
 255	ASSERT_RTNL();
 256
 257	/* Unlink dev from the device chain */
 258	write_lock_bh(&dev_base_lock);
 259	list_del_rcu(&dev->dev_list);
 260	hlist_del_rcu(&dev->name_hlist);
 261	hlist_del_rcu(&dev->index_hlist);
 262	write_unlock_bh(&dev_base_lock);
 263
 264	dev_base_seq_inc(dev_net(dev));
 265}
 266
 267/*
 268 *	Our notifier list
 269 */
 270
 271static RAW_NOTIFIER_HEAD(netdev_chain);
 272
 273/*
 274 *	Device drivers call our routines to queue packets here. We empty the
 275 *	queue in the local softnet handler.
 276 */
 277
 278DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 279EXPORT_PER_CPU_SYMBOL(softnet_data);
 280
 281#ifdef CONFIG_LOCKDEP
 282/*
 283 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 284 * according to dev->type
 285 */
 286static const unsigned short netdev_lock_type[] = {
 287	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 288	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 289	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 290	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 291	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 292	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 293	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 294	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 295	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 296	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 297	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 298	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 299	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 300	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 301	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 302
 303static const char *const netdev_lock_name[] = {
 304	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 305	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 306	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 307	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 308	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 309	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 310	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 311	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 312	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 313	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 314	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 315	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 316	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 317	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 318	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 319
 320static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 321static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 322
 323static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 324{
 325	int i;
 326
 327	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 328		if (netdev_lock_type[i] == dev_type)
 329			return i;
 330	/* the last key is used by default */
 331	return ARRAY_SIZE(netdev_lock_type) - 1;
 332}
 333
 334static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 335						 unsigned short dev_type)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev_type);
 340	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 341				   netdev_lock_name[i]);
 342}
 343
 344static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345{
 346	int i;
 347
 348	i = netdev_lock_pos(dev->type);
 349	lockdep_set_class_and_name(&dev->addr_list_lock,
 350				   &netdev_addr_lock_key[i],
 351				   netdev_lock_name[i]);
 352}
 353#else
 354static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 355						 unsigned short dev_type)
 356{
 357}
 358static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 359{
 360}
 361#endif
 362
 363/*******************************************************************************
 364 *
 365 *		Protocol management and registration routines
 366 *
 367 *******************************************************************************/
 368
 
 
 
 369
 370/*
 371 *	Add a protocol ID to the list. Now that the input handler is
 372 *	smarter we can dispense with all the messy stuff that used to be
 373 *	here.
 374 *
 375 *	BEWARE!!! Protocol handlers, mangling input packets,
 376 *	MUST BE last in hash buckets and checking protocol handlers
 377 *	MUST start from promiscuous ptype_all chain in net_bh.
 378 *	It is true now, do not change it.
 379 *	Explanation follows: if protocol handler, mangling packet, will
 380 *	be the first on list, it is not able to sense, that packet
 381 *	is cloned and should be copied-on-write, so that it will
 382 *	change it and subsequent readers will get broken packet.
 383 *							--ANK (980803)
 384 */
 385
 386static inline struct list_head *ptype_head(const struct packet_type *pt)
 387{
 388	if (pt->type == htons(ETH_P_ALL))
 389		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 390	else
 391		return pt->dev ? &pt->dev->ptype_specific :
 392				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 393}
 394
 395/**
 396 *	dev_add_pack - add packet handler
 397 *	@pt: packet type declaration
 398 *
 399 *	Add a protocol handler to the networking stack. The passed &packet_type
 400 *	is linked into kernel lists and may not be freed until it has been
 401 *	removed from the kernel lists.
 402 *
 403 *	This call does not sleep therefore it can not
 404 *	guarantee all CPU's that are in middle of receiving packets
 405 *	will see the new packet type (until the next received packet).
 406 */
 407
 408void dev_add_pack(struct packet_type *pt)
 409{
 410	struct list_head *head = ptype_head(pt);
 411
 412	spin_lock(&ptype_lock);
 413	list_add_rcu(&pt->list, head);
 414	spin_unlock(&ptype_lock);
 415}
 416EXPORT_SYMBOL(dev_add_pack);
 417
 418/**
 419 *	__dev_remove_pack	 - remove packet handler
 420 *	@pt: packet type declaration
 421 *
 422 *	Remove a protocol handler that was previously added to the kernel
 423 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424 *	from the kernel lists and can be freed or reused once this function
 425 *	returns.
 426 *
 427 *      The packet type might still be in use by receivers
 428 *	and must not be freed until after all the CPU's have gone
 429 *	through a quiescent state.
 430 */
 431void __dev_remove_pack(struct packet_type *pt)
 432{
 433	struct list_head *head = ptype_head(pt);
 434	struct packet_type *pt1;
 435
 436	spin_lock(&ptype_lock);
 437
 438	list_for_each_entry(pt1, head, list) {
 439		if (pt == pt1) {
 440			list_del_rcu(&pt->list);
 441			goto out;
 442		}
 443	}
 444
 445	pr_warn("dev_remove_pack: %p not found\n", pt);
 446out:
 447	spin_unlock(&ptype_lock);
 448}
 449EXPORT_SYMBOL(__dev_remove_pack);
 450
 451/**
 452 *	dev_remove_pack	 - remove packet handler
 453 *	@pt: packet type declaration
 454 *
 455 *	Remove a protocol handler that was previously added to the kernel
 456 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 457 *	from the kernel lists and can be freed or reused once this function
 458 *	returns.
 459 *
 460 *	This call sleeps to guarantee that no CPU is looking at the packet
 461 *	type after return.
 462 */
 463void dev_remove_pack(struct packet_type *pt)
 464{
 465	__dev_remove_pack(pt);
 466
 467	synchronize_net();
 468}
 469EXPORT_SYMBOL(dev_remove_pack);
 470
 471
 472/**
 473 *	dev_add_offload - register offload handlers
 474 *	@po: protocol offload declaration
 475 *
 476 *	Add protocol offload handlers to the networking stack. The passed
 477 *	&proto_offload is linked into kernel lists and may not be freed until
 478 *	it has been removed from the kernel lists.
 479 *
 480 *	This call does not sleep therefore it can not
 481 *	guarantee all CPU's that are in middle of receiving packets
 482 *	will see the new offload handlers (until the next received packet).
 483 */
 484void dev_add_offload(struct packet_offload *po)
 485{
 486	struct packet_offload *elem;
 487
 488	spin_lock(&offload_lock);
 489	list_for_each_entry(elem, &offload_base, list) {
 490		if (po->priority < elem->priority)
 491			break;
 492	}
 493	list_add_rcu(&po->list, elem->list.prev);
 494	spin_unlock(&offload_lock);
 495}
 496EXPORT_SYMBOL(dev_add_offload);
 497
 498/**
 499 *	__dev_remove_offload	 - remove offload handler
 500 *	@po: packet offload declaration
 501 *
 502 *	Remove a protocol offload handler that was previously added to the
 503 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 504 *	is removed from the kernel lists and can be freed or reused once this
 505 *	function returns.
 506 *
 507 *      The packet type might still be in use by receivers
 508 *	and must not be freed until after all the CPU's have gone
 509 *	through a quiescent state.
 510 */
 511static void __dev_remove_offload(struct packet_offload *po)
 512{
 513	struct list_head *head = &offload_base;
 514	struct packet_offload *po1;
 515
 516	spin_lock(&offload_lock);
 517
 518	list_for_each_entry(po1, head, list) {
 519		if (po == po1) {
 520			list_del_rcu(&po->list);
 521			goto out;
 522		}
 523	}
 524
 525	pr_warn("dev_remove_offload: %p not found\n", po);
 526out:
 527	spin_unlock(&offload_lock);
 528}
 529
 530/**
 531 *	dev_remove_offload	 - remove packet offload handler
 532 *	@po: packet offload declaration
 533 *
 534 *	Remove a packet offload handler that was previously added to the kernel
 535 *	offload handlers by dev_add_offload(). The passed &offload_type is
 536 *	removed from the kernel lists and can be freed or reused once this
 537 *	function returns.
 538 *
 539 *	This call sleeps to guarantee that no CPU is looking at the packet
 540 *	type after return.
 541 */
 542void dev_remove_offload(struct packet_offload *po)
 543{
 544	__dev_remove_offload(po);
 545
 546	synchronize_net();
 547}
 548EXPORT_SYMBOL(dev_remove_offload);
 549
 550/******************************************************************************
 551 *
 552 *		      Device Boot-time Settings Routines
 553 *
 554 ******************************************************************************/
 555
 556/* Boot time configuration table */
 557static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 558
 559/**
 560 *	netdev_boot_setup_add	- add new setup entry
 561 *	@name: name of the device
 562 *	@map: configured settings for the device
 563 *
 564 *	Adds new setup entry to the dev_boot_setup list.  The function
 565 *	returns 0 on error and 1 on success.  This is a generic routine to
 566 *	all netdevices.
 567 */
 568static int netdev_boot_setup_add(char *name, struct ifmap *map)
 569{
 570	struct netdev_boot_setup *s;
 571	int i;
 572
 573	s = dev_boot_setup;
 574	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 575		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 576			memset(s[i].name, 0, sizeof(s[i].name));
 577			strlcpy(s[i].name, name, IFNAMSIZ);
 578			memcpy(&s[i].map, map, sizeof(s[i].map));
 579			break;
 580		}
 581	}
 582
 583	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 584}
 585
 586/**
 587 * netdev_boot_setup_check	- check boot time settings
 588 * @dev: the netdevice
 589 *
 590 * Check boot time settings for the device.
 591 * The found settings are set for the device to be used
 592 * later in the device probing.
 593 * Returns 0 if no settings found, 1 if they are.
 594 */
 595int netdev_boot_setup_check(struct net_device *dev)
 596{
 597	struct netdev_boot_setup *s = dev_boot_setup;
 598	int i;
 599
 600	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 601		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 602		    !strcmp(dev->name, s[i].name)) {
 603			dev->irq = s[i].map.irq;
 604			dev->base_addr = s[i].map.base_addr;
 605			dev->mem_start = s[i].map.mem_start;
 606			dev->mem_end = s[i].map.mem_end;
 607			return 1;
 608		}
 609	}
 610	return 0;
 611}
 612EXPORT_SYMBOL(netdev_boot_setup_check);
 613
 614
 615/**
 616 * netdev_boot_base	- get address from boot time settings
 617 * @prefix: prefix for network device
 618 * @unit: id for network device
 619 *
 620 * Check boot time settings for the base address of device.
 621 * The found settings are set for the device to be used
 622 * later in the device probing.
 623 * Returns 0 if no settings found.
 624 */
 625unsigned long netdev_boot_base(const char *prefix, int unit)
 626{
 627	const struct netdev_boot_setup *s = dev_boot_setup;
 628	char name[IFNAMSIZ];
 629	int i;
 630
 631	sprintf(name, "%s%d", prefix, unit);
 632
 633	/*
 634	 * If device already registered then return base of 1
 635	 * to indicate not to probe for this interface
 636	 */
 637	if (__dev_get_by_name(&init_net, name))
 638		return 1;
 639
 640	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 641		if (!strcmp(name, s[i].name))
 642			return s[i].map.base_addr;
 643	return 0;
 644}
 645
 646/*
 647 * Saves at boot time configured settings for any netdevice.
 648 */
 649int __init netdev_boot_setup(char *str)
 650{
 651	int ints[5];
 652	struct ifmap map;
 653
 654	str = get_options(str, ARRAY_SIZE(ints), ints);
 655	if (!str || !*str)
 656		return 0;
 657
 658	/* Save settings */
 659	memset(&map, 0, sizeof(map));
 660	if (ints[0] > 0)
 661		map.irq = ints[1];
 662	if (ints[0] > 1)
 663		map.base_addr = ints[2];
 664	if (ints[0] > 2)
 665		map.mem_start = ints[3];
 666	if (ints[0] > 3)
 667		map.mem_end = ints[4];
 668
 669	/* Add new entry to the list */
 670	return netdev_boot_setup_add(str, &map);
 671}
 672
 673__setup("netdev=", netdev_boot_setup);
 674
 675/*******************************************************************************
 676 *
 677 *			    Device Interface Subroutines
 678 *
 679 *******************************************************************************/
 680
 681/**
 682 *	dev_get_iflink	- get 'iflink' value of a interface
 683 *	@dev: targeted interface
 684 *
 685 *	Indicates the ifindex the interface is linked to.
 686 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 687 */
 688
 689int dev_get_iflink(const struct net_device *dev)
 690{
 691	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 692		return dev->netdev_ops->ndo_get_iflink(dev);
 693
 694	return dev->ifindex;
 695}
 696EXPORT_SYMBOL(dev_get_iflink);
 697
 698/**
 699 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 700 *	@dev: targeted interface
 701 *	@skb: The packet.
 702 *
 703 *	For better visibility of tunnel traffic OVS needs to retrieve
 704 *	egress tunnel information for a packet. Following API allows
 705 *	user to get this info.
 706 */
 707int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 708{
 709	struct ip_tunnel_info *info;
 710
 711	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 712		return -EINVAL;
 713
 714	info = skb_tunnel_info_unclone(skb);
 715	if (!info)
 716		return -ENOMEM;
 717	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 718		return -EINVAL;
 719
 720	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 721}
 722EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 723
 724/**
 725 *	__dev_get_by_name	- find a device by its name
 726 *	@net: the applicable net namespace
 727 *	@name: name to find
 728 *
 729 *	Find an interface by name. Must be called under RTNL semaphore
 730 *	or @dev_base_lock. If the name is found a pointer to the device
 731 *	is returned. If the name is not found then %NULL is returned. The
 732 *	reference counters are not incremented so the caller must be
 733 *	careful with locks.
 734 */
 735
 736struct net_device *__dev_get_by_name(struct net *net, const char *name)
 737{
 738	struct net_device *dev;
 739	struct hlist_head *head = dev_name_hash(net, name);
 740
 741	hlist_for_each_entry(dev, head, name_hlist)
 742		if (!strncmp(dev->name, name, IFNAMSIZ))
 743			return dev;
 744
 745	return NULL;
 746}
 747EXPORT_SYMBOL(__dev_get_by_name);
 748
 749/**
 750 * dev_get_by_name_rcu	- find a device by its name
 751 * @net: the applicable net namespace
 752 * @name: name to find
 753 *
 754 * Find an interface by name.
 755 * If the name is found a pointer to the device is returned.
 756 * If the name is not found then %NULL is returned.
 757 * The reference counters are not incremented so the caller must be
 758 * careful with locks. The caller must hold RCU lock.
 759 */
 760
 761struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 762{
 763	struct net_device *dev;
 764	struct hlist_head *head = dev_name_hash(net, name);
 765
 766	hlist_for_each_entry_rcu(dev, head, name_hlist)
 767		if (!strncmp(dev->name, name, IFNAMSIZ))
 768			return dev;
 769
 770	return NULL;
 771}
 772EXPORT_SYMBOL(dev_get_by_name_rcu);
 773
 774/**
 775 *	dev_get_by_name		- find a device by its name
 776 *	@net: the applicable net namespace
 777 *	@name: name to find
 778 *
 779 *	Find an interface by name. This can be called from any
 780 *	context and does its own locking. The returned handle has
 781 *	the usage count incremented and the caller must use dev_put() to
 782 *	release it when it is no longer needed. %NULL is returned if no
 783 *	matching device is found.
 784 */
 785
 786struct net_device *dev_get_by_name(struct net *net, const char *name)
 787{
 788	struct net_device *dev;
 789
 790	rcu_read_lock();
 791	dev = dev_get_by_name_rcu(net, name);
 792	if (dev)
 793		dev_hold(dev);
 794	rcu_read_unlock();
 795	return dev;
 796}
 797EXPORT_SYMBOL(dev_get_by_name);
 798
 799/**
 800 *	__dev_get_by_index - find a device by its ifindex
 801 *	@net: the applicable net namespace
 802 *	@ifindex: index of device
 803 *
 804 *	Search for an interface by index. Returns %NULL if the device
 805 *	is not found or a pointer to the device. The device has not
 806 *	had its reference counter increased so the caller must be careful
 807 *	about locking. The caller must hold either the RTNL semaphore
 808 *	or @dev_base_lock.
 809 */
 810
 811struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 812{
 813	struct net_device *dev;
 814	struct hlist_head *head = dev_index_hash(net, ifindex);
 815
 816	hlist_for_each_entry(dev, head, index_hlist)
 817		if (dev->ifindex == ifindex)
 818			return dev;
 819
 820	return NULL;
 821}
 822EXPORT_SYMBOL(__dev_get_by_index);
 823
 824/**
 825 *	dev_get_by_index_rcu - find a device by its ifindex
 826 *	@net: the applicable net namespace
 827 *	@ifindex: index of device
 828 *
 829 *	Search for an interface by index. Returns %NULL if the device
 830 *	is not found or a pointer to the device. The device has not
 831 *	had its reference counter increased so the caller must be careful
 832 *	about locking. The caller must hold RCU lock.
 833 */
 834
 835struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 836{
 837	struct net_device *dev;
 838	struct hlist_head *head = dev_index_hash(net, ifindex);
 839
 840	hlist_for_each_entry_rcu(dev, head, index_hlist)
 841		if (dev->ifindex == ifindex)
 842			return dev;
 843
 844	return NULL;
 845}
 846EXPORT_SYMBOL(dev_get_by_index_rcu);
 847
 848
 849/**
 850 *	dev_get_by_index - find a device by its ifindex
 851 *	@net: the applicable net namespace
 852 *	@ifindex: index of device
 853 *
 854 *	Search for an interface by index. Returns NULL if the device
 855 *	is not found or a pointer to the device. The device returned has
 856 *	had a reference added and the pointer is safe until the user calls
 857 *	dev_put to indicate they have finished with it.
 858 */
 859
 860struct net_device *dev_get_by_index(struct net *net, int ifindex)
 861{
 862	struct net_device *dev;
 863
 864	rcu_read_lock();
 865	dev = dev_get_by_index_rcu(net, ifindex);
 866	if (dev)
 867		dev_hold(dev);
 868	rcu_read_unlock();
 869	return dev;
 870}
 871EXPORT_SYMBOL(dev_get_by_index);
 872
 873/**
 874 *	dev_get_by_napi_id - find a device by napi_id
 875 *	@napi_id: ID of the NAPI struct
 876 *
 877 *	Search for an interface by NAPI ID. Returns %NULL if the device
 878 *	is not found or a pointer to the device. The device has not had
 879 *	its reference counter increased so the caller must be careful
 880 *	about locking. The caller must hold RCU lock.
 881 */
 882
 883struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 884{
 885	struct napi_struct *napi;
 886
 887	WARN_ON_ONCE(!rcu_read_lock_held());
 888
 889	if (napi_id < MIN_NAPI_ID)
 890		return NULL;
 891
 892	napi = napi_by_id(napi_id);
 893
 894	return napi ? napi->dev : NULL;
 895}
 896EXPORT_SYMBOL(dev_get_by_napi_id);
 897
 898/**
 899 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 900 *	@net: network namespace
 901 *	@name: a pointer to the buffer where the name will be stored.
 902 *	@ifindex: the ifindex of the interface to get the name from.
 903 *
 904 *	The use of raw_seqcount_begin() and cond_resched() before
 905 *	retrying is required as we want to give the writers a chance
 906 *	to complete when CONFIG_PREEMPT is not set.
 907 */
 908int netdev_get_name(struct net *net, char *name, int ifindex)
 909{
 910	struct net_device *dev;
 911	unsigned int seq;
 912
 913retry:
 914	seq = raw_seqcount_begin(&devnet_rename_seq);
 915	rcu_read_lock();
 916	dev = dev_get_by_index_rcu(net, ifindex);
 917	if (!dev) {
 918		rcu_read_unlock();
 919		return -ENODEV;
 920	}
 921
 922	strcpy(name, dev->name);
 923	rcu_read_unlock();
 924	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 925		cond_resched();
 926		goto retry;
 927	}
 928
 929	return 0;
 930}
 931
 932/**
 933 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 934 *	@net: the applicable net namespace
 935 *	@type: media type of device
 936 *	@ha: hardware address
 937 *
 938 *	Search for an interface by MAC address. Returns NULL if the device
 939 *	is not found or a pointer to the device.
 940 *	The caller must hold RCU or RTNL.
 941 *	The returned device has not had its ref count increased
 942 *	and the caller must therefore be careful about locking
 943 *
 944 */
 945
 946struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 947				       const char *ha)
 948{
 949	struct net_device *dev;
 950
 951	for_each_netdev_rcu(net, dev)
 952		if (dev->type == type &&
 953		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 954			return dev;
 955
 956	return NULL;
 957}
 958EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 959
 960struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 961{
 962	struct net_device *dev;
 963
 964	ASSERT_RTNL();
 965	for_each_netdev(net, dev)
 966		if (dev->type == type)
 967			return dev;
 968
 969	return NULL;
 970}
 971EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 972
 973struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 974{
 975	struct net_device *dev, *ret = NULL;
 976
 977	rcu_read_lock();
 978	for_each_netdev_rcu(net, dev)
 979		if (dev->type == type) {
 980			dev_hold(dev);
 981			ret = dev;
 982			break;
 983		}
 984	rcu_read_unlock();
 985	return ret;
 986}
 987EXPORT_SYMBOL(dev_getfirstbyhwtype);
 988
 989/**
 990 *	__dev_get_by_flags - find any device with given flags
 991 *	@net: the applicable net namespace
 992 *	@if_flags: IFF_* values
 993 *	@mask: bitmask of bits in if_flags to check
 994 *
 995 *	Search for any interface with the given flags. Returns NULL if a device
 996 *	is not found or a pointer to the device. Must be called inside
 997 *	rtnl_lock(), and result refcount is unchanged.
 998 */
 999
1000struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1001				      unsigned short mask)
1002{
1003	struct net_device *dev, *ret;
1004
1005	ASSERT_RTNL();
1006
1007	ret = NULL;
1008	for_each_netdev(net, dev) {
1009		if (((dev->flags ^ if_flags) & mask) == 0) {
1010			ret = dev;
1011			break;
1012		}
1013	}
1014	return ret;
1015}
1016EXPORT_SYMBOL(__dev_get_by_flags);
1017
1018/**
1019 *	dev_valid_name - check if name is okay for network device
1020 *	@name: name string
1021 *
 1022 *	Network device names need to be valid file names
 1023 *	to allow sysfs to work.  We also disallow any kind of
1024 *	whitespace.
1025 */
1026bool dev_valid_name(const char *name)
1027{
1028	if (*name == '\0')
1029		return false;
1030	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1031		return false;
1032	if (!strcmp(name, ".") || !strcmp(name, ".."))
1033		return false;
1034
1035	while (*name) {
1036		if (*name == '/' || *name == ':' || isspace(*name))
1037			return false;
1038		name++;
1039	}
1040	return true;
1041}
1042EXPORT_SYMBOL(dev_valid_name);
1043
1044/**
1045 *	__dev_alloc_name - allocate a name for a device
1046 *	@net: network namespace to allocate the device name in
1047 *	@name: name format string
1048 *	@buf:  scratch buffer and result name string
1049 *
1050 *	Passed a format string - eg "lt%d" it will try and find a suitable
1051 *	id. It scans list of devices to build up a free map, then chooses
1052 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1053 *	while allocating the name and adding the device in order to avoid
1054 *	duplicates.
1055 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1056 *	Returns the number of the unit assigned or a negative errno code.
1057 */
1058
1059static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1060{
1061	int i = 0;
1062	const char *p;
1063	const int max_netdevices = 8*PAGE_SIZE;
1064	unsigned long *inuse;
1065	struct net_device *d;
1066
1067	if (!dev_valid_name(name))
1068		return -EINVAL;
1069
1070	p = strchr(name, '%');
1071	if (p) {
1072		/*
1073		 * Verify the string as this thing may have come from
1074		 * the user.  There must be either one "%d" and no other "%"
1075		 * characters.
1076		 */
1077		if (p[1] != 'd' || strchr(p + 2, '%'))
1078			return -EINVAL;
1079
1080		/* Use one page as a bit array of possible slots */
1081		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1082		if (!inuse)
1083			return -ENOMEM;
1084
1085		for_each_netdev(net, d) {
1086			if (!sscanf(d->name, name, &i))
1087				continue;
1088			if (i < 0 || i >= max_netdevices)
1089				continue;
1090
1091			/*  avoid cases where sscanf is not exact inverse of printf */
1092			snprintf(buf, IFNAMSIZ, name, i);
1093			if (!strncmp(buf, d->name, IFNAMSIZ))
1094				set_bit(i, inuse);
1095		}
1096
1097		i = find_first_zero_bit(inuse, max_netdevices);
1098		free_page((unsigned long) inuse);
1099	}
1100
1101	snprintf(buf, IFNAMSIZ, name, i);
 
1102	if (!__dev_get_by_name(net, buf))
1103		return i;
1104
1105	/* It is possible to run out of possible slots
1106	 * when the name is long and there isn't enough space left
1107	 * for the digits, or if all bits are used.
1108	 */
1109	return -ENFILE;
1110}
1111
1112static int dev_alloc_name_ns(struct net *net,
1113			     struct net_device *dev,
1114			     const char *name)
1115{
1116	char buf[IFNAMSIZ];
1117	int ret;
1118
1119	BUG_ON(!net);
1120	ret = __dev_alloc_name(net, name, buf);
1121	if (ret >= 0)
1122		strlcpy(dev->name, buf, IFNAMSIZ);
1123	return ret;
1124}
1125
1126/**
1127 *	dev_alloc_name - allocate a name for a device
1128 *	@dev: device
1129 *	@name: name format string
1130 *
1131 *	Passed a format string - eg "lt%d" it will try and find a suitable
1132 *	id. It scans list of devices to build up a free map, then chooses
1133 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1134 *	while allocating the name and adding the device in order to avoid
1135 *	duplicates.
1136 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1137 *	Returns the number of the unit assigned or a negative errno code.
1138 */
1139
int dev_alloc_name(struct net_device *dev, const char *name)
{
	/* Allocate in the namespace the device currently belongs to. */
	struct net *net = dev_net(dev);

	return dev_alloc_name_ns(net, dev, name);
}
1144EXPORT_SYMBOL(dev_alloc_name);
1145
1146int dev_get_valid_name(struct net *net, struct net_device *dev,
1147		       const char *name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1148{
1149	BUG_ON(!net);
1150
1151	if (!dev_valid_name(name))
1152		return -EINVAL;
1153
1154	if (strchr(name, '%'))
1155		return dev_alloc_name_ns(net, dev, name);
1156	else if (__dev_get_by_name(net, name))
1157		return -EEXIST;
1158	else if (dev->name != name)
1159		strlcpy(dev->name, name, IFNAMSIZ);
1160
1161	return 0;
1162}
1163EXPORT_SYMBOL(dev_get_valid_name);
1164
1165/**
1166 *	dev_change_name - change name of a device
1167 *	@dev: device
1168 *	@newname: name (or format string) must be at least IFNAMSIZ
1169 *
 1170 *	Change name of a device, can pass format strings "eth%d"
 1171 *	for wildcarding.
1172 */
 1173int dev_change_name(struct net_device *dev, const char *newname)
 1174{
	/*
	 * Overview: requires the RTNL and refuses with -EBUSY while the
	 * device has IFF_UP set.  The new name is installed in dev->name,
	 * the device is renamed via device_rename(), the name hash entry is
	 * moved and NETDEV_CHANGENAME is sent.  If a notifier fails, the
	 * whole sequence is run once more with the old name to undo the
	 * rename.  Returns 0 or a negative errno.
	 */
 1175	unsigned char old_assign_type;
 1176	char oldname[IFNAMSIZ];
 1177	int err = 0;
 1178	int ret;
 1179	struct net *net;
 1180
 1181	ASSERT_RTNL();
 1182	BUG_ON(!dev_net(dev));
 1183
 1184	net = dev_net(dev);
 1185	if (dev->flags & IFF_UP)
 1186		return -EBUSY;
 1187
	/*
	 * Open the write side of devnet_rename_seq; netdev_get_name() uses
	 * the same counter to detect a rename and retry its copy.  Every
	 * return below this point must close the section first.
	 */
 1188	write_seqcount_begin(&devnet_rename_seq);
 1189
	/* Same name as before: nothing to do. */
 1190	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1191		write_seqcount_end(&devnet_rename_seq);
 1192		return 0;
 1193	}
 1194
	/* Keep the current name so the rename can be undone. */
 1195	memcpy(oldname, dev->name, IFNAMSIZ);
 1196
	/* Validates newname (or expands a format string) into dev->name. */
 1197	err = dev_get_valid_name(net, dev, newname);
 1198	if (err < 0) {
 1199		write_seqcount_end(&devnet_rename_seq);
 1200		return err;
 1201	}
 1202
 1203	if (oldname[0] && !strchr(oldname, '%'))
 1204		netdev_info(dev, "renamed from %s\n", oldname);
 1205
 1206	old_assign_type = dev->name_assign_type;
 1207	dev->name_assign_type = NET_NAME_RENAMED;
 1208
	/*
	 * Reached a second time through the goto below when a
	 * NETDEV_CHANGENAME notifier fails; on that pass dev->name already
	 * holds the old name again and the seqcount write section has been
	 * re-opened.
	 */
 1209rollback:
 1210	ret = device_rename(&dev->dev, dev->name);
 1211	if (ret) {
 1212		memcpy(dev->name, oldname, IFNAMSIZ);
 1213		dev->name_assign_type = old_assign_type;
 1214		write_seqcount_end(&devnet_rename_seq);
 1215		return ret;
 1216	}
 1217
 1218	write_seqcount_end(&devnet_rename_seq);
 1219
 1220	netdev_adjacent_rename_links(dev, oldname);
 1221
	/*
	 * Move the device to the hash bucket of its new name: unhook it,
	 * wait for RCU readers with synchronize_rcu(), then re-insert it.
	 */
 1222	write_lock_bh(&dev_base_lock);
 1223	hlist_del_rcu(&dev->name_hlist);
 1224	write_unlock_bh(&dev_base_lock);
 1225
 1226	synchronize_rcu();
 1227
 1228	write_lock_bh(&dev_base_lock);
 1229	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 1230	write_unlock_bh(&dev_base_lock);
 1231
 1232	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1233	ret = notifier_to_errno(ret);
 1234
 1235	if (ret) {
 1236		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1237		if (err >= 0) {
			/*
			 * First notifier failure: remember it in err, swap
			 * the old and new names and assign types, and run
			 * the rename again in reverse.  NOTE(review): err is
			 * presumably negative from here on, so a failure on
			 * the second pass lands in the else branch -- confirm
			 * against notifier_to_errno().
			 */
 1238			err = ret;
 1239			write_seqcount_begin(&devnet_rename_seq);
 1240			memcpy(dev->name, oldname, IFNAMSIZ);
 1241			memcpy(oldname, newname, IFNAMSIZ);
 1242			dev->name_assign_type = old_assign_type;
 1243			old_assign_type = NET_NAME_RENAMED;
 1244			goto rollback;
 1245		} else {
 1246			pr_err("%s: name change rollback failed: %d\n",
 1247			       dev->name, ret);
 1248		}
 1249	}
 1250
 1251	return err;
 1252}
1253
1254/**
1255 *	dev_set_alias - change ifalias of a device
1256 *	@dev: device
1257 *	@alias: name up to IFALIASZ
1258 *	@len: limit of bytes to copy from info
1259 *
1260 *	Set ifalias for a device,
1261 */
1262int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1263{
1264	struct dev_ifalias *new_alias = NULL;
 
 
1265
1266	if (len >= IFALIASZ)
1267		return -EINVAL;
1268
1269	if (len) {
1270		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1271		if (!new_alias)
1272			return -ENOMEM;
1273
1274		memcpy(new_alias->ifalias, alias, len);
1275		new_alias->ifalias[len] = 0;
1276	}
1277
1278	mutex_lock(&ifalias_mutex);
1279	rcu_swap_protected(dev->ifalias, new_alias,
1280			   mutex_is_locked(&ifalias_mutex));
1281	mutex_unlock(&ifalias_mutex);
1282
1283	if (new_alias)
1284		kfree_rcu(new_alias, rcuhead);
1285
 
1286	return len;
1287}
1288
1289/**
1290 *	dev_get_alias - get ifalias of a device
1291 *	@dev: device
1292 *	@name: buffer to store name of ifalias
1293 *	@len: size of buffer
1294 *
1295 *	get ifalias for a device.  Caller must make sure dev cannot go
1296 *	away,  e.g. rcu read lock or own a reference count to device.
1297 */
1298int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1299{
1300	const struct dev_ifalias *alias;
1301	int ret = 0;
1302
1303	rcu_read_lock();
1304	alias = rcu_dereference(dev->ifalias);
1305	if (alias)
1306		ret = snprintf(name, len, "%s", alias->ifalias);
1307	rcu_read_unlock();
1308
1309	return ret;
1310}
1311
1312/**
1313 *	netdev_features_change - device changes features
1314 *	@dev: device to cause notification
1315 *
1316 *	Called to indicate a device has changed features.
1317 */
1318void netdev_features_change(struct net_device *dev)
1319{
1320	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1321}
1322EXPORT_SYMBOL(netdev_features_change);
1323
1324/**
1325 *	netdev_state_change - device changes state
1326 *	@dev: device to cause notification
1327 *
1328 *	Called to indicate a device has changed state. This function calls
1329 *	the notifier chains for netdev_chain and sends a NEWLINK message
1330 *	to the routing socket.
1331 */
1332void netdev_state_change(struct net_device *dev)
1333{
1334	if (dev->flags & IFF_UP) {
1335		struct netdev_notifier_change_info change_info = {
1336			.info.dev = dev,
1337		};
1338
1339		call_netdevice_notifiers_info(NETDEV_CHANGE,
 
1340					      &change_info.info);
1341		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1342	}
1343}
1344EXPORT_SYMBOL(netdev_state_change);
1345
1346/**
1347 * netdev_notify_peers - notify network peers about existence of @dev
1348 * @dev: network device
1349 *
1350 * Generate traffic such that interested network peers are aware of
1351 * @dev, such as by generating a gratuitous ARP. This may be used when
1352 * a device wants to inform the rest of the network about some sort of
1353 * reconfiguration such as a failover event or virtual machine
1354 * migration.
1355 */
1356void netdev_notify_peers(struct net_device *dev)
1357{
1358	rtnl_lock();
1359	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1360	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1361	rtnl_unlock();
1362}
1363EXPORT_SYMBOL(netdev_notify_peers);
1364
1365static int __dev_open(struct net_device *dev)
1366{
1367	const struct net_device_ops *ops = dev->netdev_ops;
1368	int ret;
1369
1370	ASSERT_RTNL();
1371
1372	if (!netif_device_present(dev))
1373		return -ENODEV;
1374
1375	/* Block netpoll from trying to do any rx path servicing.
1376	 * If we don't do this there is a chance ndo_poll_controller
1377	 * or ndo_poll may be running while we open the device
1378	 */
1379	netpoll_poll_disable(dev);
1380
1381	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1382	ret = notifier_to_errno(ret);
1383	if (ret)
1384		return ret;
1385
1386	set_bit(__LINK_STATE_START, &dev->state);
1387
1388	if (ops->ndo_validate_addr)
1389		ret = ops->ndo_validate_addr(dev);
1390
1391	if (!ret && ops->ndo_open)
1392		ret = ops->ndo_open(dev);
1393
1394	netpoll_poll_enable(dev);
1395
1396	if (ret)
1397		clear_bit(__LINK_STATE_START, &dev->state);
1398	else {
1399		dev->flags |= IFF_UP;
1400		dev_set_rx_mode(dev);
1401		dev_activate(dev);
1402		add_device_randomness(dev->dev_addr, dev->addr_len);
1403	}
1404
1405	return ret;
1406}
1407
1408/**
1409 *	dev_open	- prepare an interface for use.
1410 *	@dev:	device to open
1411 *
1412 *	Takes a device from down to up state. The device's private open
1413 *	function is invoked and then the multicast lists are loaded. Finally
1414 *	the device is moved into the up state and a %NETDEV_UP message is
1415 *	sent to the netdev notifier chain.
1416 *
1417 *	Calling this function on an active interface is a nop. On a failure
1418 *	a negative errno code is returned.
1419 */
1420int dev_open(struct net_device *dev)
1421{
1422	int ret;
1423
1424	if (dev->flags & IFF_UP)
1425		return 0;
1426
1427	ret = __dev_open(dev);
1428	if (ret < 0)
1429		return ret;
1430
1431	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1432	call_netdevice_notifiers(NETDEV_UP, dev);
1433
1434	return ret;
1435}
1436EXPORT_SYMBOL(dev_open);
1437
1438static void __dev_close_many(struct list_head *head)
1439{
1440	struct net_device *dev;
1441
1442	ASSERT_RTNL();
1443	might_sleep();
1444
1445	list_for_each_entry(dev, head, close_list) {
1446		/* Temporarily disable netpoll until the interface is down */
1447		netpoll_poll_disable(dev);
1448
1449		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1450
1451		clear_bit(__LINK_STATE_START, &dev->state);
1452
1453		/* Synchronize to scheduled poll. We cannot touch poll list, it
1454		 * can be even on different cpu. So just clear netif_running().
1455		 *
1456		 * dev->stop() will invoke napi_disable() on all of it's
1457		 * napi_struct instances on this device.
1458		 */
1459		smp_mb__after_atomic(); /* Commit netif_running(). */
1460	}
1461
1462	dev_deactivate_many(head);
1463
1464	list_for_each_entry(dev, head, close_list) {
1465		const struct net_device_ops *ops = dev->netdev_ops;
1466
1467		/*
1468		 *	Call the device specific close. This cannot fail.
1469		 *	Only if device is UP
1470		 *
1471		 *	We allow it to be called even after a DETACH hot-plug
1472		 *	event.
1473		 */
1474		if (ops->ndo_stop)
1475			ops->ndo_stop(dev);
1476
1477		dev->flags &= ~IFF_UP;
1478		netpoll_poll_enable(dev);
1479	}
 
 
1480}
1481
1482static void __dev_close(struct net_device *dev)
1483{
 
1484	LIST_HEAD(single);
1485
1486	list_add(&dev->close_list, &single);
1487	__dev_close_many(&single);
1488	list_del(&single);
 
 
1489}
1490
1491void dev_close_many(struct list_head *head, bool unlink)
1492{
1493	struct net_device *dev, *tmp;
1494
1495	/* Remove the devices that don't need to be closed */
1496	list_for_each_entry_safe(dev, tmp, head, close_list)
1497		if (!(dev->flags & IFF_UP))
1498			list_del_init(&dev->close_list);
1499
1500	__dev_close_many(head);
1501
1502	list_for_each_entry_safe(dev, tmp, head, close_list) {
1503		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1504		call_netdevice_notifiers(NETDEV_DOWN, dev);
1505		if (unlink)
1506			list_del_init(&dev->close_list);
1507	}
 
 
1508}
1509EXPORT_SYMBOL(dev_close_many);
1510
1511/**
1512 *	dev_close - shutdown an interface.
1513 *	@dev: device to shutdown
1514 *
1515 *	This function moves an active device into down state. A
1516 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1517 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1518 *	chain.
1519 */
1520void dev_close(struct net_device *dev)
1521{
1522	if (dev->flags & IFF_UP) {
1523		LIST_HEAD(single);
1524
1525		list_add(&dev->close_list, &single);
1526		dev_close_many(&single, true);
1527		list_del(&single);
1528	}
 
1529}
1530EXPORT_SYMBOL(dev_close);
1531
1532
1533/**
1534 *	dev_disable_lro - disable Large Receive Offload on a device
1535 *	@dev: device
1536 *
1537 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1538 *	called under RTNL.  This is needed if received packets may be
1539 *	forwarded to another interface.
1540 */
1541void dev_disable_lro(struct net_device *dev)
1542{
1543	struct net_device *lower_dev;
1544	struct list_head *iter;
1545
1546	dev->wanted_features &= ~NETIF_F_LRO;
1547	netdev_update_features(dev);
1548
1549	if (unlikely(dev->features & NETIF_F_LRO))
1550		netdev_WARN(dev, "failed to disable LRO!\n");
1551
1552	netdev_for_each_lower_dev(dev, lower_dev, iter)
1553		dev_disable_lro(lower_dev);
1554}
1555EXPORT_SYMBOL(dev_disable_lro);
1556
1557/**
1558 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1559 *	@dev: device
1560 *
1561 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1562 *	called under RTNL.  This is needed if Generic XDP is installed on
1563 *	the device.
1564 */
1565static void dev_disable_gro_hw(struct net_device *dev)
1566{
1567	dev->wanted_features &= ~NETIF_F_GRO_HW;
1568	netdev_update_features(dev);
1569
1570	if (unlikely(dev->features & NETIF_F_GRO_HW))
1571		netdev_WARN(dev, "failed to disable GRO_HW!\n");
1572}
1573
1574const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1575{
1576#define N(val) 						\
1577	case NETDEV_##val:				\
1578		return "NETDEV_" __stringify(val);
1579	switch (cmd) {
1580	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1581	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1582	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1583	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1584	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1585	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1586	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1587	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1588	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1589	};
1590#undef N
1591	return "UNKNOWN_NETDEV_EVENT";
1592}
1593EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1594
1595static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1596				   struct net_device *dev)
1597{
1598	struct netdev_notifier_info info = {
1599		.dev = dev,
1600	};
1601
 
1602	return nb->notifier_call(nb, val, &info);
1603}
1604
 /* While non-zero, register_netdevice_notifier() registers the notifier but skips replaying REGISTER/UP events to it. */
 1605static int dev_boot_phase = 1;
1606
1607/**
1608 * register_netdevice_notifier - register a network notifier block
1609 * @nb: notifier
1610 *
1611 * Register a notifier to be called when network device events occur.
1612 * The notifier passed is linked into the kernel structures and must
1613 * not be reused until it has been unregistered. A negative errno code
1614 * is returned on a failure.
1615 *
1616 * When registered all registration and up events are replayed
1617 * to the new notifier to allow device to have a race free
1618 * view of the network device list.
1619 */
1620
1621int register_netdevice_notifier(struct notifier_block *nb)
1622{
1623	struct net_device *dev;
1624	struct net_device *last;
1625	struct net *net;
1626	int err;
1627
1628	/* Close race with setup_net() and cleanup_net() */
1629	down_write(&pernet_ops_rwsem);
1630	rtnl_lock();
1631	err = raw_notifier_chain_register(&netdev_chain, nb);
1632	if (err)
1633		goto unlock;
1634	if (dev_boot_phase)
1635		goto unlock;
1636	for_each_net(net) {
1637		for_each_netdev(net, dev) {
1638			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1639			err = notifier_to_errno(err);
1640			if (err)
1641				goto rollback;
1642
1643			if (!(dev->flags & IFF_UP))
1644				continue;
1645
1646			call_netdevice_notifier(nb, NETDEV_UP, dev);
1647		}
1648	}
1649
1650unlock:
1651	rtnl_unlock();
1652	up_write(&pernet_ops_rwsem);
1653	return err;
1654
1655rollback:
1656	last = dev;
1657	for_each_net(net) {
1658		for_each_netdev(net, dev) {
1659			if (dev == last)
1660				goto outroll;
1661
1662			if (dev->flags & IFF_UP) {
1663				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1664							dev);
1665				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1666			}
1667			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1668		}
1669	}
1670
1671outroll:
1672	raw_notifier_chain_unregister(&netdev_chain, nb);
1673	goto unlock;
1674}
1675EXPORT_SYMBOL(register_netdevice_notifier);
1676
1677/**
1678 * unregister_netdevice_notifier - unregister a network notifier block
1679 * @nb: notifier
1680 *
1681 * Unregister a notifier previously registered by
 1682 * register_netdevice_notifier(). The notifier is unlinked from the
1683 * kernel structures and may then be reused. A negative errno code
1684 * is returned on a failure.
1685 *
1686 * After unregistering unregister and down device events are synthesized
1687 * for all devices on the device list to the removed notifier to remove
1688 * the need for special case cleanup code.
1689 */
1690
1691int unregister_netdevice_notifier(struct notifier_block *nb)
1692{
1693	struct net_device *dev;
1694	struct net *net;
1695	int err;
1696
1697	/* Close race with setup_net() and cleanup_net() */
1698	down_write(&pernet_ops_rwsem);
1699	rtnl_lock();
1700	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1701	if (err)
1702		goto unlock;
1703
1704	for_each_net(net) {
1705		for_each_netdev(net, dev) {
1706			if (dev->flags & IFF_UP) {
1707				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1708							dev);
1709				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1710			}
1711			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1712		}
1713	}
1714unlock:
1715	rtnl_unlock();
1716	up_write(&pernet_ops_rwsem);
1717	return err;
1718}
1719EXPORT_SYMBOL(unregister_netdevice_notifier);
1720
1721/**
1722 *	call_netdevice_notifiers_info - call all network notifier blocks
1723 *	@val: value passed unmodified to notifier function
 
1724 *	@info: notifier information data
1725 *
1726 *	Call all network notifier blocks.  Parameters and return value
1727 *	are as for raw_notifier_call_chain().
1728 */
1729
1730static int call_netdevice_notifiers_info(unsigned long val,
 
1731					 struct netdev_notifier_info *info)
1732{
1733	ASSERT_RTNL();
 
1734	return raw_notifier_call_chain(&netdev_chain, val, info);
1735}
1736
1737/**
1738 *	call_netdevice_notifiers - call all network notifier blocks
1739 *      @val: value passed unmodified to notifier function
1740 *      @dev: net_device pointer passed unmodified to notifier function
1741 *
1742 *	Call all network notifier blocks.  Parameters and return value
1743 *	are as for raw_notifier_call_chain().
1744 */
1745
1746int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1747{
1748	struct netdev_notifier_info info = {
1749		.dev = dev,
1750	};
1751
1752	return call_netdevice_notifiers_info(val, &info);
1753}
1754EXPORT_SYMBOL(call_netdevice_notifiers);
1755
#ifdef CONFIG_NET_INGRESS
/* Static key counting the users that need ingress processing. */
static struct static_key ingress_needed __read_mostly;

/* Take one reference on the ingress static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on the ingress static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1771
#ifdef CONFIG_NET_EGRESS
/* Static key counting the users that need egress processing. */
static struct static_key egress_needed __read_mostly;

/* Take one reference on the egress static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on the egress static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1787
1788static struct static_key netstamp_needed __read_mostly;
1789#ifdef HAVE_JUMP_LABEL
1790static atomic_t netstamp_needed_deferred;
1791static atomic_t netstamp_wanted;
1792static void netstamp_clear(struct work_struct *work)
1793{
1794	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1795	int wanted;
1796
1797	wanted = atomic_add_return(deferred, &netstamp_wanted);
1798	if (wanted > 0)
1799		static_key_enable(&netstamp_needed);
1800	else
1801		static_key_disable(&netstamp_needed);
1802}
1803static DECLARE_WORK(netstamp_work, netstamp_clear);
1804#endif
1805
1806void net_enable_timestamp(void)
1807{
1808#ifdef HAVE_JUMP_LABEL
1809	int wanted;
1810
1811	while (1) {
1812		wanted = atomic_read(&netstamp_wanted);
1813		if (wanted <= 0)
1814			break;
1815		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1816			return;
1817	}
1818	atomic_inc(&netstamp_needed_deferred);
1819	schedule_work(&netstamp_work);
1820#else
1821	static_key_slow_inc(&netstamp_needed);
1822#endif
1823}
1824EXPORT_SYMBOL(net_enable_timestamp);
1825
1826void net_disable_timestamp(void)
1827{
1828#ifdef HAVE_JUMP_LABEL
1829	int wanted;
1830
1831	while (1) {
1832		wanted = atomic_read(&netstamp_wanted);
1833		if (wanted <= 1)
1834			break;
1835		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1836			return;
1837	}
1838	atomic_dec(&netstamp_needed_deferred);
1839	schedule_work(&netstamp_work);
1840#else
1841	static_key_slow_dec(&netstamp_needed);
1842#endif
1843}
1844EXPORT_SYMBOL(net_disable_timestamp);
1845
1846static inline void net_timestamp_set(struct sk_buff *skb)
1847{
1848	skb->tstamp = 0;
1849	if (static_key_false(&netstamp_needed))
1850		__net_timestamp(skb);
1851}
1852
/*
 * net_timestamp_check - timestamp @SKB if timestamps are wanted
 * @COND: extra condition that must hold for the skb to be stamped
 * @SKB: buffer to stamp; left alone when it already carries a timestamp
 *
 * Does nothing unless the netstamp_needed static key is enabled.  Wrapped
 * in do { } while (0) so the macro expands to a single statement and cannot
 * capture a following "else"; the last line carries no line continuation.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp)		\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1858
1859bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1860{
1861	unsigned int len;
1862
1863	if (!(dev->flags & IFF_UP))
1864		return false;
1865
1866	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1867	if (skb->len <= len)
1868		return true;
1869
1870	/* if TSO is enabled, we don't care about the length as the packet
1871	 * could be forwarded without being segmented before
1872	 */
1873	if (skb_is_gso(skb))
1874		return true;
1875
1876	return false;
1877}
1878EXPORT_SYMBOL_GPL(is_skb_forwardable);
1879
1880int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1881{
1882	int ret = ____dev_forward_skb(dev, skb);
1883
1884	if (likely(!ret)) {
1885		skb->protocol = eth_type_trans(skb, dev);
1886		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1887	}
1888
1889	return ret;
1890}
1891EXPORT_SYMBOL_GPL(__dev_forward_skb);
1892
1893/**
1894 * dev_forward_skb - loopback an skb to another netif
1895 *
1896 * @dev: destination network device
1897 * @skb: buffer to forward
1898 *
1899 * return values:
1900 *	NET_RX_SUCCESS	(no congestion)
1901 *	NET_RX_DROP     (packet was dropped, but freed)
1902 *
1903 * dev_forward_skb can be used for injecting an skb from the
1904 * start_xmit function of one device into the receive queue
1905 * of another device.
1906 *
1907 * The receiving device may be in another namespace, so
1908 * we have to clear all information in the skb that could
1909 * impact namespace isolation.
1910 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	/* Only queue the skb for reception when the preparation succeeded. */
	if (err)
		return err;

	return netif_rx_internal(skb);
}
1915EXPORT_SYMBOL_GPL(dev_forward_skb);
1916
1917static inline int deliver_skb(struct sk_buff *skb,
1918			      struct packet_type *pt_prev,
1919			      struct net_device *orig_dev)
1920{
1921	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1922		return -ENOMEM;
1923	refcount_inc(&skb->users);
1924	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1925}
1926
1927static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1928					  struct packet_type **pt,
1929					  struct net_device *orig_dev,
1930					  __be16 type,
1931					  struct list_head *ptype_list)
1932{
1933	struct packet_type *ptype, *pt_prev = *pt;
1934
1935	list_for_each_entry_rcu(ptype, ptype_list, list) {
1936		if (ptype->type != type)
1937			continue;
1938		if (pt_prev)
1939			deliver_skb(skb, pt_prev, orig_dev);
1940		pt_prev = ptype;
1941	}
1942	*pt = pt_prev;
1943}
1944
1945static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1946{
1947	if (!ptype->af_packet_priv || !skb->sk)
1948		return false;
1949
1950	if (ptype->id_match)
1951		return ptype->id_match(ptype, skb->sk);
1952	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1953		return true;
1954
1955	return false;
1956}
1957
1958/*
1959 *	Support routine. Sends outgoing frames to any network
1960 *	taps currently in use.
1961 */
1962
 1963void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 1964{
	/*
	 * Overview: walks the global ptype_all list and then dev->ptype_all.
	 * The skb is cloned once (skb2), on the first handler that is not
	 * skipped.  Delivery lags one handler behind (pt_prev): earlier
	 * handlers are served through deliver_skb(), and the last one is
	 * called directly at out_unlock, or skb2 is freed there.
	 */
 1965	struct packet_type *ptype;
 1966	struct sk_buff *skb2 = NULL;
 1967	struct packet_type *pt_prev = NULL;
 1968	struct list_head *ptype_list = &ptype_all;
 1969
 1970	rcu_read_lock();
 1971again:
 1972	list_for_each_entry_rcu(ptype, ptype_list, list) {
 1973		/* Never send packets back to the socket
 1974		 * they originated from - MvS (miquels@drinkel.ow.org)
 1975		 */
 1976		if (skb_loop_sk(ptype, skb))
 1977			continue;
 1978
		/* A handler is already pending: serve it, then queue this one. */
 1979		if (pt_prev) {
 1980			deliver_skb(skb2, pt_prev, skb->dev);
 1981			pt_prev = ptype;
 1982			continue;
 1983		}
 1984
 1985		/* need to clone skb, done only once */
 1986		skb2 = skb_clone(skb, GFP_ATOMIC);
		/* pt_prev is still NULL here, so nothing is delivered on failure. */
 1987		if (!skb2)
 1988			goto out_unlock;
 1989
 1990		net_timestamp_set(skb2);
 1991
 1992		/* skb->nh should be correctly
 1993		 * set by sender, so that the second statement is
 1994		 * just protection against buggy protocols.
 1995		 */
 1996		skb_reset_mac_header(skb2);
 1997
 1998		if (skb_network_header(skb2) < skb2->data ||
 1999		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 2000			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 2001					     ntohs(skb2->protocol),
 2002					     dev->name);
 2003			skb_reset_network_header(skb2);
 2004		}
 2005
 2006		skb2->transport_header = skb2->network_header;
 2007		skb2->pkt_type = PACKET_OUTGOING;
 2008		pt_prev = ptype;
 2009	}
 2010
	/* After the global list, run the same loop over the per-device list. */
 2011	if (ptype_list == &ptype_all) {
 2012		ptype_list = &dev->ptype_all;
 2013		goto again;
 2014	}
 2015out_unlock:
	/*
	 * The last pending handler is called directly, without the extra
	 * reference deliver_skb() takes; if skb_orphan_frags_rx() fails,
	 * skb2 is freed instead.
	 */
 2016	if (pt_prev) {
 2017		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 2018			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 2019		else
 2020			kfree_skb(skb2);
 2021	}
 2022	rcu_read_unlock();
 2023}
2024EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2025
2026/**
2027 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2028 * @dev: Network device
2029 * @txq: number of queues available
2030 *
2031 * If real_num_tx_queues is changed the tc mappings may no longer be
2032 * valid. To resolve this verify the tc mapping remains valid and if
2033 * not NULL the mapping. With no priorities mapping to this
2034 * offset/count pair it will no longer be used. In the worst case TC0
 2035 * is invalid nothing can be done so disable priority mappings. It is
2036 * expected that drivers will fix this mapping if they can before
2037 * calling netif_set_real_num_tx_queues.
2038 */
2039static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2040{
2041	int i;
2042	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2043
2044	/* If TC0 is invalidated disable TC mapping */
2045	if (tc->offset + tc->count > txq) {
2046		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2047		dev->num_tc = 0;
2048		return;
2049	}
2050
2051	/* Invalidated prio to tc mappings set to TC0 */
2052	for (i = 1; i < TC_BITMASK + 1; i++) {
2053		int q = netdev_get_prio_tc_map(dev, i);
2054
2055		tc = &dev->tc_to_txq[q];
2056		if (tc->offset + tc->count > txq) {
2057			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2058				i, q);
2059			netdev_set_prio_tc_map(dev, i, 0);
2060		}
2061	}
2062}
2063
2064int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2065{
2066	if (dev->num_tc) {
2067		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2068		int i;
2069
2070		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2071			if ((txq - tc->offset) < tc->count)
2072				return i;
2073		}
2074
2075		return -1;
2076	}
2077
2078	return 0;
2079}
2080EXPORT_SYMBOL(netdev_txq_to_tc);
2081
2082#ifdef CONFIG_XPS
/* Mutex named as the lockdep condition of xmap_dereference() below. */
static DEFINE_MUTEX(xps_map_mutex);

/* rcu_dereference_protected() of an XPS pointer, checked against xps_map_mutex. */
#define xmap_dereference(ptr)						\
	rcu_dereference_protected((ptr),				\
				  lockdep_is_held(&xps_map_mutex))
2086
2087static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2088			     int tci, u16 index)
2089{
2090	struct xps_map *map = NULL;
2091	int pos;
2092
2093	if (dev_maps)
2094		map = xmap_dereference(dev_maps->cpu_map[tci]);
2095	if (!map)
2096		return false;
2097
2098	for (pos = map->len; pos--;) {
2099		if (map->queues[pos] != index)
2100			continue;
2101
2102		if (map->len > 1) {
2103			map->queues[pos] = map->queues[--map->len];
2104			break;
2105		}
2106
2107		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2108		kfree_rcu(map, rcu);
2109		return false;
2110	}
2111
2112	return true;
2113}
2114
2115static bool remove_xps_queue_cpu(struct net_device *dev,
2116				 struct xps_dev_maps *dev_maps,
2117				 int cpu, u16 offset, u16 count)
2118{
2119	int num_tc = dev->num_tc ? : 1;
2120	bool active = false;
2121	int tci;
2122
2123	for (tci = cpu * num_tc; num_tc--; tci++) {
2124		int i, j;
2125
2126		for (i = count, j = offset; i--; j++) {
2127			if (!remove_xps_queue(dev_maps, tci, j))
2128				break;
2129		}
2130
2131		active |= i < 0;
2132	}
2133
2134	return active;
2135}
2136
2137static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2138				   u16 count)
2139{
2140	struct xps_dev_maps *dev_maps;
2141	int cpu, i;
2142	bool active = false;
2143
2144	mutex_lock(&xps_map_mutex);
2145	dev_maps = xmap_dereference(dev->xps_maps);
2146
2147	if (!dev_maps)
2148		goto out_no_maps;
2149
2150	for_each_possible_cpu(cpu)
2151		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2152					       offset, count);
2153
2154	if (!active) {
2155		RCU_INIT_POINTER(dev->xps_maps, NULL);
2156		kfree_rcu(dev_maps, rcu);
2157	}
2158
2159	for (i = offset + (count - 1); count--; i--)
2160		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2161					     NUMA_NO_NODE);
2162
2163out_no_maps:
2164	mutex_unlock(&xps_map_mutex);
2165}
2166
2167static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2168{
2169	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2170}
2171
2172static struct xps_map *expand_xps_map(struct xps_map *map,
2173				      int cpu, u16 index)
2174{
2175	struct xps_map *new_map;
2176	int alloc_len = XPS_MIN_MAP_ALLOC;
2177	int i, pos;
2178
2179	for (pos = 0; map && pos < map->len; pos++) {
2180		if (map->queues[pos] != index)
2181			continue;
2182		return map;
2183	}
2184
2185	/* Need to add queue to this CPU's existing map */
2186	if (map) {
2187		if (pos < map->alloc_len)
2188			return map;
2189
2190		alloc_len = map->alloc_len * 2;
2191	}
2192
2193	/* Need to allocate new map to store queue on this CPU's map */
2194	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2195			       cpu_to_node(cpu));
2196	if (!new_map)
2197		return NULL;
2198
2199	for (i = 0; i < pos; i++)
2200		new_map->queues[i] = map->queues[i];
2201	new_map->alloc_len = alloc_len;
2202	new_map->len = pos;
2203
2204	return new_map;
2205}
2206
2207int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2208			u16 index)
2209{
2210	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2211	int i, cpu, tci, numa_node_id = -2;
2212	int maps_sz, num_tc = 1, tc = 0;
2213	struct xps_map *map, *new_map;
2214	bool active = false;
2215
2216	if (dev->num_tc) {
2217		num_tc = dev->num_tc;
2218		tc = netdev_txq_to_tc(dev, index);
2219		if (tc < 0)
2220			return -EINVAL;
2221	}
2222
2223	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2224	if (maps_sz < L1_CACHE_BYTES)
2225		maps_sz = L1_CACHE_BYTES;
2226
2227	mutex_lock(&xps_map_mutex);
2228
2229	dev_maps = xmap_dereference(dev->xps_maps);
2230
2231	/* allocate memory for queue storage */
2232	for_each_cpu_and(cpu, cpu_online_mask, mask) {
2233		if (!new_dev_maps)
2234			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2235		if (!new_dev_maps) {
2236			mutex_unlock(&xps_map_mutex);
2237			return -ENOMEM;
2238		}
2239
2240		tci = cpu * num_tc + tc;
2241		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2242				 NULL;
2243
2244		map = expand_xps_map(map, cpu, index);
2245		if (!map)
2246			goto error;
2247
2248		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2249	}
2250
2251	if (!new_dev_maps)
2252		goto out_no_new_maps;
2253
2254	for_each_possible_cpu(cpu) {
2255		/* copy maps belonging to foreign traffic classes */
2256		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2257			/* fill in the new device map from the old device map */
2258			map = xmap_dereference(dev_maps->cpu_map[tci]);
2259			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2260		}
2261
2262		/* We need to explicitly update tci as prevous loop
2263		 * could break out early if dev_maps is NULL.
2264		 */
2265		tci = cpu * num_tc + tc;
2266
2267		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2268			/* add queue to CPU maps */
2269			int pos = 0;
2270
2271			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2272			while ((pos < map->len) && (map->queues[pos] != index))
2273				pos++;
2274
2275			if (pos == map->len)
2276				map->queues[map->len++] = index;
2277#ifdef CONFIG_NUMA
2278			if (numa_node_id == -2)
2279				numa_node_id = cpu_to_node(cpu);
2280			else if (numa_node_id != cpu_to_node(cpu))
2281				numa_node_id = -1;
2282#endif
2283		} else if (dev_maps) {
2284			/* fill in the new device map from the old device map */
2285			map = xmap_dereference(dev_maps->cpu_map[tci]);
2286			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2287		}
2288
2289		/* copy maps belonging to foreign traffic classes */
2290		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2291			/* fill in the new device map from the old device map */
2292			map = xmap_dereference(dev_maps->cpu_map[tci]);
2293			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2294		}
2295	}
2296
2297	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2298
2299	/* Cleanup old maps */
2300	if (!dev_maps)
2301		goto out_no_old_maps;
2302
2303	for_each_possible_cpu(cpu) {
2304		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2305			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2306			map = xmap_dereference(dev_maps->cpu_map[tci]);
2307			if (map && map != new_map)
2308				kfree_rcu(map, rcu);
2309		}
2310	}
2311
2312	kfree_rcu(dev_maps, rcu);
2313
2314out_no_old_maps:
2315	dev_maps = new_dev_maps;
2316	active = true;
2317
2318out_no_new_maps:
2319	/* update Tx queue numa node */
2320	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2321				     (numa_node_id >= 0) ? numa_node_id :
2322				     NUMA_NO_NODE);
2323
2324	if (!dev_maps)
2325		goto out_no_maps;
2326
2327	/* removes queue from unused CPUs */
2328	for_each_possible_cpu(cpu) {
2329		for (i = tc, tci = cpu * num_tc; i--; tci++)
2330			active |= remove_xps_queue(dev_maps, tci, index);
2331		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2332			active |= remove_xps_queue(dev_maps, tci, index);
2333		for (i = num_tc - tc, tci++; --i; tci++)
2334			active |= remove_xps_queue(dev_maps, tci, index);
2335	}
2336
2337	/* free map if not active */
2338	if (!active) {
2339		RCU_INIT_POINTER(dev->xps_maps, NULL);
2340		kfree_rcu(dev_maps, rcu);
2341	}
2342
2343out_no_maps:
2344	mutex_unlock(&xps_map_mutex);
2345
2346	return 0;
2347error:
2348	/* remove any maps that we added */
2349	for_each_possible_cpu(cpu) {
2350		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2351			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2352			map = dev_maps ?
2353			      xmap_dereference(dev_maps->cpu_map[tci]) :
2354			      NULL;
2355			if (new_map && new_map != map)
2356				kfree(new_map);
2357		}
2358	}
2359
2360	mutex_unlock(&xps_map_mutex);
2361
2362	kfree(new_dev_maps);
2363	return -ENOMEM;
2364}
2365EXPORT_SYMBOL(netif_set_xps_queue);
2366
2367#endif
2368void netdev_reset_tc(struct net_device *dev)
2369{
2370#ifdef CONFIG_XPS
2371	netif_reset_xps_queues_gt(dev, 0);
2372#endif
2373	dev->num_tc = 0;
2374	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2375	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2376}
2377EXPORT_SYMBOL(netdev_reset_tc);
2378
2379int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2380{
2381	if (tc >= dev->num_tc)
2382		return -EINVAL;
2383
2384#ifdef CONFIG_XPS
2385	netif_reset_xps_queues(dev, offset, count);
2386#endif
2387	dev->tc_to_txq[tc].count = count;
2388	dev->tc_to_txq[tc].offset = offset;
2389	return 0;
2390}
2391EXPORT_SYMBOL(netdev_set_tc_queue);
2392
2393int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2394{
2395	if (num_tc > TC_MAX_QUEUE)
2396		return -EINVAL;
2397
2398#ifdef CONFIG_XPS
2399	netif_reset_xps_queues_gt(dev, 0);
2400#endif
2401	dev->num_tc = num_tc;
2402	return 0;
2403}
2404EXPORT_SYMBOL(netdev_set_num_tc);
2405
2406/*
2407 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2408 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2409 */
2410int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2411{
2412	bool disabling;
2413	int rc;
2414
2415	disabling = txq < dev->real_num_tx_queues;
2416
2417	if (txq < 1 || txq > dev->num_tx_queues)
2418		return -EINVAL;
2419
2420	if (dev->reg_state == NETREG_REGISTERED ||
2421	    dev->reg_state == NETREG_UNREGISTERING) {
2422		ASSERT_RTNL();
2423
2424		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2425						  txq);
2426		if (rc)
2427			return rc;
2428
2429		if (dev->num_tc)
2430			netif_setup_tc(dev, txq);
2431
2432		dev->real_num_tx_queues = txq;
2433
2434		if (disabling) {
2435			synchronize_net();
2436			qdisc_reset_all_tx_gt(dev, txq);
2437#ifdef CONFIG_XPS
2438			netif_reset_xps_queues_gt(dev, txq);
2439#endif
2440		}
2441	} else {
2442		dev->real_num_tx_queues = txq;
2443	}
2444
 
2445	return 0;
2446}
2447EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2448
2449#ifdef CONFIG_SYSFS
2450/**
2451 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2452 *	@dev: Network device
2453 *	@rxq: Actual number of RX queues
2454 *
2455 *	This must be called either with the rtnl_lock held or before
2456 *	registration of the net device.  Returns 0 on success, or a
2457 *	negative error code.  If called before registration, it always
2458 *	succeeds.
2459 */
2460int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2461{
2462	int rc;
2463
2464	if (rxq < 1 || rxq > dev->num_rx_queues)
2465		return -EINVAL;
2466
2467	if (dev->reg_state == NETREG_REGISTERED) {
2468		ASSERT_RTNL();
2469
2470		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2471						  rxq);
2472		if (rc)
2473			return rc;
2474	}
2475
2476	dev->real_num_rx_queues = rxq;
2477	return 0;
2478}
2479EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2480#endif
2481
2482/**
2483 * netif_get_num_default_rss_queues - default number of RSS queues
2484 *
2485 * This routine should set an upper limit on the number of RSS queues
2486 * used by default by multiqueue devices.
2487 */
2488int netif_get_num_default_rss_queues(void)
2489{
2490	return is_kdump_kernel() ?
2491		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2492}
2493EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2494
2495static void __netif_reschedule(struct Qdisc *q)
2496{
2497	struct softnet_data *sd;
2498	unsigned long flags;
2499
2500	local_irq_save(flags);
2501	sd = this_cpu_ptr(&softnet_data);
2502	q->next_sched = NULL;
2503	*sd->output_queue_tailp = q;
2504	sd->output_queue_tailp = &q->next_sched;
2505	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2506	local_irq_restore(flags);
2507}
2508
2509void __netif_schedule(struct Qdisc *q)
2510{
2511	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2512		__netif_reschedule(q);
2513}
2514EXPORT_SYMBOL(__netif_schedule);
2515
2516struct dev_kfree_skb_cb {
2517	enum skb_free_reason reason;
2518};
2519
2520static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2521{
2522	return (struct dev_kfree_skb_cb *)skb->cb;
2523}
2524
2525void netif_schedule_queue(struct netdev_queue *txq)
2526{
2527	rcu_read_lock();
2528	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2529		struct Qdisc *q = rcu_dereference(txq->qdisc);
2530
2531		__netif_schedule(q);
2532	}
2533	rcu_read_unlock();
2534}
2535EXPORT_SYMBOL(netif_schedule_queue);
2536
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2537void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2538{
2539	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2540		struct Qdisc *q;
2541
2542		rcu_read_lock();
2543		q = rcu_dereference(dev_queue->qdisc);
2544		__netif_schedule(q);
2545		rcu_read_unlock();
2546	}
2547}
2548EXPORT_SYMBOL(netif_tx_wake_queue);
2549
2550void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2551{
2552	unsigned long flags;
2553
2554	if (unlikely(!skb))
2555		return;
2556
2557	if (likely(refcount_read(&skb->users) == 1)) {
2558		smp_rmb();
2559		refcount_set(&skb->users, 0);
2560	} else if (likely(!refcount_dec_and_test(&skb->users))) {
2561		return;
2562	}
2563	get_kfree_skb_cb(skb)->reason = reason;
2564	local_irq_save(flags);
2565	skb->next = __this_cpu_read(softnet_data.completion_queue);
2566	__this_cpu_write(softnet_data.completion_queue, skb);
2567	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2568	local_irq_restore(flags);
2569}
2570EXPORT_SYMBOL(__dev_kfree_skb_irq);
2571
2572void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2573{
2574	if (in_irq() || irqs_disabled())
2575		__dev_kfree_skb_irq(skb, reason);
2576	else
2577		dev_kfree_skb(skb);
2578}
2579EXPORT_SYMBOL(__dev_kfree_skb_any);
2580
2581
2582/**
2583 * netif_device_detach - mark device as removed
2584 * @dev: network device
2585 *
2586 * Mark device as removed from system and therefore no longer available.
2587 */
2588void netif_device_detach(struct net_device *dev)
2589{
2590	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2591	    netif_running(dev)) {
2592		netif_tx_stop_all_queues(dev);
2593	}
2594}
2595EXPORT_SYMBOL(netif_device_detach);
2596
2597/**
2598 * netif_device_attach - mark device as attached
2599 * @dev: network device
2600 *
2601 * Mark device as attached from system and restart if needed.
2602 */
2603void netif_device_attach(struct net_device *dev)
2604{
2605	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2606	    netif_running(dev)) {
2607		netif_tx_wake_all_queues(dev);
2608		__netdev_watchdog_up(dev);
2609	}
2610}
2611EXPORT_SYMBOL(netif_device_attach);
2612
2613/*
2614 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2615 * to be used as a distribution range.
2616 */
2617u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2618		  unsigned int num_tx_queues)
2619{
2620	u32 hash;
2621	u16 qoffset = 0;
2622	u16 qcount = num_tx_queues;
2623
2624	if (skb_rx_queue_recorded(skb)) {
2625		hash = skb_get_rx_queue(skb);
2626		while (unlikely(hash >= num_tx_queues))
2627			hash -= num_tx_queues;
2628		return hash;
2629	}
2630
2631	if (dev->num_tc) {
2632		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2633
2634		qoffset = dev->tc_to_txq[tc].offset;
2635		qcount = dev->tc_to_txq[tc].count;
2636	}
2637
2638	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2639}
2640EXPORT_SYMBOL(__skb_tx_hash);
2641
2642static void skb_warn_bad_offload(const struct sk_buff *skb)
2643{
2644	static const netdev_features_t null_features;
2645	struct net_device *dev = skb->dev;
2646	const char *name = "";
2647
2648	if (!net_ratelimit())
2649		return;
2650
2651	if (dev) {
2652		if (dev->dev.parent)
2653			name = dev_driver_string(dev->dev.parent);
2654		else
2655			name = netdev_name(dev);
2656	}
2657	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2658	     "gso_type=%d ip_summed=%d\n",
2659	     name, dev ? &dev->features : &null_features,
2660	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2661	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2662	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2663}
2664
2665/*
2666 * Invalidate hardware checksum when packet is to be mangled, and
2667 * complete checksum manually on outgoing path.
2668 */
2669int skb_checksum_help(struct sk_buff *skb)
2670{
2671	__wsum csum;
2672	int ret = 0, offset;
2673
2674	if (skb->ip_summed == CHECKSUM_COMPLETE)
2675		goto out_set_summed;
2676
2677	if (unlikely(skb_shinfo(skb)->gso_size)) {
2678		skb_warn_bad_offload(skb);
2679		return -EINVAL;
2680	}
2681
2682	/* Before computing a checksum, we should make sure no frag could
2683	 * be modified by an external entity : checksum could be wrong.
2684	 */
2685	if (skb_has_shared_frag(skb)) {
2686		ret = __skb_linearize(skb);
2687		if (ret)
2688			goto out;
2689	}
2690
2691	offset = skb_checksum_start_offset(skb);
2692	BUG_ON(offset >= skb_headlen(skb));
2693	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2694
2695	offset += skb->csum_offset;
2696	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2697
2698	if (skb_cloned(skb) &&
2699	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2700		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2701		if (ret)
2702			goto out;
2703	}
2704
2705	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2706out_set_summed:
2707	skb->ip_summed = CHECKSUM_NONE;
2708out:
2709	return ret;
2710}
2711EXPORT_SYMBOL(skb_checksum_help);
2712
2713int skb_crc32c_csum_help(struct sk_buff *skb)
2714{
2715	__le32 crc32c_csum;
2716	int ret = 0, offset, start;
2717
2718	if (skb->ip_summed != CHECKSUM_PARTIAL)
2719		goto out;
2720
2721	if (unlikely(skb_is_gso(skb)))
2722		goto out;
2723
2724	/* Before computing a checksum, we should make sure no frag could
2725	 * be modified by an external entity : checksum could be wrong.
2726	 */
2727	if (unlikely(skb_has_shared_frag(skb))) {
2728		ret = __skb_linearize(skb);
2729		if (ret)
2730			goto out;
2731	}
2732	start = skb_checksum_start_offset(skb);
2733	offset = start + offsetof(struct sctphdr, checksum);
2734	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
2735		ret = -EINVAL;
2736		goto out;
2737	}
2738	if (skb_cloned(skb) &&
2739	    !skb_clone_writable(skb, offset + sizeof(__le32))) {
2740		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2741		if (ret)
2742			goto out;
2743	}
2744	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
2745						  skb->len - start, ~(__u32)0,
2746						  crc32c_csum_stub));
2747	*(__le32 *)(skb->data + offset) = crc32c_csum;
2748	skb->ip_summed = CHECKSUM_NONE;
2749	skb->csum_not_inet = 0;
2750out:
2751	return ret;
2752}
2753
2754__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2755{
2756	__be16 type = skb->protocol;
2757
2758	/* Tunnel gso handlers can set protocol to ethernet. */
2759	if (type == htons(ETH_P_TEB)) {
2760		struct ethhdr *eth;
2761
2762		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2763			return 0;
2764
2765		eth = (struct ethhdr *)skb->data;
2766		type = eth->h_proto;
2767	}
2768
2769	return __vlan_get_protocol(skb, type, depth);
2770}
2771
2772/**
2773 *	skb_mac_gso_segment - mac layer segmentation handler.
2774 *	@skb: buffer to segment
2775 *	@features: features for the output path (see dev->features)
2776 */
2777struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2778				    netdev_features_t features)
2779{
2780	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2781	struct packet_offload *ptype;
2782	int vlan_depth = skb->mac_len;
2783	__be16 type = skb_network_protocol(skb, &vlan_depth);
2784
2785	if (unlikely(!type))
2786		return ERR_PTR(-EINVAL);
2787
2788	__skb_pull(skb, vlan_depth);
2789
2790	rcu_read_lock();
2791	list_for_each_entry_rcu(ptype, &offload_base, list) {
2792		if (ptype->type == type && ptype->callbacks.gso_segment) {
2793			segs = ptype->callbacks.gso_segment(skb, features);
2794			break;
2795		}
2796	}
2797	rcu_read_unlock();
2798
2799	__skb_push(skb, skb->data - skb_mac_header(skb));
2800
2801	return segs;
2802}
2803EXPORT_SYMBOL(skb_mac_gso_segment);
2804
2805
2806/* openvswitch calls this on rx path, so we need a different check.
2807 */
2808static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2809{
2810	if (tx_path)
2811		return skb->ip_summed != CHECKSUM_PARTIAL &&
2812		       skb->ip_summed != CHECKSUM_UNNECESSARY;
2813
2814	return skb->ip_summed == CHECKSUM_NONE;
2815}
2816
2817/**
2818 *	__skb_gso_segment - Perform segmentation on skb.
2819 *	@skb: buffer to segment
2820 *	@features: features for the output path (see dev->features)
2821 *	@tx_path: whether it is called in TX path
2822 *
2823 *	This function segments the given skb and returns a list of segments.
2824 *
2825 *	It may return NULL if the skb requires no segmentation.  This is
2826 *	only possible when GSO is used for verifying header integrity.
2827 *
2828 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2829 */
2830struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2831				  netdev_features_t features, bool tx_path)
2832{
2833	struct sk_buff *segs;
2834
2835	if (unlikely(skb_needs_check(skb, tx_path))) {
2836		int err;
2837
2838		/* We're going to init ->check field in TCP or UDP header */
 
2839		err = skb_cow_head(skb, 0);
2840		if (err < 0)
2841			return ERR_PTR(err);
2842	}
2843
2844	/* Only report GSO partial support if it will enable us to
2845	 * support segmentation on this frame without needing additional
2846	 * work.
2847	 */
2848	if (features & NETIF_F_GSO_PARTIAL) {
2849		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2850		struct net_device *dev = skb->dev;
2851
2852		partial_features |= dev->features & dev->gso_partial_features;
2853		if (!skb_gso_ok(skb, features | partial_features))
2854			features &= ~NETIF_F_GSO_PARTIAL;
2855	}
2856
2857	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2858		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2859
2860	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2861	SKB_GSO_CB(skb)->encap_level = 0;
2862
2863	skb_reset_mac_header(skb);
2864	skb_reset_mac_len(skb);
2865
2866	segs = skb_mac_gso_segment(skb, features);
2867
2868	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2869		skb_warn_bad_offload(skb);
2870
2871	return segs;
2872}
2873EXPORT_SYMBOL(__skb_gso_segment);
2874
2875/* Take action when hardware reception checksum errors are detected. */
2876#ifdef CONFIG_BUG
2877void netdev_rx_csum_fault(struct net_device *dev)
2878{
2879	if (net_ratelimit()) {
2880		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2881		dump_stack();
2882	}
2883}
2884EXPORT_SYMBOL(netdev_rx_csum_fault);
2885#endif
2886
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a frag of @skb sits in high memory although the device
 * lacks NETIF_F_HIGHDMA, or (with PCI_DMA_BUS_IS_PHYS) when a frag's
 * page lies beyond the parent device's DMA mask or that mask is unset.
 * Always returns 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int n;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[n];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		/* nothing to check the frags against */
		if (!parent)
			return 0;

		for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[n];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2922
2923/* If MPLS offload request, verify we are testing hardware MPLS features
2924 * instead of standard features for the netdev.
2925 */
2926#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2927static netdev_features_t net_mpls_features(struct sk_buff *skb,
2928					   netdev_features_t features,
2929					   __be16 type)
2930{
2931	if (eth_p_mpls(type))
2932		features &= skb->dev->mpls_features;
2933
2934	return features;
2935}
2936#else
2937static netdev_features_t net_mpls_features(struct sk_buff *skb,
2938					   netdev_features_t features,
2939					   __be16 type)
2940{
2941	return features;
2942}
2943#endif
2944
2945static netdev_features_t harmonize_features(struct sk_buff *skb,
2946	netdev_features_t features)
2947{
2948	int tmp;
2949	__be16 type;
2950
2951	type = skb_network_protocol(skb, &tmp);
2952	features = net_mpls_features(skb, features, type);
2953
2954	if (skb->ip_summed != CHECKSUM_NONE &&
2955	    !can_checksum_protocol(features, type)) {
2956		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2957	}
2958	if (illegal_highdma(skb->dev, skb))
2959		features &= ~NETIF_F_SG;
2960
2961	return features;
2962}
2963
2964netdev_features_t passthru_features_check(struct sk_buff *skb,
2965					  struct net_device *dev,
2966					  netdev_features_t features)
2967{
2968	return features;
2969}
2970EXPORT_SYMBOL(passthru_features_check);
2971
2972static netdev_features_t dflt_features_check(struct sk_buff *skb,
2973					     struct net_device *dev,
2974					     netdev_features_t features)
2975{
2976	return vlan_features_check(skb, features);
2977}
2978
2979static netdev_features_t gso_features_check(const struct sk_buff *skb,
2980					    struct net_device *dev,
2981					    netdev_features_t features)
2982{
2983	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2984
2985	if (gso_segs > dev->gso_max_segs)
2986		return features & ~NETIF_F_GSO_MASK;
2987
2988	/* Support for GSO partial features requires software
2989	 * intervention before we can actually process the packets
2990	 * so we need to strip support for any partial features now
2991	 * and we can pull them back in after we have partially
2992	 * segmented the frame.
2993	 */
2994	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2995		features &= ~dev->gso_partial_features;
2996
2997	/* Make sure to clear the IPv4 ID mangling feature if the
2998	 * IPv4 header has the potential to be fragmented.
2999	 */
3000	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3001		struct iphdr *iph = skb->encapsulation ?
3002				    inner_ip_hdr(skb) : ip_hdr(skb);
3003
3004		if (!(iph->frag_off & htons(IP_DF)))
3005			features &= ~NETIF_F_TSO_MANGLEID;
3006	}
3007
3008	return features;
3009}
3010
3011netdev_features_t netif_skb_features(struct sk_buff *skb)
3012{
3013	struct net_device *dev = skb->dev;
3014	netdev_features_t features = dev->features;
3015
3016	if (skb_is_gso(skb))
3017		features = gso_features_check(skb, dev, features);
3018
3019	/* If encapsulation offload request, verify we are testing
3020	 * hardware encapsulation features instead of standard
3021	 * features for the netdev
3022	 */
3023	if (skb->encapsulation)
3024		features &= dev->hw_enc_features;
3025
3026	if (skb_vlan_tagged(skb))
3027		features = netdev_intersect_features(features,
3028						     dev->vlan_features |
3029						     NETIF_F_HW_VLAN_CTAG_TX |
3030						     NETIF_F_HW_VLAN_STAG_TX);
3031
3032	if (dev->netdev_ops->ndo_features_check)
3033		features &= dev->netdev_ops->ndo_features_check(skb, dev,
3034								features);
3035	else
3036		features &= dflt_features_check(skb, dev, features);
3037
3038	return harmonize_features(skb, features);
3039}
3040EXPORT_SYMBOL(netif_skb_features);
3041
3042static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3043		    struct netdev_queue *txq, bool more)
3044{
3045	unsigned int len;
3046	int rc;
3047
3048	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
3049		dev_queue_xmit_nit(skb, dev);
3050
3051	len = skb->len;
3052	trace_net_dev_start_xmit(skb, dev);
3053	rc = netdev_start_xmit(skb, dev, txq, more);
3054	trace_net_dev_xmit(skb, rc, dev, len);
3055
3056	return rc;
3057}
3058
3059struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3060				    struct netdev_queue *txq, int *ret)
3061{
3062	struct sk_buff *skb = first;
3063	int rc = NETDEV_TX_OK;
3064
3065	while (skb) {
3066		struct sk_buff *next = skb->next;
3067
3068		skb->next = NULL;
3069		rc = xmit_one(skb, dev, txq, next != NULL);
3070		if (unlikely(!dev_xmit_complete(rc))) {
3071			skb->next = next;
3072			goto out;
3073		}
3074
3075		skb = next;
3076		if (netif_xmit_stopped(txq) && skb) {
3077			rc = NETDEV_TX_BUSY;
3078			break;
3079		}
3080	}
3081
3082out:
3083	*ret = rc;
3084	return skb;
3085}
3086
3087static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3088					  netdev_features_t features)
3089{
3090	if (skb_vlan_tag_present(skb) &&
3091	    !vlan_hw_offload_capable(features, skb->vlan_proto))
3092		skb = __vlan_hwaccel_push_inside(skb);
3093	return skb;
3094}
3095
3096int skb_csum_hwoffload_help(struct sk_buff *skb,
3097			    const netdev_features_t features)
3098{
3099	if (unlikely(skb->csum_not_inet))
3100		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3101			skb_crc32c_csum_help(skb);
3102
3103	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3104}
3105EXPORT_SYMBOL(skb_csum_hwoffload_help);
3106
3107static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3108{
3109	netdev_features_t features;
3110
3111	features = netif_skb_features(skb);
3112	skb = validate_xmit_vlan(skb, features);
3113	if (unlikely(!skb))
3114		goto out_null;
3115
3116	if (netif_needs_gso(skb, features)) {
3117		struct sk_buff *segs;
3118
3119		segs = skb_gso_segment(skb, features);
3120		if (IS_ERR(segs)) {
3121			goto out_kfree_skb;
3122		} else if (segs) {
3123			consume_skb(skb);
3124			skb = segs;
3125		}
3126	} else {
3127		if (skb_needs_linearize(skb, features) &&
3128		    __skb_linearize(skb))
3129			goto out_kfree_skb;
3130
3131		/* If packet is not checksummed and device does not
3132		 * support checksumming for this protocol, complete
3133		 * checksumming here.
3134		 */
3135		if (skb->ip_summed == CHECKSUM_PARTIAL) {
3136			if (skb->encapsulation)
3137				skb_set_inner_transport_header(skb,
3138							       skb_checksum_start_offset(skb));
3139			else
3140				skb_set_transport_header(skb,
3141							 skb_checksum_start_offset(skb));
3142			if (skb_csum_hwoffload_help(skb, features))
 
3143				goto out_kfree_skb;
3144		}
3145	}
3146
3147	skb = validate_xmit_xfrm(skb, features, again);
3148
3149	return skb;
3150
3151out_kfree_skb:
3152	kfree_skb(skb);
3153out_null:
3154	atomic_long_inc(&dev->tx_dropped);
3155	return NULL;
3156}
3157
3158struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3159{
3160	struct sk_buff *next, *head = NULL, *tail;
3161
3162	for (; skb != NULL; skb = next) {
3163		next = skb->next;
3164		skb->next = NULL;
3165
3166		/* in case skb wont be segmented, point to itself */
3167		skb->prev = skb;
3168
3169		skb = validate_xmit_skb(skb, dev, again);
3170		if (!skb)
3171			continue;
3172
3173		if (!head)
3174			head = skb;
3175		else
3176			tail->next = skb;
3177		/* If skb was segmented, skb->prev points to
3178		 * the last segment. If not, it still contains skb.
3179		 */
3180		tail = skb->prev;
3181	}
3182	return head;
3183}
3184EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3185
3186static void qdisc_pkt_len_init(struct sk_buff *skb)
3187{
3188	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3189
3190	qdisc_skb_cb(skb)->pkt_len = skb->len;
3191
3192	/* To get more precise estimation of bytes sent on wire,
3193	 * we add to pkt_len the headers size of all segments
3194	 */
3195	if (shinfo->gso_size)  {
3196		unsigned int hdr_len;
3197		u16 gso_segs = shinfo->gso_segs;
3198
3199		/* mac layer + network layer */
3200		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3201
3202		/* + transport layer */
3203		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3204			const struct tcphdr *th;
3205			struct tcphdr _tcphdr;
3206
3207			th = skb_header_pointer(skb, skb_transport_offset(skb),
3208						sizeof(_tcphdr), &_tcphdr);
3209			if (likely(th))
3210				hdr_len += __tcp_hdrlen(th);
3211		} else {
3212			struct udphdr _udphdr;
3213
3214			if (skb_header_pointer(skb, skb_transport_offset(skb),
3215					       sizeof(_udphdr), &_udphdr))
3216				hdr_len += sizeof(struct udphdr);
3217		}
3218
3219		if (shinfo->gso_type & SKB_GSO_DODGY)
3220			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3221						shinfo->gso_size);
3222
3223		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3224	}
3225}
3226
/*
 * Hand @skb to qdisc @q attached to @txq of @dev.
 *
 * Returns a NET_XMIT_* code: NET_XMIT_DROP when the qdisc is deactivated,
 * NET_XMIT_SUCCESS on the direct-transmit bypass, otherwise the enqueue
 * result masked with NET_XMIT_MASK.  Any skbs collected on the local
 * to_free list are released with kfree_skb_list() before returning.
 */
3227static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3228				 struct net_device *dev,
3229				 struct netdev_queue *txq)
3230{
3231	spinlock_t *root_lock = qdisc_lock(q);
3232	struct sk_buff *to_free = NULL;
3233	bool contended;
3234	int rc;
3235
3236	qdisc_calculate_pkt_len(skb, q);
3237
	/* TCQ_F_NOLOCK qdiscs: enqueue and run without taking root_lock. */
3238	if (q->flags & TCQ_F_NOLOCK) {
3239		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3240			__qdisc_drop(skb, &to_free);
3241			rc = NET_XMIT_DROP;
3242		} else {
3243			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3244			__qdisc_run(q);
3245		}
3246
3247		if (unlikely(to_free))
3248			kfree_skb_list(to_free);
3249		return rc;
3250	}
3251
3252	/*
3253	 * Heuristic to force contended enqueues to serialize on a
3254	 * separate lock before trying to get qdisc main lock.
3255	 * This permits qdisc->running owner to get the lock more
3256	 * often and dequeue packets faster.
3257	 */
3258	contended = qdisc_is_running(q);
3259	if (unlikely(contended))
3260		spin_lock(&q->busylock);
3261
3262	spin_lock(root_lock);
3263	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3264		__qdisc_drop(skb, &to_free);
3265		rc = NET_XMIT_DROP;
3266	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3267		   qdisc_run_begin(q)) {
3268		/*
3269		 * This is a work-conserving queue; there are no old skbs
3270		 * waiting to be sent out; and the qdisc is not running -
3271		 * xmit the skb directly.
3272		 */
3273
3274		qdisc_bstats_update(q, skb);
3275
		/* busylock is dropped before __qdisc_run() and "contended"
		 * cleared so that it is not unlocked a second time below.
		 */
3276		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3277			if (unlikely(contended)) {
3278				spin_unlock(&q->busylock);
3279				contended = false;
3280			}
3281			__qdisc_run(q);
3282		}

3283
3284		qdisc_run_end(q);
3285		rc = NET_XMIT_SUCCESS;
3286	} else {
		/* Regular path: enqueue, then run the qdisc if nobody else is. */
3287		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3288		if (qdisc_run_begin(q)) {
3289			if (unlikely(contended)) {
3290				spin_unlock(&q->busylock);
3291				contended = false;
3292			}
3293			__qdisc_run(q);
3294			qdisc_run_end(q);
3295		}
3296	}
3297	spin_unlock(root_lock);
	/* Dropped skbs are freed only after root_lock has been released. */
3298	if (unlikely(to_free))
3299		kfree_skb_list(to_free);
3300	if (unlikely(contended))
3301		spin_unlock(&q->busylock);
3302	return rc;
3303}
3304
3305#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3306static void skb_update_prio(struct sk_buff *skb)
3307{
3308	const struct netprio_map *map;
3309	const struct sock *sk;
3310	unsigned int prioidx;
3311
3312	if (skb->priority)
3313		return;
3314	map = rcu_dereference_bh(skb->dev->priomap);
3315	if (!map)
3316		return;
3317	sk = skb_to_full_sk(skb);
3318	if (!sk)
3319		return;
3320
3321	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 
 
3322
3323	if (prioidx < map->priomap_len)
3324		skb->priority = map->priomap[prioidx];
 
3325}
3326#else
3327#define skb_update_prio(skb)
3328#endif
3329
/* Per-CPU transmit nesting counter.  __dev_queue_xmit() bumps it around
 * dev_hard_start_xmit() on the queue-less device path and refuses to
 * transmit once it exceeds XMIT_RECURSION_LIMIT.
 */
3330DEFINE_PER_CPU(int, xmit_recursion);
3331EXPORT_SYMBOL(xmit_recursion);
3332
3333/**
3334 *	dev_loopback_xmit - loop back @skb
3335 *	@net: network namespace this loopback is happening in
3336 *	@sk:  sk needed to be a netfilter okfn
3337 *	@skb: buffer to transmit
3338 */
3339int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3340{
3341	skb_reset_mac_header(skb);
3342	__skb_pull(skb, skb_network_offset(skb));
3343	skb->pkt_type = PACKET_LOOPBACK;
3344	skb->ip_summed = CHECKSUM_UNNECESSARY;
3345	WARN_ON(!skb_dst(skb));
3346	skb_dst_force(skb);
3347	netif_rx_ni(skb);
3348	return 0;
3349}
3350EXPORT_SYMBOL(dev_loopback_xmit);
3351
3352#ifdef CONFIG_NET_EGRESS
3353static struct sk_buff *
3354sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3355{
3356	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3357	struct tcf_result cl_res;
3358
3359	if (!miniq)
3360		return skb;
3361
3362	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3363	mini_qdisc_bstats_cpu_update(miniq, skb);
 
 
3364
3365	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3366	case TC_ACT_OK:
3367	case TC_ACT_RECLASSIFY:
3368		skb->tc_index = TC_H_MIN(cl_res.classid);
3369		break;
3370	case TC_ACT_SHOT:
3371		mini_qdisc_qstats_cpu_drop(miniq);
3372		*ret = NET_XMIT_DROP;
3373		kfree_skb(skb);
3374		return NULL;
3375	case TC_ACT_STOLEN:
3376	case TC_ACT_QUEUED:
3377	case TC_ACT_TRAP:
3378		*ret = NET_XMIT_SUCCESS;
3379		consume_skb(skb);
3380		return NULL;
3381	case TC_ACT_REDIRECT:
3382		/* No need to push/pop skb's mac_header here on egress! */
3383		skb_do_redirect(skb);
3384		*ret = NET_XMIT_SUCCESS;
3385		return NULL;
3386	default:
3387		break;
3388	}
3389
3390	return skb;
3391}
3392#endif /* CONFIG_NET_EGRESS */
3393
/*
 * Look up the transmit queue for @skb in the XPS maps of @dev.
 *
 * The map slot is selected by skb->sender_cpu - 1, scaled by the number of
 * traffic classes and offset by the class of skb->priority when the device
 * has traffic classes.  Returns the queue index, or -1 when XPS is not
 * built in, no map applies, or the mapped queue is not below
 * dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	unsigned int tci;
	int txq = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	tci = skb->sender_cpu - 1;
	if (dev->num_tc) {
		tci = tci * dev->num_tc;
		tci = tci + netdev_get_prio_tc_map(dev, skb->priority);
	}

	cpu_map = rcu_dereference(maps->cpu_map[tci]);
	if (!cpu_map)
		goto unlock;

	/* A single-entry map needs no hashing. */
	if (cpu_map->len == 1)
		txq = cpu_map->queues[0];
	else
		txq = cpu_map->queues[reciprocal_scale(skb_get_hash(skb),
						       cpu_map->len)];

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();
	return txq;
#else
	return -1;
#endif
}
3429
3430static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3431{
3432	struct sock *sk = skb->sk;
3433	int queue_index = sk_tx_queue_get(sk);
3434
3435	if (queue_index < 0 || skb->ooo_okay ||
3436	    queue_index >= dev->real_num_tx_queues) {
3437		int new_index = get_xps_queue(dev, skb);
3438
3439		if (new_index < 0)
3440			new_index = skb_tx_hash(dev, skb);
3441
3442		if (queue_index != new_index && sk &&
3443		    sk_fullsock(sk) &&
3444		    rcu_access_pointer(sk->sk_dst_cache))
3445			sk_tx_queue_set(sk, new_index);
3446
3447		queue_index = new_index;
3448	}
3449
3450	return queue_index;
3451}
3452
3453struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3454				    struct sk_buff *skb,
3455				    void *accel_priv)
3456{
3457	int queue_index = 0;
3458
3459#ifdef CONFIG_XPS
3460	u32 sender_cpu = skb->sender_cpu - 1;
3461
3462	if (sender_cpu >= (u32)NR_CPUS)
3463		skb->sender_cpu = raw_smp_processor_id() + 1;
3464#endif
3465
3466	if (dev->real_num_tx_queues != 1) {
3467		const struct net_device_ops *ops = dev->netdev_ops;
3468
3469		if (ops->ndo_select_queue)
3470			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3471							    __netdev_pick_tx);
3472		else
3473			queue_index = __netdev_pick_tx(dev, skb);
3474
3475		queue_index = netdev_cap_txqueue(dev, queue_index);
 
3476	}
3477
3478	skb_set_queue_mapping(skb, queue_index);
3479	return netdev_get_tx_queue(dev, queue_index);
3480}
3481
3482/**
3483 *	__dev_queue_xmit - transmit a buffer
3484 *	@skb: buffer to transmit
3485 *	@accel_priv: private data used for L2 forwarding offload
3486 *
3487 *	Queue a buffer for transmission to a network device. The caller must
3488 *	have set the device and priority and built the buffer before calling
3489 *	this function. The function can be called from an interrupt.
3490 *
3491 *	A negative errno code is returned on a failure. A success does not
3492 *	guarantee the frame will be transmitted as it may be dropped due
3493 *	to congestion or traffic shaping.
3494 *
3495 * -----------------------------------------------------------------------------------
3496 *      I notice this method can also return errors from the queue disciplines,
3497 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3498 *      be positive.
3499 *
3500 *      Regardless of the return value, the skb is consumed, so it is currently
3501 *      difficult to retry a send to this method.  (You can bump the ref count
3502 *      before sending to hold a reference for retry if you are careful.)
3503 *
3504 *      When calling this method, interrupts MUST be enabled.  This is because
3505 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3506 *          --BLG
3507 */
/*
 * Core of dev_queue_xmit()/dev_queue_xmit_accel().  Runs under
 * rcu_read_lock_bh() from just after the optional scheduling timestamp
 * until return.
 *
 * Paths, in order:
 *  - the egress classifier may consume the skb (rc is whatever
 *    sch_handle_egress() stored);
 *  - a qdisc with an enqueue operation gets the skb via __dev_xmit_skb();
 *  - otherwise the device is queue-less and the skb is validated and
 *    handed straight to dev_hard_start_xmit();
 *  - anything that falls through is counted in dev->tx_dropped, freed and
 *    reported as -ENETDOWN.
 */
3508static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3509{
3510	struct net_device *dev = skb->dev;
3511	struct netdev_queue *txq;
3512	struct Qdisc *q;
3513	int rc = -ENOMEM;
3514	bool again = false;
3515
3516	skb_reset_mac_header(skb);
3517
3518	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3519		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3520
3521	/* Disable soft irqs for various locks below. Also
3522	 * stops preemption for RCU.
3523	 */
3524	rcu_read_lock_bh();
3525
3526	skb_update_prio(skb);
3527
3528	qdisc_pkt_len_init(skb);
3529#ifdef CONFIG_NET_CLS_ACT
3530	skb->tc_at_ingress = 0;
3531# ifdef CONFIG_NET_EGRESS
3532	if (static_key_false(&egress_needed)) {
		/* A NULL return means the classifier consumed the skb and
		 * already stored the return code in rc.
		 */
3533		skb = sch_handle_egress(skb, &rc, dev);
3534		if (!skb)
3535			goto out;
3536	}
3537# endif
3538#endif
3539	/* If device/qdisc don't need skb->dst, release it right now while
3540	 * its hot in this cpu cache.
3541	 */
3542	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3543		skb_dst_drop(skb);
3544	else
3545		skb_dst_force(skb);
3546
3547	txq = netdev_pick_tx(dev, skb, accel_priv);
3548	q = rcu_dereference_bh(txq->qdisc);
3549
3550	trace_net_dev_queue(skb);
3551	if (q->enqueue) {
3552		rc = __dev_xmit_skb(skb, q, dev, txq);
3553		goto out;
3554	}
3555
3556	/* The device has no queue. Common case for software devices:
3557	 * loopback, all the sorts of tunnels...
3558
3559	 * Really, it is unlikely that netif_tx_lock protection is necessary
3560	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3561	 * counters.)
3562	 * However, it is possible, that they rely on protection
3563	 * made by us here.
3564
3565	 * Check this and shot the lock. It is not prone from deadlocks.
3566	 *Either shot noqueue qdisc, it is even simpler 8)
3567	 */
3568	if (dev->flags & IFF_UP) {
3569		int cpu = smp_processor_id(); /* ok because BHs are off */
3570
		/* If this CPU already owns the queue's xmit lock we are being
		 * re-entered from the device's own transmit path.
		 */
3571		if (txq->xmit_lock_owner != cpu) {
3572			if (unlikely(__this_cpu_read(xmit_recursion) >
3573				     XMIT_RECURSION_LIMIT))
3574				goto recursion_alert;
3575
			/* On a NULL return the skb is gone; rc is returned
			 * as it stands.
			 */
3576			skb = validate_xmit_skb(skb, dev, &again);
3577			if (!skb)
3578				goto out;
3579
3580			HARD_TX_LOCK(dev, txq, cpu);
3581
3582			if (!netif_xmit_stopped(txq)) {
3583				__this_cpu_inc(xmit_recursion);
3584				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3585				__this_cpu_dec(xmit_recursion);
3586				if (dev_xmit_complete(rc)) {
3587					HARD_TX_UNLOCK(dev, txq);
3588					goto out;
3589				}
3590			}
3591			HARD_TX_UNLOCK(dev, txq);
3592			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3593					     dev->name);
3594		} else {
3595			/* Recursion is detected! It is possible,
3596			 * unfortunately
3597			 */
3598recursion_alert:
3599			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3600					     dev->name);
3601		}
3602	}
3603
	/* Failure path: device down, transmit not completed, or recursion. */
3604	rc = -ENETDOWN;
3605	rcu_read_unlock_bh();
3606
3607	atomic_long_inc(&dev->tx_dropped);
3608	kfree_skb_list(skb);
3609	return rc;
3610out:
3611	rcu_read_unlock_bh();
3612	return rc;
3613}
3614
3615int dev_queue_xmit(struct sk_buff *skb)
3616{
3617	return __dev_queue_xmit(skb, NULL);
3618}
3619EXPORT_SYMBOL(dev_queue_xmit);
3620
/*
 * dev_queue_xmit_accel - queue @skb for transmission, forwarding
 * @accel_priv unchanged to __dev_queue_xmit() (which hands it to
 * netdev_pick_tx() for queue selection).
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
3625EXPORT_SYMBOL(dev_queue_xmit_accel);
3626
3627
3628/*************************************************************************
3629 *			Receiver routines
3630 *************************************************************************/
3631
/* Limit on the per-CPU input_pkt_queue length: enqueue_to_backlog() drops
 * a packet when the queue is already longer than this, and
 * skb_flow_limit() only starts its per-flow accounting once the queue has
 * reached half of it.
 */
3632int netdev_max_backlog __read_mostly = 1000;
3633EXPORT_SYMBOL(netdev_max_backlog);
3634
3635int netdev_tstamp_prequeue __read_mostly = 1;
3636int netdev_budget __read_mostly = 300;
3637unsigned int __read_mostly netdev_budget_usecs = 2000;
3638int weight_p __read_mostly = 64;           /* old backlog weight */
3639int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
3640int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
3641int dev_rx_weight __read_mostly = 64;
3642int dev_tx_weight __read_mostly = 64;
3643
3644/* Called with irq disabled */
3645static inline void ____napi_schedule(struct softnet_data *sd,
3646				     struct napi_struct *napi)
3647{
3648	list_add_tail(&napi->poll_list, &sd->poll_list);
3649	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3650}
3651
3652#ifdef CONFIG_RPS
3653
3654/* One global table that all flow-based protocols share. */
3655struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3656EXPORT_SYMBOL(rps_sock_flow_table);
/* Bits of a sock flow table entry that carry the CPU number; get_rps_cpu()
 * matches the remaining bits of the entry against the flow hash.
 */
3657u32 rps_cpu_mask __read_mostly;
3658EXPORT_SYMBOL(rps_cpu_mask);
3659
/* Static key tested by netif_rx_internal() before steering a packet with
 * get_rps_cpu().
 */
3660struct static_key rps_needed __read_mostly;
3661EXPORT_SYMBOL(rps_needed);
3662struct static_key rfs_needed __read_mostly;
3663EXPORT_SYMBOL(rfs_needed);
3664
3665static struct rps_dev_flow *
3666set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3667	    struct rps_dev_flow *rflow, u16 next_cpu)
3668{
3669	if (next_cpu < nr_cpu_ids) {
3670#ifdef CONFIG_RFS_ACCEL
3671		struct netdev_rx_queue *rxqueue;
3672		struct rps_dev_flow_table *flow_table;
3673		struct rps_dev_flow *old_rflow;
3674		u32 flow_id;
3675		u16 rxq_index;
3676		int rc;
3677
3678		/* Should we steer this flow to a different hardware queue? */
3679		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3680		    !(dev->features & NETIF_F_NTUPLE))
3681			goto out;
3682		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3683		if (rxq_index == skb_get_rx_queue(skb))
3684			goto out;
3685
3686		rxqueue = dev->_rx + rxq_index;
3687		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3688		if (!flow_table)
3689			goto out;
3690		flow_id = skb_get_hash(skb) & flow_table->mask;
3691		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3692							rxq_index, flow_id);
3693		if (rc < 0)
3694			goto out;
3695		old_rflow = rflow;
3696		rflow = &flow_table->flows[flow_id];
3697		rflow->filter = rc;
3698		if (old_rflow->filter == rflow->filter)
3699			old_rflow->filter = RPS_NO_FILTER;
3700	out:
3701#endif
3702		rflow->last_qtail =
3703			per_cpu(softnet_data, next_cpu).input_queue_head;
3704	}
3705
3706	rflow->cpu = next_cpu;
3707	return rflow;
3708}
3709
3710/*
3711 * get_rps_cpu is called from netif_receive_skb and returns the target
3712 * CPU from the RPS map of the receiving queue for a given skb.
3713 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no steering applies: the recorded rx queue index is out
 * of range, the queue has neither a flow table nor an RPS map, the skb
 * hash is zero, or no online CPU was found.  *rflowp is written only when
 * the CPU comes from the per-queue flow table.
3714 */
3715static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3716		       struct rps_dev_flow **rflowp)
3717{
3718	const struct rps_sock_flow_table *sock_flow_table;
3719	struct netdev_rx_queue *rxqueue = dev->_rx;
3720	struct rps_dev_flow_table *flow_table;
3721	struct rps_map *map;
3722	int cpu = -1;
3723	u32 tcpu;
3724	u32 hash;
3725
3726	if (skb_rx_queue_recorded(skb)) {
3727		u16 index = skb_get_rx_queue(skb);
3728
3729		if (unlikely(index >= dev->real_num_rx_queues)) {
3730			WARN_ONCE(dev->real_num_rx_queues > 1,
3731				  "%s received packet on queue %u, but number "
3732				  "of RX queues is %u\n",
3733				  dev->name, index, dev->real_num_rx_queues);
3734			goto done;
3735		}
3736		rxqueue += index;
3737	}
3738
3739	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3740
3741	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3742	map = rcu_dereference(rxqueue->rps_map);
3743	if (!flow_table && !map)
3744		goto done;
3745
3746	skb_reset_network_header(skb);
3747	hash = skb_get_hash(skb);
3748	if (!hash)
3749		goto done;
3750
3751	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3752	if (flow_table && sock_flow_table) {
3753		struct rps_dev_flow *rflow;
3754		u32 next_cpu;
3755		u32 ident;
3756
3757		/* First check into global flow table if there is a match */
3758		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* Bits outside rps_cpu_mask must equal those of the hash. */
3759		if ((ident ^ hash) & ~rps_cpu_mask)
3760			goto try_rps;
3761
3762		next_cpu = ident & rps_cpu_mask;
3763
3764		/* OK, now we know there is a match,
3765		 * we can look at the local (per receive queue) flow table
3766		 */
3767		rflow = &flow_table->flows[hash & flow_table->mask];
3768		tcpu = rflow->cpu;
3769
3770		/*
3771		 * If the desired CPU (where last recvmsg was done) is
3772		 * different from current CPU (one in the rx-queue flow
3773		 * table entry), switch if one of the following holds:
3774		 *   - Current CPU is unset (>= nr_cpu_ids).
3775		 *   - Current CPU is offline.
3776		 *   - The current CPU's queue tail has advanced beyond the
3777		 *     last packet that was enqueued using this table entry.
3778		 *     This guarantees that all previous packets for the flow
3779		 *     have been dequeued, thus preserving in order delivery.
3780		 */
3781		if (unlikely(tcpu != next_cpu) &&
3782		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3783		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3784		      rflow->last_qtail)) >= 0)) {
3785			tcpu = next_cpu;
3786			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3787		}
3788
3789		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3790			*rflowp = rflow;
3791			cpu = tcpu;
3792			goto done;
3793		}
3794	}
3795
3796try_rps:
3797
	/* Fall back to the RPS map of the queue: pick a CPU by hash. */
3798	if (map) {
3799		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3800		if (cpu_online(tcpu)) {
3801			cpu = tcpu;
3802			goto done;
3803		}
3804	}
3805
3806done:
3807	return cpu;
3808}
3809
3810#ifdef CONFIG_RFS_ACCEL
3811
3812/**
3813 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3814 * @dev: Device on which the filter was set
3815 * @rxq_index: RX queue index
3816 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3817 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3818 *
3819 * Drivers that implement ndo_rx_flow_steer() should periodically call
3820 * this function for each installed filter and remove the filters for
3821 * which it returns %true.
3822 */
3823bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3824			 u32 flow_id, u16 filter_id)
3825{
3826	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3827	struct rps_dev_flow_table *flow_table;
3828	struct rps_dev_flow *rflow;
3829	bool expire = true;
3830	unsigned int cpu;
3831
3832	rcu_read_lock();
3833	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3834	if (flow_table && flow_id <= flow_table->mask) {
3835		rflow = &flow_table->flows[flow_id];
3836		cpu = READ_ONCE(rflow->cpu);
3837		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3838		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3839			   rflow->last_qtail) <
3840		     (int)(10 * flow_table->mask)))
3841			expire = false;
3842	}
3843	rcu_read_unlock();
3844	return expire;
3845}
3846EXPORT_SYMBOL(rps_may_expire_flow);
3847
3848#endif /* CONFIG_RFS_ACCEL */
3849
3850/* Called from hardirq (IPI) context */
3851static void rps_trigger_softirq(void *data)
3852{
3853	struct softnet_data *sd = data;
3854
3855	____napi_schedule(sd, &sd->backlog);
3856	sd->received_rps++;
3857}
3858
3859#endif /* CONFIG_RPS */
3860
/*
 * If @sd belongs to another CPU, link it onto this CPU's rps_ipi_list,
 * raise NET_RX_SOFTIRQ and return 1.  Return 0 when @sd is this CPU's own
 * softnet_data (or RPS is not built in).
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	/* Push the remote softnet_data on the head of our IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3881
3882#ifdef CONFIG_NET_FLOW_LIMIT
3883int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3884#endif
3885
3886static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3887{
3888#ifdef CONFIG_NET_FLOW_LIMIT
3889	struct sd_flow_limit *fl;
3890	struct softnet_data *sd;
3891	unsigned int old_flow, new_flow;
3892
3893	if (qlen < (netdev_max_backlog >> 1))
3894		return false;
3895
3896	sd = this_cpu_ptr(&softnet_data);
3897
3898	rcu_read_lock();
3899	fl = rcu_dereference(sd->flow_limit);
3900	if (fl) {
3901		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3902		old_flow = fl->history[fl->history_head];
3903		fl->history[fl->history_head] = new_flow;
3904
3905		fl->history_head++;
3906		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3907
3908		if (likely(fl->buckets[old_flow]))
3909			fl->buckets[old_flow]--;
3910
3911		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3912			fl->count++;
3913			rcu_read_unlock();
3914			return true;
3915		}
3916	}
3917	rcu_read_unlock();
3918#endif
3919	return false;
3920}
3921
3922/*
3923 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3924 * queue (may be a remote CPU queue).
3925 */
3926static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3927			      unsigned int *qtail)
3928{
3929	struct softnet_data *sd;
3930	unsigned long flags;
3931	unsigned int qlen;
3932
3933	sd = &per_cpu(softnet_data, cpu);
3934
3935	local_irq_save(flags);
3936
3937	rps_lock(sd);
3938	if (!netif_running(skb->dev))
3939		goto drop;
3940	qlen = skb_queue_len(&sd->input_pkt_queue);
3941	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3942		if (qlen) {
3943enqueue:
3944			__skb_queue_tail(&sd->input_pkt_queue, skb);
3945			input_queue_tail_incr_save(sd, qtail);
3946			rps_unlock(sd);
3947			local_irq_restore(flags);
3948			return NET_RX_SUCCESS;
3949		}
3950
3951		/* Schedule NAPI for backlog device
3952		 * We can use non atomic operation since we own the queue lock
3953		 */
3954		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3955			if (!rps_ipi_queued(sd))
3956				____napi_schedule(sd, &sd->backlog);
3957		}
3958		goto enqueue;
3959	}
3960
3961drop:
3962	sd->dropped++;
3963	rps_unlock(sd);
3964
3965	local_irq_restore(flags);
3966
3967	atomic_long_inc(&skb->dev->rx_dropped);
3968	kfree_skb(skb);
3969	return NET_RX_DROP;
3970}
3971
3972static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
3973{
3974	struct net_device *dev = skb->dev;
3975	struct netdev_rx_queue *rxqueue;
3976
3977	rxqueue = dev->_rx;
3978
3979	if (skb_rx_queue_recorded(skb)) {
3980		u16 index = skb_get_rx_queue(skb);
3981
3982		if (unlikely(index >= dev->real_num_rx_queues)) {
3983			WARN_ONCE(dev->real_num_rx_queues > 1,
3984				  "%s received packet on queue %u, but number "
3985				  "of RX queues is %u\n",
3986				  dev->name, index, dev->real_num_rx_queues);
3987
3988			return rxqueue; /* Return first rxqueue */
3989		}
3990		rxqueue += index;
3991	}
3992	return rxqueue;
3993}
3994
/*
 * Run @xdp_prog on @skb in generic (skb based) XDP mode and return the
 * program's verdict.
 *
 * Cloned skbs are passed through untouched (XDP_PASS).  The skb is made
 * linear with at least XDP_PACKET_HEADROOM of headroom first; if that
 * fails it is freed and XDP_DROP returned.  For XDP_DROP, XDP_ABORTED and
 * unknown verdicts the skb has been freed on return; for XDP_TX and
 * XDP_REDIRECT the mac header has been pushed back onto it.
 */
3995static u32 netif_receive_generic_xdp(struct sk_buff *skb,
3996				     struct bpf_prog *xdp_prog)
3997{
3998	struct netdev_rx_queue *rxqueue;
3999	u32 metalen, act = XDP_DROP;
4000	struct xdp_buff xdp;
4001	void *orig_data;
4002	int hlen, off;
4003	u32 mac_len;
4004
4005	/* Reinjected packets coming from act_mirred or similar should
4006	 * not get XDP generic processing.
4007	 */
4008	if (skb_cloned(skb))
4009		return XDP_PASS;
4010
4011	/* XDP packets must be linear and must have sufficient headroom
4012	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4013	 * native XDP provides, thus we need to do it here as well.
4014	 */
4015	if (skb_is_nonlinear(skb) ||
4016	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4017		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4018		int troom = skb->tail + skb->data_len - skb->end;
4019
4020		/* In case we have to go down the path and also linearize,
4021		 * then lets do the pskb_expand_head() work just once here.
4022		 */
4023		if (pskb_expand_head(skb,
4024				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4025				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4026			goto do_drop;
4027		if (skb_linearize(skb))
4028			goto do_drop;
4029	}
4030
4031	/* The XDP program wants to see the packet starting at the MAC
4032	 * header.
4033	 */
4034	mac_len = skb->data - skb_mac_header(skb);
4035	hlen = skb_headlen(skb) + mac_len;
4036	xdp.data = skb->data - mac_len;
4037	xdp.data_meta = xdp.data;
4038	xdp.data_end = xdp.data + hlen;
4039	xdp.data_hard_start = skb->data - skb_headroom(skb);
4040	orig_data = xdp.data;
4041
4042	rxqueue = netif_get_rxqueue(skb);
4043	xdp.rxq = &rxqueue->xdp_rxq;
4044
4045	act = bpf_prog_run_xdp(xdp_prog, &xdp);
4046
	/* Mirror any move of xdp.data made by the program onto the skb. */
4047	off = xdp.data - orig_data;
4048	if (off > 0)
4049		__skb_pull(skb, off);
4050	else if (off < 0)
4051		__skb_push(skb, -off);
4052	skb->mac_header += off;
4053
4054	switch (act) {
4055	case XDP_REDIRECT:
4056	case XDP_TX:
4057		__skb_push(skb, mac_len);
4058		break;
4059	case XDP_PASS:
4060		metalen = xdp.data - xdp.data_meta;
4061		if (metalen)
4062			skb_metadata_set(skb, metalen);
4063		break;
4064	default:
4065		bpf_warn_invalid_xdp_action(act);
4066		/* fall through */
4067	case XDP_ABORTED:
4068		trace_xdp_exception(skb->dev, xdp_prog, act);
4069		/* fall through */
4070	case XDP_DROP:
	/* Also reached by goto before the program ran; act is then still
	 * its initial XDP_DROP.
	 */
4071	do_drop:
4072		kfree_skb(skb);
4073		break;
4074	}
4075
4076	return act;
4077}
4078
4079/* When doing generic XDP we have to bypass the qdisc layer and the
4080 * network taps in order to match in-driver-XDP behavior.
4081 */
4082void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4083{
4084	struct net_device *dev = skb->dev;
4085	struct netdev_queue *txq;
4086	bool free_skb = true;
4087	int cpu, rc;
4088
4089	txq = netdev_pick_tx(dev, skb, NULL);
4090	cpu = smp_processor_id();
4091	HARD_TX_LOCK(dev, txq, cpu);
4092	if (!netif_xmit_stopped(txq)) {
4093		rc = netdev_start_xmit(skb, dev, txq, 0);
4094		if (dev_xmit_complete(rc))
4095			free_skb = false;
4096	}
4097	HARD_TX_UNLOCK(dev, txq);
4098	if (free_skb) {
4099		trace_xdp_exception(dev, xdp_prog, XDP_TX);
4100		kfree_skb(skb);
4101	}
4102}
4103EXPORT_SYMBOL_GPL(generic_xdp_tx);
4104
/* Static key tested by netif_rx_internal() before it runs generic XDP on
 * a received skb.
 */
4105static struct static_key generic_xdp_needed __read_mostly;
4106
4107int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4108{
4109	if (xdp_prog) {
4110		u32 act = netif_receive_generic_xdp(skb, xdp_prog);
4111		int err;
4112
4113		if (act != XDP_PASS) {
4114			switch (act) {
4115			case XDP_REDIRECT:
4116				err = xdp_do_generic_redirect(skb->dev, skb,
4117							      xdp_prog);
4118				if (err)
4119					goto out_redir;
4120			/* fallthru to submit skb */
4121			case XDP_TX:
4122				generic_xdp_tx(skb, xdp_prog);
4123				break;
4124			}
4125			return XDP_DROP;
4126		}
4127	}
4128	return XDP_PASS;
4129out_redir:
4130	kfree_skb(skb);
4131	return XDP_DROP;
4132}
4133EXPORT_SYMBOL_GPL(do_xdp_generic);
4134
4135static int netif_rx_internal(struct sk_buff *skb)
4136{
4137	int ret;
4138
4139	net_timestamp_check(netdev_tstamp_prequeue, skb);
4140
4141	trace_netif_rx(skb);
4142
4143	if (static_key_false(&generic_xdp_needed)) {
4144		int ret;
4145
4146		preempt_disable();
4147		rcu_read_lock();
4148		ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4149		rcu_read_unlock();
4150		preempt_enable();
4151
4152		/* Consider XDP consuming the packet a success from
4153		 * the netdev point of view we do not want to count
4154		 * this as an error.
4155		 */
4156		if (ret != XDP_PASS)
4157			return NET_RX_SUCCESS;
4158	}
4159
4160#ifdef CONFIG_RPS
4161	if (static_key_false(&rps_needed)) {
4162		struct rps_dev_flow voidflow, *rflow = &voidflow;
4163		int cpu;
4164
4165		preempt_disable();
4166		rcu_read_lock();
4167
4168		cpu = get_rps_cpu(skb->dev, skb, &rflow);
4169		if (cpu < 0)
4170			cpu = smp_processor_id();
4171
4172		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4173
4174		rcu_read_unlock();
4175		preempt_enable();
4176	} else
4177#endif
4178	{
4179		unsigned int qtail;
4180
4181		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4182		put_cpu();
4183	}
4184	return ret;
4185}
4186
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  The buffer may be dropped
 *	during processing for congestion control or by the protocol layers.
 *
 *	return values (as produced by netif_rx_internal()):
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);
	return ret;
}
4208EXPORT_SYMBOL(netif_rx);
4209
/*
 * Variant of netif_rx() that, with preemption disabled, also runs pending
 * softirqs via do_softirq() right after queueing @skb.
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
4224EXPORT_SYMBOL(netif_rx_ni);
4225
/*
 * Softirq handler with three jobs for this CPU's softnet_data:
 *  1. free the skbs parked on completion_queue,
 *  2. run every qdisc linked on output_queue,
 *  3. call xfrm_dev_backlog().
 * Both lists are detached with local irqs disabled and then processed
 * with irqs enabled again.
 */
4226static __latent_entropy void net_tx_action(struct softirq_action *h)
4227{
4228	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4229
4230	if (sd->completion_queue) {
4231		struct sk_buff *clist;
4232
4233		local_irq_disable();
4234		clist = sd->completion_queue;
4235		sd->completion_queue = NULL;
4236		local_irq_enable();
4237
4238		while (clist) {
4239			struct sk_buff *skb = clist;
4240
4241			clist = clist->next;
4242
			/* skbs on this list are expected to have no users left. */
4243			WARN_ON(refcount_read(&skb->users));
4244			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4245				trace_consume_skb(skb);
4246			else
4247				trace_kfree_skb(skb, net_tx_action);
4248
4249			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4250				__kfree_skb(skb);
4251			else
4252				__kfree_skb_defer(skb);
4253		}
4254
4255		__kfree_skb_flush();
4256	}
4257
4258	if (sd->output_queue) {
4259		struct Qdisc *head;
4260
4261		local_irq_disable();
4262		head = sd->output_queue;
4263		sd->output_queue = NULL;
4264		sd->output_queue_tailp = &sd->output_queue;
4265		local_irq_enable();
4266
4267		while (head) {
4268			struct Qdisc *q = head;
4269			spinlock_t *root_lock = NULL;
4270
4271			head = head->next_sched;
4272
			/* TCQ_F_NOLOCK qdiscs are run without the root lock. */
4273			if (!(q->flags & TCQ_F_NOLOCK)) {
4274				root_lock = qdisc_lock(q);
4275				spin_lock(root_lock);
4276			}
4277			/* We need to make sure head->next_sched is read
4278			 * before clearing __QDISC_STATE_SCHED
4279			 */
4280			smp_mb__before_atomic();
4281			clear_bit(__QDISC_STATE_SCHED, &q->state);
4282			qdisc_run(q);
4283			if (root_lock)
4284				spin_unlock(root_lock);
4285		}
4286	}
4287
4288	xfrm_dev_backlog(sd);
4289}
4290
4291#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4292/* This hook is defined here for ATM LANE */
4293int (*br_fdb_test_addr_hook)(struct net_device *dev,
4294			     unsigned char *addr) __read_mostly;
4295EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4296#endif
4297
/*
 * Run the ingress classifier chain of skb->dev on @skb.
 *
 * Returns @skb when reception should continue (no CONFIG_NET_CLS_ACT, no
 * ingress mini-qdisc on the device, or a verdict that lets the packet
 * through).  Returns NULL when the classifier dropped, stole or redirected
 * it.  A pending *@pt_prev handler is delivered to (result in *@ret) and
 * cleared before classification.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
	struct tcf_result cl_res;
	int verdict;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!miniq)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_at_ingress = 1;
	mini_qdisc_bstats_cpu_update(miniq, skb);

	verdict = tcf_classify(skb, miniq->filter_list, &cl_res, false);

	switch (verdict) {
	case TC_ACT_SHOT:
		mini_qdisc_qstats_cpu_drop(miniq);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
4351
4352/**
4353 *	netdev_is_rx_handler_busy - check if receive handler is registered
4354 *	@dev: device to check
4355 *
4356 *	Check if a receive handler is already registered for a given device.
4357 *	Return true if there one.
4358 *
4359 *	The caller must hold the rtnl_mutex.
4360 */
4361bool netdev_is_rx_handler_busy(struct net_device *dev)
4362{
4363	ASSERT_RTNL();
4364	return dev && rtnl_dereference(dev->rx_handler);
4365}
4366EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4367
4368/**
4369 *	netdev_rx_handler_register - register receive handler
4370 *	@dev: device to register a handler for
4371 *	@rx_handler: receive handler to register
4372 *	@rx_handler_data: data pointer that is used by rx handler
4373 *
4374 *	Register a receive handler for a device. This handler will then be
4375 *	called from __netif_receive_skb. A negative errno code is returned
4376 *	on a failure.
 *
 *	Returns 0 on success, -EBUSY if @dev already has a receive handler,
 *	or -EINVAL if @dev has IFF_NO_RX_HANDLER set in its priv_flags.
4377 *
4378 *	The caller must hold the rtnl_mutex.
4379 *
4380 *	For a general description of rx_handler, see enum rx_handler_result.
4381 */
4382int netdev_rx_handler_register(struct net_device *dev,
4383			       rx_handler_func_t *rx_handler,
4384			       void *rx_handler_data)
4385{
	/* Only one handler per device; this also asserts RTNL is held. */
4386	if (netdev_is_rx_handler_busy(dev))
4387		return -EBUSY;
4388
	/* The device has opted out of receive handlers. */
4389	if (dev->priv_flags & IFF_NO_RX_HANDLER)
4390		return -EINVAL;
4391
4392	/* Note: rx_handler_data must be set before rx_handler */
4393	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4394	rcu_assign_pointer(dev->rx_handler, rx_handler);
4395
4396	return 0;
4397}
4398EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4399
4400/**
4401 *	netdev_rx_handler_unregister - unregister receive handler
4402 *	@dev: device to unregister a handler from
4403 *
4404 *	Unregister a receive handler from a device.
 *	The handler pointer is cleared first and the handler data is
 *	cleared only after synchronize_net() has returned.
4405 *
4406 *	The caller must hold the rtnl_mutex.
4407 */
4408void netdev_rx_handler_unregister(struct net_device *dev)
4409{
4410
4411	ASSERT_RTNL();
4412	RCU_INIT_POINTER(dev->rx_handler, NULL);
4413	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4414	 * section has a guarantee to see a non NULL rx_handler_data
4415	 * as well.
4416	 */
4417	synchronize_net();
4418	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4419}
4420EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4421
4422/*
4423 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4424 * the special handling of PFMEMALLOC skbs.
4425 */
4426static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4427{
4428	switch (skb->protocol) {
4429	case htons(ETH_P_ARP):
4430	case htons(ETH_P_IP):
4431	case htons(ETH_P_IPV6):
4432	case htons(ETH_P_8021Q):
4433	case htons(ETH_P_8021AD):
4434		return true;
4435	default:
4436		return false;
4437	}
4438}
4439
/* Run the netfilter ingress hook on @skb when one is active.
 *
 * A pending delivery in *@pt_prev is flushed (result stored in *@ret) and
 * cleared before the hook runs. Returns the hook's verdict, or 0 when no
 * ingress hook is active or CONFIG_NETFILTER_INGRESS is disabled.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	verdict = nf_hook_ingress(skb);
	rcu_read_unlock();

	return verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4460
/* Core of the receive path: hand @skb to taps, ingress hooks, VLAN
 * handling, the device rx_handler and finally the protocol handlers.
 *
 * @skb:        packet to process
 * @pfmemalloc: when true, taps are skipped and the packet is dropped
 *              unless skb_pfmemalloc_protocol() accepts its protocol
 *
 * Delivery is deferred by one step through pt_prev: each handler found is
 * remembered and the previous one is delivered to via deliver_skb(); the
 * last one is called directly at the end with pt_prev->func().
 *
 * Returns the result of the last delivery, NET_RX_SUCCESS when the
 * rx_handler consumed the packet, or NET_RX_DROP.
 */
4461static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4462{
4463	struct packet_type *ptype, *pt_prev;
4464	rx_handler_func_t *rx_handler;
4465	struct net_device *orig_dev;
4466	bool deliver_exact = false;
4467	int ret = NET_RX_DROP;
4468	__be16 type;
4469
4470	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4471
4472	trace_netif_receive_skb(skb);
4473
4474	orig_dev = skb->dev;
4475
4476	skb_reset_network_header(skb);
4477	if (!skb_transport_header_was_set(skb))
4478		skb_reset_transport_header(skb);
4479	skb_reset_mac_len(skb);
4480
4481	pt_prev = NULL;
4482
	/* Re-entered whenever VLAN handling or the rx_handler asks for the
	 * packet to be processed again (skb->dev may have changed).
	 */
4483another_round:
4484	skb->skb_iif = skb->dev->ifindex;
4485
4486	__this_cpu_inc(softnet_data.processed);
4487
4488	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4489	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4490		skb = skb_vlan_untag(skb);
4491		if (unlikely(!skb))
4492			goto out;
4493	}
4494
4495	if (skb_skip_tc_classify(skb))
4496		goto skip_classify;




4497
4498	if (pfmemalloc)
4499		goto skip_taps;
4500
	/* Global taps first, then taps bound to the current device. */
4501	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4502		if (pt_prev)
4503			ret = deliver_skb(skb, pt_prev, orig_dev);
4504		pt_prev = ptype;
4505	}
4506
4507	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4508		if (pt_prev)
4509			ret = deliver_skb(skb, pt_prev, orig_dev);
4510		pt_prev = ptype;
4511	}
4512
4513skip_taps:
4514#ifdef CONFIG_NET_INGRESS
4515	if (static_key_false(&ingress_needed)) {
4516		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4517		if (!skb)
4518			goto out;
4519
4520		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4521			goto out;
4522	}
4523#endif
4524	skb_reset_tc(skb);
4525skip_classify:


4526	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4527		goto drop;
4528
4529	if (skb_vlan_tag_present(skb)) {
4530		if (pt_prev) {
4531			ret = deliver_skb(skb, pt_prev, orig_dev);
4532			pt_prev = NULL;
4533		}
4534		if (vlan_do_receive(&skb))
4535			goto another_round;
4536		else if (unlikely(!skb))
4537			goto out;
4538	}
4539
4540	rx_handler = rcu_dereference(skb->dev->rx_handler);
4541	if (rx_handler) {
4542		if (pt_prev) {
4543			ret = deliver_skb(skb, pt_prev, orig_dev);
4544			pt_prev = NULL;
4545		}
4546		switch (rx_handler(&skb)) {
4547		case RX_HANDLER_CONSUMED:
4548			ret = NET_RX_SUCCESS;
4549			goto out;
4550		case RX_HANDLER_ANOTHER:
4551			goto another_round;
4552		case RX_HANDLER_EXACT:
4553			deliver_exact = true;
			/* fall through - EXACT continues like PASS */
4554		case RX_HANDLER_PASS:
4555			break;
4556		default:
4557			BUG();
4558		}
4559	}
4560
4561	if (unlikely(skb_vlan_tag_present(skb))) {
4562		if (skb_vlan_tag_get_id(skb))
4563			skb->pkt_type = PACKET_OTHERHOST;
4564		/* Note: we might in the future use prio bits
4565		 * and set skb->priority like in vlan_do_receive()
4566		 * For the time being, just ignore Priority Code Point
4567		 */
4568		skb->vlan_tci = 0;
4569	}
4570
4571	type = skb->protocol;
4572
4573	/* deliver only exact match when indicated */
4574	if (likely(!deliver_exact)) {
4575		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4576				       &ptype_base[ntohs(type) &
4577						   PTYPE_HASH_MASK]);
4578	}
4579
4580	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4581			       &orig_dev->ptype_specific);
4582
4583	if (unlikely(skb->dev != orig_dev)) {
4584		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4585				       &skb->dev->ptype_specific);
4586	}
4587
4588	if (pt_prev) {
4589		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
4590			goto drop;
4591		else
4592			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4593	} else {
		/* The drop label sits inside this else branch; the gotos
		 * above jump straight into it.
		 */
4594drop:
4595		if (!deliver_exact)
4596			atomic_long_inc(&skb->dev->rx_dropped);
4597		else
4598			atomic_long_inc(&skb->dev->rx_nohandler);
4599		kfree_skb(skb);
4600		/* Jamal, now you will not able to escape explaining
4601		 * me how you were going to use this. :-)
4602		 */
4603		ret = NET_RX_DROP;
4604	}
4605
4606out:
4607	return ret;
4608}
4609
4610/**
4611 *	netif_receive_skb_core - special purpose version of netif_receive_skb
4612 *	@skb: buffer to process
4613 *
4614 *	More direct receive version of netif_receive_skb().  It should
4615 *	only be used by callers that have a need to skip RPS and Generic XDP.
4616 *	Caller must also take care of handling if (page_is_)pfmemalloc.
4617 *
4618 *	This function may only be called from softirq context and interrupts
4619 *	should be enabled.
4620 *
4621 *	Return values (usually ignored):
4622 *	NET_RX_SUCCESS: no congestion
4623 *	NET_RX_DROP: packet was dropped
4624 */
4625int netif_receive_skb_core(struct sk_buff *skb)
4626{
4627	int ret;
4628
4629	rcu_read_lock();
4630	ret = __netif_receive_skb_core(skb, false);
4631	rcu_read_unlock();
4632
4633	return ret;
4634}
4635EXPORT_SYMBOL(netif_receive_skb_core);
4636
4637static int __netif_receive_skb(struct sk_buff *skb)
4638{
4639	int ret;
4640
4641	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4642		unsigned int noreclaim_flag;
4643
4644		/*
4645		 * PFMEMALLOC skbs are special, they should
4646		 * - be delivered to SOCK_MEMALLOC sockets only
4647		 * - stay away from userspace
4648		 * - have bounded memory usage
4649		 *
4650		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4651		 * context down to all allocation sites.
4652		 */
4653		noreclaim_flag = memalloc_noreclaim_save();
4654		ret = __netif_receive_skb_core(skb, true);
4655		memalloc_noreclaim_restore(noreclaim_flag);
4656	} else
4657		ret = __netif_receive_skb_core(skb, false);
4658
4659	return ret;
4660}
4661
4662static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
4663{
4664	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
4665	struct bpf_prog *new = xdp->prog;
4666	int ret = 0;
4667
4668	switch (xdp->command) {
4669	case XDP_SETUP_PROG:
4670		rcu_assign_pointer(dev->xdp_prog, new);
4671		if (old)
4672			bpf_prog_put(old);
4673
4674		if (old && !new) {
4675			static_key_slow_dec(&generic_xdp_needed);
4676		} else if (new && !old) {
4677			static_key_slow_inc(&generic_xdp_needed);
4678			dev_disable_lro(dev);
4679			dev_disable_gro_hw(dev);
4680		}
4681		break;
4682
4683	case XDP_QUERY_PROG:
4684		xdp->prog_attached = !!old;
4685		xdp->prog_id = old ? old->aux->id : 0;
4686		break;
4687
4688	default:
4689		ret = -EINVAL;
4690		break;
4691	}
4692
4693	return ret;
4694}
4695
4696static int netif_receive_skb_internal(struct sk_buff *skb)
4697{
4698	int ret;
4699
4700	net_timestamp_check(netdev_tstamp_prequeue, skb);
4701
4702	if (skb_defer_rx_timestamp(skb))
4703		return NET_RX_SUCCESS;
4704
4705	if (static_key_false(&generic_xdp_needed)) {
4706		int ret;
4707
4708		preempt_disable();
4709		rcu_read_lock();
4710		ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4711		rcu_read_unlock();
4712		preempt_enable();
4713
4714		if (ret != XDP_PASS)
4715			return NET_RX_DROP;
4716	}
4717
4718	rcu_read_lock();
 
4719#ifdef CONFIG_RPS
4720	if (static_key_false(&rps_needed)) {
4721		struct rps_dev_flow voidflow, *rflow = &voidflow;
4722		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4723
4724		if (cpu >= 0) {
4725			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4726			rcu_read_unlock();
4727			return ret;
4728		}
4729	}
4730#endif
4731	ret = __netif_receive_skb(skb);
4732	rcu_read_unlock();
4733	return ret;
4734}
4735
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for received data. Emits the entry tracepoint and
 *	hands @skb to netif_receive_skb_internal(). The buffer is always
 *	taken over by the stack, although it may be dropped during
 *	processing for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
4758
/* One work item per CPU; flush_all_backlogs() queues them to run
 * flush_backlog().
 */
4759DEFINE_PER_CPU(struct work_struct, flush_works);
4760
4761/* Network device is going away, flush any packets still pending.
 *
 * Walks both backlog queues of the CPU this runs on and frees every skb
 * whose device is in NETREG_UNREGISTERING state. input_pkt_queue is
 * walked with interrupts disabled and rps_lock() held; process_queue is
 * walked with only bottom halves disabled.
 */
4762static void flush_backlog(struct work_struct *work)
4763{
4764	struct sk_buff *skb, *tmp;
4765	struct softnet_data *sd;
4766
4767	local_bh_disable();
4768	sd = this_cpu_ptr(&softnet_data);
4769
4770	local_irq_disable();
4771	rps_lock(sd);
4772	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4773		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4774			__skb_unlink(skb, &sd->input_pkt_queue);
4775			kfree_skb(skb);
			/* Account the skb as having left the input queue. */
4776			input_queue_head_incr(sd);
4777		}
4778	}
4779	rps_unlock(sd);
4780	local_irq_enable();
4781
4782	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4783		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4784			__skb_unlink(skb, &sd->process_queue);
4785			kfree_skb(skb);
4786			input_queue_head_incr(sd);
4787		}
4788	}
4789	local_bh_enable();
4790}
4791
/* Flush the backlog queues of every online CPU.
 *
 * The per-CPU flush_works item is first queued on each online CPU (on
 * system_highpri_wq) and only then is each one waited for, so the work
 * items are all queued before any wait begins. The whole sequence is
 * bracketed by get_online_cpus()/put_online_cpus().
 */
4792static void flush_all_backlogs(void)
4793{
4794	unsigned int cpu;
4795
4796	get_online_cpus();
4797
4798	for_each_online_cpu(cpu)
4799		queue_work_on(cpu, system_highpri_wq,
4800			      per_cpu_ptr(&flush_works, cpu));
4801
4802	for_each_online_cpu(cpu)
4803		flush_work(per_cpu_ptr(&flush_works, cpu));
4804
4805	put_online_cpus();
4806}
4807
/* Finish GRO processing of @skb and pass it up the stack.
 *
 * If @skb holds a single packet (count == 1) its gso_size is cleared and
 * no protocol callback is run. Otherwise the first offload handler whose
 * type matches skb->protocol and which provides gro_complete is called.
 * If that fails, or no handler is found, the skb is freed and
 * NET_RX_SUCCESS is returned; otherwise the result of
 * netif_receive_skb_internal() is returned.
 */
4808static int napi_gro_complete(struct sk_buff *skb)
4809{
4810	struct packet_offload *ptype;
4811	__be16 type = skb->protocol;
4812	struct list_head *head = &offload_base;
4813	int err = -ENOENT;
4814
4815	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4816
4817	if (NAPI_GRO_CB(skb)->count == 1) {
4818		skb_shinfo(skb)->gso_size = 0;
4819		goto out;
4820	}
4821
4822	rcu_read_lock();
4823	list_for_each_entry_rcu(ptype, head, list) {
4824		if (ptype->type != type || !ptype->callbacks.gro_complete)
4825			continue;
4826
4827		err = ptype->callbacks.gro_complete(skb, 0);
4828		break;
4829	}
4830	rcu_read_unlock();
4831
4832	if (err) {
		/* Warn if the loop ran off the end, i.e. no handler matched. */
4833		WARN_ON(&ptype->list == head);
4834		kfree_skb(skb);
4835		return NET_RX_SUCCESS;
4836	}
4837
4838out:
4839	return netif_receive_skb_internal(skb);
4840}
4841
4842/* napi->gro_list contains packets ordered by age.
4843 * youngest packets at the head of it.
4844 * Complete skbs in reverse order to reduce latencies.
 *
 * @napi:      NAPI context whose gro_list is flushed
 * @flush_old: when true, stop at the first skb (walking oldest first)
 *             whose age equals the current jiffies; that skb and all
 *             younger ones stay on the list
4845 */
4846void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4847{
4848	struct sk_buff *skb, *prev = NULL;
4849
4850	/* scan list and build reverse chain */
4851	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4852		skb->prev = prev;
4853		prev = skb;
4854	}
4855
	/* Walk from the tail (oldest) towards the head via ->prev. */
4856	for (skb = prev; skb; skb = prev) {
		/* skb becomes the new list tail if we stop here. */
4857		skb->next = NULL;
4858
4859		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4860			return;
4861
4862		prev = skb->prev;
4863		napi_gro_complete(skb);
4864		napi->gro_count--;
4865	}
4866
4867	napi->gro_list = NULL;
4868}
4869EXPORT_SYMBOL(napi_gro_flush);
4870
/* Prepare every skb held on napi->gro_list for matching against @skb.
 *
 * For each held skb the flush flag is cleared and same_flow is set to
 * whether it may belong to the same flow as @skb: equal raw hash, device,
 * vlan_tci, metadata dst, metadata and MAC header.
 */
4871static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4872{
4873	struct sk_buff *p;
4874	unsigned int maclen = skb->dev->hard_header_len;
4875	u32 hash = skb_get_hash_raw(skb);
4876
4877	for (p = napi->gro_list; p; p = p->next) {
4878		unsigned long diffs;
4879
4880		NAPI_GRO_CB(p)->flush = 0;
4881
		/* Different hash: cannot be the same flow, skip the rest. */
4882		if (hash != skb_get_hash_raw(p)) {
4883			NAPI_GRO_CB(p)->same_flow = 0;
4884			continue;
4885		}
4886
		/* Any non-zero bit accumulated in diffs means "different". */
4887		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4888		diffs |= p->vlan_tci ^ skb->vlan_tci;
4889		diffs |= skb_metadata_dst_cmp(p, skb);
4890		diffs |= skb_metadata_differs(p, skb);
4891		if (maclen == ETH_HLEN)
4892			diffs |= compare_ether_header(skb_mac_header(p),
4893						      skb_mac_header(skb));
		/* Other header lengths: memcmp only if still matching. */
4894		else if (!diffs)
4895			diffs = memcmp(skb_mac_header(p),
4896				       skb_mac_header(skb),
4897				       maclen);
4898		NAPI_GRO_CB(p)->same_flow = !diffs;
4899	}
4900}
4901
4902static void skb_gro_reset_offset(struct sk_buff *skb)
4903{
4904	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4905	const skb_frag_t *frag0 = &pinfo->frags[0];
4906
4907	NAPI_GRO_CB(skb)->data_offset = 0;
4908	NAPI_GRO_CB(skb)->frag0 = NULL;
4909	NAPI_GRO_CB(skb)->frag0_len = 0;
4910
4911	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4912	    pinfo->nr_frags &&
4913	    !PageHighMem(skb_frag_page(frag0))) {
4914		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4915		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4916						    skb_frag_size(frag0),
4917						    skb->end - skb->tail);
4918	}
4919}
4920
/* Move the first @grow bytes of frag0 into the linear area of @skb.
 *
 * The bytes are copied to the tail pointer, the linear/paged accounting
 * is updated and fragment 0 is advanced by @grow. If fragment 0 becomes
 * empty its reference is dropped and the remaining fragments are shifted
 * down by one. BUGs if the linear area lacks @grow bytes of room.
 */
4921static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4922{
4923	struct skb_shared_info *pinfo = skb_shinfo(skb);
4924
4925	BUG_ON(skb->end - skb->tail < grow);
4926
4927	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4928
4929	skb->data_len -= grow;
4930	skb->tail += grow;
4931
4932	pinfo->frags[0].page_offset += grow;
4933	skb_frag_size_sub(&pinfo->frags[0], grow);
4934
4935	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		/* After this unref, pointers into frag 0 must not be used. */
4936		skb_frag_unref(skb, 0);
4937		memmove(pinfo->frags, pinfo->frags + 1,
4938			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4939	}
4940}
4941
/* Try to aggregate @skb with the packets held on napi->gro_list.
 *
 * Returns:
 *   GRO_NORMAL      - not aggregated; caller passes the skb up normally
 *   GRO_HELD        - skb was added to napi->gro_list as a new flow
 *   GRO_MERGED      - skb was merged into a held skb
 *   GRO_MERGED_FREE - as GRO_MERGED, and the cb's free field is set
 *   GRO_CONSUMED    - the gro_receive callback returned -EINPROGRESS
 *
 * Except for GRO_CONSUMED and the same-flow merge case, the part of the
 * headers parsed so far that still lives in frag0 is pulled into the
 * linear area before returning (label "pull").
 */
4942static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4943{
4944	struct sk_buff **pp = NULL;
4945	struct packet_offload *ptype;
4946	__be16 type = skb->protocol;
4947	struct list_head *head = &offload_base;
4948	int same_flow;
4949	enum gro_result ret;
4950	int grow;
4951
4952	if (netif_elide_gro(skb->dev))



4953		goto normal;
4954
4955	gro_list_prepare(napi, skb);
4956
4957	rcu_read_lock();
4958	list_for_each_entry_rcu(ptype, head, list) {
4959		if (ptype->type != type || !ptype->callbacks.gro_receive)
4960			continue;
4961
		/* Initialise the per-skb GRO control block for this pass. */
4962		skb_set_network_header(skb, skb_gro_offset(skb));
4963		skb_reset_mac_len(skb);
4964		NAPI_GRO_CB(skb)->same_flow = 0;
4965		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4966		NAPI_GRO_CB(skb)->free = 0;
4967		NAPI_GRO_CB(skb)->encap_mark = 0;
4968		NAPI_GRO_CB(skb)->recursion_counter = 0;
4969		NAPI_GRO_CB(skb)->is_fou = 0;
4970		NAPI_GRO_CB(skb)->is_atomic = 1;
4971		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4972
4973		/* Setup for GRO checksum validation */
4974		switch (skb->ip_summed) {
4975		case CHECKSUM_COMPLETE:
4976			NAPI_GRO_CB(skb)->csum = skb->csum;
4977			NAPI_GRO_CB(skb)->csum_valid = 1;
4978			NAPI_GRO_CB(skb)->csum_cnt = 0;
4979			break;
4980		case CHECKSUM_UNNECESSARY:
4981			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4982			NAPI_GRO_CB(skb)->csum_valid = 0;
4983			break;
4984		default:
4985			NAPI_GRO_CB(skb)->csum_cnt = 0;
4986			NAPI_GRO_CB(skb)->csum_valid = 0;
4987		}
4988
4989		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4990		break;
4991	}
4992	rcu_read_unlock();
4993
	/* Loop ran off the end: no offload handler for this protocol. */
4994	if (&ptype->list == head)
4995		goto normal;
4996
4997	if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
4998		ret = GRO_CONSUMED;
4999		goto ok;
5000	}
5001
5002	same_flow = NAPI_GRO_CB(skb)->same_flow;
5003	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5004
	/* The callback asked for one held skb to be completed now:
	 * unlink it from the list and send it up.
	 */
5005	if (pp) {
5006		struct sk_buff *nskb = *pp;
5007
5008		*pp = nskb->next;
5009		nskb->next = NULL;
5010		napi_gro_complete(nskb);
5011		napi->gro_count--;
5012	}
5013
5014	if (same_flow)
5015		goto ok;
5016
5017	if (NAPI_GRO_CB(skb)->flush)
5018		goto normal;
5019
5020	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
5021		struct sk_buff *nskb = napi->gro_list;
5022
5023		/* locate the end of the list to select the 'oldest' flow */
5024		while (nskb->next) {
5025			pp = &nskb->next;
5026			nskb = *pp;
5027		}
5028		*pp = NULL;
5029		nskb->next = NULL;
5030		napi_gro_complete(nskb);
5031	} else {
5032		napi->gro_count++;
5033	}
	/* Hold skb as the head of a new flow at the front of the list. */
5034	NAPI_GRO_CB(skb)->count = 1;
5035	NAPI_GRO_CB(skb)->age = jiffies;
5036	NAPI_GRO_CB(skb)->last = skb;
5037	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5038	skb->next = napi->gro_list;
5039	napi->gro_list = skb;
5040	ret = GRO_HELD;
5041
5042pull:
5043	grow = skb_gro_offset(skb) - skb_headlen(skb);
5044	if (grow > 0)
5045		gro_pull_from_frag0(skb, grow);
5046ok:
5047	return ret;
5048
5049normal:
5050	ret = GRO_NORMAL;
5051	goto pull;
5052}
5053
5054struct packet_offload *gro_find_receive_by_type(__be16 type)
5055{
5056	struct list_head *offload_head = &offload_base;
5057	struct packet_offload *ptype;
5058
5059	list_for_each_entry_rcu(ptype, offload_head, list) {
5060		if (ptype->type != type || !ptype->callbacks.gro_receive)
5061			continue;
5062		return ptype;
5063	}
5064	return NULL;
5065}
5066EXPORT_SYMBOL(gro_find_receive_by_type);
5067
5068struct packet_offload *gro_find_complete_by_type(__be16 type)
5069{
5070	struct list_head *offload_head = &offload_base;
5071	struct packet_offload *ptype;
5072
5073	list_for_each_entry_rcu(ptype, offload_head, list) {
5074		if (ptype->type != type || !ptype->callbacks.gro_complete)
5075			continue;
5076		return ptype;
5077	}
5078	return NULL;
5079}
5080EXPORT_SYMBOL(gro_find_complete_by_type);
5081
/* Free only the sk_buff head of @skb: drop its dst and secpath, then
 * return the structure itself to skbuff_head_cache. Called when the GRO
 * cb's free field is NAPI_GRO_FREE_STOLEN_HEAD.
 */
5082static void napi_skb_free_stolen_head(struct sk_buff *skb)
5083{
5084	skb_dst_drop(skb);
5085	secpath_reset(skb);
5086	kmem_cache_free(skbuff_head_cache, skb);
5087}
5088
5089static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5090{
5091	switch (ret) {
5092	case GRO_NORMAL:
5093		if (netif_receive_skb_internal(skb))
5094			ret = GRO_DROP;
5095		break;
5096
5097	case GRO_DROP:
5098		kfree_skb(skb);
5099		break;
5100
5101	case GRO_MERGED_FREE:
5102		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5103			napi_skb_free_stolen_head(skb);
5104		else
 
5105			__kfree_skb(skb);
 
5106		break;
5107
5108	case GRO_HELD:
5109	case GRO_MERGED:
5110	case GRO_CONSUMED:
5111		break;
5112	}
5113
5114	return ret;
5115}
5116
5117gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5118{
5119	skb_mark_napi_id(skb, napi);
5120	trace_napi_gro_receive_entry(skb);
5121
5122	skb_gro_reset_offset(skb);
5123
5124	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
5125}
5126EXPORT_SYMBOL(napi_gro_receive);
5127
5128static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5129{
5130	if (unlikely(skb->pfmemalloc)) {
5131		consume_skb(skb);
5132		return;
5133	}
5134	__skb_pull(skb, skb_headlen(skb));
5135	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
5136	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5137	skb->vlan_tci = 0;
5138	skb->dev = napi->dev;
5139	skb->skb_iif = 0;
5140	skb->encapsulation = 0;
5141	skb_shinfo(skb)->gso_type = 0;
5142	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5143	secpath_reset(skb);
5144
5145	napi->skb = skb;
5146}
5147
5148struct sk_buff *napi_get_frags(struct napi_struct *napi)
5149{
5150	struct sk_buff *skb = napi->skb;
5151
5152	if (!skb) {
5153		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5154		if (skb) {
5155			napi->skb = skb;
5156			skb_mark_napi_id(skb, napi);
5157		}
5158	}
5159	return skb;
5160}
5161EXPORT_SYMBOL(napi_get_frags);
5162
/* Act on the verdict @ret that dev_gro_receive() produced for the frags
 * skb @skb of @napi.
 *
 * GRO_NORMAL / GRO_HELD: push the Ethernet header back and let
 *     eth_type_trans() set the final protocol; for GRO_NORMAL the skb is
 *     then passed up, and a non-zero result turns into GRO_DROP.
 * GRO_DROP: recycle the skb into napi->skb.
 * GRO_MERGED_FREE: free just the head if it was stolen, else recycle.
 * GRO_MERGED / GRO_CONSUMED: nothing to do.
 *
 * Returns the (possibly updated) verdict.
 */
5163static gro_result_t napi_frags_finish(struct napi_struct *napi,
5164				      struct sk_buff *skb,
5165				      gro_result_t ret)
5166{
5167	switch (ret) {
5168	case GRO_NORMAL:
5169	case GRO_HELD:
5170		__skb_push(skb, ETH_HLEN);
5171		skb->protocol = eth_type_trans(skb, skb->dev);
5172		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
5173			ret = GRO_DROP;
5174		break;
5175
5176	case GRO_DROP:
5177		napi_reuse_skb(napi, skb);
5178		break;
5179
5180	case GRO_MERGED_FREE:
5181		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5182			napi_skb_free_stolen_head(skb);
5183		else
5184			napi_reuse_skb(napi, skb);
5185		break;
5186
5187	case GRO_MERGED:
5188	case GRO_CONSUMED:
5189		break;
5190	}
5191
5192	return ret;
5193}
5194
5195/* Upper GRO stack assumes network header starts at gro_offset=0
5196 * Drivers could call both napi_gro_frags() and napi_gro_receive()
5197 * We copy ethernet header into skb->data to have a common layout.
5198 */
5199static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5200{
5201	struct sk_buff *skb = napi->skb;
5202	const struct ethhdr *eth;
5203	unsigned int hlen = sizeof(*eth);
5204
5205	napi->skb = NULL;
5206
5207	skb_reset_mac_header(skb);
5208	skb_gro_reset_offset(skb);
5209
5210	eth = skb_gro_header_fast(skb, 0);
5211	if (unlikely(skb_gro_header_hard(skb, hlen))) {
5212		eth = skb_gro_header_slow(skb, hlen, 0);
5213		if (unlikely(!eth)) {
5214			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5215					     __func__, napi->dev->name);
5216			napi_reuse_skb(napi, skb);
5217			return NULL;
5218		}
5219	} else {
5220		gro_pull_from_frag0(skb, hlen);
5221		NAPI_GRO_CB(skb)->frag0 += hlen;
5222		NAPI_GRO_CB(skb)->frag0_len -= hlen;
5223	}
5224	__skb_pull(skb, hlen);
5225
5226	/*
5227	 * This works because the only protocols we care about don't require
5228	 * special handling.
5229	 * We'll fix it up properly in napi_frags_finish()
5230	 */
5231	skb->protocol = eth->h_proto;
5232
5233	return skb;
5234}
5235
5236gro_result_t napi_gro_frags(struct napi_struct *napi)
5237{
5238	struct sk_buff *skb = napi_frags_skb(napi);
5239
5240	if (!skb)
5241		return GRO_DROP;
5242
5243	trace_napi_gro_frags_entry(skb);
5244
5245	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5246}
5247EXPORT_SYMBOL(napi_gro_frags);
5248
5249/* Compute the checksum from gro_offset and return the folded value
5250 * after adding in any pseudo checksum.
5251 */
5252__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
5253{
5254	__wsum wsum;
5255	__sum16 sum;
5256
5257	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
5258
5259	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
5260	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
5261	if (likely(!sum)) {
5262		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
5263		    !skb->csum_complete_sw)
5264			netdev_rx_csum_fault(skb->dev);
5265	}
5266
5267	NAPI_GRO_CB(skb)->csum = wsum;
5268	NAPI_GRO_CB(skb)->csum_valid = 1;
5269
5270	return sum;
5271}
5272EXPORT_SYMBOL(__skb_gro_checksum_complete);
5273
/* Walk the rps_ipi_next chain starting at @remsd and fire the async
 * cross-CPU call for every entry whose CPU is online.
 * Does nothing without CONFIG_RPS.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
	struct softnet_data *next;

	for (; remsd; remsd = next) {
		/* Fetch the link before the target CPU may touch remsd. */
		next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu, &remsd->csd);
	}
#endif
}
5286
/*
 * net_rps_action_and_irq_enable - send any pending RPS IPIs.
 * Note: must be entered with local irqs disabled; always returns with
 * local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		/* Detach the list while irqs are still off. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		net_rps_send_ipi(pending);
		return;
	}
#endif
	local_irq_enable();
}
5307
5308static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5309{
5310#ifdef CONFIG_RPS
5311	return sd->rps_ipi_list != NULL;
5312#else
5313	return false;
5314#endif
5315}
5316
/* Poll handler for the backlog napi embedded in softnet_data.
 *
 * Processes up to @quota packets from sd->process_queue, refilling it
 * from sd->input_pkt_queue (under rps_lock with irqs off) whenever it
 * runs dry. When the input queue is found empty the napi state is
 * cleared and the loop ends.
 *
 * Returns the number of packets processed.
 */
5317static int process_backlog(struct napi_struct *napi, int quota)
5318{
5319	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5320	bool again = true;
5321	int work = 0;
5322
5323	/* Check if we have pending ipi, its better to send them now,
5324	 * not waiting net_rx_action() end.
5325	 */
5326	if (sd_has_rps_ipi_waiting(sd)) {
5327		local_irq_disable();
5328		net_rps_action_and_irq_enable(sd);
5329	}
5330
5331	napi->weight = dev_rx_weight;
5332	while (again) {
5333		struct sk_buff *skb;
5334
5335		while ((skb = __skb_dequeue(&sd->process_queue))) {
5336			rcu_read_lock();
5337			__netif_receive_skb(skb);
5338			rcu_read_unlock();
5339			input_queue_head_incr(sd);
			/* Quota used up: return without clearing napi->state. */
5340			if (++work >= quota)
5341				return work;
5342
5343		}
5344
5345		local_irq_disable();
5346		rps_lock(sd);
5347		if (skb_queue_empty(&sd->input_pkt_queue)) {
5348			/*
5349			 * Inline a custom version of __napi_complete().
5350			 * only current cpu owns and manipulates this napi,
5351			 * and NAPI_STATE_SCHED is the only possible flag set
5352			 * on backlog.
5353			 * We can use a plain write instead of clear_bit(),
5354			 * and we dont need an smp_mb() memory barrier.
5355			 */
5356			napi->state = 0;
5357			again = false;
5358		} else {
			/* Move everything that arrived meanwhile to the
			 * process queue and go around again.
			 */
5359			skb_queue_splice_tail_init(&sd->input_pkt_queue,
5360						   &sd->process_queue);
5361		}
5362		rps_unlock(sd);
5363		local_irq_enable();
5364	}
5365
5366	return work;
5367}
5368
5369/**
5370 * __napi_schedule - schedule for receive
5371 * @n: entry to schedule
5372 *
5373 * The entry's receive function will be scheduled to run.
5374 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 *
 * Local interrupts are saved and disabled around the call to
 * ____napi_schedule() on this CPU's softnet_data, then restored.
5375 */
5376void __napi_schedule(struct napi_struct *n)
5377{
5378	unsigned long flags;
5379
5380	local_irq_save(flags);
5381	____napi_schedule(this_cpu_ptr(&softnet_data), n);
5382	local_irq_restore(flags);
5383}
5384EXPORT_SYMBOL(__napi_schedule);
5385
5386/**
5387 *	napi_schedule_prep - check if napi can be scheduled
5388 *	@n: napi context
5389 *
5390 * Test if NAPI routine is already running, and if not mark
5391 * it as running.  This is used as a condition variable to
5392 * ensure only one NAPI poll instance runs.  We also make
5393 * sure there is no pending NAPI disable.
 *
 * Returns true if NAPIF_STATE_SCHED was not set before and has now been
 * set by this call. Returns false if a disable is pending, or if SCHED
 * was already set (in which case NAPIF_STATE_MISSED is set as well).
5394 */
5395bool napi_schedule_prep(struct napi_struct *n)
5396{
5397	unsigned long val, new;
5398
	/* Retry until the state is updated atomically from val to new. */
5399	do {
5400		val = READ_ONCE(n->state);
5401		if (unlikely(val & NAPIF_STATE_DISABLE))
5402			return false;
5403		new = val | NAPIF_STATE_SCHED;
5404
5405		/* Sets STATE_MISSED bit if STATE_SCHED was already set
5406		 * This was suggested by Alexander Duyck, as compiler
5407		 * emits better code than :
5408		 * if (val & NAPIF_STATE_SCHED)
5409		 *     new |= NAPIF_STATE_MISSED;
5410		 */
5411		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
5412						   NAPIF_STATE_MISSED;
5413	} while (cmpxchg(&n->state, val, new) != val);
5414
5415	return !(val & NAPIF_STATE_SCHED);
5416}
5417EXPORT_SYMBOL(napi_schedule_prep);
5418
5419/**
5420 * __napi_schedule_irqoff - schedule for receive
5421 * @n: entry to schedule
5422 *
5423 * Variant of __napi_schedule() assuming hard irqs are masked.
 * Unlike __napi_schedule() it does not save or disable interrupts itself
 * before calling ____napi_schedule().
5424 */
5425void __napi_schedule_irqoff(struct napi_struct *n)
5426{
5427	____napi_schedule(this_cpu_ptr(&softnet_data), n);
5428}
5429EXPORT_SYMBOL(__napi_schedule_irqoff);
5430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/**
 * napi_complete_done - mark a NAPI poll as finished
 * @n: napi context
 * @work_done: amount of work done by the poll that just finished
 *
 * Flushes held GRO packets (or arms the GRO flush timer when work was
 * done and the device has a gro_flush_timeout), removes @n from the poll
 * list if it is on one, and clears NAPIF_STATE_SCHED.
 *
 * Returns false, without completing, if @n is owned by netpoll
 * (NAPIF_STATE_NPSVC) or is being busy polled. Also returns false if
 * NAPIF_STATE_MISSED was set, in which case @n is rescheduled. Returns
 * true otherwise.
 */
5431bool napi_complete_done(struct napi_struct *n, int work_done)
5432{
5433	unsigned long flags, val, new;
5434
5435	/*
5436	 * 1) Don't let napi dequeue from the cpu poll list
5437	 *    just in case its running on a different cpu.
5438	 * 2) If we are busy polling, do nothing here, we have
5439	 *    the guarantee we will be called later.
5440	 */
5441	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
5442				 NAPIF_STATE_IN_BUSY_POLL)))
5443		return false;
5444
5445	if (n->gro_list) {
5446		unsigned long timeout = 0;
5447
		/* Only defer the flush if this poll actually did work. */
5448		if (work_done)
5449			timeout = n->dev->gro_flush_timeout;
5450
5451		if (timeout)
5452			hrtimer_start(&n->timer, ns_to_ktime(timeout),
5453				      HRTIMER_MODE_REL_PINNED);
5454		else
5455			napi_gro_flush(n, false);
5456	}
5457	if (unlikely(!list_empty(&n->poll_list))) {
5458		/* If n->poll_list is not empty, we need to mask irqs */
5459		local_irq_save(flags);
5460		list_del_init(&n->poll_list);
5461		local_irq_restore(flags);
5462	}
5463
5464	do {
5465		val = READ_ONCE(n->state);
5466
5467		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5468
5469		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5470
5471		/* If STATE_MISSED was set, leave STATE_SCHED set,
5472		 * because we will call napi->poll() one more time.
5473		 * This C code was suggested by Alexander Duyck to help gcc.
5474		 */
5475		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5476						    NAPIF_STATE_SCHED;
5477	} while (cmpxchg(&n->state, val, new) != val);
5478
5479	if (unlikely(val & NAPIF_STATE_MISSED)) {
5480		__napi_schedule(n);
5481		return false;
5482	}
5483
5484	return true;
5485}
5486EXPORT_SYMBOL(napi_complete_done);
5487
5488/* must be called under rcu_read_lock(), as we dont take a reference */
5489static struct napi_struct *napi_by_id(unsigned int napi_id)
5490{
5491	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5492	struct napi_struct *napi;
5493
5494	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5495		if (napi->napi_id == napi_id)
5496			return napi;
5497
5498	return NULL;
5499}
5500
5501#if defined(CONFIG_NET_RX_BUSY_POLL)
5502
5503#define BUSY_POLL_BUDGET 8
5504
5505static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5506{
5507	int rc;
5508
5509	/* Busy polling means there is a high chance device driver hard irq
5510	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5511	 * set in napi_schedule_prep().
5512	 * Since we are about to call napi->poll() once more, we can safely
5513	 * clear NAPI_STATE_MISSED.
5514	 *
5515	 * Note: x86 could use a single "lock and ..." instruction
5516	 * to perform these two clear_bit()
5517	 */
5518	clear_bit(NAPI_STATE_MISSED, &napi->state);
5519	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5520
5521	local_bh_disable();
5522
5523	/* All we really want here is to re-enable device interrupts.
5524	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5525	 */
5526	rc = napi->poll(napi, BUSY_POLL_BUDGET);
5527	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5528	netpoll_poll_unlock(have_poll_lock);
5529	if (rc == BUSY_POLL_BUDGET)
5530		__napi_schedule(napi);
5531	local_bh_enable();
 
 
5532}
5533
5534void napi_busy_loop(unsigned int napi_id,
5535		    bool (*loop_end)(void *, unsigned long),
5536		    void *loop_end_arg)
5537{
5538	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
5539	int (*napi_poll)(struct napi_struct *napi, int budget);
 
5540	void *have_poll_lock = NULL;
5541	struct napi_struct *napi;
 
5542
5543restart:
 
5544	napi_poll = NULL;
5545
5546	rcu_read_lock();
5547
5548	napi = napi_by_id(napi_id);
5549	if (!napi)
5550		goto out;
5551
 
 
 
5552	preempt_disable();
5553	for (;;) {
5554		int work = 0;
5555
5556		local_bh_disable();
 
 
 
 
5557		if (!napi_poll) {
5558			unsigned long val = READ_ONCE(napi->state);
5559
5560			/* If multiple threads are competing for this napi,
5561			 * we avoid dirtying napi->state as much as we can.
5562			 */
5563			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5564				   NAPIF_STATE_IN_BUSY_POLL))
5565				goto count;
5566			if (cmpxchg(&napi->state, val,
5567				    val | NAPIF_STATE_IN_BUSY_POLL |
5568					  NAPIF_STATE_SCHED) != val)
5569				goto count;
5570			have_poll_lock = netpoll_poll_lock(napi);
5571			napi_poll = napi->poll;
5572		}
5573		work = napi_poll(napi, BUSY_POLL_BUDGET);
5574		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
5575count:
5576		if (work > 0)
5577			__NET_ADD_STATS(dev_net(napi->dev),
5578					LINUX_MIB_BUSYPOLLRXPACKETS, work);
5579		local_bh_enable();
5580
5581		if (!loop_end || loop_end(loop_end_arg, start_time))
 
 
 
 
5582			break;
5583
5584		if (unlikely(need_resched())) {
5585			if (napi_poll)
5586				busy_poll_stop(napi, have_poll_lock);
5587			preempt_enable();
5588			rcu_read_unlock();
5589			cond_resched();
5590			if (loop_end(loop_end_arg, start_time))
5591				return;
 
5592			goto restart;
5593		}
5594		cpu_relax();
5595	}
5596	if (napi_poll)
5597		busy_poll_stop(napi, have_poll_lock);
5598	preempt_enable();
 
5599out:
5600	rcu_read_unlock();
 
5601}
5602EXPORT_SYMBOL(napi_busy_loop);
5603
5604#endif /* CONFIG_NET_RX_BUSY_POLL */
5605
5606static void napi_hash_add(struct napi_struct *napi)
5607{
5608	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5609	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5610		return;
5611
5612	spin_lock(&napi_hash_lock);
5613
5614	/* 0..NR_CPUS range is reserved for sender_cpu use */
5615	do {
5616		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
5617			napi_gen_id = MIN_NAPI_ID;
5618	} while (napi_by_id(napi_gen_id));
5619	napi->napi_id = napi_gen_id;
5620
5621	hlist_add_head_rcu(&napi->napi_hash_node,
5622			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5623
5624	spin_unlock(&napi_hash_lock);
5625}
5626
5627/* Warning : caller is responsible to make sure rcu grace period
5628 * is respected before freeing memory containing @napi
5629 */
5630bool napi_hash_del(struct napi_struct *napi)
5631{
5632	bool rcu_sync_needed = false;
5633
5634	spin_lock(&napi_hash_lock);
5635
5636	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5637		rcu_sync_needed = true;
5638		hlist_del_rcu(&napi->napi_hash_node);
5639	}
5640	spin_unlock(&napi_hash_lock);
5641	return rcu_sync_needed;
5642}
5643EXPORT_SYMBOL_GPL(napi_hash_del);
5644
5645static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5646{
5647	struct napi_struct *napi;
5648
5649	napi = container_of(timer, struct napi_struct, timer);
5650
5651	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
5652	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5653	 */
5654	if (napi->gro_list && !napi_disable_pending(napi) &&
5655	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5656		__napi_schedule_irqoff(napi);
5657
5658	return HRTIMER_NORESTART;
5659}
5660
5661void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5662		    int (*poll)(struct napi_struct *, int), int weight)
5663{
5664	INIT_LIST_HEAD(&napi->poll_list);
5665	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5666	napi->timer.function = napi_watchdog;
5667	napi->gro_count = 0;
5668	napi->gro_list = NULL;
5669	napi->skb = NULL;
5670	napi->poll = poll;
5671	if (weight > NAPI_POLL_WEIGHT)
5672		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5673			    weight, dev->name);
5674	napi->weight = weight;
5675	list_add(&napi->dev_list, &dev->napi_list);
5676	napi->dev = dev;
5677#ifdef CONFIG_NETPOLL
5678	napi->poll_owner = -1;
5679#endif
5680	set_bit(NAPI_STATE_SCHED, &napi->state);
5681	napi_hash_add(napi);
5682}
5683EXPORT_SYMBOL(netif_napi_add);
5684
5685void napi_disable(struct napi_struct *n)
5686{
5687	might_sleep();
5688	set_bit(NAPI_STATE_DISABLE, &n->state);
5689
5690	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5691		msleep(1);
5692	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5693		msleep(1);
5694
5695	hrtimer_cancel(&n->timer);
5696
5697	clear_bit(NAPI_STATE_DISABLE, &n->state);
5698}
5699EXPORT_SYMBOL(napi_disable);
5700
5701/* Must be called in process context */
5702void netif_napi_del(struct napi_struct *napi)
5703{
5704	might_sleep();
5705	if (napi_hash_del(napi))
5706		synchronize_net();
5707	list_del_init(&napi->dev_list);
5708	napi_free_frags(napi);
5709
5710	kfree_skb_list(napi->gro_list);
5711	napi->gro_list = NULL;
5712	napi->gro_count = 0;
5713}
5714EXPORT_SYMBOL(netif_napi_del);
5715
5716static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5717{
5718	void *have;
5719	int work, weight;
5720
5721	list_del_init(&n->poll_list);
5722
5723	have = netpoll_poll_lock(n);
5724
5725	weight = n->weight;
5726
5727	/* This NAPI_STATE_SCHED test is for avoiding a race
5728	 * with netpoll's poll_napi().  Only the entity which
5729	 * obtains the lock and sees NAPI_STATE_SCHED set will
5730	 * actually make the ->poll() call.  Therefore we avoid
5731	 * accidentally calling ->poll() when NAPI is not scheduled.
5732	 */
5733	work = 0;
5734	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5735		work = n->poll(n, weight);
5736		trace_napi_poll(n, work, weight);
5737	}
5738
5739	WARN_ON_ONCE(work > weight);
5740
5741	if (likely(work < weight))
5742		goto out_unlock;
5743
5744	/* Drivers must not modify the NAPI state if they
5745	 * consume the entire weight.  In such cases this code
5746	 * still "owns" the NAPI instance and therefore can
5747	 * move the instance around on the list at-will.
5748	 */
5749	if (unlikely(napi_disable_pending(n))) {
5750		napi_complete(n);
5751		goto out_unlock;
5752	}
5753
5754	if (n->gro_list) {
5755		/* flush too old packets
5756		 * If HZ < 1000, flush all packets.
5757		 */
5758		napi_gro_flush(n, HZ >= 1000);
5759	}
5760
5761	/* Some drivers may have called napi_schedule
5762	 * prior to exhausting their budget.
5763	 */
5764	if (unlikely(!list_empty(&n->poll_list))) {
5765		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5766			     n->dev ? n->dev->name : "backlog");
5767		goto out_unlock;
5768	}
5769
5770	list_add_tail(&n->poll_list, repoll);
5771
5772out_unlock:
5773	netpoll_poll_unlock(have);
5774
5775	return work;
5776}
5777
5778static __latent_entropy void net_rx_action(struct softirq_action *h)
5779{
5780	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5781	unsigned long time_limit = jiffies +
5782		usecs_to_jiffies(netdev_budget_usecs);
5783	int budget = netdev_budget;
5784	LIST_HEAD(list);
5785	LIST_HEAD(repoll);
5786
5787	local_irq_disable();
5788	list_splice_init(&sd->poll_list, &list);
5789	local_irq_enable();
5790
5791	for (;;) {
5792		struct napi_struct *n;
5793
5794		if (list_empty(&list)) {
5795			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5796				goto out;
5797			break;
5798		}
5799
5800		n = list_first_entry(&list, struct napi_struct, poll_list);
5801		budget -= napi_poll(n, &repoll);
5802
5803		/* If softirq window is exhausted then punt.
5804		 * Allow this to run for 2 jiffies since which will allow
5805		 * an average latency of 1.5/HZ.
5806		 */
5807		if (unlikely(budget <= 0 ||
5808			     time_after_eq(jiffies, time_limit))) {
5809			sd->time_squeeze++;
5810			break;
5811		}
5812	}
5813
5814	local_irq_disable();
5815
5816	list_splice_tail_init(&sd->poll_list, &list);
5817	list_splice_tail(&repoll, &list);
5818	list_splice(&list, &sd->poll_list);
5819	if (!list_empty(&sd->poll_list))
5820		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5821
5822	net_rps_action_and_irq_enable(sd);
5823out:
5824	__kfree_skb_flush();
5825}
5826
5827struct netdev_adjacent {
5828	struct net_device *dev;
5829
5830	/* upper master flag, there can only be one master device per list */
5831	bool master;
5832
5833	/* counter for the number of times this device was added to us */
5834	u16 ref_nr;
5835
5836	/* private field for the users */
5837	void *private;
5838
5839	struct list_head list;
5840	struct rcu_head rcu;
5841};
5842
5843static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5844						 struct list_head *adj_list)
5845{
5846	struct netdev_adjacent *adj;
5847
5848	list_for_each_entry(adj, adj_list, list) {
5849		if (adj->dev == adj_dev)
5850			return adj;
5851	}
5852	return NULL;
5853}
5854
/* Walker callback: non-zero when @upper_dev is the device passed in @data. */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	return upper_dev == (struct net_device *)data;
}
5861
5862/**
5863 * netdev_has_upper_dev - Check if device is linked to an upper device
5864 * @dev: device
5865 * @upper_dev: upper device to check
5866 *
5867 * Find out if a device is linked to specified upper device and return true
5868 * in case it is. Note that this checks only immediate upper device,
5869 * not through a complete stack of devices. The caller must hold the RTNL lock.
5870 */
5871bool netdev_has_upper_dev(struct net_device *dev,
5872			  struct net_device *upper_dev)
5873{
5874	ASSERT_RTNL();
5875
5876	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5877					     upper_dev);
5878}
5879EXPORT_SYMBOL(netdev_has_upper_dev);
5880
5881/**
5882 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5883 * @dev: device
5884 * @upper_dev: upper device to check
5885 *
5886 * Find out if a device is linked to specified upper device and return true
5887 * in case it is. Note that this checks the entire upper device chain.
5888 * The caller must hold rcu lock.
5889 */
5890
5891bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5892				  struct net_device *upper_dev)
5893{
5894	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5895					       upper_dev);
5896}
5897EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5898
5899/**
5900 * netdev_has_any_upper_dev - Check if device is linked to some device
5901 * @dev: device
5902 *
5903 * Find out if a device is linked to an upper device and return true in case
5904 * it is. The caller must hold the RTNL lock.
5905 */
5906bool netdev_has_any_upper_dev(struct net_device *dev)
5907{
5908	ASSERT_RTNL();
5909
5910	return !list_empty(&dev->adj_list.upper);
5911}
5912EXPORT_SYMBOL(netdev_has_any_upper_dev);
5913
5914/**
5915 * netdev_master_upper_dev_get - Get master upper device
5916 * @dev: device
5917 *
5918 * Find a master upper device and return pointer to it or NULL in case
5919 * it's not there. The caller must hold the RTNL lock.
5920 */
5921struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5922{
5923	struct netdev_adjacent *upper;
5924
5925	ASSERT_RTNL();
5926
5927	if (list_empty(&dev->adj_list.upper))
5928		return NULL;
5929
5930	upper = list_first_entry(&dev->adj_list.upper,
5931				 struct netdev_adjacent, list);
5932	if (likely(upper->master))
5933		return upper->dev;
5934	return NULL;
5935}
5936EXPORT_SYMBOL(netdev_master_upper_dev_get);
5937
5938/**
5939 * netdev_has_any_lower_dev - Check if device is linked to some device
5940 * @dev: device
5941 *
5942 * Find out if a device is linked to a lower device and return true in case
5943 * it is. The caller must hold the RTNL lock.
5944 */
5945static bool netdev_has_any_lower_dev(struct net_device *dev)
5946{
5947	ASSERT_RTNL();
5948
5949	return !list_empty(&dev->adj_list.lower);
5950}
5951
5952void *netdev_adjacent_get_private(struct list_head *adj_list)
5953{
5954	struct netdev_adjacent *adj;
5955
5956	adj = list_entry(adj_list, struct netdev_adjacent, list);
5957
5958	return adj->private;
5959}
5960EXPORT_SYMBOL(netdev_adjacent_get_private);
5961
5962/**
5963 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5964 * @dev: device
5965 * @iter: list_head ** of the current position
5966 *
5967 * Gets the next device from the dev's upper list, starting from iter
5968 * position. The caller must hold RCU read lock.
5969 */
5970struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5971						 struct list_head **iter)
5972{
5973	struct netdev_adjacent *upper;
5974
5975	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5976
5977	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5978
5979	if (&upper->list == &dev->adj_list.upper)
5980		return NULL;
5981
5982	*iter = &upper->list;
5983
5984	return upper->dev;
5985}
5986EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5987
5988static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5989						    struct list_head **iter)
5990{
5991	struct netdev_adjacent *upper;
5992
5993	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5994
5995	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5996
5997	if (&upper->list == &dev->adj_list.upper)
5998		return NULL;
5999
6000	*iter = &upper->list;
6001
6002	return upper->dev;
6003}
6004
6005int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6006				  int (*fn)(struct net_device *dev,
6007					    void *data),
6008				  void *data)
6009{
6010	struct net_device *udev;
6011	struct list_head *iter;
6012	int ret;
6013
6014	for (iter = &dev->adj_list.upper,
6015	     udev = netdev_next_upper_dev_rcu(dev, &iter);
6016	     udev;
6017	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
6018		/* first is the upper device itself */
6019		ret = fn(udev, data);
6020		if (ret)
6021			return ret;
6022
6023		/* then look at all of its upper devices */
6024		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
6025		if (ret)
6026			return ret;
6027	}
6028
6029	return 0;
6030}
6031EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
6032
6033/**
6034 * netdev_lower_get_next_private - Get the next ->private from the
6035 *				   lower neighbour list
6036 * @dev: device
6037 * @iter: list_head ** of the current position
6038 *
6039 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6040 * list, starting from iter position. The caller must hold either hold the
6041 * RTNL lock or its own locking that guarantees that the neighbour lower
6042 * list will remain unchanged.
6043 */
6044void *netdev_lower_get_next_private(struct net_device *dev,
6045				    struct list_head **iter)
6046{
6047	struct netdev_adjacent *lower;
6048
6049	lower = list_entry(*iter, struct netdev_adjacent, list);
6050
6051	if (&lower->list == &dev->adj_list.lower)
6052		return NULL;
6053
6054	*iter = lower->list.next;
6055
6056	return lower->private;
6057}
6058EXPORT_SYMBOL(netdev_lower_get_next_private);
6059
6060/**
6061 * netdev_lower_get_next_private_rcu - Get the next ->private from the
6062 *				       lower neighbour list, RCU
6063 *				       variant
6064 * @dev: device
6065 * @iter: list_head ** of the current position
6066 *
6067 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6068 * list, starting from iter position. The caller must hold RCU read lock.
6069 */
6070void *netdev_lower_get_next_private_rcu(struct net_device *dev,
6071					struct list_head **iter)
6072{
6073	struct netdev_adjacent *lower;
6074
6075	WARN_ON_ONCE(!rcu_read_lock_held());
6076
6077	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6078
6079	if (&lower->list == &dev->adj_list.lower)
6080		return NULL;
6081
6082	*iter = &lower->list;
6083
6084	return lower->private;
6085}
6086EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
6087
6088/**
6089 * netdev_lower_get_next - Get the next device from the lower neighbour
6090 *                         list
6091 * @dev: device
6092 * @iter: list_head ** of the current position
6093 *
6094 * Gets the next netdev_adjacent from the dev's lower neighbour
6095 * list, starting from iter position. The caller must hold RTNL lock or
6096 * its own locking that guarantees that the neighbour lower
6097 * list will remain unchanged.
6098 */
6099void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
6100{
6101	struct netdev_adjacent *lower;
6102
6103	lower = list_entry(*iter, struct netdev_adjacent, list);
6104
6105	if (&lower->list == &dev->adj_list.lower)
6106		return NULL;
6107
6108	*iter = lower->list.next;
6109
6110	return lower->dev;
6111}
6112EXPORT_SYMBOL(netdev_lower_get_next);
6113
6114static struct net_device *netdev_next_lower_dev(struct net_device *dev,
6115						struct list_head **iter)
6116{
6117	struct netdev_adjacent *lower;
6118
6119	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
6120
6121	if (&lower->list == &dev->adj_list.lower)
6122		return NULL;
6123
6124	*iter = &lower->list;
6125
6126	return lower->dev;
6127}
6128
6129int netdev_walk_all_lower_dev(struct net_device *dev,
6130			      int (*fn)(struct net_device *dev,
6131					void *data),
6132			      void *data)
6133{
6134	struct net_device *ldev;
6135	struct list_head *iter;
6136	int ret;
6137
6138	for (iter = &dev->adj_list.lower,
6139	     ldev = netdev_next_lower_dev(dev, &iter);
6140	     ldev;
6141	     ldev = netdev_next_lower_dev(dev, &iter)) {
6142		/* first is the lower device itself */
6143		ret = fn(ldev, data);
6144		if (ret)
6145			return ret;
6146
6147		/* then look at all of its lower devices */
6148		ret = netdev_walk_all_lower_dev(ldev, fn, data);
6149		if (ret)
6150			return ret;
6151	}
6152
6153	return 0;
6154}
6155EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
6156
6157static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6158						    struct list_head **iter)
6159{
6160	struct netdev_adjacent *lower;
6161
6162	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6163	if (&lower->list == &dev->adj_list.lower)
6164		return NULL;
6165
6166	*iter = &lower->list;
6167
6168	return lower->dev;
6169}
6170
6171int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
6172				  int (*fn)(struct net_device *dev,
6173					    void *data),
6174				  void *data)
6175{
6176	struct net_device *ldev;
6177	struct list_head *iter;
6178	int ret;
6179
6180	for (iter = &dev->adj_list.lower,
6181	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
6182	     ldev;
6183	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
6184		/* first is the lower device itself */
6185		ret = fn(ldev, data);
6186		if (ret)
6187			return ret;
6188
6189		/* then look at all of its lower devices */
6190		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
6191		if (ret)
6192			return ret;
6193	}
6194
6195	return 0;
6196}
6197EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
6198
6199/**
6200 * netdev_lower_get_first_private_rcu - Get the first ->private from the
6201 *				       lower neighbour list, RCU
6202 *				       variant
6203 * @dev: device
6204 *
6205 * Gets the first netdev_adjacent->private from the dev's lower neighbour
6206 * list. The caller must hold RCU read lock.
6207 */
6208void *netdev_lower_get_first_private_rcu(struct net_device *dev)
6209{
6210	struct netdev_adjacent *lower;
6211
6212	lower = list_first_or_null_rcu(&dev->adj_list.lower,
6213			struct netdev_adjacent, list);
6214	if (lower)
6215		return lower->private;
6216	return NULL;
6217}
6218EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
6219
6220/**
6221 * netdev_master_upper_dev_get_rcu - Get master upper device
6222 * @dev: device
6223 *
6224 * Find a master upper device and return pointer to it or NULL in case
6225 * it's not there. The caller must hold the RCU read lock.
6226 */
6227struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
6228{
6229	struct netdev_adjacent *upper;
6230
6231	upper = list_first_or_null_rcu(&dev->adj_list.upper,
6232				       struct netdev_adjacent, list);
6233	if (upper && likely(upper->master))
6234		return upper->dev;
6235	return NULL;
6236}
6237EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
6238
6239static int netdev_adjacent_sysfs_add(struct net_device *dev,
6240			      struct net_device *adj_dev,
6241			      struct list_head *dev_list)
6242{
6243	char linkname[IFNAMSIZ+7];
6244
6245	sprintf(linkname, dev_list == &dev->adj_list.upper ?
6246		"upper_%s" : "lower_%s", adj_dev->name);
6247	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
6248				 linkname);
6249}
6250static void netdev_adjacent_sysfs_del(struct net_device *dev,
6251			       char *name,
6252			       struct list_head *dev_list)
6253{
6254	char linkname[IFNAMSIZ+7];
6255
6256	sprintf(linkname, dev_list == &dev->adj_list.upper ?
6257		"upper_%s" : "lower_%s", name);
6258	sysfs_remove_link(&(dev->dev.kobj), linkname);
6259}
6260
6261static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
6262						 struct net_device *adj_dev,
6263						 struct list_head *dev_list)
6264{
6265	return (dev_list == &dev->adj_list.upper ||
6266		dev_list == &dev->adj_list.lower) &&
6267		net_eq(dev_net(dev), dev_net(adj_dev));
6268}
6269
6270static int __netdev_adjacent_dev_insert(struct net_device *dev,
6271					struct net_device *adj_dev,
6272					struct list_head *dev_list,
6273					void *private, bool master)
6274{
6275	struct netdev_adjacent *adj;
6276	int ret;
6277
6278	adj = __netdev_find_adj(adj_dev, dev_list);
6279
6280	if (adj) {
6281		adj->ref_nr += 1;
6282		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
6283			 dev->name, adj_dev->name, adj->ref_nr);
6284
6285		return 0;
6286	}
6287
6288	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
6289	if (!adj)
6290		return -ENOMEM;
6291
6292	adj->dev = adj_dev;
6293	adj->master = master;
6294	adj->ref_nr = 1;
6295	adj->private = private;
6296	dev_hold(adj_dev);
6297
6298	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
6299		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
6300
6301	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
6302		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
6303		if (ret)
6304			goto free_adj;
6305	}
6306
6307	/* Ensure that master link is always the first item in list. */
6308	if (master) {
6309		ret = sysfs_create_link(&(dev->dev.kobj),
6310					&(adj_dev->dev.kobj), "master");
6311		if (ret)
6312			goto remove_symlinks;
6313
6314		list_add_rcu(&adj->list, dev_list);
6315	} else {
6316		list_add_tail_rcu(&adj->list, dev_list);
6317	}
6318
6319	return 0;
6320
6321remove_symlinks:
6322	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6323		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6324free_adj:
6325	kfree(adj);
6326	dev_put(adj_dev);
6327
6328	return ret;
6329}
6330
6331static void __netdev_adjacent_dev_remove(struct net_device *dev,
6332					 struct net_device *adj_dev,
6333					 u16 ref_nr,
6334					 struct list_head *dev_list)
6335{
6336	struct netdev_adjacent *adj;
6337
6338	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
6339		 dev->name, adj_dev->name, ref_nr);
6340
6341	adj = __netdev_find_adj(adj_dev, dev_list);
6342
6343	if (!adj) {
6344		pr_err("Adjacency does not exist for device %s from %s\n",
6345		       dev->name, adj_dev->name);
6346		WARN_ON(1);
6347		return;
6348	}
6349
6350	if (adj->ref_nr > ref_nr) {
6351		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
6352			 dev->name, adj_dev->name, ref_nr,
6353			 adj->ref_nr - ref_nr);
6354		adj->ref_nr -= ref_nr;
6355		return;
6356	}
6357
6358	if (adj->master)
6359		sysfs_remove_link(&(dev->dev.kobj), "master");
6360
6361	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6362		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6363
6364	list_del_rcu(&adj->list);
6365	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
6366		 adj_dev->name, dev->name, adj_dev->name);
6367	dev_put(adj_dev);
6368	kfree_rcu(adj, rcu);
6369}
6370
6371static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
6372					    struct net_device *upper_dev,
6373					    struct list_head *up_list,
6374					    struct list_head *down_list,
6375					    void *private, bool master)
6376{
6377	int ret;
6378
6379	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
6380					   private, master);
6381	if (ret)
6382		return ret;
6383
6384	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
6385					   private, false);
6386	if (ret) {
6387		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
6388		return ret;
6389	}
6390
6391	return 0;
6392}
6393
6394static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
6395					       struct net_device *upper_dev,
6396					       u16 ref_nr,
6397					       struct list_head *up_list,
6398					       struct list_head *down_list)
6399{
6400	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
6401	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
6402}
6403
6404static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
6405						struct net_device *upper_dev,
6406						void *private, bool master)
6407{
6408	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
6409						&dev->adj_list.upper,
6410						&upper_dev->adj_list.lower,
6411						private, master);
6412}
6413
6414static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
6415						   struct net_device *upper_dev)
6416{
6417	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
6418					   &dev->adj_list.upper,
6419					   &upper_dev->adj_list.lower);
6420}
6421
6422static int __netdev_upper_dev_link(struct net_device *dev,
6423				   struct net_device *upper_dev, bool master,
6424				   void *upper_priv, void *upper_info,
6425				   struct netlink_ext_ack *extack)
6426{
6427	struct netdev_notifier_changeupper_info changeupper_info = {
6428		.info = {
6429			.dev = dev,
6430			.extack = extack,
6431		},
6432		.upper_dev = upper_dev,
6433		.master = master,
6434		.linking = true,
6435		.upper_info = upper_info,
6436	};
6437	struct net_device *master_dev;
6438	int ret = 0;
6439
6440	ASSERT_RTNL();
6441
6442	if (dev == upper_dev)
6443		return -EBUSY;
6444
6445	/* To prevent loops, check if dev is not upper device to upper_dev. */
6446	if (netdev_has_upper_dev(upper_dev, dev))
6447		return -EBUSY;
6448
6449	if (!master) {
6450		if (netdev_has_upper_dev(dev, upper_dev))
6451			return -EEXIST;
6452	} else {
6453		master_dev = netdev_master_upper_dev_get(dev);
6454		if (master_dev)
6455			return master_dev == upper_dev ? -EEXIST : -EBUSY;
6456	}
6457
6458	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 
 
 
 
 
 
 
 
6459					    &changeupper_info.info);
6460	ret = notifier_to_errno(ret);
6461	if (ret)
6462		return ret;
6463
6464	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6465						   master);
6466	if (ret)
6467		return ret;
6468
6469	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
6470					    &changeupper_info.info);
6471	ret = notifier_to_errno(ret);
6472	if (ret)
6473		goto rollback;
6474
6475	return 0;
6476
6477rollback:
6478	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6479
6480	return ret;
6481}
6482
6483/**
6484 * netdev_upper_dev_link - Add a link to the upper device
6485 * @dev: device
6486 * @upper_dev: new upper device
6487 * @extack: netlink extended ack
6488 *
6489 * Adds a link to device which is upper to this one. The caller must hold
6490 * the RTNL lock. On a failure a negative errno code is returned.
6491 * On success the reference counts are adjusted and the function
6492 * returns zero.
6493 */
6494int netdev_upper_dev_link(struct net_device *dev,
6495			  struct net_device *upper_dev,
6496			  struct netlink_ext_ack *extack)
6497{
6498	return __netdev_upper_dev_link(dev, upper_dev, false,
6499				       NULL, NULL, extack);
6500}
6501EXPORT_SYMBOL(netdev_upper_dev_link);
6502
6503/**
6504 * netdev_master_upper_dev_link - Add a master link to the upper device
6505 * @dev: device
6506 * @upper_dev: new upper device
6507 * @upper_priv: upper device private
6508 * @upper_info: upper info to be passed down via notifier
6509 * @extack: netlink extended ack
6510 *
6511 * Adds a link to device which is upper to this one. In this case, only
6512 * one master upper device can be linked, although other non-master devices
6513 * might be linked as well. The caller must hold the RTNL lock.
6514 * On a failure a negative errno code is returned. On success the reference
6515 * counts are adjusted and the function returns zero.
6516 */
6517int netdev_master_upper_dev_link(struct net_device *dev,
6518				 struct net_device *upper_dev,
6519				 void *upper_priv, void *upper_info,
6520				 struct netlink_ext_ack *extack)
6521{
6522	return __netdev_upper_dev_link(dev, upper_dev, true,
6523				       upper_priv, upper_info, extack);
6524}
6525EXPORT_SYMBOL(netdev_master_upper_dev_link);
6526
6527/**
6528 * netdev_upper_dev_unlink - Removes a link to upper device
6529 * @dev: device
6530 * @upper_dev: new upper device
6531 *
6532 * Removes a link to device which is upper to this one. The caller must hold
6533 * the RTNL lock.
6534 */
6535void netdev_upper_dev_unlink(struct net_device *dev,
6536			     struct net_device *upper_dev)
6537{
6538	struct netdev_notifier_changeupper_info changeupper_info = {
6539		.info = {
6540			.dev = dev,
6541		},
6542		.upper_dev = upper_dev,
6543		.linking = false,
6544	};
6545
6546	ASSERT_RTNL();
6547
 
6548	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 
6549
6550	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
6551				      &changeupper_info.info);
6552
6553	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6554
6555	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
6556				      &changeupper_info.info);
6557}
6558EXPORT_SYMBOL(netdev_upper_dev_unlink);
6559
6560/**
6561 * netdev_bonding_info_change - Dispatch event about slave change
6562 * @dev: device
6563 * @bonding_info: info to dispatch
6564 *
6565 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6566 * The caller must hold the RTNL lock.
6567 */
6568void netdev_bonding_info_change(struct net_device *dev,
6569				struct netdev_bonding_info *bonding_info)
6570{
6571	struct netdev_notifier_bonding_info info = {
6572		.info.dev = dev,
6573	};
6574
6575	memcpy(&info.bonding_info, bonding_info,
6576	       sizeof(struct netdev_bonding_info));
6577	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
6578				      &info.info);
6579}
6580EXPORT_SYMBOL(netdev_bonding_info_change);
6581
6582static void netdev_adjacent_add_links(struct net_device *dev)
6583{
6584	struct netdev_adjacent *iter;
6585
6586	struct net *net = dev_net(dev);
6587
6588	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6589		if (!net_eq(net, dev_net(iter->dev)))
6590			continue;
6591		netdev_adjacent_sysfs_add(iter->dev, dev,
6592					  &iter->dev->adj_list.lower);
6593		netdev_adjacent_sysfs_add(dev, iter->dev,
6594					  &dev->adj_list.upper);
6595	}
6596
6597	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6598		if (!net_eq(net, dev_net(iter->dev)))
6599			continue;
6600		netdev_adjacent_sysfs_add(iter->dev, dev,
6601					  &iter->dev->adj_list.upper);
6602		netdev_adjacent_sysfs_add(dev, iter->dev,
6603					  &dev->adj_list.lower);
6604	}
6605}
6606
6607static void netdev_adjacent_del_links(struct net_device *dev)
6608{
6609	struct netdev_adjacent *iter;
6610
6611	struct net *net = dev_net(dev);
6612
6613	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6614		if (!net_eq(net, dev_net(iter->dev)))
6615			continue;
6616		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6617					  &iter->dev->adj_list.lower);
6618		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6619					  &dev->adj_list.upper);
6620	}
6621
6622	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6623		if (!net_eq(net, dev_net(iter->dev)))
6624			continue;
6625		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6626					  &iter->dev->adj_list.upper);
6627		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6628					  &dev->adj_list.lower);
6629	}
6630}
6631
6632void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6633{
6634	struct netdev_adjacent *iter;
6635
6636	struct net *net = dev_net(dev);
6637
6638	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6639		if (!net_eq(net, dev_net(iter->dev)))
6640			continue;
6641		netdev_adjacent_sysfs_del(iter->dev, oldname,
6642					  &iter->dev->adj_list.lower);
6643		netdev_adjacent_sysfs_add(iter->dev, dev,
6644					  &iter->dev->adj_list.lower);
6645	}
6646
6647	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6648		if (!net_eq(net, dev_net(iter->dev)))
6649			continue;
6650		netdev_adjacent_sysfs_del(iter->dev, oldname,
6651					  &iter->dev->adj_list.upper);
6652		netdev_adjacent_sysfs_add(iter->dev, dev,
6653					  &iter->dev->adj_list.upper);
6654	}
6655}
6656
6657void *netdev_lower_dev_get_private(struct net_device *dev,
6658				   struct net_device *lower_dev)
6659{
6660	struct netdev_adjacent *lower;
6661
6662	if (!lower_dev)
6663		return NULL;
6664	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6665	if (!lower)
6666		return NULL;
6667
6668	return lower->private;
6669}
6670EXPORT_SYMBOL(netdev_lower_dev_get_private);
6671
6672
6673int dev_get_nest_level(struct net_device *dev)
6674{
6675	struct net_device *lower = NULL;
6676	struct list_head *iter;
6677	int max_nest = -1;
6678	int nest;
6679
6680	ASSERT_RTNL();
6681
6682	netdev_for_each_lower_dev(dev, lower, iter) {
6683		nest = dev_get_nest_level(lower);
6684		if (max_nest < nest)
6685			max_nest = nest;
6686	}
6687
6688	return max_nest + 1;
6689}
6690EXPORT_SYMBOL(dev_get_nest_level);
6691
6692/**
6693 * netdev_lower_change - Dispatch event about lower device state change
6694 * @lower_dev: device
6695 * @lower_state_info: state to dispatch
6696 *
6697 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6698 * The caller must hold the RTNL lock.
6699 */
6700void netdev_lower_state_changed(struct net_device *lower_dev,
6701				void *lower_state_info)
6702{
6703	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
6704		.info.dev = lower_dev,
6705	};
6706
6707	ASSERT_RTNL();
6708	changelowerstate_info.lower_state_info = lower_state_info;
6709	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
6710				      &changelowerstate_info.info);
6711}
6712EXPORT_SYMBOL(netdev_lower_state_changed);
6713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6714static void dev_change_rx_flags(struct net_device *dev, int flags)
6715{
6716	const struct net_device_ops *ops = dev->netdev_ops;
6717
6718	if (ops->ndo_change_rx_flags)
6719		ops->ndo_change_rx_flags(dev, flags);
6720}
6721
6722static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6723{
6724	unsigned int old_flags = dev->flags;
6725	kuid_t uid;
6726	kgid_t gid;
6727
6728	ASSERT_RTNL();
6729
6730	dev->flags |= IFF_PROMISC;
6731	dev->promiscuity += inc;
6732	if (dev->promiscuity == 0) {
6733		/*
6734		 * Avoid overflow.
6735		 * If inc causes overflow, untouch promisc and return error.
6736		 */
6737		if (inc < 0)
6738			dev->flags &= ~IFF_PROMISC;
6739		else {
6740			dev->promiscuity -= inc;
6741			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6742				dev->name);
6743			return -EOVERFLOW;
6744		}
6745	}
6746	if (dev->flags != old_flags) {
6747		pr_info("device %s %s promiscuous mode\n",
6748			dev->name,
6749			dev->flags & IFF_PROMISC ? "entered" : "left");
6750		if (audit_enabled) {
6751			current_uid_gid(&uid, &gid);
6752			audit_log(current->audit_context, GFP_ATOMIC,
6753				AUDIT_ANOM_PROMISCUOUS,
6754				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6755				dev->name, (dev->flags & IFF_PROMISC),
6756				(old_flags & IFF_PROMISC),
6757				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6758				from_kuid(&init_user_ns, uid),
6759				from_kgid(&init_user_ns, gid),
6760				audit_get_sessionid(current));
6761		}
6762
6763		dev_change_rx_flags(dev, IFF_PROMISC);
6764	}
6765	if (notify)
6766		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6767	return 0;
6768}
6769
6770/**
6771 *	dev_set_promiscuity	- update promiscuity count on a device
6772 *	@dev: device
6773 *	@inc: modifier
6774 *
6775 *	Add or remove promiscuity from a device. While the count in the device
6776 *	remains above zero the interface remains promiscuous. Once it hits zero
6777 *	the device reverts back to normal filtering operation. A negative inc
6778 *	value is used to drop promiscuity on the device.
6779 *	Return 0 if successful or a negative errno code on error.
6780 */
6781int dev_set_promiscuity(struct net_device *dev, int inc)
6782{
6783	unsigned int old_flags = dev->flags;
6784	int err;
6785
6786	err = __dev_set_promiscuity(dev, inc, true);
6787	if (err < 0)
6788		return err;
6789	if (dev->flags != old_flags)
6790		dev_set_rx_mode(dev);
6791	return err;
6792}
6793EXPORT_SYMBOL(dev_set_promiscuity);
6794
6795static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6796{
6797	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6798
6799	ASSERT_RTNL();
6800
6801	dev->flags |= IFF_ALLMULTI;
6802	dev->allmulti += inc;
6803	if (dev->allmulti == 0) {
6804		/*
6805		 * Avoid overflow.
6806		 * If inc causes overflow, untouch allmulti and return error.
6807		 */
6808		if (inc < 0)
6809			dev->flags &= ~IFF_ALLMULTI;
6810		else {
6811			dev->allmulti -= inc;
6812			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6813				dev->name);
6814			return -EOVERFLOW;
6815		}
6816	}
6817	if (dev->flags ^ old_flags) {
6818		dev_change_rx_flags(dev, IFF_ALLMULTI);
6819		dev_set_rx_mode(dev);
6820		if (notify)
6821			__dev_notify_flags(dev, old_flags,
6822					   dev->gflags ^ old_gflags);
6823	}
6824	return 0;
6825}
6826
6827/**
6828 *	dev_set_allmulti	- update allmulti count on a device
6829 *	@dev: device
6830 *	@inc: modifier
6831 *
6832 *	Add or remove reception of all multicast frames to a device. While the
6833 *	count in the device remains above zero the interface remains listening
6834 *	to all interfaces. Once it hits zero the device reverts back to normal
6835 *	filtering operation. A negative @inc value is used to drop the counter
6836 *	when releasing a resource needing all multicasts.
6837 *	Return 0 if successful or a negative errno code on error.
6838 */
6839
6840int dev_set_allmulti(struct net_device *dev, int inc)
6841{
6842	return __dev_set_allmulti(dev, inc, true);
6843}
6844EXPORT_SYMBOL(dev_set_allmulti);
6845
6846/*
6847 *	Upload unicast and multicast address lists to device and
6848 *	configure RX filtering. When the device doesn't support unicast
6849 *	filtering it is put in promiscuous mode while unicast addresses
6850 *	are present.
6851 */
6852void __dev_set_rx_mode(struct net_device *dev)
6853{
6854	const struct net_device_ops *ops = dev->netdev_ops;
6855
6856	/* dev_open will call this function so the list will stay sane. */
6857	if (!(dev->flags&IFF_UP))
6858		return;
6859
6860	if (!netif_device_present(dev))
6861		return;
6862
6863	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6864		/* Unicast addresses changes may only happen under the rtnl,
6865		 * therefore calling __dev_set_promiscuity here is safe.
6866		 */
6867		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6868			__dev_set_promiscuity(dev, 1, false);
6869			dev->uc_promisc = true;
6870		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6871			__dev_set_promiscuity(dev, -1, false);
6872			dev->uc_promisc = false;
6873		}
6874	}
6875
6876	if (ops->ndo_set_rx_mode)
6877		ops->ndo_set_rx_mode(dev);
6878}
6879
/*
 * Locked variant of __dev_set_rx_mode(): the update runs between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6886
6887/**
6888 *	dev_get_flags - get flags reported to userspace
6889 *	@dev: device
6890 *
6891 *	Get the combination of flag bits exported through APIs to userspace.
6892 */
6893unsigned int dev_get_flags(const struct net_device *dev)
6894{
6895	unsigned int flags;
6896
6897	flags = (dev->flags & ~(IFF_PROMISC |
6898				IFF_ALLMULTI |
6899				IFF_RUNNING |
6900				IFF_LOWER_UP |
6901				IFF_DORMANT)) |
6902		(dev->gflags & (IFF_PROMISC |
6903				IFF_ALLMULTI));
6904
6905	if (netif_running(dev)) {
6906		if (netif_oper_up(dev))
6907			flags |= IFF_RUNNING;
6908		if (netif_carrier_ok(dev))
6909			flags |= IFF_LOWER_UP;
6910		if (netif_dormant(dev))
6911			flags |= IFF_DORMANT;
6912	}
6913
6914	return flags;
6915}
6916EXPORT_SYMBOL(dev_get_flags);
6917
6918int __dev_change_flags(struct net_device *dev, unsigned int flags)
6919{
6920	unsigned int old_flags = dev->flags;
6921	int ret;
6922
6923	ASSERT_RTNL();
6924
6925	/*
6926	 *	Set the flags on our device.
6927	 */
6928
6929	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6930			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6931			       IFF_AUTOMEDIA)) |
6932		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6933				    IFF_ALLMULTI));
6934
6935	/*
6936	 *	Load in the correct multicast list now the flags have changed.
6937	 */
6938
6939	if ((old_flags ^ flags) & IFF_MULTICAST)
6940		dev_change_rx_flags(dev, IFF_MULTICAST);
6941
6942	dev_set_rx_mode(dev);
6943
6944	/*
6945	 *	Have we downed the interface. We handle IFF_UP ourselves
6946	 *	according to user attempts to set it, rather than blindly
6947	 *	setting it.
6948	 */
6949
6950	ret = 0;
6951	if ((old_flags ^ flags) & IFF_UP) {
6952		if (old_flags & IFF_UP)
6953			__dev_close(dev);
6954		else
6955			ret = __dev_open(dev);
6956	}
6957
6958	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6959		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6960		unsigned int old_flags = dev->flags;
6961
6962		dev->gflags ^= IFF_PROMISC;
6963
6964		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6965			if (dev->flags != old_flags)
6966				dev_set_rx_mode(dev);
6967	}
6968
6969	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6970	 * is important. Some (broken) drivers set IFF_PROMISC, when
6971	 * IFF_ALLMULTI is requested not asking us and not reporting.
6972	 */
6973	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6974		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6975
6976		dev->gflags ^= IFF_ALLMULTI;
6977		__dev_set_allmulti(dev, inc, false);
6978	}
6979
6980	return ret;
6981}
6982
6983void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6984			unsigned int gchanges)
6985{
6986	unsigned int changes = dev->flags ^ old_flags;
6987
6988	if (gchanges)
6989		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6990
6991	if (changes & IFF_UP) {
6992		if (dev->flags & IFF_UP)
6993			call_netdevice_notifiers(NETDEV_UP, dev);
6994		else
6995			call_netdevice_notifiers(NETDEV_DOWN, dev);
6996	}
6997
6998	if (dev->flags & IFF_UP &&
6999	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
7000		struct netdev_notifier_change_info change_info = {
7001			.info = {
7002				.dev = dev,
7003			},
7004			.flags_changed = changes,
7005		};
7006
7007		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 
 
7008	}
7009}
7010
7011/**
7012 *	dev_change_flags - change device settings
7013 *	@dev: device
7014 *	@flags: device state flags
7015 *
7016 *	Change settings on device based state flags. The flags are
7017 *	in the userspace exported format.
7018 */
7019int dev_change_flags(struct net_device *dev, unsigned int flags)
7020{
7021	int ret;
7022	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
7023
7024	ret = __dev_change_flags(dev, flags);
7025	if (ret < 0)
7026		return ret;
7027
7028	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
7029	__dev_notify_flags(dev, old_flags, changes);
7030	return ret;
7031}
7032EXPORT_SYMBOL(dev_change_flags);
7033
7034int __dev_set_mtu(struct net_device *dev, int new_mtu)
7035{
7036	const struct net_device_ops *ops = dev->netdev_ops;
7037
7038	if (ops->ndo_change_mtu)
7039		return ops->ndo_change_mtu(dev, new_mtu);
7040
7041	dev->mtu = new_mtu;
7042	return 0;
7043}
7044EXPORT_SYMBOL(__dev_set_mtu);
7045
7046/**
7047 *	dev_set_mtu - Change maximum transfer unit
7048 *	@dev: device
7049 *	@new_mtu: new transfer unit
7050 *
7051 *	Change the maximum transfer size of the network device.
7052 */
7053int dev_set_mtu(struct net_device *dev, int new_mtu)
7054{
7055	int err, orig_mtu;
7056
7057	if (new_mtu == dev->mtu)
7058		return 0;
7059
7060	/* MTU must be positive, and in range */
7061	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
7062		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
7063				    dev->name, new_mtu, dev->min_mtu);
7064		return -EINVAL;
7065	}
7066
7067	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
7068		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
7069				    dev->name, new_mtu, dev->max_mtu);
7070		return -EINVAL;
7071	}
7072
7073	if (!netif_device_present(dev))
7074		return -ENODEV;
7075
7076	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
7077	err = notifier_to_errno(err);
7078	if (err)
7079		return err;
7080
7081	orig_mtu = dev->mtu;
7082	err = __dev_set_mtu(dev, new_mtu);
7083
7084	if (!err) {
7085		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
7086		err = notifier_to_errno(err);
7087		if (err) {
7088			/* setting mtu back and notifying everyone again,
7089			 * so that they have a chance to revert changes.
7090			 */
7091			__dev_set_mtu(dev, orig_mtu);
7092			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
7093		}
7094	}
7095	return err;
7096}
7097EXPORT_SYMBOL(dev_set_mtu);
7098
7099/**
7100 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
7101 *	@dev: device
7102 *	@new_len: new tx queue length
7103 */
7104int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
7105{
7106	unsigned int orig_len = dev->tx_queue_len;
7107	int res;
7108
7109	if (new_len != (unsigned int)new_len)
7110		return -ERANGE;
7111
7112	if (new_len != orig_len) {
7113		dev->tx_queue_len = new_len;
7114		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
7115		res = notifier_to_errno(res);
7116		if (res) {
7117			netdev_err(dev,
7118				   "refused to change device tx_queue_len\n");
7119			dev->tx_queue_len = orig_len;
7120			return res;
7121		}
7122		return dev_qdisc_change_tx_queue_len(dev);
7123	}
7124
7125	return 0;
7126}
7127
7128/**
7129 *	dev_set_group - Change group this device belongs to
7130 *	@dev: device
7131 *	@new_group: group this device should belong to
7132 */
7133void dev_set_group(struct net_device *dev, int new_group)
7134{
7135	dev->group = new_group;
7136}
7137EXPORT_SYMBOL(dev_set_group);
7138
7139/**
7140 *	dev_set_mac_address - Change Media Access Control Address
7141 *	@dev: device
7142 *	@sa: new address
7143 *
7144 *	Change the hardware (MAC) address of the device
7145 */
7146int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
7147{
7148	const struct net_device_ops *ops = dev->netdev_ops;
7149	int err;
7150
7151	if (!ops->ndo_set_mac_address)
7152		return -EOPNOTSUPP;
7153	if (sa->sa_family != dev->type)
7154		return -EINVAL;
7155	if (!netif_device_present(dev))
7156		return -ENODEV;
7157	err = ops->ndo_set_mac_address(dev, sa);
7158	if (err)
7159		return err;
7160	dev->addr_assign_type = NET_ADDR_SET;
7161	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7162	add_device_randomness(dev->dev_addr, dev->addr_len);
7163	return 0;
7164}
7165EXPORT_SYMBOL(dev_set_mac_address);
7166
7167/**
7168 *	dev_change_carrier - Change device carrier
7169 *	@dev: device
7170 *	@new_carrier: new value
7171 *
7172 *	Change device carrier
7173 */
7174int dev_change_carrier(struct net_device *dev, bool new_carrier)
7175{
7176	const struct net_device_ops *ops = dev->netdev_ops;
7177
7178	if (!ops->ndo_change_carrier)
7179		return -EOPNOTSUPP;
7180	if (!netif_device_present(dev))
7181		return -ENODEV;
7182	return ops->ndo_change_carrier(dev, new_carrier);
7183}
7184EXPORT_SYMBOL(dev_change_carrier);
7185
7186/**
7187 *	dev_get_phys_port_id - Get device physical port ID
7188 *	@dev: device
7189 *	@ppid: port ID
7190 *
7191 *	Get device physical port ID
7192 */
7193int dev_get_phys_port_id(struct net_device *dev,
7194			 struct netdev_phys_item_id *ppid)
7195{
7196	const struct net_device_ops *ops = dev->netdev_ops;
7197
7198	if (!ops->ndo_get_phys_port_id)
7199		return -EOPNOTSUPP;
7200	return ops->ndo_get_phys_port_id(dev, ppid);
7201}
7202EXPORT_SYMBOL(dev_get_phys_port_id);
7203
7204/**
7205 *	dev_get_phys_port_name - Get device physical port name
7206 *	@dev: device
7207 *	@name: port name
7208 *	@len: limit of bytes to copy to name
7209 *
7210 *	Get device physical port name
7211 */
7212int dev_get_phys_port_name(struct net_device *dev,
7213			   char *name, size_t len)
7214{
7215	const struct net_device_ops *ops = dev->netdev_ops;
7216
7217	if (!ops->ndo_get_phys_port_name)
7218		return -EOPNOTSUPP;
7219	return ops->ndo_get_phys_port_name(dev, name, len);
7220}
7221EXPORT_SYMBOL(dev_get_phys_port_name);
7222
7223/**
7224 *	dev_change_proto_down - update protocol port state information
7225 *	@dev: device
7226 *	@proto_down: new value
7227 *
7228 *	This info can be used by switch drivers to set the phys state of the
7229 *	port.
7230 */
7231int dev_change_proto_down(struct net_device *dev, bool proto_down)
7232{
7233	const struct net_device_ops *ops = dev->netdev_ops;
7234
7235	if (!ops->ndo_change_proto_down)
7236		return -EOPNOTSUPP;
7237	if (!netif_device_present(dev))
7238		return -ENODEV;
7239	return ops->ndo_change_proto_down(dev, proto_down);
7240}
7241EXPORT_SYMBOL(dev_change_proto_down);
7242
7243void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
7244		     struct netdev_bpf *xdp)
7245{
7246	memset(xdp, 0, sizeof(*xdp));
7247	xdp->command = XDP_QUERY_PROG;
7248
7249	/* Query must always succeed. */
7250	WARN_ON(bpf_op(dev, xdp) < 0);
7251}
7252
7253static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
7254{
7255	struct netdev_bpf xdp;
7256
7257	__dev_xdp_query(dev, bpf_op, &xdp);
7258
7259	return xdp.prog_attached;
7260}
7261
7262static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
7263			   struct netlink_ext_ack *extack, u32 flags,
7264			   struct bpf_prog *prog)
7265{
7266	struct netdev_bpf xdp;
7267
7268	memset(&xdp, 0, sizeof(xdp));
7269	if (flags & XDP_FLAGS_HW_MODE)
7270		xdp.command = XDP_SETUP_PROG_HW;
7271	else
7272		xdp.command = XDP_SETUP_PROG;
7273	xdp.extack = extack;
7274	xdp.flags = flags;
7275	xdp.prog = prog;
7276
7277	return bpf_op(dev, &xdp);
7278}
7279
7280static void dev_xdp_uninstall(struct net_device *dev)
7281{
7282	struct netdev_bpf xdp;
7283	bpf_op_t ndo_bpf;
7284
7285	/* Remove generic XDP */
7286	WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
7287
7288	/* Remove from the driver */
7289	ndo_bpf = dev->netdev_ops->ndo_bpf;
7290	if (!ndo_bpf)
7291		return;
7292
7293	__dev_xdp_query(dev, ndo_bpf, &xdp);
7294	if (xdp.prog_attached == XDP_ATTACHED_NONE)
7295		return;
7296
7297	/* Program removal should always succeed */
7298	WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
7299}
7300
7301/**
7302 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
7303 *	@dev: device
7304 *	@extack: netlink extended ack
7305 *	@fd: new program fd or negative value to clear
7306 *	@flags: xdp-related flags
7307 *
7308 *	Set or clear a bpf program for a device
7309 */
7310int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
7311		      int fd, u32 flags)
7312{
7313	const struct net_device_ops *ops = dev->netdev_ops;
7314	struct bpf_prog *prog = NULL;
7315	bpf_op_t bpf_op, bpf_chk;
7316	int err;
7317
7318	ASSERT_RTNL();
7319
7320	bpf_op = bpf_chk = ops->ndo_bpf;
7321	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
7322		return -EOPNOTSUPP;
7323	if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
7324		bpf_op = generic_xdp_install;
7325	if (bpf_op == bpf_chk)
7326		bpf_chk = generic_xdp_install;
7327
7328	if (fd >= 0) {
7329		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
7330			return -EEXIST;
7331		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
7332		    __dev_xdp_attached(dev, bpf_op))
7333			return -EBUSY;
 
 
 
 
 
7334
7335		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
7336					     bpf_op == ops->ndo_bpf);
7337		if (IS_ERR(prog))
7338			return PTR_ERR(prog);
7339
7340		if (!(flags & XDP_FLAGS_HW_MODE) &&
7341		    bpf_prog_is_dev_bound(prog->aux)) {
7342			NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
7343			bpf_prog_put(prog);
7344			return -EINVAL;
7345		}
7346	}
7347
7348	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
 
 
 
 
7349	if (err < 0 && prog)
7350		bpf_prog_put(prog);
7351
7352	return err;
7353}
 
7354
7355/**
7356 *	dev_new_index	-	allocate an ifindex
7357 *	@net: the applicable net namespace
7358 *
7359 *	Returns a suitable unique value for a new device interface
7360 *	number.  The caller must hold the rtnl semaphore or the
7361 *	dev_base_lock to be sure it remains unique.
7362 */
7363static int dev_new_index(struct net *net)
7364{
7365	int ifindex = net->ifindex;
7366
7367	for (;;) {
7368		if (++ifindex <= 0)
7369			ifindex = 1;
7370		if (!__dev_get_by_index(net, ifindex))
7371			return net->ifindex = ifindex;
7372	}
7373}
7374
7375/* Delayed registration/unregisteration */
7376static LIST_HEAD(net_todo_list);
7377DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
7378
7379static void net_set_todo(struct net_device *dev)
7380{
7381	list_add_tail(&dev->todo_list, &net_todo_list);
7382	dev_net(dev)->dev_unreg_count++;
7383}
7384
7385static void rollback_registered_many(struct list_head *head)
7386{
7387	struct net_device *dev, *tmp;
7388	LIST_HEAD(close_head);
7389
7390	BUG_ON(dev_boot_phase);
7391	ASSERT_RTNL();
7392
7393	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
7394		/* Some devices call without registering
7395		 * for initialization unwind. Remove those
7396		 * devices and proceed with the remaining.
7397		 */
7398		if (dev->reg_state == NETREG_UNINITIALIZED) {
7399			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
7400				 dev->name, dev);
7401
7402			WARN_ON(1);
7403			list_del(&dev->unreg_list);
7404			continue;
7405		}
7406		dev->dismantle = true;
7407		BUG_ON(dev->reg_state != NETREG_REGISTERED);
7408	}
7409
7410	/* If device is running, close it first. */
7411	list_for_each_entry(dev, head, unreg_list)
7412		list_add_tail(&dev->close_list, &close_head);
7413	dev_close_many(&close_head, true);
7414
7415	list_for_each_entry(dev, head, unreg_list) {
7416		/* And unlink it from device chain. */
7417		unlist_netdevice(dev);
7418
7419		dev->reg_state = NETREG_UNREGISTERING;
7420	}
7421	flush_all_backlogs();
7422
7423	synchronize_net();
7424
7425	list_for_each_entry(dev, head, unreg_list) {
7426		struct sk_buff *skb = NULL;
7427
7428		/* Shutdown queueing discipline. */
7429		dev_shutdown(dev);
7430
7431		dev_xdp_uninstall(dev);
7432
7433		/* Notify protocols, that we are about to destroy
7434		 * this device. They should clean all the things.
7435		 */
7436		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7437
7438		if (!dev->rtnl_link_ops ||
7439		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7440			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
7441						     GFP_KERNEL, NULL, 0);
7442
7443		/*
7444		 *	Flush the unicast and multicast chains
7445		 */
7446		dev_uc_flush(dev);
7447		dev_mc_flush(dev);
7448
7449		if (dev->netdev_ops->ndo_uninit)
7450			dev->netdev_ops->ndo_uninit(dev);
7451
7452		if (skb)
7453			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
7454
7455		/* Notifier chain MUST detach us all upper devices. */
7456		WARN_ON(netdev_has_any_upper_dev(dev));
7457		WARN_ON(netdev_has_any_lower_dev(dev));
7458
7459		/* Remove entries from kobject tree */
7460		netdev_unregister_kobject(dev);
7461#ifdef CONFIG_XPS
7462		/* Remove XPS queueing entries */
7463		netif_reset_xps_queues_gt(dev, 0);
7464#endif
7465	}
7466
7467	synchronize_net();
7468
7469	list_for_each_entry(dev, head, unreg_list)
7470		dev_put(dev);
7471}
7472
7473static void rollback_registered(struct net_device *dev)
7474{
7475	LIST_HEAD(single);
7476
7477	list_add(&dev->unreg_list, &single);
7478	rollback_registered_many(&single);
7479	list_del(&single);
7480}
7481
7482static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
7483	struct net_device *upper, netdev_features_t features)
7484{
7485	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
7486	netdev_features_t feature;
7487	int feature_bit;
7488
7489	for_each_netdev_feature(&upper_disables, feature_bit) {
7490		feature = __NETIF_F_BIT(feature_bit);
7491		if (!(upper->wanted_features & feature)
7492		    && (features & feature)) {
7493			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
7494				   &feature, upper->name);
7495			features &= ~feature;
7496		}
7497	}
7498
7499	return features;
7500}
7501
7502static void netdev_sync_lower_features(struct net_device *upper,
7503	struct net_device *lower, netdev_features_t features)
7504{
7505	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
7506	netdev_features_t feature;
7507	int feature_bit;
7508
7509	for_each_netdev_feature(&upper_disables, feature_bit) {
7510		feature = __NETIF_F_BIT(feature_bit);
7511		if (!(features & feature) && (lower->features & feature)) {
7512			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
7513				   &feature, lower->name);
7514			lower->wanted_features &= ~feature;
7515			netdev_update_features(lower);
7516
7517			if (unlikely(lower->features & feature))
7518				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
7519					    &feature, lower->name);
7520		}
7521	}
7522}
7523
7524static netdev_features_t netdev_fix_features(struct net_device *dev,
7525	netdev_features_t features)
7526{
7527	/* Fix illegal checksum combinations */
7528	if ((features & NETIF_F_HW_CSUM) &&
7529	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7530		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7531		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7532	}
7533
7534	/* TSO requires that SG is present as well. */
7535	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7536		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7537		features &= ~NETIF_F_ALL_TSO;
7538	}
7539
7540	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7541					!(features & NETIF_F_IP_CSUM)) {
7542		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7543		features &= ~NETIF_F_TSO;
7544		features &= ~NETIF_F_TSO_ECN;
7545	}
7546
7547	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7548					 !(features & NETIF_F_IPV6_CSUM)) {
7549		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7550		features &= ~NETIF_F_TSO6;
7551	}
7552
7553	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7554	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7555		features &= ~NETIF_F_TSO_MANGLEID;
7556
7557	/* TSO ECN requires that TSO is present as well. */
7558	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7559		features &= ~NETIF_F_TSO_ECN;
7560
7561	/* Software GSO depends on SG. */
7562	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7563		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7564		features &= ~NETIF_F_GSO;
7565	}
7566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7567	/* GSO partial features require GSO partial be set */
7568	if ((features & dev->gso_partial_features) &&
7569	    !(features & NETIF_F_GSO_PARTIAL)) {
7570		netdev_dbg(dev,
7571			   "Dropping partially supported GSO features since no GSO partial.\n");
7572		features &= ~dev->gso_partial_features;
7573	}
7574
7575	if (!(features & NETIF_F_RXCSUM)) {
7576		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
7577		 * successfully merged by hardware must also have the
7578		 * checksum verified by hardware.  If the user does not
7579		 * want to enable RXCSUM, logically, we should disable GRO_HW.
7580		 */
7581		if (features & NETIF_F_GRO_HW) {
7582			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
7583			features &= ~NETIF_F_GRO_HW;
7584		}
7585	}
7586
7587	/* LRO/HW-GRO features cannot be combined with RX-FCS */
7588	if (features & NETIF_F_RXFCS) {
7589		if (features & NETIF_F_LRO) {
7590			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
7591			features &= ~NETIF_F_LRO;
7592		}
7593
7594		if (features & NETIF_F_GRO_HW) {
7595			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
7596			features &= ~NETIF_F_GRO_HW;
7597		}
7598	}
7599
7600	return features;
7601}
7602
7603int __netdev_update_features(struct net_device *dev)
7604{
7605	struct net_device *upper, *lower;
7606	netdev_features_t features;
7607	struct list_head *iter;
7608	int err = -1;
7609
7610	ASSERT_RTNL();
7611
7612	features = netdev_get_wanted_features(dev);
7613
7614	if (dev->netdev_ops->ndo_fix_features)
7615		features = dev->netdev_ops->ndo_fix_features(dev, features);
7616
7617	/* driver might be less strict about feature dependencies */
7618	features = netdev_fix_features(dev, features);
7619
	/* some features can't be enabled if they're off on an upper device */
7621	netdev_for_each_upper_dev_rcu(dev, upper, iter)
7622		features = netdev_sync_upper_features(dev, upper, features);
7623
7624	if (dev->features == features)
7625		goto sync_lower;
7626
7627	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7628		&dev->features, &features);
7629
7630	if (dev->netdev_ops->ndo_set_features)
7631		err = dev->netdev_ops->ndo_set_features(dev, features);
7632	else
7633		err = 0;
7634
7635	if (unlikely(err < 0)) {
7636		netdev_err(dev,
7637			"set_features() failed (%d); wanted %pNF, left %pNF\n",
7638			err, &features, &dev->features);
7639		/* return non-0 since some features might have changed and
7640		 * it's better to fire a spurious notification than miss it
7641		 */
7642		return -1;
7643	}
7644
7645sync_lower:
7646	/* some features must be disabled on lower devices when disabled
7647	 * on an upper device (think: bonding master or bridge)
7648	 */
7649	netdev_for_each_lower_dev(dev, lower, iter)
7650		netdev_sync_lower_features(dev, lower, features);
7651
7652	if (!err) {
7653		netdev_features_t diff = features ^ dev->features;
7654
7655		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
7656			/* udp_tunnel_{get,drop}_rx_info both need
7657			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
7658			 * device, or they won't do anything.
7659			 * Thus we need to update dev->features
7660			 * *before* calling udp_tunnel_get_rx_info,
7661			 * but *after* calling udp_tunnel_drop_rx_info.
7662			 */
7663			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
7664				dev->features = features;
7665				udp_tunnel_get_rx_info(dev);
7666			} else {
7667				udp_tunnel_drop_rx_info(dev);
7668			}
7669		}
7670
7671		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
7672			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
7673				dev->features = features;
7674				err |= vlan_get_rx_ctag_filter_info(dev);
7675			} else {
7676				vlan_drop_rx_ctag_filter_info(dev);
7677			}
7678		}
7679
7680		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
7681			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
7682				dev->features = features;
7683				err |= vlan_get_rx_stag_filter_info(dev);
7684			} else {
7685				vlan_drop_rx_stag_filter_info(dev);
7686			}
7687		}
7688
7689		dev->features = features;
7690	}
7691
7692	return err < 0 ? 0 : 1;
7693}
7694
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate dev->features and emit a feature-change notification
 *	only when __netdev_update_features() reports a non-zero result.
 *	Call this after driver or hardware dependent conditions that
 *	influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7709
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate dev->features and always emit a feature-change
 *	notification, whether or not the set changed.  Use this instead
 *	of netdev_update_features() when dev->vlan_features might also
 *	have changed, so the change is propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we notify unconditionally. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7726
7727/**
7728 *	netif_stacked_transfer_operstate -	transfer operstate
7729 *	@rootdev: the root or lower level device to transfer state from
7730 *	@dev: the device to transfer operstate to
7731 *
7732 *	Transfer operational state from root to device. This is normally
7733 *	called when a stacking relationship exists between the root
7734 *	device and the device(a leaf device).
7735 */
7736void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7737					struct net_device *dev)
7738{
7739	if (rootdev->operstate == IF_OPER_DORMANT)
7740		netif_dormant_on(dev);
7741	else
7742		netif_dormant_off(dev);
7743
7744	if (netif_carrier_ok(rootdev))
7745		netif_carrier_on(dev);
7746	else
7747		netif_carrier_off(dev);
 
 
 
7748}
7749EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7750
 
7751static int netif_alloc_rx_queues(struct net_device *dev)
7752{
7753	unsigned int i, count = dev->num_rx_queues;
7754	struct netdev_rx_queue *rx;
7755	size_t sz = count * sizeof(*rx);
7756	int err = 0;
7757
7758	BUG_ON(count < 1);
7759
7760	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
7761	if (!rx)
7762		return -ENOMEM;
7763
 
 
7764	dev->_rx = rx;
7765
7766	for (i = 0; i < count; i++) {
7767		rx[i].dev = dev;
7768
7769		/* XDP RX-queue setup */
7770		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
7771		if (err < 0)
7772			goto err_rxq_info;
7773	}
7774	return 0;
7775
7776err_rxq_info:
7777	/* Rollback successful reg's and free other resources */
7778	while (i--)
7779		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
7780	kvfree(dev->_rx);
7781	dev->_rx = NULL;
7782	return err;
7783}
7784
7785static void netif_free_rx_queues(struct net_device *dev)
7786{
7787	unsigned int i, count = dev->num_rx_queues;
7788
7789	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
7790	if (!dev->_rx)
7791		return;
7792
7793	for (i = 0; i < count; i++)
7794		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
7795
7796	kvfree(dev->_rx);
7797}
 
7798
7799static void netdev_init_one_queue(struct net_device *dev,
7800				  struct netdev_queue *queue, void *_unused)
7801{
7802	/* Initialize queue lock */
7803	spin_lock_init(&queue->_xmit_lock);
7804	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7805	queue->xmit_lock_owner = -1;
7806	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7807	queue->dev = dev;
7808#ifdef CONFIG_BQL
7809	dql_init(&queue->dql, HZ);
7810#endif
7811}
7812
7813static void netif_free_tx_queues(struct net_device *dev)
7814{
7815	kvfree(dev->_tx);
7816}
7817
7818static int netif_alloc_netdev_queues(struct net_device *dev)
7819{
7820	unsigned int count = dev->num_tx_queues;
7821	struct netdev_queue *tx;
7822	size_t sz = count * sizeof(*tx);
7823
7824	if (count < 1 || count > 0xffff)
7825		return -EINVAL;
7826
7827	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
7828	if (!tx)
7829		return -ENOMEM;
7830
 
 
7831	dev->_tx = tx;
7832
7833	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7834	spin_lock_init(&dev->tx_global_lock);
7835
7836	return 0;
7837}
7838
7839void netif_tx_stop_all_queues(struct net_device *dev)
7840{
7841	unsigned int i;
7842
7843	for (i = 0; i < dev->num_tx_queues; i++) {
7844		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7845
7846		netif_tx_stop_queue(txq);
7847	}
7848}
7849EXPORT_SYMBOL(netif_tx_stop_all_queues);
7850
7851/**
7852 *	register_netdevice	- register a network device
7853 *	@dev: device to register
7854 *
7855 *	Take a completed network device structure and add it to the kernel
7856 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7857 *	chain. 0 is returned on success. A negative errno code is returned
7858 *	on a failure to set up the device, or if the name is a duplicate.
7859 *
7860 *	Callers must hold the rtnl semaphore. You may want
7861 *	register_netdev() instead of this.
7862 *
7863 *	BUGS:
7864 *	The locking appears insufficient to guarantee two parallel registers
7865 *	will not get the same name.
7866 */
7867
7868int register_netdevice(struct net_device *dev)
7869{
7870	int ret;
7871	struct net *net = dev_net(dev);
7872
7873	BUG_ON(dev_boot_phase);
7874	ASSERT_RTNL();
7875
7876	might_sleep();
7877
7878	/* When net_device's are persistent, this will be fatal. */
7879	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7880	BUG_ON(!net);
7881
7882	spin_lock_init(&dev->addr_list_lock);
7883	netdev_set_addr_lockdep_class(dev);
7884
7885	ret = dev_get_valid_name(net, dev, dev->name);
7886	if (ret < 0)
7887		goto out;
7888
7889	/* Init, if this function is available */
7890	if (dev->netdev_ops->ndo_init) {
7891		ret = dev->netdev_ops->ndo_init(dev);
7892		if (ret) {
7893			if (ret > 0)
7894				ret = -EIO;
7895			goto out;
7896		}
7897	}
7898
7899	if (((dev->hw_features | dev->features) &
7900	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7901	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7902	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7903		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7904		ret = -EINVAL;
7905		goto err_uninit;
7906	}
7907
7908	ret = -EBUSY;
7909	if (!dev->ifindex)
7910		dev->ifindex = dev_new_index(net);
7911	else if (__dev_get_by_index(net, dev->ifindex))
7912		goto err_uninit;
7913
7914	/* Transfer changeable features to wanted_features and enable
7915	 * software offloads (GSO and GRO).
7916	 */
7917	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7918	dev->features |= NETIF_F_SOFT_FEATURES;
7919
7920	if (dev->netdev_ops->ndo_udp_tunnel_add) {
7921		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
7922		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
7923	}
7924
7925	dev->wanted_features = dev->features & dev->hw_features;
7926
7927	if (!(dev->flags & IFF_LOOPBACK))
7928		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7929
7930	/* If IPv4 TCP segmentation offload is supported we should also
7931	 * allow the device to enable segmenting the frame with the option
7932	 * of ignoring a static IP ID value.  This doesn't enable the
7933	 * feature itself but allows the user to enable it later.
7934	 */
7935	if (dev->hw_features & NETIF_F_TSO)
7936		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7937	if (dev->vlan_features & NETIF_F_TSO)
7938		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7939	if (dev->mpls_features & NETIF_F_TSO)
7940		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7941	if (dev->hw_enc_features & NETIF_F_TSO)
7942		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7943
7944	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7945	 */
7946	dev->vlan_features |= NETIF_F_HIGHDMA;
7947
7948	/* Make NETIF_F_SG inheritable to tunnel devices.
7949	 */
7950	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7951
7952	/* Make NETIF_F_SG inheritable to MPLS.
7953	 */
7954	dev->mpls_features |= NETIF_F_SG;
7955
7956	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7957	ret = notifier_to_errno(ret);
7958	if (ret)
7959		goto err_uninit;
7960
7961	ret = netdev_register_kobject(dev);
7962	if (ret)
7963		goto err_uninit;
7964	dev->reg_state = NETREG_REGISTERED;
7965
7966	__netdev_update_features(dev);
7967
7968	/*
7969	 *	Default initial state at registry is that the
7970	 *	device is present.
7971	 */
7972
7973	set_bit(__LINK_STATE_PRESENT, &dev->state);
7974
7975	linkwatch_init_dev(dev);
7976
7977	dev_init_scheduler(dev);
7978	dev_hold(dev);
7979	list_netdevice(dev);
7980	add_device_randomness(dev->dev_addr, dev->addr_len);
7981
7982	/* If the device has permanent device address, driver should
7983	 * set dev_addr and also addr_assign_type should be set to
7984	 * NET_ADDR_PERM (default value).
7985	 */
7986	if (dev->addr_assign_type == NET_ADDR_PERM)
7987		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7988
7989	/* Notify protocols, that a new device appeared. */
7990	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7991	ret = notifier_to_errno(ret);
7992	if (ret) {
7993		rollback_registered(dev);
7994		dev->reg_state = NETREG_UNREGISTERED;
7995	}
7996	/*
7997	 *	Prevent userspace races by waiting until the network
7998	 *	device is fully setup before sending notifications.
7999	 */
8000	if (!dev->rtnl_link_ops ||
8001	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8002		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8003
8004out:
8005	return ret;
8006
8007err_uninit:
8008	if (dev->netdev_ops->ndo_uninit)
8009		dev->netdev_ops->ndo_uninit(dev);
8010	if (dev->priv_destructor)
8011		dev->priv_destructor(dev);
8012	goto out;
8013}
8014EXPORT_SYMBOL(register_netdevice);
8015
8016/**
8017 *	init_dummy_netdev	- init a dummy network device for NAPI
8018 *	@dev: device to init
8019 *
8020 *	This takes a network device structure and initialize the minimum
8021 *	amount of fields so it can be used to schedule NAPI polls without
8022 *	registering a full blown interface. This is to be used by drivers
8023 *	that need to tie several hardware interfaces to a single NAPI
8024 *	poll scheduler due to HW limitations.
8025 */
8026int init_dummy_netdev(struct net_device *dev)
8027{
8028	/* Clear everything. Note we don't initialize spinlocks
8029	 * are they aren't supposed to be taken by any of the
8030	 * NAPI code and this dummy netdev is supposed to be
8031	 * only ever used for NAPI polls
8032	 */
8033	memset(dev, 0, sizeof(struct net_device));
8034
8035	/* make sure we BUG if trying to hit standard
8036	 * register/unregister code path
8037	 */
8038	dev->reg_state = NETREG_DUMMY;
8039
8040	/* NAPI wants this */
8041	INIT_LIST_HEAD(&dev->napi_list);
8042
8043	/* a dummy interface is started by default */
8044	set_bit(__LINK_STATE_PRESENT, &dev->state);
8045	set_bit(__LINK_STATE_START, &dev->state);
8046
8047	/* Note : We dont allocate pcpu_refcnt for dummy devices,
8048	 * because users of this 'device' dont need to change
8049	 * its refcount.
8050	 */
8051
8052	return 0;
8053}
8054EXPORT_SYMBOL_GPL(init_dummy_netdev);
8055
8056
8057/**
8058 *	register_netdev	- register a network device
8059 *	@dev: device to register
8060 *
8061 *	Take a completed network device structure and add it to the kernel
8062 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8063 *	chain. 0 is returned on success. A negative errno code is returned
8064 *	on a failure to set up the device, or if the name is a duplicate.
8065 *
8066 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
8067 *	and expands the device name if you passed a format string to
8068 *	alloc_netdev.
8069 */
8070int register_netdev(struct net_device *dev)
8071{
8072	int err;
8073
8074	if (rtnl_lock_killable())
8075		return -EINTR;
8076	err = register_netdevice(dev);
8077	rtnl_unlock();
8078	return err;
8079}
8080EXPORT_SYMBOL(register_netdev);
8081
8082int netdev_refcnt_read(const struct net_device *dev)
8083{
8084	int i, refcnt = 0;
8085
8086	for_each_possible_cpu(i)
8087		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
8088	return refcnt;
8089}
8090EXPORT_SYMBOL(netdev_refcnt_read);
8091
8092/**
8093 * netdev_wait_allrefs - wait until all references are gone.
8094 * @dev: target net_device
8095 *
8096 * This is called when unregistering network devices.
8097 *
8098 * Any protocol or device that holds a reference should register
8099 * for netdevice notification, and cleanup and put back the
8100 * reference if they receive an UNREGISTER event.
8101 * We can get stuck here if buggy protocols don't correctly
8102 * call dev_put.
8103 */
8104static void netdev_wait_allrefs(struct net_device *dev)
8105{
8106	unsigned long rebroadcast_time, warning_time;
8107	int refcnt;
8108
8109	linkwatch_forget_dev(dev);
8110
8111	rebroadcast_time = warning_time = jiffies;
8112	refcnt = netdev_refcnt_read(dev);
8113
8114	while (refcnt != 0) {
8115		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
8116			rtnl_lock();
8117
8118			/* Rebroadcast unregister notification */
8119			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8120
8121			__rtnl_unlock();
8122			rcu_barrier();
8123			rtnl_lock();
8124
 
8125			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
8126				     &dev->state)) {
8127				/* We must not have linkwatch events
8128				 * pending on unregister. If this
8129				 * happens, we simply run the queue
8130				 * unscheduled, resulting in a noop
8131				 * for this device.
8132				 */
8133				linkwatch_run_queue();
8134			}
8135
8136			__rtnl_unlock();
8137
8138			rebroadcast_time = jiffies;
8139		}
8140
8141		msleep(250);
8142
8143		refcnt = netdev_refcnt_read(dev);
8144
8145		if (time_after(jiffies, warning_time + 10 * HZ)) {
8146			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
8147				 dev->name, refcnt);
8148			warning_time = jiffies;
8149		}
8150	}
8151}
8152
8153/* The sequence is:
8154 *
8155 *	rtnl_lock();
8156 *	...
8157 *	register_netdevice(x1);
8158 *	register_netdevice(x2);
8159 *	...
8160 *	unregister_netdevice(y1);
8161 *	unregister_netdevice(y2);
8162 *      ...
8163 *	rtnl_unlock();
8164 *	free_netdev(y1);
8165 *	free_netdev(y2);
8166 *
8167 * We are invoked by rtnl_unlock().
8168 * This allows us to deal with problems:
8169 * 1) We can delete sysfs objects which invoke hotplug
8170 *    without deadlocking with linkwatch via keventd.
8171 * 2) Since we run with the RTNL semaphore not held, we can sleep
8172 *    safely in order to wait for the netdev refcnt to drop to zero.
8173 *
8174 * We must not return until all unregister events added during
8175 * the interval the lock was held have been completed.
8176 */
8177void netdev_run_todo(void)
8178{
8179	struct list_head list;
8180
8181	/* Snapshot list, allow later requests */
8182	list_replace_init(&net_todo_list, &list);
8183
8184	__rtnl_unlock();
8185
8186
8187	/* Wait for rcu callbacks to finish before next phase */
8188	if (!list_empty(&list))
8189		rcu_barrier();
8190
8191	while (!list_empty(&list)) {
8192		struct net_device *dev
8193			= list_first_entry(&list, struct net_device, todo_list);
8194		list_del(&dev->todo_list);
8195
 
 
 
 
8196		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
8197			pr_err("network todo '%s' but state %d\n",
8198			       dev->name, dev->reg_state);
8199			dump_stack();
8200			continue;
8201		}
8202
8203		dev->reg_state = NETREG_UNREGISTERED;
8204
8205		netdev_wait_allrefs(dev);
8206
8207		/* paranoia */
8208		BUG_ON(netdev_refcnt_read(dev));
8209		BUG_ON(!list_empty(&dev->ptype_all));
8210		BUG_ON(!list_empty(&dev->ptype_specific));
8211		WARN_ON(rcu_access_pointer(dev->ip_ptr));
8212		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
8213#if IS_ENABLED(CONFIG_DECNET)
8214		WARN_ON(dev->dn_ptr);
8215#endif
8216		if (dev->priv_destructor)
8217			dev->priv_destructor(dev);
8218		if (dev->needs_free_netdev)
8219			free_netdev(dev);
8220
8221		/* Report a network device has been unregistered */
8222		rtnl_lock();
8223		dev_net(dev)->dev_unreg_count--;
8224		__rtnl_unlock();
8225		wake_up(&netdev_unregistering_wq);
8226
8227		/* Free network device */
8228		kobject_put(&dev->dev.kobj);
8229	}
8230}
8231
8232/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
8233 * all the same fields in the same order as net_device_stats, with only
8234 * the type differing, but rtnl_link_stats64 may have additional fields
8235 * at the end for newer counters.
8236 */
8237void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
8238			     const struct net_device_stats *netdev_stats)
8239{
8240#if BITS_PER_LONG == 64
8241	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
8242	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
8243	/* zero out counters that only exist in rtnl_link_stats64 */
8244	memset((char *)stats64 + sizeof(*netdev_stats), 0,
8245	       sizeof(*stats64) - sizeof(*netdev_stats));
8246#else
8247	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
8248	const unsigned long *src = (const unsigned long *)netdev_stats;
8249	u64 *dst = (u64 *)stats64;
8250
8251	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
8252	for (i = 0; i < n; i++)
8253		dst[i] = src[i];
8254	/* zero out counters that only exist in rtnl_link_stats64 */
8255	memset((char *)stats64 + n * sizeof(u64), 0,
8256	       sizeof(*stats64) - n * sizeof(u64));
8257#endif
8258}
8259EXPORT_SYMBOL(netdev_stats_to_stats64);
8260
8261/**
8262 *	dev_get_stats	- get network device statistics
8263 *	@dev: device to get statistics from
8264 *	@storage: place to store stats
8265 *
8266 *	Get network statistics from device. Return @storage.
8267 *	The device driver may provide its own method by setting
8268 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
8269 *	otherwise the internal statistics structure is used.
8270 */
8271struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
8272					struct rtnl_link_stats64 *storage)
8273{
8274	const struct net_device_ops *ops = dev->netdev_ops;
8275
8276	if (ops->ndo_get_stats64) {
8277		memset(storage, 0, sizeof(*storage));
8278		ops->ndo_get_stats64(dev, storage);
8279	} else if (ops->ndo_get_stats) {
8280		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
8281	} else {
8282		netdev_stats_to_stats64(storage, &dev->stats);
8283	}
8284	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
8285	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
8286	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
8287	return storage;
8288}
8289EXPORT_SYMBOL(dev_get_stats);
8290
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.  Without
 * CONFIG_NET_CLS_ACT the current value (possibly NULL) is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised. */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
8308
8309static const struct ethtool_ops default_ethtool_ops;
8310
8311void netdev_set_default_ethtool_ops(struct net_device *dev,
8312				    const struct ethtool_ops *ops)
8313{
8314	if (dev->ethtool_ops == &default_ethtool_ops)
8315		dev->ethtool_ops = ops;
8316}
8317EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
8318
8319void netdev_freemem(struct net_device *dev)
8320{
8321	char *addr = (char *)dev - dev->padded;
8322
8323	kvfree(addr);
8324}
8325
8326/**
8327 * alloc_netdev_mqs - allocate network device
8328 * @sizeof_priv: size of private data to allocate space for
8329 * @name: device name format string
8330 * @name_assign_type: origin of device name
8331 * @setup: callback to initialize device
8332 * @txqs: the number of TX subqueues to allocate
8333 * @rxqs: the number of RX subqueues to allocate
8334 *
8335 * Allocates a struct net_device with private data area for driver use
8336 * and performs basic initialization.  Also allocates subqueue structs
8337 * for each queue on the device.
8338 */
8339struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
8340		unsigned char name_assign_type,
8341		void (*setup)(struct net_device *),
8342		unsigned int txqs, unsigned int rxqs)
8343{
8344	struct net_device *dev;
8345	unsigned int alloc_size;
8346	struct net_device *p;
8347
8348	BUG_ON(strlen(name) >= sizeof(dev->name));
8349
8350	if (txqs < 1) {
8351		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
8352		return NULL;
8353	}
8354
 
8355	if (rxqs < 1) {
8356		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
8357		return NULL;
8358	}
 
8359
8360	alloc_size = sizeof(struct net_device);
8361	if (sizeof_priv) {
8362		/* ensure 32-byte alignment of private area */
8363		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
8364		alloc_size += sizeof_priv;
8365	}
8366	/* ensure 32-byte alignment of whole construct */
8367	alloc_size += NETDEV_ALIGN - 1;
8368
8369	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 
 
8370	if (!p)
8371		return NULL;
8372
8373	dev = PTR_ALIGN(p, NETDEV_ALIGN);
8374	dev->padded = (char *)dev - (char *)p;
8375
8376	dev->pcpu_refcnt = alloc_percpu(int);
8377	if (!dev->pcpu_refcnt)
8378		goto free_dev;
8379
8380	if (dev_addr_init(dev))
8381		goto free_pcpu;
8382
8383	dev_mc_init(dev);
8384	dev_uc_init(dev);
8385
8386	dev_net_set(dev, &init_net);
8387
8388	dev->gso_max_size = GSO_MAX_SIZE;
8389	dev->gso_max_segs = GSO_MAX_SEGS;
8390
8391	INIT_LIST_HEAD(&dev->napi_list);
8392	INIT_LIST_HEAD(&dev->unreg_list);
8393	INIT_LIST_HEAD(&dev->close_list);
8394	INIT_LIST_HEAD(&dev->link_watch_list);
8395	INIT_LIST_HEAD(&dev->adj_list.upper);
8396	INIT_LIST_HEAD(&dev->adj_list.lower);
8397	INIT_LIST_HEAD(&dev->ptype_all);
8398	INIT_LIST_HEAD(&dev->ptype_specific);
8399#ifdef CONFIG_NET_SCHED
8400	hash_init(dev->qdisc_hash);
8401#endif
8402	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8403	setup(dev);
8404
8405	if (!dev->tx_queue_len) {
8406		dev->priv_flags |= IFF_NO_QUEUE;
8407		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
8408	}
8409
8410	dev->num_tx_queues = txqs;
8411	dev->real_num_tx_queues = txqs;
8412	if (netif_alloc_netdev_queues(dev))
8413		goto free_all;
8414
 
8415	dev->num_rx_queues = rxqs;
8416	dev->real_num_rx_queues = rxqs;
8417	if (netif_alloc_rx_queues(dev))
8418		goto free_all;
 
8419
8420	strcpy(dev->name, name);
8421	dev->name_assign_type = name_assign_type;
8422	dev->group = INIT_NETDEV_GROUP;
8423	if (!dev->ethtool_ops)
8424		dev->ethtool_ops = &default_ethtool_ops;
8425
8426	nf_hook_ingress_init(dev);
8427
8428	return dev;
8429
8430free_all:
8431	free_netdev(dev);
8432	return NULL;
8433
8434free_pcpu:
8435	free_percpu(dev->pcpu_refcnt);
8436free_dev:
8437	netdev_freemem(dev);
8438	return NULL;
8439}
8440EXPORT_SYMBOL(alloc_netdev_mqs);
8441
8442/**
8443 * free_netdev - free network device
8444 * @dev: device
8445 *
8446 * This function does the last stage of destroying an allocated device
8447 * interface. The reference to the device object is released. If this
8448 * is the last reference then it will be freed.Must be called in process
8449 * context.
8450 */
8451void free_netdev(struct net_device *dev)
8452{
8453	struct napi_struct *p, *n;
8454
8455	might_sleep();
8456	netif_free_tx_queues(dev);
8457	netif_free_rx_queues(dev);
 
 
8458
8459	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
8460
8461	/* Flush device addresses */
8462	dev_addr_flush(dev);
8463
8464	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
8465		netif_napi_del(p);
8466
8467	free_percpu(dev->pcpu_refcnt);
8468	dev->pcpu_refcnt = NULL;
8469
8470	/*  Compatibility with error handling in drivers */
8471	if (dev->reg_state == NETREG_UNINITIALIZED) {
8472		netdev_freemem(dev);
8473		return;
8474	}
8475
8476	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
8477	dev->reg_state = NETREG_RELEASED;
8478
8479	/* will free via device release */
8480	put_device(&dev->dev);
8481}
8482EXPORT_SYMBOL(free_netdev);
8483
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used while RTNL is locked,
 *	the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
8499
8500/**
8501 *	unregister_netdevice_queue - remove device from the kernel
8502 *	@dev: device
8503 *	@head: list
8504 *
8505 *	This function shuts down a device interface and removes it
8506 *	from the kernel tables.
8507 *	If head not NULL, device is queued to be unregistered later.
8508 *
8509 *	Callers must hold the rtnl semaphore.  You may want
8510 *	unregister_netdev() instead of this.
8511 */
8512
8513void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
8514{
8515	ASSERT_RTNL();
8516
8517	if (head) {
8518		list_move_tail(&dev->unreg_list, head);
8519	} else {
8520		rollback_registered(dev);
8521		/* Finish processing unregister after unlock */
8522		net_set_todo(dev);
8523	}
8524}
8525EXPORT_SYMBOL(unregister_netdevice_queue);
8526
8527/**
8528 *	unregister_netdevice_many - unregister many devices
8529 *	@head: list of devices
8530 *
8531 *  Note: As most callers use a stack allocated list_head,
8532 *  we force a list_del() to make sure stack wont be corrupted later.
8533 */
8534void unregister_netdevice_many(struct list_head *head)
8535{
8536	struct net_device *dev;
8537
8538	if (!list_empty(head)) {
8539		rollback_registered_many(head);
8540		list_for_each_entry(dev, head, unreg_list)
8541			net_set_todo(dev);
8542		list_del(head);
8543	}
8544}
8545EXPORT_SYMBOL(unregister_netdevice_many);
8546
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	Convenience wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general you want to use this and
 *	not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
8565
8566/**
8567 *	dev_change_net_namespace - move device to different nethost namespace
8568 *	@dev: device
8569 *	@net: network namespace
8570 *	@pat: If not NULL name pattern to try if the current device name
8571 *	      is already taken in the destination network namespace.
8572 *
8573 *	This function shuts down a device interface and moves it
8574 *	to a new network namespace. On success 0 is returned, on
8575 *	a failure a netagive errno code is returned.
8576 *
8577 *	Callers must hold the rtnl semaphore.
8578 */
8579
8580int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
8581{
8582	int err, new_nsid, new_ifindex;
8583
8584	ASSERT_RTNL();
8585
8586	/* Don't allow namespace local devices to be moved. */
8587	err = -EINVAL;
8588	if (dev->features & NETIF_F_NETNS_LOCAL)
8589		goto out;
8590
8591	/* Ensure the device has been registrered */
8592	if (dev->reg_state != NETREG_REGISTERED)
8593		goto out;
8594
8595	/* Get out if there is nothing todo */
8596	err = 0;
8597	if (net_eq(dev_net(dev), net))
8598		goto out;
8599
8600	/* Pick the destination device name, and ensure
8601	 * we can use it in the destination network namespace.
8602	 */
8603	err = -EEXIST;
8604	if (__dev_get_by_name(net, dev->name)) {
8605		/* We get here if we can't use the current device name */
8606		if (!pat)
8607			goto out;
8608		if (dev_get_valid_name(net, dev, pat) < 0)
8609			goto out;
8610	}
8611
8612	/*
8613	 * And now a mini version of register_netdevice unregister_netdevice.
8614	 */
8615
8616	/* If device is running close it first. */
8617	dev_close(dev);
8618
8619	/* And unlink it from device chain */
8620	err = -ENODEV;
8621	unlist_netdevice(dev);
8622
8623	synchronize_net();
8624
8625	/* Shutdown queueing discipline. */
8626	dev_shutdown(dev);
8627
8628	/* Notify protocols, that we are about to destroy
8629	 * this device. They should clean all the things.
8630	 *
8631	 * Note that dev->reg_state stays at NETREG_REGISTERED.
8632	 * This is wanted because this way 8021q and macvlan know
8633	 * the device is just moving and can keep their slaves up.
8634	 */
8635	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8636	rcu_barrier();
8637
8638	new_nsid = peernet2id_alloc(dev_net(dev), net);
8639	/* If there is an ifindex conflict assign a new one */
8640	if (__dev_get_by_index(net, dev->ifindex))
8641		new_ifindex = dev_new_index(net);
8642	else
8643		new_ifindex = dev->ifindex;
8644
8645	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
8646			    new_ifindex);
8647
8648	/*
8649	 *	Flush the unicast and multicast chains
8650	 */
8651	dev_uc_flush(dev);
8652	dev_mc_flush(dev);
8653
8654	/* Send a netdev-removed uevent to the old namespace */
8655	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8656	netdev_adjacent_del_links(dev);
8657
8658	/* Actually switch the network namespace */
8659	dev_net_set(dev, net);
8660	dev->ifindex = new_ifindex;
 
 
 
8661
8662	/* Send a netdev-add uevent to the new namespace */
8663	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8664	netdev_adjacent_add_links(dev);
8665
8666	/* Fixup kobjects */
8667	err = device_rename(&dev->dev, dev->name);
8668	WARN_ON(err);
8669
8670	/* Add the device back in the hashes */
8671	list_netdevice(dev);
8672
8673	/* Notify protocols, that a new device appeared. */
8674	call_netdevice_notifiers(NETDEV_REGISTER, dev);
8675
8676	/*
8677	 *	Prevent userspace races by waiting until the network
8678	 *	device is fully setup before sending notifications.
8679	 */
8680	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8681
8682	synchronize_net();
8683	err = 0;
8684out:
8685	return err;
8686}
8687EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8688
/*
 * CPU hotplug callback (registered for CPUHP_NET_DEV_DEAD in
 * net_dev_init()): move the networking work that is still queued on the
 * softnet_data of @oldcpu over to the CPU executing this function.
 *
 * @oldcpu: CPU whose per-cpu softnet_data is being drained.
 *
 * Always returns 0.
 */
static int dev_cpu_dead(unsigned int oldcpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu;
	struct softnet_data *sd, *oldsd, *remsd = NULL;

	/* The queue splicing below is done with local interrupts disabled. */
	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		/* Leave the offline CPU with an empty, self-referencing queue. */
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		/* The backlog NAPI is only unlinked and has its state cleared;
		 * every other NAPI instance is rescheduled on this CPU.
		 */
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	/* NOTE(review): presumably raised so that the completion/output
	 * queues spliced above get serviced on this CPU - confirm against
	 * net_tx_action().
	 */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

#ifdef CONFIG_RPS
	/* Take over the offline CPU's list of pending RPS IPIs. */
	remsd = oldsd->rps_ipi_list;
	oldsd->rps_ipi_list = NULL;
#endif
	/* send out pending IPI's on offline CPU */
	net_rps_send_ipi(remsd);

	/* Re-inject the offline CPU's packets through netif_rx_ni():
	 * first whatever is left on its process_queue, then its
	 * input_pkt_queue.
	 */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return 0;
}
8754
8755/**
8756 *	netdev_increment_features - increment feature set by one
8757 *	@all: current feature set
8758 *	@one: new feature set
8759 *	@mask: mask feature set
8760 *
8761 *	Computes a new feature set after adding a device with feature set
8762 *	@one to the master device with current feature set @all.  Will not
8763 *	enable anything that is off in @mask. Returns the new feature set.
8764 */
8765netdev_features_t netdev_increment_features(netdev_features_t all,
8766	netdev_features_t one, netdev_features_t mask)
8767{
8768	if (mask & NETIF_F_HW_CSUM)
8769		mask |= NETIF_F_CSUM_MASK;
8770	mask |= NETIF_F_VLAN_CHALLENGED;
8771
8772	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8773	all &= one | ~NETIF_F_ALL_FOR_ALL;
8774
8775	/* If one device supports hw checksumming, set for all. */
8776	if (all & NETIF_F_HW_CSUM)
8777		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8778
8779	return all;
8780}
8781EXPORT_SYMBOL(netdev_increment_features);
8782
8783static struct hlist_head * __net_init netdev_create_hash(void)
8784{
8785	int i;
8786	struct hlist_head *hash;
8787
8788	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8789	if (hash != NULL)
8790		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8791			INIT_HLIST_HEAD(&hash[i]);
8792
8793	return hash;
8794}
8795
8796/* Initialize per network namespace state */
8797static int __net_init netdev_init(struct net *net)
8798{
8799	if (net != &init_net)
8800		INIT_LIST_HEAD(&net->dev_base_head);
8801
8802	net->dev_name_head = netdev_create_hash();
8803	if (net->dev_name_head == NULL)
8804		goto err_name;
8805
8806	net->dev_index_head = netdev_create_hash();
8807	if (net->dev_index_head == NULL)
8808		goto err_idx;
8809
8810	return 0;
8811
8812err_idx:
8813	kfree(net->dev_name_head);
8814err_name:
8815	return -ENOMEM;
8816}
8817
8818/**
8819 *	netdev_drivername - network driver for the device
8820 *	@dev: network device
8821 *
8822 *	Determine network driver for device.
8823 */
8824const char *netdev_drivername(const struct net_device *dev)
8825{
8826	const struct device_driver *driver;
8827	const struct device *parent;
8828	const char *empty = "";
8829
8830	parent = dev->dev.parent;
8831	if (!parent)
8832		return empty;
8833
8834	driver = parent->driver;
8835	if (driver && driver->name)
8836		return driver->name;
8837	return empty;
8838}
8839
8840static void __netdev_printk(const char *level, const struct net_device *dev,
8841			    struct va_format *vaf)
8842{
8843	if (dev && dev->dev.parent) {
8844		dev_printk_emit(level[1] - '0',
8845				dev->dev.parent,
8846				"%s %s %s%s: %pV",
8847				dev_driver_string(dev->dev.parent),
8848				dev_name(dev->dev.parent),
8849				netdev_name(dev), netdev_reg_state(dev),
8850				vaf);
8851	} else if (dev) {
8852		printk("%s%s%s: %pV",
8853		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8854	} else {
8855		printk("%s(NULL net_device): %pV", level, vaf);
8856	}
8857}
8858
8859void netdev_printk(const char *level, const struct net_device *dev,
8860		   const char *format, ...)
8861{
8862	struct va_format vaf;
8863	va_list args;
8864
8865	va_start(args, format);
8866
8867	vaf.fmt = format;
8868	vaf.va = &args;
8869
8870	__netdev_printk(level, dev, &vaf);
8871
8872	va_end(args);
8873}
8874EXPORT_SYMBOL(netdev_printk);
8875
/*
 * Generate an exported, fixed-level logging function named @func.
 * Each generated function packs its variadic arguments into a va_format
 * and forwards them, together with @level, to __netdev_printk().
 */
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

/* One logging function per printk level, from KERN_EMERG to KERN_INFO. */
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
8900
8901static void __net_exit netdev_exit(struct net *net)
8902{
8903	kfree(net->dev_name_head);
8904	kfree(net->dev_index_head);
8905	if (net != &init_net)
8906		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
8907}
8908
/*
 * Per-namespace hooks that allocate and free the device hash tables;
 * registered from net_dev_init() with register_pernet_subsys().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
8913
8914static void __net_exit default_device_exit(struct net *net)
8915{
8916	struct net_device *dev, *aux;
8917	/*
8918	 * Push all migratable network devices back to the
8919	 * initial network namespace
8920	 */
8921	rtnl_lock();
8922	for_each_netdev_safe(net, dev, aux) {
8923		int err;
8924		char fb_name[IFNAMSIZ];
8925
8926		/* Ignore unmoveable devices (i.e. loopback) */
8927		if (dev->features & NETIF_F_NETNS_LOCAL)
8928			continue;
8929
8930		/* Leave virtual devices for the generic cleanup */
8931		if (dev->rtnl_link_ops)
8932			continue;
8933
8934		/* Push remaining network devices to init_net */
8935		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8936		err = dev_change_net_namespace(dev, &init_net, fb_name);
8937		if (err) {
8938			pr_emerg("%s: failed to move %s to init_net: %d\n",
8939				 __func__, dev->name, err);
8940			BUG();
8941		}
8942	}
8943	rtnl_unlock();
8944}
8945
/*
 * Acquire the rtnl lock at a moment when none of the namespaces on
 * @net_list has a device unregistration in flight (dev_unreg_count == 0
 * for all of them).  On return the rtnl lock is held by the caller.
 */
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	/* Join the wait queue before the first check so that a wake-up
	 * arriving between the check and wait_woken() is not missed.
	 */
	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		/* Nothing pending: leave the loop with rtnl still held. */
		if (!unregistering)
			break;
		/* Drop rtnl while sleeping, then re-check from scratch. */
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}
8973
8974static void __net_exit default_device_exit_batch(struct list_head *net_list)
8975{
8976	/* At exit all network devices most be removed from a network
8977	 * namespace.  Do this in the reverse order of registration.
8978	 * Do this across as many network namespaces as possible to
8979	 * improve batching efficiency.
8980	 */
8981	struct net_device *dev;
8982	struct net *net;
8983	LIST_HEAD(dev_kill_list);
8984
8985	/* To prevent network device cleanup code from dereferencing
8986	 * loopback devices or network devices that have been freed
8987	 * wait here for all pending unregistrations to complete,
8988	 * before unregistring the loopback device and allowing the
8989	 * network namespace be freed.
8990	 *
8991	 * The netdev todo list containing all network devices
8992	 * unregistrations that happen in default_device_exit_batch
8993	 * will run in the rtnl_unlock() at the end of
8994	 * default_device_exit_batch.
8995	 */
8996	rtnl_lock_unregistering(net_list);
8997	list_for_each_entry(net, net_list, exit_list) {
8998		for_each_netdev_reverse(net, dev) {
8999			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
9000				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
9001			else
9002				unregister_netdevice_queue(dev, &dev_kill_list);
9003		}
9004	}
9005	unregister_netdevice_many(&dev_kill_list);
9006	rtnl_unlock();
9007}
9008
/*
 * Per-namespace exit hooks that evacuate or unregister the devices left
 * in a dying namespace; registered from net_dev_init() with
 * register_pernet_device().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
9013
9014/*
9015 *	Initialize the DEV module. At boot time this walks the device list and
9016 *	unhooks any devices that fail to initialise (normally hardware not
9017 *	present) and leaves us with a valid list of present and active devices.
9018 *
9019 */
9020
9021/*
9022 *       This is called single threaded during boot, so no need
9023 *       to take the rtnl semaphore.
9024 */
/*
 * Boot-time initialisation of the core network device layer.
 *
 * Returns 0 on success, or -ENOMEM if one of the registration steps
 * fails.  Note that the steps already completed are not undone on
 * failure.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	/* Must run exactly once, while still in the boot phase. */
	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	/* Start with empty packet-type and offload handler lists. */
	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
		skb_queue_head_init(&sd->xfrm_backlog);
#endif
		INIT_LIST_HEAD(&sd->poll_list);
		/* Empty output queue: the tail pointer refers to the head. */
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		/* Per-cpu backlog NAPI instance, polled by process_backlog(). */
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  This ensures the loopback device
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	/* A failure to install the CPU-dead callback only triggers a
	 * warning; initialisation is still reported as successful.
	 */
	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
				       NULL, dev_cpu_dead);
	WARN_ON(rc < 0);

	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
v4.10.11
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
 
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <linux/bpf.h>
 
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <net/busy_poll.h>
 101#include <linux/rtnetlink.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 104#include <net/dst_metadata.h>
 105#include <net/pkt_sched.h>
 
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/iw_handler.h>
 115#include <asm/current.h>
 116#include <linux/audit.h>
 117#include <linux/dmaengine.h>
 118#include <linux/err.h>
 119#include <linux/ctype.h>
 120#include <linux/if_arp.h>
 121#include <linux/if_vlan.h>
 122#include <linux/ip.h>
 123#include <net/ip.h>
 124#include <net/mpls.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/static_key.h>
 136#include <linux/hashtable.h>
 137#include <linux/vmalloc.h>
 138#include <linux/if_macvlan.h>
 139#include <linux/errqueue.h>
 140#include <linux/hrtimer.h>
 141#include <linux/netfilter_ingress.h>
 142#include <linux/crash_dump.h>
 
 
 
 143
 144#include "net-sysfs.h"
 145
 146/* Instead of increasing this, you should create a hash table. */
 147#define MAX_GRO_SKBS 8
 148
 149/* This should be increased if a protocol with a bigger head is added. */
 150#define GRO_MAX_HEAD (MAX_HEADER + 128)
 151
/* Taken around updates of the packet-type lists (see dev_add_pack()). */
static DEFINE_SPINLOCK(ptype_lock);
/* Taken around updates of offload_base (see dev_add_offload()). */
static DEFINE_SPINLOCK(offload_lock);
/* Protocol handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
/* Offload handlers; dev_add_offload() keeps them in ascending priority */
static struct list_head offload_base __read_mostly;
 157
 158static int netif_rx_internal(struct sk_buff *skb);
 159static int call_netdevice_notifiers_info(unsigned long val,
 160					 struct net_device *dev,
 161					 struct netdev_notifier_info *info);
 
 162
 163/*
 164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 165 * semaphore.
 166 *
 167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 168 *
 169 * Writers must hold the rtnl semaphore while they loop through the
 170 * dev_base_head list, and hold dev_base_lock for writing when they do the
 171 * actual updates.  This allows pure readers to access the list even
 172 * while a writer is preparing to update it.
 173 *
 174 * To put it another way, dev_base_lock is held for writing only to
 175 * protect against pure readers; the rtnl semaphore provides the
 176 * protection against other writers.
 177 *
 178 * See, for example usages, register_netdevice() and
 179 * unregister_netdevice(), which must be called with the rtnl
 180 * semaphore held.
 181 */
 182DEFINE_RWLOCK(dev_base_lock);
 183EXPORT_SYMBOL(dev_base_lock);
 184
 
 
 185/* protects napi_hash addition/deletion and napi_gen_id */
 186static DEFINE_SPINLOCK(napi_hash_lock);
 187
 188static unsigned int napi_gen_id = NR_CPUS;
 189static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 190
 191static seqcount_t devnet_rename_seq;
 192
 193static inline void dev_base_seq_inc(struct net *net)
 194{
 195	while (++net->dev_base_seq == 0);
 
 196}
 197
 198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 199{
 200	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 201
 202	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208}
 209
 210static inline void rps_lock(struct softnet_data *sd)
 211{
 212#ifdef CONFIG_RPS
 213	spin_lock(&sd->input_pkt_queue.lock);
 214#endif
 215}
 216
 217static inline void rps_unlock(struct softnet_data *sd)
 218{
 219#ifdef CONFIG_RPS
 220	spin_unlock(&sd->input_pkt_queue.lock);
 221#endif
 222}
 223
 224/* Device list insertion */
 225static void list_netdevice(struct net_device *dev)
 226{
 227	struct net *net = dev_net(dev);
 228
 229	ASSERT_RTNL();
 230
 231	write_lock_bh(&dev_base_lock);
 232	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234	hlist_add_head_rcu(&dev->index_hlist,
 235			   dev_index_hash(net, dev->ifindex));
 236	write_unlock_bh(&dev_base_lock);
 237
 238	dev_base_seq_inc(net);
 239}
 240
 241/* Device list removal
 242 * caller must respect a RCU grace period before freeing/reusing dev
 243 */
 244static void unlist_netdevice(struct net_device *dev)
 245{
 246	ASSERT_RTNL();
 247
 248	/* Unlink dev from the device chain */
 249	write_lock_bh(&dev_base_lock);
 250	list_del_rcu(&dev->dev_list);
 251	hlist_del_rcu(&dev->name_hlist);
 252	hlist_del_rcu(&dev->index_hlist);
 253	write_unlock_bh(&dev_base_lock);
 254
 255	dev_base_seq_inc(dev_net(dev));
 256}
 257
 258/*
 259 *	Our notifier list
 260 */
 261
 262static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264/*
 265 *	Device drivers call our routines to queue packets here. We empty the
 266 *	queue in the local softnet handler.
 267 */
 268
 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272#ifdef CONFIG_LOCKDEP
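/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type.  netdev_lock_type[] and netdev_lock_name[]
 * are parallel tables: the position of a type in the first selects the
 * lock-class name (and key) at the same position, so both tables must
 * be kept in the same order.
 */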
 277static const unsigned short netdev_lock_type[] =
 278	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 291	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 292	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 293
 294static const char *const netdev_lock_name[] =
 295	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 308	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 309	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 310
 311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315{
 316	int i;
 317
 318	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319		if (netdev_lock_type[i] == dev_type)
 320			return i;
 321	/* the last key is used by default */
 322	return ARRAY_SIZE(netdev_lock_type) - 1;
 323}
 324
 325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326						 unsigned short dev_type)
 327{
 328	int i;
 329
 330	i = netdev_lock_pos(dev_type);
 331	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332				   netdev_lock_name[i]);
 333}
 334
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev->type);
 340	lockdep_set_class_and_name(&dev->addr_list_lock,
 341				   &netdev_addr_lock_key[i],
 342				   netdev_lock_name[i]);
 343}
 344#else
/* Without CONFIG_LOCKDEP the lock-class helpers are empty stubs. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 352#endif
 353
 354/*******************************************************************************
 
 
 
 
 355
 356		Protocol management and registration routines
 357
 358*******************************************************************************/
 359
 360/*
 361 *	Add a protocol ID to the list. Now that the input handler is
 362 *	smarter we can dispense with all the messy stuff that used to be
 363 *	here.
 364 *
 365 *	BEWARE!!! Protocol handlers, mangling input packets,
 366 *	MUST BE last in hash buckets and checking protocol handlers
 367 *	MUST start from promiscuous ptype_all chain in net_bh.
 368 *	It is true now, do not change it.
 369 *	Explanation follows: if protocol handler, mangling packet, will
 370 *	be the first on list, it is not able to sense, that packet
 371 *	is cloned and should be copied-on-write, so that it will
 372 *	change it and subsequent readers will get broken packet.
 373 *							--ANK (980803)
 374 */
 375
 376static inline struct list_head *ptype_head(const struct packet_type *pt)
 377{
 378	if (pt->type == htons(ETH_P_ALL))
 379		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 380	else
 381		return pt->dev ? &pt->dev->ptype_specific :
 382				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383}
 384
 385/**
 386 *	dev_add_pack - add packet handler
 387 *	@pt: packet type declaration
 388 *
 389 *	Add a protocol handler to the networking stack. The passed &packet_type
 390 *	is linked into kernel lists and may not be freed until it has been
 391 *	removed from the kernel lists.
 392 *
 393 *	This call does not sleep therefore it can not
 394 *	guarantee all CPU's that are in middle of receiving packets
 395 *	will see the new packet type (until the next received packet).
 396 */
 397
 398void dev_add_pack(struct packet_type *pt)
 399{
 400	struct list_head *head = ptype_head(pt);
 401
 402	spin_lock(&ptype_lock);
 403	list_add_rcu(&pt->list, head);
 404	spin_unlock(&ptype_lock);
 405}
 406EXPORT_SYMBOL(dev_add_pack);
 407
 408/**
 409 *	__dev_remove_pack	 - remove packet handler
 410 *	@pt: packet type declaration
 411 *
 412 *	Remove a protocol handler that was previously added to the kernel
 413 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414 *	from the kernel lists and can be freed or reused once this function
 415 *	returns.
 416 *
 417 *      The packet type might still be in use by receivers
 418 *	and must not be freed until after all the CPU's have gone
 419 *	through a quiescent state.
 420 */
 421void __dev_remove_pack(struct packet_type *pt)
 422{
 423	struct list_head *head = ptype_head(pt);
 424	struct packet_type *pt1;
 425
 426	spin_lock(&ptype_lock);
 427
 428	list_for_each_entry(pt1, head, list) {
 429		if (pt == pt1) {
 430			list_del_rcu(&pt->list);
 431			goto out;
 432		}
 433	}
 434
 435	pr_warn("dev_remove_pack: %p not found\n", pt);
 436out:
 437	spin_unlock(&ptype_lock);
 438}
 439EXPORT_SYMBOL(__dev_remove_pack);
 440
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is
 *	unlinked with __dev_remove_pack() and, because this function then
 *	waits in synchronize_net(), can be freed or reused once it returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
 460
 461
 462/**
 463 *	dev_add_offload - register offload handlers
 464 *	@po: protocol offload declaration
 465 *
 466 *	Add protocol offload handlers to the networking stack. The passed
 467 *	&proto_offload is linked into kernel lists and may not be freed until
 468 *	it has been removed from the kernel lists.
 469 *
 470 *	This call does not sleep therefore it can not
 471 *	guarantee all CPU's that are in middle of receiving packets
 472 *	will see the new offload handlers (until the next received packet).
 473 */
 474void dev_add_offload(struct packet_offload *po)
 475{
 476	struct packet_offload *elem;
 477
 478	spin_lock(&offload_lock);
 479	list_for_each_entry(elem, &offload_base, list) {
 480		if (po->priority < elem->priority)
 481			break;
 482	}
 483	list_add_rcu(&po->list, elem->list.prev);
 484	spin_unlock(&offload_lock);
 485}
 486EXPORT_SYMBOL(dev_add_offload);
 487
 488/**
 489 *	__dev_remove_offload	 - remove offload handler
 490 *	@po: packet offload declaration
 491 *
 492 *	Remove a protocol offload handler that was previously added to the
 493 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 494 *	is removed from the kernel lists and can be freed or reused once this
 495 *	function returns.
 496 *
 497 *      The packet type might still be in use by receivers
 498 *	and must not be freed until after all the CPU's have gone
 499 *	through a quiescent state.
 500 */
 501static void __dev_remove_offload(struct packet_offload *po)
 502{
 503	struct list_head *head = &offload_base;
 504	struct packet_offload *po1;
 505
 506	spin_lock(&offload_lock);
 507
 508	list_for_each_entry(po1, head, list) {
 509		if (po == po1) {
 510			list_del_rcu(&po->list);
 511			goto out;
 512		}
 513	}
 514
 515	pr_warn("dev_remove_offload: %p not found\n", po);
 516out:
 517	spin_unlock(&offload_lock);
 518}
 519
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &packet_offload is
 *	unlinked with __dev_remove_offload() and, because this function then
 *	waits in synchronize_net(), can be freed or reused once it returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the offload
 *	handler after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
 539
 540/******************************************************************************
 541
 542		      Device Boot-time Settings Routines
 543
 544*******************************************************************************/
 545
 546/* Boot time configuration table */
 547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 548
 549/**
 550 *	netdev_boot_setup_add	- add new setup entry
 551 *	@name: name of the device
 552 *	@map: configured settings for the device
 553 *
 554 *	Adds new setup entry to the dev_boot_setup list.  The function
 555 *	returns 0 on error and 1 on success.  This is a generic routine to
 556 *	all netdevices.
 557 */
 558static int netdev_boot_setup_add(char *name, struct ifmap *map)
 559{
 560	struct netdev_boot_setup *s;
 561	int i;
 562
 563	s = dev_boot_setup;
 564	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 566			memset(s[i].name, 0, sizeof(s[i].name));
 567			strlcpy(s[i].name, name, IFNAMSIZ);
 568			memcpy(&s[i].map, map, sizeof(s[i].map));
 569			break;
 570		}
 571	}
 572
 573	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 574}
 575
 576/**
 577 *	netdev_boot_setup_check	- check boot time settings
 578 *	@dev: the netdevice
 579 *
 580 * 	Check boot time settings for the device.
 581 *	The found settings are set for the device to be used
 582 *	later in the device probing.
 583 *	Returns 0 if no settings found, 1 if they are.
 584 */
 585int netdev_boot_setup_check(struct net_device *dev)
 586{
 587	struct netdev_boot_setup *s = dev_boot_setup;
 588	int i;
 589
 590	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 591		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 592		    !strcmp(dev->name, s[i].name)) {
 593			dev->irq 	= s[i].map.irq;
 594			dev->base_addr 	= s[i].map.base_addr;
 595			dev->mem_start 	= s[i].map.mem_start;
 596			dev->mem_end 	= s[i].map.mem_end;
 597			return 1;
 598		}
 599	}
 600	return 0;
 601}
 602EXPORT_SYMBOL(netdev_boot_setup_check);
 603
 604
 605/**
 606 *	netdev_boot_base	- get address from boot time settings
 607 *	@prefix: prefix for network device
 608 *	@unit: id for network device
 609 *
 610 * 	Check boot time settings for the base address of device.
 611 *	The found settings are set for the device to be used
 612 *	later in the device probing.
 613 *	Returns 0 if no settings found.
 614 */
 615unsigned long netdev_boot_base(const char *prefix, int unit)
 616{
 617	const struct netdev_boot_setup *s = dev_boot_setup;
 618	char name[IFNAMSIZ];
 619	int i;
 620
 621	sprintf(name, "%s%d", prefix, unit);
 622
 623	/*
 624	 * If device already registered then return base of 1
 625	 * to indicate not to probe for this interface
 626	 */
 627	if (__dev_get_by_name(&init_net, name))
 628		return 1;
 629
 630	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 631		if (!strcmp(name, s[i].name))
 632			return s[i].map.base_addr;
 633	return 0;
 634}
 635
 636/*
 637 * Saves at boot time configured settings for any netdevice.
 638 */
 639int __init netdev_boot_setup(char *str)
 640{
 641	int ints[5];
 642	struct ifmap map;
 643
 644	str = get_options(str, ARRAY_SIZE(ints), ints);
 645	if (!str || !*str)
 646		return 0;
 647
 648	/* Save settings */
 649	memset(&map, 0, sizeof(map));
 650	if (ints[0] > 0)
 651		map.irq = ints[1];
 652	if (ints[0] > 1)
 653		map.base_addr = ints[2];
 654	if (ints[0] > 2)
 655		map.mem_start = ints[3];
 656	if (ints[0] > 3)
 657		map.mem_end = ints[4];
 658
 659	/* Add new entry to the list */
 660	return netdev_boot_setup_add(str, &map);
 661}
 662
 663__setup("netdev=", netdev_boot_setup);
 664
 665/*******************************************************************************
 666
 667			    Device Interface Subroutines
 668
 669*******************************************************************************/
 670
 671/**
 672 *	dev_get_iflink	- get 'iflink' value of a interface
 673 *	@dev: targeted interface
 674 *
 675 *	Indicates the ifindex the interface is linked to.
 676 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 677 */
 678
 679int dev_get_iflink(const struct net_device *dev)
 680{
 681	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 682		return dev->netdev_ops->ndo_get_iflink(dev);
 683
 684	return dev->ifindex;
 685}
 686EXPORT_SYMBOL(dev_get_iflink);
 687
 688/**
 689 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 690 *	@dev: targeted interface
 691 *	@skb: The packet.
 692 *
 693 *	For better visibility of tunnel traffic OVS needs to retrieve
 694 *	egress tunnel information for a packet. Following API allows
 695 *	user to get this info.
 696 */
 697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 698{
 699	struct ip_tunnel_info *info;
 700
 701	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 702		return -EINVAL;
 703
 704	info = skb_tunnel_info_unclone(skb);
 705	if (!info)
 706		return -ENOMEM;
 707	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 708		return -EINVAL;
 709
 710	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 711}
 712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 713
 714/**
 715 *	__dev_get_by_name	- find a device by its name
 716 *	@net: the applicable net namespace
 717 *	@name: name to find
 718 *
 719 *	Find an interface by name. Must be called under RTNL semaphore
 720 *	or @dev_base_lock. If the name is found a pointer to the device
 721 *	is returned. If the name is not found then %NULL is returned. The
 722 *	reference counters are not incremented so the caller must be
 723 *	careful with locks.
 724 */
 725
 726struct net_device *__dev_get_by_name(struct net *net, const char *name)
 727{
 728	struct net_device *dev;
 729	struct hlist_head *head = dev_name_hash(net, name);
 730
 731	hlist_for_each_entry(dev, head, name_hlist)
 732		if (!strncmp(dev->name, name, IFNAMSIZ))
 733			return dev;
 734
 735	return NULL;
 736}
 737EXPORT_SYMBOL(__dev_get_by_name);
 738
 739/**
 740 *	dev_get_by_name_rcu	- find a device by its name
 741 *	@net: the applicable net namespace
 742 *	@name: name to find
 743 *
 744 *	Find an interface by name.
 745 *	If the name is found a pointer to the device is returned.
 746 * 	If the name is not found then %NULL is returned.
 747 *	The reference counters are not incremented so the caller must be
 748 *	careful with locks. The caller must hold RCU lock.
 749 */
 750
 751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 752{
 753	struct net_device *dev;
 754	struct hlist_head *head = dev_name_hash(net, name);
 755
 756	hlist_for_each_entry_rcu(dev, head, name_hlist)
 757		if (!strncmp(dev->name, name, IFNAMSIZ))
 758			return dev;
 759
 760	return NULL;
 761}
 762EXPORT_SYMBOL(dev_get_by_name_rcu);
 763
 764/**
 765 *	dev_get_by_name		- find a device by its name
 766 *	@net: the applicable net namespace
 767 *	@name: name to find
 768 *
 769 *	Find an interface by name. This can be called from any
 770 *	context and does its own locking. The returned handle has
 771 *	the usage count incremented and the caller must use dev_put() to
 772 *	release it when it is no longer needed. %NULL is returned if no
 773 *	matching device is found.
 774 */
 775
 776struct net_device *dev_get_by_name(struct net *net, const char *name)
 777{
 778	struct net_device *dev;
 779
 780	rcu_read_lock();
 781	dev = dev_get_by_name_rcu(net, name);
 782	if (dev)
 783		dev_hold(dev);
 784	rcu_read_unlock();
 785	return dev;
 786}
 787EXPORT_SYMBOL(dev_get_by_name);
 788
 789/**
 790 *	__dev_get_by_index - find a device by its ifindex
 791 *	@net: the applicable net namespace
 792 *	@ifindex: index of device
 793 *
 794 *	Search for an interface by index. Returns %NULL if the device
 795 *	is not found or a pointer to the device. The device has not
 796 *	had its reference counter increased so the caller must be careful
 797 *	about locking. The caller must hold either the RTNL semaphore
 798 *	or @dev_base_lock.
 799 */
 800
 801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 802{
 803	struct net_device *dev;
 804	struct hlist_head *head = dev_index_hash(net, ifindex);
 805
 806	hlist_for_each_entry(dev, head, index_hlist)
 807		if (dev->ifindex == ifindex)
 808			return dev;
 809
 810	return NULL;
 811}
 812EXPORT_SYMBOL(__dev_get_by_index);
 813
 814/**
 815 *	dev_get_by_index_rcu - find a device by its ifindex
 816 *	@net: the applicable net namespace
 817 *	@ifindex: index of device
 818 *
 819 *	Search for an interface by index. Returns %NULL if the device
 820 *	is not found or a pointer to the device. The device has not
 821 *	had its reference counter increased so the caller must be careful
 822 *	about locking. The caller must hold RCU lock.
 823 */
 824
 825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 826{
 827	struct net_device *dev;
 828	struct hlist_head *head = dev_index_hash(net, ifindex);
 829
 830	hlist_for_each_entry_rcu(dev, head, index_hlist)
 831		if (dev->ifindex == ifindex)
 832			return dev;
 833
 834	return NULL;
 835}
 836EXPORT_SYMBOL(dev_get_by_index_rcu);
 837
 838
 839/**
 840 *	dev_get_by_index - find a device by its ifindex
 841 *	@net: the applicable net namespace
 842 *	@ifindex: index of device
 843 *
 844 *	Search for an interface by index. Returns NULL if the device
 845 *	is not found or a pointer to the device. The device returned has
 846 *	had a reference added and the pointer is safe until the user calls
 847 *	dev_put to indicate they have finished with it.
 848 */
 849
 850struct net_device *dev_get_by_index(struct net *net, int ifindex)
 851{
 852	struct net_device *dev;
 853
 854	rcu_read_lock();
 855	dev = dev_get_by_index_rcu(net, ifindex);
 856	if (dev)
 857		dev_hold(dev);
 858	rcu_read_unlock();
 859	return dev;
 860}
 861EXPORT_SYMBOL(dev_get_by_index);
 862
 863/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 864 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 865 *	@net: network namespace
 866 *	@name: a pointer to the buffer where the name will be stored.
 867 *	@ifindex: the ifindex of the interface to get the name from.
 868 *
 869 *	The use of raw_seqcount_begin() and cond_resched() before
 870 *	retrying is required as we want to give the writers a chance
 871 *	to complete when CONFIG_PREEMPT is not set.
 872 */
 873int netdev_get_name(struct net *net, char *name, int ifindex)
 874{
 875	struct net_device *dev;
 876	unsigned int seq;
 877
 878retry:
 879	seq = raw_seqcount_begin(&devnet_rename_seq);
 880	rcu_read_lock();
 881	dev = dev_get_by_index_rcu(net, ifindex);
 882	if (!dev) {
 883		rcu_read_unlock();
 884		return -ENODEV;
 885	}
 886
 887	strcpy(name, dev->name);
 888	rcu_read_unlock();
 889	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 890		cond_resched();
 891		goto retry;
 892	}
 893
 894	return 0;
 895}
 896
 897/**
 898 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 899 *	@net: the applicable net namespace
 900 *	@type: media type of device
 901 *	@ha: hardware address
 902 *
 903 *	Search for an interface by MAC address. Returns NULL if the device
 904 *	is not found or a pointer to the device.
 905 *	The caller must hold RCU or RTNL.
 906 *	The returned device has not had its ref count increased
 907 *	and the caller must therefore be careful about locking
 908 *
 909 */
 910
 911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 912				       const char *ha)
 913{
 914	struct net_device *dev;
 915
 916	for_each_netdev_rcu(net, dev)
 917		if (dev->type == type &&
 918		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 919			return dev;
 920
 921	return NULL;
 922}
 923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 924
 925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 926{
 927	struct net_device *dev;
 928
 929	ASSERT_RTNL();
 930	for_each_netdev(net, dev)
 931		if (dev->type == type)
 932			return dev;
 933
 934	return NULL;
 935}
 936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 937
 938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 939{
 940	struct net_device *dev, *ret = NULL;
 941
 942	rcu_read_lock();
 943	for_each_netdev_rcu(net, dev)
 944		if (dev->type == type) {
 945			dev_hold(dev);
 946			ret = dev;
 947			break;
 948		}
 949	rcu_read_unlock();
 950	return ret;
 951}
 952EXPORT_SYMBOL(dev_getfirstbyhwtype);
 953
 954/**
 955 *	__dev_get_by_flags - find any device with given flags
 956 *	@net: the applicable net namespace
 957 *	@if_flags: IFF_* values
 958 *	@mask: bitmask of bits in if_flags to check
 959 *
 960 *	Search for any interface with the given flags. Returns NULL if a device
 961 *	is not found or a pointer to the device. Must be called inside
 962 *	rtnl_lock(), and result refcount is unchanged.
 963 */
 964
 965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 966				      unsigned short mask)
 967{
 968	struct net_device *dev, *ret;
 969
 970	ASSERT_RTNL();
 971
 972	ret = NULL;
 973	for_each_netdev(net, dev) {
 974		if (((dev->flags ^ if_flags) & mask) == 0) {
 975			ret = dev;
 976			break;
 977		}
 978	}
 979	return ret;
 980}
 981EXPORT_SYMBOL(__dev_get_by_flags);
 982
 983/**
 984 *	dev_valid_name - check if name is okay for network device
 985 *	@name: name string
 986 *
 987 *	Network device names need to be valid file names to
 988 *	to allow sysfs to work.  We also disallow any kind of
 989 *	whitespace.
 990 */
 991bool dev_valid_name(const char *name)
 992{
 993	if (*name == '\0')
 994		return false;
 995	if (strlen(name) >= IFNAMSIZ)
 996		return false;
 997	if (!strcmp(name, ".") || !strcmp(name, ".."))
 998		return false;
 999
1000	while (*name) {
1001		if (*name == '/' || *name == ':' || isspace(*name))
1002			return false;
1003		name++;
1004	}
1005	return true;
1006}
1007EXPORT_SYMBOL(dev_valid_name);
1008
1009/**
1010 *	__dev_alloc_name - allocate a name for a device
1011 *	@net: network namespace to allocate the device name in
1012 *	@name: name format string
1013 *	@buf:  scratch buffer and result name string
1014 *
1015 *	Passed a format string - eg "lt%d" it will try and find a suitable
1016 *	id. It scans list of devices to build up a free map, then chooses
1017 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018 *	while allocating the name and adding the device in order to avoid
1019 *	duplicates.
1020 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 *	Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025{
1026	int i = 0;
1027	const char *p;
1028	const int max_netdevices = 8*PAGE_SIZE;
1029	unsigned long *inuse;
1030	struct net_device *d;
1031
1032	p = strnchr(name, IFNAMSIZ-1, '%');
 
 
 
1033	if (p) {
1034		/*
1035		 * Verify the string as this thing may have come from
1036		 * the user.  There must be either one "%d" and no other "%"
1037		 * characters.
1038		 */
1039		if (p[1] != 'd' || strchr(p + 2, '%'))
1040			return -EINVAL;
1041
1042		/* Use one page as a bit array of possible slots */
1043		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044		if (!inuse)
1045			return -ENOMEM;
1046
1047		for_each_netdev(net, d) {
1048			if (!sscanf(d->name, name, &i))
1049				continue;
1050			if (i < 0 || i >= max_netdevices)
1051				continue;
1052
1053			/*  avoid cases where sscanf is not exact inverse of printf */
1054			snprintf(buf, IFNAMSIZ, name, i);
1055			if (!strncmp(buf, d->name, IFNAMSIZ))
1056				set_bit(i, inuse);
1057		}
1058
1059		i = find_first_zero_bit(inuse, max_netdevices);
1060		free_page((unsigned long) inuse);
1061	}
1062
1063	if (buf != name)
1064		snprintf(buf, IFNAMSIZ, name, i);
1065	if (!__dev_get_by_name(net, buf))
1066		return i;
1067
1068	/* It is possible to run out of possible slots
1069	 * when the name is long and there isn't enough space left
1070	 * for the digits, or if all bits are used.
1071	 */
1072	return -ENFILE;
1073}
1074
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1075/**
1076 *	dev_alloc_name - allocate a name for a device
1077 *	@dev: device
1078 *	@name: name format string
1079 *
1080 *	Passed a format string - eg "lt%d" it will try and find a suitable
1081 *	id. It scans list of devices to build up a free map, then chooses
1082 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083 *	while allocating the name and adding the device in order to avoid
1084 *	duplicates.
1085 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 *	Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091	char buf[IFNAMSIZ];
1092	struct net *net;
1093	int ret;
1094
1095	BUG_ON(!dev_net(dev));
1096	net = dev_net(dev);
1097	ret = __dev_alloc_name(net, name, buf);
1098	if (ret >= 0)
1099		strlcpy(dev->name, buf, IFNAMSIZ);
1100	return ret;
1101}
1102EXPORT_SYMBOL(dev_alloc_name);
1103
1104static int dev_alloc_name_ns(struct net *net,
1105			     struct net_device *dev,
1106			     const char *name)
1107{
1108	char buf[IFNAMSIZ];
1109	int ret;
1110
1111	ret = __dev_alloc_name(net, name, buf);
1112	if (ret >= 0)
1113		strlcpy(dev->name, buf, IFNAMSIZ);
1114	return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118			      struct net_device *dev,
1119			      const char *name)
1120{
1121	BUG_ON(!net);
1122
1123	if (!dev_valid_name(name))
1124		return -EINVAL;
1125
1126	if (strchr(name, '%'))
1127		return dev_alloc_name_ns(net, dev, name);
1128	else if (__dev_get_by_name(net, name))
1129		return -EEXIST;
1130	else if (dev->name != name)
1131		strlcpy(dev->name, name, IFNAMSIZ);
1132
1133	return 0;
1134}
 
1135
1136/**
1137 *	dev_change_name - change name of a device
1138 *	@dev: device
1139 *	@newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141 *	Change name of a device, can pass format strings "eth%d".
1142 *	for wildcarding.
1143 */
1144int dev_change_name(struct net_device *dev, const char *newname)
1145{
1146	unsigned char old_assign_type;
1147	char oldname[IFNAMSIZ];
1148	int err = 0;
1149	int ret;
1150	struct net *net;
1151
1152	ASSERT_RTNL();
1153	BUG_ON(!dev_net(dev));
1154
1155	net = dev_net(dev);
1156	if (dev->flags & IFF_UP)
1157		return -EBUSY;
1158
1159	write_seqcount_begin(&devnet_rename_seq);
1160
1161	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162		write_seqcount_end(&devnet_rename_seq);
1163		return 0;
1164	}
1165
1166	memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168	err = dev_get_valid_name(net, dev, newname);
1169	if (err < 0) {
1170		write_seqcount_end(&devnet_rename_seq);
1171		return err;
1172	}
1173
1174	if (oldname[0] && !strchr(oldname, '%'))
1175		netdev_info(dev, "renamed from %s\n", oldname);
1176
1177	old_assign_type = dev->name_assign_type;
1178	dev->name_assign_type = NET_NAME_RENAMED;
1179
1180rollback:
1181	ret = device_rename(&dev->dev, dev->name);
1182	if (ret) {
1183		memcpy(dev->name, oldname, IFNAMSIZ);
1184		dev->name_assign_type = old_assign_type;
1185		write_seqcount_end(&devnet_rename_seq);
1186		return ret;
1187	}
1188
1189	write_seqcount_end(&devnet_rename_seq);
1190
1191	netdev_adjacent_rename_links(dev, oldname);
1192
1193	write_lock_bh(&dev_base_lock);
1194	hlist_del_rcu(&dev->name_hlist);
1195	write_unlock_bh(&dev_base_lock);
1196
1197	synchronize_rcu();
1198
1199	write_lock_bh(&dev_base_lock);
1200	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201	write_unlock_bh(&dev_base_lock);
1202
1203	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204	ret = notifier_to_errno(ret);
1205
1206	if (ret) {
1207		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208		if (err >= 0) {
1209			err = ret;
1210			write_seqcount_begin(&devnet_rename_seq);
1211			memcpy(dev->name, oldname, IFNAMSIZ);
1212			memcpy(oldname, newname, IFNAMSIZ);
1213			dev->name_assign_type = old_assign_type;
1214			old_assign_type = NET_NAME_RENAMED;
1215			goto rollback;
1216		} else {
1217			pr_err("%s: name change rollback failed: %d\n",
1218			       dev->name, ret);
1219		}
1220	}
1221
1222	return err;
1223}
1224
1225/**
1226 *	dev_set_alias - change ifalias of a device
1227 *	@dev: device
1228 *	@alias: name up to IFALIASZ
1229 *	@len: limit of bytes to copy from info
1230 *
1231 *	Set ifalias for a device,
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
1235	char *new_ifalias;
1236
1237	ASSERT_RTNL();
1238
1239	if (len >= IFALIASZ)
1240		return -EINVAL;
1241
1242	if (!len) {
1243		kfree(dev->ifalias);
1244		dev->ifalias = NULL;
1245		return 0;
 
 
 
1246	}
1247
1248	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249	if (!new_ifalias)
1250		return -ENOMEM;
1251	dev->ifalias = new_ifalias;
 
 
 
1252
1253	strlcpy(dev->ifalias, alias, len+1);
1254	return len;
1255}
1256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1257
1258/**
1259 *	netdev_features_change - device changes features
1260 *	@dev: device to cause notification
1261 *
1262 *	Called to indicate a device has changed features.
1263 */
1264void netdev_features_change(struct net_device *dev)
1265{
1266	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267}
1268EXPORT_SYMBOL(netdev_features_change);
1269
1270/**
1271 *	netdev_state_change - device changes state
1272 *	@dev: device to cause notification
1273 *
1274 *	Called to indicate a device has changed state. This function calls
1275 *	the notifier chains for netdev_chain and sends a NEWLINK message
1276 *	to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280	if (dev->flags & IFF_UP) {
1281		struct netdev_notifier_change_info change_info;
 
 
1282
1283		change_info.flags_changed = 0;
1284		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285					      &change_info.info);
1286		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287	}
1288}
1289EXPORT_SYMBOL(netdev_state_change);
1290
1291/**
1292 * 	netdev_notify_peers - notify network peers about existence of @dev
1293 * 	@dev: network device
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301void netdev_notify_peers(struct net_device *dev)
1302{
1303	rtnl_lock();
1304	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 
1305	rtnl_unlock();
1306}
1307EXPORT_SYMBOL(netdev_notify_peers);
1308
1309static int __dev_open(struct net_device *dev)
1310{
1311	const struct net_device_ops *ops = dev->netdev_ops;
1312	int ret;
1313
1314	ASSERT_RTNL();
1315
1316	if (!netif_device_present(dev))
1317		return -ENODEV;
1318
1319	/* Block netpoll from trying to do any rx path servicing.
1320	 * If we don't do this there is a chance ndo_poll_controller
1321	 * or ndo_poll may be running while we open the device
1322	 */
1323	netpoll_poll_disable(dev);
1324
1325	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326	ret = notifier_to_errno(ret);
1327	if (ret)
1328		return ret;
1329
1330	set_bit(__LINK_STATE_START, &dev->state);
1331
1332	if (ops->ndo_validate_addr)
1333		ret = ops->ndo_validate_addr(dev);
1334
1335	if (!ret && ops->ndo_open)
1336		ret = ops->ndo_open(dev);
1337
1338	netpoll_poll_enable(dev);
1339
1340	if (ret)
1341		clear_bit(__LINK_STATE_START, &dev->state);
1342	else {
1343		dev->flags |= IFF_UP;
1344		dev_set_rx_mode(dev);
1345		dev_activate(dev);
1346		add_device_randomness(dev->dev_addr, dev->addr_len);
1347	}
1348
1349	return ret;
1350}
1351
1352/**
1353 *	dev_open	- prepare an interface for use.
1354 *	@dev:	device to open
1355 *
1356 *	Takes a device from down to up state. The device's private open
1357 *	function is invoked and then the multicast lists are loaded. Finally
1358 *	the device is moved into the up state and a %NETDEV_UP message is
1359 *	sent to the netdev notifier chain.
1360 *
1361 *	Calling this function on an active interface is a nop. On a failure
1362 *	a negative errno code is returned.
1363 */
1364int dev_open(struct net_device *dev)
1365{
1366	int ret;
1367
1368	if (dev->flags & IFF_UP)
1369		return 0;
1370
1371	ret = __dev_open(dev);
1372	if (ret < 0)
1373		return ret;
1374
1375	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376	call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378	return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
1381
1382static int __dev_close_many(struct list_head *head)
1383{
1384	struct net_device *dev;
1385
1386	ASSERT_RTNL();
1387	might_sleep();
1388
1389	list_for_each_entry(dev, head, close_list) {
1390		/* Temporarily disable netpoll until the interface is down */
1391		netpoll_poll_disable(dev);
1392
1393		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395		clear_bit(__LINK_STATE_START, &dev->state);
1396
1397		/* Synchronize to scheduled poll. We cannot touch poll list, it
1398		 * can be even on different cpu. So just clear netif_running().
1399		 *
1400		 * dev->stop() will invoke napi_disable() on all of it's
1401		 * napi_struct instances on this device.
1402		 */
1403		smp_mb__after_atomic(); /* Commit netif_running(). */
1404	}
1405
1406	dev_deactivate_many(head);
1407
1408	list_for_each_entry(dev, head, close_list) {
1409		const struct net_device_ops *ops = dev->netdev_ops;
1410
1411		/*
1412		 *	Call the device specific close. This cannot fail.
1413		 *	Only if device is UP
1414		 *
1415		 *	We allow it to be called even after a DETACH hot-plug
1416		 *	event.
1417		 */
1418		if (ops->ndo_stop)
1419			ops->ndo_stop(dev);
1420
1421		dev->flags &= ~IFF_UP;
1422		netpoll_poll_enable(dev);
1423	}
1424
1425	return 0;
1426}
1427
1428static int __dev_close(struct net_device *dev)
1429{
1430	int retval;
1431	LIST_HEAD(single);
1432
1433	list_add(&dev->close_list, &single);
1434	retval = __dev_close_many(&single);
1435	list_del(&single);
1436
1437	return retval;
1438}
1439
1440int dev_close_many(struct list_head *head, bool unlink)
1441{
1442	struct net_device *dev, *tmp;
1443
1444	/* Remove the devices that don't need to be closed */
1445	list_for_each_entry_safe(dev, tmp, head, close_list)
1446		if (!(dev->flags & IFF_UP))
1447			list_del_init(&dev->close_list);
1448
1449	__dev_close_many(head);
1450
1451	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454		if (unlink)
1455			list_del_init(&dev->close_list);
1456	}
1457
1458	return 0;
1459}
1460EXPORT_SYMBOL(dev_close_many);
1461
1462/**
1463 *	dev_close - shutdown an interface.
1464 *	@dev: device to shutdown
1465 *
1466 *	This function moves an active device into down state. A
1467 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 *	chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
1473	if (dev->flags & IFF_UP) {
1474		LIST_HEAD(single);
1475
1476		list_add(&dev->close_list, &single);
1477		dev_close_many(&single, true);
1478		list_del(&single);
1479	}
1480	return 0;
1481}
1482EXPORT_SYMBOL(dev_close);
1483
1484
1485/**
1486 *	dev_disable_lro - disable Large Receive Offload on a device
1487 *	@dev: device
1488 *
1489 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490 *	called under RTNL.  This is needed if received packets may be
1491 *	forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
1495	struct net_device *lower_dev;
1496	struct list_head *iter;
1497
1498	dev->wanted_features &= ~NETIF_F_LRO;
1499	netdev_update_features(dev);
1500
1501	if (unlikely(dev->features & NETIF_F_LRO))
1502		netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505		dev_disable_lro(lower_dev);
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1509static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510				   struct net_device *dev)
1511{
1512	struct netdev_notifier_info info;
 
 
1513
1514	netdev_notifier_info_init(&info, dev);
1515	return nb->notifier_call(nb, val, &info);
1516}
1517
/*
 * Starts out non-zero.  While it is non-zero,
 * register_netdevice_notifier() does not replay REGISTER/UP events to
 * a newly added notifier.
 */
static int dev_boot_phase = 1;
1519
1520/**
1521 *	register_netdevice_notifier - register a network notifier block
1522 *	@nb: notifier
1523 *
1524 *	Register a notifier to be called when network device events occur.
1525 *	The notifier passed is linked into the kernel structures and must
1526 *	not be reused until it has been unregistered. A negative errno code
1527 *	is returned on a failure.
1528 *
1529 * 	When registered all registration and up events are replayed
1530 *	to the new notifier to allow device to have a race free
1531 *	view of the network device list.
1532 */
1533
1534int register_netdevice_notifier(struct notifier_block *nb)
1535{
1536	struct net_device *dev;
1537	struct net_device *last;
1538	struct net *net;
1539	int err;
1540
 
 
1541	rtnl_lock();
1542	err = raw_notifier_chain_register(&netdev_chain, nb);
1543	if (err)
1544		goto unlock;
1545	if (dev_boot_phase)
1546		goto unlock;
1547	for_each_net(net) {
1548		for_each_netdev(net, dev) {
1549			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550			err = notifier_to_errno(err);
1551			if (err)
1552				goto rollback;
1553
1554			if (!(dev->flags & IFF_UP))
1555				continue;
1556
1557			call_netdevice_notifier(nb, NETDEV_UP, dev);
1558		}
1559	}
1560
1561unlock:
1562	rtnl_unlock();
 
1563	return err;
1564
1565rollback:
1566	last = dev;
1567	for_each_net(net) {
1568		for_each_netdev(net, dev) {
1569			if (dev == last)
1570				goto outroll;
1571
1572			if (dev->flags & IFF_UP) {
1573				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574							dev);
1575				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576			}
1577			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578		}
1579	}
1580
1581outroll:
1582	raw_notifier_chain_unregister(&netdev_chain, nb);
1583	goto unlock;
1584}
1585EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587/**
1588 *	unregister_netdevice_notifier - unregister a network notifier block
1589 *	@nb: notifier
1590 *
1591 *	Unregister a notifier previously registered by
1592 *	register_netdevice_notifier(). The notifier is unlinked into the
1593 *	kernel structures and may then be reused. A negative errno code
1594 *	is returned on a failure.
1595 *
1596 * 	After unregistering unregister and down device events are synthesized
1597 *	for all devices on the device list to the removed notifier to remove
1598 *	the need for special case cleanup code.
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
1603	struct net_device *dev;
1604	struct net *net;
1605	int err;
1606
 
 
1607	rtnl_lock();
1608	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609	if (err)
1610		goto unlock;
1611
1612	for_each_net(net) {
1613		for_each_netdev(net, dev) {
1614			if (dev->flags & IFF_UP) {
1615				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616							dev);
1617				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618			}
1619			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620		}
1621	}
1622unlock:
1623	rtnl_unlock();
 
1624	return err;
1625}
1626EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628/**
1629 *	call_netdevice_notifiers_info - call all network notifier blocks
1630 *	@val: value passed unmodified to notifier function
1631 *	@dev: net_device pointer passed unmodified to notifier function
1632 *	@info: notifier information data
1633 *
1634 *	Call all network notifier blocks.  Parameters and return value
1635 *	are as for raw_notifier_call_chain().
1636 */
1637
1638static int call_netdevice_notifiers_info(unsigned long val,
1639					 struct net_device *dev,
1640					 struct netdev_notifier_info *info)
1641{
1642	ASSERT_RTNL();
1643	netdev_notifier_info_init(info, dev);
1644	return raw_notifier_call_chain(&netdev_chain, val, info);
1645}
1646
1647/**
1648 *	call_netdevice_notifiers - call all network notifier blocks
1649 *      @val: value passed unmodified to notifier function
1650 *      @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 *	Call all network notifier blocks.  Parameters and return value
1653 *	are as for raw_notifier_call_chain().
1654 */
1655
1656int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657{
1658	struct netdev_notifier_info info;
 
 
1659
1660	return call_netdevice_notifiers_info(val, dev, &info);
1661}
1662EXPORT_SYMBOL(call_netdevice_notifiers);
1663
#ifdef CONFIG_NET_INGRESS
/* Static key adjusted by net_inc_ingress_queue()/net_dec_ingress_queue(). */
static struct static_key ingress_needed __read_mostly;

/* Take one reference on the ingress_needed static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on the ingress_needed static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1679
#ifdef CONFIG_NET_EGRESS
/* Static key adjusted by net_inc_egress_queue()/net_dec_egress_queue(). */
static struct static_key egress_needed __read_mostly;

/* Take one reference on the egress_needed static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on the egress_needed static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1695
1696static struct static_key netstamp_needed __read_mostly;
1697#ifdef HAVE_JUMP_LABEL
1698static atomic_t netstamp_needed_deferred;
1699static atomic_t netstamp_wanted;
1700static void netstamp_clear(struct work_struct *work)
1701{
1702	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1703	int wanted;
1704
1705	wanted = atomic_add_return(deferred, &netstamp_wanted);
1706	if (wanted > 0)
1707		static_key_enable(&netstamp_needed);
1708	else
1709		static_key_disable(&netstamp_needed);
1710}
1711static DECLARE_WORK(netstamp_work, netstamp_clear);
1712#endif
1713
1714void net_enable_timestamp(void)
1715{
1716#ifdef HAVE_JUMP_LABEL
1717	int wanted;
1718
1719	while (1) {
1720		wanted = atomic_read(&netstamp_wanted);
1721		if (wanted <= 0)
1722			break;
1723		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1724			return;
1725	}
1726	atomic_inc(&netstamp_needed_deferred);
1727	schedule_work(&netstamp_work);
1728#else
1729	static_key_slow_inc(&netstamp_needed);
1730#endif
1731}
1732EXPORT_SYMBOL(net_enable_timestamp);
1733
1734void net_disable_timestamp(void)
1735{
1736#ifdef HAVE_JUMP_LABEL
1737	int wanted;
1738
1739	while (1) {
1740		wanted = atomic_read(&netstamp_wanted);
1741		if (wanted <= 1)
1742			break;
1743		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1744			return;
1745	}
1746	atomic_dec(&netstamp_needed_deferred);
1747	schedule_work(&netstamp_work);
1748#else
1749	static_key_slow_dec(&netstamp_needed);
1750#endif
1751}
1752EXPORT_SYMBOL(net_disable_timestamp);
1753
1754static inline void net_timestamp_set(struct sk_buff *skb)
1755{
1756	skb->tstamp = 0;
1757	if (static_key_false(&netstamp_needed))
1758		__net_timestamp(skb);
1759}
1760
/*
 * Stamp @SKB if timestamping is enabled, @COND holds and the skb has no
 * timestamp yet.  Wrapped in do { } while (0) so the macro behaves as a
 * single statement (safe inside an unbraced if/else) and does not end in
 * a stray line continuation.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp)		\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1766
1767bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1768{
1769	unsigned int len;
1770
1771	if (!(dev->flags & IFF_UP))
1772		return false;
1773
1774	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1775	if (skb->len <= len)
1776		return true;
1777
1778	/* if TSO is enabled, we don't care about the length as the packet
1779	 * could be forwarded without being segmented before
1780	 */
1781	if (skb_is_gso(skb))
1782		return true;
1783
1784	return false;
1785}
1786EXPORT_SYMBOL_GPL(is_skb_forwardable);
1787
1788int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1789{
1790	int ret = ____dev_forward_skb(dev, skb);
1791
1792	if (likely(!ret)) {
1793		skb->protocol = eth_type_trans(skb, dev);
1794		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1795	}
1796
1797	return ret;
1798}
1799EXPORT_SYMBOL_GPL(__dev_forward_skb);
1800
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * Intended for injecting an skb from the start_xmit function of one
 * device into the receive queue of another device.
 *
 * The receiving device may live in a different namespace, so all
 * information in the skb that could impact namespace isolation has to
 * be cleared.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
1823EXPORT_SYMBOL_GPL(dev_forward_skb);
1824
1825static inline int deliver_skb(struct sk_buff *skb,
1826			      struct packet_type *pt_prev,
1827			      struct net_device *orig_dev)
1828{
1829	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1830		return -ENOMEM;
1831	atomic_inc(&skb->users);
1832	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1833}
1834
1835static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1836					  struct packet_type **pt,
1837					  struct net_device *orig_dev,
1838					  __be16 type,
1839					  struct list_head *ptype_list)
1840{
1841	struct packet_type *ptype, *pt_prev = *pt;
1842
1843	list_for_each_entry_rcu(ptype, ptype_list, list) {
1844		if (ptype->type != type)
1845			continue;
1846		if (pt_prev)
1847			deliver_skb(skb, pt_prev, orig_dev);
1848		pt_prev = ptype;
1849	}
1850	*pt = pt_prev;
1851}
1852
1853static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1854{
1855	if (!ptype->af_packet_priv || !skb->sk)
1856		return false;
1857
1858	if (ptype->id_match)
1859		return ptype->id_match(ptype, skb->sk);
1860	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1861		return true;
1862
1863	return false;
1864}
1865
1866/*
1867 *	Support routine. Sends outgoing frames to any network
1868 *	taps currently in use.
1869 */
1870
1871void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1872{
1873	struct packet_type *ptype;
1874	struct sk_buff *skb2 = NULL;
1875	struct packet_type *pt_prev = NULL;
1876	struct list_head *ptype_list = &ptype_all;
1877
1878	rcu_read_lock();
1879again:
1880	list_for_each_entry_rcu(ptype, ptype_list, list) {
1881		/* Never send packets back to the socket
1882		 * they originated from - MvS (miquels@drinkel.ow.org)
1883		 */
1884		if (skb_loop_sk(ptype, skb))
1885			continue;
1886
1887		if (pt_prev) {
1888			deliver_skb(skb2, pt_prev, skb->dev);
1889			pt_prev = ptype;
1890			continue;
1891		}
1892
1893		/* need to clone skb, done only once */
1894		skb2 = skb_clone(skb, GFP_ATOMIC);
1895		if (!skb2)
1896			goto out_unlock;
1897
1898		net_timestamp_set(skb2);
1899
1900		/* skb->nh should be correctly
1901		 * set by sender, so that the second statement is
1902		 * just protection against buggy protocols.
1903		 */
1904		skb_reset_mac_header(skb2);
1905
1906		if (skb_network_header(skb2) < skb2->data ||
1907		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1908			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1909					     ntohs(skb2->protocol),
1910					     dev->name);
1911			skb_reset_network_header(skb2);
1912		}
1913
1914		skb2->transport_header = skb2->network_header;
1915		skb2->pkt_type = PACKET_OUTGOING;
1916		pt_prev = ptype;
1917	}
1918
1919	if (ptype_list == &ptype_all) {
1920		ptype_list = &dev->ptype_all;
1921		goto again;
1922	}
1923out_unlock:
1924	if (pt_prev)
1925		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 
 
 
 
1926	rcu_read_unlock();
1927}
1928EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1929
1930/**
1931 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1932 * @dev: Network device
1933 * @txq: number of queues available
1934 *
1935 * If real_num_tx_queues is changed the tc mappings may no longer be
1936 * valid. To resolve this verify the tc mapping remains valid and if
1937 * not NULL the mapping. With no priorities mapping to this
1938 * offset/count pair it will no longer be used. In the worst case TC0
1939 * is invalid nothing can be done so disable priority mappings. If is
1940 * expected that drivers will fix this mapping if they can before
1941 * calling netif_set_real_num_tx_queues.
1942 */
1943static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944{
1945	int i;
1946	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1947
1948	/* If TC0 is invalidated disable TC mapping */
1949	if (tc->offset + tc->count > txq) {
1950		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1951		dev->num_tc = 0;
1952		return;
1953	}
1954
1955	/* Invalidated prio to tc mappings set to TC0 */
1956	for (i = 1; i < TC_BITMASK + 1; i++) {
1957		int q = netdev_get_prio_tc_map(dev, i);
1958
1959		tc = &dev->tc_to_txq[q];
1960		if (tc->offset + tc->count > txq) {
1961			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1962				i, q);
1963			netdev_set_prio_tc_map(dev, i, 0);
1964		}
1965	}
1966}
1967
1968int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1969{
1970	if (dev->num_tc) {
1971		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972		int i;
1973
1974		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1975			if ((txq - tc->offset) < tc->count)
1976				return i;
1977		}
1978
1979		return -1;
1980	}
1981
1982	return 0;
1983}
 
1984
1985#ifdef CONFIG_XPS
1986static DEFINE_MUTEX(xps_map_mutex);
1987#define xmap_dereference(P)		\
1988	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1989
1990static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1991			     int tci, u16 index)
1992{
1993	struct xps_map *map = NULL;
1994	int pos;
1995
1996	if (dev_maps)
1997		map = xmap_dereference(dev_maps->cpu_map[tci]);
1998	if (!map)
1999		return false;
2000
2001	for (pos = map->len; pos--;) {
2002		if (map->queues[pos] != index)
2003			continue;
2004
2005		if (map->len > 1) {
2006			map->queues[pos] = map->queues[--map->len];
2007			break;
2008		}
2009
2010		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2011		kfree_rcu(map, rcu);
2012		return false;
2013	}
2014
2015	return true;
2016}
2017
2018static bool remove_xps_queue_cpu(struct net_device *dev,
2019				 struct xps_dev_maps *dev_maps,
2020				 int cpu, u16 offset, u16 count)
2021{
2022	int num_tc = dev->num_tc ? : 1;
2023	bool active = false;
2024	int tci;
2025
2026	for (tci = cpu * num_tc; num_tc--; tci++) {
2027		int i, j;
2028
2029		for (i = count, j = offset; i--; j++) {
2030			if (!remove_xps_queue(dev_maps, cpu, j))
2031				break;
2032		}
2033
2034		active |= i < 0;
2035	}
2036
2037	return active;
2038}
2039
2040static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2041				   u16 count)
2042{
2043	struct xps_dev_maps *dev_maps;
2044	int cpu, i;
2045	bool active = false;
2046
2047	mutex_lock(&xps_map_mutex);
2048	dev_maps = xmap_dereference(dev->xps_maps);
2049
2050	if (!dev_maps)
2051		goto out_no_maps;
2052
2053	for_each_possible_cpu(cpu)
2054		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2055					       offset, count);
2056
2057	if (!active) {
2058		RCU_INIT_POINTER(dev->xps_maps, NULL);
2059		kfree_rcu(dev_maps, rcu);
2060	}
2061
2062	for (i = offset + (count - 1); count--; i--)
2063		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2064					     NUMA_NO_NODE);
2065
2066out_no_maps:
2067	mutex_unlock(&xps_map_mutex);
2068}
2069
2070static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2071{
2072	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2073}
2074
2075static struct xps_map *expand_xps_map(struct xps_map *map,
2076				      int cpu, u16 index)
2077{
2078	struct xps_map *new_map;
2079	int alloc_len = XPS_MIN_MAP_ALLOC;
2080	int i, pos;
2081
2082	for (pos = 0; map && pos < map->len; pos++) {
2083		if (map->queues[pos] != index)
2084			continue;
2085		return map;
2086	}
2087
2088	/* Need to add queue to this CPU's existing map */
2089	if (map) {
2090		if (pos < map->alloc_len)
2091			return map;
2092
2093		alloc_len = map->alloc_len * 2;
2094	}
2095
2096	/* Need to allocate new map to store queue on this CPU's map */
2097	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2098			       cpu_to_node(cpu));
2099	if (!new_map)
2100		return NULL;
2101
2102	for (i = 0; i < pos; i++)
2103		new_map->queues[i] = map->queues[i];
2104	new_map->alloc_len = alloc_len;
2105	new_map->len = pos;
2106
2107	return new_map;
2108}
2109
/*
 * netif_set_xps_queue - assign Tx queue @index to the CPUs in @mask
 * @dev: device whose XPS maps (dev->xps_maps) are updated
 * @mask: CPUs that should use the queue; the queue is only added for
 *        CPUs that are in @mask and online
 * @index: Tx queue index
 *
 * Under xps_map_mutex a new xps_dev_maps is built, published with
 * rcu_assign_pointer(), and the superseded maps are freed with
 * kfree_rcu().  Afterwards @index is removed from every slot it should
 * not be in: the other traffic classes of each CPU, and the queue's own
 * class on CPUs that are outside @mask or offline.  Map slots are
 * indexed as cpu * num_tc + tc.
 *
 * Returns 0 on success, -EINVAL if the device uses traffic classes and
 * netdev_txq_to_tc() finds no class for @index, or -ENOMEM if an
 * allocation fails.
 */
2110int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2111			u16 index)
2112{
2113	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	/* numa_node_id: -2 = no CPU seen yet, -1 = CPUs span several nodes */
2114	int i, cpu, tci, numa_node_id = -2;
2115	int maps_sz, num_tc = 1, tc = 0;
2116	struct xps_map *map, *new_map;
2117	bool active = false;
2118
2119	if (dev->num_tc) {
2120		num_tc = dev->num_tc;
2121		tc = netdev_txq_to_tc(dev, index);
2122		if (tc < 0)
2123			return -EINVAL;
2124	}
2125
2126	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2127	if (maps_sz < L1_CACHE_BYTES)
2128		maps_sz = L1_CACHE_BYTES;
2129
2130	mutex_lock(&xps_map_mutex);
2131
2132	dev_maps = xmap_dereference(dev->xps_maps);
2133
2134	/* allocate memory for queue storage */
2135	for_each_cpu_and(cpu, cpu_online_mask, mask) {
2136		if (!new_dev_maps)
2137			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2138		if (!new_dev_maps) {
2139			mutex_unlock(&xps_map_mutex);
2140			return -ENOMEM;
2141		}
2142
2143		tci = cpu * num_tc + tc;
2144		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2145				 NULL;
2146
		/* may return the old map itself when it has room for @index */
2147		map = expand_xps_map(map, cpu, index);
2148		if (!map)
2149			goto error;
2150
2151		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2152	}
2153
	/* no online CPU in @mask: nothing to add, only removal below */
2154	if (!new_dev_maps)
2155		goto out_no_new_maps;
2156
2157	for_each_possible_cpu(cpu) {
2158		/* copy maps belonging to foreign traffic classes */
2159		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2160			/* fill in the new device map from the old device map */
2161			map = xmap_dereference(dev_maps->cpu_map[tci]);
2162			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163		}
2164
2165		/* We need to explicitly update tci as previous loop
2166		 * could break out early if dev_maps is NULL.
2167		 */
2168		tci = cpu * num_tc + tc;
2169
2170		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2171			/* add queue to CPU maps */
2172			int pos = 0;
2173
2174			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2175			while ((pos < map->len) && (map->queues[pos] != index))
2176				pos++;
2177
2178			if (pos == map->len)
2179				map->queues[map->len++] = index;
2180#ifdef CONFIG_NUMA
2181			if (numa_node_id == -2)
2182				numa_node_id = cpu_to_node(cpu);
2183			else if (numa_node_id != cpu_to_node(cpu))
2184				numa_node_id = -1;
2185#endif
2186		} else if (dev_maps) {
2187			/* fill in the new device map from the old device map */
2188			map = xmap_dereference(dev_maps->cpu_map[tci]);
2189			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2190		}
2191
2192		/* copy maps belonging to foreign traffic classes */
2193		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2194			/* fill in the new device map from the old device map */
2195			map = xmap_dereference(dev_maps->cpu_map[tci]);
2196			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2197		}
2198	}
2199
2200	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2201
2202	/* Cleanup old maps */
2203	if (!dev_maps)
2204		goto out_no_old_maps;
2205
2206	for_each_possible_cpu(cpu) {
2207		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2208			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2209			map = xmap_dereference(dev_maps->cpu_map[tci]);
			/* free only maps that were replaced, not carried over */
2210			if (map && map != new_map)
2211				kfree_rcu(map, rcu);
2212		}
2213	}
2214
2215	kfree_rcu(dev_maps, rcu);
2216
2217out_no_old_maps:
2218	dev_maps = new_dev_maps;
2219	active = true;
2220
2221out_no_new_maps:
2222	/* update Tx queue numa node */
2223	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2224				     (numa_node_id >= 0) ? numa_node_id :
2225				     NUMA_NO_NODE);
2226
2227	if (!dev_maps)
2228		goto out_no_maps;
2229
2230	/* removes queue from unused CPUs */
2231	for_each_possible_cpu(cpu) {
2232		for (i = tc, tci = cpu * num_tc; i--; tci++)
2233			active |= remove_xps_queue(dev_maps, tci, index);
2234		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2235			active |= remove_xps_queue(dev_maps, tci, index);
2236		for (i = num_tc - tc, tci++; --i; tci++)
2237			active |= remove_xps_queue(dev_maps, tci, index);
2238	}
2239
2240	/* free map if not active */
2241	if (!active) {
2242		RCU_INIT_POINTER(dev->xps_maps, NULL);
2243		kfree_rcu(dev_maps, rcu);
2244	}
2245
2246out_no_maps:
2247	mutex_unlock(&xps_map_mutex);
2248
2249	return 0;
2250error:
	/*
	 * new_dev_maps has not been assigned to dev->xps_maps on this
	 * path, so maps allocated by this call are released with plain
	 * kfree(); maps shared with the old dev_maps are left alone.
	 */
2251	/* remove any maps that we added */
2252	for_each_possible_cpu(cpu) {
2253		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2254			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2255			map = dev_maps ?
2256			      xmap_dereference(dev_maps->cpu_map[tci]) :
2257			      NULL;
2258			if (new_map && new_map != map)
2259				kfree(new_map);
2260		}
2261	}
2262
2263	mutex_unlock(&xps_map_mutex);
2264
2265	kfree(new_dev_maps);
2266	return -ENOMEM;
2267}
2268EXPORT_SYMBOL(netif_set_xps_queue);
2269
2270#endif
2271void netdev_reset_tc(struct net_device *dev)
2272{
2273#ifdef CONFIG_XPS
2274	netif_reset_xps_queues_gt(dev, 0);
2275#endif
2276	dev->num_tc = 0;
2277	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2278	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2279}
2280EXPORT_SYMBOL(netdev_reset_tc);
2281
2282int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2283{
2284	if (tc >= dev->num_tc)
2285		return -EINVAL;
2286
2287#ifdef CONFIG_XPS
2288	netif_reset_xps_queues(dev, offset, count);
2289#endif
2290	dev->tc_to_txq[tc].count = count;
2291	dev->tc_to_txq[tc].offset = offset;
2292	return 0;
2293}
2294EXPORT_SYMBOL(netdev_set_tc_queue);
2295
2296int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2297{
2298	if (num_tc > TC_MAX_QUEUE)
2299		return -EINVAL;
2300
2301#ifdef CONFIG_XPS
2302	netif_reset_xps_queues_gt(dev, 0);
2303#endif
2304	dev->num_tc = num_tc;
2305	return 0;
2306}
2307EXPORT_SYMBOL(netdev_set_num_tc);
2308
2309/*
2310 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2311 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2312 */
2313int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2314{
 
2315	int rc;
2316
 
 
2317	if (txq < 1 || txq > dev->num_tx_queues)
2318		return -EINVAL;
2319
2320	if (dev->reg_state == NETREG_REGISTERED ||
2321	    dev->reg_state == NETREG_UNREGISTERING) {
2322		ASSERT_RTNL();
2323
2324		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2325						  txq);
2326		if (rc)
2327			return rc;
2328
2329		if (dev->num_tc)
2330			netif_setup_tc(dev, txq);
2331
2332		if (txq < dev->real_num_tx_queues) {
 
 
 
2333			qdisc_reset_all_tx_gt(dev, txq);
2334#ifdef CONFIG_XPS
2335			netif_reset_xps_queues_gt(dev, txq);
2336#endif
2337		}
 
 
2338	}
2339
2340	dev->real_num_tx_queues = txq;
2341	return 0;
2342}
2343EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2344
2345#ifdef CONFIG_SYSFS
2346/**
2347 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2348 *	@dev: Network device
2349 *	@rxq: Actual number of RX queues
2350 *
2351 *	This must be called either with the rtnl_lock held or before
2352 *	registration of the net device.  Returns 0 on success, or a
2353 *	negative error code.  If called before registration, it always
2354 *	succeeds.
2355 */
2356int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2357{
2358	int rc;
2359
2360	if (rxq < 1 || rxq > dev->num_rx_queues)
2361		return -EINVAL;
2362
2363	if (dev->reg_state == NETREG_REGISTERED) {
2364		ASSERT_RTNL();
2365
2366		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2367						  rxq);
2368		if (rc)
2369			return rc;
2370	}
2371
2372	dev->real_num_rx_queues = rxq;
2373	return 0;
2374}
2375EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2376#endif
2377
2378/**
2379 * netif_get_num_default_rss_queues - default number of RSS queues
2380 *
2381 * This routine should set an upper limit on the number of RSS queues
2382 * used by default by multiqueue devices.
2383 */
2384int netif_get_num_default_rss_queues(void)
2385{
2386	return is_kdump_kernel() ?
2387		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2388}
2389EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2390
2391static void __netif_reschedule(struct Qdisc *q)
2392{
2393	struct softnet_data *sd;
2394	unsigned long flags;
2395
2396	local_irq_save(flags);
2397	sd = this_cpu_ptr(&softnet_data);
2398	q->next_sched = NULL;
2399	*sd->output_queue_tailp = q;
2400	sd->output_queue_tailp = &q->next_sched;
2401	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2402	local_irq_restore(flags);
2403}
2404
2405void __netif_schedule(struct Qdisc *q)
2406{
2407	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2408		__netif_reschedule(q);
2409}
2410EXPORT_SYMBOL(__netif_schedule);
2411
2412struct dev_kfree_skb_cb {
2413	enum skb_free_reason reason;
2414};
2415
2416static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2417{
2418	return (struct dev_kfree_skb_cb *)skb->cb;
2419}
2420
2421void netif_schedule_queue(struct netdev_queue *txq)
2422{
2423	rcu_read_lock();
2424	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2425		struct Qdisc *q = rcu_dereference(txq->qdisc);
2426
2427		__netif_schedule(q);
2428	}
2429	rcu_read_unlock();
2430}
2431EXPORT_SYMBOL(netif_schedule_queue);
2432
2433/**
2434 *	netif_wake_subqueue - allow sending packets on subqueue
2435 *	@dev: network device
2436 *	@queue_index: sub queue index
2437 *
2438 * Resume individual transmit queue of a device with multiple transmit queues.
2439 */
2440void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2441{
2442	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2443
2444	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2445		struct Qdisc *q;
2446
2447		rcu_read_lock();
2448		q = rcu_dereference(txq->qdisc);
2449		__netif_schedule(q);
2450		rcu_read_unlock();
2451	}
2452}
2453EXPORT_SYMBOL(netif_wake_subqueue);
2454
2455void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2456{
2457	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2458		struct Qdisc *q;
2459
2460		rcu_read_lock();
2461		q = rcu_dereference(dev_queue->qdisc);
2462		__netif_schedule(q);
2463		rcu_read_unlock();
2464	}
2465}
2466EXPORT_SYMBOL(netif_tx_wake_queue);
2467
2468void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2469{
2470	unsigned long flags;
2471
2472	if (likely(atomic_read(&skb->users) == 1)) {
 
 
 
2473		smp_rmb();
2474		atomic_set(&skb->users, 0);
2475	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2476		return;
2477	}
2478	get_kfree_skb_cb(skb)->reason = reason;
2479	local_irq_save(flags);
2480	skb->next = __this_cpu_read(softnet_data.completion_queue);
2481	__this_cpu_write(softnet_data.completion_queue, skb);
2482	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2483	local_irq_restore(flags);
2484}
2485EXPORT_SYMBOL(__dev_kfree_skb_irq);
2486
2487void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2488{
2489	if (in_irq() || irqs_disabled())
2490		__dev_kfree_skb_irq(skb, reason);
2491	else
2492		dev_kfree_skb(skb);
2493}
2494EXPORT_SYMBOL(__dev_kfree_skb_any);
2495
2496
2497/**
2498 * netif_device_detach - mark device as removed
2499 * @dev: network device
2500 *
2501 * Mark device as removed from system and therefore no longer available.
2502 */
2503void netif_device_detach(struct net_device *dev)
2504{
2505	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2506	    netif_running(dev)) {
2507		netif_tx_stop_all_queues(dev);
2508	}
2509}
2510EXPORT_SYMBOL(netif_device_detach);
2511
2512/**
2513 * netif_device_attach - mark device as attached
2514 * @dev: network device
2515 *
2516 * Mark device as attached from system and restart if needed.
2517 */
2518void netif_device_attach(struct net_device *dev)
2519{
2520	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2521	    netif_running(dev)) {
2522		netif_tx_wake_all_queues(dev);
2523		__netdev_watchdog_up(dev);
2524	}
2525}
2526EXPORT_SYMBOL(netif_device_attach);
2527
2528/*
2529 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2530 * to be used as a distribution range.
2531 */
2532u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2533		  unsigned int num_tx_queues)
2534{
2535	u32 hash;
2536	u16 qoffset = 0;
2537	u16 qcount = num_tx_queues;
2538
2539	if (skb_rx_queue_recorded(skb)) {
2540		hash = skb_get_rx_queue(skb);
2541		while (unlikely(hash >= num_tx_queues))
2542			hash -= num_tx_queues;
2543		return hash;
2544	}
2545
2546	if (dev->num_tc) {
2547		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 
2548		qoffset = dev->tc_to_txq[tc].offset;
2549		qcount = dev->tc_to_txq[tc].count;
2550	}
2551
2552	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2553}
2554EXPORT_SYMBOL(__skb_tx_hash);
2555
2556static void skb_warn_bad_offload(const struct sk_buff *skb)
2557{
2558	static const netdev_features_t null_features;
2559	struct net_device *dev = skb->dev;
2560	const char *name = "";
2561
2562	if (!net_ratelimit())
2563		return;
2564
2565	if (dev) {
2566		if (dev->dev.parent)
2567			name = dev_driver_string(dev->dev.parent);
2568		else
2569			name = netdev_name(dev);
2570	}
2571	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2572	     "gso_type=%d ip_summed=%d\n",
2573	     name, dev ? &dev->features : &null_features,
2574	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2575	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2576	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2577}
2578
2579/*
2580 * Invalidate hardware checksum when packet is to be mangled, and
2581 * complete checksum manually on outgoing path.
2582 */
2583int skb_checksum_help(struct sk_buff *skb)
2584{
2585	__wsum csum;
2586	int ret = 0, offset;
2587
2588	if (skb->ip_summed == CHECKSUM_COMPLETE)
2589		goto out_set_summed;
2590
2591	if (unlikely(skb_shinfo(skb)->gso_size)) {
2592		skb_warn_bad_offload(skb);
2593		return -EINVAL;
2594	}
2595
2596	/* Before computing a checksum, we should make sure no frag could
2597	 * be modified by an external entity : checksum could be wrong.
2598	 */
2599	if (skb_has_shared_frag(skb)) {
2600		ret = __skb_linearize(skb);
2601		if (ret)
2602			goto out;
2603	}
2604
2605	offset = skb_checksum_start_offset(skb);
2606	BUG_ON(offset >= skb_headlen(skb));
2607	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2608
2609	offset += skb->csum_offset;
2610	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2611
2612	if (skb_cloned(skb) &&
2613	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2614		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2615		if (ret)
2616			goto out;
2617	}
2618
2619	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2620out_set_summed:
2621	skb->ip_summed = CHECKSUM_NONE;
2622out:
2623	return ret;
2624}
2625EXPORT_SYMBOL(skb_checksum_help);
2626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629	__be16 type = skb->protocol;
2630
2631	/* Tunnel gso handlers can set protocol to ethernet. */
2632	if (type == htons(ETH_P_TEB)) {
2633		struct ethhdr *eth;
2634
2635		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636			return 0;
2637
2638		eth = (struct ethhdr *)skb_mac_header(skb);
2639		type = eth->h_proto;
2640	}
2641
2642	return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 *	skb_mac_gso_segment - mac layer segmentation handler.
2647 *	@skb: buffer to segment
2648 *	@features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651				    netdev_features_t features)
2652{
2653	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654	struct packet_offload *ptype;
2655	int vlan_depth = skb->mac_len;
2656	__be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658	if (unlikely(!type))
2659		return ERR_PTR(-EINVAL);
2660
2661	__skb_pull(skb, vlan_depth);
2662
2663	rcu_read_lock();
2664	list_for_each_entry_rcu(ptype, &offload_base, list) {
2665		if (ptype->type == type && ptype->callbacks.gso_segment) {
2666			segs = ptype->callbacks.gso_segment(skb, features);
2667			break;
2668		}
2669	}
2670	rcu_read_unlock();
2671
2672	__skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674	return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683	if (tx_path)
2684		return skb->ip_summed != CHECKSUM_PARTIAL;
2685	else
2686		return skb->ip_summed == CHECKSUM_NONE;
 
2687}
2688
2689/**
2690 *	__skb_gso_segment - Perform segmentation on skb.
2691 *	@skb: buffer to segment
2692 *	@features: features for the output path (see dev->features)
2693 *	@tx_path: whether it is called in TX path
2694 *
2695 *	This function segments the given skb and returns a list of segments.
2696 *
2697 *	It may return NULL if the skb requires no segmentation.  This is
2698 *	only possible when GSO is used for verifying header integrity.
2699 *
2700 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703				  netdev_features_t features, bool tx_path)
2704{
 
 
2705	if (unlikely(skb_needs_check(skb, tx_path))) {
2706		int err;
2707
2708		skb_warn_bad_offload(skb);
2709
2710		err = skb_cow_head(skb, 0);
2711		if (err < 0)
2712			return ERR_PTR(err);
2713	}
2714
2715	/* Only report GSO partial support if it will enable us to
2716	 * support segmentation on this frame without needing additional
2717	 * work.
2718	 */
2719	if (features & NETIF_F_GSO_PARTIAL) {
2720		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721		struct net_device *dev = skb->dev;
2722
2723		partial_features |= dev->features & dev->gso_partial_features;
2724		if (!skb_gso_ok(skb, features | partial_features))
2725			features &= ~NETIF_F_GSO_PARTIAL;
2726	}
2727
2728	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732	SKB_GSO_CB(skb)->encap_level = 0;
2733
2734	skb_reset_mac_header(skb);
2735	skb_reset_mac_len(skb);
2736
2737	return skb_mac_gso_segment(skb, features);
 
 
 
 
 
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
2740
/* Take action when hardware reception checksum errors are detected:
 * log the device name and dump the stack, rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2752
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if some page fragment of @skb cannot be DMA'd by @dev
 * (highmem page without NETIF_F_HIGHDMA, or a page beyond the parent
 * device's dma_mask), else 0.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[i])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t phys = page_to_phys(skb_frag_page(frag));

			if (!parent->dma_mask)
				return 1;
			if (phys + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2785
2786/* If MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of standard features for the netdev.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791					   netdev_features_t features,
2792					   __be16 type)
2793{
2794	if (eth_p_mpls(type))
2795		features &= skb->dev->mpls_features;
2796
2797	return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801					   netdev_features_t features,
2802					   __be16 type)
2803{
2804	return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809	netdev_features_t features)
2810{
2811	int tmp;
2812	__be16 type;
2813
2814	type = skb_network_protocol(skb, &tmp);
2815	features = net_mpls_features(skb, features, type);
2816
2817	if (skb->ip_summed != CHECKSUM_NONE &&
2818	    !can_checksum_protocol(features, type)) {
2819		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820	}
2821	if (illegal_highdma(skb->dev, skb))
2822		features &= ~NETIF_F_SG;
2823
2824	return features;
2825}
2826
/* Feature check that accepts everything it is offered: returns @features
 * unchanged and ignores both @skb and @dev.
 */
2827netdev_features_t passthru_features_check(struct sk_buff *skb,
2828					  struct net_device *dev,
2829					  netdev_features_t features)
2830{
2831	return features;
2832}
2833EXPORT_SYMBOL(passthru_features_check);
2834
/* Default per-skb feature check, used by netif_skb_features() when the
 * driver supplies no ndo_features_check callback.  It defers entirely to
 * vlan_features_check(); @dev is not consulted.
 */
2835static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836					     struct net_device *dev,
2837					     netdev_features_t features)
2838{
2839	return vlan_features_check(skb, features);
2840}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843					    struct net_device *dev,
2844					    netdev_features_t features)
2845{
2846	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848	if (gso_segs > dev->gso_max_segs)
2849		return features & ~NETIF_F_GSO_MASK;
2850
2851	/* Support for GSO partial features requires software
2852	 * intervention before we can actually process the packets
2853	 * so we need to strip support for any partial features now
2854	 * and we can pull them back in after we have partially
2855	 * segmented the frame.
2856	 */
2857	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858		features &= ~dev->gso_partial_features;
2859
2860	/* Make sure to clear the IPv4 ID mangling feature if the
2861	 * IPv4 header has the potential to be fragmented.
2862	 */
2863	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864		struct iphdr *iph = skb->encapsulation ?
2865				    inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867		if (!(iph->frag_off & htons(IP_DF)))
2868			features &= ~NETIF_F_TSO_MANGLEID;
2869	}
2870
2871	return features;
2872}
2873
/* Compute the offload features that may be used to transmit @skb on
 * skb->dev.  The result starts from dev->features and is only ever
 * narrowed by the steps below.
 */
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876	struct net_device *dev = skb->dev;
2877	netdev_features_t features = dev->features;
2878
	/* GSO packets: apply segment-count and partial-GSO restrictions. */
2879	if (skb_is_gso(skb))
2880		features = gso_features_check(skb, dev, features);
2881
2882	/* If encapsulation offload request, verify we are testing
2883	 * hardware encapsulation features instead of standard
2884	 * features for the netdev
2885	 */
2886	if (skb->encapsulation)
2887		features &= dev->hw_enc_features;
2888
	/* VLAN-tagged packets: intersect with the device's vlan_features
	 * plus the CTAG/STAG transmit tag-insertion bits.
	 */
2889	if (skb_vlan_tagged(skb))
2890		features = netdev_intersect_features(features,
2891						     dev->vlan_features |
2892						     NETIF_F_HW_VLAN_CTAG_TX |
2893						     NETIF_F_HW_VLAN_STAG_TX);
2894
	/* Give the driver a chance to restrict features for this skb;
	 * without a driver callback, use the default check.
	 */
2895	if (dev->netdev_ops->ndo_features_check)
2896		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897								features);
2898	else
2899		features &= dflt_features_check(skb, dev, features);
2900
2901	return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
/* Hand a single @skb to the driver of @dev on queue @txq.
 *
 * @more is forwarded to netdev_start_xmit(); dev_hard_start_xmit() sets
 * it when further skbs follow in the same list.
 * Returns the code produced by netdev_start_xmit().
 */
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906		    struct netdev_queue *txq, bool more)
2907{
2908	unsigned int len;
2909	int rc;
2910
	/* Deliver a copy to packet taps, if any are registered globally
	 * or on this device.
	 */
2911	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912		dev_queue_xmit_nit(skb, dev);
2913
	/* Sample the length before the driver gets the skb; the trace
	 * point below uses this saved value rather than reading skb->len.
	 */
2914	len = skb->len;
2915	trace_net_dev_start_xmit(skb, dev);
2916	rc = netdev_start_xmit(skb, dev, txq, more);
2917	trace_net_dev_xmit(skb, rc, dev, len);
2918
2919	return rc;
2920}
2921
/* Transmit a list of skbs (linked through ->next) on @txq of @dev.
 *
 * @first: head of the list to send
 * @ret:   receives the code of the last transmit attempt, or
 *         NETDEV_TX_BUSY when the queue stopped with skbs still pending
 *         (NETDEV_TX_OK if @first is NULL)
 *
 * Returns the first skb that was NOT handed to the driver, with the rest
 * of the list still attached, or NULL when the whole list was sent.
 */
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923				    struct netdev_queue *txq, int *ret)
2924{
2925	struct sk_buff *skb = first;
2926	int rc = NETDEV_TX_OK;
2927
2928	while (skb) {
2929		struct sk_buff *next = skb->next;
2930
		/* Detach the skb from the list before passing it down. */
2931		skb->next = NULL;
2932		rc = xmit_one(skb, dev, txq, next != NULL);
2933		if (unlikely(!dev_xmit_complete(rc))) {
			/* Not accepted: re-link so the caller gets the
			 * remaining list back intact.
			 */
2934			skb->next = next;
2935			goto out;
2936		}
2937
2938		skb = next;
		/* Queue stopped while skbs remain: report busy and return
		 * the remainder.
		 */
2939		if (netif_xmit_stopped(txq) && skb) {
2940			rc = NETDEV_TX_BUSY;
2941			break;
2942		}
2943	}
2944
2945out:
2946	*ret = rc;
2947	return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951					  netdev_features_t features)
2952{
2953	if (skb_vlan_tag_present(skb) &&
2954	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2955		skb = __vlan_hwaccel_push_inside(skb);
2956	return skb;
2957}
2958
/* Prepare @skb for transmission on @dev according to the features that
 * are usable for it: push the VLAN tag into the payload if needed, then
 * either segment in software (GSO) or linearize and complete a partial
 * checksum.
 *
 * Returns the skb to transmit (the head of a segment list when software
 * GSO produced one), or NULL on failure.  On failure dev->tx_dropped has
 * been incremented; the skb has been freed here unless the VLAN step
 * already returned NULL.
 */
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
2960{
2961	netdev_features_t features;
2962
2963	features = netif_skb_features(skb);
2964	skb = validate_xmit_vlan(skb, features);
	/* The VLAN step returned NULL: there is no skb left to free. */
2965	if (unlikely(!skb))
2966		goto out_null;
2967
2968	if (netif_needs_gso(skb, features)) {
2969		struct sk_buff *segs;
2970
2971		segs = skb_gso_segment(skb, features);
2972		if (IS_ERR(segs)) {
2973			goto out_kfree_skb;
2974		} else if (segs) {
			/* Segmentation produced a list: release the original
			 * and continue with the segments.  A NULL result
			 * leaves @skb in place untouched.
			 */
2975			consume_skb(skb);
2976			skb = segs;
2977		}
2978	} else {
2979		if (skb_needs_linearize(skb, features) &&
2980		    __skb_linearize(skb))
2981			goto out_kfree_skb;
2982
2983		/* If packet is not checksummed and device does not
2984		 * support checksumming for this protocol, complete
2985		 * checksumming here.
2986		 */
2987		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			/* Point the (inner) transport header at the
			 * checksum start offset.
			 */
2988			if (skb->encapsulation)
2989				skb_set_inner_transport_header(skb,
2990							       skb_checksum_start_offset(skb));
2991			else
2992				skb_set_transport_header(skb,
2993							 skb_checksum_start_offset(skb));
2994			if (!(features & NETIF_F_CSUM_MASK) &&
2995			    skb_checksum_help(skb))
2996				goto out_kfree_skb;
2997		}
2998	}
2999
 
 
3000	return skb;
3001
3002out_kfree_skb:
3003	kfree_skb(skb);
3004out_null:
3005	atomic_long_inc(&dev->tx_dropped);
3006	return NULL;
3007}
3008
/* Run validate_xmit_skb() over every skb of a ->next linked list and
 * rebuild a single list from the results.
 *
 * skbs that fail validation are skipped (validate_xmit_skb() has already
 * accounted for them).  Returns the head of the new list, or NULL when
 * nothing survived.
 */
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
	/* tail is only read after head has been set, by which time it has
	 * been assigned.
	 */
3011	struct sk_buff *next, *head = NULL, *tail;
3012
3013	for (; skb != NULL; skb = next) {
3014		next = skb->next;
3015		skb->next = NULL;
3016
3017		/* in case skb wont be segmented, point to itself */
3018		skb->prev = skb;
3019
3020		skb = validate_xmit_skb(skb, dev);
3021		if (!skb)
3022			continue;
3023
3024		if (!head)
3025			head = skb;
3026		else
3027			tail->next = skb;
3028		/* If skb was segmented, skb->prev points to
3029		 * the last segment. If not, it still contains skb.
3030		 */
3031		tail = skb->prev;
3032	}
3033	return head;
3034}
3035EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3036
3037static void qdisc_pkt_len_init(struct sk_buff *skb)
3038{
3039	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3040
3041	qdisc_skb_cb(skb)->pkt_len = skb->len;
3042
3043	/* To get more precise estimation of bytes sent on wire,
3044	 * we add to pkt_len the headers size of all segments
3045	 */
3046	if (shinfo->gso_size)  {
3047		unsigned int hdr_len;
3048		u16 gso_segs = shinfo->gso_segs;
3049
3050		/* mac layer + network layer */
3051		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3052
3053		/* + transport layer */
3054		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3055			hdr_len += tcp_hdrlen(skb);
3056		else
3057			hdr_len += sizeof(struct udphdr);
 
 
 
 
 
 
 
 
 
 
 
3058
3059		if (shinfo->gso_type & SKB_GSO_DODGY)
3060			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3061						shinfo->gso_size);
3062
3063		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3064	}
3065}
3066
/* Send @skb through qdisc @q of @txq on @dev.
 *
 * Under the qdisc root lock, one of three things happens:
 *  - the qdisc is deactivated: the skb is dropped (NET_XMIT_DROP);
 *  - the qdisc can be bypassed, is empty and not running: the skb is
 *    transmitted directly (NET_XMIT_SUCCESS);
 *  - otherwise the skb is enqueued and the qdisc is run if it was idle;
 *    the result is q->enqueue()'s return masked with NET_XMIT_MASK.
 *
 * skbs collected in to_free are freed only after the root lock has been
 * released.
 */
3067static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3068				 struct net_device *dev,
3069				 struct netdev_queue *txq)
3070{
3071	spinlock_t *root_lock = qdisc_lock(q);
3072	struct sk_buff *to_free = NULL;
3073	bool contended;
3074	int rc;
3075
3076	qdisc_calculate_pkt_len(skb, q);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3077	/*
3078	 * Heuristic to force contended enqueues to serialize on a
3079	 * separate lock before trying to get qdisc main lock.
3080	 * This permits qdisc->running owner to get the lock more
3081	 * often and dequeue packets faster.
3082	 */
3083	contended = qdisc_is_running(q);
3084	if (unlikely(contended))
3085		spin_lock(&q->busylock);
3086
3087	spin_lock(root_lock);
3088	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3089		__qdisc_drop(skb, &to_free);
3090		rc = NET_XMIT_DROP;
3091	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3092		   qdisc_run_begin(q)) {
3093		/*
3094		 * This is a work-conserving queue; there are no old skbs
3095		 * waiting to be sent out; and the qdisc is not running -
3096		 * xmit the skb directly.
3097		 */
3098
3099		qdisc_bstats_update(q, skb);
3100
3101		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* Release busylock before running the qdisc; the
			 * flag is cleared so the final unlock is skipped.
			 */
3102			if (unlikely(contended)) {
3103				spin_unlock(&q->busylock);
3104				contended = false;
3105			}
3106			__qdisc_run(q);
3107		} else
3108			qdisc_run_end(q);
3109
 
3110		rc = NET_XMIT_SUCCESS;
3111	} else {
3112		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3113		if (qdisc_run_begin(q)) {
3114			if (unlikely(contended)) {
3115				spin_unlock(&q->busylock);
3116				contended = false;
3117			}
3118			__qdisc_run(q);
 
3119		}
3120	}
3121	spin_unlock(root_lock);
	/* Free dropped skbs outside the root lock. */
3122	if (unlikely(to_free))
3123		kfree_skb_list(to_free);
	/* busylock is still held if neither branch above released it. */
3124	if (unlikely(contended))
3125		spin_unlock(&q->busylock);
3126	return rc;
3127}
3128
3129#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3130static void skb_update_prio(struct sk_buff *skb)
3131{
3132	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
 
 
 
 
 
 
 
 
 
 
 
3133
3134	if (!skb->priority && skb->sk && map) {
3135		unsigned int prioidx =
3136			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3137
3138		if (prioidx < map->priomap_len)
3139			skb->priority = map->priomap[prioidx];
3140	}
3141}
3142#else
3143#define skb_update_prio(skb)
3144#endif
3145
/* Per-CPU transmit nesting counter.  __dev_queue_xmit() increments it
 * around dev_hard_start_xmit() on the queue-less path and refuses to
 * transmit once it exceeds XMIT_RECURSION_LIMIT.
 */
3146DEFINE_PER_CPU(int, xmit_recursion);
3147EXPORT_SYMBOL(xmit_recursion);
3148
3149/**
3150 *	dev_loopback_xmit - loop back @skb
3151 *	@net: network namespace this loopback is happening in
3152 *	@sk:  sk needed to be a netfilter okfn
3153 *	@skb: buffer to transmit
 *
 *	Feeds @skb back into the receive path through netif_rx_ni().
 *	@net and @sk are not used by the body.  Always returns 0.
3154 */
3155int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3156{
3157	skb_reset_mac_header(skb);
	/* Strip everything in front of the network header. */
3158	__skb_pull(skb, skb_network_offset(skb));
3159	skb->pkt_type = PACKET_LOOPBACK;
	/* Mark the checksum as not needing verification on receive. */
3160	skb->ip_summed = CHECKSUM_UNNECESSARY;
3161	WARN_ON(!skb_dst(skb));
3162	skb_dst_force(skb);
3163	netif_rx_ni(skb);
3164	return 0;
3165}
3166EXPORT_SYMBOL(dev_loopback_xmit);
3167
3168#ifdef CONFIG_NET_EGRESS
/* Run the egress classifier list of @dev on @skb.
 *
 * Returns @skb when transmission should continue (no classifier list,
 * TC_ACT_OK, TC_ACT_RECLASSIFY or an unrecognised verdict).  Returns
 * NULL when the skb was dropped, stolen/queued or redirected; in those
 * cases *ret holds the code the caller should report.
 */
3169static struct sk_buff *
3170sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3171{
3172	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3173	struct tcf_result cl_res;
3174
3175	if (!cl)
3176		return skb;
3177
3178	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3179	 * earlier by the caller.
3180	 */
3181	qdisc_bstats_cpu_update(cl->q, skb);
3182
3183	switch (tc_classify(skb, cl, &cl_res, false)) {
3184	case TC_ACT_OK:
3185	case TC_ACT_RECLASSIFY:
		/* Keep the skb; remember the minor class id chosen. */
3186		skb->tc_index = TC_H_MIN(cl_res.classid);
3187		break;
3188	case TC_ACT_SHOT:
		/* Dropped by the classifier: count it and free the skb. */
3189		qdisc_qstats_cpu_drop(cl->q);
3190		*ret = NET_XMIT_DROP;
3191		kfree_skb(skb);
3192		return NULL;
3193	case TC_ACT_STOLEN:
3194	case TC_ACT_QUEUED:
 
		/* Taken over by an action: reported as success. */
3195		*ret = NET_XMIT_SUCCESS;
3196		consume_skb(skb);
3197		return NULL;
3198	case TC_ACT_REDIRECT:
3199		/* No need to push/pop skb's mac_header here on egress! */
3200		skb_do_redirect(skb);
3201		*ret = NET_XMIT_SUCCESS;
3202		return NULL;
3203	default:
3204		break;
3205	}
3206
3207	return skb;
3208}
3209#endif /* CONFIG_NET_EGRESS */
3210
/* Look up the transmit queue selected for @skb by the device's XPS maps.
 *
 * Returns a queue index below dev->real_num_tx_queues, or -1 when XPS is
 * not compiled in, the device has no maps, no map exists for the sender,
 * or the mapped queue is out of range.
 */
3211static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3212{
3213#ifdef CONFIG_XPS
3214	struct xps_dev_maps *dev_maps;
3215	struct xps_map *map;
3216	int queue_index = -1;
3217
3218	rcu_read_lock();
3219	dev_maps = rcu_dereference(dev->xps_maps);
3220	if (dev_maps) {
		/* sender_cpu is stored off by one; see netdev_pick_tx(). */
3221		unsigned int tci = skb->sender_cpu - 1;
3222
		/* With traffic classes, each CPU owns num_tc consecutive
		 * map slots, indexed by the tc of the skb's priority.
		 */
3223		if (dev->num_tc) {
3224			tci *= dev->num_tc;
3225			tci += netdev_get_prio_tc_map(dev, skb->priority);
3226		}
3227
3228		map = rcu_dereference(dev_maps->cpu_map[tci]);
3229		if (map) {
			/* Several candidate queues: pick one from the
			 * skb hash.
			 */
3230			if (map->len == 1)
3231				queue_index = map->queues[0];
3232			else
3233				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3234									   map->len)];
3235			if (unlikely(queue_index >= dev->real_num_tx_queues))
3236				queue_index = -1;
3237		}
3238	}
3239	rcu_read_unlock();
3240
3241	return queue_index;
3242#else
3243	return -1;
3244#endif
3245}
3246
3247static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3248{
3249	struct sock *sk = skb->sk;
3250	int queue_index = sk_tx_queue_get(sk);
3251
3252	if (queue_index < 0 || skb->ooo_okay ||
3253	    queue_index >= dev->real_num_tx_queues) {
3254		int new_index = get_xps_queue(dev, skb);
 
3255		if (new_index < 0)
3256			new_index = skb_tx_hash(dev, skb);
3257
3258		if (queue_index != new_index && sk &&
3259		    sk_fullsock(sk) &&
3260		    rcu_access_pointer(sk->sk_dst_cache))
3261			sk_tx_queue_set(sk, new_index);
3262
3263		queue_index = new_index;
3264	}
3265
3266	return queue_index;
3267}
3268
/* Choose the transmit queue of @dev for @skb, record it in the skb's
 * queue mapping and return the corresponding netdev_queue.
 *
 * @accel_priv is passed through to the driver's ndo_select_queue; when
 * it is non-NULL the selected index is not capped by
 * netdev_cap_txqueue().
 */
3269struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3270				    struct sk_buff *skb,
3271				    void *accel_priv)
3272{
3273	int queue_index = 0;
3274
3275#ifdef CONFIG_XPS
	/* sender_cpu holds cpu + 1; a value outside 1..NR_CPUS (including
	 * 0, which wraps around here) is replaced with the current CPU.
	 */
3276	u32 sender_cpu = skb->sender_cpu - 1;
3277
3278	if (sender_cpu >= (u32)NR_CPUS)
3279		skb->sender_cpu = raw_smp_processor_id() + 1;
3280#endif
3281
	/* Single-queue devices always use queue 0. */
3282	if (dev->real_num_tx_queues != 1) {
3283		const struct net_device_ops *ops = dev->netdev_ops;
 
3284		if (ops->ndo_select_queue)
3285			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3286							    __netdev_pick_tx);
3287		else
3288			queue_index = __netdev_pick_tx(dev, skb);
3289
3290		if (!accel_priv)
3291			queue_index = netdev_cap_txqueue(dev, queue_index);
3292	}
3293
3294	skb_set_queue_mapping(skb, queue_index);
3295	return netdev_get_tx_queue(dev, queue_index);
3296}
3297
3298/**
3299 *	__dev_queue_xmit - transmit a buffer
3300 *	@skb: buffer to transmit
3301 *	@accel_priv: private data used for L2 forwarding offload
3302 *
3303 *	Queue a buffer for transmission to a network device. The caller must
3304 *	have set the device and priority and built the buffer before calling
3305 *	this function. The function can be called from an interrupt.
3306 *
3307 *	A negative errno code is returned on a failure. A success does not
3308 *	guarantee the frame will be transmitted as it may be dropped due
3309 *	to congestion or traffic shaping.
3310 *
3311 * -----------------------------------------------------------------------------------
3312 *      I notice this method can also return errors from the queue disciplines,
3313 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3314 *      be positive.
3315 *
3316 *      Regardless of the return value, the skb is consumed, so it is currently
3317 *      difficult to retry a send to this method.  (You can bump the ref count
3318 *      before sending to hold a reference for retry if you are careful.)
3319 *
3320 *      When calling this method, interrupts MUST be enabled.  This is because
3321 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3322 *          --BLG
3323 */
3324static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3325{
3326	struct net_device *dev = skb->dev;
3327	struct netdev_queue *txq;
3328	struct Qdisc *q;
	/* Overwritten on every path that reaches a return: by the egress
	 * hook, the qdisc, the driver, or the -ENETDOWN fallback below.
	 */
3329	int rc = -ENOMEM;
 
3330
3331	skb_reset_mac_header(skb);
3332
3333	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3334		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3335
3336	/* Disable soft irqs for various locks below. Also
3337	 * stops preemption for RCU.
3338	 */
3339	rcu_read_lock_bh();
3340
3341	skb_update_prio(skb);
3342
3343	qdisc_pkt_len_init(skb);
3344#ifdef CONFIG_NET_CLS_ACT
3345	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3346# ifdef CONFIG_NET_EGRESS
3347	if (static_key_false(&egress_needed)) {
		/* A NULL result means the egress hook consumed the skb
		 * and already stored the return code in rc.
		 */
3348		skb = sch_handle_egress(skb, &rc, dev);
3349		if (!skb)
3350			goto out;
3351	}
3352# endif
3353#endif
3354	/* If device/qdisc don't need skb->dst, release it right now while
3355	 * its hot in this cpu cache.
3356	 */
3357	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3358		skb_dst_drop(skb);
3359	else
3360		skb_dst_force(skb);
3361
3362	txq = netdev_pick_tx(dev, skb, accel_priv);
3363	q = rcu_dereference_bh(txq->qdisc);
3364
3365	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method: go through the queueing path. */
3366	if (q->enqueue) {
3367		rc = __dev_xmit_skb(skb, q, dev, txq);
3368		goto out;
3369	}
3370
3371	/* The device has no queue. Common case for software devices:
3372	   loopback, all the sorts of tunnels...
3373
3374	   Really, it is unlikely that netif_tx_lock protection is necessary
3375	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3376	   counters.)
3377	   However, it is possible, that they rely on protection
3378	   made by us here.
3379
3380	   Check this and shot the lock. It is not prone from deadlocks.
3381	   Either shot noqueue qdisc, it is even simpler 8)
3382	 */
3383	if (dev->flags & IFF_UP) {
3384		int cpu = smp_processor_id(); /* ok because BHs are off */
3385
		/* This CPU already owning the tx lock means the device is
		 * transmitting to itself: treat it as a dead loop.
		 */
3386		if (txq->xmit_lock_owner != cpu) {
3387			if (unlikely(__this_cpu_read(xmit_recursion) >
3388				     XMIT_RECURSION_LIMIT))
3389				goto recursion_alert;
3390
			/* NULL here means the skb was already dropped and
			 * accounted; rc keeps its previous value.
			 */
3391			skb = validate_xmit_skb(skb, dev);
3392			if (!skb)
3393				goto out;
3394
3395			HARD_TX_LOCK(dev, txq, cpu);
3396
3397			if (!netif_xmit_stopped(txq)) {
3398				__this_cpu_inc(xmit_recursion);
3399				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3400				__this_cpu_dec(xmit_recursion);
3401				if (dev_xmit_complete(rc)) {
3402					HARD_TX_UNLOCK(dev, txq);
3403					goto out;
3404				}
3405			}
3406			HARD_TX_UNLOCK(dev, txq);
3407			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3408					     dev->name);
3409		} else {
3410			/* Recursion is detected! It is possible,
3411			 * unfortunately
3412			 */
3413recursion_alert:
3414			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3415					     dev->name);
3416		}
3417	}
3418
	/* Failure path: device down, queue stopped, transmit not completed
	 * or recursion detected.  Whatever is left of the skb list is
	 * dropped and counted.
	 */
3419	rc = -ENETDOWN;
3420	rcu_read_unlock_bh();
3421
3422	atomic_long_inc(&dev->tx_dropped);
3423	kfree_skb_list(skb);
3424	return rc;
3425out:
3426	rcu_read_unlock_bh();
3427	return rc;
3428}
3429
/* Queue @skb for transmission without L2 forwarding offload data; see
 * __dev_queue_xmit() for the return value and skb ownership rules.
 */
3430int dev_queue_xmit(struct sk_buff *skb)
3431{
3432	return __dev_queue_xmit(skb, NULL);
3433}
3434EXPORT_SYMBOL(dev_queue_xmit);
3435
/* Same as dev_queue_xmit(), but forwards @accel_priv (private data used
 * for L2 forwarding offload) to __dev_queue_xmit().
 */
3436int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3437{
3438	return __dev_queue_xmit(skb, accel_priv);
3439}
3440EXPORT_SYMBOL(dev_queue_xmit_accel);
3441
3442
3443/*=======================================================================
3444			Receiver routines
3445  =======================================================================*/
3446
/* Limit on the length of a per-CPU input_pkt_queue: enqueue_to_backlog()
 * drops packets once the queue is longer than this, and skb_flow_limit()
 * starts policing flows at half of it.
 */
3447int netdev_max_backlog __read_mostly = 1000;
3448EXPORT_SYMBOL(netdev_max_backlog);
3449
/* Passed to net_timestamp_check() by netif_rx_internal(), i.e. evaluated
 * before the packet is put on a backlog queue.
 */
3450int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * receive softirq packet budget -- confirm against its users.
 */
3451int netdev_budget __read_mostly = 300;
3452int weight_p __read_mostly = 64;            /* old backlog weight */
 
 
 
 
 
3453
3454/* Called with irq disabled */
/* Append @napi to the poll list of @sd and raise NET_RX_SOFTIRQ. */
3455static inline void ____napi_schedule(struct softnet_data *sd,
3456				     struct napi_struct *napi)
3457{
3458	list_add_tail(&napi->poll_list, &sd->poll_list);
3459	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3460}
3461
3462#ifdef CONFIG_RPS
3463
3464/* One global table that all flow-based protocols share. */
3465struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3466EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask selecting the CPU number inside a sock flow table entry; the
 * remaining bits are compared against the packet hash in get_rps_cpu().
 */
3467u32 rps_cpu_mask __read_mostly;
3468EXPORT_SYMBOL(rps_cpu_mask);
3469
/* Static key tested by netif_rx_internal() before attempting RPS. */
3470struct static_key rps_needed __read_mostly;
3471EXPORT_SYMBOL(rps_needed);
/* NOTE(review): not tested in this part of the file; presumably the
 * equivalent key for RFS -- confirm against its users.
 */
3472struct static_key rfs_needed __read_mostly;
3473EXPORT_SYMBOL(rfs_needed);
3474
/* Record @next_cpu as the steering target of flow entry @rflow.
 *
 * With CONFIG_RFS_ACCEL and a valid @next_cpu, the driver may also be
 * asked to steer the flow to the hardware queue associated with that
 * CPU; on success the flow entry of that queue's table is used instead.
 *
 * Returns the flow entry that was updated (either @rflow or the entry in
 * the new queue's flow table).
 */
3475static struct rps_dev_flow *
3476set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477	    struct rps_dev_flow *rflow, u16 next_cpu)
3478{
3479	if (next_cpu < nr_cpu_ids) {
3480#ifdef CONFIG_RFS_ACCEL
3481		struct netdev_rx_queue *rxqueue;
3482		struct rps_dev_flow_table *flow_table;
3483		struct rps_dev_flow *old_rflow;
3484		u32 flow_id;
3485		u16 rxq_index;
3486		int rc;
3487
3488		/* Should we steer this flow to a different hardware queue? */
3489		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490		    !(dev->features & NETIF_F_NTUPLE))
3491			goto out;
3492		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the right queue: nothing to steer. */
3493		if (rxq_index == skb_get_rx_queue(skb))
3494			goto out;
3495
3496		rxqueue = dev->_rx + rxq_index;
3497		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498		if (!flow_table)
3499			goto out;
3500		flow_id = skb_get_hash(skb) & flow_table->mask;
		/* A non-negative return value is stored as the filter id. */
3501		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502							rxq_index, flow_id);
3503		if (rc < 0)
3504			goto out;
		/* Switch to the entry in the new queue's table; if the old
		 * entry referred to the same filter, detach it.
		 */
3505		old_rflow = rflow;
3506		rflow = &flow_table->flows[flow_id];
3507		rflow->filter = rc;
3508		if (old_rflow->filter == rflow->filter)
3509			old_rflow->filter = RPS_NO_FILTER;
3510	out:
3511#endif
		/* Snapshot the target CPU's queue head; get_rps_cpu() and
		 * rps_may_expire_flow() compare against it later.
		 */
3512		rflow->last_qtail =
3513			per_cpu(softnet_data, next_cpu).input_queue_head;
3514	}
3515
3516	rflow->cpu = next_cpu;
3517	return rflow;
3518}
3519
3520/*
3521 * get_rps_cpu is called from netif_receive_skb and returns the target
3522 * CPU from the RPS map of the receiving queue for a given skb.
3523 * rcu_read_lock must be held on entry.
3524 */
/* Returns the CPU that should process @skb, or -1 when no steering
 * applies.  When the choice came from the flow tables, *@rflowp is set
 * to the flow entry used; otherwise it is left untouched.
 */
3525static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526		       struct rps_dev_flow **rflowp)
3527{
3528	const struct rps_sock_flow_table *sock_flow_table;
3529	struct netdev_rx_queue *rxqueue = dev->_rx;
3530	struct rps_dev_flow_table *flow_table;
3531	struct rps_map *map;
3532	int cpu = -1;
3533	u32 tcpu;
3534	u32 hash;
3535
	/* Use the rx queue recorded in the skb, if any; otherwise the
	 * device's first rx queue.
	 */
3536	if (skb_rx_queue_recorded(skb)) {
3537		u16 index = skb_get_rx_queue(skb);
3538
3539		if (unlikely(index >= dev->real_num_rx_queues)) {
3540			WARN_ONCE(dev->real_num_rx_queues > 1,
3541				  "%s received packet on queue %u, but number "
3542				  "of RX queues is %u\n",
3543				  dev->name, index, dev->real_num_rx_queues);
3544			goto done;
3545		}
3546		rxqueue += index;
3547	}
3548
3549	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552	map = rcu_dereference(rxqueue->rps_map);
3553	if (!flow_table && !map)
3554		goto done;
3555
3556	skb_reset_network_header(skb);
3557	hash = skb_get_hash(skb);
	/* A zero hash gives nothing to steer on. */
3558	if (!hash)
3559		goto done;
3560
3561	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562	if (flow_table && sock_flow_table) {
3563		struct rps_dev_flow *rflow;
3564		u32 next_cpu;
3565		u32 ident;
3566
3567		/* First check into global flow table if there is a match */
3568		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* The entry matches only if its non-CPU bits equal the
		 * corresponding hash bits.
		 */
3569		if ((ident ^ hash) & ~rps_cpu_mask)
3570			goto try_rps;
3571
3572		next_cpu = ident & rps_cpu_mask;
3573
3574		/* OK, now we know there is a match,
3575		 * we can look at the local (per receive queue) flow table
3576		 */
3577		rflow = &flow_table->flows[hash & flow_table->mask];
3578		tcpu = rflow->cpu;
3579
3580		/*
3581		 * If the desired CPU (where last recvmsg was done) is
3582		 * different from current CPU (one in the rx-queue flow
3583		 * table entry), switch if one of the following holds:
3584		 *   - Current CPU is unset (>= nr_cpu_ids).
3585		 *   - Current CPU is offline.
3586		 *   - The current CPU's queue tail has advanced beyond the
3587		 *     last packet that was enqueued using this table entry.
3588		 *     This guarantees that all previous packets for the flow
3589		 *     have been dequeued, thus preserving in order delivery.
3590		 */
3591		if (unlikely(tcpu != next_cpu) &&
3592		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594		      rflow->last_qtail)) >= 0)) {
3595			tcpu = next_cpu;
3596			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597		}
3598
3599		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600			*rflowp = rflow;
3601			cpu = tcpu;
3602			goto done;
3603		}
3604	}
3605
3606try_rps:
3607
	/* Fall back to the static RPS map: pick a CPU from the hash. */
3608	if (map) {
3609		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610		if (cpu_online(tcpu)) {
3611			cpu = tcpu;
3612			goto done;
3613		}
3614	}
3615
3616done:
3617	return cpu;
3618}
3619
3620#ifdef CONFIG_RFS_ACCEL
3621
3622/**
3623 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624 * @dev: Device on which the filter was set
3625 * @rxq_index: RX queue index
3626 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628 *
3629 * Drivers that implement ndo_rx_flow_steer() should periodically call
3630 * this function for each installed filter and remove the filters for
3631 * which it returns %true.
3632 */
3633bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634			 u32 flow_id, u16 filter_id)
3635{
3636	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637	struct rps_dev_flow_table *flow_table;
3638	struct rps_dev_flow *rflow;
	/* Default answer: the filter may go.  It is kept only if every
	 * condition below holds.
	 */
3639	bool expire = true;
3640	unsigned int cpu;
3641
3642	rcu_read_lock();
3643	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644	if (flow_table && flow_id <= flow_table->mask) {
3645		rflow = &flow_table->flows[flow_id];
		/* Read the CPU once so that the range check and the
		 * per-CPU lookup use the same value.
		 */
3646		cpu = ACCESS_ONCE(rflow->cpu);
		/* Keep the filter while the flow entry still refers to it,
		 * has a valid CPU, and that CPU's queue head has advanced
		 * less than 10 * mask past the entry's last_qtail.
		 */
3647		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649			   rflow->last_qtail) <
3650		     (int)(10 * flow_table->mask)))
3651			expire = false;
3652	}
3653	rcu_read_unlock();
3654	return expire;
3655}
3656EXPORT_SYMBOL(rps_may_expire_flow);
3657
3658#endif /* CONFIG_RFS_ACCEL */
3659
3660/* Called from hardirq (IPI) context */
/* @data is the softnet_data whose backlog NAPI should be scheduled; the
 * received_rps counter of that structure is incremented as well.
 */
3661static void rps_trigger_softirq(void *data)
3662{
3663	struct softnet_data *sd = data;
3664
3665	____napi_schedule(sd, &sd->backlog);
3666	sd->received_rps++;
3667}
3668
3669#endif /* CONFIG_RPS */
3670
/*
 * Check whether @sd is the softnet_data of another CPU.
 * If it is, chain it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.
 * If it is not (or CONFIG_RPS is disabled), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = this_cpu_ptr(&softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push @sd onto the head of the local IPI list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3691
3692#ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): not read in this part of the file; presumably the number
 * of buckets used when a flow limit table is allocated -- confirm.
 */
3693int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694#endif
3695
/* Decide whether @skb should be refused because its flow dominates the
 * recent traffic on this CPU's backlog.
 *
 * @qlen: current length of the backlog queue
 *
 * Returns true (and bumps fl->count) when the flow's bucket holds more
 * than half of the recorded history.  Returns false when the queue is
 * below half of netdev_max_backlog, when no flow limit table is
 * installed on this CPU, or when CONFIG_NET_FLOW_LIMIT is disabled.
 */
3696static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697{
3698#ifdef CONFIG_NET_FLOW_LIMIT
3699	struct sd_flow_limit *fl;
3700	struct softnet_data *sd;
3701	unsigned int old_flow, new_flow;
3702
3703	if (qlen < (netdev_max_backlog >> 1))
3704		return false;
3705
3706	sd = this_cpu_ptr(&softnet_data);
3707
3708	rcu_read_lock();
3709	fl = rcu_dereference(sd->flow_limit);
3710	if (fl) {
		/* Overwrite the history slot at history_head with this
		 * packet's bucket, remembering the bucket it replaces.
		 */
3711		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712		old_flow = fl->history[fl->history_head];
3713		fl->history[fl->history_head] = new_flow;
3714
3715		fl->history_head++;
3716		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
		/* The replaced entry no longer counts for its bucket. */
3718		if (likely(fl->buckets[old_flow]))
3719			fl->buckets[old_flow]--;
3720
3721		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722			fl->count++;
3723			rcu_read_unlock();
3724			return true;
3725		}
3726	}
3727	rcu_read_unlock();
3728#endif
3729	return false;
3730}
3731
3732/*
3733 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3734 * queue (may be a remote CPU queue).
 *
 * @qtail is handed to input_queue_tail_incr_save() when the skb is
 * queued.
 *
 * Returns NET_RX_SUCCESS when the skb was queued.  Returns NET_RX_DROP
 * when the device is not running, the queue is longer than
 * netdev_max_backlog, or skb_flow_limit() refuses the packet; the skb is
 * then freed and both sd->dropped and the device's rx_dropped are
 * incremented.
3735 */
3736static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737			      unsigned int *qtail)
3738{
3739	struct softnet_data *sd;
3740	unsigned long flags;
3741	unsigned int qlen;
3742
3743	sd = &per_cpu(softnet_data, cpu);
3744
3745	local_irq_save(flags);
3746
3747	rps_lock(sd);
3748	if (!netif_running(skb->dev))
3749		goto drop;
3750	qlen = skb_queue_len(&sd->input_pkt_queue);
3751	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		/* A non-empty queue is queued to directly, without
		 * scheduling the backlog NAPI.  The empty case jumps
		 * back to the enqueue label after scheduling it.
		 */
3752		if (qlen) {
3753enqueue:
3754			__skb_queue_tail(&sd->input_pkt_queue, skb);
3755			input_queue_tail_incr_save(sd, qtail);
3756			rps_unlock(sd);
3757			local_irq_restore(flags);
3758			return NET_RX_SUCCESS;
3759		}
3760
3761		/* Schedule NAPI for backlog device
3762		 * We can use non atomic operation since we own the queue lock
3763		 */
3764		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			/* For a remote CPU, rps_ipi_queued() puts sd on the
			 * local IPI list; otherwise schedule locally.
			 */
3765			if (!rps_ipi_queued(sd))
3766				____napi_schedule(sd, &sd->backlog);
3767		}
3768		goto enqueue;
3769	}
3770
3771drop:
3772	sd->dropped++;
3773	rps_unlock(sd);
3774
3775	local_irq_restore(flags);
3776
3777	atomic_long_inc(&skb->dev->rx_dropped);
3778	kfree_skb(skb);
3779	return NET_RX_DROP;
3780}
3781
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* Common body of netif_rx() and netif_rx_ni(): queue @skb on a per-CPU
 * backlog.  With RPS active the target CPU comes from get_rps_cpu(),
 * falling back to the current CPU; otherwise the current CPU is used.
 * Returns the result of enqueue_to_backlog().
 */
3782static int netif_rx_internal(struct sk_buff *skb)
3783{
3784	int ret;
3785
3786	net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788	trace_netif_rx(skb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3789#ifdef CONFIG_RPS
3790	if (static_key_false(&rps_needed)) {
		/* voidflow is a placeholder receiving last_qtail when
		 * get_rps_cpu() does not hand back a real flow entry.
		 */
3791		struct rps_dev_flow voidflow, *rflow = &voidflow;
3792		int cpu;
3793
3794		preempt_disable();
3795		rcu_read_lock();
3796
3797		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798		if (cpu < 0)
3799			cpu = smp_processor_id();
3800
3801		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803		rcu_read_unlock();
3804		preempt_enable();
3805	} else
3806#endif
3807	{
		/* The saved queue tail is not used on this path. */
3808		unsigned int qtail;
 
3809		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810		put_cpu();
3811	}
3812	return ret;
3813}
3814
3815/**
3816 *	netif_rx	-	post buffer to the network code
3817 *	@skb: buffer to post
3818 *
3819 *	This function receives a packet from a device driver and queues it for
3820 *	the upper (protocol) levels to process.  It always succeeds. The buffer
3821 *	may be dropped during processing for congestion control or by the
3822 *	protocol layers.
3823 *
3824 *	return values:
3825 *	NET_RX_SUCCESS	(no congestion)
3826 *	NET_RX_DROP     (packet was dropped)
3827 *
3828 */
3829
3830int netif_rx(struct sk_buff *skb)
3831{
	/* Emit the entry trace point, then take the common path. */
3832	trace_netif_rx_entry(skb);
3833
3834	return netif_rx_internal(skb);
3835}
3836EXPORT_SYMBOL(netif_rx);
3837
/* Variant of netif_rx() that, after queueing @skb, runs any pending
 * softirqs itself through do_softirq(), all with preemption disabled.
 * NOTE(review): presumably meant for callers outside interrupt context,
 * where a raised softirq would not otherwise run promptly -- confirm.
 * Returns the result of netif_rx_internal().
 */
3838int netif_rx_ni(struct sk_buff *skb)
3839{
3840	int err;
3841
3842	trace_netif_rx_ni_entry(skb);
3843
3844	preempt_disable();
3845	err = netif_rx_internal(skb);
3846	if (local_softirq_pending())
3847		do_softirq();
3848	preempt_enable();
3849
3850	return err;
3851}
3852EXPORT_SYMBOL(netif_rx_ni);
3853
/* Softirq handler (@h is unused) doing two jobs on the current CPU's
 * softnet_data:
 *  1. free the skbs accumulated on completion_queue;
 *  2. run the qdiscs accumulated on output_queue.
 * Each list is detached with interrupts disabled and then processed with
 * interrupts enabled.
 */
3854static __latent_entropy void net_tx_action(struct softirq_action *h)
3855{
3856	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858	if (sd->completion_queue) {
3859		struct sk_buff *clist;
3860
3861		local_irq_disable();
3862		clist = sd->completion_queue;
3863		sd->completion_queue = NULL;
3864		local_irq_enable();
3865
3866		while (clist) {
3867			struct sk_buff *skb = clist;
 
3868			clist = clist->next;
3869
			/* skbs on this list are expected to have no
			 * remaining users.
			 */
3870			WARN_ON(atomic_read(&skb->users));
3871			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872				trace_consume_skb(skb);
3873			else
3874				trace_kfree_skb(skb, net_tx_action);
3875
			/* Only skbs marked SKB_FCLONE_UNAVAILABLE take the
			 * deferred free path.
			 */
3876			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877				__kfree_skb(skb);
3878			else
3879				__kfree_skb_defer(skb);
3880		}
3881
3882		__kfree_skb_flush();
3883	}
3884
3885	if (sd->output_queue) {
3886		struct Qdisc *head;
3887
3888		local_irq_disable();
3889		head = sd->output_queue;
3890		sd->output_queue = NULL;
3891		sd->output_queue_tailp = &sd->output_queue;
3892		local_irq_enable();
3893
3894		while (head) {
3895			struct Qdisc *q = head;
3896			spinlock_t *root_lock;
3897
3898			head = head->next_sched;
3899
3900			root_lock = qdisc_lock(q);
3901			spin_lock(root_lock);
 
 
3902			/* We need to make sure head->next_sched is read
3903			 * before clearing __QDISC_STATE_SCHED
3904			 */
3905			smp_mb__before_atomic();
3906			clear_bit(__QDISC_STATE_SCHED, &q->state);
3907			qdisc_run(q);
3908			spin_unlock(root_lock);
 
3909		}
3910	}
 
 
3911}
3912
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It is a function pointer taking a net_device and an address, exported
 * GPL-only; it is only built when both the bridge and ATM LANE are enabled.
 * NOTE(review): presumably assigned by the bridge code and called by LANE -
 * confirm against the users of this symbol.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3919
/*
 * sch_handle_ingress - run the ingress classifier chain of skb->dev
 * @skb: packet being received
 * @pt_prev: in/out; a pending packet_type is delivered to and then cleared
 *	     before classification starts
 * @ret: out; receives the result of that pending delivery
 * @orig_dev: device handed through to deliver_skb()
 *
 * Returns @skb when receive processing should continue, or NULL when the
 * skb has been freed, consumed or redirected here and must not be touched
 * by the caller any more.  Without CONFIG_NET_CLS_ACT this is a no-op that
 * returns @skb.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	/* Flush the pending handler before the classifier may take the skb. */
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	/* Record length and direction for the classifier, and account the packet. */
	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		/* Accepted: remember the minor class id and continue. */
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		/* Dropped by policy: count it and free the skb. */
		qdisc_qstats_cpu_drop(cl->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		/* Released with consume_skb(), i.e. not reported as a drop. */
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		/* Any other verdict: continue normal processing. */
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3971
3972/**
3973 *	netdev_is_rx_handler_busy - check if receive handler is registered
3974 *	@dev: device to check
3975 *
3976 *	Check if a receive handler is already registered for a given device.
3977 *	Return true if there one.
3978 *
3979 *	The caller must hold the rtnl_mutex.
3980 */
3981bool netdev_is_rx_handler_busy(struct net_device *dev)
3982{
3983	ASSERT_RTNL();
3984	return dev && rtnl_dereference(dev->rx_handler);
3985}
3986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3987
3988/**
3989 *	netdev_rx_handler_register - register receive handler
3990 *	@dev: device to register a handler for
3991 *	@rx_handler: receive handler to register
3992 *	@rx_handler_data: data pointer that is used by rx handler
3993 *
3994 *	Register a receive handler for a device. This handler will then be
3995 *	called from __netif_receive_skb. A negative errno code is returned
3996 *	on a failure.
3997 *
3998 *	The caller must hold the rtnl_mutex.
3999 *
4000 *	For a general description of rx_handler, see enum rx_handler_result.
4001 */
4002int netdev_rx_handler_register(struct net_device *dev,
4003			       rx_handler_func_t *rx_handler,
4004			       void *rx_handler_data)
4005{
4006	ASSERT_RTNL();
 
4007
4008	if (dev->rx_handler)
4009		return -EBUSY;
4010
4011	/* Note: rx_handler_data must be set before rx_handler */
4012	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4013	rcu_assign_pointer(dev->rx_handler, rx_handler);
4014
4015	return 0;
4016}
4017EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4018
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.  The handler pointer is
 *	cleared first; the handler data pointer is cleared only after
 *	synchronize_net() has returned.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	/* Step 1: hide the handler from new readers. */
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	/* Step 2: only now is it safe to drop the handler's data pointer. */
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4040
4041/*
4042 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4043 * the special handling of PFMEMALLOC skbs.
4044 */
4045static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4046{
4047	switch (skb->protocol) {
4048	case htons(ETH_P_ARP):
4049	case htons(ETH_P_IP):
4050	case htons(ETH_P_IPV6):
4051	case htons(ETH_P_8021Q):
4052	case htons(ETH_P_8021AD):
4053		return true;
4054	default:
4055		return false;
4056	}
4057}
4058
/*
 * Run the netfilter ingress hook for @skb when it is active.
 *
 * A pending packet_type in *@pt_prev is delivered to (result stored in
 * *@ret) and cleared before the hook runs.  Returns the hook's return value,
 * or 0 when the hook is inactive or CONFIG_NETFILTER_INGRESS is not set.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	verdict = nf_hook_ingress(skb);
	rcu_read_unlock();

	return verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4079
/*
 * __netif_receive_skb_core - deliver one received skb to its handlers
 * @skb: packet to process
 * @pfmemalloc: true when the caller is processing a PFMEMALLOC skb; taps are
 *		skipped and only protocols accepted by
 *		skb_pfmemalloc_protocol() are delivered to
 *
 * Delivery is "one behind": the handler found most recently is kept in
 * pt_prev, and the previous one is called through deliver_skb() each time a
 * new one is found.  The last handler gets the skb through a direct
 * ->func() call at the end.
 *
 * Returns the result of the last delivery, NET_RX_SUCCESS when an rx_handler
 * consumed the skb, or NET_RX_DROP.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	/* Timestamp here only when it was not already done before queueing. */
	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* Remember the device the skb arrived on; skb->dev may change below. */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	/* Re-entered after vlan_do_receive() or an rx_handler asked for it. */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	/* Strip an in-band VLAN header; skb_vlan_untag() may return NULL. */
	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and ingress processing. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* Global taps (ptype_all). */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	/* Per-device taps. */
	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_key_false(&ingress_needed)) {
		/* NULL means the ingress classifier took ownership of the skb. */
		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
ncls:
#endif
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		/* True: start over with the updated skb.
		 * False with skb == NULL: the skb is gone.
		 */
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* A VLAN tag that is still present was not claimed by anything above. */
	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	/* Handlers bound to the original device ... */
	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	/* ... and to the current device, if it differs. */
	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		/* Final handler: called directly, not through deliver_skb(). */
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		/* Count as rx_nohandler when exact delivery was requested,
		 * as rx_dropped otherwise.
		 */
		if (!deliver_exact)
			atomic_long_inc(&skb->dev->rx_dropped);
		else
			atomic_long_inc(&skb->dev->rx_nohandler);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}
4234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4235static int __netif_receive_skb(struct sk_buff *skb)
4236{
4237	int ret;
4238
4239	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4240		unsigned long pflags = current->flags;
4241
4242		/*
4243		 * PFMEMALLOC skbs are special, they should
4244		 * - be delivered to SOCK_MEMALLOC sockets only
4245		 * - stay away from userspace
4246		 * - have bounded memory usage
4247		 *
4248		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4249		 * context down to all allocation sites.
4250		 */
4251		current->flags |= PF_MEMALLOC;
4252		ret = __netif_receive_skb_core(skb, true);
4253		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4254	} else
4255		ret = __netif_receive_skb_core(skb, false);
4256
4257	return ret;
4258}
4259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4260static int netif_receive_skb_internal(struct sk_buff *skb)
4261{
4262	int ret;
4263
4264	net_timestamp_check(netdev_tstamp_prequeue, skb);
4265
4266	if (skb_defer_rx_timestamp(skb))
4267		return NET_RX_SUCCESS;
4268
 
 
 
 
 
 
 
 
 
 
 
 
 
4269	rcu_read_lock();
4270
4271#ifdef CONFIG_RPS
4272	if (static_key_false(&rps_needed)) {
4273		struct rps_dev_flow voidflow, *rflow = &voidflow;
4274		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4275
4276		if (cpu >= 0) {
4277			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4278			rcu_read_unlock();
4279			return ret;
4280		}
4281	}
4282#endif
4283	ret = __netif_receive_skb(skb);
4284	rcu_read_unlock();
4285	return ret;
4286}
4287
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	The call itself always succeeds, but the buffer may be dropped during
 *	processing for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
4310
4311DEFINE_PER_CPU(struct work_struct, flush_works);
4312
4313/* Network device is going away, flush any packets still pending */
4314static void flush_backlog(struct work_struct *work)
4315{
4316	struct sk_buff *skb, *tmp;
4317	struct softnet_data *sd;
4318
4319	local_bh_disable();
4320	sd = this_cpu_ptr(&softnet_data);
4321
4322	local_irq_disable();
4323	rps_lock(sd);
4324	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4325		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4326			__skb_unlink(skb, &sd->input_pkt_queue);
4327			kfree_skb(skb);
4328			input_queue_head_incr(sd);
4329		}
4330	}
4331	rps_unlock(sd);
4332	local_irq_enable();
4333
4334	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4335		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4336			__skb_unlink(skb, &sd->process_queue);
4337			kfree_skb(skb);
4338			input_queue_head_incr(sd);
4339		}
4340	}
4341	local_bh_enable();
4342}
4343
4344static void flush_all_backlogs(void)
4345{
4346	unsigned int cpu;
4347
4348	get_online_cpus();
4349
4350	for_each_online_cpu(cpu)
4351		queue_work_on(cpu, system_highpri_wq,
4352			      per_cpu_ptr(&flush_works, cpu));
4353
4354	for_each_online_cpu(cpu)
4355		flush_work(per_cpu_ptr(&flush_works, cpu));
4356
4357	put_online_cpus();
4358}
4359
4360static int napi_gro_complete(struct sk_buff *skb)
4361{
4362	struct packet_offload *ptype;
4363	__be16 type = skb->protocol;
4364	struct list_head *head = &offload_base;
4365	int err = -ENOENT;
4366
4367	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4368
4369	if (NAPI_GRO_CB(skb)->count == 1) {
4370		skb_shinfo(skb)->gso_size = 0;
4371		goto out;
4372	}
4373
4374	rcu_read_lock();
4375	list_for_each_entry_rcu(ptype, head, list) {
4376		if (ptype->type != type || !ptype->callbacks.gro_complete)
4377			continue;
4378
4379		err = ptype->callbacks.gro_complete(skb, 0);
4380		break;
4381	}
4382	rcu_read_unlock();
4383
4384	if (err) {
4385		WARN_ON(&ptype->list == head);
4386		kfree_skb(skb);
4387		return NET_RX_SUCCESS;
4388	}
4389
4390out:
4391	return netif_receive_skb_internal(skb);
4392}
4393
4394/* napi->gro_list contains packets ordered by age.
4395 * youngest packets at the head of it.
4396 * Complete skbs in reverse order to reduce latencies.
4397 */
4398void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4399{
4400	struct sk_buff *skb, *prev = NULL;
4401
4402	/* scan list and build reverse chain */
4403	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4404		skb->prev = prev;
4405		prev = skb;
4406	}
4407
4408	for (skb = prev; skb; skb = prev) {
4409		skb->next = NULL;
4410
4411		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4412			return;
4413
4414		prev = skb->prev;
4415		napi_gro_complete(skb);
4416		napi->gro_count--;
4417	}
4418
4419	napi->gro_list = NULL;
4420}
4421EXPORT_SYMBOL(napi_gro_flush);
4422
4423static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4424{
4425	struct sk_buff *p;
4426	unsigned int maclen = skb->dev->hard_header_len;
4427	u32 hash = skb_get_hash_raw(skb);
4428
4429	for (p = napi->gro_list; p; p = p->next) {
4430		unsigned long diffs;
4431
4432		NAPI_GRO_CB(p)->flush = 0;
4433
4434		if (hash != skb_get_hash_raw(p)) {
4435			NAPI_GRO_CB(p)->same_flow = 0;
4436			continue;
4437		}
4438
4439		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4440		diffs |= p->vlan_tci ^ skb->vlan_tci;
4441		diffs |= skb_metadata_dst_cmp(p, skb);
 
4442		if (maclen == ETH_HLEN)
4443			diffs |= compare_ether_header(skb_mac_header(p),
4444						      skb_mac_header(skb));
4445		else if (!diffs)
4446			diffs = memcmp(skb_mac_header(p),
4447				       skb_mac_header(skb),
4448				       maclen);
4449		NAPI_GRO_CB(p)->same_flow = !diffs;
4450	}
4451}
4452
4453static void skb_gro_reset_offset(struct sk_buff *skb)
4454{
4455	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4456	const skb_frag_t *frag0 = &pinfo->frags[0];
4457
4458	NAPI_GRO_CB(skb)->data_offset = 0;
4459	NAPI_GRO_CB(skb)->frag0 = NULL;
4460	NAPI_GRO_CB(skb)->frag0_len = 0;
4461
4462	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4463	    pinfo->nr_frags &&
4464	    !PageHighMem(skb_frag_page(frag0))) {
4465		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4466		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4467						    skb_frag_size(frag0),
4468						    skb->end - skb->tail);
4469	}
4470}
4471
4472static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4473{
4474	struct skb_shared_info *pinfo = skb_shinfo(skb);
4475
4476	BUG_ON(skb->end - skb->tail < grow);
4477
4478	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4479
4480	skb->data_len -= grow;
4481	skb->tail += grow;
4482
4483	pinfo->frags[0].page_offset += grow;
4484	skb_frag_size_sub(&pinfo->frags[0], grow);
4485
4486	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4487		skb_frag_unref(skb, 0);
4488		memmove(pinfo->frags, pinfo->frags + 1,
4489			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4490	}
4491}
4492
/*
 * dev_gro_receive - try to merge @skb into a flow held on napi->gro_list
 * @napi: NAPI context owning the GRO list
 * @skb: incoming packet
 *
 * Returns one of:
 *   GRO_NORMAL      - GRO not applicable; caller must pass the skb up itself
 *   GRO_MERGED      - skb data was merged into a held skb
 *   GRO_MERGED_FREE - as above, and the gro_receive callback set ->free
 *   GRO_HELD        - skb was put on napi->gro_list as a new flow
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	/* GRO disabled on the device. */
	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	/* Packets flagged csum_bad bypass GRO. */
	if (skb->csum_bad)
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		/* Reset the per-skb GRO control block before the callback runs. */
		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->encap_mark = 0;
		NAPI_GRO_CB(skb)->recursion_counter = 0;
		NAPI_GRO_CB(skb)->is_fou = 0;
		NAPI_GRO_CB(skb)->is_atomic = 1;
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		/* A non-NULL return points at a held skb to complete right away. */
		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* Loop ran to the end: no offload handles this protocol. */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	if (pp) {
		struct sk_buff *nskb = *pp;

		/* Unlink the skb the callback pointed at and complete it. */
		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	/* Start a new flow: push the skb on the head of the GRO list. */
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* Make sure everything up to the GRO offset sits in the linear area. */
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
4602
4603struct packet_offload *gro_find_receive_by_type(__be16 type)
4604{
4605	struct list_head *offload_head = &offload_base;
4606	struct packet_offload *ptype;
4607
4608	list_for_each_entry_rcu(ptype, offload_head, list) {
4609		if (ptype->type != type || !ptype->callbacks.gro_receive)
4610			continue;
4611		return ptype;
4612	}
4613	return NULL;
4614}
4615EXPORT_SYMBOL(gro_find_receive_by_type);
4616
4617struct packet_offload *gro_find_complete_by_type(__be16 type)
4618{
4619	struct list_head *offload_head = &offload_base;
4620	struct packet_offload *ptype;
4621
4622	list_for_each_entry_rcu(ptype, offload_head, list) {
4623		if (ptype->type != type || !ptype->callbacks.gro_complete)
4624			continue;
4625		return ptype;
4626	}
4627	return NULL;
4628}
4629EXPORT_SYMBOL(gro_find_complete_by_type);
4630
 
 
 
 
 
 
 
4631static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4632{
4633	switch (ret) {
4634	case GRO_NORMAL:
4635		if (netif_receive_skb_internal(skb))
4636			ret = GRO_DROP;
4637		break;
4638
4639	case GRO_DROP:
4640		kfree_skb(skb);
4641		break;
4642
4643	case GRO_MERGED_FREE:
4644		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4645			skb_dst_drop(skb);
4646			kmem_cache_free(skbuff_head_cache, skb);
4647		} else {
4648			__kfree_skb(skb);
4649		}
4650		break;
4651
4652	case GRO_HELD:
4653	case GRO_MERGED:
 
4654		break;
4655	}
4656
4657	return ret;
4658}
4659
4660gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4661{
4662	skb_mark_napi_id(skb, napi);
4663	trace_napi_gro_receive_entry(skb);
4664
4665	skb_gro_reset_offset(skb);
4666
4667	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4668}
4669EXPORT_SYMBOL(napi_gro_receive);
4670
4671static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4672{
4673	if (unlikely(skb->pfmemalloc)) {
4674		consume_skb(skb);
4675		return;
4676	}
4677	__skb_pull(skb, skb_headlen(skb));
4678	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4679	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4680	skb->vlan_tci = 0;
4681	skb->dev = napi->dev;
4682	skb->skb_iif = 0;
4683	skb->encapsulation = 0;
4684	skb_shinfo(skb)->gso_type = 0;
4685	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 
4686
4687	napi->skb = skb;
4688}
4689
4690struct sk_buff *napi_get_frags(struct napi_struct *napi)
4691{
4692	struct sk_buff *skb = napi->skb;
4693
4694	if (!skb) {
4695		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4696		if (skb) {
4697			napi->skb = skb;
4698			skb_mark_napi_id(skb, napi);
4699		}
4700	}
4701	return skb;
4702}
4703EXPORT_SYMBOL(napi_get_frags);
4704
4705static gro_result_t napi_frags_finish(struct napi_struct *napi,
4706				      struct sk_buff *skb,
4707				      gro_result_t ret)
4708{
4709	switch (ret) {
4710	case GRO_NORMAL:
4711	case GRO_HELD:
4712		__skb_push(skb, ETH_HLEN);
4713		skb->protocol = eth_type_trans(skb, skb->dev);
4714		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4715			ret = GRO_DROP;
4716		break;
4717
4718	case GRO_DROP:
 
 
 
4719	case GRO_MERGED_FREE:
4720		napi_reuse_skb(napi, skb);
 
 
 
4721		break;
4722
4723	case GRO_MERGED:
 
4724		break;
4725	}
4726
4727	return ret;
4728}
4729
/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 *
 * Takes ownership of napi->skb (napi->skb is set to NULL), pulls the
 * Ethernet header and returns the skb with skb->protocol set to the raw
 * h_proto value.  Returns NULL when the header cannot be obtained; in that
 * case the skb has been handed to napi_reuse_skb().
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	/* Fast path pointer; only valid if the "hard" check below is false. */
	eth = skb_gro_header_fast(skb, 0);
	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		/* Header not available through frag0: use the slow helper. */
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
					     __func__, napi->dev->name);
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		/* Copy the header into the linear area and advance frag0 past it. */
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}
4770
4771gro_result_t napi_gro_frags(struct napi_struct *napi)
4772{
4773	struct sk_buff *skb = napi_frags_skb(napi);
4774
4775	if (!skb)
4776		return GRO_DROP;
4777
4778	trace_napi_gro_frags_entry(skb);
4779
4780	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4781}
4782EXPORT_SYMBOL(napi_gro_frags);
4783
4784/* Compute the checksum from gro_offset and return the folded value
4785 * after adding in any pseudo checksum.
4786 */
4787__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4788{
4789	__wsum wsum;
4790	__sum16 sum;
4791
4792	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4793
4794	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4795	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4796	if (likely(!sum)) {
4797		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4798		    !skb->csum_complete_sw)
4799			netdev_rx_csum_fault(skb->dev);
4800	}
4801
4802	NAPI_GRO_CB(skb)->csum = wsum;
4803	NAPI_GRO_CB(skb)->csum_valid = 1;
4804
4805	return sum;
4806}
4807EXPORT_SYMBOL(__skb_gro_checksum_complete);
4808
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remote = sd->rps_ipi_list;

	if (!remote) {
		local_irq_enable();
		return;
	}

	/* Detach the list while irqs are still off, then re-enable them. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (remote) {
		/* Read the link before kicking the remote CPU. */
		struct softnet_data *following = remote->rps_ipi_next;

		if (cpu_online(remote->cpu))
			smp_call_function_single_async(remote->cpu,
						       &remote->csd);
		remote = following;
	}
#else
	local_irq_enable();
#endif
}
4836
4837static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4838{
4839#ifdef CONFIG_RPS
4840	return sd->rps_ipi_list != NULL;
4841#else
4842	return false;
4843#endif
4844}
4845
/*
 * process_backlog - poll function for the per-CPU backlog NAPI instance
 * @napi: the backlog napi_struct embedded in a softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Packets are taken from sd->process_queue; when that runs dry it is
 * refilled from sd->input_pkt_queue under the rps lock.  Returns the number
 * of packets processed; the backlog's state is cleared when both queues are
 * found empty.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
	bool again = true;
	int work = 0;

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	while (again) {
		struct sk_buff *skb;

		/* process_queue is dequeued here without taking the rps lock. */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			input_queue_head_incr(sd);
			/* Quota used up: return without clearing napi->state. */
			if (++work >= quota)
				return work;

		}

		local_irq_disable();
		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			again = false;
		} else {
			/* Move everything queued so far over to process_queue. */
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
		}
		rps_unlock(sd);
		local_irq_enable();
	}

	return work;
}
4897
4898/**
4899 * __napi_schedule - schedule for receive
4900 * @n: entry to schedule
4901 *
4902 * The entry's receive function will be scheduled to run.
4903 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4904 */
4905void __napi_schedule(struct napi_struct *n)
4906{
4907	unsigned long flags;
4908
4909	local_irq_save(flags);
4910	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4911	local_irq_restore(flags);
4912}
4913EXPORT_SYMBOL(__napi_schedule);
4914
/**
 *	napi_schedule_prep - check if napi can be scheduled
 *	@n: napi context
 *
 * Test if the NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable to
 * ensure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 *
 * Returns true when the caller set NAPIF_STATE_SCHED itself, false when a
 * disable is pending or the bit was already set (in which case
 * NAPIF_STATE_MISSED is set as well).
 */
bool napi_schedule_prep(struct napi_struct *n)
{
	unsigned long val, new;

	/* Lock-free update of n->state: retry until cmpxchg() succeeds. */
	do {
		val = READ_ONCE(n->state);
		if (unlikely(val & NAPIF_STATE_DISABLE))
			return false;
		new = val | NAPIF_STATE_SCHED;

		/* Sets STATE_MISSED bit if STATE_SCHED was already set
		 * This was suggested by Alexander Duyck, as compiler
		 * emits better code than :
		 * if (val & NAPIF_STATE_SCHED)
		 *     new |= NAPIF_STATE_MISSED;
		 */
		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
						   NAPIF_STATE_MISSED;
	} while (cmpxchg(&n->state, val, new) != val);

	return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);
4947
4948/**
4949 * __napi_schedule_irqoff - schedule for receive
4950 * @n: entry to schedule
4951 *
4952 * Variant of __napi_schedule() assuming hard irqs are masked
4953 */
4954void __napi_schedule_irqoff(struct napi_struct *n)
4955{
4956	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4957}
4958EXPORT_SYMBOL(__napi_schedule_irqoff);
4959
/*
 * __napi_complete - remove @n from its poll list and clear NAPI_STATE_SCHED
 * @n: napi context; NAPI_STATE_SCHED must be set (BUG otherwise)
 *
 * Returns false, doing nothing, while NAPI_STATE_IN_BUSY_POLL is set;
 * true once the instance has been completed.
 */
bool __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

	/* Some drivers call us directly, instead of calling
	 * napi_complete_done().
	 */
	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
		return false;

	list_del_init(&n->poll_list);
	/* Order the list removal before the SCHED bit is cleared. */
	smp_mb__before_atomic();
	clear_bit(NAPI_STATE_SCHED, &n->state);
	return true;
}
EXPORT_SYMBOL(__napi_complete);
4976
/*
 * napi_complete_done - finish a NAPI poll round
 * @n: napi context
 * @work_done: amount of work the poll reported; zero forces an immediate
 *	       GRO flush instead of arming the flush timer
 *
 * Returns true when the instance was completed (NAPIF_STATE_SCHED cleared).
 * Returns false when nothing was done because of netpoll service or busy
 * polling, or when a missed schedule was detected and the instance was
 * rescheduled.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags, val, new;

	/*
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case its running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
	 */
	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
				 NAPIF_STATE_IN_BUSY_POLL)))
		return false;

	if (n->gro_list) {
		unsigned long timeout = 0;

		if (work_done)
			timeout = n->dev->gro_flush_timeout;

		/* Either defer the GRO flush to the hrtimer or flush everything now. */
		if (timeout)
			hrtimer_start(&n->timer, ns_to_ktime(timeout),
				      HRTIMER_MODE_REL_PINNED);
		else
			napi_gro_flush(n, false);
	}
	if (unlikely(!list_empty(&n->poll_list))) {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		list_del_init(&n->poll_list);
		local_irq_restore(flags);
	}

	/* Lock-free update of n->state: retry until cmpxchg() succeeds. */
	do {
		val = READ_ONCE(n->state);

		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);

		/* If STATE_MISSED was set, leave STATE_SCHED set,
		 * because we will call napi->poll() one more time.
		 * This C code was suggested by Alexander Duyck to help gcc.
		 */
		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
						    NAPIF_STATE_SCHED;
	} while (cmpxchg(&n->state, val, new) != val);

	/* A schedule attempt arrived while we were polling: go around again. */
	if (unlikely(val & NAPIF_STATE_MISSED)) {
		__napi_schedule(n);
		return false;
	}

	return true;
}
EXPORT_SYMBOL(napi_complete_done);
5033
5034/* must be called under rcu_read_lock(), as we dont take a reference */
5035static struct napi_struct *napi_by_id(unsigned int napi_id)
5036{
5037	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5038	struct napi_struct *napi;
5039
5040	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5041		if (napi->napi_id == napi_id)
5042			return napi;
5043
5044	return NULL;
5045}
5046
5047#if defined(CONFIG_NET_RX_BUSY_POLL)
5048
5049#define BUSY_POLL_BUDGET 8
5050
5051static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5052{
5053	int rc;
5054
5055	/* Busy polling means there is a high chance device driver hard irq
5056	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5057	 * set in napi_schedule_prep().
5058	 * Since we are about to call napi->poll() once more, we can safely
5059	 * clear NAPI_STATE_MISSED.
5060	 *
5061	 * Note: x86 could use a single "lock and ..." instruction
5062	 * to perform these two clear_bit()
5063	 */
5064	clear_bit(NAPI_STATE_MISSED, &napi->state);
5065	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5066
5067	local_bh_disable();
5068
5069	/* All we really want here is to re-enable device interrupts.
5070	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5071	 */
5072	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 
5073	netpoll_poll_unlock(have_poll_lock);
5074	if (rc == BUSY_POLL_BUDGET)
5075		__napi_schedule(napi);
5076	local_bh_enable();
5077	if (local_softirq_pending())
5078		do_softirq();
5079}
5080
/* Busy-poll the NAPI instance recorded in @sk->sk_napi_id.
 *
 * @sk: socket whose receive queue is being waited on
 * @nonblock: when non-zero, no end time is computed and the loop is left
 *	      after a single pass
 *
 * Returns false when no NAPI instance is found for the socket; otherwise
 * returns whether @sk->sk_receive_queue is non-empty when polling stops.
 */
bool sk_busy_loop(struct sock *sk, int nonblock)
{
	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
	int (*napi_poll)(struct napi_struct *napi, int budget);
	int (*busy_poll)(struct napi_struct *dev);
	void *have_poll_lock = NULL;
	struct napi_struct *napi;
	int rc;

restart:
	rc = false;
	/* napi_poll stays NULL until this thread owns the NAPI instance */
	napi_poll = NULL;

	rcu_read_lock();

	napi = napi_by_id(sk->sk_napi_id);
	if (!napi)
		goto out;

	/* Note: ndo_busy_poll method is optional in linux-4.5 */
	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;

	preempt_disable();
	for (;;) {
		rc = 0;
		local_bh_disable();
		/* Driver-provided busy poll takes precedence */
		if (busy_poll) {
			rc = busy_poll(napi);
			goto count;
		}
		if (!napi_poll) {
			unsigned long val = READ_ONCE(napi->state);

			/* If multiple threads are competing for this napi,
			 * we avoid dirtying napi->state as much as we can.
			 */
			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
				   NAPIF_STATE_IN_BUSY_POLL))
				goto count;
			/* Claim the instance: set IN_BUSY_POLL and SCHED in
			 * one step, giving up this round if state changed.
			 */
			if (cmpxchg(&napi->state, val,
				    val | NAPIF_STATE_IN_BUSY_POLL |
					  NAPIF_STATE_SCHED) != val)
				goto count;
			have_poll_lock = netpoll_poll_lock(napi);
			napi_poll = napi->poll;
		}
		rc = napi_poll(napi, BUSY_POLL_BUDGET);
		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
count:
		if (rc > 0)
			__NET_ADD_STATS(sock_net(sk),
					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
		local_bh_enable();

		if (rc == LL_FLUSH_FAILED)
			break; /* permanent failure */

		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
		    busy_loop_timeout(end_time))
			break;

		/* Give the CPU away: release the NAPI instance (if owned),
		 * leave the preempt-disabled and RCU sections, then either
		 * return or start over with a fresh lookup.
		 */
		if (unlikely(need_resched())) {
			if (napi_poll)
				busy_poll_stop(napi, have_poll_lock);
			preempt_enable();
			rcu_read_unlock();
			cond_resched();
			rc = !skb_queue_empty(&sk->sk_receive_queue);
			if (rc || busy_loop_timeout(end_time))
				return rc;
			goto restart;
		}
		cpu_relax();
	}
	/* Only the thread that claimed the instance has to release it */
	if (napi_poll)
		busy_poll_stop(napi, have_poll_lock);
	preempt_enable();
	rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL(sk_busy_loop);
5164
5165#endif /* CONFIG_NET_RX_BUSY_POLL */
5166
5167static void napi_hash_add(struct napi_struct *napi)
5168{
5169	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5170	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5171		return;
5172
5173	spin_lock(&napi_hash_lock);
5174
5175	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5176	do {
5177		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5178			napi_gen_id = NR_CPUS + 1;
5179	} while (napi_by_id(napi_gen_id));
5180	napi->napi_id = napi_gen_id;
5181
5182	hlist_add_head_rcu(&napi->napi_hash_node,
5183			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5184
5185	spin_unlock(&napi_hash_lock);
5186}
5187
5188/* Warning : caller is responsible to make sure rcu grace period
5189 * is respected before freeing memory containing @napi
5190 */
5191bool napi_hash_del(struct napi_struct *napi)
5192{
5193	bool rcu_sync_needed = false;
5194
5195	spin_lock(&napi_hash_lock);
5196
5197	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5198		rcu_sync_needed = true;
5199		hlist_del_rcu(&napi->napi_hash_node);
5200	}
5201	spin_unlock(&napi_hash_lock);
5202	return rcu_sync_needed;
5203}
5204EXPORT_SYMBOL_GPL(napi_hash_del);
5205
5206static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5207{
5208	struct napi_struct *napi;
5209
5210	napi = container_of(timer, struct napi_struct, timer);
5211
5212	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
5213	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5214	 */
5215	if (napi->gro_list && !napi_disable_pending(napi) &&
5216	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5217		__napi_schedule_irqoff(napi);
5218
5219	return HRTIMER_NORESTART;
5220}
5221
5222void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5223		    int (*poll)(struct napi_struct *, int), int weight)
5224{
5225	INIT_LIST_HEAD(&napi->poll_list);
5226	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5227	napi->timer.function = napi_watchdog;
5228	napi->gro_count = 0;
5229	napi->gro_list = NULL;
5230	napi->skb = NULL;
5231	napi->poll = poll;
5232	if (weight > NAPI_POLL_WEIGHT)
5233		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5234			    weight, dev->name);
5235	napi->weight = weight;
5236	list_add(&napi->dev_list, &dev->napi_list);
5237	napi->dev = dev;
5238#ifdef CONFIG_NETPOLL
5239	napi->poll_owner = -1;
5240#endif
5241	set_bit(NAPI_STATE_SCHED, &napi->state);
5242	napi_hash_add(napi);
5243}
5244EXPORT_SYMBOL(netif_napi_add);
5245
5246void napi_disable(struct napi_struct *n)
5247{
5248	might_sleep();
5249	set_bit(NAPI_STATE_DISABLE, &n->state);
5250
5251	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5252		msleep(1);
5253	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5254		msleep(1);
5255
5256	hrtimer_cancel(&n->timer);
5257
5258	clear_bit(NAPI_STATE_DISABLE, &n->state);
5259}
5260EXPORT_SYMBOL(napi_disable);
5261
5262/* Must be called in process context */
5263void netif_napi_del(struct napi_struct *napi)
5264{
5265	might_sleep();
5266	if (napi_hash_del(napi))
5267		synchronize_net();
5268	list_del_init(&napi->dev_list);
5269	napi_free_frags(napi);
5270
5271	kfree_skb_list(napi->gro_list);
5272	napi->gro_list = NULL;
5273	napi->gro_count = 0;
5274}
5275EXPORT_SYMBOL(netif_napi_del);
5276
/* Run one ->poll() round for @n.
 *
 * @n: NAPI instance to poll; it is unlinked from the list it is on
 * @repoll: list to which @n is appended when it used its whole weight
 *	    and has to be polled again
 *
 * Returns the amount of work reported by ->poll(), or 0 when ->poll()
 * was not called because NAPI_STATE_SCHED was not set.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}

	WARN_ON_ONCE(work > weight);

	/* Less than the full weight was used: nothing more to do here */
	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	if (n->gro_list) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	/* Full weight used and still ours: queue for another round */
	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}
5338
/* NET_RX softirq handler: poll the NAPI instances queued on this CPU's
 * softnet_data until the list is empty, the packet budget (netdev_budget)
 * is used up, or the time limit of two jiffies is reached.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	/* Take over the whole per-cpu poll list with irqs off, then work on
	 * the private copy.
	 */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			/* Nothing left to requeue and no RPS IPI waiting:
			 * skip the irq-off merge below.
			 */
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	/* Rebuild sd->poll_list: unpolled entries first, then entries queued
	 * on sd->poll_list in the meantime, then the ones needing a repoll.
	 */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}
5386
/* One entry of a device's adjacency list (dev->adj_list.upper or
 * dev->adj_list.lower), describing a link to another net_device.
 */
struct netdev_adjacent {
	/* the adjacent device; a reference is held on it while linked */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning adjacency list */
	struct list_head list;
	/* used to free the entry with kfree_rcu() */
	struct rcu_head rcu;
};
5402
5403static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5404						 struct list_head *adj_list)
5405{
5406	struct netdev_adjacent *adj;
5407
5408	list_for_each_entry(adj, adj_list, list) {
5409		if (adj->dev == adj_dev)
5410			return adj;
5411	}
5412	return NULL;
5413}
5414
/* Walk callback: returns non-zero when @upper_dev is the device passed
 * in @data, which makes the walk stop.
 */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	struct net_device *wanted = data;

	return wanted == upper_dev;
}
5421
5422/**
5423 * netdev_has_upper_dev - Check if device is linked to an upper device
5424 * @dev: device
5425 * @upper_dev: upper device to check
5426 *
5427 * Find out if a device is linked to specified upper device and return true
5428 * in case it is. Note that this checks only immediate upper device,
5429 * not through a complete stack of devices. The caller must hold the RTNL lock.
5430 */
5431bool netdev_has_upper_dev(struct net_device *dev,
5432			  struct net_device *upper_dev)
5433{
5434	ASSERT_RTNL();
5435
5436	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5437					     upper_dev);
5438}
5439EXPORT_SYMBOL(netdev_has_upper_dev);
5440
5441/**
5442 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5443 * @dev: device
5444 * @upper_dev: upper device to check
5445 *
5446 * Find out if a device is linked to specified upper device and return true
5447 * in case it is. Note that this checks the entire upper device chain.
5448 * The caller must hold rcu lock.
5449 */
5450
5451bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5452				  struct net_device *upper_dev)
5453{
5454	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5455					       upper_dev);
5456}
5457EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5458
5459/**
5460 * netdev_has_any_upper_dev - Check if device is linked to some device
5461 * @dev: device
5462 *
5463 * Find out if a device is linked to an upper device and return true in case
5464 * it is. The caller must hold the RTNL lock.
5465 */
5466static bool netdev_has_any_upper_dev(struct net_device *dev)
5467{
5468	ASSERT_RTNL();
5469
5470	return !list_empty(&dev->adj_list.upper);
5471}
 
5472
5473/**
5474 * netdev_master_upper_dev_get - Get master upper device
5475 * @dev: device
5476 *
5477 * Find a master upper device and return pointer to it or NULL in case
5478 * it's not there. The caller must hold the RTNL lock.
5479 */
5480struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5481{
5482	struct netdev_adjacent *upper;
5483
5484	ASSERT_RTNL();
5485
5486	if (list_empty(&dev->adj_list.upper))
5487		return NULL;
5488
5489	upper = list_first_entry(&dev->adj_list.upper,
5490				 struct netdev_adjacent, list);
5491	if (likely(upper->master))
5492		return upper->dev;
5493	return NULL;
5494}
5495EXPORT_SYMBOL(netdev_master_upper_dev_get);
5496
5497/**
5498 * netdev_has_any_lower_dev - Check if device is linked to some device
5499 * @dev: device
5500 *
5501 * Find out if a device is linked to a lower device and return true in case
5502 * it is. The caller must hold the RTNL lock.
5503 */
5504static bool netdev_has_any_lower_dev(struct net_device *dev)
5505{
5506	ASSERT_RTNL();
5507
5508	return !list_empty(&dev->adj_list.lower);
5509}
5510
5511void *netdev_adjacent_get_private(struct list_head *adj_list)
5512{
5513	struct netdev_adjacent *adj;
5514
5515	adj = list_entry(adj_list, struct netdev_adjacent, list);
5516
5517	return adj->private;
5518}
5519EXPORT_SYMBOL(netdev_adjacent_get_private);
5520
5521/**
5522 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5523 * @dev: device
5524 * @iter: list_head ** of the current position
5525 *
5526 * Gets the next device from the dev's upper list, starting from iter
5527 * position. The caller must hold RCU read lock.
5528 */
5529struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5530						 struct list_head **iter)
5531{
5532	struct netdev_adjacent *upper;
5533
5534	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5535
5536	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5537
5538	if (&upper->list == &dev->adj_list.upper)
5539		return NULL;
5540
5541	*iter = &upper->list;
5542
5543	return upper->dev;
5544}
5545EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5546
5547static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5548						    struct list_head **iter)
5549{
5550	struct netdev_adjacent *upper;
5551
5552	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5553
5554	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5555
5556	if (&upper->list == &dev->adj_list.upper)
5557		return NULL;
5558
5559	*iter = &upper->list;
5560
5561	return upper->dev;
5562}
5563
5564int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5565				  int (*fn)(struct net_device *dev,
5566					    void *data),
5567				  void *data)
5568{
5569	struct net_device *udev;
5570	struct list_head *iter;
5571	int ret;
5572
5573	for (iter = &dev->adj_list.upper,
5574	     udev = netdev_next_upper_dev_rcu(dev, &iter);
5575	     udev;
5576	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5577		/* first is the upper device itself */
5578		ret = fn(udev, data);
5579		if (ret)
5580			return ret;
5581
5582		/* then look at all of its upper devices */
5583		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5584		if (ret)
5585			return ret;
5586	}
5587
5588	return 0;
5589}
5590EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5591
5592/**
5593 * netdev_lower_get_next_private - Get the next ->private from the
5594 *				   lower neighbour list
5595 * @dev: device
5596 * @iter: list_head ** of the current position
5597 *
5598 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5599 * list, starting from iter position. The caller must hold either hold the
5600 * RTNL lock or its own locking that guarantees that the neighbour lower
5601 * list will remain unchanged.
5602 */
5603void *netdev_lower_get_next_private(struct net_device *dev,
5604				    struct list_head **iter)
5605{
5606	struct netdev_adjacent *lower;
5607
5608	lower = list_entry(*iter, struct netdev_adjacent, list);
5609
5610	if (&lower->list == &dev->adj_list.lower)
5611		return NULL;
5612
5613	*iter = lower->list.next;
5614
5615	return lower->private;
5616}
5617EXPORT_SYMBOL(netdev_lower_get_next_private);
5618
5619/**
5620 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5621 *				       lower neighbour list, RCU
5622 *				       variant
5623 * @dev: device
5624 * @iter: list_head ** of the current position
5625 *
5626 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5627 * list, starting from iter position. The caller must hold RCU read lock.
5628 */
5629void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5630					struct list_head **iter)
5631{
5632	struct netdev_adjacent *lower;
5633
5634	WARN_ON_ONCE(!rcu_read_lock_held());
5635
5636	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5637
5638	if (&lower->list == &dev->adj_list.lower)
5639		return NULL;
5640
5641	*iter = &lower->list;
5642
5643	return lower->private;
5644}
5645EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5646
5647/**
5648 * netdev_lower_get_next - Get the next device from the lower neighbour
5649 *                         list
5650 * @dev: device
5651 * @iter: list_head ** of the current position
5652 *
5653 * Gets the next netdev_adjacent from the dev's lower neighbour
5654 * list, starting from iter position. The caller must hold RTNL lock or
5655 * its own locking that guarantees that the neighbour lower
5656 * list will remain unchanged.
5657 */
5658void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5659{
5660	struct netdev_adjacent *lower;
5661
5662	lower = list_entry(*iter, struct netdev_adjacent, list);
5663
5664	if (&lower->list == &dev->adj_list.lower)
5665		return NULL;
5666
5667	*iter = lower->list.next;
5668
5669	return lower->dev;
5670}
5671EXPORT_SYMBOL(netdev_lower_get_next);
5672
5673static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5674						struct list_head **iter)
5675{
5676	struct netdev_adjacent *lower;
5677
5678	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5679
5680	if (&lower->list == &dev->adj_list.lower)
5681		return NULL;
5682
5683	*iter = &lower->list;
5684
5685	return lower->dev;
5686}
5687
5688int netdev_walk_all_lower_dev(struct net_device *dev,
5689			      int (*fn)(struct net_device *dev,
5690					void *data),
5691			      void *data)
5692{
5693	struct net_device *ldev;
5694	struct list_head *iter;
5695	int ret;
5696
5697	for (iter = &dev->adj_list.lower,
5698	     ldev = netdev_next_lower_dev(dev, &iter);
5699	     ldev;
5700	     ldev = netdev_next_lower_dev(dev, &iter)) {
5701		/* first is the lower device itself */
5702		ret = fn(ldev, data);
5703		if (ret)
5704			return ret;
5705
5706		/* then look at all of its lower devices */
5707		ret = netdev_walk_all_lower_dev(ldev, fn, data);
5708		if (ret)
5709			return ret;
5710	}
5711
5712	return 0;
5713}
5714EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5715
5716static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5717						    struct list_head **iter)
5718{
5719	struct netdev_adjacent *lower;
5720
5721	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5722	if (&lower->list == &dev->adj_list.lower)
5723		return NULL;
5724
5725	*iter = &lower->list;
5726
5727	return lower->dev;
5728}
5729
5730int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5731				  int (*fn)(struct net_device *dev,
5732					    void *data),
5733				  void *data)
5734{
5735	struct net_device *ldev;
5736	struct list_head *iter;
5737	int ret;
5738
5739	for (iter = &dev->adj_list.lower,
5740	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
5741	     ldev;
5742	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5743		/* first is the lower device itself */
5744		ret = fn(ldev, data);
5745		if (ret)
5746			return ret;
5747
5748		/* then look at all of its lower devices */
5749		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5750		if (ret)
5751			return ret;
5752	}
5753
5754	return 0;
5755}
5756EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5757
5758/**
5759 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5760 *				       lower neighbour list, RCU
5761 *				       variant
5762 * @dev: device
5763 *
5764 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5765 * list. The caller must hold RCU read lock.
5766 */
5767void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5768{
5769	struct netdev_adjacent *lower;
5770
5771	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5772			struct netdev_adjacent, list);
5773	if (lower)
5774		return lower->private;
5775	return NULL;
5776}
5777EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5778
5779/**
5780 * netdev_master_upper_dev_get_rcu - Get master upper device
5781 * @dev: device
5782 *
5783 * Find a master upper device and return pointer to it or NULL in case
5784 * it's not there. The caller must hold the RCU read lock.
5785 */
5786struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5787{
5788	struct netdev_adjacent *upper;
5789
5790	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5791				       struct netdev_adjacent, list);
5792	if (upper && likely(upper->master))
5793		return upper->dev;
5794	return NULL;
5795}
5796EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5797
5798static int netdev_adjacent_sysfs_add(struct net_device *dev,
5799			      struct net_device *adj_dev,
5800			      struct list_head *dev_list)
5801{
5802	char linkname[IFNAMSIZ+7];
 
5803	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5804		"upper_%s" : "lower_%s", adj_dev->name);
5805	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5806				 linkname);
5807}
5808static void netdev_adjacent_sysfs_del(struct net_device *dev,
5809			       char *name,
5810			       struct list_head *dev_list)
5811{
5812	char linkname[IFNAMSIZ+7];
 
5813	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5814		"upper_%s" : "lower_%s", name);
5815	sysfs_remove_link(&(dev->dev.kobj), linkname);
5816}
5817
5818static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5819						 struct net_device *adj_dev,
5820						 struct list_head *dev_list)
5821{
5822	return (dev_list == &dev->adj_list.upper ||
5823		dev_list == &dev->adj_list.lower) &&
5824		net_eq(dev_net(dev), dev_net(adj_dev));
5825}
5826
5827static int __netdev_adjacent_dev_insert(struct net_device *dev,
5828					struct net_device *adj_dev,
5829					struct list_head *dev_list,
5830					void *private, bool master)
5831{
5832	struct netdev_adjacent *adj;
5833	int ret;
5834
5835	adj = __netdev_find_adj(adj_dev, dev_list);
5836
5837	if (adj) {
5838		adj->ref_nr += 1;
5839		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5840			 dev->name, adj_dev->name, adj->ref_nr);
5841
5842		return 0;
5843	}
5844
5845	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5846	if (!adj)
5847		return -ENOMEM;
5848
5849	adj->dev = adj_dev;
5850	adj->master = master;
5851	adj->ref_nr = 1;
5852	adj->private = private;
5853	dev_hold(adj_dev);
5854
5855	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5856		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5857
5858	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5859		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5860		if (ret)
5861			goto free_adj;
5862	}
5863
5864	/* Ensure that master link is always the first item in list. */
5865	if (master) {
5866		ret = sysfs_create_link(&(dev->dev.kobj),
5867					&(adj_dev->dev.kobj), "master");
5868		if (ret)
5869			goto remove_symlinks;
5870
5871		list_add_rcu(&adj->list, dev_list);
5872	} else {
5873		list_add_tail_rcu(&adj->list, dev_list);
5874	}
5875
5876	return 0;
5877
5878remove_symlinks:
5879	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5880		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5881free_adj:
5882	kfree(adj);
5883	dev_put(adj_dev);
5884
5885	return ret;
5886}
5887
5888static void __netdev_adjacent_dev_remove(struct net_device *dev,
5889					 struct net_device *adj_dev,
5890					 u16 ref_nr,
5891					 struct list_head *dev_list)
5892{
5893	struct netdev_adjacent *adj;
5894
5895	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5896		 dev->name, adj_dev->name, ref_nr);
5897
5898	adj = __netdev_find_adj(adj_dev, dev_list);
5899
5900	if (!adj) {
5901		pr_err("Adjacency does not exist for device %s from %s\n",
5902		       dev->name, adj_dev->name);
5903		WARN_ON(1);
5904		return;
5905	}
5906
5907	if (adj->ref_nr > ref_nr) {
5908		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5909			 dev->name, adj_dev->name, ref_nr,
5910			 adj->ref_nr - ref_nr);
5911		adj->ref_nr -= ref_nr;
5912		return;
5913	}
5914
5915	if (adj->master)
5916		sysfs_remove_link(&(dev->dev.kobj), "master");
5917
5918	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5919		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5920
5921	list_del_rcu(&adj->list);
5922	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5923		 adj_dev->name, dev->name, adj_dev->name);
5924	dev_put(adj_dev);
5925	kfree_rcu(adj, rcu);
5926}
5927
/* Link @dev and @upper_dev in both directions: @upper_dev goes on
 * @up_list and @dev goes on @down_list. When the second insertion fails
 * the first one is undone. Returns 0 or the error of the failed insertion.
 */
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
					    struct net_device *upper_dev,
					    struct list_head *up_list,
					    struct list_head *down_list,
					    void *private, bool master)
{
	int err;

	err = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
					   private, master);
	if (err)
		return err;

	err = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
					   private, false);
	if (err)
		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);

	return err;
}
5950
/* Drop @ref_nr references of the @dev <-> @upper_dev link in both
 * directions: @upper_dev from @up_list and @dev from @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       u16 ref_nr,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
5960
/* Link @dev and @upper_dev as direct neighbours, using the upper list of
 * @dev and the lower list of @upper_dev. Returns 0 or a negative errno
 * from __netdev_adjacent_dev_link_lists().
 */
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
						struct net_device *upper_dev,
						void *private, bool master)
{
	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
						&dev->adj_list.upper,
						&upper_dev->adj_list.lower,
						private, master);
}
5970
/* Drop one reference of the direct-neighbour link between @dev and
 * @upper_dev (upper list of @dev, lower list of @upper_dev).
 */
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
						   struct net_device *upper_dev)
{
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
					   &dev->adj_list.upper,
					   &upper_dev->adj_list.lower);
}
5978
5979static int __netdev_upper_dev_link(struct net_device *dev,
5980				   struct net_device *upper_dev, bool master,
5981				   void *upper_priv, void *upper_info)
 
5982{
5983	struct netdev_notifier_changeupper_info changeupper_info;
 
 
 
 
 
 
 
 
 
 
5984	int ret = 0;
5985
5986	ASSERT_RTNL();
5987
5988	if (dev == upper_dev)
5989		return -EBUSY;
5990
5991	/* To prevent loops, check if dev is not upper device to upper_dev. */
5992	if (netdev_has_upper_dev(upper_dev, dev))
5993		return -EBUSY;
5994
5995	if (netdev_has_upper_dev(dev, upper_dev))
5996		return -EEXIST;
 
 
 
 
 
 
5997
5998	if (master && netdev_master_upper_dev_get(dev))
5999		return -EBUSY;
6000
6001	changeupper_info.upper_dev = upper_dev;
6002	changeupper_info.master = master;
6003	changeupper_info.linking = true;
6004	changeupper_info.upper_info = upper_info;
6005
6006	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6007					    &changeupper_info.info);
6008	ret = notifier_to_errno(ret);
6009	if (ret)
6010		return ret;
6011
6012	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6013						   master);
6014	if (ret)
6015		return ret;
6016
6017	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6018					    &changeupper_info.info);
6019	ret = notifier_to_errno(ret);
6020	if (ret)
6021		goto rollback;
6022
6023	return 0;
6024
6025rollback:
6026	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6027
6028	return ret;
6029}
6030
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a non-master link to device which is upper to this one; no private
 * data and no upper info are attached. The caller must hold the RTNL
 * lock. On a failure a negative errno code is returned. On success the
 * reference counts are adjusted and the function returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
			  struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
6047
/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @upper_priv: upper device private
 * @upper_info: upper info to be passed down via notifier
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev,
				 void *upper_priv, void *upper_info)
{
	return __netdev_upper_dev_link(dev, upper_dev, true,
				       upper_priv, upper_info);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
6069
6070/**
6071 * netdev_upper_dev_unlink - Removes a link to upper device
6072 * @dev: device
6073 * @upper_dev: new upper device
6074 *
6075 * Removes a link to device which is upper to this one. The caller must hold
6076 * the RTNL lock.
6077 */
6078void netdev_upper_dev_unlink(struct net_device *dev,
6079			     struct net_device *upper_dev)
6080{
6081	struct netdev_notifier_changeupper_info changeupper_info;
 
 
 
 
 
 
 
6082	ASSERT_RTNL();
6083
6084	changeupper_info.upper_dev = upper_dev;
6085	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6086	changeupper_info.linking = false;
6087
6088	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6089				      &changeupper_info.info);
6090
6091	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6092
6093	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6094				      &changeupper_info.info);
6095}
6096EXPORT_SYMBOL(netdev_upper_dev_unlink);
6097
6098/**
6099 * netdev_bonding_info_change - Dispatch event about slave change
6100 * @dev: device
6101 * @bonding_info: info to dispatch
6102 *
6103 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6104 * The caller must hold the RTNL lock.
6105 */
6106void netdev_bonding_info_change(struct net_device *dev,
6107				struct netdev_bonding_info *bonding_info)
6108{
6109	struct netdev_notifier_bonding_info	info;
 
 
6110
6111	memcpy(&info.bonding_info, bonding_info,
6112	       sizeof(struct netdev_bonding_info));
6113	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6114				      &info.info);
6115}
6116EXPORT_SYMBOL(netdev_bonding_info_change);
6117
6118static void netdev_adjacent_add_links(struct net_device *dev)
6119{
6120	struct netdev_adjacent *iter;
6121
6122	struct net *net = dev_net(dev);
6123
6124	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6125		if (!net_eq(net, dev_net(iter->dev)))
6126			continue;
6127		netdev_adjacent_sysfs_add(iter->dev, dev,
6128					  &iter->dev->adj_list.lower);
6129		netdev_adjacent_sysfs_add(dev, iter->dev,
6130					  &dev->adj_list.upper);
6131	}
6132
6133	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6134		if (!net_eq(net, dev_net(iter->dev)))
6135			continue;
6136		netdev_adjacent_sysfs_add(iter->dev, dev,
6137					  &iter->dev->adj_list.upper);
6138		netdev_adjacent_sysfs_add(dev, iter->dev,
6139					  &dev->adj_list.lower);
6140	}
6141}
6142
6143static void netdev_adjacent_del_links(struct net_device *dev)
6144{
6145	struct netdev_adjacent *iter;
6146
6147	struct net *net = dev_net(dev);
6148
6149	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6150		if (!net_eq(net, dev_net(iter->dev)))
6151			continue;
6152		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6153					  &iter->dev->adj_list.lower);
6154		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6155					  &dev->adj_list.upper);
6156	}
6157
6158	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6159		if (!net_eq(net, dev_net(iter->dev)))
6160			continue;
6161		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6162					  &iter->dev->adj_list.upper);
6163		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6164					  &dev->adj_list.lower);
6165	}
6166}
6167
6168void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6169{
6170	struct netdev_adjacent *iter;
6171
6172	struct net *net = dev_net(dev);
6173
6174	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6175		if (!net_eq(net, dev_net(iter->dev)))
6176			continue;
6177		netdev_adjacent_sysfs_del(iter->dev, oldname,
6178					  &iter->dev->adj_list.lower);
6179		netdev_adjacent_sysfs_add(iter->dev, dev,
6180					  &iter->dev->adj_list.lower);
6181	}
6182
6183	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6184		if (!net_eq(net, dev_net(iter->dev)))
6185			continue;
6186		netdev_adjacent_sysfs_del(iter->dev, oldname,
6187					  &iter->dev->adj_list.upper);
6188		netdev_adjacent_sysfs_add(iter->dev, dev,
6189					  &iter->dev->adj_list.upper);
6190	}
6191}
6192
6193void *netdev_lower_dev_get_private(struct net_device *dev,
6194				   struct net_device *lower_dev)
6195{
6196	struct netdev_adjacent *lower;
6197
6198	if (!lower_dev)
6199		return NULL;
6200	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6201	if (!lower)
6202		return NULL;
6203
6204	return lower->private;
6205}
6206EXPORT_SYMBOL(netdev_lower_dev_get_private);
6207
6208
6209int dev_get_nest_level(struct net_device *dev)
6210{
6211	struct net_device *lower = NULL;
6212	struct list_head *iter;
6213	int max_nest = -1;
6214	int nest;
6215
6216	ASSERT_RTNL();
6217
6218	netdev_for_each_lower_dev(dev, lower, iter) {
6219		nest = dev_get_nest_level(lower);
6220		if (max_nest < nest)
6221			max_nest = nest;
6222	}
6223
6224	return max_nest + 1;
6225}
6226EXPORT_SYMBOL(dev_get_nest_level);
6227
6228/**
6229 * netdev_lower_change - Dispatch event about lower device state change
6230 * @lower_dev: device
6231 * @lower_state_info: state to dispatch
6232 *
6233 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6234 * The caller must hold the RTNL lock.
6235 */
6236void netdev_lower_state_changed(struct net_device *lower_dev,
6237				void *lower_state_info)
6238{
6239	struct netdev_notifier_changelowerstate_info changelowerstate_info;
 
 
6240
6241	ASSERT_RTNL();
6242	changelowerstate_info.lower_state_info = lower_state_info;
6243	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6244				      &changelowerstate_info.info);
6245}
6246EXPORT_SYMBOL(netdev_lower_state_changed);
6247
6248int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6249					   struct neighbour *n)
6250{
6251	struct net_device *lower_dev, *stop_dev;
6252	struct list_head *iter;
6253	int err;
6254
6255	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6256		if (!lower_dev->netdev_ops->ndo_neigh_construct)
6257			continue;
6258		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6259		if (err) {
6260			stop_dev = lower_dev;
6261			goto rollback;
6262		}
6263	}
6264	return 0;
6265
6266rollback:
6267	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6268		if (lower_dev == stop_dev)
6269			break;
6270		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6271			continue;
6272		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6273	}
6274	return err;
6275}
6276EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6277
6278void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6279					  struct neighbour *n)
6280{
6281	struct net_device *lower_dev;
6282	struct list_head *iter;
6283
6284	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6285		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6286			continue;
6287		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6288	}
6289}
6290EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6291
6292static void dev_change_rx_flags(struct net_device *dev, int flags)
6293{
6294	const struct net_device_ops *ops = dev->netdev_ops;
6295
6296	if (ops->ndo_change_rx_flags)
6297		ops->ndo_change_rx_flags(dev, flags);
6298}
6299
6300static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6301{
6302	unsigned int old_flags = dev->flags;
6303	kuid_t uid;
6304	kgid_t gid;
6305
6306	ASSERT_RTNL();
6307
6308	dev->flags |= IFF_PROMISC;
6309	dev->promiscuity += inc;
6310	if (dev->promiscuity == 0) {
6311		/*
6312		 * Avoid overflow.
6313		 * If inc causes overflow, untouch promisc and return error.
6314		 */
6315		if (inc < 0)
6316			dev->flags &= ~IFF_PROMISC;
6317		else {
6318			dev->promiscuity -= inc;
6319			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6320				dev->name);
6321			return -EOVERFLOW;
6322		}
6323	}
6324	if (dev->flags != old_flags) {
6325		pr_info("device %s %s promiscuous mode\n",
6326			dev->name,
6327			dev->flags & IFF_PROMISC ? "entered" : "left");
6328		if (audit_enabled) {
6329			current_uid_gid(&uid, &gid);
6330			audit_log(current->audit_context, GFP_ATOMIC,
6331				AUDIT_ANOM_PROMISCUOUS,
6332				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6333				dev->name, (dev->flags & IFF_PROMISC),
6334				(old_flags & IFF_PROMISC),
6335				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6336				from_kuid(&init_user_ns, uid),
6337				from_kgid(&init_user_ns, gid),
6338				audit_get_sessionid(current));
6339		}
6340
6341		dev_change_rx_flags(dev, IFF_PROMISC);
6342	}
6343	if (notify)
6344		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6345	return 0;
6346}
6347
6348/**
6349 *	dev_set_promiscuity	- update promiscuity count on a device
6350 *	@dev: device
6351 *	@inc: modifier
6352 *
6353 *	Add or remove promiscuity from a device. While the count in the device
6354 *	remains above zero the interface remains promiscuous. Once it hits zero
6355 *	the device reverts back to normal filtering operation. A negative inc
6356 *	value is used to drop promiscuity on the device.
6357 *	Return 0 if successful or a negative errno code on error.
6358 */
6359int dev_set_promiscuity(struct net_device *dev, int inc)
6360{
6361	unsigned int old_flags = dev->flags;
6362	int err;
6363
6364	err = __dev_set_promiscuity(dev, inc, true);
6365	if (err < 0)
6366		return err;
6367	if (dev->flags != old_flags)
6368		dev_set_rx_mode(dev);
6369	return err;
6370}
6371EXPORT_SYMBOL(dev_set_promiscuity);
6372
6373static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6374{
6375	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6376
6377	ASSERT_RTNL();
6378
6379	dev->flags |= IFF_ALLMULTI;
6380	dev->allmulti += inc;
6381	if (dev->allmulti == 0) {
6382		/*
6383		 * Avoid overflow.
6384		 * If inc causes overflow, untouch allmulti and return error.
6385		 */
6386		if (inc < 0)
6387			dev->flags &= ~IFF_ALLMULTI;
6388		else {
6389			dev->allmulti -= inc;
6390			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6391				dev->name);
6392			return -EOVERFLOW;
6393		}
6394	}
6395	if (dev->flags ^ old_flags) {
6396		dev_change_rx_flags(dev, IFF_ALLMULTI);
6397		dev_set_rx_mode(dev);
6398		if (notify)
6399			__dev_notify_flags(dev, old_flags,
6400					   dev->gflags ^ old_gflags);
6401	}
6402	return 0;
6403}
6404
6405/**
6406 *	dev_set_allmulti	- update allmulti count on a device
6407 *	@dev: device
6408 *	@inc: modifier
6409 *
6410 *	Add or remove reception of all multicast frames to a device. While the
6411 *	count in the device remains above zero the interface remains listening
6412 *	to all interfaces. Once it hits zero the device reverts back to normal
6413 *	filtering operation. A negative @inc value is used to drop the counter
6414 *	when releasing a resource needing all multicasts.
6415 *	Return 0 if successful or a negative errno code on error.
6416 */
6417
6418int dev_set_allmulti(struct net_device *dev, int inc)
6419{
6420	return __dev_set_allmulti(dev, inc, true);
6421}
6422EXPORT_SYMBOL(dev_set_allmulti);
6423
6424/*
6425 *	Upload unicast and multicast address lists to device and
6426 *	configure RX filtering. When the device doesn't support unicast
6427 *	filtering it is put in promiscuous mode while unicast addresses
6428 *	are present.
6429 */
6430void __dev_set_rx_mode(struct net_device *dev)
6431{
6432	const struct net_device_ops *ops = dev->netdev_ops;
6433
6434	/* dev_open will call this function so the list will stay sane. */
6435	if (!(dev->flags&IFF_UP))
6436		return;
6437
6438	if (!netif_device_present(dev))
6439		return;
6440
6441	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6442		/* Unicast addresses changes may only happen under the rtnl,
6443		 * therefore calling __dev_set_promiscuity here is safe.
6444		 */
6445		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6446			__dev_set_promiscuity(dev, 1, false);
6447			dev->uc_promisc = true;
6448		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6449			__dev_set_promiscuity(dev, -1, false);
6450			dev->uc_promisc = false;
6451		}
6452	}
6453
6454	if (ops->ndo_set_rx_mode)
6455		ops->ndo_set_rx_mode(dev);
6456}
6457
/*
 * Locked wrapper around __dev_set_rx_mode(): the call is bracketed by
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
6464
6465/**
6466 *	dev_get_flags - get flags reported to userspace
6467 *	@dev: device
6468 *
6469 *	Get the combination of flag bits exported through APIs to userspace.
6470 */
6471unsigned int dev_get_flags(const struct net_device *dev)
6472{
6473	unsigned int flags;
6474
6475	flags = (dev->flags & ~(IFF_PROMISC |
6476				IFF_ALLMULTI |
6477				IFF_RUNNING |
6478				IFF_LOWER_UP |
6479				IFF_DORMANT)) |
6480		(dev->gflags & (IFF_PROMISC |
6481				IFF_ALLMULTI));
6482
6483	if (netif_running(dev)) {
6484		if (netif_oper_up(dev))
6485			flags |= IFF_RUNNING;
6486		if (netif_carrier_ok(dev))
6487			flags |= IFF_LOWER_UP;
6488		if (netif_dormant(dev))
6489			flags |= IFF_DORMANT;
6490	}
6491
6492	return flags;
6493}
6494EXPORT_SYMBOL(dev_get_flags);
6495
6496int __dev_change_flags(struct net_device *dev, unsigned int flags)
6497{
6498	unsigned int old_flags = dev->flags;
6499	int ret;
6500
6501	ASSERT_RTNL();
6502
6503	/*
6504	 *	Set the flags on our device.
6505	 */
6506
6507	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6508			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6509			       IFF_AUTOMEDIA)) |
6510		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6511				    IFF_ALLMULTI));
6512
6513	/*
6514	 *	Load in the correct multicast list now the flags have changed.
6515	 */
6516
6517	if ((old_flags ^ flags) & IFF_MULTICAST)
6518		dev_change_rx_flags(dev, IFF_MULTICAST);
6519
6520	dev_set_rx_mode(dev);
6521
6522	/*
6523	 *	Have we downed the interface. We handle IFF_UP ourselves
6524	 *	according to user attempts to set it, rather than blindly
6525	 *	setting it.
6526	 */
6527
6528	ret = 0;
6529	if ((old_flags ^ flags) & IFF_UP)
6530		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
 
 
 
 
6531
6532	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6533		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6534		unsigned int old_flags = dev->flags;
6535
6536		dev->gflags ^= IFF_PROMISC;
6537
6538		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6539			if (dev->flags != old_flags)
6540				dev_set_rx_mode(dev);
6541	}
6542
6543	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6544	   is important. Some (broken) drivers set IFF_PROMISC, when
6545	   IFF_ALLMULTI is requested not asking us and not reporting.
6546	 */
6547	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6548		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6549
6550		dev->gflags ^= IFF_ALLMULTI;
6551		__dev_set_allmulti(dev, inc, false);
6552	}
6553
6554	return ret;
6555}
6556
6557void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6558			unsigned int gchanges)
6559{
6560	unsigned int changes = dev->flags ^ old_flags;
6561
6562	if (gchanges)
6563		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6564
6565	if (changes & IFF_UP) {
6566		if (dev->flags & IFF_UP)
6567			call_netdevice_notifiers(NETDEV_UP, dev);
6568		else
6569			call_netdevice_notifiers(NETDEV_DOWN, dev);
6570	}
6571
6572	if (dev->flags & IFF_UP &&
6573	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6574		struct netdev_notifier_change_info change_info;
 
 
 
 
 
6575
6576		change_info.flags_changed = changes;
6577		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6578					      &change_info.info);
6579	}
6580}
6581
6582/**
6583 *	dev_change_flags - change device settings
6584 *	@dev: device
6585 *	@flags: device state flags
6586 *
6587 *	Change settings on device based state flags. The flags are
6588 *	in the userspace exported format.
6589 */
6590int dev_change_flags(struct net_device *dev, unsigned int flags)
6591{
6592	int ret;
6593	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6594
6595	ret = __dev_change_flags(dev, flags);
6596	if (ret < 0)
6597		return ret;
6598
6599	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6600	__dev_notify_flags(dev, old_flags, changes);
6601	return ret;
6602}
6603EXPORT_SYMBOL(dev_change_flags);
6604
6605static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6606{
6607	const struct net_device_ops *ops = dev->netdev_ops;
6608
6609	if (ops->ndo_change_mtu)
6610		return ops->ndo_change_mtu(dev, new_mtu);
6611
6612	dev->mtu = new_mtu;
6613	return 0;
6614}
 
6615
6616/**
6617 *	dev_set_mtu - Change maximum transfer unit
6618 *	@dev: device
6619 *	@new_mtu: new transfer unit
6620 *
6621 *	Change the maximum transfer size of the network device.
6622 */
6623int dev_set_mtu(struct net_device *dev, int new_mtu)
6624{
6625	int err, orig_mtu;
6626
6627	if (new_mtu == dev->mtu)
6628		return 0;
6629
6630	/* MTU must be positive, and in range */
6631	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6632		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6633				    dev->name, new_mtu, dev->min_mtu);
6634		return -EINVAL;
6635	}
6636
6637	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6638		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6639				    dev->name, new_mtu, dev->max_mtu);
6640		return -EINVAL;
6641	}
6642
6643	if (!netif_device_present(dev))
6644		return -ENODEV;
6645
6646	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6647	err = notifier_to_errno(err);
6648	if (err)
6649		return err;
6650
6651	orig_mtu = dev->mtu;
6652	err = __dev_set_mtu(dev, new_mtu);
6653
6654	if (!err) {
6655		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6656		err = notifier_to_errno(err);
6657		if (err) {
6658			/* setting mtu back and notifying everyone again,
6659			 * so that they have a chance to revert changes.
6660			 */
6661			__dev_set_mtu(dev, orig_mtu);
6662			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6663		}
6664	}
6665	return err;
6666}
6667EXPORT_SYMBOL(dev_set_mtu);
6668
6669/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6670 *	dev_set_group - Change group this device belongs to
6671 *	@dev: device
6672 *	@new_group: group this device should belong to
6673 */
6674void dev_set_group(struct net_device *dev, int new_group)
6675{
6676	dev->group = new_group;
6677}
6678EXPORT_SYMBOL(dev_set_group);
6679
6680/**
6681 *	dev_set_mac_address - Change Media Access Control Address
6682 *	@dev: device
6683 *	@sa: new address
6684 *
6685 *	Change the hardware (MAC) address of the device
6686 */
6687int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6688{
6689	const struct net_device_ops *ops = dev->netdev_ops;
6690	int err;
6691
6692	if (!ops->ndo_set_mac_address)
6693		return -EOPNOTSUPP;
6694	if (sa->sa_family != dev->type)
6695		return -EINVAL;
6696	if (!netif_device_present(dev))
6697		return -ENODEV;
6698	err = ops->ndo_set_mac_address(dev, sa);
6699	if (err)
6700		return err;
6701	dev->addr_assign_type = NET_ADDR_SET;
6702	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6703	add_device_randomness(dev->dev_addr, dev->addr_len);
6704	return 0;
6705}
6706EXPORT_SYMBOL(dev_set_mac_address);
6707
6708/**
6709 *	dev_change_carrier - Change device carrier
6710 *	@dev: device
6711 *	@new_carrier: new value
6712 *
6713 *	Change device carrier
6714 */
6715int dev_change_carrier(struct net_device *dev, bool new_carrier)
6716{
6717	const struct net_device_ops *ops = dev->netdev_ops;
6718
6719	if (!ops->ndo_change_carrier)
6720		return -EOPNOTSUPP;
6721	if (!netif_device_present(dev))
6722		return -ENODEV;
6723	return ops->ndo_change_carrier(dev, new_carrier);
6724}
6725EXPORT_SYMBOL(dev_change_carrier);
6726
6727/**
6728 *	dev_get_phys_port_id - Get device physical port ID
6729 *	@dev: device
6730 *	@ppid: port ID
6731 *
6732 *	Get device physical port ID
6733 */
6734int dev_get_phys_port_id(struct net_device *dev,
6735			 struct netdev_phys_item_id *ppid)
6736{
6737	const struct net_device_ops *ops = dev->netdev_ops;
6738
6739	if (!ops->ndo_get_phys_port_id)
6740		return -EOPNOTSUPP;
6741	return ops->ndo_get_phys_port_id(dev, ppid);
6742}
6743EXPORT_SYMBOL(dev_get_phys_port_id);
6744
6745/**
6746 *	dev_get_phys_port_name - Get device physical port name
6747 *	@dev: device
6748 *	@name: port name
6749 *	@len: limit of bytes to copy to name
6750 *
6751 *	Get device physical port name
6752 */
6753int dev_get_phys_port_name(struct net_device *dev,
6754			   char *name, size_t len)
6755{
6756	const struct net_device_ops *ops = dev->netdev_ops;
6757
6758	if (!ops->ndo_get_phys_port_name)
6759		return -EOPNOTSUPP;
6760	return ops->ndo_get_phys_port_name(dev, name, len);
6761}
6762EXPORT_SYMBOL(dev_get_phys_port_name);
6763
6764/**
6765 *	dev_change_proto_down - update protocol port state information
6766 *	@dev: device
6767 *	@proto_down: new value
6768 *
6769 *	This info can be used by switch drivers to set the phys state of the
6770 *	port.
6771 */
6772int dev_change_proto_down(struct net_device *dev, bool proto_down)
6773{
6774	const struct net_device_ops *ops = dev->netdev_ops;
6775
6776	if (!ops->ndo_change_proto_down)
6777		return -EOPNOTSUPP;
6778	if (!netif_device_present(dev))
6779		return -ENODEV;
6780	return ops->ndo_change_proto_down(dev, proto_down);
6781}
6782EXPORT_SYMBOL(dev_change_proto_down);
6783
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6784/**
6785 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
6786 *	@dev: device
 
6787 *	@fd: new program fd or negative value to clear
6788 *	@flags: xdp-related flags
6789 *
6790 *	Set or clear a bpf program for a device
6791 */
6792int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
 
6793{
6794	const struct net_device_ops *ops = dev->netdev_ops;
6795	struct bpf_prog *prog = NULL;
6796	struct netdev_xdp xdp;
6797	int err;
6798
6799	ASSERT_RTNL();
6800
6801	if (!ops->ndo_xdp)
 
6802		return -EOPNOTSUPP;
 
 
 
 
 
6803	if (fd >= 0) {
6804		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6805			memset(&xdp, 0, sizeof(xdp));
6806			xdp.command = XDP_QUERY_PROG;
6807
6808			err = ops->ndo_xdp(dev, &xdp);
6809			if (err < 0)
6810				return err;
6811			if (xdp.prog_attached)
6812				return -EBUSY;
6813		}
6814
6815		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
 
6816		if (IS_ERR(prog))
6817			return PTR_ERR(prog);
 
 
 
 
 
 
 
6818	}
6819
6820	memset(&xdp, 0, sizeof(xdp));
6821	xdp.command = XDP_SETUP_PROG;
6822	xdp.prog = prog;
6823
6824	err = ops->ndo_xdp(dev, &xdp);
6825	if (err < 0 && prog)
6826		bpf_prog_put(prog);
6827
6828	return err;
6829}
6830EXPORT_SYMBOL(dev_change_xdp_fd);
6831
6832/**
6833 *	dev_new_index	-	allocate an ifindex
6834 *	@net: the applicable net namespace
6835 *
6836 *	Returns a suitable unique value for a new device interface
6837 *	number.  The caller must hold the rtnl semaphore or the
6838 *	dev_base_lock to be sure it remains unique.
6839 */
6840static int dev_new_index(struct net *net)
6841{
6842	int ifindex = net->ifindex;
 
6843	for (;;) {
6844		if (++ifindex <= 0)
6845			ifindex = 1;
6846		if (!__dev_get_by_index(net, ifindex))
6847			return net->ifindex = ifindex;
6848	}
6849}
6850
6851/* Delayed registration/unregisteration */
6852static LIST_HEAD(net_todo_list);
6853DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6854
6855static void net_set_todo(struct net_device *dev)
6856{
6857	list_add_tail(&dev->todo_list, &net_todo_list);
6858	dev_net(dev)->dev_unreg_count++;
6859}
6860
/*
 * Tear down every device queued on @head through its unreg_list member.
 *
 * Devices that were never registered are warned about and dropped from
 * the list. The remaining ones are closed, unlisted and marked
 * NETREG_UNREGISTERING, then for each device: the qdisc is shut down,
 * NETDEV_UNREGISTER is sent, address lists are flushed, ndo_uninit() is
 * called if present, an RTM_DELLINK message is sent where applicable and
 * the kobject is unregistered. Finally one reference per device is
 * dropped with dev_put().
 *
 * Must not be called during the boot phase; the caller must hold the
 * RTNL lock.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* _safe variant: unregistered entries are deleted while iterating. */
	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}
	flush_all_backlogs();

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		*/
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* Build the RTM_DELLINK message now; it is sent only after
		 * ndo_uninit() below.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));
		WARN_ON(netdev_has_any_lower_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
6947
6948static void rollback_registered(struct net_device *dev)
6949{
6950	LIST_HEAD(single);
6951
6952	list_add(&dev->unreg_list, &single);
6953	rollback_registered_many(&single);
6954	list_del(&single);
6955}
6956
6957static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6958	struct net_device *upper, netdev_features_t features)
6959{
6960	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6961	netdev_features_t feature;
6962	int feature_bit;
6963
6964	for_each_netdev_feature(&upper_disables, feature_bit) {
6965		feature = __NETIF_F_BIT(feature_bit);
6966		if (!(upper->wanted_features & feature)
6967		    && (features & feature)) {
6968			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6969				   &feature, upper->name);
6970			features &= ~feature;
6971		}
6972	}
6973
6974	return features;
6975}
6976
6977static void netdev_sync_lower_features(struct net_device *upper,
6978	struct net_device *lower, netdev_features_t features)
6979{
6980	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6981	netdev_features_t feature;
6982	int feature_bit;
6983
6984	for_each_netdev_feature(&upper_disables, feature_bit) {
6985		feature = __NETIF_F_BIT(feature_bit);
6986		if (!(features & feature) && (lower->features & feature)) {
6987			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6988				   &feature, lower->name);
6989			lower->wanted_features &= ~feature;
6990			netdev_update_features(lower);
6991
6992			if (unlikely(lower->features & feature))
6993				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6994					    &feature, lower->name);
6995		}
6996	}
6997}
6998
/*
 * Return @features with inter-feature dependencies enforced: feature bits
 * whose prerequisites are missing are cleared, and NETIF_F_BUSY_POLL is
 * forced to match whether the driver provides ndo_busy_poll(). Each
 * dropped feature is reported against @dev, except the silent
 * TSO_MANGLEID, TSO_ECN and BUSY_POLL adjustments.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	/* IPv4 TSO needs either generic or IPv4 checksum offload. */
	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	/* IPv6 TSO needs either generic or IPv6 checksum offload. */
	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
		features &= ~NETIF_F_TSO_MANGLEID;

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* UFO needs SG and checksumming */
	if (features & NETIF_F_UFO) {
		/* maybe split UFO into V4 and V6? */
		if (!(features & NETIF_F_HW_CSUM) &&
		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
			netdev_dbg(dev,
				"Dropping NETIF_F_UFO since no checksum offload features.\n");
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			netdev_dbg(dev,
				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
			features &= ~NETIF_F_UFO;
		}
	}

	/* GSO partial features require GSO partial be set */
	if ((features & dev->gso_partial_features) &&
	    !(features & NETIF_F_GSO_PARTIAL)) {
		netdev_dbg(dev,
			   "Dropping partially supported GSO features since no GSO partial.\n");
		features &= ~dev->gso_partial_features;
	}

	/* Without CONFIG_NET_RX_BUSY_POLL only the clearing statement below
	 * is compiled; with it, that statement becomes the else branch.
	 */
#ifdef CONFIG_NET_RX_BUSY_POLL
	if (dev->netdev_ops->ndo_busy_poll)
		features |= NETIF_F_BUSY_POLL;
	else
#endif
		features &= ~NETIF_F_BUSY_POLL;

	return features;
}
7077
/*
 * Recompute the feature set of @dev and apply it.
 *
 * The wanted features are passed through the driver's ndo_fix_features()
 * (if any), through netdev_fix_features(), and are then restricted by
 * every upper device. If the result differs from dev->features it is
 * handed to ndo_set_features() (if any). Lower devices are synchronised
 * with the new set in either case. The caller must hold the RTNL lock.
 *
 * Returns 0 when the computed set equals dev->features, 1 when a new set
 * was applied or the driver returned a positive value, and -1 when
 * ndo_set_features() failed. Callers in this file treat any non-zero
 * value as "features may have changed".
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	/* Stays -1 on the "nothing changed" path; see the return below. */
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	/* Only store the new set when err is exactly 0; a positive return
	 * from ndo_set_features() leaves dev->features untouched here.
	 */
	if (!err)
		dev->features = features;

	return err < 0 ? 0 : 1;
}
7132
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	/* Zero means the feature set is unchanged: nothing to announce. */
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7147
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: always notify. */
	(void)__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7164
7165/**
7166 *	netif_stacked_transfer_operstate -	transfer operstate
7167 *	@rootdev: the root or lower level device to transfer state from
7168 *	@dev: the device to transfer operstate to
7169 *
7170 *	Transfer operational state from root to device. This is normally
7171 *	called when a stacking relationship exists between the root
7172 *	device and the device(a leaf device).
7173 */
7174void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7175					struct net_device *dev)
7176{
7177	if (rootdev->operstate == IF_OPER_DORMANT)
7178		netif_dormant_on(dev);
7179	else
7180		netif_dormant_off(dev);
7181
7182	if (netif_carrier_ok(rootdev)) {
7183		if (!netif_carrier_ok(dev))
7184			netif_carrier_on(dev);
7185	} else {
7186		if (netif_carrier_ok(dev))
7187			netif_carrier_off(dev);
7188	}
7189}
7190EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7191
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * kzalloc() is tried first and vzalloc() is the fallback.
 * Returns 0 on success or -ENOMEM when both allocations fail.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	size_t bytes = nqueues * sizeof(*queues);
	unsigned int q;

	BUG_ON(nqueues < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;
	for (q = 0; q < nqueues; q++)
		queues[q].dev = dev;

	return 0;
}
#endif
7214
7215static void netdev_init_one_queue(struct net_device *dev,
7216				  struct netdev_queue *queue, void *_unused)
7217{
7218	/* Initialize queue lock */
7219	spin_lock_init(&queue->_xmit_lock);
7220	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7221	queue->xmit_lock_owner = -1;
7222	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7223	queue->dev = dev;
7224#ifdef CONFIG_BQL
7225	dql_init(&queue->dql, HZ);
7226#endif
7227}
7228
7229static void netif_free_tx_queues(struct net_device *dev)
7230{
7231	kvfree(dev->_tx);
7232}
7233
7234static int netif_alloc_netdev_queues(struct net_device *dev)
7235{
7236	unsigned int count = dev->num_tx_queues;
7237	struct netdev_queue *tx;
7238	size_t sz = count * sizeof(*tx);
7239
7240	if (count < 1 || count > 0xffff)
7241		return -EINVAL;
7242
7243	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7244	if (!tx) {
7245		tx = vzalloc(sz);
7246		if (!tx)
7247			return -ENOMEM;
7248	}
7249	dev->_tx = tx;
7250
7251	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7252	spin_lock_init(&dev->tx_global_lock);
7253
7254	return 0;
7255}
7256
7257void netif_tx_stop_all_queues(struct net_device *dev)
7258{
7259	unsigned int i;
7260
7261	for (i = 0; i < dev->num_tx_queues; i++) {
7262		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
7263		netif_tx_stop_queue(txq);
7264	}
7265}
7266EXPORT_SYMBOL(netif_tx_stop_all_queues);
7267
7268/**
7269 *	register_netdevice	- register a network device
7270 *	@dev: device to register
7271 *
7272 *	Take a completed network device structure and add it to the kernel
7273 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7274 *	chain. 0 is returned on success. A negative errno code is returned
7275 *	on a failure to set up the device, or if the name is a duplicate.
7276 *
 *	Failures detected after a successful ndo_init() and before the
 *	device is listed unwind through ndo_uninit(). A failure reported
 *	by the %NETDEV_REGISTER notifier chain is undone with
 *	rollback_registered() instead.
 *
7277 *	Callers must hold the rtnl semaphore. You may want
7278 *	register_netdev() instead of this.
7279 *
7280 *	BUGS:
7281 *	The locking appears insufficient to guarantee two parallel registers
7282 *	will not get the same name.
7283 */
7284
7285int register_netdevice(struct net_device *dev)
7286{
7287	int ret;
7288	struct net *net = dev_net(dev);
7289
7290	BUG_ON(dev_boot_phase);
7291	ASSERT_RTNL();
7292
7293	might_sleep();
7294
7295	/* When net_device's are persistent, this will be fatal. */
7296	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7297	BUG_ON(!net);
7298
7299	spin_lock_init(&dev->addr_list_lock);
7300	netdev_set_addr_lockdep_class(dev);
7301
	/* Nothing has been set up yet, so a bad name can bail out directly. */
7302	ret = dev_get_valid_name(net, dev, dev->name);
7303	if (ret < 0)
7304		goto out;
7305
7306	/* Init, if this function is available */
7307	if (dev->netdev_ops->ndo_init) {
7308		ret = dev->netdev_ops->ndo_init(dev);
7309		if (ret) {
			/* A positive value is not an errno; report it as -EIO. */
7310			if (ret > 0)
7311				ret = -EIO;
7312			goto out;
7313		}
7314	}
7315
	/* Advertising CTAG filtering requires both VID add and kill hooks. */
7316	if (((dev->hw_features | dev->features) &
7317	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7318	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7319	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7320		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7321		ret = -EINVAL;
7322		goto err_uninit;
7323	}
7324
	/* Allocate an ifindex when none was preset; a preset index that is
	 * already present in this namespace fails with -EBUSY.
	 */
7325	ret = -EBUSY;
7326	if (!dev->ifindex)
7327		dev->ifindex = dev_new_index(net);
7328	else if (__dev_get_by_index(net, dev->ifindex))
7329		goto err_uninit;
7330
7331	/* Transfer changeable features to wanted_features and enable
7332	 * software offloads (GSO and GRO).
7333	 */
7334	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7335	dev->features |= NETIF_F_SOFT_FEATURES;
7336	dev->wanted_features = dev->features & dev->hw_features;
7337
7338	if (!(dev->flags & IFF_LOOPBACK))
7339		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7340
7341	/* If IPv4 TCP segmentation offload is supported we should also
7342	 * allow the device to enable segmenting the frame with the option
7343	 * of ignoring a static IP ID value.  This doesn't enable the
7344	 * feature itself but allows the user to enable it later.
7345	 */
7346	if (dev->hw_features & NETIF_F_TSO)
7347		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7348	if (dev->vlan_features & NETIF_F_TSO)
7349		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7350	if (dev->mpls_features & NETIF_F_TSO)
7351		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7352	if (dev->hw_enc_features & NETIF_F_TSO)
7353		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7354
7355	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7356	 */
7357	dev->vlan_features |= NETIF_F_HIGHDMA;
7358
7359	/* Make NETIF_F_SG inheritable to tunnel devices.
7360	 */
7361	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7362
7363	/* Make NETIF_F_SG inheritable to MPLS.
7364	 */
7365	dev->mpls_features |= NETIF_F_SG;
7366
	/* Notifier return values are translated to errno codes before use. */
7367	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7368	ret = notifier_to_errno(ret);
7369	if (ret)
7370		goto err_uninit;
7371
7372	ret = netdev_register_kobject(dev);
7373	if (ret)
7374		goto err_uninit;
7375	dev->reg_state = NETREG_REGISTERED;
7376
7377	__netdev_update_features(dev);
7378
7379	/*
7380	 *	Default initial state at registry is that the
7381	 *	device is present.
7382	 */
7383
7384	set_bit(__LINK_STATE_PRESENT, &dev->state);
7385
7386	linkwatch_init_dev(dev);
7387
7388	dev_init_scheduler(dev);
7389	dev_hold(dev);
7390	list_netdevice(dev);
7391	add_device_randomness(dev->dev_addr, dev->addr_len);
7392
7393	/* If the device has permanent device address, driver should
7394	 * set dev_addr and also addr_assign_type should be set to
7395	 * NET_ADDR_PERM (default value).
7396	 */
7397	if (dev->addr_assign_type == NET_ADDR_PERM)
7398		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7399
7400	/* Notify protocols, that a new device appeared. */
7401	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7402	ret = notifier_to_errno(ret);
7403	if (ret) {
		/* A notifier rejected the device: undo the registration.
		 * ret keeps the error so the caller sees the failure.
		 * NOTE(review): control still falls through to the
		 * RTM_NEWLINK notification below on this path - confirm
		 * that this is intended.
		 */
7404		rollback_registered(dev);
7405		dev->reg_state = NETREG_UNREGISTERED;
7406	}
7407	/*
7408	 *	Prevent userspace races by waiting until the network
7409	 *	device is fully setup before sending notifications.
7410	 */
7411	if (!dev->rtnl_link_ops ||
7412	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7413		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7414
7415out:
7416	return ret;
7417
7418err_uninit:
	/* Reached only after ndo_init succeeded (or was absent): let the
	 * driver undo its initialisation, then return ret via out.
	 */
7419	if (dev->netdev_ops->ndo_uninit)
7420		dev->netdev_ops->ndo_uninit(dev);
7421	goto out;
7422}
7423EXPORT_SYMBOL(register_netdevice);
7424
7425/**
7426 *	init_dummy_netdev	- init a dummy network device for NAPI
7427 *	@dev: device to init
7428 *
7429 *	This takes a network device structure and initialize the minimum
7430 *	amount of fields so it can be used to schedule NAPI polls without
7431 *	registering a full blown interface. This is to be used by drivers
7432 *	that need to tie several hardware interfaces to a single NAPI
7433 *	poll scheduler due to HW limitations.
7434 */
7435int init_dummy_netdev(struct net_device *dev)
7436{
7437	/* Clear everything. Note we don't initialize spinlocks
7438	 * are they aren't supposed to be taken by any of the
7439	 * NAPI code and this dummy netdev is supposed to be
7440	 * only ever used for NAPI polls
7441	 */
7442	memset(dev, 0, sizeof(struct net_device));
7443
7444	/* make sure we BUG if trying to hit standard
7445	 * register/unregister code path
7446	 */
7447	dev->reg_state = NETREG_DUMMY;
7448
7449	/* NAPI wants this */
7450	INIT_LIST_HEAD(&dev->napi_list);
7451
7452	/* a dummy interface is started by default */
7453	set_bit(__LINK_STATE_PRESENT, &dev->state);
7454	set_bit(__LINK_STATE_START, &dev->state);
7455
7456	/* Note : We dont allocate pcpu_refcnt for dummy devices,
7457	 * because users of this 'device' dont need to change
7458	 * its refcount.
7459	 */
7460
7461	return 0;
7462}
7463EXPORT_SYMBOL_GPL(init_dummy_netdev);
7464
7465
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked wrapper around register_netdevice(): takes the rtnl
 *	semaphore, registers @dev and releases the semaphore again.
 *	The device name is expanded if a format string was passed to
 *	alloc_netdev.
 *
 *	Returns whatever register_netdevice() returned: 0 on success or
 *	a negative errno code on failure.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
7489
7490int netdev_refcnt_read(const struct net_device *dev)
7491{
7492	int i, refcnt = 0;
7493
7494	for_each_possible_cpu(i)
7495		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7496	return refcnt;
7497}
7498EXPORT_SYMBOL(netdev_refcnt_read);
7499
7500/**
7501 * netdev_wait_allrefs - wait until all references are gone.
7502 * @dev: target net_device
7503 *
7504 * This is called when unregistering network devices.
7505 *
7506 * Any protocol or device that holds a reference should register
7507 * for netdevice notification, and cleanup and put back the
7508 * reference if they receive an UNREGISTER event.
7509 * We can get stuck here if buggy protocols don't correctly
7510 * call dev_put.
 *
 * The reference count is polled every 250 ms. While it stays non-zero
 * the UNREGISTER/UNREGISTER_FINAL notifications are re-sent once more
 * than a second has passed since the previous rebroadcast, and a
 * message is logged once more than ten seconds have passed since the
 * previous one.
7511 */
7512static void netdev_wait_allrefs(struct net_device *dev)
7513{
7514	unsigned long rebroadcast_time, warning_time;
7515	int refcnt;
7516
7517	linkwatch_forget_dev(dev);
7518
7519	rebroadcast_time = warning_time = jiffies;
7520	refcnt = netdev_refcnt_read(dev);
7521
7522	while (refcnt != 0) {
		/* More than one second since the last rebroadcast. */
7523		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7524			rtnl_lock();
7525
7526			/* Rebroadcast unregister notification */
7527			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7528
			/* rcu_barrier() runs with the rtnl lock dropped. */
7529			__rtnl_unlock();
7530			rcu_barrier();
7531			rtnl_lock();
7532
7533			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7534			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7535				     &dev->state)) {
7536				/* We must not have linkwatch events
7537				 * pending on unregister. If this
7538				 * happens, we simply run the queue
7539				 * unscheduled, resulting in a noop
7540				 * for this device.
7541				 */
7542				linkwatch_run_queue();
7543			}
7544
7545			__rtnl_unlock();
7546
7547			rebroadcast_time = jiffies;
7548		}
7549
7550		msleep(250);
7551
7552		refcnt = netdev_refcnt_read(dev);
7553
		/* Log again once ten seconds have passed since the last message. */
7554		if (time_after(jiffies, warning_time + 10 * HZ)) {
7555			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7556				 dev->name, refcnt);
7557			warning_time = jiffies;
7558		}
7559	}
7560}
7561
7562/* The sequence is:
7563 *
7564 *	rtnl_lock();
7565 *	...
7566 *	register_netdevice(x1);
7567 *	register_netdevice(x2);
7568 *	...
7569 *	unregister_netdevice(y1);
7570 *	unregister_netdevice(y2);
7571 *      ...
7572 *	rtnl_unlock();
7573 *	free_netdev(y1);
7574 *	free_netdev(y2);
7575 *
7576 * We are invoked by rtnl_unlock().
7577 * This allows us to deal with problems:
7578 * 1) We can delete sysfs objects which invoke hotplug
7579 *    without deadlocking with linkwatch via keventd.
7580 * 2) Since we run with the RTNL semaphore not held, we can sleep
7581 *    safely in order to wait for the netdev refcnt to drop to zero.
7582 *
7583 * We must not return until all unregister events added during
7584 * the interval the lock was held have been completed.
 *
 * For every device on the snapshot of net_todo_list this sends
 * NETDEV_UNREGISTER_FINAL, marks the device NETREG_UNREGISTERED, waits
 * for its references to drain, runs dev->destructor if one is set and
 * finally drops the kobject reference.
7585 */
7586void netdev_run_todo(void)
7587{
7588	struct list_head list;
7589
7590	/* Snapshot list, allow later requests */
7591	list_replace_init(&net_todo_list, &list);
7592
	/* The rtnl lock is released here; it is re-taken only around the
	 * notifier call and the dev_unreg_count update below.
	 */
7593	__rtnl_unlock();
7594
7595
7596	/* Wait for rcu callbacks to finish before next phase */
7597	if (!list_empty(&list))
7598		rcu_barrier();
7599
7600	while (!list_empty(&list)) {
7601		struct net_device *dev
7602			= list_first_entry(&list, struct net_device, todo_list);
7603		list_del(&dev->todo_list);
7604
7605		rtnl_lock();
7606		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7607		__rtnl_unlock();
7608
		/* Only devices in NETREG_UNREGISTERING state are processed;
		 * anything else is reported and skipped.
		 */
7609		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7610			pr_err("network todo '%s' but state %d\n",
7611			       dev->name, dev->reg_state);
7612			dump_stack();
7613			continue;
7614		}
7615
7616		dev->reg_state = NETREG_UNREGISTERED;
7617
		/* Sleeps until the reference count has dropped to zero. */
7618		netdev_wait_allrefs(dev);
7619
7620		/* paranoia */
7621		BUG_ON(netdev_refcnt_read(dev));
7622		BUG_ON(!list_empty(&dev->ptype_all));
7623		BUG_ON(!list_empty(&dev->ptype_specific));
7624		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7625		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7626		WARN_ON(dev->dn_ptr);
7627
7628		if (dev->destructor)
7629			dev->destructor(dev);
7630
7631		/* Report a network device has been unregistered */
7632		rtnl_lock();
7633		dev_net(dev)->dev_unreg_count--;
7634		__rtnl_unlock();
7635		wake_up(&netdev_unregistering_wq);
7636
7637		/* Free network device */
7638		kobject_put(&dev->dev.kobj);
7639	}
7640}
7641
7642/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7643 * all the same fields in the same order as net_device_stats, with only
7644 * the type differing, but rtnl_link_stats64 may have additional fields
7645 * at the end for newer counters.
7646 */
7647void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7648			     const struct net_device_stats *netdev_stats)
7649{
7650#if BITS_PER_LONG == 64
7651	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7652	memcpy(stats64, netdev_stats, sizeof(*stats64));
7653	/* zero out counters that only exist in rtnl_link_stats64 */
7654	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7655	       sizeof(*stats64) - sizeof(*netdev_stats));
7656#else
7657	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7658	const unsigned long *src = (const unsigned long *)netdev_stats;
7659	u64 *dst = (u64 *)stats64;
7660
7661	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7662	for (i = 0; i < n; i++)
7663		dst[i] = src[i];
7664	/* zero out counters that only exist in rtnl_link_stats64 */
7665	memset((char *)stats64 + n * sizeof(u64), 0,
7666	       sizeof(*stats64) - n * sizeof(u64));
7667#endif
7668}
7669EXPORT_SYMBOL(netdev_stats_to_stats64);
7670
7671/**
7672 *	dev_get_stats	- get network device statistics
7673 *	@dev: device to get statistics from
7674 *	@storage: place to store stats
7675 *
7676 *	Get network statistics from device. Return @storage.
7677 *	The device driver may provide its own method by setting
7678 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7679 *	otherwise the internal statistics structure is used.
7680 */
7681struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7682					struct rtnl_link_stats64 *storage)
7683{
7684	const struct net_device_ops *ops = dev->netdev_ops;
7685
7686	if (ops->ndo_get_stats64) {
7687		memset(storage, 0, sizeof(*storage));
7688		ops->ndo_get_stats64(dev, storage);
7689	} else if (ops->ndo_get_stats) {
7690		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7691	} else {
7692		netdev_stats_to_stats64(storage, &dev->stats);
7693	}
7694	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7695	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7696	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7697	return storage;
7698}
7699EXPORT_SYMBOL(dev_get_stats);
7700
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised, attached to noop_qdisc and published
 * in dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
7718
7719static const struct ethtool_ops default_ethtool_ops;
7720
7721void netdev_set_default_ethtool_ops(struct net_device *dev,
7722				    const struct ethtool_ops *ops)
7723{
7724	if (dev->ethtool_ops == &default_ethtool_ops)
7725		dev->ethtool_ops = ops;
7726}
7727EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7728
7729void netdev_freemem(struct net_device *dev)
7730{
7731	char *addr = (char *)dev - dev->padded;
7732
7733	kvfree(addr);
7734}
7735
7736/**
7737 *	alloc_netdev_mqs - allocate network device
7738 *	@sizeof_priv:		size of private data to allocate space for
7739 *	@name:			device name format string
7740 *	@name_assign_type: 	origin of device name
7741 *	@setup:			callback to initialize device
7742 *	@txqs:			the number of TX subqueues to allocate
7743 *	@rxqs:			the number of RX subqueues to allocate
7744 *
7745 *	Allocates a struct net_device with private data area for driver use
7746 *	and performs basic initialization.  Also allocates subqueue structs
7747 *	for each queue on the device.
 *
 *	Returns the new device, or NULL when @txqs is zero, when @rxqs is
 *	zero with CONFIG_SYSFS enabled, or when any allocation fails.
 *	@name must be shorter than the name field of struct net_device;
 *	a longer name triggers BUG_ON().
7748 */
7749struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7750		unsigned char name_assign_type,
7751		void (*setup)(struct net_device *),
7752		unsigned int txqs, unsigned int rxqs)
7753{
7754	struct net_device *dev;
7755	size_t alloc_size;
7756	struct net_device *p;
7757
7758	BUG_ON(strlen(name) >= sizeof(dev->name));
7759
7760	if (txqs < 1) {
7761		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7762		return NULL;
7763	}
7764
7765#ifdef CONFIG_SYSFS
7766	if (rxqs < 1) {
7767		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7768		return NULL;
7769	}
7770#endif
7771
7772	alloc_size = sizeof(struct net_device);
7773	if (sizeof_priv) {
7774		/* ensure 32-byte alignment of private area */
7775		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7776		alloc_size += sizeof_priv;
7777	}
7778	/* ensure 32-byte alignment of whole construct */
7779	alloc_size += NETDEV_ALIGN - 1;
7780
	/* Try kzalloc() first and fall back to vzalloc() if it fails. */
7781	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7782	if (!p)
7783		p = vzalloc(alloc_size);
7784	if (!p)
7785		return NULL;
7786
	/* dev is the aligned pointer inside the allocation; the distance
	 * to the real start is kept in dev->padded so that
	 * netdev_freemem() can recover the address to free.
	 */
7787	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7788	dev->padded = (char *)dev - (char *)p;
7789
7790	dev->pcpu_refcnt = alloc_percpu(int);
7791	if (!dev->pcpu_refcnt)
7792		goto free_dev;
7793
7794	if (dev_addr_init(dev))
7795		goto free_pcpu;
7796
7797	dev_mc_init(dev);
7798	dev_uc_init(dev);
7799
7800	dev_net_set(dev, &init_net);
7801
7802	dev->gso_max_size = GSO_MAX_SIZE;
7803	dev->gso_max_segs = GSO_MAX_SEGS;
7804
7805	INIT_LIST_HEAD(&dev->napi_list);
7806	INIT_LIST_HEAD(&dev->unreg_list);
7807	INIT_LIST_HEAD(&dev->close_list);
7808	INIT_LIST_HEAD(&dev->link_watch_list);
7809	INIT_LIST_HEAD(&dev->adj_list.upper);
7810	INIT_LIST_HEAD(&dev->adj_list.lower);
7811	INIT_LIST_HEAD(&dev->ptype_all);
7812	INIT_LIST_HEAD(&dev->ptype_specific);
7813#ifdef CONFIG_NET_SCHED
7814	hash_init(dev->qdisc_hash);
7815#endif
7816	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	/* Caller-supplied initialisation runs on the defaults set above. */
7817	setup(dev);
7818
	/* A zero tx_queue_len after setup() marks the device IFF_NO_QUEUE
	 * and is replaced by the default length.
	 */
7819	if (!dev->tx_queue_len) {
7820		dev->priv_flags |= IFF_NO_QUEUE;
7821		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7822	}
7823
7824	dev->num_tx_queues = txqs;
7825	dev->real_num_tx_queues = txqs;
7826	if (netif_alloc_netdev_queues(dev))
7827		goto free_all;
7828
7829#ifdef CONFIG_SYSFS
7830	dev->num_rx_queues = rxqs;
7831	dev->real_num_rx_queues = rxqs;
7832	if (netif_alloc_rx_queues(dev))
7833		goto free_all;
7834#endif
7835
	/* Cannot overflow: the length was checked by the BUG_ON() above. */
7836	strcpy(dev->name, name);
7837	dev->name_assign_type = name_assign_type;
7838	dev->group = INIT_NETDEV_GROUP;
7839	if (!dev->ethtool_ops)
7840		dev->ethtool_ops = &default_ethtool_ops;
7841
7842	nf_hook_ingress_init(dev);
7843
7844	return dev;
7845
	/* Queue allocation failed: free_netdev() releases everything. */
7846free_all:
7847	free_netdev(dev);
7848	return NULL;
7849
	/* Early failures: only the per-CPU counter and/or the raw block
	 * exist at this point.
	 */
7850free_pcpu:
7851	free_percpu(dev->pcpu_refcnt);
7852free_dev:
7853	netdev_freemem(dev);
7854	return NULL;
7855}
7856EXPORT_SYMBOL(alloc_netdev_mqs);
7857
7858/**
7859 *	free_netdev - free network device
7860 *	@dev: device
7861 *
7862 *	This function does the last stage of destroying an allocated device
7863 * 	interface. The reference to the device object is released.
7864 *	If this is the last reference then it will be freed.
7865 *	Must be called in process context.
 *
 *	A device that is still NETREG_UNINITIALIZED is freed directly with
 *	netdev_freemem(). Any other device must be NETREG_UNREGISTERED
 *	(enforced with BUG_ON()) and is released through put_device().
7866 */
7867void free_netdev(struct net_device *dev)
7868{
7869	struct napi_struct *p, *n;
7870
7871	might_sleep();
	/* Release the TX queue array and, with CONFIG_SYSFS, the RX one. */
7872	netif_free_tx_queues(dev);
7873#ifdef CONFIG_SYSFS
7874	kvfree(dev->_rx);
7875#endif
7876
7877	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7878
7879	/* Flush device addresses */
7880	dev_addr_flush(dev);
7881
	/* Remove every NAPI instance still linked on dev->napi_list. */
7882	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7883		netif_napi_del(p);
7884
7885	free_percpu(dev->pcpu_refcnt);
7886	dev->pcpu_refcnt = NULL;
7887
7888	/*  Compatibility with error handling in drivers */
7889	if (dev->reg_state == NETREG_UNINITIALIZED) {
7890		netdev_freemem(dev);
7891		return;
7892	}
7893
7894	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7895	dev->reg_state = NETREG_RELEASED;
7896
7897	/* will free via device release */
7898	put_device(&dev->dev);
7899}
7900EXPORT_SYMBOL(free_netdev);
7901
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are currently being received are done;
 *	later packets are not prevented from starting. The expedited RCU
 *	grace period is used when the rtnl lock is held, the normal one
 *	otherwise. May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7917
7918/**
7919 *	unregister_netdevice_queue - remove device from the kernel
7920 *	@dev: device
7921 *	@head: list
7922 *
7923 *	This function shuts down a device interface and removes it
7924 *	from the kernel tables.
7925 *	If head not NULL, device is queued to be unregistered later.
7926 *
7927 *	Callers must hold the rtnl semaphore.  You may want
7928 *	unregister_netdev() instead of this.
7929 */
7930
7931void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7932{
7933	ASSERT_RTNL();
7934
7935	if (head) {
7936		list_move_tail(&dev->unreg_list, head);
7937	} else {
7938		rollback_registered(dev);
7939		/* Finish processing unregister after unlock */
7940		net_set_todo(dev);
7941	}
7942}
7943EXPORT_SYMBOL(unregister_netdevice_queue);
7944
7945/**
7946 *	unregister_netdevice_many - unregister many devices
7947 *	@head: list of devices
7948 *
7949 *  Note: As most callers use a stack allocated list_head,
7950 *  we force a list_del() to make sure stack wont be corrupted later.
7951 */
7952void unregister_netdevice_many(struct list_head *head)
7953{
7954	struct net_device *dev;
7955
7956	if (!list_empty(head)) {
7957		rollback_registered_many(head);
7958		list_for_each_entry(dev, head, unreg_list)
7959			net_set_todo(dev);
7960		list_del(head);
7961	}
7962}
7963EXPORT_SYMBOL(unregister_netdevice_many);
7964
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Locked wrapper: calls unregister_netdevice() with the rtnl
 *	semaphore taken around it. In general this is the variant to use
 *	rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7983
7984/**
7985 *	dev_change_net_namespace - move device to different nethost namespace
7986 *	@dev: device
7987 *	@net: network namespace
7988 *	@pat: If not NULL name pattern to try if the current device name
7989 *	      is already taken in the destination network namespace.
7990 *
7991 *	This function shuts down a device interface and moves it
7992 *	to a new network namespace. On success 0 is returned, on
7993 *	a failure a negative errno code is returned.
 *
 *	-EINVAL is returned for devices flagged NETIF_F_NETNS_LOCAL or not
 *	in NETREG_REGISTERED state, -EEXIST when no usable name could be
 *	found in @net. Moving a device to the namespace it is already in
 *	succeeds without doing anything.
7994 *
7995 *	Callers must hold the rtnl semaphore.
7996 */
7997
7998int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7999{
8000	int err;
8001
8002	ASSERT_RTNL();
8003
8004	/* Don't allow namespace local devices to be moved. */
8005	err = -EINVAL;
8006	if (dev->features & NETIF_F_NETNS_LOCAL)
8007		goto out;
8008
8009	/* Ensure the device has been registered */
8010	if (dev->reg_state != NETREG_REGISTERED)
8011		goto out;
8012
8013	/* Get out if there is nothing todo */
8014	err = 0;
8015	if (net_eq(dev_net(dev), net))
8016		goto out;
8017
8018	/* Pick the destination device name, and ensure
8019	 * we can use it in the destination network namespace.
8020	 */
8021	err = -EEXIST;
8022	if (__dev_get_by_name(net, dev->name)) {
8023		/* We get here if we can't use the current device name */
8024		if (!pat)
8025			goto out;
8026		if (dev_get_valid_name(net, dev, pat) < 0)
8027			goto out;
8028	}
8029
8030	/*
8031	 * And now a mini version of register_netdevice unregister_netdevice.
8032	 */
8033
8034	/* If device is running close it first. */
8035	dev_close(dev);
8036
8037	/* And unlink it from device chain */
8038	err = -ENODEV;
8039	unlist_netdevice(dev);
8040
8041	synchronize_net();
8042
8043	/* Shutdown queueing discipline. */
8044	dev_shutdown(dev);
8045
8046	/* Notify protocols, that we are about to destroy
8047	   this device. They should clean all the things.
8048
8049	   Note that dev->reg_state stays at NETREG_REGISTERED.
8050	   This is wanted because this way 8021q and macvlan know
8051	   the device is just moving and can keep their slaves up.
8052	*/
8053	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8054	rcu_barrier();
8055	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8056	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
8057
8058	/*
8059	 *	Flush the unicast and multicast chains
8060	 */
8061	dev_uc_flush(dev);
8062	dev_mc_flush(dev);
8063
8064	/* Send a netdev-removed uevent to the old namespace */
8065	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8066	netdev_adjacent_del_links(dev);
8067
8068	/* Actually switch the network namespace */
8069	dev_net_set(dev, net);
8070
8071	/* If there is an ifindex conflict assign a new one */
8072	if (__dev_get_by_index(net, dev->ifindex))
8073		dev->ifindex = dev_new_index(net);
8074
8075	/* Send a netdev-add uevent to the new namespace */
8076	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8077	netdev_adjacent_add_links(dev);
8078
8079	/* Fixup kobjects */
	/* A rename failure only triggers a warning; the move continues
	 * and err is reset to 0 before returning.
	 */
8080	err = device_rename(&dev->dev, dev->name);
8081	WARN_ON(err);
8082
8083	/* Add the device back in the hashes */
8084	list_netdevice(dev);
8085
8086	/* Notify protocols, that a new device appeared. */
8087	call_netdevice_notifiers(NETDEV_REGISTER, dev);
8088
8089	/*
8090	 *	Prevent userspace races by waiting until the network
8091	 *	device is fully setup before sending notifications.
8092	 */
8093	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8094
8095	synchronize_net();
8096	err = 0;
8097out:
8098	return err;
8099}
8100EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8101
/*
 * Move the pending softnet work of CPU @oldcpu (described as the
 * offline CPU by the comments below) to the CPU running this function:
 * the completion queue, the output queue and the NAPI poll list are
 * spliced over with interrupts disabled, then the packets left in the
 * old CPU's process_queue and input_pkt_queue are re-injected through
 * netif_rx_ni(). Always returns 0.
 */
8102static int dev_cpu_dead(unsigned int oldcpu)
8103{
8104	struct sk_buff **list_skb;
8105	struct sk_buff *skb;
8106	unsigned int cpu;
8107	struct softnet_data *sd, *oldsd;
8108
8109	local_irq_disable();
8110	cpu = smp_processor_id();
8111	sd = &per_cpu(softnet_data, cpu);
8112	oldsd = &per_cpu(softnet_data, oldcpu);
8113
8114	/* Find end of our completion_queue. */
8115	list_skb = &sd->completion_queue;
8116	while (*list_skb)
8117		list_skb = &(*list_skb)->next;
8118	/* Append completion queue from offline CPU. */
8119	*list_skb = oldsd->completion_queue;
8120	oldsd->completion_queue = NULL;
8121
8122	/* Append output queue from offline CPU. */
8123	if (oldsd->output_queue) {
8124		*sd->output_queue_tailp = oldsd->output_queue;
8125		sd->output_queue_tailp = oldsd->output_queue_tailp;
8126		oldsd->output_queue = NULL;
		/* Reset the old tail pointer to the now empty list head. */
8127		oldsd->output_queue_tailp = &oldsd->output_queue;
8128	}
8129	/* Append NAPI poll list from offline CPU, with one exception :
8130	 * process_backlog() must be called by cpu owning percpu backlog.
8131	 * We properly handle process_queue & input_pkt_queue later.
8132	 */
8133	while (!list_empty(&oldsd->poll_list)) {
8134		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8135							    struct napi_struct,
8136							    poll_list);
8137
8138		list_del_init(&napi->poll_list);
		/* The backlog NAPI is not migrated; only its state is cleared. */
8139		if (napi->poll == process_backlog)
8140			napi->state = 0;
8141		else
8142			____napi_schedule(sd, napi);
8143	}
8144
8145	raise_softirq_irqoff(NET_TX_SOFTIRQ);
8146	local_irq_enable();
8147
8148	/* Process offline CPU's input_pkt_queue */
8149	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8150		netif_rx_ni(skb);
8151		input_queue_head_incr(oldsd);
8152	}
8153	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8154		netif_rx_ni(skb);
8155		input_queue_head_incr(oldsd);
8156	}
8157
8158	return 0;
8159}
8160
8161/**
8162 *	netdev_increment_features - increment feature set by one
8163 *	@all: current feature set
8164 *	@one: new feature set
8165 *	@mask: mask feature set
8166 *
8167 *	Computes a new feature set after adding a device with feature set
8168 *	@one to the master device with current feature set @all.  Will not
8169 *	enable anything that is off in @mask. Returns the new feature set.
8170 */
8171netdev_features_t netdev_increment_features(netdev_features_t all,
8172	netdev_features_t one, netdev_features_t mask)
8173{
8174	if (mask & NETIF_F_HW_CSUM)
8175		mask |= NETIF_F_CSUM_MASK;
8176	mask |= NETIF_F_VLAN_CHALLENGED;
8177
8178	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8179	all &= one | ~NETIF_F_ALL_FOR_ALL;
8180
8181	/* If one device supports hw checksumming, set for all. */
8182	if (all & NETIF_F_HW_CSUM)
8183		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8184
8185	return all;
8186}
8187EXPORT_SYMBOL(netdev_increment_features);
8188
8189static struct hlist_head * __net_init netdev_create_hash(void)
8190{
8191	int i;
8192	struct hlist_head *hash;
8193
8194	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8195	if (hash != NULL)
8196		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8197			INIT_HLIST_HEAD(&hash[i]);
8198
8199	return hash;
8200}
8201
8202/* Initialize per network namespace state */
8203static int __net_init netdev_init(struct net *net)
8204{
8205	if (net != &init_net)
8206		INIT_LIST_HEAD(&net->dev_base_head);
8207
8208	net->dev_name_head = netdev_create_hash();
8209	if (net->dev_name_head == NULL)
8210		goto err_name;
8211
8212	net->dev_index_head = netdev_create_hash();
8213	if (net->dev_index_head == NULL)
8214		goto err_idx;
8215
8216	return 0;
8217
8218err_idx:
8219	kfree(net->dev_name_head);
8220err_name:
8221	return -ENOMEM;
8222}
8223
8224/**
8225 *	netdev_drivername - network driver for the device
8226 *	@dev: network device
8227 *
8228 *	Determine network driver for device.
8229 */
8230const char *netdev_drivername(const struct net_device *dev)
8231{
8232	const struct device_driver *driver;
8233	const struct device *parent;
8234	const char *empty = "";
8235
8236	parent = dev->dev.parent;
8237	if (!parent)
8238		return empty;
8239
8240	driver = parent->driver;
8241	if (driver && driver->name)
8242		return driver->name;
8243	return empty;
8244}
8245
8246static void __netdev_printk(const char *level, const struct net_device *dev,
8247			    struct va_format *vaf)
8248{
8249	if (dev && dev->dev.parent) {
8250		dev_printk_emit(level[1] - '0',
8251				dev->dev.parent,
8252				"%s %s %s%s: %pV",
8253				dev_driver_string(dev->dev.parent),
8254				dev_name(dev->dev.parent),
8255				netdev_name(dev), netdev_reg_state(dev),
8256				vaf);
8257	} else if (dev) {
8258		printk("%s%s%s: %pV",
8259		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8260	} else {
8261		printk("%s(NULL net_device): %pV", level, vaf);
8262	}
8263}
8264
8265void netdev_printk(const char *level, const struct net_device *dev,
8266		   const char *format, ...)
8267{
8268	struct va_format vaf;
8269	va_list args;
8270
8271	va_start(args, format);
8272
8273	vaf.fmt = format;
8274	vaf.va = &args;
8275
8276	__netdev_printk(level, dev, &vaf);
8277
8278	va_end(args);
8279}
8280EXPORT_SYMBOL(netdev_printk);
8281
8282#define define_netdev_printk_level(func, level)			\
8283void func(const struct net_device *dev, const char *fmt, ...)	\
8284{								\
8285	struct va_format vaf;					\
8286	va_list args;						\
8287								\
8288	va_start(args, fmt);					\
8289								\
8290	vaf.fmt = fmt;						\
8291	vaf.va = &args;						\
8292								\
8293	__netdev_printk(level, dev, &vaf);			\
8294								\
8295	va_end(args);						\
8296}								\
8297EXPORT_SYMBOL(func);
8298
8299define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8300define_netdev_printk_level(netdev_alert, KERN_ALERT);
8301define_netdev_printk_level(netdev_crit, KERN_CRIT);
8302define_netdev_printk_level(netdev_err, KERN_ERR);
8303define_netdev_printk_level(netdev_warn, KERN_WARNING);
8304define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8305define_netdev_printk_level(netdev_info, KERN_INFO);
8306
8307static void __net_exit netdev_exit(struct net *net)
8308{
8309	kfree(net->dev_name_head);
8310	kfree(net->dev_index_head);
 
 
8311}
8312
8313static struct pernet_operations __net_initdata netdev_net_ops = {
8314	.init = netdev_init,
8315	.exit = netdev_exit,
8316};
8317
8318static void __net_exit default_device_exit(struct net *net)
8319{
8320	struct net_device *dev, *aux;
8321	/*
8322	 * Push all migratable network devices back to the
8323	 * initial network namespace
8324	 */
8325	rtnl_lock();
8326	for_each_netdev_safe(net, dev, aux) {
8327		int err;
8328		char fb_name[IFNAMSIZ];
8329
8330		/* Ignore unmoveable devices (i.e. loopback) */
8331		if (dev->features & NETIF_F_NETNS_LOCAL)
8332			continue;
8333
8334		/* Leave virtual devices for the generic cleanup */
8335		if (dev->rtnl_link_ops)
8336			continue;
8337
8338		/* Push remaining network devices to init_net */
8339		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8340		err = dev_change_net_namespace(dev, &init_net, fb_name);
8341		if (err) {
8342			pr_emerg("%s: failed to move %s to init_net: %d\n",
8343				 __func__, dev->name, err);
8344			BUG();
8345		}
8346	}
8347	rtnl_unlock();
8348}
8349
8350static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8351{
8352	/* Return with the rtnl_lock held when there are no network
8353	 * devices unregistering in any network namespace in net_list.
8354	 */
8355	struct net *net;
8356	bool unregistering;
8357	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8358
8359	add_wait_queue(&netdev_unregistering_wq, &wait);
8360	for (;;) {
8361		unregistering = false;
8362		rtnl_lock();
8363		list_for_each_entry(net, net_list, exit_list) {
8364			if (net->dev_unreg_count > 0) {
8365				unregistering = true;
8366				break;
8367			}
8368		}
8369		if (!unregistering)
8370			break;
8371		__rtnl_unlock();
8372
8373		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8374	}
8375	remove_wait_queue(&netdev_unregistering_wq, &wait);
8376}
8377
8378static void __net_exit default_device_exit_batch(struct list_head *net_list)
8379{
8380	/* At exit all network devices most be removed from a network
8381	 * namespace.  Do this in the reverse order of registration.
8382	 * Do this across as many network namespaces as possible to
8383	 * improve batching efficiency.
8384	 */
8385	struct net_device *dev;
8386	struct net *net;
8387	LIST_HEAD(dev_kill_list);
8388
8389	/* To prevent network device cleanup code from dereferencing
8390	 * loopback devices or network devices that have been freed
8391	 * wait here for all pending unregistrations to complete,
8392	 * before unregistring the loopback device and allowing the
8393	 * network namespace be freed.
8394	 *
8395	 * The netdev todo list containing all network devices
8396	 * unregistrations that happen in default_device_exit_batch
8397	 * will run in the rtnl_unlock() at the end of
8398	 * default_device_exit_batch.
8399	 */
8400	rtnl_lock_unregistering(net_list);
8401	list_for_each_entry(net, net_list, exit_list) {
8402		for_each_netdev_reverse(net, dev) {
8403			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8404				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8405			else
8406				unregister_netdevice_queue(dev, &dev_kill_list);
8407		}
8408	}
8409	unregister_netdevice_many(&dev_kill_list);
8410	rtnl_unlock();
8411}
8412
8413static struct pernet_operations __net_initdata default_device_ops = {
8414	.exit = default_device_exit,
8415	.exit_batch = default_device_exit_batch,
8416};
8417
8418/*
8419 *	Initialize the DEV module. At boot time this walks the device list and
8420 *	unhooks any devices that fail to initialise (normally hardware not
8421 *	present) and leaves us with a valid list of present and active devices.
8422 *
8423 */
8424
8425/*
8426 *       This is called single threaded during boot, so no need
8427 *       to take the rtnl semaphore.
8428 */
8429static int __init net_dev_init(void)
8430{
8431	int i, rc = -ENOMEM;
8432
8433	BUG_ON(!dev_boot_phase);
8434
8435	if (dev_proc_init())
8436		goto out;
8437
8438	if (netdev_kobject_init())
8439		goto out;
8440
8441	INIT_LIST_HEAD(&ptype_all);
8442	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8443		INIT_LIST_HEAD(&ptype_base[i]);
8444
8445	INIT_LIST_HEAD(&offload_base);
8446
8447	if (register_pernet_subsys(&netdev_net_ops))
8448		goto out;
8449
8450	/*
8451	 *	Initialise the packet receive queues.
8452	 */
8453
8454	for_each_possible_cpu(i) {
8455		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8456		struct softnet_data *sd = &per_cpu(softnet_data, i);
8457
8458		INIT_WORK(flush, flush_backlog);
8459
8460		skb_queue_head_init(&sd->input_pkt_queue);
8461		skb_queue_head_init(&sd->process_queue);
 
 
 
8462		INIT_LIST_HEAD(&sd->poll_list);
8463		sd->output_queue_tailp = &sd->output_queue;
8464#ifdef CONFIG_RPS
8465		sd->csd.func = rps_trigger_softirq;
8466		sd->csd.info = sd;
8467		sd->cpu = i;
8468#endif
8469
8470		sd->backlog.poll = process_backlog;
8471		sd->backlog.weight = weight_p;
8472	}
8473
8474	dev_boot_phase = 0;
8475
8476	/* The loopback device is special if any other network devices
8477	 * is present in a network namespace the loopback device must
8478	 * be present. Since we now dynamically allocate and free the
8479	 * loopback device ensure this invariant is maintained by
8480	 * keeping the loopback device as the first device on the
8481	 * list of network devices.  Ensuring the loopback devices
8482	 * is the first device that appears and the last network device
8483	 * that disappears.
8484	 */
8485	if (register_pernet_device(&loopback_net_ops))
8486		goto out;
8487
8488	if (register_pernet_device(&default_device_ops))
8489		goto out;
8490
8491	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8492	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8493
8494	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8495				       NULL, dev_cpu_dead);
8496	WARN_ON(rc < 0);
8497	dst_subsys_init();
8498	rc = 0;
8499out:
8500	return rc;
8501}
8502
8503subsys_initcall(net_dev_init);