    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
    4 *
    5 *	Derived from the non-IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/string.h>
   83#include <linux/mm.h>
   84#include <linux/socket.h>
   85#include <linux/sockios.h>
   86#include <linux/errno.h>
   87#include <linux/interrupt.h>
   88#include <linux/if_ether.h>
   89#include <linux/netdevice.h>
   90#include <linux/etherdevice.h>
   91#include <linux/ethtool.h>
   92#include <linux/skbuff.h>
   93#include <linux/bpf.h>
   94#include <linux/bpf_trace.h>
   95#include <net/net_namespace.h>
   96#include <net/sock.h>
   97#include <net/busy_poll.h>
   98#include <linux/rtnetlink.h>
   99#include <linux/stat.h>
  100#include <net/dst.h>
  101#include <net/dst_metadata.h>
  102#include <net/pkt_sched.h>
  103#include <net/pkt_cls.h>
  104#include <net/checksum.h>
  105#include <net/xfrm.h>
  106#include <linux/highmem.h>
  107#include <linux/init.h>
  108#include <linux/module.h>
  109#include <linux/netpoll.h>
  110#include <linux/rcupdate.h>
  111#include <linux/delay.h>
  112#include <net/iw_handler.h>
  113#include <asm/current.h>
  114#include <linux/audit.h>
  115#include <linux/dmaengine.h>
  116#include <linux/err.h>
  117#include <linux/ctype.h>
  118#include <linux/if_arp.h>
  119#include <linux/if_vlan.h>
  120#include <linux/ip.h>
  121#include <net/ip.h>
  122#include <net/mpls.h>
  123#include <linux/ipv6.h>
  124#include <linux/in.h>
  125#include <linux/jhash.h>
  126#include <linux/random.h>
  127#include <trace/events/napi.h>
  128#include <trace/events/net.h>
  129#include <trace/events/skb.h>
  130#include <linux/inetdevice.h>
  131#include <linux/cpu_rmap.h>
  132#include <linux/static_key.h>
  133#include <linux/hashtable.h>
  134#include <linux/vmalloc.h>
  135#include <linux/if_macvlan.h>
  136#include <linux/errqueue.h>
  137#include <linux/hrtimer.h>
  138#include <linux/netfilter_ingress.h>
  139#include <linux/crash_dump.h>
  140#include <linux/sctp.h>
  141#include <net/udp_tunnel.h>
  142#include <linux/net_namespace.h>
  143#include <linux/indirect_call_wrapper.h>
  144#include <net/devlink.h>
  145
  146#include "net-sysfs.h"
  147
  148#define MAX_GRO_SKBS 8
  149#define MAX_NEST_DEV 8
  150
  151/* This should be increased if a protocol with a bigger head is added. */
  152#define GRO_MAX_HEAD (MAX_HEADER + 128)
  153
  154static DEFINE_SPINLOCK(ptype_lock);
  155static DEFINE_SPINLOCK(offload_lock);
  156struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  157struct list_head ptype_all __read_mostly;	/* Taps */
  158static struct list_head offload_base __read_mostly;
  159
  160static int netif_rx_internal(struct sk_buff *skb);
  161static int call_netdevice_notifiers_info(unsigned long val,
  162					 struct netdev_notifier_info *info);
  163static int call_netdevice_notifiers_extack(unsigned long val,
  164					   struct net_device *dev,
  165					   struct netlink_ext_ack *extack);
  166static struct napi_struct *napi_by_id(unsigned int napi_id);
  167
  168/*
  169 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  170 * semaphore.
  171 *
  172 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  173 *
  174 * Writers must hold the rtnl semaphore while they loop through the
  175 * dev_base_head list, and hold dev_base_lock for writing when they do the
  176 * actual updates.  This allows pure readers to access the list even
  177 * while a writer is preparing to update it.
  178 *
  179 * To put it another way, dev_base_lock is held for writing only to
  180 * protect against pure readers; the rtnl semaphore provides the
  181 * protection against other writers.
  182 *
   183 * See register_netdevice() and unregister_netdevice() for example
   184 * usages; both must be called with the rtnl
   185 * semaphore held.
  186 */
  187DEFINE_RWLOCK(dev_base_lock);
  188EXPORT_SYMBOL(dev_base_lock);
  189
  190static DEFINE_MUTEX(ifalias_mutex);
  191
  192/* protects napi_hash addition/deletion and napi_gen_id */
  193static DEFINE_SPINLOCK(napi_hash_lock);
  194
  195static unsigned int napi_gen_id = NR_CPUS;
  196static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  197
  198static seqcount_t devnet_rename_seq;
  199
  200static inline void dev_base_seq_inc(struct net *net)
  201{
  202	while (++net->dev_base_seq == 0)
  203		;
  204}
  205
  206static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  207{
  208	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  209
  210	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  211}
  212
  213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  214{
  215	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  216}
  217
  218static inline void rps_lock(struct softnet_data *sd)
  219{
  220#ifdef CONFIG_RPS
  221	spin_lock(&sd->input_pkt_queue.lock);
  222#endif
  223}
  224
  225static inline void rps_unlock(struct softnet_data *sd)
  226{
  227#ifdef CONFIG_RPS
  228	spin_unlock(&sd->input_pkt_queue.lock);
  229#endif
  230}
  231
  232/* Device list insertion */
  233static void list_netdevice(struct net_device *dev)
  234{
  235	struct net *net = dev_net(dev);
  236
  237	ASSERT_RTNL();
  238
  239	write_lock_bh(&dev_base_lock);
  240	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  241	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
  242	hlist_add_head_rcu(&dev->index_hlist,
  243			   dev_index_hash(net, dev->ifindex));
  244	write_unlock_bh(&dev_base_lock);
  245
  246	dev_base_seq_inc(net);
  247}
  248
  249/* Device list removal
   250 * caller must respect an RCU grace period before freeing/reusing dev
  251 */
  252static void unlist_netdevice(struct net_device *dev)
  253{
  254	ASSERT_RTNL();
  255
  256	/* Unlink dev from the device chain */
  257	write_lock_bh(&dev_base_lock);
  258	list_del_rcu(&dev->dev_list);
  259	hlist_del_rcu(&dev->name_hlist);
  260	hlist_del_rcu(&dev->index_hlist);
  261	write_unlock_bh(&dev_base_lock);
  262
  263	dev_base_seq_inc(dev_net(dev));
  264}
  265
  266/*
  267 *	Our notifier list
  268 */
  269
  270static RAW_NOTIFIER_HEAD(netdev_chain);
  271
  272/*
  273 *	Device drivers call our routines to queue packets here. We empty the
  274 *	queue in the local softnet handler.
  275 */
  276
  277DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  278EXPORT_PER_CPU_SYMBOL(softnet_data);
  279
  280/*******************************************************************************
  281 *
  282 *		Protocol management and registration routines
  283 *
  284 *******************************************************************************/
  285
  286
  287/*
  288 *	Add a protocol ID to the list. Now that the input handler is
  289 *	smarter we can dispense with all the messy stuff that used to be
  290 *	here.
  291 *
   292 *	BEWARE!!! Protocol handlers that mangle input packets
   293 *	MUST BE last in the hash buckets, and the walk over protocol
   294 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
   295 *	This is true now; do not change it.
   296 *	Explanation: if a packet-mangling protocol handler were
   297 *	first on the list, it could not sense that the packet
   298 *	is cloned and should be copied-on-write, so it would
   299 *	change it and subsequent readers would get a broken packet.
  300 *							--ANK (980803)
  301 */
  302
  303static inline struct list_head *ptype_head(const struct packet_type *pt)
  304{
  305	if (pt->type == htons(ETH_P_ALL))
  306		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  307	else
  308		return pt->dev ? &pt->dev->ptype_specific :
  309				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  310}
  311
  312/**
  313 *	dev_add_pack - add packet handler
  314 *	@pt: packet type declaration
  315 *
  316 *	Add a protocol handler to the networking stack. The passed &packet_type
  317 *	is linked into kernel lists and may not be freed until it has been
  318 *	removed from the kernel lists.
  319 *
   320 *	This call does not sleep, therefore it cannot
   321 *	guarantee that all CPUs in the middle of receiving packets
   322 *	will see the new packet type (until the next received packet).
  323 */
  324
  325void dev_add_pack(struct packet_type *pt)
  326{
  327	struct list_head *head = ptype_head(pt);
  328
  329	spin_lock(&ptype_lock);
  330	list_add_rcu(&pt->list, head);
  331	spin_unlock(&ptype_lock);
  332}
  333EXPORT_SYMBOL(dev_add_pack);
  334
  335/**
  336 *	__dev_remove_pack	 - remove packet handler
  337 *	@pt: packet type declaration
  338 *
  339 *	Remove a protocol handler that was previously added to the kernel
  340 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  341 *	from the kernel lists and can be freed or reused once this function
  342 *	returns.
  343 *
  344 *      The packet type might still be in use by receivers
   345 *	and must not be freed until after all the CPUs have gone
  346 *	through a quiescent state.
  347 */
  348void __dev_remove_pack(struct packet_type *pt)
  349{
  350	struct list_head *head = ptype_head(pt);
  351	struct packet_type *pt1;
  352
  353	spin_lock(&ptype_lock);
  354
  355	list_for_each_entry(pt1, head, list) {
  356		if (pt == pt1) {
  357			list_del_rcu(&pt->list);
  358			goto out;
  359		}
  360	}
  361
  362	pr_warn("dev_remove_pack: %p not found\n", pt);
  363out:
  364	spin_unlock(&ptype_lock);
  365}
  366EXPORT_SYMBOL(__dev_remove_pack);
  367
  368/**
  369 *	dev_remove_pack	 - remove packet handler
  370 *	@pt: packet type declaration
  371 *
  372 *	Remove a protocol handler that was previously added to the kernel
  373 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  374 *	from the kernel lists and can be freed or reused once this function
  375 *	returns.
  376 *
  377 *	This call sleeps to guarantee that no CPU is looking at the packet
  378 *	type after return.
  379 */
  380void dev_remove_pack(struct packet_type *pt)
  381{
  382	__dev_remove_pack(pt);
  383
  384	synchronize_net();
  385}
  386EXPORT_SYMBOL(dev_remove_pack);
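/*
 * Editorial usage sketch (not part of the original file; the names
 * example_rcv and example_ptype are hypothetical): a module that taps
 * every incoming packet could pair dev_add_pack() with
 * dev_remove_pack() like this:
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		consume_skb(skb);	(the tap owns this reference)
 *		return 0;
 *	}
 *
 *	static struct packet_type example_ptype __read_mostly = {
 *		.type = htons(ETH_P_ALL),	(tap all protocols)
 *		.func = example_rcv,
 *	};
 *
 *	dev_add_pack(&example_ptype);		(does not sleep)
 *	...
 *	dev_remove_pack(&example_ptype);	(sleeps; see above)
 */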
  387
  388
  389/**
  390 *	dev_add_offload - register offload handlers
  391 *	@po: protocol offload declaration
  392 *
  393 *	Add protocol offload handlers to the networking stack. The passed
  394 *	&proto_offload is linked into kernel lists and may not be freed until
  395 *	it has been removed from the kernel lists.
  396 *
   397 *	This call does not sleep, therefore it cannot
   398 *	guarantee that all CPUs in the middle of receiving packets
   399 *	will see the new offload handlers (until the next received packet).
  400 */
  401void dev_add_offload(struct packet_offload *po)
  402{
  403	struct packet_offload *elem;
  404
  405	spin_lock(&offload_lock);
  406	list_for_each_entry(elem, &offload_base, list) {
  407		if (po->priority < elem->priority)
  408			break;
  409	}
  410	list_add_rcu(&po->list, elem->list.prev);
  411	spin_unlock(&offload_lock);
  412}
  413EXPORT_SYMBOL(dev_add_offload);
  414
  415/**
  416 *	__dev_remove_offload	 - remove offload handler
  417 *	@po: packet offload declaration
  418 *
  419 *	Remove a protocol offload handler that was previously added to the
  420 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
  421 *	is removed from the kernel lists and can be freed or reused once this
  422 *	function returns.
  423 *
  424 *      The packet type might still be in use by receivers
   425 *	and must not be freed until after all the CPUs have gone
  426 *	through a quiescent state.
  427 */
  428static void __dev_remove_offload(struct packet_offload *po)
  429{
  430	struct list_head *head = &offload_base;
  431	struct packet_offload *po1;
  432
  433	spin_lock(&offload_lock);
  434
  435	list_for_each_entry(po1, head, list) {
  436		if (po == po1) {
  437			list_del_rcu(&po->list);
  438			goto out;
  439		}
  440	}
  441
  442	pr_warn("dev_remove_offload: %p not found\n", po);
  443out:
  444	spin_unlock(&offload_lock);
  445}
  446
  447/**
  448 *	dev_remove_offload	 - remove packet offload handler
  449 *	@po: packet offload declaration
  450 *
  451 *	Remove a packet offload handler that was previously added to the kernel
  452 *	offload handlers by dev_add_offload(). The passed &offload_type is
  453 *	removed from the kernel lists and can be freed or reused once this
  454 *	function returns.
  455 *
  456 *	This call sleeps to guarantee that no CPU is looking at the packet
  457 *	type after return.
  458 */
  459void dev_remove_offload(struct packet_offload *po)
  460{
  461	__dev_remove_offload(po);
  462
  463	synchronize_net();
  464}
  465EXPORT_SYMBOL(dev_remove_offload);
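/*
 * Editorial sketch (assumed names, not from this file): offload
 * handlers are kept sorted by ascending ->priority, so a lower value
 * runs earlier. A protocol might register its GRO callbacks as:
 *
 *	static struct packet_offload example_offload __read_mostly = {
 *		.type = htons(ETH_P_IP),	(hypothetical protocol)
 *		.priority = 10,
 *		.callbacks = {
 *			.gro_receive  = example_gro_receive,
 *			.gro_complete = example_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&example_offload);	(does not sleep)
 *	...
 *	dev_remove_offload(&example_offload);	(sleeps)
 */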
  466
  467/******************************************************************************
  468 *
  469 *		      Device Boot-time Settings Routines
  470 *
  471 ******************************************************************************/
  472
  473/* Boot time configuration table */
  474static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
  475
  476/**
  477 *	netdev_boot_setup_add	- add new setup entry
  478 *	@name: name of the device
  479 *	@map: configured settings for the device
  480 *
   481 *	Adds a new setup entry to the dev_boot_setup list.  The function
   482 *	returns 0 on error and 1 on success.  This is a generic routine for
   483 *	all netdevices.
  484 */
  485static int netdev_boot_setup_add(char *name, struct ifmap *map)
  486{
  487	struct netdev_boot_setup *s;
  488	int i;
  489
  490	s = dev_boot_setup;
  491	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  492		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
  493			memset(s[i].name, 0, sizeof(s[i].name));
  494			strlcpy(s[i].name, name, IFNAMSIZ);
  495			memcpy(&s[i].map, map, sizeof(s[i].map));
  496			break;
  497		}
  498	}
  499
  500	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
  501}
  502
  503/**
  504 * netdev_boot_setup_check	- check boot time settings
  505 * @dev: the netdevice
  506 *
  507 * Check boot time settings for the device.
  508 * The found settings are set for the device to be used
  509 * later in the device probing.
   510 * Returns 0 if no settings are found, 1 if they are.
  511 */
  512int netdev_boot_setup_check(struct net_device *dev)
  513{
  514	struct netdev_boot_setup *s = dev_boot_setup;
  515	int i;
  516
  517	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  518		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
  519		    !strcmp(dev->name, s[i].name)) {
  520			dev->irq = s[i].map.irq;
  521			dev->base_addr = s[i].map.base_addr;
  522			dev->mem_start = s[i].map.mem_start;
  523			dev->mem_end = s[i].map.mem_end;
  524			return 1;
  525		}
  526	}
  527	return 0;
  528}
  529EXPORT_SYMBOL(netdev_boot_setup_check);
  530
  531
  532/**
  533 * netdev_boot_base	- get address from boot time settings
  534 * @prefix: prefix for network device
  535 * @unit: id for network device
  536 *
  537 * Check boot time settings for the base address of device.
  538 * The found settings are set for the device to be used
  539 * later in the device probing.
   540 * Returns 0 if no settings are found.
  541 */
  542unsigned long netdev_boot_base(const char *prefix, int unit)
  543{
  544	const struct netdev_boot_setup *s = dev_boot_setup;
  545	char name[IFNAMSIZ];
  546	int i;
  547
  548	sprintf(name, "%s%d", prefix, unit);
  549
  550	/*
  551	 * If device already registered then return base of 1
  552	 * to indicate not to probe for this interface
  553	 */
  554	if (__dev_get_by_name(&init_net, name))
  555		return 1;
  556
  557	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
  558		if (!strcmp(name, s[i].name))
  559			return s[i].map.base_addr;
  560	return 0;
  561}
  562
  563/*
   564 * Saves the settings configured at boot time for any netdevice.
  565 */
  566int __init netdev_boot_setup(char *str)
  567{
  568	int ints[5];
  569	struct ifmap map;
  570
  571	str = get_options(str, ARRAY_SIZE(ints), ints);
  572	if (!str || !*str)
  573		return 0;
  574
  575	/* Save settings */
  576	memset(&map, 0, sizeof(map));
  577	if (ints[0] > 0)
  578		map.irq = ints[1];
  579	if (ints[0] > 1)
  580		map.base_addr = ints[2];
  581	if (ints[0] > 2)
  582		map.mem_start = ints[3];
  583	if (ints[0] > 3)
  584		map.mem_end = ints[4];
  585
  586	/* Add new entry to the list */
  587	return netdev_boot_setup_add(str, &map);
  588}
  589
  590__setup("netdev=", netdev_boot_setup);
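/*
 * Editorial example: with the parser above, a command line such as
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * stores irq=9 and base_addr=0x300 for a device that will be named
 * "eth0". The general format is
 * netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>, with
 * trailing integers optional.
 */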
  591
  592/*******************************************************************************
  593 *
  594 *			    Device Interface Subroutines
  595 *
  596 *******************************************************************************/
  597
  598/**
   599 *	dev_get_iflink	- get 'iflink' value of an interface
  600 *	@dev: targeted interface
  601 *
  602 *	Indicates the ifindex the interface is linked to.
  603 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  604 */
  605
  606int dev_get_iflink(const struct net_device *dev)
  607{
  608	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  609		return dev->netdev_ops->ndo_get_iflink(dev);
  610
  611	return dev->ifindex;
  612}
  613EXPORT_SYMBOL(dev_get_iflink);
  614
  615/**
  616 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  617 *	@dev: targeted interface
  618 *	@skb: The packet.
  619 *
   620 *	For better visibility of tunnel traffic, OVS needs to retrieve
   621 *	egress tunnel information for a packet. The following API allows
   622 *	the user to get this info.
  623 */
  624int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  625{
  626	struct ip_tunnel_info *info;
  627
  628	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  629		return -EINVAL;
  630
  631	info = skb_tunnel_info_unclone(skb);
  632	if (!info)
  633		return -ENOMEM;
  634	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  635		return -EINVAL;
  636
  637	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  638}
  639EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  640
  641/**
  642 *	__dev_get_by_name	- find a device by its name
  643 *	@net: the applicable net namespace
  644 *	@name: name to find
  645 *
  646 *	Find an interface by name. Must be called under RTNL semaphore
  647 *	or @dev_base_lock. If the name is found a pointer to the device
  648 *	is returned. If the name is not found then %NULL is returned. The
  649 *	reference counters are not incremented so the caller must be
  650 *	careful with locks.
  651 */
  652
  653struct net_device *__dev_get_by_name(struct net *net, const char *name)
  654{
  655	struct net_device *dev;
  656	struct hlist_head *head = dev_name_hash(net, name);
  657
  658	hlist_for_each_entry(dev, head, name_hlist)
  659		if (!strncmp(dev->name, name, IFNAMSIZ))
  660			return dev;
  661
  662	return NULL;
  663}
  664EXPORT_SYMBOL(__dev_get_by_name);
  665
  666/**
  667 * dev_get_by_name_rcu	- find a device by its name
  668 * @net: the applicable net namespace
  669 * @name: name to find
  670 *
  671 * Find an interface by name.
  672 * If the name is found a pointer to the device is returned.
  673 * If the name is not found then %NULL is returned.
  674 * The reference counters are not incremented so the caller must be
  675 * careful with locks. The caller must hold RCU lock.
  676 */
  677
  678struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  679{
  680	struct net_device *dev;
  681	struct hlist_head *head = dev_name_hash(net, name);
  682
  683	hlist_for_each_entry_rcu(dev, head, name_hlist)
  684		if (!strncmp(dev->name, name, IFNAMSIZ))
  685			return dev;
  686
  687	return NULL;
  688}
  689EXPORT_SYMBOL(dev_get_by_name_rcu);
  690
  691/**
  692 *	dev_get_by_name		- find a device by its name
  693 *	@net: the applicable net namespace
  694 *	@name: name to find
  695 *
  696 *	Find an interface by name. This can be called from any
  697 *	context and does its own locking. The returned handle has
  698 *	the usage count incremented and the caller must use dev_put() to
  699 *	release it when it is no longer needed. %NULL is returned if no
  700 *	matching device is found.
  701 */
  702
  703struct net_device *dev_get_by_name(struct net *net, const char *name)
  704{
  705	struct net_device *dev;
  706
  707	rcu_read_lock();
  708	dev = dev_get_by_name_rcu(net, name);
  709	if (dev)
  710		dev_hold(dev);
  711	rcu_read_unlock();
  712	return dev;
  713}
  714EXPORT_SYMBOL(dev_get_by_name);
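/*
 * Editorial sketch of the two lookup patterns above ("eth0" is just an
 * example name):
 *
 *	(refcounted: any context, must be balanced by dev_put())
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 *
 *	(RCU: no reference taken, pointer only valid inside the section)
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		...;
 *	rcu_read_unlock();
 */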
  715
  716/**
  717 *	__dev_get_by_index - find a device by its ifindex
  718 *	@net: the applicable net namespace
  719 *	@ifindex: index of device
  720 *
   721 *	Search for an interface by index. Returns a pointer to the device,
   722 *	or %NULL if the device is not found. The device has not
  723 *	had its reference counter increased so the caller must be careful
  724 *	about locking. The caller must hold either the RTNL semaphore
  725 *	or @dev_base_lock.
  726 */
  727
  728struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  729{
  730	struct net_device *dev;
  731	struct hlist_head *head = dev_index_hash(net, ifindex);
  732
  733	hlist_for_each_entry(dev, head, index_hlist)
  734		if (dev->ifindex == ifindex)
  735			return dev;
  736
  737	return NULL;
  738}
  739EXPORT_SYMBOL(__dev_get_by_index);
  740
  741/**
  742 *	dev_get_by_index_rcu - find a device by its ifindex
  743 *	@net: the applicable net namespace
  744 *	@ifindex: index of device
  745 *
   746 *	Search for an interface by index. Returns a pointer to the device,
   747 *	or %NULL if the device is not found. The device has not
  748 *	had its reference counter increased so the caller must be careful
  749 *	about locking. The caller must hold RCU lock.
  750 */
  751
  752struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  753{
  754	struct net_device *dev;
  755	struct hlist_head *head = dev_index_hash(net, ifindex);
  756
  757	hlist_for_each_entry_rcu(dev, head, index_hlist)
  758		if (dev->ifindex == ifindex)
  759			return dev;
  760
  761	return NULL;
  762}
  763EXPORT_SYMBOL(dev_get_by_index_rcu);
  764
  765
  766/**
  767 *	dev_get_by_index - find a device by its ifindex
  768 *	@net: the applicable net namespace
  769 *	@ifindex: index of device
  770 *
   771 *	Search for an interface by index. Returns a pointer to the device,
   772 *	or NULL if the device is not found. The device returned has
  773 *	had a reference added and the pointer is safe until the user calls
  774 *	dev_put to indicate they have finished with it.
  775 */
  776
  777struct net_device *dev_get_by_index(struct net *net, int ifindex)
  778{
  779	struct net_device *dev;
  780
  781	rcu_read_lock();
  782	dev = dev_get_by_index_rcu(net, ifindex);
  783	if (dev)
  784		dev_hold(dev);
  785	rcu_read_unlock();
  786	return dev;
  787}
  788EXPORT_SYMBOL(dev_get_by_index);
  789
  790/**
  791 *	dev_get_by_napi_id - find a device by napi_id
  792 *	@napi_id: ID of the NAPI struct
  793 *
   794 *	Search for an interface by NAPI ID. Returns a pointer to the device,
   795 *	or %NULL if the device is not found. The device has not had
  796 *	its reference counter increased so the caller must be careful
  797 *	about locking. The caller must hold RCU lock.
  798 */
  799
  800struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  801{
  802	struct napi_struct *napi;
  803
  804	WARN_ON_ONCE(!rcu_read_lock_held());
  805
  806	if (napi_id < MIN_NAPI_ID)
  807		return NULL;
  808
  809	napi = napi_by_id(napi_id);
  810
  811	return napi ? napi->dev : NULL;
  812}
  813EXPORT_SYMBOL(dev_get_by_napi_id);
  814
  815/**
  816 *	netdev_get_name - get a netdevice name, knowing its ifindex.
  817 *	@net: network namespace
  818 *	@name: a pointer to the buffer where the name will be stored.
  819 *	@ifindex: the ifindex of the interface to get the name from.
  820 *
  821 *	The use of raw_seqcount_begin() and cond_resched() before
  822 *	retrying is required as we want to give the writers a chance
  823 *	to complete when CONFIG_PREEMPT is not set.
  824 */
  825int netdev_get_name(struct net *net, char *name, int ifindex)
  826{
  827	struct net_device *dev;
  828	unsigned int seq;
  829
  830retry:
  831	seq = raw_seqcount_begin(&devnet_rename_seq);
  832	rcu_read_lock();
  833	dev = dev_get_by_index_rcu(net, ifindex);
  834	if (!dev) {
  835		rcu_read_unlock();
  836		return -ENODEV;
  837	}
  838
  839	strcpy(name, dev->name);
  840	rcu_read_unlock();
  841	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
  842		cond_resched();
  843		goto retry;
  844	}
  845
  846	return 0;
  847}
  848
  849/**
  850 *	dev_getbyhwaddr_rcu - find a device by its hardware address
  851 *	@net: the applicable net namespace
  852 *	@type: media type of device
  853 *	@ha: hardware address
  854 *
   855 *	Search for an interface by MAC address. Returns a pointer to the
   856 *	device, or NULL if the device is not found.
  857 *	The caller must hold RCU or RTNL.
  858 *	The returned device has not had its ref count increased
   859 *	and the caller must therefore be careful about locking.
  860 *
  861 */
  862
  863struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
  864				       const char *ha)
  865{
  866	struct net_device *dev;
  867
  868	for_each_netdev_rcu(net, dev)
  869		if (dev->type == type &&
  870		    !memcmp(dev->dev_addr, ha, dev->addr_len))
  871			return dev;
  872
  873	return NULL;
  874}
  875EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
  876
  877struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
  878{
  879	struct net_device *dev;
  880
  881	ASSERT_RTNL();
  882	for_each_netdev(net, dev)
  883		if (dev->type == type)
  884			return dev;
  885
  886	return NULL;
  887}
  888EXPORT_SYMBOL(__dev_getfirstbyhwtype);
  889
  890struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
  891{
  892	struct net_device *dev, *ret = NULL;
  893
  894	rcu_read_lock();
  895	for_each_netdev_rcu(net, dev)
  896		if (dev->type == type) {
  897			dev_hold(dev);
  898			ret = dev;
  899			break;
  900		}
  901	rcu_read_unlock();
  902	return ret;
  903}
  904EXPORT_SYMBOL(dev_getfirstbyhwtype);
  905
  906/**
  907 *	__dev_get_by_flags - find any device with given flags
  908 *	@net: the applicable net namespace
  909 *	@if_flags: IFF_* values
  910 *	@mask: bitmask of bits in if_flags to check
  911 *
   912 *	Search for any interface with the given flags. Returns a pointer to
   913 *	the device, or NULL if no matching device is found. Must be called inside
  914 *	rtnl_lock(), and result refcount is unchanged.
  915 */
  916
  917struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
  918				      unsigned short mask)
  919{
  920	struct net_device *dev, *ret;
  921
  922	ASSERT_RTNL();
  923
  924	ret = NULL;
  925	for_each_netdev(net, dev) {
  926		if (((dev->flags ^ if_flags) & mask) == 0) {
  927			ret = dev;
  928			break;
  929		}
  930	}
  931	return ret;
  932}
  933EXPORT_SYMBOL(__dev_get_by_flags);
  934
  935/**
  936 *	dev_valid_name - check if name is okay for network device
  937 *	@name: name string
  938 *
   939 *	Network device names need to be valid file names
   940 *	to allow sysfs to work.  We also disallow any kind of
  941 *	whitespace.
  942 */
  943bool dev_valid_name(const char *name)
  944{
  945	if (*name == '\0')
  946		return false;
  947	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
  948		return false;
  949	if (!strcmp(name, ".") || !strcmp(name, ".."))
  950		return false;
  951
  952	while (*name) {
  953		if (*name == '/' || *name == ':' || isspace(*name))
  954			return false;
  955		name++;
  956	}
  957	return true;
  958}
  959EXPORT_SYMBOL(dev_valid_name);
  960
  961/**
  962 *	__dev_alloc_name - allocate a name for a device
  963 *	@net: network namespace to allocate the device name in
  964 *	@name: name format string
  965 *	@buf:  scratch buffer and result name string
  966 *
   967 *	Passed a format string - e.g. "lt%d" - it will try and find a suitable
   968 *	id. It scans the list of devices to build up a free map, then chooses
  969 *	the first empty slot. The caller must hold the dev_base or rtnl lock
  970 *	while allocating the name and adding the device in order to avoid
  971 *	duplicates.
   972 *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
  973 *	Returns the number of the unit assigned or a negative errno code.
  974 */
  975
  976static int __dev_alloc_name(struct net *net, const char *name, char *buf)
  977{
  978	int i = 0;
  979	const char *p;
  980	const int max_netdevices = 8*PAGE_SIZE;
  981	unsigned long *inuse;
  982	struct net_device *d;
  983
  984	if (!dev_valid_name(name))
  985		return -EINVAL;
  986
  987	p = strchr(name, '%');
  988	if (p) {
  989		/*
  990		 * Verify the string as this thing may have come from
   991		 * the user.  There must be exactly one "%d" and no other "%"
  992		 * characters.
  993		 */
  994		if (p[1] != 'd' || strchr(p + 2, '%'))
  995			return -EINVAL;
  996
  997		/* Use one page as a bit array of possible slots */
  998		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
  999		if (!inuse)
 1000			return -ENOMEM;
 1001
 1002		for_each_netdev(net, d) {
 1003			if (!sscanf(d->name, name, &i))
 1004				continue;
 1005			if (i < 0 || i >= max_netdevices)
 1006				continue;
 1007
  1008			/*  avoid cases where sscanf is not an exact inverse of printf */
 1009			snprintf(buf, IFNAMSIZ, name, i);
 1010			if (!strncmp(buf, d->name, IFNAMSIZ))
 1011				set_bit(i, inuse);
 1012		}
 1013
 1014		i = find_first_zero_bit(inuse, max_netdevices);
 1015		free_page((unsigned long) inuse);
 1016	}
 1017
 1018	snprintf(buf, IFNAMSIZ, name, i);
 1019	if (!__dev_get_by_name(net, buf))
 1020		return i;
 1021
  1022	/* It is possible to run out of slots
 1023	 * when the name is long and there isn't enough space left
 1024	 * for the digits, or if all bits are used.
 1025	 */
 1026	return -ENFILE;
 1027}
 1028
 1029static int dev_alloc_name_ns(struct net *net,
 1030			     struct net_device *dev,
 1031			     const char *name)
 1032{
 1033	char buf[IFNAMSIZ];
 1034	int ret;
 1035
 1036	BUG_ON(!net);
 1037	ret = __dev_alloc_name(net, name, buf);
 1038	if (ret >= 0)
 1039		strlcpy(dev->name, buf, IFNAMSIZ);
 1040	return ret;
 1041}
 1042
 1043/**
 1044 *	dev_alloc_name - allocate a name for a device
 1045 *	@dev: device
 1046 *	@name: name format string
 1047 *
  1048 *	Passed a format string - e.g. "lt%d" - it will try and find a suitable
  1049 *	id. It scans the list of devices to build up a free map, then chooses
 1050 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1051 *	while allocating the name and adding the device in order to avoid
 1052 *	duplicates.
  1053 *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
 1054 *	Returns the number of the unit assigned or a negative errno code.
 1055 */
 1056
 1057int dev_alloc_name(struct net_device *dev, const char *name)
 1058{
 1059	return dev_alloc_name_ns(dev_net(dev), dev, name);
 1060}
 1061EXPORT_SYMBOL(dev_alloc_name);
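/*
 * Editorial example (the "foo%d" prefix is hypothetical): a driver
 * registering several devices lets the core pick the unit number:
 *
 *	err = dev_alloc_name(dev, "foo%d");
 *	if (err < 0)
 *		goto fail;
 *	(dev->name is now e.g. "foo0"; err holds the unit assigned)
 */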
 1062
 1063int dev_get_valid_name(struct net *net, struct net_device *dev,
 1064		       const char *name)
 1065{
 1066	BUG_ON(!net);
 1067
 1068	if (!dev_valid_name(name))
 1069		return -EINVAL;
 1070
 1071	if (strchr(name, '%'))
 1072		return dev_alloc_name_ns(net, dev, name);
 1073	else if (__dev_get_by_name(net, name))
 1074		return -EEXIST;
 1075	else if (dev->name != name)
 1076		strlcpy(dev->name, name, IFNAMSIZ);
 1077
 1078	return 0;
 1079}
 1080EXPORT_SYMBOL(dev_get_valid_name);
 1081
 1082/**
 1083 *	dev_change_name - change name of a device
 1084 *	@dev: device
 1085 *	@newname: name (or format string) must be at least IFNAMSIZ
 1086 *
  1087 *	Change the name of a device; format strings such as "eth%d"
  1088 *	can be passed for wildcarding.
 1089 */
 1090int dev_change_name(struct net_device *dev, const char *newname)
 1091{
 1092	unsigned char old_assign_type;
 1093	char oldname[IFNAMSIZ];
 1094	int err = 0;
 1095	int ret;
 1096	struct net *net;
 1097
 1098	ASSERT_RTNL();
 1099	BUG_ON(!dev_net(dev));
 1100
 1101	net = dev_net(dev);
 1102
 1103	/* Some auto-enslaved devices e.g. failover slaves are
 1104	 * special, as userspace might rename the device after
 1105	 * the interface had been brought up and running since
  1106	 * the point the kernel initiated auto-enslavement. Allow
 1107	 * live name change even when these slave devices are
 1108	 * up and running.
 1109	 *
 1110	 * Typically, users of these auto-enslaving devices
 1111	 * don't actually care about slave name change, as
  1112	 * they are supposed to operate on the master interface
 1113	 * directly.
 1114	 */
 1115	if (dev->flags & IFF_UP &&
 1116	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
 1117		return -EBUSY;
 1118
 1119	write_seqcount_begin(&devnet_rename_seq);
 1120
 1121	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1122		write_seqcount_end(&devnet_rename_seq);
 1123		return 0;
 1124	}
 1125
 1126	memcpy(oldname, dev->name, IFNAMSIZ);
 1127
 1128	err = dev_get_valid_name(net, dev, newname);
 1129	if (err < 0) {
 1130		write_seqcount_end(&devnet_rename_seq);
 1131		return err;
 1132	}
 1133
 1134	if (oldname[0] && !strchr(oldname, '%'))
 1135		netdev_info(dev, "renamed from %s\n", oldname);
 1136
 1137	old_assign_type = dev->name_assign_type;
 1138	dev->name_assign_type = NET_NAME_RENAMED;
 1139
 1140rollback:
 1141	ret = device_rename(&dev->dev, dev->name);
 1142	if (ret) {
 1143		memcpy(dev->name, oldname, IFNAMSIZ);
 1144		dev->name_assign_type = old_assign_type;
 1145		write_seqcount_end(&devnet_rename_seq);
 1146		return ret;
 1147	}
 1148
 1149	write_seqcount_end(&devnet_rename_seq);
 1150
 1151	netdev_adjacent_rename_links(dev, oldname);
 1152
 1153	write_lock_bh(&dev_base_lock);
 1154	hlist_del_rcu(&dev->name_hlist);
 1155	write_unlock_bh(&dev_base_lock);
 1156
 1157	synchronize_rcu();
 1158
 1159	write_lock_bh(&dev_base_lock);
 1160	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 1161	write_unlock_bh(&dev_base_lock);
 1162
 1163	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1164	ret = notifier_to_errno(ret);
 1165
 1166	if (ret) {
 1167		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1168		if (err >= 0) {
 1169			err = ret;
 1170			write_seqcount_begin(&devnet_rename_seq);
 1171			memcpy(dev->name, oldname, IFNAMSIZ);
 1172			memcpy(oldname, newname, IFNAMSIZ);
 1173			dev->name_assign_type = old_assign_type;
 1174			old_assign_type = NET_NAME_RENAMED;
 1175			goto rollback;
 1176		} else {
 1177			pr_err("%s: name change rollback failed: %d\n",
 1178			       dev->name, ret);
 1179		}
 1180	}
 1181
 1182	return err;
 1183}
 1184
 1185/**
 1186 *	dev_set_alias - change ifalias of a device
 1187 *	@dev: device
 1188 *	@alias: name up to IFALIASZ
  1189 *	@len: limit of bytes to copy from @alias
 1190 *
  1191 *	Set the ifalias for a device.
 1192 */
 1193int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1194{
 1195	struct dev_ifalias *new_alias = NULL;
 1196
 1197	if (len >= IFALIASZ)
 1198		return -EINVAL;
 1199
 1200	if (len) {
 1201		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1202		if (!new_alias)
 1203			return -ENOMEM;
 1204
 1205		memcpy(new_alias->ifalias, alias, len);
 1206		new_alias->ifalias[len] = 0;
 1207	}
 1208
 1209	mutex_lock(&ifalias_mutex);
 1210	rcu_swap_protected(dev->ifalias, new_alias,
 1211			   mutex_is_locked(&ifalias_mutex));
 1212	mutex_unlock(&ifalias_mutex);
 1213
 1214	if (new_alias)
 1215		kfree_rcu(new_alias, rcuhead);
 1216
 1217	return len;
 1218}
 1219EXPORT_SYMBOL(dev_set_alias);
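/*
 * Editorial sketch (the alias string is made up): setting an alias and
 * reading it back:
 *
 *	static const char desc[] = "uplink to core switch";
 *	char buf[IFALIASZ];
 *
 *	dev_set_alias(dev, desc, strlen(desc));	(returns len on success)
 *	dev_get_alias(dev, buf, sizeof(buf));	(dev must be kept alive)
 */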
 1220
 1221/**
 1222 *	dev_get_alias - get ifalias of a device
 1223 *	@dev: device
 1224 *	@name: buffer to store name of ifalias
 1225 *	@len: size of buffer
 1226 *
  1227 *	Get the ifalias for a device.  The caller must make sure dev cannot
  1228 *	go away, e.g. by holding the RCU read lock or a reference to the device.
 1229 */
 1230int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1231{
 1232	const struct dev_ifalias *alias;
 1233	int ret = 0;
 1234
 1235	rcu_read_lock();
 1236	alias = rcu_dereference(dev->ifalias);
 1237	if (alias)
 1238		ret = snprintf(name, len, "%s", alias->ifalias);
 1239	rcu_read_unlock();
 1240
 1241	return ret;
 1242}
 1243
 1244/**
 1245 *	netdev_features_change - device changes features
 1246 *	@dev: device to cause notification
 1247 *
 1248 *	Called to indicate a device has changed features.
 1249 */
 1250void netdev_features_change(struct net_device *dev)
 1251{
 1252	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1253}
 1254EXPORT_SYMBOL(netdev_features_change);
 1255
 1256/**
 1257 *	netdev_state_change - device changes state
 1258 *	@dev: device to cause notification
 1259 *
 1260 *	Called to indicate a device has changed state. This function calls
 1261 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1262 *	to the routing socket.
 1263 */
 1264void netdev_state_change(struct net_device *dev)
 1265{
 1266	if (dev->flags & IFF_UP) {
 1267		struct netdev_notifier_change_info change_info = {
 1268			.info.dev = dev,
 1269		};
 1270
 1271		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1272					      &change_info.info);
 1273		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 1274	}
 1275}
 1276EXPORT_SYMBOL(netdev_state_change);
 1277
 1278/**
 1279 * netdev_notify_peers - notify network peers about existence of @dev
 1280 * @dev: network device
 1281 *
 1282 * Generate traffic such that interested network peers are aware of
 1283 * @dev, such as by generating a gratuitous ARP. This may be used when
 1284 * a device wants to inform the rest of the network about some sort of
 1285 * reconfiguration such as a failover event or virtual machine
 1286 * migration.
 1287 */
 1288void netdev_notify_peers(struct net_device *dev)
 1289{
 1290	rtnl_lock();
 1291	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1292	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1293	rtnl_unlock();
 1294}
 1295EXPORT_SYMBOL(netdev_notify_peers);
 1296
 1297static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1298{
 1299	const struct net_device_ops *ops = dev->netdev_ops;
 1300	int ret;
 1301
 1302	ASSERT_RTNL();
 1303
 1304	if (!netif_device_present(dev))
 1305		return -ENODEV;
 1306
 1307	/* Block netpoll from trying to do any rx path servicing.
 1308	 * If we don't do this there is a chance ndo_poll_controller
 1309	 * or ndo_poll may be running while we open the device
 1310	 */
 1311	netpoll_poll_disable(dev);
 1312
 1313	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1314	ret = notifier_to_errno(ret);
 1315	if (ret)
 1316		return ret;
 1317
 1318	set_bit(__LINK_STATE_START, &dev->state);
 1319
 1320	if (ops->ndo_validate_addr)
 1321		ret = ops->ndo_validate_addr(dev);
 1322
 1323	if (!ret && ops->ndo_open)
 1324		ret = ops->ndo_open(dev);
 1325
 1326	netpoll_poll_enable(dev);
 1327
 1328	if (ret)
 1329		clear_bit(__LINK_STATE_START, &dev->state);
 1330	else {
 1331		dev->flags |= IFF_UP;
 1332		dev_set_rx_mode(dev);
 1333		dev_activate(dev);
 1334		add_device_randomness(dev->dev_addr, dev->addr_len);
 1335	}
 1336
 1337	return ret;
 1338}
 1339
 1340/**
 1341 *	dev_open	- prepare an interface for use.
 1342 *	@dev: device to open
 1343 *	@extack: netlink extended ack
 1344 *
 1345 *	Takes a device from down to up state. The device's private open
 1346 *	function is invoked and then the multicast lists are loaded. Finally
 1347 *	the device is moved into the up state and a %NETDEV_UP message is
 1348 *	sent to the netdev notifier chain.
 1349 *
 1350 *	Calling this function on an active interface is a nop. On a failure
 1351 *	a negative errno code is returned.
 1352 */
 1353int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1354{
 1355	int ret;
 1356
 1357	if (dev->flags & IFF_UP)
 1358		return 0;
 1359
 1360	ret = __dev_open(dev, extack);
 1361	if (ret < 0)
 1362		return ret;
 1363
 1364	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1365	call_netdevice_notifiers(NETDEV_UP, dev);
 1366
 1367	return ret;
 1368}
 1369EXPORT_SYMBOL(dev_open);
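/*
 * Editorial sketch: like the other RTNL-protected operations in this
 * file, opening a device from kernel code is done under rtnl_lock();
 * extack may be NULL when there is no netlink request to report back
 * to:
 *
 *	rtnl_lock();
 *	err = dev_open(dev, NULL);
 *	rtnl_unlock();
 *
 * The matching teardown is dev_close(), defined below.
 */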
 1370
 1371static void __dev_close_many(struct list_head *head)
 1372{
 1373	struct net_device *dev;
 1374
 1375	ASSERT_RTNL();
 1376	might_sleep();
 1377
 1378	list_for_each_entry(dev, head, close_list) {
 1379		/* Temporarily disable netpoll until the interface is down */
 1380		netpoll_poll_disable(dev);
 1381
 1382		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1383
 1384		clear_bit(__LINK_STATE_START, &dev->state);
 1385
 1386		/* Synchronize to scheduled poll. We cannot touch poll list, it
  1387		 * can even be on a different CPU. So just clear netif_running().
 1388		 *
  1389		 * dev->stop() will invoke napi_disable() on all of its
 1390		 * napi_struct instances on this device.
 1391		 */
 1392		smp_mb__after_atomic(); /* Commit netif_running(). */
 1393	}
 1394
 1395	dev_deactivate_many(head);
 1396
 1397	list_for_each_entry(dev, head, close_list) {
 1398		const struct net_device_ops *ops = dev->netdev_ops;
 1399
 1400		/*
 1401		 *	Call the device specific close. This cannot fail.
  1402		 *	It is only called if the device is UP.
 1403		 *
 1404		 *	We allow it to be called even after a DETACH hot-plug
 1405		 *	event.
 1406		 */
 1407		if (ops->ndo_stop)
 1408			ops->ndo_stop(dev);
 1409
 1410		dev->flags &= ~IFF_UP;
 1411		netpoll_poll_enable(dev);
 1412	}
 1413}
 1414
 1415static void __dev_close(struct net_device *dev)
 1416{
 1417	LIST_HEAD(single);
 1418
 1419	list_add(&dev->close_list, &single);
 1420	__dev_close_many(&single);
 1421	list_del(&single);
 1422}
 1423
 1424void dev_close_many(struct list_head *head, bool unlink)
 1425{
 1426	struct net_device *dev, *tmp;
 1427
 1428	/* Remove the devices that don't need to be closed */
 1429	list_for_each_entry_safe(dev, tmp, head, close_list)
 1430		if (!(dev->flags & IFF_UP))
 1431			list_del_init(&dev->close_list);
 1432
 1433	__dev_close_many(head);
 1434
 1435	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1436		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1437		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1438		if (unlink)
 1439			list_del_init(&dev->close_list);
 1440	}
 1441}
 1442EXPORT_SYMBOL(dev_close_many);
 1443
 1444/**
 1445 *	dev_close - shutdown an interface.
 1446 *	@dev: device to shutdown
 1447 *
 1448 *	This function moves an active device into down state. A
 1449 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1450 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1451 *	chain.
 1452 */
 1453void dev_close(struct net_device *dev)
 1454{
 1455	if (dev->flags & IFF_UP) {
 1456		LIST_HEAD(single);
 1457
 1458		list_add(&dev->close_list, &single);
 1459		dev_close_many(&single, true);
 1460		list_del(&single);
 1461	}
 1462}
 1463EXPORT_SYMBOL(dev_close);
 1464
 1465
 1466/**
 1467 *	dev_disable_lro - disable Large Receive Offload on a device
 1468 *	@dev: device
 1469 *
 1470 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1471 *	called under RTNL.  This is needed if received packets may be
 1472 *	forwarded to another interface.
 1473 */
 1474void dev_disable_lro(struct net_device *dev)
 1475{
 1476	struct net_device *lower_dev;
 1477	struct list_head *iter;
 1478
 1479	dev->wanted_features &= ~NETIF_F_LRO;
 1480	netdev_update_features(dev);
 1481
 1482	if (unlikely(dev->features & NETIF_F_LRO))
 1483		netdev_WARN(dev, "failed to disable LRO!\n");
 1484
 1485	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1486		dev_disable_lro(lower_dev);
 1487}
 1488EXPORT_SYMBOL(dev_disable_lro);
 1489
 1490/**
 1491 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1492 *	@dev: device
 1493 *
 1494 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1495 *	called under RTNL.  This is needed if Generic XDP is installed on
 1496 *	the device.
 1497 */
 1498static void dev_disable_gro_hw(struct net_device *dev)
 1499{
 1500	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1501	netdev_update_features(dev);
 1502
 1503	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1504		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1505}
 1506
 1507const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1508{
 1509#define N(val) 						\
 1510	case NETDEV_##val:				\
 1511		return "NETDEV_" __stringify(val);
 1512	switch (cmd) {
 1513	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1514	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1515	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1516	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
 1517	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 1518	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 1519	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1520	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1521	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1522	N(PRE_CHANGEADDR)
 1523	}
 1524#undef N
 1525	return "UNKNOWN_NETDEV_EVENT";
 1526}
 1527EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1528
 1529static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1530				   struct net_device *dev)
 1531{
 1532	struct netdev_notifier_info info = {
 1533		.dev = dev,
 1534	};
 1535
 1536	return nb->notifier_call(nb, val, &info);
 1537}
 1538
 1539static int dev_boot_phase = 1;
 1540
 1541/**
 1542 * register_netdevice_notifier - register a network notifier block
 1543 * @nb: notifier
 1544 *
 1545 * Register a notifier to be called when network device events occur.
 1546 * The notifier passed is linked into the kernel structures and must
 1547 * not be reused until it has been unregistered. A negative errno code
 1548 * is returned on a failure.
 1549 *
  1550 * When registered, all registration and up events are replayed
  1551 * to the new notifier to allow the device to have a race-free
 1552 * view of the network device list.
 1553 */
 1554
 1555int register_netdevice_notifier(struct notifier_block *nb)
 1556{
 1557	struct net_device *dev;
 1558	struct net_device *last;
 1559	struct net *net;
 1560	int err;
 1561
 1562	/* Close race with setup_net() and cleanup_net() */
 1563	down_write(&pernet_ops_rwsem);
 1564	rtnl_lock();
 1565	err = raw_notifier_chain_register(&netdev_chain, nb);
 1566	if (err)
 1567		goto unlock;
 1568	if (dev_boot_phase)
 1569		goto unlock;
 1570	for_each_net(net) {
 1571		for_each_netdev(net, dev) {
 1572			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1573			err = notifier_to_errno(err);
 1574			if (err)
 1575				goto rollback;
 1576
 1577			if (!(dev->flags & IFF_UP))
 1578				continue;
 1579
 1580			call_netdevice_notifier(nb, NETDEV_UP, dev);
 1581		}
 1582	}
 1583
 1584unlock:
 1585	rtnl_unlock();
 1586	up_write(&pernet_ops_rwsem);
 1587	return err;
 1588
 1589rollback:
 1590	last = dev;
 1591	for_each_net(net) {
 1592		for_each_netdev(net, dev) {
 1593			if (dev == last)
 1594				goto outroll;
 1595
 1596			if (dev->flags & IFF_UP) {
 1597				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1598							dev);
 1599				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1600			}
 1601			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1602		}
 1603	}
 1604
 1605outroll:
 1606	raw_notifier_chain_unregister(&netdev_chain, nb);
 1607	goto unlock;
 1608}
 1609EXPORT_SYMBOL(register_netdevice_notifier);
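/*
 * Editorial sketch of a subscriber (example_event/example_nb are
 * hypothetical names):
 *
 *	static int example_event(struct notifier_block *nb,
 *				 unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		pr_info("%s: %s\n", dev->name, netdev_cmd_to_name(event));
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);
 */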
 1610
 1611/**
 1612 * unregister_netdevice_notifier - unregister a network notifier block
 1613 * @nb: notifier
 1614 *
 1615 * Unregister a notifier previously registered by
  1616 * register_netdevice_notifier(). The notifier is unlinked from the
 1617 * kernel structures and may then be reused. A negative errno code
 1618 * is returned on a failure.
 1619 *
  1620 * After unregistering, unregister and down device events are synthesized
 1621 * for all devices on the device list to the removed notifier to remove
 1622 * the need for special case cleanup code.
 1623 */
 1624
 1625int unregister_netdevice_notifier(struct notifier_block *nb)
 1626{
 1627	struct net_device *dev;
 1628	struct net *net;
 1629	int err;
 1630
 1631	/* Close race with setup_net() and cleanup_net() */
 1632	down_write(&pernet_ops_rwsem);
 1633	rtnl_lock();
 1634	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1635	if (err)
 1636		goto unlock;
 1637
 1638	for_each_net(net) {
 1639		for_each_netdev(net, dev) {
 1640			if (dev->flags & IFF_UP) {
 1641				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1642							dev);
 1643				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1644			}
 1645			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1646		}
 1647	}
 1648unlock:
 1649	rtnl_unlock();
 1650	up_write(&pernet_ops_rwsem);
 1651	return err;
 1652}
 1653EXPORT_SYMBOL(unregister_netdevice_notifier);
 1654
 1655/**
 1656 *	call_netdevice_notifiers_info - call all network notifier blocks
 1657 *	@val: value passed unmodified to notifier function
 1658 *	@info: notifier information data
 1659 *
 1660 *	Call all network notifier blocks.  Parameters and return value
 1661 *	are as for raw_notifier_call_chain().
 1662 */
 1663
 1664static int call_netdevice_notifiers_info(unsigned long val,
 1665					 struct netdev_notifier_info *info)
 1666{
 1667	ASSERT_RTNL();
 1668	return raw_notifier_call_chain(&netdev_chain, val, info);
 1669}
 1670
 1671static int call_netdevice_notifiers_extack(unsigned long val,
 1672					   struct net_device *dev,
 1673					   struct netlink_ext_ack *extack)
 1674{
 1675	struct netdev_notifier_info info = {
 1676		.dev = dev,
 1677		.extack = extack,
 1678	};
 1679
 1680	return call_netdevice_notifiers_info(val, &info);
 1681}
 1682
 1683/**
 1684 *	call_netdevice_notifiers - call all network notifier blocks
 1685 *      @val: value passed unmodified to notifier function
 1686 *      @dev: net_device pointer passed unmodified to notifier function
 1687 *
 1688 *	Call all network notifier blocks.  Parameters and return value
 1689 *	are as for raw_notifier_call_chain().
 1690 */
 1691
 1692int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 1693{
 1694	return call_netdevice_notifiers_extack(val, dev, NULL);
 1695}
 1696EXPORT_SYMBOL(call_netdevice_notifiers);
 1697
 1698/**
 1699 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 1700 *	@val: value passed unmodified to notifier function
 1701 *	@dev: net_device pointer passed unmodified to notifier function
 1702 *	@arg: additional u32 argument passed to the notifier function
 1703 *
 1704 *	Call all network notifier blocks.  Parameters and return value
 1705 *	are as for raw_notifier_call_chain().
 1706 */
 1707static int call_netdevice_notifiers_mtu(unsigned long val,
 1708					struct net_device *dev, u32 arg)
 1709{
 1710	struct netdev_notifier_info_ext info = {
 1711		.info.dev = dev,
 1712		.ext.mtu = arg,
 1713	};
 1714
 1715	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 1716
 1717	return call_netdevice_notifiers_info(val, &info.info);
 1718}
 1719
 1720#ifdef CONFIG_NET_INGRESS
 1721static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 1722
 1723void net_inc_ingress_queue(void)
 1724{
 1725	static_branch_inc(&ingress_needed_key);
 1726}
 1727EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 1728
 1729void net_dec_ingress_queue(void)
 1730{
 1731	static_branch_dec(&ingress_needed_key);
 1732}
 1733EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 1734#endif
 1735
 1736#ifdef CONFIG_NET_EGRESS
 1737static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 1738
 1739void net_inc_egress_queue(void)
 1740{
 1741	static_branch_inc(&egress_needed_key);
 1742}
 1743EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 1744
 1745void net_dec_egress_queue(void)
 1746{
 1747	static_branch_dec(&egress_needed_key);
 1748}
 1749EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 1750#endif
 1751
 1752static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 1753#ifdef CONFIG_JUMP_LABEL
 1754static atomic_t netstamp_needed_deferred;
 1755static atomic_t netstamp_wanted;
 1756static void netstamp_clear(struct work_struct *work)
 1757{
 1758	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 1759	int wanted;
 1760
 1761	wanted = atomic_add_return(deferred, &netstamp_wanted);
 1762	if (wanted > 0)
 1763		static_branch_enable(&netstamp_needed_key);
 1764	else
 1765		static_branch_disable(&netstamp_needed_key);
 1766}
 1767static DECLARE_WORK(netstamp_work, netstamp_clear);
 1768#endif
 1769
 1770void net_enable_timestamp(void)
 1771{
 1772#ifdef CONFIG_JUMP_LABEL
 1773	int wanted;
 1774
 1775	while (1) {
 1776		wanted = atomic_read(&netstamp_wanted);
 1777		if (wanted <= 0)
 1778			break;
 1779		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
 1780			return;
 1781	}
 1782	atomic_inc(&netstamp_needed_deferred);
 1783	schedule_work(&netstamp_work);
 1784#else
 1785	static_branch_inc(&netstamp_needed_key);
 1786#endif
 1787}
 1788EXPORT_SYMBOL(net_enable_timestamp);
 1789
 1790void net_disable_timestamp(void)
 1791{
 1792#ifdef CONFIG_JUMP_LABEL
 1793	int wanted;
 1794
 1795	while (1) {
 1796		wanted = atomic_read(&netstamp_wanted);
 1797		if (wanted <= 1)
 1798			break;
 1799		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
 1800			return;
 1801	}
 1802	atomic_dec(&netstamp_needed_deferred);
 1803	schedule_work(&netstamp_work);
 1804#else
 1805	static_branch_dec(&netstamp_needed_key);
 1806#endif
 1807}
 1808EXPORT_SYMBOL(net_disable_timestamp);
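
/*
 * Editor's note (illustrative, not part of dev.c): users such as the socket
 * timestamping code (e.g. sock_enable_timestamp()) bump the demand while RX
 * timestamps are needed and drop it on teardown, always in matched pairs:
 *
 *	net_enable_timestamp();
 *	...
 *	net_disable_timestamp();
 *
 * The CONFIG_JUMP_LABEL paths above defer the actual static-key flip to a
 * workqueue because static_branch_{enable,disable}() may sleep, while these
 * helpers can be called from atomic context.
 */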
 1809
 1810static inline void net_timestamp_set(struct sk_buff *skb)
 1811{
 1812	skb->tstamp = 0;
 1813	if (static_branch_unlikely(&netstamp_needed_key))
 1814		__net_timestamp(skb);
 1815}
 1816
 1817#define net_timestamp_check(COND, SKB)				\
 1818	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 1819		if ((COND) && !(SKB)->tstamp)			\
 1820			__net_timestamp(SKB);			\
 1821	}							\
 1822
 1823bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 1824{
 1825	unsigned int len;
 1826
 1827	if (!(dev->flags & IFF_UP))
 1828		return false;
 1829
 1830	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
 1831	if (skb->len <= len)
 1832		return true;
 1833
 1834	/* if TSO is enabled, we don't care about the length as the packet
 1835	 * could be forwarded without being segmented first
 1836	 */
 1837	if (skb_is_gso(skb))
 1838		return true;
 1839
 1840	return false;
 1841}
 1842EXPORT_SYMBOL_GPL(is_skb_forwardable);
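
/*
 * Editor's worked example (illustrative): for a standard Ethernet device
 * with mtu = 1500 and hard_header_len = 14, the limit above is
 * 1500 + 14 + 4 (VLAN_HLEN) = 1518 bytes; a longer skb is only
 * forwardable if it is GSO and can still be segmented later.
 */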
 1843
 1844int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 1845{
 1846	int ret = ____dev_forward_skb(dev, skb);
 1847
 1848	if (likely(!ret)) {
 1849		skb->protocol = eth_type_trans(skb, dev);
 1850		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 1851	}
 1852
 1853	return ret;
 1854}
 1855EXPORT_SYMBOL_GPL(__dev_forward_skb);
 1856
 1857/**
 1858 * dev_forward_skb - loopback an skb to another netif
 1859 *
 1860 * @dev: destination network device
 1861 * @skb: buffer to forward
 1862 *
 1863 * return values:
 1864 *	NET_RX_SUCCESS	(no congestion)
 1865 *	NET_RX_DROP     (packet was dropped, but freed)
 1866 *
 1867 * dev_forward_skb can be used for injecting an skb from the
 1868 * start_xmit function of one device into the receive queue
 1869 * of another device.
 1870 *
 1871 * The receiving device may be in another namespace, so
 1872 * we have to clear all information in the skb that could
 1873 * impact namespace isolation.
 1874 */
 1875int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 1876{
 1877	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 1878}
 1879EXPORT_SYMBOL_GPL(dev_forward_skb);
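
/*
 * Editor's illustrative sketch (not part of dev.c): the classic
 * dev_forward_skb() caller is a paired virtual device, veth-style.  The
 * sample_priv structure and its "peer" field are invented for the example.
 */
struct sample_priv {
	struct net_device *peer;	/* the other end of the virtual pair */
};

static netdev_tx_t sample_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct sample_priv *priv = netdev_priv(dev);

	/* Inject the frame into the peer's receive path; the skb is
	 * scrubbed for the namespace crossing inside __dev_forward_skb(),
	 * and is freed on failure, so there is nothing to clean up here.
	 */
	if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}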
 1880
 1881static inline int deliver_skb(struct sk_buff *skb,
 1882			      struct packet_type *pt_prev,
 1883			      struct net_device *orig_dev)
 1884{
 1885	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 1886		return -ENOMEM;
 1887	refcount_inc(&skb->users);
 1888	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 1889}
 1890
 1891static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 1892					  struct packet_type **pt,
 1893					  struct net_device *orig_dev,
 1894					  __be16 type,
 1895					  struct list_head *ptype_list)
 1896{
 1897	struct packet_type *ptype, *pt_prev = *pt;
 1898
 1899	list_for_each_entry_rcu(ptype, ptype_list, list) {
 1900		if (ptype->type != type)
 1901			continue;
 1902		if (pt_prev)
 1903			deliver_skb(skb, pt_prev, orig_dev);
 1904		pt_prev = ptype;
 1905	}
 1906	*pt = pt_prev;
 1907}
 1908
 1909static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 1910{
 1911	if (!ptype->af_packet_priv || !skb->sk)
 1912		return false;
 1913
 1914	if (ptype->id_match)
 1915		return ptype->id_match(ptype, skb->sk);
 1916	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 1917		return true;
 1918
 1919	return false;
 1920}
 1921
 1922/**
 1923 * dev_nit_active - return true if any network interface taps are in use
 1924 *
 1925 * @dev: network device to check for the presence of taps
 1926 */
 1927bool dev_nit_active(struct net_device *dev)
 1928{
 1929	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 1930}
 1931EXPORT_SYMBOL_GPL(dev_nit_active);
 1932
 1933/*
 1934 *	Support routine. Sends outgoing frames to any network
 1935 *	taps currently in use.
 1936 */
 1937
 1938void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 1939{
 1940	struct packet_type *ptype;
 1941	struct sk_buff *skb2 = NULL;
 1942	struct packet_type *pt_prev = NULL;
 1943	struct list_head *ptype_list = &ptype_all;
 1944
 1945	rcu_read_lock();
 1946again:
 1947	list_for_each_entry_rcu(ptype, ptype_list, list) {
 1948		if (ptype->ignore_outgoing)
 1949			continue;
 1950
 1951		/* Never send packets back to the socket
 1952		 * they originated from - MvS (miquels@drinkel.ow.org)
 1953		 */
 1954		if (skb_loop_sk(ptype, skb))
 1955			continue;
 1956
 1957		if (pt_prev) {
 1958			deliver_skb(skb2, pt_prev, skb->dev);
 1959			pt_prev = ptype;
 1960			continue;
 1961		}
 1962
 1963		/* need to clone skb, done only once */
 1964		skb2 = skb_clone(skb, GFP_ATOMIC);
 1965		if (!skb2)
 1966			goto out_unlock;
 1967
 1968		net_timestamp_set(skb2);
 1969
 1970		/* skb->nh (the network header) should already be
 1971		 * set correctly by the sender, so the check below is
 1972		 * just protection against buggy protocols.
 1973		 */
 1974		skb_reset_mac_header(skb2);
 1975
 1976		if (skb_network_header(skb2) < skb2->data ||
 1977		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 1978			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 1979					     ntohs(skb2->protocol),
 1980					     dev->name);
 1981			skb_reset_network_header(skb2);
 1982		}
 1983
 1984		skb2->transport_header = skb2->network_header;
 1985		skb2->pkt_type = PACKET_OUTGOING;
 1986		pt_prev = ptype;
 1987	}
 1988
 1989	if (ptype_list == &ptype_all) {
 1990		ptype_list = &dev->ptype_all;
 1991		goto again;
 1992	}
 1993out_unlock:
 1994	if (pt_prev) {
 1995		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 1996			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 1997		else
 1998			kfree_skb(skb2);
 1999	}
 2000	rcu_read_unlock();
 2001}
 2002EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2003
 2004/**
 2005 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2006 * @dev: Network device
 2007 * @txq: number of queues available
 2008 *
 2009 * If real_num_tx_queues is changed, the tc mappings may no longer be
 2010 * valid. To resolve this, verify that each tc mapping remains valid,
 2011 * and if not, NULL the mapping. With no priorities mapping to an
 2012 * offset/count pair, it will no longer be used. In the worst case, if
 2013 * TC0 is invalid, nothing can be done, so disable priority mappings.
 2014 * It is expected that drivers will fix this mapping if they can
 2015 * before calling netif_set_real_num_tx_queues.
 2016 */
 2017static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2018{
 2019	int i;
 2020	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2021
 2022	/* If TC0 is invalidated disable TC mapping */
 2023	if (tc->offset + tc->count > txq) {
 2024		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2025		dev->num_tc = 0;
 2026		return;
 2027	}
 2028
 2029	/* Invalidated prio to tc mappings set to TC0 */
 2030	for (i = 1; i < TC_BITMASK + 1; i++) {
 2031		int q = netdev_get_prio_tc_map(dev, i);
 2032
 2033		tc = &dev->tc_to_txq[q];
 2034		if (tc->offset + tc->count > txq) {
 2035			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2036				i, q);
 2037			netdev_set_prio_tc_map(dev, i, 0);
 2038		}
 2039	}
 2040}
 2041
 2042int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2043{
 2044	if (dev->num_tc) {
 2045		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2046		int i;
 2047
 2048		/* walk through the TCs and see if it falls into any of them */
 2049		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2050			if ((txq - tc->offset) < tc->count)
 2051				return i;
 2052		}
 2053
 2054		/* didn't find it, just return -1 to indicate no match */
 2055		return -1;
 2056	}
 2057
 2058	return 0;
 2059}
 2060EXPORT_SYMBOL(netdev_txq_to_tc);
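
/*
 * Editor's worked example (illustrative): with two traffic classes set up
 * as tc_to_txq = { {.offset = 0, .count = 4}, {.offset = 4, .count = 4} },
 * netdev_txq_to_tc() maps txq 0..3 to TC 0 and txq 4..7 to TC 1; for
 * txq 5, (5 - 4) < 4 holds at i = 1, so the function returns 1.
 */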
 2061
 2062#ifdef CONFIG_XPS
 2063struct static_key xps_needed __read_mostly;
 2064EXPORT_SYMBOL(xps_needed);
 2065struct static_key xps_rxqs_needed __read_mostly;
 2066EXPORT_SYMBOL(xps_rxqs_needed);
 2067static DEFINE_MUTEX(xps_map_mutex);
 2068#define xmap_dereference(P)		\
 2069	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2070
 2071static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2072			     int tci, u16 index)
 2073{
 2074	struct xps_map *map = NULL;
 2075	int pos;
 2076
 2077	if (dev_maps)
 2078		map = xmap_dereference(dev_maps->attr_map[tci]);
 2079	if (!map)
 2080		return false;
 2081
 2082	for (pos = map->len; pos--;) {
 2083		if (map->queues[pos] != index)
 2084			continue;
 2085
 2086		if (map->len > 1) {
 2087			map->queues[pos] = map->queues[--map->len];
 2088			break;
 2089		}
 2090
 2091		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2092		kfree_rcu(map, rcu);
 2093		return false;
 2094	}
 2095
 2096	return true;
 2097}
 2098
 2099static bool remove_xps_queue_cpu(struct net_device *dev,
 2100				 struct xps_dev_maps *dev_maps,
 2101				 int cpu, u16 offset, u16 count)
 2102{
 2103	int num_tc = dev->num_tc ? : 1;
 2104	bool active = false;
 2105	int tci;
 2106
 2107	for (tci = cpu * num_tc; num_tc--; tci++) {
 2108		int i, j;
 2109
 2110		for (i = count, j = offset; i--; j++) {
 2111			if (!remove_xps_queue(dev_maps, tci, j))
 2112				break;
 2113		}
 2114
 2115		active |= i < 0;
 2116	}
 2117
 2118	return active;
 2119}
 2120
 2121static void reset_xps_maps(struct net_device *dev,
 2122			   struct xps_dev_maps *dev_maps,
 2123			   bool is_rxqs_map)
 2124{
 2125	if (is_rxqs_map) {
 2126		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2127		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 2128	} else {
 2129		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 2130	}
 2131	static_key_slow_dec_cpuslocked(&xps_needed);
 2132	kfree_rcu(dev_maps, rcu);
 2133}
 2134
 2135static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 2136			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 2137			   u16 offset, u16 count, bool is_rxqs_map)
 2138{
 2139	bool active = false;
 2140	int i, j;
 2141
 2142	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 2143	     j < nr_ids;)
 2144		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 2145					       count);
 2146	if (!active)
 2147		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2148
 2149	if (!is_rxqs_map) {
 2150		for (i = offset + (count - 1); count--; i--) {
 2151			netdev_queue_numa_node_write(
 2152				netdev_get_tx_queue(dev, i),
 2153				NUMA_NO_NODE);
 2154		}
 2155	}
 2156}
 2157
 2158static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2159				   u16 count)
 2160{
 2161	const unsigned long *possible_mask = NULL;
 2162	struct xps_dev_maps *dev_maps;
 2163	unsigned int nr_ids;
 2164
 2165	if (!static_key_false(&xps_needed))
 2166		return;
 2167
 2168	cpus_read_lock();
 2169	mutex_lock(&xps_map_mutex);
 2170
 2171	if (static_key_false(&xps_rxqs_needed)) {
 2172		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2173		if (dev_maps) {
 2174			nr_ids = dev->num_rx_queues;
 2175			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 2176				       offset, count, true);
 2177		}
 2178	}
 2179
 2180	dev_maps = xmap_dereference(dev->xps_cpus_map);
 2181	if (!dev_maps)
 2182		goto out_no_maps;
 2183
 2184	if (num_possible_cpus() > 1)
 2185		possible_mask = cpumask_bits(cpu_possible_mask);
 2186	nr_ids = nr_cpu_ids;
 2187	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 2188		       false);
 2189
 2190out_no_maps:
 2191	mutex_unlock(&xps_map_mutex);
 2192	cpus_read_unlock();
 2193}
 2194
 2195static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2196{
 2197	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2198}
 2199
 2200static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2201				      u16 index, bool is_rxqs_map)
 2202{
 2203	struct xps_map *new_map;
 2204	int alloc_len = XPS_MIN_MAP_ALLOC;
 2205	int i, pos;
 2206
 2207	for (pos = 0; map && pos < map->len; pos++) {
 2208		if (map->queues[pos] != index)
 2209			continue;
 2210		return map;
 2211	}
 2212
 2213	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2214	if (map) {
 2215		if (pos < map->alloc_len)
 2216			return map;
 2217
 2218		alloc_len = map->alloc_len * 2;
 2219	}
 2220
 2221	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2222	 * map
 2223	 */
 2224	if (is_rxqs_map)
 2225		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2226	else
 2227		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2228				       cpu_to_node(attr_index));
 2229	if (!new_map)
 2230		return NULL;
 2231
 2232	for (i = 0; i < pos; i++)
 2233		new_map->queues[i] = map->queues[i];
 2234	new_map->alloc_len = alloc_len;
 2235	new_map->len = pos;
 2236
 2237	return new_map;
 2238}
 2239
 2240/* Must be called under cpus_read_lock */
 2241int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 2242			  u16 index, bool is_rxqs_map)
 2243{
 2244	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 2245	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
 2246	int i, j, tci, numa_node_id = -2;
 2247	int maps_sz, num_tc = 1, tc = 0;
 2248	struct xps_map *map, *new_map;
 2249	bool active = false;
 2250	unsigned int nr_ids;
 2251
 2252	if (dev->num_tc) {
 2253		/* Do not allow XPS on subordinate device directly */
 2254		num_tc = dev->num_tc;
 2255		if (num_tc < 0)
 2256			return -EINVAL;
 2257
 2258		/* If queue belongs to subordinate dev use its map */
 2259		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 2260
 2261		tc = netdev_txq_to_tc(dev, index);
 2262		if (tc < 0)
 2263			return -EINVAL;
 2264	}
 2265
 2266	mutex_lock(&xps_map_mutex);
 2267	if (is_rxqs_map) {
 2268		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 2269		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2270		nr_ids = dev->num_rx_queues;
 2271	} else {
 2272		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 2273		if (num_possible_cpus() > 1) {
 2274			online_mask = cpumask_bits(cpu_online_mask);
 2275			possible_mask = cpumask_bits(cpu_possible_mask);
 2276		}
 2277		dev_maps = xmap_dereference(dev->xps_cpus_map);
 2278		nr_ids = nr_cpu_ids;
 2279	}
 2280
 2281	if (maps_sz < L1_CACHE_BYTES)
 2282		maps_sz = L1_CACHE_BYTES;
 2283
 2284	/* allocate memory for queue storage */
 2285	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
 2286	     j < nr_ids;) {
 2287		if (!new_dev_maps)
 2288			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 2289		if (!new_dev_maps) {
 2290			mutex_unlock(&xps_map_mutex);
 2291			return -ENOMEM;
 2292		}
 2293
 2294		tci = j * num_tc + tc;
 2295		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 2296				 NULL;
 2297
 2298		map = expand_xps_map(map, j, index, is_rxqs_map);
 2299		if (!map)
 2300			goto error;
 2301
 2302		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2303	}
 2304
 2305	if (!new_dev_maps)
 2306		goto out_no_new_maps;
 2307
 2308	if (!dev_maps) {
 2309		/* Increment static keys at most once per type */
 2310		static_key_slow_inc_cpuslocked(&xps_needed);
 2311		if (is_rxqs_map)
 2312			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 2313	}
 2314
 2315	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2316	     j < nr_ids;) {
 2317		/* copy maps belonging to foreign traffic classes */
 2318		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 2319			/* fill in the new device map from the old device map */
 2320			map = xmap_dereference(dev_maps->attr_map[tci]);
 2321			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2322		}
 2323
 2324		/* We need to explicitly update tci as the previous loop
 2325		 * could break out early if dev_maps is NULL.
 2326		 */
 2327		tci = j * num_tc + tc;
 2328
 2329		if (netif_attr_test_mask(j, mask, nr_ids) &&
 2330		    netif_attr_test_online(j, online_mask, nr_ids)) {
 2331			/* add tx-queue to CPU/rx-queue maps */
 2332			int pos = 0;
 2333
 2334			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2335			while ((pos < map->len) && (map->queues[pos] != index))
 2336				pos++;
 2337
 2338			if (pos == map->len)
 2339				map->queues[map->len++] = index;
 2340#ifdef CONFIG_NUMA
 2341			if (!is_rxqs_map) {
 2342				if (numa_node_id == -2)
 2343					numa_node_id = cpu_to_node(j);
 2344				else if (numa_node_id != cpu_to_node(j))
 2345					numa_node_id = -1;
 2346			}
 2347#endif
 2348		} else if (dev_maps) {
 2349			/* fill in the new device map from the old device map */
 2350			map = xmap_dereference(dev_maps->attr_map[tci]);
 2351			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2352		}
 2353
 2354		/* copy maps belonging to foreign traffic classes */
 2355		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 2356			/* fill in the new device map from the old device map */
 2357			map = xmap_dereference(dev_maps->attr_map[tci]);
 2358			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2359		}
 2360	}
 2361
 2362	if (is_rxqs_map)
 2363		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
 2364	else
 2365		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 2366
 2367	/* Cleanup old maps */
 2368	if (!dev_maps)
 2369		goto out_no_old_maps;
 2370
 2371	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2372	     j < nr_ids;) {
 2373		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2374			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2375			map = xmap_dereference(dev_maps->attr_map[tci]);
 2376			if (map && map != new_map)
 2377				kfree_rcu(map, rcu);
 2378		}
 2379	}
 2380
 2381	kfree_rcu(dev_maps, rcu);
 2382
 2383out_no_old_maps:
 2384	dev_maps = new_dev_maps;
 2385	active = true;
 2386
 2387out_no_new_maps:
 2388	if (!is_rxqs_map) {
 2389		/* update Tx queue numa node */
 2390		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
 2391					     (numa_node_id >= 0) ?
 2392					     numa_node_id : NUMA_NO_NODE);
 2393	}
 2394
 2395	if (!dev_maps)
 2396		goto out_no_maps;
 2397
 2398	/* removes tx-queue from unused CPUs/rx-queues */
 2399	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2400	     j < nr_ids;) {
 2401		for (i = tc, tci = j * num_tc; i--; tci++)
 2402			active |= remove_xps_queue(dev_maps, tci, index);
 2403		if (!netif_attr_test_mask(j, mask, nr_ids) ||
 2404		    !netif_attr_test_online(j, online_mask, nr_ids))
 2405			active |= remove_xps_queue(dev_maps, tci, index);
 2406		for (i = num_tc - tc, tci++; --i; tci++)
 2407			active |= remove_xps_queue(dev_maps, tci, index);
 2408	}
 2409
 2410	/* free map if not active */
 2411	if (!active)
 2412		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2413
 2414out_no_maps:
 2415	mutex_unlock(&xps_map_mutex);
 2416
 2417	return 0;
 2418error:
 2419	/* remove any maps that we added */
 2420	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2421	     j < nr_ids;) {
 2422		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2423			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2424			map = dev_maps ?
 2425			      xmap_dereference(dev_maps->attr_map[tci]) :
 2426			      NULL;
 2427			if (new_map && new_map != map)
 2428				kfree(new_map);
 2429		}
 2430	}
 2431
 2432	mutex_unlock(&xps_map_mutex);
 2433
 2434	kfree(new_dev_maps);
 2435	return -ENOMEM;
 2436}
 2437EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2438
 2439int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2440			u16 index)
 2441{
 2442	int ret;
 2443
 2444	cpus_read_lock();
 2445	ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 2446	cpus_read_unlock();
 2447
 2448	return ret;
 2449}
 2450EXPORT_SYMBOL(netif_set_xps_queue);
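
/*
 * Editor's illustrative sketch (not part of dev.c): a multiqueue driver
 * might pin each Tx queue to a single CPU via netif_set_xps_queue().
 * sample_setup_xps() is an invented name.
 */
static void sample_setup_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	int i;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		cpumask_clear(mask);
		cpumask_set_cpu(i % num_online_cpus(), mask);
		/* Tx queue i is now preferred for traffic from that CPU. */
		netif_set_xps_queue(dev, mask, i);
	}

	free_cpumask_var(mask);
}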
 2451
 2452#endif
 2453static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2454{
 2455	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2456
 2457	/* Unbind any subordinate channels */
 2458	while (txq-- != &dev->_tx[0]) {
 2459		if (txq->sb_dev)
 2460			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2461	}
 2462}
 2463
 2464void netdev_reset_tc(struct net_device *dev)
 2465{
 2466#ifdef CONFIG_XPS
 2467	netif_reset_xps_queues_gt(dev, 0);
 2468#endif
 2469	netdev_unbind_all_sb_channels(dev);
 2470
 2471	/* Reset TC configuration of device */
 2472	dev->num_tc = 0;
 2473	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2474	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2475}
 2476EXPORT_SYMBOL(netdev_reset_tc);
 2477
 2478int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2479{
 2480	if (tc >= dev->num_tc)
 2481		return -EINVAL;
 2482
 2483#ifdef CONFIG_XPS
 2484	netif_reset_xps_queues(dev, offset, count);
 2485#endif
 2486	dev->tc_to_txq[tc].count = count;
 2487	dev->tc_to_txq[tc].offset = offset;
 2488	return 0;
 2489}
 2490EXPORT_SYMBOL(netdev_set_tc_queue);
 2491
 2492int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2493{
 2494	if (num_tc > TC_MAX_QUEUE)
 2495		return -EINVAL;
 2496
 2497#ifdef CONFIG_XPS
 2498	netif_reset_xps_queues_gt(dev, 0);
 2499#endif
 2500	netdev_unbind_all_sb_channels(dev);
 2501
 2502	dev->num_tc = num_tc;
 2503	return 0;
 2504}
 2505EXPORT_SYMBOL(netdev_set_num_tc);
 2506
 2507void netdev_unbind_sb_channel(struct net_device *dev,
 2508			      struct net_device *sb_dev)
 2509{
 2510	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2511
 2512#ifdef CONFIG_XPS
 2513	netif_reset_xps_queues_gt(sb_dev, 0);
 2514#endif
 2515	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2516	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2517
 2518	while (txq-- != &dev->_tx[0]) {
 2519		if (txq->sb_dev == sb_dev)
 2520			txq->sb_dev = NULL;
 2521	}
 2522}
 2523EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2524
 2525int netdev_bind_sb_channel_queue(struct net_device *dev,
 2526				 struct net_device *sb_dev,
 2527				 u8 tc, u16 count, u16 offset)
 2528{
 2529	/* Make certain the sb_dev and dev are already configured */
 2530	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2531		return -EINVAL;
 2532
 2533	/* We cannot hand out queues we don't have */
 2534	if ((offset + count) > dev->real_num_tx_queues)
 2535		return -EINVAL;
 2536
 2537	/* Record the mapping */
 2538	sb_dev->tc_to_txq[tc].count = count;
 2539	sb_dev->tc_to_txq[tc].offset = offset;
 2540
 2541	/* Provide a way for Tx queue to find the tc_to_txq map or
 2542	 * XPS map for itself.
 2543	 */
 2544	while (count--)
 2545		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2546
 2547	return 0;
 2548}
 2549EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2550
 2551int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2552{
 2553	/* Do not use a multiqueue device to represent a subordinate channel */
 2554	if (netif_is_multiqueue(dev))
 2555		return -ENODEV;
 2556
 2557	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2558	 * Channel 0 is meant to be "native" mode and used only to represent
 2559	 * the main root device. We allow writing 0 to reset the device back
 2560	 * to normal mode after being used as a subordinate channel.
 2561	 */
 2562	if (channel > S16_MAX)
 2563		return -EINVAL;
 2564
 2565	dev->num_tc = -channel;
 2566
 2567	return 0;
 2568}
 2569EXPORT_SYMBOL(netdev_set_sb_channel);
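
/*
 * Editor's note (illustrative): netdev_set_sb_channel() reuses num_tc with a
 * negative encoding, so channel 5 is stored as dev->num_tc == -5.  That sign
 * is what netdev_bind_sb_channel_queue() above checks with
 * "sb_dev->num_tc >= 0" to ensure the subordinate device was configured as a
 * channel first.
 */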
 2570
 2571/*
 2572 * Routine to help set real_num_tx_queues. To avoid stale skbs mapped to
 2573 * queues greater than real_num_tx_queues, the qdiscs must be flushed.
 2574 */
 2575int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2576{
 2577	bool disabling;
 2578	int rc;
 2579
 2580	disabling = txq < dev->real_num_tx_queues;
 2581
 2582	if (txq < 1 || txq > dev->num_tx_queues)
 2583		return -EINVAL;
 2584
 2585	if (dev->reg_state == NETREG_REGISTERED ||
 2586	    dev->reg_state == NETREG_UNREGISTERING) {
 2587		ASSERT_RTNL();
 2588
 2589		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2590						  txq);
 2591		if (rc)
 2592			return rc;
 2593
 2594		if (dev->num_tc)
 2595			netif_setup_tc(dev, txq);
 2596
 2597		dev->real_num_tx_queues = txq;
 2598
 2599		if (disabling) {
 2600			synchronize_net();
 2601			qdisc_reset_all_tx_gt(dev, txq);
 2602#ifdef CONFIG_XPS
 2603			netif_reset_xps_queues_gt(dev, txq);
 2604#endif
 2605		}
 2606	} else {
 2607		dev->real_num_tx_queues = txq;
 2608	}
 2609
 2610	return 0;
 2611}
 2612EXPORT_SYMBOL(netif_set_real_num_tx_queues);
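
/*
 * Editor's illustrative sketch (not part of dev.c): a driver shrinking its
 * active Tx queue set after a channel reconfiguration.  sample_set_channels()
 * is an invented name; holding the RTNL around the call is required once the
 * device is registered.
 */
static int sample_set_channels(struct net_device *dev, unsigned int txq)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, txq);
	rtnl_unlock();

	return err;
}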
 2613
 2614#ifdef CONFIG_SYSFS
 2615/**
 2616 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2617 *	@dev: Network device
 2618 *	@rxq: Actual number of RX queues
 2619 *
 2620 *	This must be called either with the rtnl_lock held or before
 2621 *	registration of the net device.  Returns 0 on success, or a
 2622 *	negative error code.  If called before registration, it always
 2623 *	succeeds.
 2624 */
 2625int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2626{
 2627	int rc;
 2628
 2629	if (rxq < 1 || rxq > dev->num_rx_queues)
 2630		return -EINVAL;
 2631
 2632	if (dev->reg_state == NETREG_REGISTERED) {
 2633		ASSERT_RTNL();
 2634
 2635		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 2636						  rxq);
 2637		if (rc)
 2638			return rc;
 2639	}
 2640
 2641	dev->real_num_rx_queues = rxq;
 2642	return 0;
 2643}
 2644EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 2645#endif
 2646
 2647/**
 2648 * netif_get_num_default_rss_queues - default number of RSS queues
 2649 *
 2650 * This routine should set an upper limit on the number of RSS queues
 2651 * used by default by multiqueue devices.
 2652 */
 2653int netif_get_num_default_rss_queues(void)
 2654{
 2655	return is_kdump_kernel() ?
 2656		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 2657}
 2658EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 2659
 2660static void __netif_reschedule(struct Qdisc *q)
 2661{
 2662	struct softnet_data *sd;
 2663	unsigned long flags;
 2664
 2665	local_irq_save(flags);
 2666	sd = this_cpu_ptr(&softnet_data);
 2667	q->next_sched = NULL;
 2668	*sd->output_queue_tailp = q;
 2669	sd->output_queue_tailp = &q->next_sched;
 2670	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 2671	local_irq_restore(flags);
 2672}
 2673
 2674void __netif_schedule(struct Qdisc *q)
 2675{
 2676	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 2677		__netif_reschedule(q);
 2678}
 2679EXPORT_SYMBOL(__netif_schedule);
 2680
 2681struct dev_kfree_skb_cb {
 2682	enum skb_free_reason reason;
 2683};
 2684
 2685static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 2686{
 2687	return (struct dev_kfree_skb_cb *)skb->cb;
 2688}
 2689
 2690void netif_schedule_queue(struct netdev_queue *txq)
 2691{
 2692	rcu_read_lock();
 2693	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
 2694		struct Qdisc *q = rcu_dereference(txq->qdisc);
 2695
 2696		__netif_schedule(q);
 2697	}
 2698	rcu_read_unlock();
 2699}
 2700EXPORT_SYMBOL(netif_schedule_queue);
 2701
 2702void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 2703{
 2704	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 2705		struct Qdisc *q;
 2706
 2707		rcu_read_lock();
 2708		q = rcu_dereference(dev_queue->qdisc);
 2709		__netif_schedule(q);
 2710		rcu_read_unlock();
 2711	}
 2712}
 2713EXPORT_SYMBOL(netif_tx_wake_queue);
 2714
 2715void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 2716{
 2717	unsigned long flags;
 2718
 2719	if (unlikely(!skb))
 2720		return;
 2721
 2722	if (likely(refcount_read(&skb->users) == 1)) {
 2723		smp_rmb();
 2724		refcount_set(&skb->users, 0);
 2725	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 2726		return;
 2727	}
 2728	get_kfree_skb_cb(skb)->reason = reason;
 2729	local_irq_save(flags);
 2730	skb->next = __this_cpu_read(softnet_data.completion_queue);
 2731	__this_cpu_write(softnet_data.completion_queue, skb);
 2732	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 2733	local_irq_restore(flags);
 2734}
 2735EXPORT_SYMBOL(__dev_kfree_skb_irq);
 2736
 2737void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 2738{
 2739	if (in_irq() || irqs_disabled())
 2740		__dev_kfree_skb_irq(skb, reason);
 2741	else
 2742		dev_kfree_skb(skb);
 2743}
 2744EXPORT_SYMBOL(__dev_kfree_skb_any);
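
/*
 * Editor's note (illustrative): drivers pick the freeing variant by context.
 * dev_kfree_skb_irq() is safe from hard-irq Tx-completion handlers (the code
 * above only queues the skb for the NET_TX softirq), dev_kfree_skb() is for
 * process/BH context, and dev_kfree_skb_any() is for callers that may run in
 * either.
 */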
 2745
 2746
 2747/**
 2748 * netif_device_detach - mark device as removed
 2749 * @dev: network device
 2750 *
 2751 * Mark device as removed from system and therefore no longer available.
 2752 */
 2753void netif_device_detach(struct net_device *dev)
 2754{
 2755	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 2756	    netif_running(dev)) {
 2757		netif_tx_stop_all_queues(dev);
 2758	}
 2759}
 2760EXPORT_SYMBOL(netif_device_detach);
 2761
 2762/**
 2763 * netif_device_attach - mark device as attached
 2764 * @dev: network device
 2765 *
 2766 * Mark device as attached to the system and restart it if needed.
 2767 */
 2768void netif_device_attach(struct net_device *dev)
 2769{
 2770	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 2771	    netif_running(dev)) {
 2772		netif_tx_wake_all_queues(dev);
 2773		__netdev_watchdog_up(dev);
 2774	}
 2775}
 2776EXPORT_SYMBOL(netif_device_attach);
 2777
 2778/*
 2779 * Returns a Tx hash based on the given packet descriptor and a Tx queue
 2780 * count to be used as a distribution range.
 2781 */
 2782static u16 skb_tx_hash(const struct net_device *dev,
 2783		       const struct net_device *sb_dev,
 2784		       struct sk_buff *skb)
 2785{
 2786	u32 hash;
 2787	u16 qoffset = 0;
 2788	u16 qcount = dev->real_num_tx_queues;
 2789
 2790	if (dev->num_tc) {
 2791		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 2792
 2793		qoffset = sb_dev->tc_to_txq[tc].offset;
 2794		qcount = sb_dev->tc_to_txq[tc].count;
 2795	}
 2796
 2797	if (skb_rx_queue_recorded(skb)) {
 2798		hash = skb_get_rx_queue(skb);
 2799		while (unlikely(hash >= qcount))
 2800			hash -= qcount;
 2801		return hash + qoffset;
 2802	}
 2803
 2804	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 2805}
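
/*
 * Editor's worked example (illustrative): reciprocal_scale(hash, qcount)
 * computes ((u64)hash * qcount) >> 32, a multiplicative alternative to
 * "hash % qcount".  With qcount = 8 and skb_get_hash() = 0x9e3779b9, that
 * is (0x9e3779b9ULL * 8) >> 32 = 4, so the skb maps to queue 4 + qoffset.
 */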
 2806
 2807static void skb_warn_bad_offload(const struct sk_buff *skb)
 2808{
 2809	static const netdev_features_t null_features;
 2810	struct net_device *dev = skb->dev;
 2811	const char *name = "";
 2812
 2813	if (!net_ratelimit())
 2814		return;
 2815
 2816	if (dev) {
 2817		if (dev->dev.parent)
 2818			name = dev_driver_string(dev->dev.parent);
 2819		else
 2820			name = netdev_name(dev);
 2821	}
 2822	skb_dump(KERN_WARNING, skb, false);
 2823	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 2824	     name, dev ? &dev->features : &null_features,
 2825	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 2826}
 2827
 2828/*
 2829 * Invalidate hardware checksum when packet is to be mangled, and
 2830 * complete checksum manually on outgoing path.
 2831 */
 2832int skb_checksum_help(struct sk_buff *skb)
 2833{
 2834	__wsum csum;
 2835	int ret = 0, offset;
 2836
 2837	if (skb->ip_summed == CHECKSUM_COMPLETE)
 2838		goto out_set_summed;
 2839
 2840	if (unlikely(skb_shinfo(skb)->gso_size)) {
 2841		skb_warn_bad_offload(skb);
 2842		return -EINVAL;
 2843	}
 2844
 2845	/* Before computing a checksum, we should make sure no frag could
 2846	 * be modified by an external entity : checksum could be wrong.
 2847	 */
 2848	if (skb_has_shared_frag(skb)) {
 2849		ret = __skb_linearize(skb);
 2850		if (ret)
 2851			goto out;
 2852	}
 2853
 2854	offset = skb_checksum_start_offset(skb);
 2855	BUG_ON(offset >= skb_headlen(skb));
 2856	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 2857
 2858	offset += skb->csum_offset;
 2859	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
 2860
 2861	if (skb_cloned(skb) &&
 2862	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
 2863		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
 2864		if (ret)
 2865			goto out;
 2866	}
 2867
 2868	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 2869out_set_summed:
 2870	skb->ip_summed = CHECKSUM_NONE;
 2871out:
 2872	return ret;
 2873}
 2874EXPORT_SYMBOL(skb_checksum_help);
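
/*
 * Editor's note (illustrative): csum_fold() collapses the 32-bit partial sum
 * into the final 16-bit ones'-complement checksum.  A computed checksum of
 * zero must go out on the wire as 0xffff, because a raw 0 means "no checksum"
 * for UDP; the "?: CSUM_MANGLED_0" above performs that substitution.
 */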
 2875
 2876int skb_crc32c_csum_help(struct sk_buff *skb)
 2877{
 2878	__le32 crc32c_csum;
 2879	int ret = 0, offset, start;
 2880
 2881	if (skb->ip_summed != CHECKSUM_PARTIAL)
 2882		goto out;
 2883
 2884	if (unlikely(skb_is_gso(skb)))
 2885		goto out;
 2886
 2887	/* Before computing a checksum, we should make sure no frag could
 2888	 * be modified by an external entity : checksum could be wrong.
 2889	 */
 2890	if (unlikely(skb_has_shared_frag(skb))) {
 2891		ret = __skb_linearize(skb);
 2892		if (ret)
 2893			goto out;
 2894	}
 2895	start = skb_checksum_start_offset(skb);
 2896	offset = start + offsetof(struct sctphdr, checksum);
 2897	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 2898		ret = -EINVAL;
 2899		goto out;
 2900	}
 2901	if (skb_cloned(skb) &&
 2902	    !skb_clone_writable(skb, offset + sizeof(__le32))) {
 2903		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
 2904		if (ret)
 2905			goto out;
 2906	}
 2907	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 2908						  skb->len - start, ~(__u32)0,
 2909						  crc32c_csum_stub));
 2910	*(__le32 *)(skb->data + offset) = crc32c_csum;
 2911	skb->ip_summed = CHECKSUM_NONE;
 2912	skb->csum_not_inet = 0;
 2913out:
 2914	return ret;
 2915}
 2916
 2917__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 2918{
 2919	__be16 type = skb->protocol;
 2920
 2921	/* Tunnel gso handlers can set protocol to ethernet. */
 2922	if (type == htons(ETH_P_TEB)) {
 2923		struct ethhdr *eth;
 2924
 2925		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 2926			return 0;
 2927
 2928		eth = (struct ethhdr *)skb->data;
 2929		type = eth->h_proto;
 2930	}
 2931
 2932	return __vlan_get_protocol(skb, type, depth);
 2933}
 2934
 2935/**
 2936 *	skb_mac_gso_segment - mac layer segmentation handler.
 2937 *	@skb: buffer to segment
 2938 *	@features: features for the output path (see dev->features)
 2939 */
 2940struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 2941				    netdev_features_t features)
 2942{
 2943	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 2944	struct packet_offload *ptype;
 2945	int vlan_depth = skb->mac_len;
 2946	__be16 type = skb_network_protocol(skb, &vlan_depth);
 2947
 2948	if (unlikely(!type))
 2949		return ERR_PTR(-EINVAL);
 2950
 2951	__skb_pull(skb, vlan_depth);
 2952
 2953	rcu_read_lock();
 2954	list_for_each_entry_rcu(ptype, &offload_base, list) {
 2955		if (ptype->type == type && ptype->callbacks.gso_segment) {
 2956			segs = ptype->callbacks.gso_segment(skb, features);
 2957			break;
 2958		}
 2959	}
 2960	rcu_read_unlock();
 2961
 2962	__skb_push(skb, skb->data - skb_mac_header(skb));
 2963
 2964	return segs;
 2965}
 2966EXPORT_SYMBOL(skb_mac_gso_segment);
 2967
 2968
 2969/* openvswitch calls this on rx path, so we need a different check.
 2970 */
 2971static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 2972{
 2973	if (tx_path)
 2974		return skb->ip_summed != CHECKSUM_PARTIAL &&
 2975		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 2976
 2977	return skb->ip_summed == CHECKSUM_NONE;
 2978}
 2979
 2980/**
 2981 *	__skb_gso_segment - Perform segmentation on skb.
 2982 *	@skb: buffer to segment
 2983 *	@features: features for the output path (see dev->features)
 2984 *	@tx_path: whether it is called in TX path
 2985 *
 2986 *	This function segments the given skb and returns a list of segments.
 2987 *
 2988 *	It may return NULL if the skb requires no segmentation.  This is
 2989 *	only possible when GSO is used for verifying header integrity.
 2990 *
 2991 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 2992 */
 2993struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 2994				  netdev_features_t features, bool tx_path)
 2995{
 2996	struct sk_buff *segs;
 2997
 2998	if (unlikely(skb_needs_check(skb, tx_path))) {
 2999		int err;
 3000
 3001		/* We're going to init ->check field in TCP or UDP header */
 3002		err = skb_cow_head(skb, 0);
 3003		if (err < 0)
 3004			return ERR_PTR(err);
 3005	}
 3006
 3007	/* Only report GSO partial support if it will enable us to
 3008	 * support segmentation on this frame without needing additional
 3009	 * work.
 3010	 */
 3011	if (features & NETIF_F_GSO_PARTIAL) {
 3012		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3013		struct net_device *dev = skb->dev;
 3014
 3015		partial_features |= dev->features & dev->gso_partial_features;
 3016		if (!skb_gso_ok(skb, features | partial_features))
 3017			features &= ~NETIF_F_GSO_PARTIAL;
 3018	}
 3019
 3020	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
 3021		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 3022
 3023	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3024	SKB_GSO_CB(skb)->encap_level = 0;
 3025
 3026	skb_reset_mac_header(skb);
 3027	skb_reset_mac_len(skb);
 3028
 3029	segs = skb_mac_gso_segment(skb, features);
 3030
 3031	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3032		skb_warn_bad_offload(skb);
 3033
 3034	return segs;
 3035}
 3036EXPORT_SYMBOL(__skb_gso_segment);
 3037
 3038/* Take action when hardware reception checksum errors are detected. */
 3039#ifdef CONFIG_BUG
 3040void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3041{
 3042	if (net_ratelimit()) {
 3043		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 3044		skb_dump(KERN_ERR, skb, true);
 3045		dump_stack();
 3046	}
 3047}
 3048EXPORT_SYMBOL(netdev_rx_csum_fault);
 3049#endif
 3050
 3051/* XXX: check that highmem exists at all on the given machine. */
 3052static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3053{
 3054#ifdef CONFIG_HIGHMEM
 3055	int i;
 3056
 3057	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3058		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3059			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3060
 3061			if (PageHighMem(skb_frag_page(frag)))
 3062				return 1;
 3063		}
 3064	}
 3065#endif
 3066	return 0;
 3067}
 3068
 3069/* For an MPLS offload request, verify we are testing hardware MPLS features
 3070 * instead of standard features for the netdev.
 3071 */
 3072#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3073static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3074					   netdev_features_t features,
 3075					   __be16 type)
 3076{
 3077	if (eth_p_mpls(type))
 3078		features &= skb->dev->mpls_features;
 3079
 3080	return features;
 3081}
 3082#else
 3083static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3084					   netdev_features_t features,
 3085					   __be16 type)
 3086{
 3087	return features;
 3088}
 3089#endif
 3090
 3091static netdev_features_t harmonize_features(struct sk_buff *skb,
 3092	netdev_features_t features)
 3093{
 3094	int tmp;
 3095	__be16 type;
 3096
 3097	type = skb_network_protocol(skb, &tmp);
 3098	features = net_mpls_features(skb, features, type);
 3099
 3100	if (skb->ip_summed != CHECKSUM_NONE &&
 3101	    !can_checksum_protocol(features, type)) {
 3102		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3103	}
 3104	if (illegal_highdma(skb->dev, skb))
 3105		features &= ~NETIF_F_SG;
 3106
 3107	return features;
 3108}
 3109
 3110netdev_features_t passthru_features_check(struct sk_buff *skb,
 3111					  struct net_device *dev,
 3112					  netdev_features_t features)
 3113{
 3114	return features;
 3115}
 3116EXPORT_SYMBOL(passthru_features_check);
 3117
 3118static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3119					     struct net_device *dev,
 3120					     netdev_features_t features)
 3121{
 3122	return vlan_features_check(skb, features);
 3123}
 3124
 3125static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3126					    struct net_device *dev,
 3127					    netdev_features_t features)
 3128{
 3129	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3130
 3131	if (gso_segs > dev->gso_max_segs)
 3132		return features & ~NETIF_F_GSO_MASK;
 3133
 3134	/* Support for GSO partial features requires software
 3135	 * intervention before we can actually process the packets,
 3136	 * so we need to strip support for any partial features now;
 3137	 * we can pull them back in after we have partially
 3138	 * segmented the frame.
 3139	 */
 3140	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3141		features &= ~dev->gso_partial_features;
 3142
 3143	/* Make sure to clear the IPv4 ID mangling feature if the
 3144	 * IPv4 header has the potential to be fragmented.
 3145	 */
 3146	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3147		struct iphdr *iph = skb->encapsulation ?
 3148				    inner_ip_hdr(skb) : ip_hdr(skb);
 3149
 3150		if (!(iph->frag_off & htons(IP_DF)))
 3151			features &= ~NETIF_F_TSO_MANGLEID;
 3152	}
 3153
 3154	return features;
 3155}
 3156
 3157netdev_features_t netif_skb_features(struct sk_buff *skb)
 3158{
 3159	struct net_device *dev = skb->dev;
 3160	netdev_features_t features = dev->features;
 3161
 3162	if (skb_is_gso(skb))
 3163		features = gso_features_check(skb, dev, features);
 3164
 3165	/* For an encapsulation offload request, verify we are testing
 3166	 * hardware encapsulation features instead of standard
 3167	 * features for the netdev.
 3168	 */
 3169	if (skb->encapsulation)
 3170		features &= dev->hw_enc_features;
 3171
 3172	if (skb_vlan_tagged(skb))
 3173		features = netdev_intersect_features(features,
 3174						     dev->vlan_features |
 3175						     NETIF_F_HW_VLAN_CTAG_TX |
 3176						     NETIF_F_HW_VLAN_STAG_TX);
 3177
 3178	if (dev->netdev_ops->ndo_features_check)
 3179		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3180								features);
 3181	else
 3182		features &= dflt_features_check(skb, dev, features);
 3183
 3184	return harmonize_features(skb, features);
 3185}
 3186EXPORT_SYMBOL(netif_skb_features);
 3187
 3188static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3189		    struct netdev_queue *txq, bool more)
 3190{
 3191	unsigned int len;
 3192	int rc;
 3193
 3194	if (dev_nit_active(dev))
 3195		dev_queue_xmit_nit(skb, dev);
 3196
 3197	len = skb->len;
 3198	trace_net_dev_start_xmit(skb, dev);
 3199	rc = netdev_start_xmit(skb, dev, txq, more);
 3200	trace_net_dev_xmit(skb, rc, dev, len);
 3201
 3202	return rc;
 3203}
 3204
 3205struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3206				    struct netdev_queue *txq, int *ret)
 3207{
 3208	struct sk_buff *skb = first;
 3209	int rc = NETDEV_TX_OK;
 3210
 3211	while (skb) {
 3212		struct sk_buff *next = skb->next;
 3213
 3214		skb_mark_not_on_list(skb);
 3215		rc = xmit_one(skb, dev, txq, next != NULL);
 3216		if (unlikely(!dev_xmit_complete(rc))) {
 3217			skb->next = next;
 3218			goto out;
 3219		}
 3220
 3221		skb = next;
 3222		if (netif_tx_queue_stopped(txq) && skb) {
 3223			rc = NETDEV_TX_BUSY;
 3224			break;
 3225		}
 3226	}
 3227
 3228out:
 3229	*ret = rc;
 3230	return skb;
 3231}
 3232
 3233static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3234					  netdev_features_t features)
 3235{
 3236	if (skb_vlan_tag_present(skb) &&
 3237	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3238		skb = __vlan_hwaccel_push_inside(skb);
 3239	return skb;
 3240}
 3241
 3242int skb_csum_hwoffload_help(struct sk_buff *skb,
 3243			    const netdev_features_t features)
 3244{
 3245	if (unlikely(skb->csum_not_inet))
 3246		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3247			skb_crc32c_csum_help(skb);
 3248
 3249	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 3250}
 3251EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3252
 3253static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3254{
 3255	netdev_features_t features;
 3256
 3257	features = netif_skb_features(skb);
 3258	skb = validate_xmit_vlan(skb, features);
 3259	if (unlikely(!skb))
 3260		goto out_null;
 3261
 3262	skb = sk_validate_xmit_skb(skb, dev);
 3263	if (unlikely(!skb))
 3264		goto out_null;
 3265
 3266	if (netif_needs_gso(skb, features)) {
 3267		struct sk_buff *segs;
 3268
 3269		segs = skb_gso_segment(skb, features);
 3270		if (IS_ERR(segs)) {
 3271			goto out_kfree_skb;
 3272		} else if (segs) {
 3273			consume_skb(skb);
 3274			skb = segs;
 3275		}
 3276	} else {
 3277		if (skb_needs_linearize(skb, features) &&
 3278		    __skb_linearize(skb))
 3279			goto out_kfree_skb;
 3280
 3281		/* If packet is not checksummed and device does not
 3282		 * support checksumming for this protocol, complete
 3283		 * checksumming here.
 3284		 */
 3285		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3286			if (skb->encapsulation)
 3287				skb_set_inner_transport_header(skb,
 3288							       skb_checksum_start_offset(skb));
 3289			else
 3290				skb_set_transport_header(skb,
 3291							 skb_checksum_start_offset(skb));
 3292			if (skb_csum_hwoffload_help(skb, features))
 3293				goto out_kfree_skb;
 3294		}
 3295	}
 3296
 3297	skb = validate_xmit_xfrm(skb, features, again);
 3298
 3299	return skb;
 3300
 3301out_kfree_skb:
 3302	kfree_skb(skb);
 3303out_null:
 3304	atomic_long_inc(&dev->tx_dropped);
 3305	return NULL;
 3306}
 3307
 3308struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3309{
 3310	struct sk_buff *next, *head = NULL, *tail;
 3311
 3312	for (; skb != NULL; skb = next) {
 3313		next = skb->next;
 3314		skb_mark_not_on_list(skb);
 3315
 3316		/* in case skb won't be segmented, point to itself */
 3317		skb->prev = skb;
 3318
 3319		skb = validate_xmit_skb(skb, dev, again);
 3320		if (!skb)
 3321			continue;
 3322
 3323		if (!head)
 3324			head = skb;
 3325		else
 3326			tail->next = skb;
 3327		/* If skb was segmented, skb->prev points to
 3328		 * the last segment. If not, it still contains skb.
 3329		 */
 3330		tail = skb->prev;
 3331	}
 3332	return head;
 3333}
 3334EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3335
 3336static void qdisc_pkt_len_init(struct sk_buff *skb)
 3337{
 3338	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3339
 3340	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3341
 3342	/* To get a more precise estimate of bytes sent on the wire,
 3343	 * we add to pkt_len the header size of all segments
 3344	 */
 3345	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3346		unsigned int hdr_len;
 3347		u16 gso_segs = shinfo->gso_segs;
 3348
 3349		/* mac layer + network layer */
 3350		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3351
 3352		/* + transport layer */
 3353		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3354			const struct tcphdr *th;
 3355			struct tcphdr _tcphdr;
 3356
 3357			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3358						sizeof(_tcphdr), &_tcphdr);
 3359			if (likely(th))
 3360				hdr_len += __tcp_hdrlen(th);
 3361		} else {
 3362			struct udphdr _udphdr;
 3363
 3364			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3365					       sizeof(_udphdr), &_udphdr))
 3366				hdr_len += sizeof(struct udphdr);
 3367		}
 3368
 3369		if (shinfo->gso_type & SKB_GSO_DODGY)
 3370			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3371						shinfo->gso_size);
 3372
 3373		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3374	}
 3375}
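
/*
 * Editor's worked example (illustrative): for a TSO skb with skb->len = 7306,
 * gso_size = 1448 and hdr_len = 66 (14 MAC + 20 IP + 32 TCP), five segments
 * hit the wire, so pkt_len = 7306 + (5 - 1) * 66 = 7570 bytes.
 */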
 3376
 3377static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 3378				 struct net_device *dev,
 3379				 struct netdev_queue *txq)
 3380{
 3381	spinlock_t *root_lock = qdisc_lock(q);
 3382	struct sk_buff *to_free = NULL;
 3383	bool contended;
 3384	int rc;
 3385
 3386	qdisc_calculate_pkt_len(skb, q);
 3387
 3388	if (q->flags & TCQ_F_NOLOCK) {
 3389		if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty &&
 3390		    qdisc_run_begin(q)) {
 3391			if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
 3392					      &q->state))) {
 3393				__qdisc_drop(skb, &to_free);
 3394				rc = NET_XMIT_DROP;
 3395				goto end_run;
 3396			}
 3397			qdisc_bstats_cpu_update(q, skb);
 3398
 3399			rc = NET_XMIT_SUCCESS;
 3400			if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
 3401				__qdisc_run(q);
 3402
 3403end_run:
 3404			qdisc_run_end(q);
 3405		} else {
 3406			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 3407			qdisc_run(q);
 3408		}
 3409
 3410		if (unlikely(to_free))
 3411			kfree_skb_list(to_free);
 3412		return rc;
 3413	}
 3414
 3415	/*
 3416	 * Heuristic to force contended enqueues to serialize on a
 3417	 * separate lock before trying to get qdisc main lock.
 3418	 * This permits qdisc->running owner to get the lock more
 3419	 * often and dequeue packets faster.
 3420	 */
 3421	contended = qdisc_is_running(q);
 3422	if (unlikely(contended))
 3423		spin_lock(&q->busylock);
 3424
 3425	spin_lock(root_lock);
 3426	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 3427		__qdisc_drop(skb, &to_free);
 3428		rc = NET_XMIT_DROP;
 3429	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 3430		   qdisc_run_begin(q)) {
 3431		/*
 3432		 * This is a work-conserving queue; there are no old skbs
 3433		 * waiting to be sent out; and the qdisc is not running -
 3434		 * xmit the skb directly.
 3435		 */
 3436
 3437		qdisc_bstats_update(q, skb);
 3438
 3439		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
 3440			if (unlikely(contended)) {
 3441				spin_unlock(&q->busylock);
 3442				contended = false;
 3443			}
 3444			__qdisc_run(q);
 3445		}
 3446
 3447		qdisc_run_end(q);
 3448		rc = NET_XMIT_SUCCESS;
 3449	} else {
 3450		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 3451		if (qdisc_run_begin(q)) {
 3452			if (unlikely(contended)) {
 3453				spin_unlock(&q->busylock);
 3454				contended = false;
 3455			}
 3456			__qdisc_run(q);
 3457			qdisc_run_end(q);
 3458		}
 3459	}
 3460	spin_unlock(root_lock);
 3461	if (unlikely(to_free))
 3462		kfree_skb_list(to_free);
 3463	if (unlikely(contended))
 3464		spin_unlock(&q->busylock);
 3465	return rc;
 3466}
 3467
 3468#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3469static void skb_update_prio(struct sk_buff *skb)
 3470{
 3471	const struct netprio_map *map;
 3472	const struct sock *sk;
 3473	unsigned int prioidx;
 3474
 3475	if (skb->priority)
 3476		return;
 3477	map = rcu_dereference_bh(skb->dev->priomap);
 3478	if (!map)
 3479		return;
 3480	sk = skb_to_full_sk(skb);
 3481	if (!sk)
 3482		return;
 3483
 3484	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3485
 3486	if (prioidx < map->priomap_len)
 3487		skb->priority = map->priomap[prioidx];
 3488}
 3489#else
 3490#define skb_update_prio(skb)
 3491#endif
 3492
 3493/**
 3494 *	dev_loopback_xmit - loop back @skb
 3495 *	@net: network namespace this loopback is happening in
 3496 *	@sk:  sk needed to be a netfilter okfn
 3497 *	@skb: buffer to transmit
 3498 */
 3499int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 3500{
 3501	skb_reset_mac_header(skb);
 3502	__skb_pull(skb, skb_network_offset(skb));
 3503	skb->pkt_type = PACKET_LOOPBACK;
 3504	skb->ip_summed = CHECKSUM_UNNECESSARY;
 3505	WARN_ON(!skb_dst(skb));
 3506	skb_dst_force(skb);
 3507	netif_rx_ni(skb);
 3508	return 0;
 3509}
 3510EXPORT_SYMBOL(dev_loopback_xmit);
 3511
 3512#ifdef CONFIG_NET_EGRESS
 3513static struct sk_buff *
 3514sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 3515{
 3516	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 3517	struct tcf_result cl_res;
 3518
 3519	if (!miniq)
 3520		return skb;
 3521
 3522	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 3523	mini_qdisc_bstats_cpu_update(miniq, skb);
 3524
 3525	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 3526	case TC_ACT_OK:
 3527	case TC_ACT_RECLASSIFY:
 3528		skb->tc_index = TC_H_MIN(cl_res.classid);
 3529		break;
 3530	case TC_ACT_SHOT:
 3531		mini_qdisc_qstats_cpu_drop(miniq);
 3532		*ret = NET_XMIT_DROP;
 3533		kfree_skb(skb);
 3534		return NULL;
 3535	case TC_ACT_STOLEN:
 3536	case TC_ACT_QUEUED:
 3537	case TC_ACT_TRAP:
 3538		*ret = NET_XMIT_SUCCESS;
 3539		consume_skb(skb);
 3540		return NULL;
 3541	case TC_ACT_REDIRECT:
 3542		/* No need to push/pop skb's mac_header here on egress! */
 3543		skb_do_redirect(skb);
 3544		*ret = NET_XMIT_SUCCESS;
 3545		return NULL;
 3546	default:
 3547		break;
 3548	}
 3549
 3550	return skb;
 3551}
 3552#endif /* CONFIG_NET_EGRESS */
 3553
 3554#ifdef CONFIG_XPS
 3555static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3556			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3557{
 3558	struct xps_map *map;
 3559	int queue_index = -1;
 3560
 3561	if (dev->num_tc) {
 3562		tci *= dev->num_tc;
 3563		tci += netdev_get_prio_tc_map(dev, skb->priority);
 3564	}
 3565
 3566	map = rcu_dereference(dev_maps->attr_map[tci]);
 3567	if (map) {
 3568		if (map->len == 1)
 3569			queue_index = map->queues[0];
 3570		else
 3571			queue_index = map->queues[reciprocal_scale(
 3572						skb_get_hash(skb), map->len)];
 3573		if (unlikely(queue_index >= dev->real_num_tx_queues))
 3574			queue_index = -1;
 3575	}
 3576	return queue_index;
 3577}
 3578#endif
 3579
 3580static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 3581			 struct sk_buff *skb)
 3582{
 3583#ifdef CONFIG_XPS
 3584	struct xps_dev_maps *dev_maps;
 3585	struct sock *sk = skb->sk;
 3586	int queue_index = -1;
 3587
 3588	if (!static_key_false(&xps_needed))
 3589		return -1;
 3590
 3591	rcu_read_lock();
 3592	if (!static_key_false(&xps_rxqs_needed))
 3593		goto get_cpus_map;
 3594
 3595	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 3596	if (dev_maps) {
 3597		int tci = sk_rx_queue_get(sk);
 3598
 3599		if (tci >= 0 && tci < dev->num_rx_queues)
 3600			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3601							  tci);
 3602	}
 3603
 3604get_cpus_map:
 3605	if (queue_index < 0) {
 3606		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 3607		if (dev_maps) {
 3608			unsigned int tci = skb->sender_cpu - 1;
 3609
 3610			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3611							  tci);
 3612		}
 3613	}
 3614	rcu_read_unlock();
 3615
 3616	return queue_index;
 3617#else
 3618	return -1;
 3619#endif
 3620}
 3621
 3622u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 3623		     struct net_device *sb_dev)
 3624{
 3625	return 0;
 3626}
 3627EXPORT_SYMBOL(dev_pick_tx_zero);
 3628
 3629u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 3630		       struct net_device *sb_dev)
 3631{
 3632	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 3633}
 3634EXPORT_SYMBOL(dev_pick_tx_cpu_id);
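
/*
 * Editor's illustrative sketch (not part of dev.c): a driver wiring one of
 * the helpers above as its queue-selection callback; the signature matches
 * .ndo_select_queue as of v5.4.  sample_select_queue() is an invented name.
 */
static u16 sample_select_queue(struct net_device *dev, struct sk_buff *skb,
			       struct net_device *sb_dev)
{
	/* One Tx queue per current CPU: cheap, but ignores flow affinity. */
	return dev_pick_tx_cpu_id(dev, skb, sb_dev);
}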
 3635
 3636u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 3637		     struct net_device *sb_dev)
 3638{
 3639	struct sock *sk = skb->sk;
 3640	int queue_index = sk_tx_queue_get(sk);
 3641
 3642	sb_dev = sb_dev ? : dev;
 3643
 3644	if (queue_index < 0 || skb->ooo_okay ||
 3645	    queue_index >= dev->real_num_tx_queues) {
 3646		int new_index = get_xps_queue(dev, sb_dev, skb);
 3647
 3648		if (new_index < 0)
 3649			new_index = skb_tx_hash(dev, sb_dev, skb);
 3650
 3651		if (queue_index != new_index && sk &&
 3652		    sk_fullsock(sk) &&
 3653		    rcu_access_pointer(sk->sk_dst_cache))
 3654			sk_tx_queue_set(sk, new_index);
 3655
 3656		queue_index = new_index;
 3657	}
 3658
 3659	return queue_index;
 3660}
 3661EXPORT_SYMBOL(netdev_pick_tx);
 3662
 3663struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 3664					 struct sk_buff *skb,
 3665					 struct net_device *sb_dev)
 3666{
 3667	int queue_index = 0;
 3668
 3669#ifdef CONFIG_XPS
 3670	u32 sender_cpu = skb->sender_cpu - 1;
 3671
 3672	if (sender_cpu >= (u32)NR_CPUS)
 3673		skb->sender_cpu = raw_smp_processor_id() + 1;
 3674#endif
 3675
 3676	if (dev->real_num_tx_queues != 1) {
 3677		const struct net_device_ops *ops = dev->netdev_ops;
 3678
 3679		if (ops->ndo_select_queue)
 3680			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 3681		else
 3682			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 3683
 3684		queue_index = netdev_cap_txqueue(dev, queue_index);
 3685	}
 3686
 3687	skb_set_queue_mapping(skb, queue_index);
 3688	return netdev_get_tx_queue(dev, queue_index);
 3689}
 3690
 3691/**
 3692 *	__dev_queue_xmit - transmit a buffer
 3693 *	@skb: buffer to transmit
 3694 *	@sb_dev: subordinate device used for L2 forwarding offload
 3695 *
 3696 *	Queue a buffer for transmission to a network device. The caller must
 3697 *	have set the device and priority and built the buffer before calling
 3698 *	this function. The function can be called from an interrupt.
 3699 *
 3700 *	A negative errno code is returned on a failure. A success does not
 3701 *	guarantee the frame will be transmitted as it may be dropped due
 3702 *	to congestion or traffic shaping.
 3703 *
 3704 * -----------------------------------------------------------------------------------
 3705 *      I notice this method can also return errors from the queue disciplines,
 3706 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 3707 *      be positive.
 3708 *
 3709 *      Regardless of the return value, the skb is consumed, so it is currently
 3710 *      difficult to retry a send to this method.  (You can bump the ref count
 3711 *      before sending to hold a reference for retry if you are careful.)
 3712 *
 3713 *      When calling this method, interrupts MUST be enabled.  This is because
 3714 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 3715 *          --BLG
 3716 */
 3717static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 3718{
 3719	struct net_device *dev = skb->dev;
 3720	struct netdev_queue *txq;
 3721	struct Qdisc *q;
 3722	int rc = -ENOMEM;
 3723	bool again = false;
 3724
 3725	skb_reset_mac_header(skb);
 3726
 3727	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 3728		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
 3729
 3730	/* Disable soft irqs for various locks below. Also
 3731	 * stops preemption for RCU.
 3732	 */
 3733	rcu_read_lock_bh();
 3734
 3735	skb_update_prio(skb);
 3736
 3737	qdisc_pkt_len_init(skb);
 3738#ifdef CONFIG_NET_CLS_ACT
 3739	skb->tc_at_ingress = 0;
 3740# ifdef CONFIG_NET_EGRESS
 3741	if (static_branch_unlikely(&egress_needed_key)) {
 3742		skb = sch_handle_egress(skb, &rc, dev);
 3743		if (!skb)
 3744			goto out;
 3745	}
 3746# endif
 3747#endif
 3748	/* If the device/qdisc doesn't need skb->dst, release it right now
 3749	 * while it's hot in this CPU's cache.
 3750	 */
 3751	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 3752		skb_dst_drop(skb);
 3753	else
 3754		skb_dst_force(skb);
 3755
 3756	txq = netdev_core_pick_tx(dev, skb, sb_dev);
 3757	q = rcu_dereference_bh(txq->qdisc);
 3758
 3759	trace_net_dev_queue(skb);
 3760	if (q->enqueue) {
 3761		rc = __dev_xmit_skb(skb, q, dev, txq);
 3762		goto out;
 3763	}
 3764
 3765	/* The device has no queue. Common case for software devices:
 3766	 * loopback, all the sorts of tunnels...
 3767	 *
 3768	 * Really, it is unlikely that netif_tx_lock protection is necessary
 3769	 * here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
 3770	 * counters.)
 3771	 * However, it is possible that they rely on the protection
 3772	 * we provide here.
 3773	 *
 3774	 * Check this and take the lock; it is not prone to deadlocks.
 3775	 * Or just use the noqueue qdisc, which is even simpler 8)
 3776	 */
 3777	if (dev->flags & IFF_UP) {
 3778		int cpu = smp_processor_id(); /* ok because BHs are off */
 3779
 3780		if (txq->xmit_lock_owner != cpu) {
 3781			if (dev_xmit_recursion())
 3782				goto recursion_alert;
 3783
 3784			skb = validate_xmit_skb(skb, dev, &again);
 3785			if (!skb)
 3786				goto out;
 3787
 3788			HARD_TX_LOCK(dev, txq, cpu);
 3789
 3790			if (!netif_xmit_stopped(txq)) {
 3791				dev_xmit_recursion_inc();
 3792				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
 3793				dev_xmit_recursion_dec();
 3794				if (dev_xmit_complete(rc)) {
 3795					HARD_TX_UNLOCK(dev, txq);
 3796					goto out;
 3797				}
 3798			}
 3799			HARD_TX_UNLOCK(dev, txq);
 3800			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
 3801					     dev->name);
 3802		} else {
 3803			/* Recursion is detected! It is possible,
 3804			 * unfortunately
 3805			 */
 3806recursion_alert:
 3807			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
 3808					     dev->name);
 3809		}
 3810	}
 3811
 3812	rc = -ENETDOWN;
 3813	rcu_read_unlock_bh();
 3814
 3815	atomic_long_inc(&dev->tx_dropped);
 3816	kfree_skb_list(skb);
 3817	return rc;
 3818out:
 3819	rcu_read_unlock_bh();
 3820	return rc;
 3821}
 3822
 3823int dev_queue_xmit(struct sk_buff *skb)
 3824{
 3825	return __dev_queue_xmit(skb, NULL);
 3826}
 3827EXPORT_SYMBOL(dev_queue_xmit);
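/* Illustrative sketch, assuming a hypothetical caller that has already
 * built a complete L2 frame: dev_queue_xmit() consumes the skb whether
 * or not it succeeds, and may return negative errnos or positive
 * NET_XMIT_* codes (see the note above __dev_queue_xmit()).
 */
static int example_send(struct net_device *dev, const void *frame,
			size_t len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, len);

	if (!skb)
		return -ENOMEM;

	skb_put_data(skb, frame, len);
	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_3);	/* assumed frame type */

	return dev_queue_xmit(skb);
}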
 3828
 3829int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 3830{
 3831	return __dev_queue_xmit(skb, sb_dev);
 3832}
 3833EXPORT_SYMBOL(dev_queue_xmit_accel);
 3834
 3835int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 3836{
 3837	struct net_device *dev = skb->dev;
 3838	struct sk_buff *orig_skb = skb;
 3839	struct netdev_queue *txq;
 3840	int ret = NETDEV_TX_BUSY;
 3841	bool again = false;
 3842
 3843	if (unlikely(!netif_running(dev) ||
 3844		     !netif_carrier_ok(dev)))
 3845		goto drop;
 3846
 3847	skb = validate_xmit_skb_list(skb, dev, &again);
 3848	if (skb != orig_skb)
 3849		goto drop;
 3850
 3851	skb_set_queue_mapping(skb, queue_id);
 3852	txq = skb_get_tx_queue(dev, skb);
 3853
 3854	local_bh_disable();
 3855
 3856	HARD_TX_LOCK(dev, txq, smp_processor_id());
 3857	if (!netif_xmit_frozen_or_drv_stopped(txq))
 3858		ret = netdev_start_xmit(skb, dev, txq, false);
 3859	HARD_TX_UNLOCK(dev, txq);
 3860
 3861	local_bh_enable();
 3862
 3863	if (!dev_xmit_complete(ret))
 3864		kfree_skb(skb);
 3865
 3866	return ret;
 3867drop:
 3868	atomic_long_inc(&dev->tx_dropped);
 3869	kfree_skb_list(skb);
 3870	return NET_XMIT_DROP;
 3871}
 3872EXPORT_SYMBOL(dev_direct_xmit);
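/* Illustrative sketch: dev_direct_xmit() bypasses the qdisc layer
 * entirely, as the AF_XDP transmit path does. The caller chooses the
 * TX queue index itself and must cope with NETDEV_TX_BUSY when that
 * queue is stopped; skb->dev is assumed to be set already.
 */
static int example_direct_send(struct sk_buff *skb, u16 queue_id)
{
	return dev_direct_xmit(skb, queue_id);
}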
 3873
 3874/*************************************************************************
 3875 *			Receiver routines
 3876 *************************************************************************/
 3877
 3878int netdev_max_backlog __read_mostly = 1000;
 3879EXPORT_SYMBOL(netdev_max_backlog);
 3880
 3881int netdev_tstamp_prequeue __read_mostly = 1;
 3882int netdev_budget __read_mostly = 300;
 3883unsigned int __read_mostly netdev_budget_usecs = 2000;
 3884int weight_p __read_mostly = 64;           /* old backlog weight */
 3885int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 3886int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 3887int dev_rx_weight __read_mostly = 64;
 3888int dev_tx_weight __read_mostly = 64;
 3889/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
 3890int gro_normal_batch __read_mostly = 8;
 3891
 3892/* Called with irq disabled */
 3893static inline void ____napi_schedule(struct softnet_data *sd,
 3894				     struct napi_struct *napi)
 3895{
 3896	list_add_tail(&napi->poll_list, &sd->poll_list);
 3897	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 3898}
 3899
 3900#ifdef CONFIG_RPS
 3901
 3902/* One global table that all flow-based protocols share. */
 3903struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 3904EXPORT_SYMBOL(rps_sock_flow_table);
 3905u32 rps_cpu_mask __read_mostly;
 3906EXPORT_SYMBOL(rps_cpu_mask);
 3907
 3908struct static_key_false rps_needed __read_mostly;
 3909EXPORT_SYMBOL(rps_needed);
 3910struct static_key_false rfs_needed __read_mostly;
 3911EXPORT_SYMBOL(rfs_needed);
 3912
 3913static struct rps_dev_flow *
 3914set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 3915	    struct rps_dev_flow *rflow, u16 next_cpu)
 3916{
 3917	if (next_cpu < nr_cpu_ids) {
 3918#ifdef CONFIG_RFS_ACCEL
 3919		struct netdev_rx_queue *rxqueue;
 3920		struct rps_dev_flow_table *flow_table;
 3921		struct rps_dev_flow *old_rflow;
 3922		u32 flow_id;
 3923		u16 rxq_index;
 3924		int rc;
 3925
 3926		/* Should we steer this flow to a different hardware queue? */
 3927		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 3928		    !(dev->features & NETIF_F_NTUPLE))
 3929			goto out;
 3930		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 3931		if (rxq_index == skb_get_rx_queue(skb))
 3932			goto out;
 3933
 3934		rxqueue = dev->_rx + rxq_index;
 3935		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 3936		if (!flow_table)
 3937			goto out;
 3938		flow_id = skb_get_hash(skb) & flow_table->mask;
 3939		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 3940							rxq_index, flow_id);
 3941		if (rc < 0)
 3942			goto out;
 3943		old_rflow = rflow;
 3944		rflow = &flow_table->flows[flow_id];
 3945		rflow->filter = rc;
 3946		if (old_rflow->filter == rflow->filter)
 3947			old_rflow->filter = RPS_NO_FILTER;
 3948	out:
 3949#endif
 3950		rflow->last_qtail =
 3951			per_cpu(softnet_data, next_cpu).input_queue_head;
 3952	}
 3953
 3954	rflow->cpu = next_cpu;
 3955	return rflow;
 3956}
 3957
 3958/*
 3959 * get_rps_cpu is called from netif_receive_skb and returns the target
 3960 * CPU from the RPS map of the receiving queue for a given skb.
 3961 * rcu_read_lock must be held on entry.
 3962 */
 3963static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 3964		       struct rps_dev_flow **rflowp)
 3965{
 3966	const struct rps_sock_flow_table *sock_flow_table;
 3967	struct netdev_rx_queue *rxqueue = dev->_rx;
 3968	struct rps_dev_flow_table *flow_table;
 3969	struct rps_map *map;
 3970	int cpu = -1;
 3971	u32 tcpu;
 3972	u32 hash;
 3973
 3974	if (skb_rx_queue_recorded(skb)) {
 3975		u16 index = skb_get_rx_queue(skb);
 3976
 3977		if (unlikely(index >= dev->real_num_rx_queues)) {
 3978			WARN_ONCE(dev->real_num_rx_queues > 1,
 3979				  "%s received packet on queue %u, but number "
 3980				  "of RX queues is %u\n",
 3981				  dev->name, index, dev->real_num_rx_queues);
 3982			goto done;
 3983		}
 3984		rxqueue += index;
 3985	}
 3986
 3987	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 3988
 3989	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 3990	map = rcu_dereference(rxqueue->rps_map);
 3991	if (!flow_table && !map)
 3992		goto done;
 3993
 3994	skb_reset_network_header(skb);
 3995	hash = skb_get_hash(skb);
 3996	if (!hash)
 3997		goto done;
 3998
 3999	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 4000	if (flow_table && sock_flow_table) {
 4001		struct rps_dev_flow *rflow;
 4002		u32 next_cpu;
 4003		u32 ident;
 4004
 4005		/* First check into global flow table if there is a match */
 4006		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
 4007		if ((ident ^ hash) & ~rps_cpu_mask)
 4008			goto try_rps;
 4009
 4010		next_cpu = ident & rps_cpu_mask;
 4011
 4012		/* OK, now we know there is a match,
 4013		 * we can look at the local (per receive queue) flow table
 4014		 */
 4015		rflow = &flow_table->flows[hash & flow_table->mask];
 4016		tcpu = rflow->cpu;
 4017
 4018		/*
 4019		 * If the desired CPU (where last recvmsg was done) is
 4020		 * different from current CPU (one in the rx-queue flow
 4021		 * table entry), switch if one of the following holds:
 4022		 *   - Current CPU is unset (>= nr_cpu_ids).
 4023		 *   - Current CPU is offline.
 4024		 *   - The current CPU's queue tail has advanced beyond the
 4025		 *     last packet that was enqueued using this table entry.
 4026		 *     This guarantees that all previous packets for the flow
 4027		 *     have been dequeued, thus preserving in order delivery.
 4028		 */
 4029		if (unlikely(tcpu != next_cpu) &&
 4030		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4031		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4032		      rflow->last_qtail)) >= 0)) {
 4033			tcpu = next_cpu;
 4034			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4035		}
 4036
 4037		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4038			*rflowp = rflow;
 4039			cpu = tcpu;
 4040			goto done;
 4041		}
 4042	}
 4043
 4044try_rps:
 4045
 4046	if (map) {
 4047		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4048		if (cpu_online(tcpu)) {
 4049			cpu = tcpu;
 4050			goto done;
 4051		}
 4052	}
 4053
 4054done:
 4055	return cpu;
 4056}
 4057
 4058#ifdef CONFIG_RFS_ACCEL
 4059
 4060/**
 4061 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4062 * @dev: Device on which the filter was set
 4063 * @rxq_index: RX queue index
 4064 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4065 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4066 *
 4067 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4068 * this function for each installed filter and remove the filters for
 4069 * which it returns %true.
 4070 */
 4071bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4072			 u32 flow_id, u16 filter_id)
 4073{
 4074	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4075	struct rps_dev_flow_table *flow_table;
 4076	struct rps_dev_flow *rflow;
 4077	bool expire = true;
 4078	unsigned int cpu;
 4079
 4080	rcu_read_lock();
 4081	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4082	if (flow_table && flow_id <= flow_table->mask) {
 4083		rflow = &flow_table->flows[flow_id];
 4084		cpu = READ_ONCE(rflow->cpu);
 4085		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4086		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4087			   rflow->last_qtail) <
 4088		     (int)(10 * flow_table->mask)))
 4089			expire = false;
 4090	}
 4091	rcu_read_unlock();
 4092	return expire;
 4093}
 4094EXPORT_SYMBOL(rps_may_expire_flow);
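/* Illustrative sketch of the periodic scan the kernel-doc above asks
 * ndo_rx_flow_steer() implementers to run. struct example_filter and
 * example_remove_hw_filter() stand in for hypothetical driver state.
 */
struct example_filter {
	bool	installed;
	u16	rxq_index;
	u32	flow_id;
	u16	filter_id;
};

static void example_remove_hw_filter(struct net_device *dev,
				     struct example_filter *f)
{
	/* Hypothetical: free the hardware steering entry. */
}

static void example_expire_rfs_filters(struct net_device *dev,
				       struct example_filter *tbl,
				       unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].installed)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			example_remove_hw_filter(dev, &tbl[i]);
			tbl[i].installed = false;
		}
	}
}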
 4095
 4096#endif /* CONFIG_RFS_ACCEL */
 4097
 4098/* Called from hardirq (IPI) context */
 4099static void rps_trigger_softirq(void *data)
 4100{
 4101	struct softnet_data *sd = data;
 4102
 4103	____napi_schedule(sd, &sd->backlog);
 4104	sd->received_rps++;
 4105}
 4106
 4107#endif /* CONFIG_RPS */
 4108
 4109/*
 4110 * Check whether this softnet_data structure belongs to another CPU.
 4111 * If so, queue it on our IPI list and return 1;
 4112 * otherwise return 0.
 4113 */
 4114static int rps_ipi_queued(struct softnet_data *sd)
 4115{
 4116#ifdef CONFIG_RPS
 4117	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4118
 4119	if (sd != mysd) {
 4120		sd->rps_ipi_next = mysd->rps_ipi_list;
 4121		mysd->rps_ipi_list = sd;
 4122
 4123		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4124		return 1;
 4125	}
 4126#endif /* CONFIG_RPS */
 4127	return 0;
 4128}
 4129
 4130#ifdef CONFIG_NET_FLOW_LIMIT
 4131int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 4132#endif
 4133
 4134static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4135{
 4136#ifdef CONFIG_NET_FLOW_LIMIT
 4137	struct sd_flow_limit *fl;
 4138	struct softnet_data *sd;
 4139	unsigned int old_flow, new_flow;
 4140
 4141	if (qlen < (netdev_max_backlog >> 1))
 4142		return false;
 4143
 4144	sd = this_cpu_ptr(&softnet_data);
 4145
 4146	rcu_read_lock();
 4147	fl = rcu_dereference(sd->flow_limit);
 4148	if (fl) {
 4149		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4150		old_flow = fl->history[fl->history_head];
 4151		fl->history[fl->history_head] = new_flow;
 4152
 4153		fl->history_head++;
 4154		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4155
 4156		if (likely(fl->buckets[old_flow]))
 4157			fl->buckets[old_flow]--;
 4158
 4159		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4160			fl->count++;
 4161			rcu_read_unlock();
 4162			return true;
 4163		}
 4164	}
 4165	rcu_read_unlock();
 4166#endif
 4167	return false;
 4168}
 4169
 4170/*
 4171 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
 4172 * queue (which may be a remote CPU's queue).
 4173 */
 4174static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4175			      unsigned int *qtail)
 4176{
 4177	struct softnet_data *sd;
 4178	unsigned long flags;
 4179	unsigned int qlen;
 4180
 4181	sd = &per_cpu(softnet_data, cpu);
 4182
 4183	local_irq_save(flags);
 4184
 4185	rps_lock(sd);
 4186	if (!netif_running(skb->dev))
 4187		goto drop;
 4188	qlen = skb_queue_len(&sd->input_pkt_queue);
 4189	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 4190		if (qlen) {
 4191enqueue:
 4192			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4193			input_queue_tail_incr_save(sd, qtail);
 4194			rps_unlock(sd);
 4195			local_irq_restore(flags);
 4196			return NET_RX_SUCCESS;
 4197		}
 4198
 4199		/* Schedule NAPI for the backlog device.
 4200		 * We can use a non-atomic operation since we own the queue lock.
 4201		 */
 4202		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
 4203			if (!rps_ipi_queued(sd))
 4204				____napi_schedule(sd, &sd->backlog);
 4205		}
 4206		goto enqueue;
 4207	}
 4208
 4209drop:
 4210	sd->dropped++;
 4211	rps_unlock(sd);
 4212
 4213	local_irq_restore(flags);
 4214
 4215	atomic_long_inc(&skb->dev->rx_dropped);
 4216	kfree_skb(skb);
 4217	return NET_RX_DROP;
 4218}
 4219
 4220static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4221{
 4222	struct net_device *dev = skb->dev;
 4223	struct netdev_rx_queue *rxqueue;
 4224
 4225	rxqueue = dev->_rx;
 4226
 4227	if (skb_rx_queue_recorded(skb)) {
 4228		u16 index = skb_get_rx_queue(skb);
 4229
 4230		if (unlikely(index >= dev->real_num_rx_queues)) {
 4231			WARN_ONCE(dev->real_num_rx_queues > 1,
 4232				  "%s received packet on queue %u, but number "
 4233				  "of RX queues is %u\n",
 4234				  dev->name, index, dev->real_num_rx_queues);
 4235
 4236			return rxqueue; /* Return first rxqueue */
 4237		}
 4238		rxqueue += index;
 4239	}
 4240	return rxqueue;
 4241}
 4242
 4243static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 4244				     struct xdp_buff *xdp,
 4245				     struct bpf_prog *xdp_prog)
 4246{
 4247	struct netdev_rx_queue *rxqueue;
 4248	void *orig_data, *orig_data_end;
 4249	u32 metalen, act = XDP_DROP;
 4250	__be16 orig_eth_type;
 4251	struct ethhdr *eth;
 4252	bool orig_bcast;
 4253	int hlen, off;
 4254	u32 mac_len;
 4255
 4256	/* Reinjected packets coming from act_mirred or similar should
 4257	 * not get XDP generic processing.
 4258	 */
 4259	if (skb_cloned(skb) || skb_is_tc_redirected(skb))
 4260		return XDP_PASS;
 4261
 4262	/* XDP packets must be linear and must have sufficient headroom
 4263	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 4264	 * native XDP provides, thus we need to do it here as well.
 4265	 */
 4266	if (skb_is_nonlinear(skb) ||
 4267	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4268		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4269		int troom = skb->tail + skb->data_len - skb->end;
 4270
 4271		/* In case we have to go down the path and also linearize,
 4272		 * then lets do the pskb_expand_head() work just once here.
 4273		 */
 4274		if (pskb_expand_head(skb,
 4275				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4276				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 4277			goto do_drop;
 4278		if (skb_linearize(skb))
 4279			goto do_drop;
 4280	}
 4281
 4282	/* The XDP program wants to see the packet starting at the MAC
 4283	 * header.
 4284	 */
 4285	mac_len = skb->data - skb_mac_header(skb);
 4286	hlen = skb_headlen(skb) + mac_len;
 4287	xdp->data = skb->data - mac_len;
 4288	xdp->data_meta = xdp->data;
 4289	xdp->data_end = xdp->data + hlen;
 4290	xdp->data_hard_start = skb->data - skb_headroom(skb);
 4291	orig_data_end = xdp->data_end;
 4292	orig_data = xdp->data;
 4293	eth = (struct ethhdr *)xdp->data;
 4294	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4295	orig_eth_type = eth->h_proto;
 4296
 4297	rxqueue = netif_get_rxqueue(skb);
 4298	xdp->rxq = &rxqueue->xdp_rxq;
 4299
 4300	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4301
 4302	/* check if bpf_xdp_adjust_head was used */
 4303	off = xdp->data - orig_data;
 4304	if (off) {
 4305		if (off > 0)
 4306			__skb_pull(skb, off);
 4307		else if (off < 0)
 4308			__skb_push(skb, -off);
 4309
 4310		skb->mac_header += off;
 4311		skb_reset_network_header(skb);
 4312	}
 4313
 4314	/* Check if bpf_xdp_adjust_tail was used; it can only "shrink"
 4315	 * the packet.
 4316	 */
 4317	off = orig_data_end - xdp->data_end;
 4318	if (off != 0) {
 4319		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4320		skb->len -= off;
 4322	}
 4323
 4324	/* Check if XDP changed the eth header such that the SKB needs an update */
 4325	eth = (struct ethhdr *)xdp->data;
 4326	if ((orig_eth_type != eth->h_proto) ||
 4327	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 4328		__skb_push(skb, ETH_HLEN);
 4329		skb->protocol = eth_type_trans(skb, skb->dev);
 4330	}
 4331
 4332	switch (act) {
 4333	case XDP_REDIRECT:
 4334	case XDP_TX:
 4335		__skb_push(skb, mac_len);
 4336		break;
 4337	case XDP_PASS:
 4338		metalen = xdp->data - xdp->data_meta;
 4339		if (metalen)
 4340			skb_metadata_set(skb, metalen);
 4341		break;
 4342	default:
 4343		bpf_warn_invalid_xdp_action(act);
 4344		/* fall through */
 4345	case XDP_ABORTED:
 4346		trace_xdp_exception(skb->dev, xdp_prog, act);
 4347		/* fall through */
 4348	case XDP_DROP:
 4349	do_drop:
 4350		kfree_skb(skb);
 4351		break;
 4352	}
 4353
 4354	return act;
 4355}
 4356
 4357/* When doing generic XDP we have to bypass the qdisc layer and the
 4358 * network taps in order to match in-driver-XDP behavior.
 4359 */
 4360void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4361{
 4362	struct net_device *dev = skb->dev;
 4363	struct netdev_queue *txq;
 4364	bool free_skb = true;
 4365	int cpu, rc;
 4366
 4367	txq = netdev_core_pick_tx(dev, skb, NULL);
 4368	cpu = smp_processor_id();
 4369	HARD_TX_LOCK(dev, txq, cpu);
 4370	if (!netif_xmit_stopped(txq)) {
 4371		rc = netdev_start_xmit(skb, dev, txq, 0);
 4372		if (dev_xmit_complete(rc))
 4373			free_skb = false;
 4374	}
 4375	HARD_TX_UNLOCK(dev, txq);
 4376	if (free_skb) {
 4377		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4378		kfree_skb(skb);
 4379	}
 4380}
 4381EXPORT_SYMBOL_GPL(generic_xdp_tx);
 4382
 4383static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4384
 4385int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4386{
 4387	if (xdp_prog) {
 4388		struct xdp_buff xdp;
 4389		u32 act;
 4390		int err;
 4391
 4392		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4393		if (act != XDP_PASS) {
 4394			switch (act) {
 4395			case XDP_REDIRECT:
 4396				err = xdp_do_generic_redirect(skb->dev, skb,
 4397							      &xdp, xdp_prog);
 4398				if (err)
 4399					goto out_redir;
 4400				break;
 4401			case XDP_TX:
 4402				generic_xdp_tx(skb, xdp_prog);
 4403				break;
 4404			}
 4405			return XDP_DROP;
 4406		}
 4407	}
 4408	return XDP_PASS;
 4409out_redir:
 4410	kfree_skb(skb);
 4411	return XDP_DROP;
 4412}
 4413EXPORT_SYMBOL_GPL(do_xdp_generic);
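/* Illustrative sketch, assuming the caller holds rcu_read_lock(): when
 * a program is attached, anything but XDP_PASS means the skb has been
 * consumed (dropped, transmitted or redirected) and must not be touched
 * again. The core already performs this step in __netif_receive_skb_core()
 * for generic XDP; the sketch only shows the calling contract.
 */
static void example_rx_one(struct sk_buff *skb)
{
	struct bpf_prog *prog = rcu_dereference(skb->dev->xdp_prog);

	if (do_xdp_generic(prog, skb) != XDP_PASS)
		return;

	netif_receive_skb(skb);
}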
 4414
 4415static int netif_rx_internal(struct sk_buff *skb)
 4416{
 4417	int ret;
 4418
 4419	net_timestamp_check(netdev_tstamp_prequeue, skb);
 4420
 4421	trace_netif_rx(skb);
 4422
 4423#ifdef CONFIG_RPS
 4424	if (static_branch_unlikely(&rps_needed)) {
 4425		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4426		int cpu;
 4427
 4428		preempt_disable();
 4429		rcu_read_lock();
 4430
 4431		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4432		if (cpu < 0)
 4433			cpu = smp_processor_id();
 4434
 4435		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4436
 4437		rcu_read_unlock();
 4438		preempt_enable();
 4439	} else
 4440#endif
 4441	{
 4442		unsigned int qtail;
 4443
 4444		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 4445		put_cpu();
 4446	}
 4447	return ret;
 4448}
 4449
 4450/**
 4451 *	netif_rx	-	post buffer to the network code
 4452 *	@skb: buffer to post
 4453 *
 4454 *	This function receives a packet from a device driver and queues it for
 4455 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 4456 *	may be dropped during processing for congestion control or by the
 4457 *	protocol layers.
 4458 *
 4459 *	return values:
 4460 *	NET_RX_SUCCESS	(no congestion)
 4461 *	NET_RX_DROP     (packet was dropped)
 4462 *
 4463 */
 4464
 4465int netif_rx(struct sk_buff *skb)
 4466{
 4467	int ret;
 4468
 4469	trace_netif_rx_entry(skb);
 4470
 4471	ret = netif_rx_internal(skb);
 4472	trace_netif_rx_exit(ret);
 4473
 4474	return ret;
 4475}
 4476EXPORT_SYMBOL(netif_rx);
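/* Illustrative sketch: the classic use of netif_rx() from a non-NAPI
 * driver's interrupt handler. example_pull_frame_from_hw() is a
 * hypothetical helper; modern drivers should prefer NAPI and
 * napi_gro_receive() instead.
 */
struct sk_buff *example_pull_frame_from_hw(struct net_device *dev);

static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct sk_buff *skb = example_pull_frame_from_hw(dev);

	if (!skb)
		return IRQ_NONE;

	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);			/* queues to the per-CPU backlog */

	return IRQ_HANDLED;
}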
 4477
 4478int netif_rx_ni(struct sk_buff *skb)
 4479{
 4480	int err;
 4481
 4482	trace_netif_rx_ni_entry(skb);
 4483
 4484	preempt_disable();
 4485	err = netif_rx_internal(skb);
 4486	if (local_softirq_pending())
 4487		do_softirq();
 4488	preempt_enable();
 4489	trace_netif_rx_ni_exit(err);
 4490
 4491	return err;
 4492}
 4493EXPORT_SYMBOL(netif_rx_ni);
 4494
 4495static __latent_entropy void net_tx_action(struct softirq_action *h)
 4496{
 4497	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 4498
 4499	if (sd->completion_queue) {
 4500		struct sk_buff *clist;
 4501
 4502		local_irq_disable();
 4503		clist = sd->completion_queue;
 4504		sd->completion_queue = NULL;
 4505		local_irq_enable();
 4506
 4507		while (clist) {
 4508			struct sk_buff *skb = clist;
 4509
 4510			clist = clist->next;
 4511
 4512			WARN_ON(refcount_read(&skb->users));
 4513			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
 4514				trace_consume_skb(skb);
 4515			else
 4516				trace_kfree_skb(skb, net_tx_action);
 4517
 4518			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 4519				__kfree_skb(skb);
 4520			else
 4521				__kfree_skb_defer(skb);
 4522		}
 4523
 4524		__kfree_skb_flush();
 4525	}
 4526
 4527	if (sd->output_queue) {
 4528		struct Qdisc *head;
 4529
 4530		local_irq_disable();
 4531		head = sd->output_queue;
 4532		sd->output_queue = NULL;
 4533		sd->output_queue_tailp = &sd->output_queue;
 4534		local_irq_enable();
 4535
 4536		while (head) {
 4537			struct Qdisc *q = head;
 4538			spinlock_t *root_lock = NULL;
 4539
 4540			head = head->next_sched;
 4541
 4542			if (!(q->flags & TCQ_F_NOLOCK)) {
 4543				root_lock = qdisc_lock(q);
 4544				spin_lock(root_lock);
 4545			}
 4546			/* We need to make sure head->next_sched is read
 4547			 * before clearing __QDISC_STATE_SCHED
 4548			 */
 4549			smp_mb__before_atomic();
 4550			clear_bit(__QDISC_STATE_SCHED, &q->state);
 4551			qdisc_run(q);
 4552			if (root_lock)
 4553				spin_unlock(root_lock);
 4554		}
 4555	}
 4556
 4557	xfrm_dev_backlog(sd);
 4558}
 4559
 4560#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
 4561/* This hook is defined here for ATM LANE */
 4562int (*br_fdb_test_addr_hook)(struct net_device *dev,
 4563			     unsigned char *addr) __read_mostly;
 4564EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 4565#endif
 4566
 4567static inline struct sk_buff *
 4568sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4569		   struct net_device *orig_dev)
 4570{
 4571#ifdef CONFIG_NET_CLS_ACT
 4572	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
 4573	struct tcf_result cl_res;
 4574
 4575	/* If there's at least one ingress present somewhere (so
 4576	 * we get here via enabled static key), remaining devices
 4577	 * that are not configured with an ingress qdisc will bail
 4578	 * out here.
 4579	 */
 4580	if (!miniq)
 4581		return skb;
 4582
 4583	if (*pt_prev) {
 4584		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4585		*pt_prev = NULL;
 4586	}
 4587
 4588	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4589	skb->tc_at_ingress = 1;
 4590	mini_qdisc_bstats_cpu_update(miniq, skb);
 4591
 4592	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 4593	case TC_ACT_OK:
 4594	case TC_ACT_RECLASSIFY:
 4595		skb->tc_index = TC_H_MIN(cl_res.classid);
 4596		break;
 4597	case TC_ACT_SHOT:
 4598		mini_qdisc_qstats_cpu_drop(miniq);
 4599		kfree_skb(skb);
 4600		return NULL;
 4601	case TC_ACT_STOLEN:
 4602	case TC_ACT_QUEUED:
 4603	case TC_ACT_TRAP:
 4604		consume_skb(skb);
 4605		return NULL;
 4606	case TC_ACT_REDIRECT:
 4607		/* skb_mac_header check was done by cls/act_bpf, so
 4608		 * we can safely push the L2 header back before
 4609		 * redirecting to another netdev
 4610		 */
 4611		__skb_push(skb, skb->mac_len);
 4612		skb_do_redirect(skb);
 4613		return NULL;
 4614	case TC_ACT_CONSUMED:
 4615		return NULL;
 4616	default:
 4617		break;
 4618	}
 4619#endif /* CONFIG_NET_CLS_ACT */
 4620	return skb;
 4621}
 4622
 4623/**
 4624 *	netdev_is_rx_handler_busy - check if receive handler is registered
 4625 *	@dev: device to check
 4626 *
 4627 *	Check if a receive handler is already registered for a given device.
 4628 *	Return true if there is one.
 4629 *
 4630 *	The caller must hold the rtnl_mutex.
 4631 */
 4632bool netdev_is_rx_handler_busy(struct net_device *dev)
 4633{
 4634	ASSERT_RTNL();
 4635	return dev && rtnl_dereference(dev->rx_handler);
 4636}
 4637EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 4638
 4639/**
 4640 *	netdev_rx_handler_register - register receive handler
 4641 *	@dev: device to register a handler for
 4642 *	@rx_handler: receive handler to register
 4643 *	@rx_handler_data: data pointer that is used by rx handler
 4644 *
 4645 *	Register a receive handler for a device. This handler will then be
 4646 *	called from __netif_receive_skb. A negative errno code is returned
 4647 *	on a failure.
 4648 *
 4649 *	The caller must hold the rtnl_mutex.
 4650 *
 4651 *	For a general description of rx_handler, see enum rx_handler_result.
 4652 */
 4653int netdev_rx_handler_register(struct net_device *dev,
 4654			       rx_handler_func_t *rx_handler,
 4655			       void *rx_handler_data)
 4656{
 4657	if (netdev_is_rx_handler_busy(dev))
 4658		return -EBUSY;
 4659
 4660	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 4661		return -EINVAL;
 4662
 4663	/* Note: rx_handler_data must be set before rx_handler */
 4664	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 4665	rcu_assign_pointer(dev->rx_handler, rx_handler);
 4666
 4667	return 0;
 4668}
 4669EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
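/* Illustrative sketch of how an upper device (bridge, bonding,
 * macvlan, ...) hooks a lower port's receive path. All example_*
 * names are hypothetical.
 */
static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	void *priv = rcu_dereference(skb->dev->rx_handler_data);

	(void)priv;		/* inspect, mangle or steal the skb here */
	return RX_HANDLER_PASS;	/* let normal delivery continue */
}

static int example_attach(struct net_device *port, void *priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port, example_rx_handler, priv);
	rtnl_unlock();
	return err;
}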
 4670
 4671/**
 4672 *	netdev_rx_handler_unregister - unregister receive handler
 4673 *	@dev: device to unregister a handler from
 4674 *
 4675 *	Unregister a receive handler from a device.
 4676 *
 4677 *	The caller must hold the rtnl_mutex.
 4678 */
 4679void netdev_rx_handler_unregister(struct net_device *dev)
 4680{
 4682	ASSERT_RTNL();
 4683	RCU_INIT_POINTER(dev->rx_handler, NULL);
 4684	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
 4685	 * section is guaranteed to see a non-NULL rx_handler_data
 4686	 * as well.
 4687	 */
 4688	synchronize_net();
 4689	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 4690}
 4691EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 4692
 4693/*
 4694 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 4695 * the special handling of PFMEMALLOC skbs.
 4696 */
 4697static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 4698{
 4699	switch (skb->protocol) {
 4700	case htons(ETH_P_ARP):
 4701	case htons(ETH_P_IP):
 4702	case htons(ETH_P_IPV6):
 4703	case htons(ETH_P_8021Q):
 4704	case htons(ETH_P_8021AD):
 4705		return true;
 4706	default:
 4707		return false;
 4708	}
 4709}
 4710
 4711static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 4712			     int *ret, struct net_device *orig_dev)
 4713{
 4714#ifdef CONFIG_NETFILTER_INGRESS
 4715	if (nf_hook_ingress_active(skb)) {
 4716		int ingress_retval;
 4717
 4718		if (*pt_prev) {
 4719			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4720			*pt_prev = NULL;
 4721		}
 4722
 4723		rcu_read_lock();
 4724		ingress_retval = nf_hook_ingress(skb);
 4725		rcu_read_unlock();
 4726		return ingress_retval;
 4727	}
 4728#endif /* CONFIG_NETFILTER_INGRESS */
 4729	return 0;
 4730}
 4731
 4732static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
 4733				    struct packet_type **ppt_prev)
 4734{
 4735	struct packet_type *ptype, *pt_prev;
 4736	rx_handler_func_t *rx_handler;
 4737	struct net_device *orig_dev;
 4738	bool deliver_exact = false;
 4739	int ret = NET_RX_DROP;
 4740	__be16 type;
 4741
 4742	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 4743
 4744	trace_netif_receive_skb(skb);
 4745
 4746	orig_dev = skb->dev;
 4747
 4748	skb_reset_network_header(skb);
 4749	if (!skb_transport_header_was_set(skb))
 4750		skb_reset_transport_header(skb);
 4751	skb_reset_mac_len(skb);
 4752
 4753	pt_prev = NULL;
 4754
 4755another_round:
 4756	skb->skb_iif = skb->dev->ifindex;
 4757
 4758	__this_cpu_inc(softnet_data.processed);
 4759
 4760	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 4761		int ret2;
 4762
 4763		preempt_disable();
 4764		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 4765		preempt_enable();
 4766
 4767		if (ret2 != XDP_PASS)
 4768			return NET_RX_DROP;
 4769		skb_reset_mac_len(skb);
 4770	}
 4771
 4772	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 4773	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 4774		skb = skb_vlan_untag(skb);
 4775		if (unlikely(!skb))
 4776			goto out;
 4777	}
 4778
 4779	if (skb_skip_tc_classify(skb))
 4780		goto skip_classify;
 4781
 4782	if (pfmemalloc)
 4783		goto skip_taps;
 4784
 4785	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 4786		if (pt_prev)
 4787			ret = deliver_skb(skb, pt_prev, orig_dev);
 4788		pt_prev = ptype;
 4789	}
 4790
 4791	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 4792		if (pt_prev)
 4793			ret = deliver_skb(skb, pt_prev, orig_dev);
 4794		pt_prev = ptype;
 4795	}
 4796
 4797skip_taps:
 4798#ifdef CONFIG_NET_INGRESS
 4799	if (static_branch_unlikely(&ingress_needed_key)) {
 4800		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 4801		if (!skb)
 4802			goto out;
 4803
 4804		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 4805			goto out;
 4806	}
 4807#endif
 4808	skb_reset_tc(skb);
 4809skip_classify:
 4810	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 4811		goto drop;
 4812
 4813	if (skb_vlan_tag_present(skb)) {
 4814		if (pt_prev) {
 4815			ret = deliver_skb(skb, pt_prev, orig_dev);
 4816			pt_prev = NULL;
 4817		}
 4818		if (vlan_do_receive(&skb))
 4819			goto another_round;
 4820		else if (unlikely(!skb))
 4821			goto out;
 4822	}
 4823
 4824	rx_handler = rcu_dereference(skb->dev->rx_handler);
 4825	if (rx_handler) {
 4826		if (pt_prev) {
 4827			ret = deliver_skb(skb, pt_prev, orig_dev);
 4828			pt_prev = NULL;
 4829		}
 4830		switch (rx_handler(&skb)) {
 4831		case RX_HANDLER_CONSUMED:
 4832			ret = NET_RX_SUCCESS;
 4833			goto out;
 4834		case RX_HANDLER_ANOTHER:
 4835			goto another_round;
 4836		case RX_HANDLER_EXACT:
 4837			deliver_exact = true;
 4838		case RX_HANDLER_PASS:
 4839			break;
 4840		default:
 4841			BUG();
 4842		}
 4843	}
 4844
 4845	if (unlikely(skb_vlan_tag_present(skb))) {
 4846check_vlan_id:
 4847		if (skb_vlan_tag_get_id(skb)) {
 4848			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 4849			 * find vlan device.
 4850			 */
 4851			skb->pkt_type = PACKET_OTHERHOST;
 4852		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 4853			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 4854			/* Outer header is 802.1P with vlan 0, inner header is
 4855			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 4856			 * not find vlan dev for vlan id 0.
 4857			 */
 4858			__vlan_hwaccel_clear_tag(skb);
 4859			skb = skb_vlan_untag(skb);
 4860			if (unlikely(!skb))
 4861				goto out;
 4862			if (vlan_do_receive(&skb))
 4863				/* After stripping off 802.1P header with vlan 0
 4864				 * vlan dev is found for inner header.
 4865				 */
 4866				goto another_round;
 4867			else if (unlikely(!skb))
 4868				goto out;
 4869			else
 4870				/* We have stripped outer 802.1P vlan 0 header.
 4871				 * But could not find vlan dev.
 4872				 * check again for vlan id to set OTHERHOST.
 4873				 */
 4874				goto check_vlan_id;
 4875		}
 4876		/* Note: we might in the future use prio bits
 4877		 * and set skb->priority like in vlan_do_receive()
 4878		 * For the time being, just ignore Priority Code Point
 4879		 */
 4880		__vlan_hwaccel_clear_tag(skb);
 4881	}
 4882
 4883	type = skb->protocol;
 4884
 4885	/* deliver only exact match when indicated */
 4886	if (likely(!deliver_exact)) {
 4887		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 4888				       &ptype_base[ntohs(type) &
 4889						   PTYPE_HASH_MASK]);
 4890	}
 4891
 4892	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 4893			       &orig_dev->ptype_specific);
 4894
 4895	if (unlikely(skb->dev != orig_dev)) {
 4896		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 4897				       &skb->dev->ptype_specific);
 4898	}
 4899
 4900	if (pt_prev) {
 4901		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 4902			goto drop;
 4903		*ppt_prev = pt_prev;
 4904	} else {
 4905drop:
 4906		if (!deliver_exact)
 4907			atomic_long_inc(&skb->dev->rx_dropped);
 4908		else
 4909			atomic_long_inc(&skb->dev->rx_nohandler);
 4910		kfree_skb(skb);
 4911		/* Jamal, now you will not be able to escape explaining
 4912		 * to me how you were going to use this. :-)
 4913		 */
 4914		ret = NET_RX_DROP;
 4915	}
 4916
 4917out:
 4918	return ret;
 4919}
 4920
 4921static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 4922{
 4923	struct net_device *orig_dev = skb->dev;
 4924	struct packet_type *pt_prev = NULL;
 4925	int ret;
 4926
 4927	ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
 4928	if (pt_prev)
 4929		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 4930					 skb->dev, pt_prev, orig_dev);
 4931	return ret;
 4932}
 4933
 4934/**
 4935 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 4936 *	@skb: buffer to process
 4937 *
 4938 *	More direct receive version of netif_receive_skb().  It should
 4939 *	only be used by callers that have a need to skip RPS and Generic XDP.
 4940 *	Caller must also take care of handling if (page_is_)pfmemalloc.
 4941 *
 4942 *	This function may only be called from softirq context and interrupts
 4943 *	should be enabled.
 4944 *
 4945 *	Return values (usually ignored):
 4946 *	NET_RX_SUCCESS: no congestion
 4947 *	NET_RX_DROP: packet was dropped
 4948 */
 4949int netif_receive_skb_core(struct sk_buff *skb)
 4950{
 4951	int ret;
 4952
 4953	rcu_read_lock();
 4954	ret = __netif_receive_skb_one_core(skb, false);
 4955	rcu_read_unlock();
 4956
 4957	return ret;
 4958}
 4959EXPORT_SYMBOL(netif_receive_skb_core);
 4960
 4961static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 4962						  struct packet_type *pt_prev,
 4963						  struct net_device *orig_dev)
 4964{
 4965	struct sk_buff *skb, *next;
 4966
 4967	if (!pt_prev)
 4968		return;
 4969	if (list_empty(head))
 4970		return;
 4971	if (pt_prev->list_func != NULL)
 4972		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 4973				   ip_list_rcv, head, pt_prev, orig_dev);
 4974	else
 4975		list_for_each_entry_safe(skb, next, head, list) {
 4976			skb_list_del_init(skb);
 4977			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 4978		}
 4979}
 4980
 4981static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 4982{
 4983	/* Fast-path assumptions:
 4984	 * - There is no RX handler.
 4985	 * - Only one packet_type matches.
 4986	 * If either of these fails, we will end up doing some per-packet
 4987	 * processing in-line, then handling the 'last ptype' for the whole
 4988	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 4989	 * because the 'last ptype' must be constant across the sublist, and all
 4990	 * other ptypes are handled per-packet.
 4991	 */
 4992	/* Current (common) ptype of sublist */
 4993	struct packet_type *pt_curr = NULL;
 4994	/* Current (common) orig_dev of sublist */
 4995	struct net_device *od_curr = NULL;
 4996	struct list_head sublist;
 4997	struct sk_buff *skb, *next;
 4998
 4999	INIT_LIST_HEAD(&sublist);
 5000	list_for_each_entry_safe(skb, next, head, list) {
 5001		struct net_device *orig_dev = skb->dev;
 5002		struct packet_type *pt_prev = NULL;
 5003
 5004		skb_list_del_init(skb);
 5005		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
 5006		if (!pt_prev)
 5007			continue;
 5008		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5009			/* dispatch old sublist */
 5010			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5011			/* start new sublist */
 5012			INIT_LIST_HEAD(&sublist);
 5013			pt_curr = pt_prev;
 5014			od_curr = orig_dev;
 5015		}
 5016		list_add_tail(&skb->list, &sublist);
 5017	}
 5018
 5019	/* dispatch final sublist */
 5020	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5021}
 5022
 5023static int __netif_receive_skb(struct sk_buff *skb)
 5024{
 5025	int ret;
 5026
 5027	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5028		unsigned int noreclaim_flag;
 5029
 5030		/*
 5031		 * PFMEMALLOC skbs are special, they should
 5032		 * - be delivered to SOCK_MEMALLOC sockets only
 5033		 * - stay away from userspace
 5034		 * - have bounded memory usage
 5035		 *
 5036		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5037		 * context down to all allocation sites.
 5038		 */
 5039		noreclaim_flag = memalloc_noreclaim_save();
 5040		ret = __netif_receive_skb_one_core(skb, true);
 5041		memalloc_noreclaim_restore(noreclaim_flag);
 5042	} else
 5043		ret = __netif_receive_skb_one_core(skb, false);
 5044
 5045	return ret;
 5046}
 5047
 5048static void __netif_receive_skb_list(struct list_head *head)
 5049{
 5050	unsigned long noreclaim_flag = 0;
 5051	struct sk_buff *skb, *next;
 5052	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5053
 5054	list_for_each_entry_safe(skb, next, head, list) {
 5055		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5056			struct list_head sublist;
 5057
 5058			/* Handle the previous sublist */
 5059			list_cut_before(&sublist, head, &skb->list);
 5060			if (!list_empty(&sublist))
 5061				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5062			pfmemalloc = !pfmemalloc;
 5063			/* See comments in __netif_receive_skb */
 5064			if (pfmemalloc)
 5065				noreclaim_flag = memalloc_noreclaim_save();
 5066			else
 5067				memalloc_noreclaim_restore(noreclaim_flag);
 5068		}
 5069	}
 5070	/* Handle the remaining sublist */
 5071	if (!list_empty(head))
 5072		__netif_receive_skb_list_core(head, pfmemalloc);
 5073	/* Restore pflags */
 5074	if (pfmemalloc)
 5075		memalloc_noreclaim_restore(noreclaim_flag);
 5076}
 5077
 5078static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5079{
 5080	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5081	struct bpf_prog *new = xdp->prog;
 5082	int ret = 0;
 5083
 5084	switch (xdp->command) {
 5085	case XDP_SETUP_PROG:
 5086		rcu_assign_pointer(dev->xdp_prog, new);
 5087		if (old)
 5088			bpf_prog_put(old);
 5089
 5090		if (old && !new) {
 5091			static_branch_dec(&generic_xdp_needed_key);
 5092		} else if (new && !old) {
 5093			static_branch_inc(&generic_xdp_needed_key);
 5094			dev_disable_lro(dev);
 5095			dev_disable_gro_hw(dev);
 5096		}
 5097		break;
 5098
 5099	case XDP_QUERY_PROG:
 5100		xdp->prog_id = old ? old->aux->id : 0;
 5101		break;
 5102
 5103	default:
 5104		ret = -EINVAL;
 5105		break;
 5106	}
 5107
 5108	return ret;
 5109}
 5110
 5111static int netif_receive_skb_internal(struct sk_buff *skb)
 5112{
 5113	int ret;
 5114
 5115	net_timestamp_check(netdev_tstamp_prequeue, skb);
 5116
 5117	if (skb_defer_rx_timestamp(skb))
 5118		return NET_RX_SUCCESS;
 5119
 5120	rcu_read_lock();
 5121#ifdef CONFIG_RPS
 5122	if (static_branch_unlikely(&rps_needed)) {
 5123		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5124		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5125
 5126		if (cpu >= 0) {
 5127			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5128			rcu_read_unlock();
 5129			return ret;
 5130		}
 5131	}
 5132#endif
 5133	ret = __netif_receive_skb(skb);
 5134	rcu_read_unlock();
 5135	return ret;
 5136}
 5137
 5138static void netif_receive_skb_list_internal(struct list_head *head)
 5139{
 5140	struct sk_buff *skb, *next;
 5141	struct list_head sublist;
 5142
 5143	INIT_LIST_HEAD(&sublist);
 5144	list_for_each_entry_safe(skb, next, head, list) {
 5145		net_timestamp_check(netdev_tstamp_prequeue, skb);
 5146		skb_list_del_init(skb);
 5147		if (!skb_defer_rx_timestamp(skb))
 5148			list_add_tail(&skb->list, &sublist);
 5149	}
 5150	list_splice_init(&sublist, head);
 5151
 5152	rcu_read_lock();
 5153#ifdef CONFIG_RPS
 5154	if (static_branch_unlikely(&rps_needed)) {
 5155		list_for_each_entry_safe(skb, next, head, list) {
 5156			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5157			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5158
 5159			if (cpu >= 0) {
 5160				/* Will be handled, remove from list */
 5161				skb_list_del_init(skb);
 5162				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5163			}
 5164		}
 5165	}
 5166#endif
 5167	__netif_receive_skb_list(head);
 5168	rcu_read_unlock();
 5169}
 5170
 5171/**
 5172 *	netif_receive_skb - process receive buffer from network
 5173 *	@skb: buffer to process
 5174 *
 5175 *	netif_receive_skb() is the main receive data processing function.
 5176 *	It always succeeds. The buffer may be dropped during processing
 5177 *	for congestion control or by the protocol layers.
 5178 *
 5179 *	This function may only be called from softirq context and interrupts
 5180 *	should be enabled.
 5181 *
 5182 *	Return values (usually ignored):
 5183 *	NET_RX_SUCCESS: no congestion
 5184 *	NET_RX_DROP: packet was dropped
 5185 */
 5186int netif_receive_skb(struct sk_buff *skb)
 5187{
 5188	int ret;
 5189
 5190	trace_netif_receive_skb_entry(skb);
 5191
 5192	ret = netif_receive_skb_internal(skb);
 5193	trace_netif_receive_skb_exit(ret);
 5194
 5195	return ret;
 5196}
 5197EXPORT_SYMBOL(netif_receive_skb);
 5198
 5199/**
 5200 *	netif_receive_skb_list - process many receive buffers from network
 5201 *	@head: list of skbs to process.
 5202 *
 5203 *	Since return value of netif_receive_skb() is normally ignored, and
 5204 *	wouldn't be meaningful for a list, this function returns void.
 5205 *
 5206 *	This function may only be called from softirq context and interrupts
 5207 *	should be enabled.
 5208 */
 5209void netif_receive_skb_list(struct list_head *head)
 5210{
 5211	struct sk_buff *skb;
 5212
 5213	if (list_empty(head))
 5214		return;
 5215	if (trace_netif_receive_skb_list_entry_enabled()) {
 5216		list_for_each_entry(skb, head, list)
 5217			trace_netif_receive_skb_list_entry(skb);
 5218	}
 5219	netif_receive_skb_list_internal(head);
 5220	trace_netif_receive_skb_list_exit(0);
 5221}
 5222EXPORT_SYMBOL(netif_receive_skb_list);
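/* Illustrative sketch: batching received skbs into a list amortizes
 * per-packet costs (RPS lookup, RCU, instruction cache) across the
 * whole bundle. example_pull_frame_from_hw() is a hypothetical helper.
 */
struct sk_buff *example_pull_frame_from_hw(struct net_device *dev);

static void example_rx_batch(struct net_device *dev, int budget)
{
	struct sk_buff *skb;
	LIST_HEAD(rx_list);

	while (budget-- > 0 &&
	       (skb = example_pull_frame_from_hw(dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, dev);
		list_add_tail(&skb->list, &rx_list);
	}

	netif_receive_skb_list(&rx_list);	/* handles an empty list */
}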
 5223
 5224DEFINE_PER_CPU(struct work_struct, flush_works);
 5225
 5226/* Network device is going away, flush any packets still pending */
 5227static void flush_backlog(struct work_struct *work)
 5228{
 5229	struct sk_buff *skb, *tmp;
 5230	struct softnet_data *sd;
 5231
 5232	local_bh_disable();
 5233	sd = this_cpu_ptr(&softnet_data);
 5234
 5235	local_irq_disable();
 5236	rps_lock(sd);
 5237	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5238		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5239			__skb_unlink(skb, &sd->input_pkt_queue);
 5240			kfree_skb(skb);
 5241			input_queue_head_incr(sd);
 5242		}
 5243	}
 5244	rps_unlock(sd);
 5245	local_irq_enable();
 5246
 5247	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5248		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5249			__skb_unlink(skb, &sd->process_queue);
 5250			kfree_skb(skb);
 5251			input_queue_head_incr(sd);
 5252		}
 5253	}
 5254	local_bh_enable();
 5255}
 5256
 5257static void flush_all_backlogs(void)
 5258{
 5259	unsigned int cpu;
 5260
 5261	get_online_cpus();
 5262
 5263	for_each_online_cpu(cpu)
 5264		queue_work_on(cpu, system_highpri_wq,
 5265			      per_cpu_ptr(&flush_works, cpu));
 5266
 5267	for_each_online_cpu(cpu)
 5268		flush_work(per_cpu_ptr(&flush_works, cpu));
 5269
 5270	put_online_cpus();
 5271}
 5272
 5273INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 5274INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
 5275static int napi_gro_complete(struct sk_buff *skb)
 5276{
 5277	struct packet_offload *ptype;
 5278	__be16 type = skb->protocol;
 5279	struct list_head *head = &offload_base;
 5280	int err = -ENOENT;
 5281
 5282	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 5283
 5284	if (NAPI_GRO_CB(skb)->count == 1) {
 5285		skb_shinfo(skb)->gso_size = 0;
 5286		goto out;
 5287	}
 5288
 5289	rcu_read_lock();
 5290	list_for_each_entry_rcu(ptype, head, list) {
 5291		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5292			continue;
 5293
 5294		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
 5295					 ipv6_gro_complete, inet_gro_complete,
 5296					 skb, 0);
 5297		break;
 5298	}
 5299	rcu_read_unlock();
 5300
 5301	if (err) {
 5302		WARN_ON(&ptype->list == head);
 5303		kfree_skb(skb);
 5304		return NET_RX_SUCCESS;
 5305	}
 5306
 5307out:
 5308	return netif_receive_skb_internal(skb);
 5309}
 5310
 5311static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 5312				   bool flush_old)
 5313{
 5314	struct list_head *head = &napi->gro_hash[index].list;
 5315	struct sk_buff *skb, *p;
 5316
 5317	list_for_each_entry_safe_reverse(skb, p, head, list) {
 5318		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 5319			return;
 5320		skb_list_del_init(skb);
 5321		napi_gro_complete(skb);
 5322		napi->gro_hash[index].count--;
 5323	}
 5324
 5325	if (!napi->gro_hash[index].count)
 5326		__clear_bit(index, &napi->gro_bitmask);
 5327}
 5328
 5329/* napi->gro_hash[].list contains packets ordered by age, with the
 5330 * youngest packets at the head of the list.
 5331 * Complete skbs in reverse order to reduce latencies.
 5332 */
 5333void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 5334{
 5335	unsigned long bitmask = napi->gro_bitmask;
 5336	unsigned int i, base = ~0U;
 5337
 5338	while ((i = ffs(bitmask)) != 0) {
 5339		bitmask >>= i;
 5340		base += i;
 5341		__napi_gro_flush_chain(napi, base, flush_old);
 5342	}
 5343}
 5344EXPORT_SYMBOL(napi_gro_flush);
 5345
 5346static struct list_head *gro_list_prepare(struct napi_struct *napi,
 5347					  struct sk_buff *skb)
 5348{
 5349	unsigned int maclen = skb->dev->hard_header_len;
 5350	u32 hash = skb_get_hash_raw(skb);
 5351	struct list_head *head;
 5352	struct sk_buff *p;
 5353
 5354	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 5355	list_for_each_entry(p, head, list) {
 5356		unsigned long diffs;
 5357
 5358		NAPI_GRO_CB(p)->flush = 0;
 5359
 5360		if (hash != skb_get_hash_raw(p)) {
 5361			NAPI_GRO_CB(p)->same_flow = 0;
 5362			continue;
 5363		}
 5364
 5365		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 5366		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
 5367		if (skb_vlan_tag_present(p))
 5368			diffs |= p->vlan_tci ^ skb->vlan_tci;
 5369		diffs |= skb_metadata_dst_cmp(p, skb);
 5370		diffs |= skb_metadata_differs(p, skb);
 5371		if (maclen == ETH_HLEN)
 5372			diffs |= compare_ether_header(skb_mac_header(p),
 5373						      skb_mac_header(skb));
 5374		else if (!diffs)
 5375			diffs = memcmp(skb_mac_header(p),
 5376				       skb_mac_header(skb),
 5377				       maclen);
 5378		NAPI_GRO_CB(p)->same_flow = !diffs;
 5379	}
 5380
 5381	return head;
 5382}
 5383
 5384static void skb_gro_reset_offset(struct sk_buff *skb)
 5385{
 5386	const struct skb_shared_info *pinfo = skb_shinfo(skb);
 5387	const skb_frag_t *frag0 = &pinfo->frags[0];
 5388
 5389	NAPI_GRO_CB(skb)->data_offset = 0;
 5390	NAPI_GRO_CB(skb)->frag0 = NULL;
 5391	NAPI_GRO_CB(skb)->frag0_len = 0;
 5392
 5393	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
 5394	    pinfo->nr_frags &&
 5395	    !PageHighMem(skb_frag_page(frag0))) {
 5396		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 5397		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
 5398						    skb_frag_size(frag0),
 5399						    skb->end - skb->tail);
 5400	}
 5401}
 5402
 5403static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 5404{
 5405	struct skb_shared_info *pinfo = skb_shinfo(skb);
 5406
 5407	BUG_ON(skb->end - skb->tail < grow);
 5408
 5409	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
 5410
 5411	skb->data_len -= grow;
 5412	skb->tail += grow;
 5413
 5414	skb_frag_off_add(&pinfo->frags[0], grow);
 5415	skb_frag_size_sub(&pinfo->frags[0], grow);
 5416
 5417	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
 5418		skb_frag_unref(skb, 0);
 5419		memmove(pinfo->frags, pinfo->frags + 1,
 5420			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
 5421	}
 5422}
 5423
 5424static void gro_flush_oldest(struct list_head *head)
 5425{
 5426	struct sk_buff *oldest;
 5427
 5428	oldest = list_last_entry(head, struct sk_buff, list);
 5429
 5430	/* We are called with head length >= MAX_GRO_SKBS, so the list
 5431	 * cannot be empty.
 5432	 */
 5433	if (WARN_ON_ONCE(!oldest))
 5434		return;
 5435
 5436	/* Do not adjust napi->gro_hash[].count, caller is adding a new
 5437	 * SKB to the chain.
 5438	 */
 5439	skb_list_del_init(oldest);
 5440	napi_gro_complete(oldest);
 5441}
 5442
 5443INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 5444							   struct sk_buff *));
 5445INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 5446							   struct sk_buff *));
 5447static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5448{
 5449	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 5450	struct list_head *head = &offload_base;
 5451	struct packet_offload *ptype;
 5452	__be16 type = skb->protocol;
 5453	struct list_head *gro_head;
 5454	struct sk_buff *pp = NULL;
 5455	enum gro_result ret;
 5456	int same_flow;
 5457	int grow;
 5458
 5459	if (netif_elide_gro(skb->dev))
 5460		goto normal;
 5461
 5462	gro_head = gro_list_prepare(napi, skb);
 5463
 5464	rcu_read_lock();
 5465	list_for_each_entry_rcu(ptype, head, list) {
 5466		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5467			continue;
 5468
 5469		skb_set_network_header(skb, skb_gro_offset(skb));
 5470		skb_reset_mac_len(skb);
 5471		NAPI_GRO_CB(skb)->same_flow = 0;
 5472		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
 5473		NAPI_GRO_CB(skb)->free = 0;
 5474		NAPI_GRO_CB(skb)->encap_mark = 0;
 5475		NAPI_GRO_CB(skb)->recursion_counter = 0;
 5476		NAPI_GRO_CB(skb)->is_fou = 0;
 5477		NAPI_GRO_CB(skb)->is_atomic = 1;
 5478		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 5479
 5480		/* Setup for GRO checksum validation */
 5481		switch (skb->ip_summed) {
 5482		case CHECKSUM_COMPLETE:
 5483			NAPI_GRO_CB(skb)->csum = skb->csum;
 5484			NAPI_GRO_CB(skb)->csum_valid = 1;
 5485			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5486			break;
 5487		case CHECKSUM_UNNECESSARY:
 5488			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
 5489			NAPI_GRO_CB(skb)->csum_valid = 0;
 5490			break;
 5491		default:
 5492			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5493			NAPI_GRO_CB(skb)->csum_valid = 0;
 5494		}
 5495
 5496		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
 5497					ipv6_gro_receive, inet_gro_receive,
 5498					gro_head, skb);
 5499		break;
 5500	}
 5501	rcu_read_unlock();
 5502
 5503	if (&ptype->list == head)
 5504		goto normal;
 5505
 5506	if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
 5507		ret = GRO_CONSUMED;
 5508		goto ok;
 5509	}
 5510
 5511	same_flow = NAPI_GRO_CB(skb)->same_flow;
 5512	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 5513
 5514	if (pp) {
 5515		skb_list_del_init(pp);
 5516		napi_gro_complete(pp);
 5517		napi->gro_hash[hash].count--;
 5518	}
 5519
 5520	if (same_flow)
 5521		goto ok;
 5522
 5523	if (NAPI_GRO_CB(skb)->flush)
 5524		goto normal;
 5525
 5526	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 5527		gro_flush_oldest(gro_head);
 5528	} else {
 5529		napi->gro_hash[hash].count++;
 5530	}
 5531	NAPI_GRO_CB(skb)->count = 1;
 5532	NAPI_GRO_CB(skb)->age = jiffies;
 5533	NAPI_GRO_CB(skb)->last = skb;
 5534	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 5535	list_add(&skb->list, gro_head);
 5536	ret = GRO_HELD;
 5537
 5538pull:
 5539	grow = skb_gro_offset(skb) - skb_headlen(skb);
 5540	if (grow > 0)
 5541		gro_pull_from_frag0(skb, grow);
 5542ok:
 5543	if (napi->gro_hash[hash].count) {
 5544		if (!test_bit(hash, &napi->gro_bitmask))
 5545			__set_bit(hash, &napi->gro_bitmask);
 5546	} else if (test_bit(hash, &napi->gro_bitmask)) {
 5547		__clear_bit(hash, &napi->gro_bitmask);
 5548	}
 5549
 5550	return ret;
 5551
 5552normal:
 5553	ret = GRO_NORMAL;
 5554	goto pull;
 5555}
 5556
 5557struct packet_offload *gro_find_receive_by_type(__be16 type)
 5558{
 5559	struct list_head *offload_head = &offload_base;
 5560	struct packet_offload *ptype;
 5561
 5562	list_for_each_entry_rcu(ptype, offload_head, list) {
 5563		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5564			continue;
 5565		return ptype;
 5566	}
 5567	return NULL;
 5568}
 5569EXPORT_SYMBOL(gro_find_receive_by_type);
 5570
 5571struct packet_offload *gro_find_complete_by_type(__be16 type)
 5572{
 5573	struct list_head *offload_head = &offload_base;
 5574	struct packet_offload *ptype;
 5575
 5576	list_for_each_entry_rcu(ptype, offload_head, list) {
 5577		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5578			continue;
 5579		return ptype;
 5580	}
 5581	return NULL;
 5582}
 5583EXPORT_SYMBOL(gro_find_complete_by_type);
 5584
 5585static void napi_skb_free_stolen_head(struct sk_buff *skb)
 5586{
 5587	skb_dst_drop(skb);
 5588	skb_ext_put(skb);
 5589	kmem_cache_free(skbuff_head_cache, skb);
 5590}
 5591
 5592static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 5593{
 5594	switch (ret) {
 5595	case GRO_NORMAL:
 5596		if (netif_receive_skb_internal(skb))
 5597			ret = GRO_DROP;
 5598		break;
 5599
 5600	case GRO_DROP:
 5601		kfree_skb(skb);
 5602		break;
 5603
 5604	case GRO_MERGED_FREE:
 5605		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 5606			napi_skb_free_stolen_head(skb);
5607		else
5608			__kfree_skb(skb);
 5609		break;
 5610
 5611	case GRO_HELD:
 5612	case GRO_MERGED:
 5613	case GRO_CONSUMED:
 5614		break;
 5615	}
 5616
 5617	return ret;
 5618}
 5619
 5620gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5621{
 5622	gro_result_t ret;
 5623
 5624	skb_mark_napi_id(skb, napi);
 5625	trace_napi_gro_receive_entry(skb);
 5626
 5627	skb_gro_reset_offset(skb);
 5628
 5629	ret = napi_skb_finish(dev_gro_receive(napi, skb), skb);
 5630	trace_napi_gro_receive_exit(ret);
 5631
 5632	return ret;
 5633}
 5634EXPORT_SYMBOL(napi_gro_receive);
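/* Typical call site (illustrative sketch only; my_poll, my_priv,
 * my_fetch_rx() and my_enable_rx_irq() are hypothetical driver names):
 * a NAPI poll routine feeds each received frame into GRO instead of
 * calling netif_receive_skb() directly:
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_priv *priv = container_of(napi, struct my_priv, napi);
 *		int work = 0;
 *
 *		while (work < budget) {
 *			struct sk_buff *skb = my_fetch_rx(priv);
 *
 *			if (!skb)
 *				break;
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget && napi_complete_done(napi, work))
 *			my_enable_rx_irq(priv);
 *		return work;
 *	}
 */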
 5635
 5636static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 5637{
 5638	if (unlikely(skb->pfmemalloc)) {
 5639		consume_skb(skb);
 5640		return;
 5641	}
 5642	__skb_pull(skb, skb_headlen(skb));
 5643	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
 5644	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
 5645	__vlan_hwaccel_clear_tag(skb);
 5646	skb->dev = napi->dev;
 5647	skb->skb_iif = 0;
 5648
 5649	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
 5650	skb->pkt_type = PACKET_HOST;
 5651
 5652	skb->encapsulation = 0;
 5653	skb_shinfo(skb)->gso_type = 0;
 5654	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 5655	skb_ext_reset(skb);
 5656
 5657	napi->skb = skb;
 5658}
 5659
 5660struct sk_buff *napi_get_frags(struct napi_struct *napi)
 5661{
 5662	struct sk_buff *skb = napi->skb;
 5663
 5664	if (!skb) {
 5665		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 5666		if (skb) {
 5667			napi->skb = skb;
 5668			skb_mark_napi_id(skb, napi);
 5669		}
 5670	}
 5671	return skb;
 5672}
 5673EXPORT_SYMBOL(napi_get_frags);
 5674
 5675/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 5676static void gro_normal_list(struct napi_struct *napi)
 5677{
 5678	if (!napi->rx_count)
 5679		return;
 5680	netif_receive_skb_list_internal(&napi->rx_list);
 5681	INIT_LIST_HEAD(&napi->rx_list);
 5682	napi->rx_count = 0;
 5683}
 5684
5685/* Queue one GRO_NORMAL SKB up for list processing.  If the batch size is
5686 * exceeded, pass the whole batch up to the stack.
 5687 */
 5688static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 5689{
 5690	list_add_tail(&skb->list, &napi->rx_list);
 5691	if (++napi->rx_count >= gro_normal_batch)
 5692		gro_normal_list(napi);
 5693}
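/* The threshold tested above is the net.core.gro_normal_batch sysctl
 * (default 8): raising it amortizes more of the list-receive overhead
 * across packets, at the cost of slightly higher per-packet latency.
 */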
 5694
 5695static gro_result_t napi_frags_finish(struct napi_struct *napi,
 5696				      struct sk_buff *skb,
 5697				      gro_result_t ret)
 5698{
 5699	switch (ret) {
 5700	case GRO_NORMAL:
 5701	case GRO_HELD:
 5702		__skb_push(skb, ETH_HLEN);
 5703		skb->protocol = eth_type_trans(skb, skb->dev);
 5704		if (ret == GRO_NORMAL)
 5705			gro_normal_one(napi, skb);
 5706		break;
 5707
 5708	case GRO_DROP:
 5709		napi_reuse_skb(napi, skb);
 5710		break;
 5711
 5712	case GRO_MERGED_FREE:
 5713		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 5714			napi_skb_free_stolen_head(skb);
 5715		else
 5716			napi_reuse_skb(napi, skb);
 5717		break;
 5718
 5719	case GRO_MERGED:
 5720	case GRO_CONSUMED:
 5721		break;
 5722	}
 5723
 5724	return ret;
 5725}
 5726
5727/* The upper GRO stack assumes the network header starts at gro_offset=0.
5728 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
5729 * we copy the Ethernet header into skb->data to have a common layout.
 5730 */
 5731static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 5732{
 5733	struct sk_buff *skb = napi->skb;
 5734	const struct ethhdr *eth;
 5735	unsigned int hlen = sizeof(*eth);
 5736
 5737	napi->skb = NULL;
 5738
 5739	skb_reset_mac_header(skb);
 5740	skb_gro_reset_offset(skb);
5741	skb_gro_reset_offset(skb);
 5742	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 5743		eth = skb_gro_header_slow(skb, hlen, 0);
 5744		if (unlikely(!eth)) {
 5745			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
 5746					     __func__, napi->dev->name);
 5747			napi_reuse_skb(napi, skb);
 5748			return NULL;
 5749		}
 5750	} else {
 5751		eth = (const struct ethhdr *)skb->data;
 5752		gro_pull_from_frag0(skb, hlen);
 5753		NAPI_GRO_CB(skb)->frag0 += hlen;
 5754		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 5755	}
 5756	__skb_pull(skb, hlen);
 5757
 5758	/*
 5759	 * This works because the only protocols we care about don't require
 5760	 * special handling.
5761	 * We'll fix it up properly in napi_frags_finish().
 5762	 */
 5763	skb->protocol = eth->h_proto;
 5764
 5765	return skb;
 5766}
 5767
 5768gro_result_t napi_gro_frags(struct napi_struct *napi)
 5769{
 5770	gro_result_t ret;
 5771	struct sk_buff *skb = napi_frags_skb(napi);
 5772
 5773	if (!skb)
 5774		return GRO_DROP;
 5775
 5776	trace_napi_gro_frags_entry(skb);
 5777
 5778	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 5779	trace_napi_gro_frags_exit(ret);
 5780
 5781	return ret;
 5782}
 5783EXPORT_SYMBOL(napi_gro_frags);
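/* Illustrative frag-mode usage (page/offset/len/truesize stand in for
 * hypothetical driver-computed values): the driver borrows napi->skb
 * via napi_get_frags(), attaches page fragments, and hands the skb
 * back to GRO:
 *
 *	skb = napi_get_frags(napi);
 *	if (unlikely(!skb))
 *		return;		// out of memory: drop the frame
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += truesize;
 *	napi_gro_frags(napi);
 */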
 5784
 5785/* Compute the checksum from gro_offset and return the folded value
 5786 * after adding in any pseudo checksum.
 5787 */
 5788__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 5789{
 5790	__wsum wsum;
 5791	__sum16 sum;
 5792
 5793	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
 5794
 5795	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
 5796	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
 5797	/* See comments in __skb_checksum_complete(). */
 5798	if (likely(!sum)) {
 5799		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
 5800		    !skb->csum_complete_sw)
 5801			netdev_rx_csum_fault(skb->dev, skb);
 5802	}
 5803
 5804	NAPI_GRO_CB(skb)->csum = wsum;
 5805	NAPI_GRO_CB(skb)->csum_valid = 1;
 5806
 5807	return sum;
 5808}
 5809EXPORT_SYMBOL(__skb_gro_checksum_complete);
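/* Protocol gro_receive handlers normally reach this function through
 * the skb_gro_checksum_validate() helpers: when a CHECKSUM_UNNECESSARY
 * credit (csum_cnt above) is available the hardware verification is
 * trusted, and only otherwise is the full software checksum computed
 * here.
 */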
 5810
 5811static void net_rps_send_ipi(struct softnet_data *remsd)
 5812{
 5813#ifdef CONFIG_RPS
 5814	while (remsd) {
 5815		struct softnet_data *next = remsd->rps_ipi_next;
 5816
 5817		if (cpu_online(remsd->cpu))
 5818			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 5819		remsd = next;
 5820	}
 5821#endif
 5822}
 5823
 5824/*
5825 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
5826 * Note: called with local irq disabled, but exits with local irq enabled.
 5827 */
 5828static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 5829{
 5830#ifdef CONFIG_RPS
 5831	struct softnet_data *remsd = sd->rps_ipi_list;
 5832
 5833	if (remsd) {
 5834		sd->rps_ipi_list = NULL;
 5835
 5836		local_irq_enable();
 5837
5838		/* Send pending IPIs to kick RPS processing on remote CPUs. */
5839		net_rps_send_ipi(remsd);
 5840	} else
 5841#endif
 5842		local_irq_enable();
 5843}
 5844
 5845static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 5846{
 5847#ifdef CONFIG_RPS
 5848	return sd->rps_ipi_list != NULL;
 5849#else
 5850	return false;
 5851#endif
 5852}
 5853
 5854static int process_backlog(struct napi_struct *napi, int quota)
 5855{
 5856	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 5857	bool again = true;
5858	int work = 0;
 5859
5860	/* Check if we have a pending IPI; it's better to send it now
5861	 * rather than waiting for net_rx_action() to end.
 5862	 */
 5863	if (sd_has_rps_ipi_waiting(sd)) {
 5864		local_irq_disable();
 5865		net_rps_action_and_irq_enable(sd);
 5866	}
 5867
 5868	napi->weight = dev_rx_weight;
5869	while (again) {
 5870		struct sk_buff *skb;
 5871
 5872		while ((skb = __skb_dequeue(&sd->process_queue))) {
5873			rcu_read_lock();
 5874			__netif_receive_skb(skb);
5875			rcu_read_unlock();
 5876			input_queue_head_incr(sd);
5877			if (++work >= quota)
 5878				return work;
 5879
 5880		}
 5881
 5882		local_irq_disable();
 5883		rps_lock(sd);
 5884		if (skb_queue_empty(&sd->input_pkt_queue)) {
 5885			/*
 5886			 * Inline a custom version of __napi_complete().
5887			 * Only the current CPU owns and manipulates this napi,
5888			 * and NAPI_STATE_SCHED is the only possible flag set
5889			 * on backlog.
5890			 * We can use a plain write instead of clear_bit(),
5891			 * and we don't need an smp_mb() memory barrier.
 5892			 */
 5893			napi->state = 0;
 5894			again = false;
 5895		} else {
 5896			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 5897						   &sd->process_queue);
5898		}
 5899		rps_unlock(sd);
 5900		local_irq_enable();
5901	}
 5902
 5903	return work;
 5904}
 5905
 5906/**
 5907 * __napi_schedule - schedule for receive
 5908 * @n: entry to schedule
 5909 *
 5910 * The entry's receive function will be scheduled to run.
 5911 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 5912 */
 5913void __napi_schedule(struct napi_struct *n)
 5914{
 5915	unsigned long flags;
 5916
 5917	local_irq_save(flags);
 5918	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 5919	local_irq_restore(flags);
 5920}
 5921EXPORT_SYMBOL(__napi_schedule);
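/* Typical caller (sketch; my_intr, my_priv and my_disable_rx_irq are
 * hypothetical driver names): a device hard interrupt masks its own RX
 * interrupt and defers the work to NAPI:
 *
 *	static irqreturn_t my_intr(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_disable_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */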
 5922
 5923/**
 5924 *	napi_schedule_prep - check if napi can be scheduled
 5925 *	@n: napi context
 5926 *
 5927 * Test if NAPI routine is already running, and if not mark
5928 * it as running.  This is used as a condition variable to
5929 * ensure only one NAPI poll instance runs.  We also make
 5930 * sure there is no pending NAPI disable.
 5931 */
 5932bool napi_schedule_prep(struct napi_struct *n)
 5933{
 5934	unsigned long val, new;
 5935
 5936	do {
 5937		val = READ_ONCE(n->state);
 5938		if (unlikely(val & NAPIF_STATE_DISABLE))
 5939			return false;
 5940		new = val | NAPIF_STATE_SCHED;
 5941
 5942		/* Sets STATE_MISSED bit if STATE_SCHED was already set
5943		 * This was suggested by Alexander Duyck, as the compiler
5944		 * emits better code than:
 5945		 * if (val & NAPIF_STATE_SCHED)
 5946		 *     new |= NAPIF_STATE_MISSED;
 5947		 */
 5948		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 5949						   NAPIF_STATE_MISSED;
 5950	} while (cmpxchg(&n->state, val, new) != val);
 5951
 5952	return !(val & NAPIF_STATE_SCHED);
 5953}
 5954EXPORT_SYMBOL(napi_schedule_prep);
 5955
 5956/**
 5957 * __napi_schedule_irqoff - schedule for receive
 5958 * @n: entry to schedule
 5959 *
 5960 * Variant of __napi_schedule() assuming hard irqs are masked
 5961 */
 5962void __napi_schedule_irqoff(struct napi_struct *n)
 5963{
 5964	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 5965}
 5966EXPORT_SYMBOL(__napi_schedule_irqoff);
 5967
5968bool napi_complete_done(struct napi_struct *n, int work_done)
 5969{
 5970	unsigned long flags, val, new;
 5971
 5972	/*
5973	 * 1) Don't let napi dequeue from the CPU poll list
5974	 *    just in case it's running on a different CPU.
5975	 * 2) If we are busy polling, do nothing here; we have
5976	 *    the guarantee we will be called later.
 5977	 */
 5978	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 5979				 NAPIF_STATE_IN_BUSY_POLL)))
 5980		return false;
 5981
 5982	gro_normal_list(n);
 5983
 5984	if (n->gro_bitmask) {
 5985		unsigned long timeout = 0;
 5986
 5987		if (work_done)
 5988			timeout = n->dev->gro_flush_timeout;
 5989
 5990		/* When the NAPI instance uses a timeout and keeps postponing
5991		 * it, we need to somehow bound the time packets are kept in
5992		 * the GRO layer.
 5993		 */
 5994		napi_gro_flush(n, !!timeout);
 5995		if (timeout)
 5996			hrtimer_start(&n->timer, ns_to_ktime(timeout),
5997				      HRTIMER_MODE_REL_PINNED);
 5998	}
5999	if (unlikely(!list_empty(&n->poll_list))) {
 6000		/* If n->poll_list is not empty, we need to mask irqs */
 6001		local_irq_save(flags);
 6002		list_del_init(&n->poll_list);
 6003		local_irq_restore(flags);
 6004	}
 6005
 6006	do {
 6007		val = READ_ONCE(n->state);
 6008
 6009		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 6010
 6011		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
 6012
 6013		/* If STATE_MISSED was set, leave STATE_SCHED set,
 6014		 * because we will call napi->poll() one more time.
 6015		 * This C code was suggested by Alexander Duyck to help gcc.
 6016		 */
 6017		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
 6018						    NAPIF_STATE_SCHED;
 6019	} while (cmpxchg(&n->state, val, new) != val);
 6020
 6021	if (unlikely(val & NAPIF_STATE_MISSED)) {
 6022		__napi_schedule(n);
 6023		return false;
 6024	}
 6025
 6026	return true;
 6027}
 6028EXPORT_SYMBOL(napi_complete_done);
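/* Note for callers: only when napi_complete_done() returns true (no
 * NAPIF_STATE_MISSED rescheduling was requested) should a driver
 * re-enable its RX interrupt; see the sketch after napi_gro_receive()
 * above.
 */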
 6029
6030/* must be called under rcu_read_lock(), as we don't take a reference */
 6031static struct napi_struct *napi_by_id(unsigned int napi_id)
 6032{
 6033	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6034	struct napi_struct *napi;
 6035
 6036	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6037		if (napi->napi_id == napi_id)
 6038			return napi;
 6039
 6040	return NULL;
 6041}
 6042
 6043#if defined(CONFIG_NET_RX_BUSY_POLL)
 6044
 6045#define BUSY_POLL_BUDGET 8
 6046
 6047static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 6048{
 6049	int rc;
 6050
 6051	/* Busy polling means there is a high chance device driver hard irq
 6052	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6053	 * set in napi_schedule_prep().
 6054	 * Since we are about to call napi->poll() once more, we can safely
 6055	 * clear NAPI_STATE_MISSED.
 6056	 *
 6057	 * Note: x86 could use a single "lock and ..." instruction
 6058	 * to perform these two clear_bit()
 6059	 */
 6060	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6061	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6062
 6063	local_bh_disable();
 6064
 6065	/* All we really want here is to re-enable device interrupts.
 6066	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6067	 */
 6068	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 6069	/* We can't gro_normal_list() here, because napi->poll() might have
 6070	 * rearmed the napi (napi_complete_done()) in which case it could
 6071	 * already be running on another CPU.
 6072	 */
 6073	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
 6074	netpoll_poll_unlock(have_poll_lock);
 6075	if (rc == BUSY_POLL_BUDGET) {
 6076		/* As the whole budget was spent, we still own the napi so can
 6077		 * safely handle the rx_list.
 6078		 */
 6079		gro_normal_list(napi);
 6080		__napi_schedule(napi);
 6081	}
 6082	local_bh_enable();
 6083}
 6084
 6085void napi_busy_loop(unsigned int napi_id,
 6086		    bool (*loop_end)(void *, unsigned long),
 6087		    void *loop_end_arg)
 6088{
 6089	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6090	int (*napi_poll)(struct napi_struct *napi, int budget);
 6091	void *have_poll_lock = NULL;
 6092	struct napi_struct *napi;
 6093
 6094restart:
 6095	napi_poll = NULL;
 6096
 6097	rcu_read_lock();
 6098
 6099	napi = napi_by_id(napi_id);
 6100	if (!napi)
 6101		goto out;
 6102
 6103	preempt_disable();
 6104	for (;;) {
 6105		int work = 0;
6106
 6107		local_bh_disable();
 6108		if (!napi_poll) {
 6109			unsigned long val = READ_ONCE(napi->state);
 6110
 6111			/* If multiple threads are competing for this napi,
 6112			 * we avoid dirtying napi->state as much as we can.
 6113			 */
 6114			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6115				   NAPIF_STATE_IN_BUSY_POLL))
 6116				goto count;
 6117			if (cmpxchg(&napi->state, val,
 6118				    val | NAPIF_STATE_IN_BUSY_POLL |
 6119					  NAPIF_STATE_SCHED) != val)
 6120				goto count;
 6121			have_poll_lock = netpoll_poll_lock(napi);
 6122			napi_poll = napi->poll;
 6123		}
 6124		work = napi_poll(napi, BUSY_POLL_BUDGET);
 6125		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 6126		gro_normal_list(napi);
 6127count:
 6128		if (work > 0)
 6129			__NET_ADD_STATS(dev_net(napi->dev),
 6130					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6131		local_bh_enable();
 6132
 6133		if (!loop_end || loop_end(loop_end_arg, start_time))
 6134			break;
 6135
 6136		if (unlikely(need_resched())) {
 6137			if (napi_poll)
 6138				busy_poll_stop(napi, have_poll_lock);
 6139			preempt_enable();
 6140			rcu_read_unlock();
 6141			cond_resched();
 6142			if (loop_end(loop_end_arg, start_time))
 6143				return;
 6144			goto restart;
 6145		}
 6146		cpu_relax();
 6147	}
 6148	if (napi_poll)
 6149		busy_poll_stop(napi, have_poll_lock);
 6150	preempt_enable();
 6151out:
6152	rcu_read_unlock();
 6153}
 6154EXPORT_SYMBOL(napi_busy_loop);
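/* napi_busy_loop() is driven from the socket layer: sk_busy_loop()
 * passes the napi_id recorded on received skbs (skb_mark_napi_id())
 * along with a loop_end callback that bounds the spin by the socket's
 * SO_BUSY_POLL budget.
 */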
 6155
 6156#endif /* CONFIG_NET_RX_BUSY_POLL */
 6157
 6158static void napi_hash_add(struct napi_struct *napi)
 6159{
 6160	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
 6161	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
 6162		return;
 6163
 6164	spin_lock(&napi_hash_lock);
 6165
 6166	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6167	do {
 6168		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6169			napi_gen_id = MIN_NAPI_ID;
 6170	} while (napi_by_id(napi_gen_id));
 6171	napi->napi_id = napi_gen_id;
 6172
 6173	hlist_add_head_rcu(&napi->napi_hash_node,
 6174			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6175
 6176	spin_unlock(&napi_hash_lock);
6177}
 6178
6179/* Warning: the caller is responsible for making sure an RCU grace
6180 * period has elapsed before freeing the memory containing @napi.
 6181 */
 6182bool napi_hash_del(struct napi_struct *napi)
 6183{
 6184	bool rcu_sync_needed = false;
 6185
 6186	spin_lock(&napi_hash_lock);
 6187
 6188	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
 6189		rcu_sync_needed = true;
 6190		hlist_del_rcu(&napi->napi_hash_node);
 6191	}
 6192	spin_unlock(&napi_hash_lock);
 6193	return rcu_sync_needed;
 6194}
 6195EXPORT_SYMBOL_GPL(napi_hash_del);
 6196
 6197static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6198{
 6199	struct napi_struct *napi;
 6200
 6201	napi = container_of(timer, struct napi_struct, timer);
 6202
 6203	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6204	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6205	 */
 6206	if (napi->gro_bitmask && !napi_disable_pending(napi) &&
 6207	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 6208		__napi_schedule_irqoff(napi);
 6209
 6210	return HRTIMER_NORESTART;
 6211}
 6212
 6213static void init_gro_hash(struct napi_struct *napi)
 6214{
 6215	int i;
 6216
 6217	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6218		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6219		napi->gro_hash[i].count = 0;
 6220	}
 6221	napi->gro_bitmask = 0;
 6222}
 6223
 6224void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 6225		    int (*poll)(struct napi_struct *, int), int weight)
 6226{
 6227	INIT_LIST_HEAD(&napi->poll_list);
 6228	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6229	napi->timer.function = napi_watchdog;
6230	init_gro_hash(napi);
 6231	napi->skb = NULL;
 6232	INIT_LIST_HEAD(&napi->rx_list);
 6233	napi->rx_count = 0;
 6234	napi->poll = poll;
 6235	if (weight > NAPI_POLL_WEIGHT)
 6236		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6237				weight);
 6238	napi->weight = weight;
 6239	list_add(&napi->dev_list, &dev->napi_list);
 6240	napi->dev = dev;
6241#ifdef CONFIG_NETPOLL
 6242	napi->poll_owner = -1;
 6243#endif
 6244	set_bit(NAPI_STATE_SCHED, &napi->state);
 6245	napi_hash_add(napi);
 6246}
 6247EXPORT_SYMBOL(netif_napi_add);
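/* Registration sketch (netdev, priv and my_poll are hypothetical
 * driver names): NAPI contexts are added at probe time and start with
 * NAPI_STATE_SCHED set, i.e. owned, until napi_enable() clears it:
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	// usually from ndo_open()
 */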
 6248
 6249void napi_disable(struct napi_struct *n)
 6250{
 6251	might_sleep();
 6252	set_bit(NAPI_STATE_DISABLE, &n->state);
 6253
 6254	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
 6255		msleep(1);
 6256	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
 6257		msleep(1);
 6258
 6259	hrtimer_cancel(&n->timer);
 6260
 6261	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6262}
 6263EXPORT_SYMBOL(napi_disable);
 6264
 6265static void flush_gro_hash(struct napi_struct *napi)
 6266{
 6267	int i;
 6268
 6269	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6270		struct sk_buff *skb, *n;
 6271
 6272		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6273			kfree_skb(skb);
 6274		napi->gro_hash[i].count = 0;
 6275	}
 6276}
 6277
 6278/* Must be called in process context */
 6279void netif_napi_del(struct napi_struct *napi)
 6280{
 6281	might_sleep();
 6282	if (napi_hash_del(napi))
 6283		synchronize_net();
 6284	list_del_init(&napi->dev_list);
 6285	napi_free_frags(napi);
 6286
 6287	flush_gro_hash(napi);
6288	napi->gro_bitmask = 0;
 6289}
 6290EXPORT_SYMBOL(netif_napi_del);
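/* Teardown note: the usual order is napi_disable() (e.g. from
 * ndo_stop()) to wait out any in-flight poll, then netif_napi_del()
 * from process context, so the synchronize_net() above may sleep.
 */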
 6291
 6292static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6293{
 6294	void *have;
 6295	int work, weight;
 6296
 6297	list_del_init(&n->poll_list);
 6298
 6299	have = netpoll_poll_lock(n);
 6300
 6301	weight = n->weight;
 6302
 6303	/* This NAPI_STATE_SCHED test is for avoiding a race
 6304	 * with netpoll's poll_napi().  Only the entity which
 6305	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6306	 * actually make the ->poll() call.  Therefore we avoid
 6307	 * accidentally calling ->poll() when NAPI is not scheduled.
 6308	 */
 6309	work = 0;
 6310	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 6311		work = n->poll(n, weight);
 6312		trace_napi_poll(n, work, weight);
 6313	}
 6314
 6315	WARN_ON_ONCE(work > weight);
 6316
 6317	if (likely(work < weight))
 6318		goto out_unlock;
 6319
 6320	/* Drivers must not modify the NAPI state if they
 6321	 * consume the entire weight.  In such cases this code
 6322	 * still "owns" the NAPI instance and therefore can
 6323	 * move the instance around on the list at-will.
 6324	 */
 6325	if (unlikely(napi_disable_pending(n))) {
 6326		napi_complete(n);
 6327		goto out_unlock;
 6328	}
 6329
 6330	gro_normal_list(n);
 6331
 6332	if (n->gro_bitmask) {
 6333		/* flush too old packets
 6334		 * If HZ < 1000, flush all packets.
 6335		 */
 6336		napi_gro_flush(n, HZ >= 1000);
 6337	}
 6338
 6339	/* Some drivers may have called napi_schedule
 6340	 * prior to exhausting their budget.
 6341	 */
 6342	if (unlikely(!list_empty(&n->poll_list))) {
 6343		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6344			     n->dev ? n->dev->name : "backlog");
 6345		goto out_unlock;
 6346	}
 6347
 6348	list_add_tail(&n->poll_list, repoll);
 6349
 6350out_unlock:
 6351	netpoll_poll_unlock(have);
 6352
 6353	return work;
 6354}
 6355
 6356static __latent_entropy void net_rx_action(struct softirq_action *h)
 6357{
 6358	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 6359	unsigned long time_limit = jiffies +
 6360		usecs_to_jiffies(netdev_budget_usecs);
 6361	int budget = netdev_budget;
 6362	LIST_HEAD(list);
 6363	LIST_HEAD(repoll);
 6364
 6365	local_irq_disable();
 6366	list_splice_init(&sd->poll_list, &list);
 6367	local_irq_enable();
 6368
 6369	for (;;) {
 6370		struct napi_struct *n;
 6371
 6372		if (list_empty(&list)) {
 6373			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 6374				goto out;
 6375			break;
 6376		}
 6377
 6378		n = list_first_entry(&list, struct napi_struct, poll_list);
 6379		budget -= napi_poll(n, &repoll);
 6380
6381		/* If the softirq window is exhausted then punt.
6382		 * Allow this to run for 2 jiffies, which allows
6383		 * an average latency of 1.5/HZ.
 6384		 */
 6385		if (unlikely(budget <= 0 ||
 6386			     time_after_eq(jiffies, time_limit))) {
 6387			sd->time_squeeze++;
 6388			break;
 6389		}
 6390	}
6391
 6392	local_irq_disable();
 6393
 6394	list_splice_tail_init(&sd->poll_list, &list);
 6395	list_splice_tail(&repoll, &list);
 6396	list_splice(&list, &sd->poll_list);
 6397	if (!list_empty(&sd->poll_list))
 6398		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 6399
 6400	net_rps_action_and_irq_enable(sd);
 6401out:
 6402	__kfree_skb_flush();
 6403}
 6404
 6405struct netdev_adjacent {
 6406	struct net_device *dev;
 6407
6408	/* upper master flag; there can be only one master device per list */
 6409	bool master;
 6410
 6411	/* lookup ignore flag */
 6412	bool ignore;
 6413
 6414	/* counter for the number of times this device was added to us */
 6415	u16 ref_nr;
 6416
 6417	/* private field for the users */
 6418	void *private;
 6419
 6420	struct list_head list;
 6421	struct rcu_head rcu;
 6422};
 6423
 6424static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6425						 struct list_head *adj_list)
 6426{
 6427	struct netdev_adjacent *adj;
 6428
 6429	list_for_each_entry(adj, adj_list, list) {
 6430		if (adj->dev == adj_dev)
 6431			return adj;
 6432	}
 6433	return NULL;
 6434}
 6435
 6436static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
 6437{
 6438	struct net_device *dev = data;
 6439
 6440	return upper_dev == dev;
 6441}
 6442
 6443/**
 6444 * netdev_has_upper_dev - Check if device is linked to an upper device
 6445 * @dev: device
 6446 * @upper_dev: upper device to check
 6447 *
6448 * Find out if a device is linked to the specified upper device and return
6449 * true in case it is. Note that this checks only the immediate upper device,
 6450 * not through a complete stack of devices. The caller must hold the RTNL lock.
 6451 */
 6452bool netdev_has_upper_dev(struct net_device *dev,
 6453			  struct net_device *upper_dev)
 6454{
 6455	ASSERT_RTNL();
 6456
 6457	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6458					     upper_dev);
 6459}
 6460EXPORT_SYMBOL(netdev_has_upper_dev);
 6461
 6462/**
6463 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 6464 * @dev: device
 6465 * @upper_dev: upper device to check
 6466 *
6467 * Find out if a device is linked to the specified upper device and return
6468 * true in case it is. Note that this checks the entire upper device chain.
6469 * The caller must hold the RCU read lock.
 6470 */
 6471
 6472bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6473				  struct net_device *upper_dev)
 6474{
 6475	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6476					       upper_dev);
 6477}
 6478EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6479
 6480/**
 6481 * netdev_has_any_upper_dev - Check if device is linked to some device
 6482 * @dev: device
 6483 *
 6484 * Find out if a device is linked to an upper device and return true in case
 6485 * it is. The caller must hold the RTNL lock.
 6486 */
 6487bool netdev_has_any_upper_dev(struct net_device *dev)
 6488{
 6489	ASSERT_RTNL();
 6490
 6491	return !list_empty(&dev->adj_list.upper);
 6492}
 6493EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6494
 6495/**
 6496 * netdev_master_upper_dev_get - Get master upper device
 6497 * @dev: device
 6498 *
 6499 * Find a master upper device and return pointer to it or NULL in case
 6500 * it's not there. The caller must hold the RTNL lock.
 6501 */
 6502struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6503{
 6504	struct netdev_adjacent *upper;
 6505
 6506	ASSERT_RTNL();
 6507
 6508	if (list_empty(&dev->adj_list.upper))
 6509		return NULL;
 6510
 6511	upper = list_first_entry(&dev->adj_list.upper,
 6512				 struct netdev_adjacent, list);
 6513	if (likely(upper->master))
 6514		return upper->dev;
 6515	return NULL;
 6516}
 6517EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6518
 6519static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6520{
 6521	struct netdev_adjacent *upper;
 6522
 6523	ASSERT_RTNL();
 6524
 6525	if (list_empty(&dev->adj_list.upper))
 6526		return NULL;
 6527
 6528	upper = list_first_entry(&dev->adj_list.upper,
 6529				 struct netdev_adjacent, list);
 6530	if (likely(upper->master) && !upper->ignore)
 6531		return upper->dev;
 6532	return NULL;
 6533}
 6534
 6535/**
 6536 * netdev_has_any_lower_dev - Check if device is linked to some device
 6537 * @dev: device
 6538 *
 6539 * Find out if a device is linked to a lower device and return true in case
 6540 * it is. The caller must hold the RTNL lock.
 6541 */
 6542static bool netdev_has_any_lower_dev(struct net_device *dev)
 6543{
 6544	ASSERT_RTNL();
 6545
 6546	return !list_empty(&dev->adj_list.lower);
 6547}
 6548
 6549void *netdev_adjacent_get_private(struct list_head *adj_list)
 6550{
 6551	struct netdev_adjacent *adj;
 6552
 6553	adj = list_entry(adj_list, struct netdev_adjacent, list);
 6554
 6555	return adj->private;
 6556}
 6557EXPORT_SYMBOL(netdev_adjacent_get_private);
 6558
 6559/**
 6560 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 6561 * @dev: device
 6562 * @iter: list_head ** of the current position
 6563 *
 6564 * Gets the next device from the dev's upper list, starting from iter
 6565 * position. The caller must hold RCU read lock.
 6566 */
 6567struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 6568						 struct list_head **iter)
 6569{
 6570	struct netdev_adjacent *upper;
 6571
 6572	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6573
 6574	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6575
 6576	if (&upper->list == &dev->adj_list.upper)
 6577		return NULL;
 6578
 6579	*iter = &upper->list;
 6580
 6581	return upper->dev;
 6582}
 6583EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 6584
 6585static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 6586						  struct list_head **iter,
 6587						  bool *ignore)
 6588{
 6589	struct netdev_adjacent *upper;
 6590
 6591	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 6592
 6593	if (&upper->list == &dev->adj_list.upper)
 6594		return NULL;
 6595
 6596	*iter = &upper->list;
 6597	*ignore = upper->ignore;
 6598
 6599	return upper->dev;
 6600}
 6601
 6602static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 6603						    struct list_head **iter)
 6604{
 6605	struct netdev_adjacent *upper;
 6606
 6607	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6608
 6609	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6610
 6611	if (&upper->list == &dev->adj_list.upper)
 6612		return NULL;
 6613
 6614	*iter = &upper->list;
 6615
 6616	return upper->dev;
 6617}
 6618
 6619static int __netdev_walk_all_upper_dev(struct net_device *dev,
 6620				       int (*fn)(struct net_device *dev,
 6621						 void *data),
 6622				       void *data)
 6623{
 6624	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6625	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6626	int ret, cur = 0;
 6627	bool ignore;
 6628
 6629	now = dev;
 6630	iter = &dev->adj_list.upper;
 6631
 6632	while (1) {
 6633		if (now != dev) {
 6634			ret = fn(now, data);
 6635			if (ret)
 6636				return ret;
 6637		}
 6638
 6639		next = NULL;
 6640		while (1) {
 6641			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 6642			if (!udev)
 6643				break;
 6644			if (ignore)
 6645				continue;
 6646
 6647			next = udev;
 6648			niter = &udev->adj_list.upper;
 6649			dev_stack[cur] = now;
 6650			iter_stack[cur++] = iter;
 6651			break;
 6652		}
 6653
 6654		if (!next) {
 6655			if (!cur)
 6656				return 0;
 6657			next = dev_stack[--cur];
 6658			niter = iter_stack[cur];
 6659		}
 6660
 6661		now = next;
 6662		iter = niter;
 6663	}
 6664
 6665	return 0;
 6666}
 6667
 6668int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 6669				  int (*fn)(struct net_device *dev,
 6670					    void *data),
 6671				  void *data)
 6672{
 6673	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6674	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6675	int ret, cur = 0;
 6676
 6677	now = dev;
 6678	iter = &dev->adj_list.upper;
 6679
 6680	while (1) {
 6681		if (now != dev) {
 6682			ret = fn(now, data);
 6683			if (ret)
 6684				return ret;
 6685		}
 6686
 6687		next = NULL;
 6688		while (1) {
 6689			udev = netdev_next_upper_dev_rcu(now, &iter);
 6690			if (!udev)
 6691				break;
 6692
 6693			next = udev;
 6694			niter = &udev->adj_list.upper;
 6695			dev_stack[cur] = now;
 6696			iter_stack[cur++] = iter;
 6697			break;
 6698		}
 6699
 6700		if (!next) {
 6701			if (!cur)
 6702				return 0;
 6703			next = dev_stack[--cur];
 6704			niter = iter_stack[cur];
 6705		}
 6706
 6707		now = next;
 6708		iter = niter;
 6709	}
 6710
 6711	return 0;
 6712}
 6713EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
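/* Walker usage sketch (count_uppers is a hypothetical callback): fn is
 * invoked once per upper device, depth first, and a non-zero return
 * value stops the walk and is propagated to the caller:
 *
 *	static int count_uppers(struct net_device *dev, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int n = 0;
 *
 *	netdev_walk_all_upper_dev_rcu(dev, count_uppers, &n);
 */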
 6714
 6715static bool __netdev_has_upper_dev(struct net_device *dev,
 6716				   struct net_device *upper_dev)
 6717{
 6718	ASSERT_RTNL();
 6719
 6720	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 6721					   upper_dev);
 6722}
 6723
 6724/**
 6725 * netdev_lower_get_next_private - Get the next ->private from the
 6726 *				   lower neighbour list
 6727 * @dev: device
 6728 * @iter: list_head ** of the current position
 6729 *
 6730 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6731 * list, starting from iter position. The caller must either hold the
 6732 * RTNL lock or its own locking that guarantees that the neighbour lower
 6733 * list will remain unchanged.
 6734 */
 6735void *netdev_lower_get_next_private(struct net_device *dev,
 6736				    struct list_head **iter)
 6737{
 6738	struct netdev_adjacent *lower;
 6739
 6740	lower = list_entry(*iter, struct netdev_adjacent, list);
 6741
 6742	if (&lower->list == &dev->adj_list.lower)
 6743		return NULL;
 6744
 6745	*iter = lower->list.next;
 6746
 6747	return lower->private;
 6748}
 6749EXPORT_SYMBOL(netdev_lower_get_next_private);
 6750
 6751/**
 6752 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 6753 *				       lower neighbour list, RCU
 6754 *				       variant
 6755 * @dev: device
 6756 * @iter: list_head ** of the current position
 6757 *
 6758 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 6759 * list, starting from iter position. The caller must hold RCU read lock.
 6760 */
 6761void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 6762					struct list_head **iter)
 6763{
 6764	struct netdev_adjacent *lower;
 6765
 6766	WARN_ON_ONCE(!rcu_read_lock_held());
 6767
 6768	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6769
 6770	if (&lower->list == &dev->adj_list.lower)
 6771		return NULL;
 6772
 6773	*iter = &lower->list;
 6774
 6775	return lower->private;
 6776}
 6777EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 6778
 6779/**
 6780 * netdev_lower_get_next - Get the next device from the lower neighbour
 6781 *                         list
 6782 * @dev: device
 6783 * @iter: list_head ** of the current position
 6784 *
 6785 * Gets the next netdev_adjacent from the dev's lower neighbour
 6786 * list, starting from iter position. The caller must hold RTNL lock or
 6787 * its own locking that guarantees that the neighbour lower
 6788 * list will remain unchanged.
 6789 */
 6790void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 6791{
 6792	struct netdev_adjacent *lower;
 6793
 6794	lower = list_entry(*iter, struct netdev_adjacent, list);
 6795
 6796	if (&lower->list == &dev->adj_list.lower)
 6797		return NULL;
 6798
 6799	*iter = lower->list.next;
 6800
 6801	return lower->dev;
 6802}
 6803EXPORT_SYMBOL(netdev_lower_get_next);
 6804
 6805static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 6806						struct list_head **iter)
 6807{
 6808	struct netdev_adjacent *lower;
 6809
 6810	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 6811
 6812	if (&lower->list == &dev->adj_list.lower)
 6813		return NULL;
 6814
 6815	*iter = &lower->list;
 6816
 6817	return lower->dev;
 6818}
 6819
 6820static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 6821						  struct list_head **iter,
 6822						  bool *ignore)
 6823{
 6824	struct netdev_adjacent *lower;
 6825
 6826	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 6827
 6828	if (&lower->list == &dev->adj_list.lower)
 6829		return NULL;
 6830
 6831	*iter = &lower->list;
 6832	*ignore = lower->ignore;
 6833
 6834	return lower->dev;
 6835}
 6836
 6837int netdev_walk_all_lower_dev(struct net_device *dev,
 6838			      int (*fn)(struct net_device *dev,
 6839					void *data),
 6840			      void *data)
 6841{
 6842	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6843	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6844	int ret, cur = 0;
 6845
 6846	now = dev;
 6847	iter = &dev->adj_list.lower;
 6848
 6849	while (1) {
 6850		if (now != dev) {
 6851			ret = fn(now, data);
 6852			if (ret)
 6853				return ret;
 6854		}
 6855
 6856		next = NULL;
 6857		while (1) {
 6858			ldev = netdev_next_lower_dev(now, &iter);
 6859			if (!ldev)
 6860				break;
 6861
 6862			next = ldev;
 6863			niter = &ldev->adj_list.lower;
 6864			dev_stack[cur] = now;
 6865			iter_stack[cur++] = iter;
 6866			break;
 6867		}
 6868
 6869		if (!next) {
 6870			if (!cur)
 6871				return 0;
 6872			next = dev_stack[--cur];
 6873			niter = iter_stack[cur];
 6874		}
 6875
 6876		now = next;
 6877		iter = niter;
 6878	}
 6879
 6880	return 0;
 6881}
 6882EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 6883
 6884static int __netdev_walk_all_lower_dev(struct net_device *dev,
 6885				       int (*fn)(struct net_device *dev,
 6886						 void *data),
 6887				       void *data)
 6888{
 6889	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6890	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6891	int ret, cur = 0;
 6892	bool ignore;
 6893
 6894	now = dev;
 6895	iter = &dev->adj_list.lower;
 6896
 6897	while (1) {
 6898		if (now != dev) {
 6899			ret = fn(now, data);
 6900			if (ret)
 6901				return ret;
 6902		}
 6903
 6904		next = NULL;
 6905		while (1) {
 6906			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 6907			if (!ldev)
 6908				break;
 6909			if (ignore)
 6910				continue;
 6911
 6912			next = ldev;
 6913			niter = &ldev->adj_list.lower;
 6914			dev_stack[cur] = now;
 6915			iter_stack[cur++] = iter;
 6916			break;
 6917		}
 6918
 6919		if (!next) {
 6920			if (!cur)
 6921				return 0;
 6922			next = dev_stack[--cur];
 6923			niter = iter_stack[cur];
 6924		}
 6925
 6926		now = next;
 6927		iter = niter;
 6928	}
 6929
 6930	return 0;
 6931}
 6932
 6933static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 6934						    struct list_head **iter)
 6935{
 6936	struct netdev_adjacent *lower;
 6937
 6938	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6939	if (&lower->list == &dev->adj_list.lower)
 6940		return NULL;
 6941
 6942	*iter = &lower->list;
 6943
 6944	return lower->dev;
 6945}
 6946
 6947static u8 __netdev_upper_depth(struct net_device *dev)
 6948{
 6949	struct net_device *udev;
 6950	struct list_head *iter;
 6951	u8 max_depth = 0;
 6952	bool ignore;
 6953
 6954	for (iter = &dev->adj_list.upper,
 6955	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 6956	     udev;
 6957	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 6958		if (ignore)
 6959			continue;
 6960		if (max_depth < udev->upper_level)
 6961			max_depth = udev->upper_level;
 6962	}
 6963
 6964	return max_depth;
 6965}
 6966
 6967static u8 __netdev_lower_depth(struct net_device *dev)
 6968{
 6969	struct net_device *ldev;
 6970	struct list_head *iter;
 6971	u8 max_depth = 0;
 6972	bool ignore;
 6973
 6974	for (iter = &dev->adj_list.lower,
 6975	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 6976	     ldev;
 6977	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 6978		if (ignore)
 6979			continue;
 6980		if (max_depth < ldev->lower_level)
 6981			max_depth = ldev->lower_level;
 6982	}
 6983
 6984	return max_depth;
 6985}
 6986
 6987static int __netdev_update_upper_level(struct net_device *dev, void *data)
 6988{
 6989	dev->upper_level = __netdev_upper_depth(dev) + 1;
 6990	return 0;
 6991}
 6992
 6993static int __netdev_update_lower_level(struct net_device *dev, void *data)
 6994{
 6995	dev->lower_level = __netdev_lower_depth(dev) + 1;
 6996	return 0;
 6997}
 6998
 6999int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7000				  int (*fn)(struct net_device *dev,
 7001					    void *data),
 7002				  void *data)
 7003{
 7004	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7005	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7006	int ret, cur = 0;
 7007
 7008	now = dev;
 7009	iter = &dev->adj_list.lower;
 7010
 7011	while (1) {
 7012		if (now != dev) {
 7013			ret = fn(now, data);
 7014			if (ret)
 7015				return ret;
 7016		}
 7017
 7018		next = NULL;
 7019		while (1) {
 7020			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7021			if (!ldev)
 7022				break;
 7023
 7024			next = ldev;
 7025			niter = &ldev->adj_list.lower;
 7026			dev_stack[cur] = now;
 7027			iter_stack[cur++] = iter;
 7028			break;
 7029		}
 7030
 7031		if (!next) {
 7032			if (!cur)
 7033				return 0;
 7034			next = dev_stack[--cur];
 7035			niter = iter_stack[cur];
 7036		}
 7037
 7038		now = next;
 7039		iter = niter;
 7040	}
 7041
 7042	return 0;
 7043}
 7044EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7045
 7046/**
 7047 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7048 *				       lower neighbour list, RCU
 7049 *				       variant
 7050 * @dev: device
 7051 *
 7052 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7053 * list. The caller must hold RCU read lock.
 7054 */
 7055void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7056{
 7057	struct netdev_adjacent *lower;
 7058
 7059	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7060			struct netdev_adjacent, list);
 7061	if (lower)
 7062		return lower->private;
 7063	return NULL;
 7064}
 7065EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7066
 7067/**
 7068 * netdev_master_upper_dev_get_rcu - Get master upper device
 7069 * @dev: device
 7070 *
 7071 * Find a master upper device and return pointer to it or NULL in case
 7072 * it's not there. The caller must hold the RCU read lock.
 7073 */
 7074struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7075{
 7076	struct netdev_adjacent *upper;
 7077
 7078	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7079				       struct netdev_adjacent, list);
 7080	if (upper && likely(upper->master))
 7081		return upper->dev;
 7082	return NULL;
 7083}
 7084EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7085
 7086static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7087			      struct net_device *adj_dev,
 7088			      struct list_head *dev_list)
 7089{
 7090	char linkname[IFNAMSIZ+7];
 7091
 7092	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7093		"upper_%s" : "lower_%s", adj_dev->name);
 7094	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7095				 linkname);
 7096}
 7097static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7098			       char *name,
 7099			       struct list_head *dev_list)
 7100{
 7101	char linkname[IFNAMSIZ+7];
 7102
 7103	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7104		"upper_%s" : "lower_%s", name);
 7105	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7106}
 7107
 7108static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7109						 struct net_device *adj_dev,
 7110						 struct list_head *dev_list)
 7111{
 7112	return (dev_list == &dev->adj_list.upper ||
 7113		dev_list == &dev->adj_list.lower) &&
 7114		net_eq(dev_net(dev), dev_net(adj_dev));
 7115}
 7116
 7117static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7118					struct net_device *adj_dev,
 7119					struct list_head *dev_list,
 7120					void *private, bool master)
 7121{
 7122	struct netdev_adjacent *adj;
 7123	int ret;
 7124
 7125	adj = __netdev_find_adj(adj_dev, dev_list);
 7126
 7127	if (adj) {
 7128		adj->ref_nr += 1;
 7129		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7130			 dev->name, adj_dev->name, adj->ref_nr);
 7131
 7132		return 0;
 7133	}
 7134
 7135	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7136	if (!adj)
 7137		return -ENOMEM;
 7138
 7139	adj->dev = adj_dev;
 7140	adj->master = master;
 7141	adj->ref_nr = 1;
 7142	adj->private = private;
 7143	adj->ignore = false;
 7144	dev_hold(adj_dev);
 7145
 7146	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7147		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7148
 7149	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7150		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7151		if (ret)
 7152			goto free_adj;
 7153	}
 7154
7155	/* Ensure that the master link is always the first item in the list. */
 7156	if (master) {
 7157		ret = sysfs_create_link(&(dev->dev.kobj),
 7158					&(adj_dev->dev.kobj), "master");
 7159		if (ret)
 7160			goto remove_symlinks;
 7161
 7162		list_add_rcu(&adj->list, dev_list);
 7163	} else {
 7164		list_add_tail_rcu(&adj->list, dev_list);
 7165	}
 7166
 7167	return 0;
 7168
 7169remove_symlinks:
 7170	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7171		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7172free_adj:
 7173	kfree(adj);
 7174	dev_put(adj_dev);
 7175
 7176	return ret;
 7177}
 7178
 7179static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7180					 struct net_device *adj_dev,
 7181					 u16 ref_nr,
 7182					 struct list_head *dev_list)
 7183{
 7184	struct netdev_adjacent *adj;
 7185
 7186	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7187		 dev->name, adj_dev->name, ref_nr);
 7188
 7189	adj = __netdev_find_adj(adj_dev, dev_list);
 7190
 7191	if (!adj) {
 7192		pr_err("Adjacency does not exist for device %s from %s\n",
 7193		       dev->name, adj_dev->name);
 7194		WARN_ON(1);
 7195		return;
 7196	}
 7197
 7198	if (adj->ref_nr > ref_nr) {
 7199		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7200			 dev->name, adj_dev->name, ref_nr,
 7201			 adj->ref_nr - ref_nr);
 7202		adj->ref_nr -= ref_nr;
 7203		return;
 7204	}
 7205
 7206	if (adj->master)
 7207		sysfs_remove_link(&(dev->dev.kobj), "master");
 7208
 7209	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7210		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7211
 7212	list_del_rcu(&adj->list);
 7213	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7214		 adj_dev->name, dev->name, adj_dev->name);
 7215	dev_put(adj_dev);
 7216	kfree_rcu(adj, rcu);
 7217}
 7218
 7219static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7220					    struct net_device *upper_dev,
 7221					    struct list_head *up_list,
 7222					    struct list_head *down_list,
 7223					    void *private, bool master)
 7224{
 7225	int ret;
 7226
 7227	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7228					   private, master);
 7229	if (ret)
 7230		return ret;
 7231
 7232	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7233					   private, false);
 7234	if (ret) {
 7235		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7236		return ret;
 7237	}
 7238
 7239	return 0;
 7240}
7241
 7242static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7243					       struct net_device *upper_dev,
 7244					       u16 ref_nr,
 7245					       struct list_head *up_list,
 7246					       struct list_head *down_list)
 7247{
 7248	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7249	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7250}
 7251
 7252static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7253						struct net_device *upper_dev,
 7254						void *private, bool master)
 7255{
 7256	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7257						&dev->adj_list.upper,
 7258						&upper_dev->adj_list.lower,
7259						&upper_dev->adj_list.lower,
7259						private, master);
 7260}
 7261
 7262static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7263						   struct net_device *upper_dev)
 7264{
7265	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7266					   &dev->adj_list.upper,
 7267					   &upper_dev->adj_list.lower);
 7268}
 7269
 7270static int __netdev_upper_dev_link(struct net_device *dev,
 7271				   struct net_device *upper_dev, bool master,
 7272				   void *upper_priv, void *upper_info,
 7273				   struct netlink_ext_ack *extack)
 7274{
 7275	struct netdev_notifier_changeupper_info changeupper_info = {
 7276		.info = {
 7277			.dev = dev,
 7278			.extack = extack,
 7279		},
 7280		.upper_dev = upper_dev,
 7281		.master = master,
 7282		.linking = true,
 7283		.upper_info = upper_info,
 7284	};
 7285	struct net_device *master_dev;
 7286	int ret = 0;
 7287
 7288	ASSERT_RTNL();
 7289
 7290	if (dev == upper_dev)
 7291		return -EBUSY;
 7292
7293	/* To prevent loops, check that dev is not an upper device of upper_dev. */
 7294	if (__netdev_has_upper_dev(upper_dev, dev))
 7295		return -EBUSY;
 7296
 7297	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7298		return -EMLINK;
 7299
 7300	if (!master) {
 7301		if (__netdev_has_upper_dev(dev, upper_dev))
 7302			return -EEXIST;
 7303	} else {
 7304		master_dev = __netdev_master_upper_dev_get(dev);
 7305		if (master_dev)
 7306			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7307	}
 7308
 7309	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7310					    &changeupper_info.info);
 7311	ret = notifier_to_errno(ret);
 7312	if (ret)
 7313		return ret;
 7314
 7315	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7316						   master);
 7317	if (ret)
 7318		return ret;
 7319
7320	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7321					    &changeupper_info.info);
 7322	ret = notifier_to_errno(ret);
 7323	if (ret)
 7324		goto rollback;
 7325
 7326	__netdev_update_upper_level(dev, NULL);
 7327	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7328
 7329	__netdev_update_lower_level(upper_dev, NULL);
 7330	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7331				    NULL);
 7332
7333	return 0;
 7334
 7335rollback:
 7336	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7337
 7338	return ret;
 7339}
 7340
 7341/**
 7342 * netdev_upper_dev_link - Add a link to the upper device
 7343 * @dev: device
 7344 * @upper_dev: new upper device
 7345 * @extack: netlink extended ack
 7346 *
7347 * Adds a link to a device which is upper to this one. The caller must hold
 7348 * the RTNL lock. On a failure a negative errno code is returned.
 7349 * On success the reference counts are adjusted and the function
 7350 * returns zero.
 7351 */
 7352int netdev_upper_dev_link(struct net_device *dev,
 7353			  struct net_device *upper_dev,
 7354			  struct netlink_ext_ack *extack)
 7355{
 7356	return __netdev_upper_dev_link(dev, upper_dev, false,
 7357				       NULL, NULL, extack);
 7358}
 7359EXPORT_SYMBOL(netdev_upper_dev_link);
 7360
 7361/**
 7362 * netdev_master_upper_dev_link - Add a master link to the upper device
 7363 * @dev: device
 7364 * @upper_dev: new upper device
 7365 * @upper_priv: upper device private
 7366 * @upper_info: upper info to be passed down via notifier
 7367 * @extack: netlink extended ack
 7368 *
7369 * Adds a link to a device which is upper to this one. In this case, only
 7370 * one master upper device can be linked, although other non-master devices
 7371 * might be linked as well. The caller must hold the RTNL lock.
 7372 * On a failure a negative errno code is returned. On success the reference
 7373 * counts are adjusted and the function returns zero.
 7374 */
 7375int netdev_master_upper_dev_link(struct net_device *dev,
 7376				 struct net_device *upper_dev,
 7377				 void *upper_priv, void *upper_info,
 7378				 struct netlink_ext_ack *extack)
 7379{
 7380	return __netdev_upper_dev_link(dev, upper_dev, true,
 7381				       upper_priv, upper_info, extack);
 7382}
 7383EXPORT_SYMBOL(netdev_master_upper_dev_link);
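/* Sketch of a master/slave link as a bonding-style driver would do it
 * (bond_dev, slave_dev and slave_priv are hypothetical): the slave is
 * linked under the master together with driver-private data that
 * lower-dev iterators can retrieve later:
 *
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev,
 *					   slave_priv, NULL, extack);
 *	if (err)
 *		goto err_unlink;
 */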
 7384
 7385/**
 7386 * netdev_upper_dev_unlink - Removes a link to upper device
 7387 * @dev: device
7388 * @upper_dev: upper device to unlink
7389 *
7390 * Removes a link to a device which is upper to this one. The caller must hold
 7391 * the RTNL lock.
 7392 */
 7393void netdev_upper_dev_unlink(struct net_device *dev,
 7394			     struct net_device *upper_dev)
 7395{
 7396	struct netdev_notifier_changeupper_info changeupper_info = {
 7397		.info = {
 7398			.dev = dev,
 7399		},
 7400		.upper_dev = upper_dev,
 7401		.linking = false,
 7402	};
 7403
 7404	ASSERT_RTNL();
7405
7406	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7407
 7408	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7409				      &changeupper_info.info);
 7410
 7411	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7412
 7413	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7414				      &changeupper_info.info);
 7415
 7416	__netdev_update_upper_level(dev, NULL);
 7417	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7418
 7419	__netdev_update_lower_level(upper_dev, NULL);
 7420	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7421				    NULL);
 7422}
 7423EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7424
 7425static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7426				      struct net_device *lower_dev,
 7427				      bool val)
 7428{
 7429	struct netdev_adjacent *adj;
 7430
 7431	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7432	if (adj)
 7433		adj->ignore = val;
 7434
 7435	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7436	if (adj)
 7437		adj->ignore = val;
 7438}
 7439
 7440static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7441					struct net_device *lower_dev)
 7442{
 7443	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7444}
 7445
 7446static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7447				       struct net_device *lower_dev)
 7448{
 7449	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7450}
 7451
 7452int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7453				   struct net_device *new_dev,
 7454				   struct net_device *dev,
 7455				   struct netlink_ext_ack *extack)
 7456{
 7457	int err;
 7458
 7459	if (!new_dev)
 7460		return 0;
 7461
 7462	if (old_dev && new_dev != old_dev)
 7463		netdev_adjacent_dev_disable(dev, old_dev);
 7464
 7465	err = netdev_upper_dev_link(new_dev, dev, extack);
 7466	if (err) {
 7467		if (old_dev && new_dev != old_dev)
 7468			netdev_adjacent_dev_enable(dev, old_dev);
 7469		return err;
 7470	}
 7471
 7472	return 0;
 7473}
 7474EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7475
 7476void netdev_adjacent_change_commit(struct net_device *old_dev,
 7477				   struct net_device *new_dev,
 7478				   struct net_device *dev)
 7479{
 7480	if (!new_dev || !old_dev)
 7481		return;
 7482
 7483	if (new_dev == old_dev)
 7484		return;
 7485
 7486	netdev_adjacent_dev_enable(dev, old_dev);
 7487	netdev_upper_dev_unlink(old_dev, dev);
 7488}
 7489EXPORT_SYMBOL(netdev_adjacent_change_commit);
 7490
 7491void netdev_adjacent_change_abort(struct net_device *old_dev,
 7492				  struct net_device *new_dev,
 7493				  struct net_device *dev)
 7494{
 7495	if (!new_dev)
 7496		return;
 7497
 7498	if (old_dev && new_dev != old_dev)
 7499		netdev_adjacent_dev_enable(dev, old_dev);
 7500
7501	netdev_upper_dev_unlink(new_dev, dev);
 7502}
 7503EXPORT_SYMBOL(netdev_adjacent_change_abort);
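/* The three netdev_adjacent_change_*() helpers above form a
 * prepare/commit/abort sequence for atomically replacing one lower
 * device with another: prepare links the new device while hiding the
 * old adjacency from loop detection via the ignore flag, commit
 * unlinks the old device, and abort restores the old state.
 */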
 7504
 7505/**
 7506 * netdev_bonding_info_change - Dispatch event about slave change
 7507 * @dev: device
 7508 * @bonding_info: info to dispatch
 7509 *
 7510 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 7511 * The caller must hold the RTNL lock.
 7512 */
 7513void netdev_bonding_info_change(struct net_device *dev,
 7514				struct netdev_bonding_info *bonding_info)
 7515{
 7516	struct netdev_notifier_bonding_info info = {
 7517		.info.dev = dev,
 7518	};
 7519
 7520	memcpy(&info.bonding_info, bonding_info,
 7521	       sizeof(struct netdev_bonding_info));
 7522	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 7523				      &info.info);
 7524}
 7525EXPORT_SYMBOL(netdev_bonding_info_change);
 7526
 7527static void netdev_adjacent_add_links(struct net_device *dev)
 7528{
 7529	struct netdev_adjacent *iter;
 7530
 7531	struct net *net = dev_net(dev);
 7532
 7533	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7534		if (!net_eq(net, dev_net(iter->dev)))
 7535			continue;
 7536		netdev_adjacent_sysfs_add(iter->dev, dev,
 7537					  &iter->dev->adj_list.lower);
 7538		netdev_adjacent_sysfs_add(dev, iter->dev,
 7539					  &dev->adj_list.upper);
 7540	}
 7541
 7542	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7543		if (!net_eq(net, dev_net(iter->dev)))
 7544			continue;
 7545		netdev_adjacent_sysfs_add(iter->dev, dev,
 7546					  &iter->dev->adj_list.upper);
 7547		netdev_adjacent_sysfs_add(dev, iter->dev,
 7548					  &dev->adj_list.lower);
 7549	}
 7550}
 7551
 7552static void netdev_adjacent_del_links(struct net_device *dev)
 7553{
 7554	struct netdev_adjacent *iter;
 7555
 7556	struct net *net = dev_net(dev);
 7557
 7558	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7559		if (!net_eq(net, dev_net(iter->dev)))
 7560			continue;
 7561		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 7562					  &iter->dev->adj_list.lower);
 7563		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 7564					  &dev->adj_list.upper);
 7565	}
 7566
 7567	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7568		if (!net_eq(net, dev_net(iter->dev)))
 7569			continue;
 7570		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 7571					  &iter->dev->adj_list.upper);
 7572		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 7573					  &dev->adj_list.lower);
 7574	}
 7575}
 7576
 7577void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 7578{
 7579	struct netdev_adjacent *iter;
 7580
 7581	struct net *net = dev_net(dev);
 7582
 7583	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7584		if (!net_eq(net, dev_net(iter->dev)))
 7585			continue;
 7586		netdev_adjacent_sysfs_del(iter->dev, oldname,
 7587					  &iter->dev->adj_list.lower);
 7588		netdev_adjacent_sysfs_add(iter->dev, dev,
 7589					  &iter->dev->adj_list.lower);
 7590	}
 7591
 7592	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7593		if (!net_eq(net, dev_net(iter->dev)))
 7594			continue;
 7595		netdev_adjacent_sysfs_del(iter->dev, oldname,
 7596					  &iter->dev->adj_list.upper);
 7597		netdev_adjacent_sysfs_add(iter->dev, dev,
 7598					  &iter->dev->adj_list.upper);
 7599	}
 7600}
 7601
 7602void *netdev_lower_dev_get_private(struct net_device *dev,
 7603				   struct net_device *lower_dev)
 7604{
 7605	struct netdev_adjacent *lower;
 7606
 7607	if (!lower_dev)
 7608		return NULL;
 7609	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 7610	if (!lower)
 7611		return NULL;
 7612
 7613	return lower->private;
 7614}
 7615EXPORT_SYMBOL(netdev_lower_dev_get_private);
 7616
 7617
 7618/**
 7619 * netdev_lower_state_changed - Dispatch event about lower device state change
 7620 * @lower_dev: device
 7621 * @lower_state_info: state to dispatch
 7622 *
 7623 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 7624 * The caller must hold the RTNL lock.
 7625 */
 7626void netdev_lower_state_changed(struct net_device *lower_dev,
 7627				void *lower_state_info)
 7628{
 7629	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 7630		.info.dev = lower_dev,
 7631	};
 7632
 7633	ASSERT_RTNL();
 7634	changelowerstate_info.lower_state_info = lower_state_info;
 7635	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 7636				      &changelowerstate_info.info);
 7637}
 7638EXPORT_SYMBOL(netdev_lower_state_changed);
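/* A minimal usage sketch, assuming a master driver (bonding-style) that
 * reports per-slave state via struct netdev_lower_state_info from
 * <linux/netdevice.h>; the caller must hold RTNL:
 *
 *	struct netdev_lower_state_info info = {
 *		.link_up    = 1,
 *		.tx_enabled = 1,
 *	};
 *
 *	netdev_lower_state_changed(slave_dev, &info);
 */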
 7639
 7640static void dev_change_rx_flags(struct net_device *dev, int flags)
 7641{
 7642	const struct net_device_ops *ops = dev->netdev_ops;
 7643
 7644	if (ops->ndo_change_rx_flags)
 7645		ops->ndo_change_rx_flags(dev, flags);
 7646}
 7647
 7648static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 7649{
 7650	unsigned int old_flags = dev->flags;
 7651	kuid_t uid;
 7652	kgid_t gid;
 7653
 7654	ASSERT_RTNL();
 7655
 7656	dev->flags |= IFF_PROMISC;
 7657	dev->promiscuity += inc;
 7658	if (dev->promiscuity == 0) {
 7659		/*
 7660		 * Avoid overflow.
 7661		 * If inc causes overflow, untouch promisc and return error.
 7662		 */
 7663		if (inc < 0)
 7664			dev->flags &= ~IFF_PROMISC;
 7665		else {
 7666			dev->promiscuity -= inc;
 7667			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
 7668				dev->name);
 7669			return -EOVERFLOW;
 7670		}
 7671	}
 7672	if (dev->flags != old_flags) {
 7673		pr_info("device %s %s promiscuous mode\n",
 7674			dev->name,
 7675			dev->flags & IFF_PROMISC ? "entered" : "left");
 7676		if (audit_enabled) {
 7677			current_uid_gid(&uid, &gid);
 7678			audit_log(audit_context(), GFP_ATOMIC,
 7679				  AUDIT_ANOM_PROMISCUOUS,
 7680				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 7681				  dev->name, (dev->flags & IFF_PROMISC),
 7682				  (old_flags & IFF_PROMISC),
 7683				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 7684				  from_kuid(&init_user_ns, uid),
 7685				  from_kgid(&init_user_ns, gid),
 7686				  audit_get_sessionid(current));
 7687		}
 7688
 7689		dev_change_rx_flags(dev, IFF_PROMISC);
 7690	}
 7691	if (notify)
 7692		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
 7693	return 0;
 7694}
 7695
 7696/**
 7697 *	dev_set_promiscuity	- update promiscuity count on a device
 7698 *	@dev: device
 7699 *	@inc: modifier
 7700 *
 7701 *	Add or remove promiscuity from a device. While the count in the device
 7702 *	remains above zero the interface remains promiscuous. Once it hits zero
 7703 *	the device reverts back to normal filtering operation. A negative inc
 7704 *	value is used to drop promiscuity on the device.
 7705 *	Return 0 if successful or a negative errno code on error.
 7706 */
 7707int dev_set_promiscuity(struct net_device *dev, int inc)
 7708{
 7709	unsigned int old_flags = dev->flags;
 7710	int err;
 7711
 7712	err = __dev_set_promiscuity(dev, inc, true);
 7713	if (err < 0)
 7714		return err;
 7715	if (dev->flags != old_flags)
 7716		dev_set_rx_mode(dev);
 7717	return err;
 7718}
 7719EXPORT_SYMBOL(dev_set_promiscuity);
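/* A minimal usage sketch: promiscuity is reference counted, so a component
 * that must see all frames takes a reference while holding RTNL:
 *
 *	err = dev_set_promiscuity(dev, 1);
 *
 * and releases it when done; the device leaves promiscuous mode only once
 * the count returns to zero:
 *
 *	dev_set_promiscuity(dev, -1);
 */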
 7720
 7721static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 7722{
 7723	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 7724
 7725	ASSERT_RTNL();
 7726
 7727	dev->flags |= IFF_ALLMULTI;
 7728	dev->allmulti += inc;
 7729	if (dev->allmulti == 0) {
 7730		/*
 7731		 * Avoid overflow.
 7732		 * If inc causes overflow, untouch allmulti and return error.
 7733		 */
 7734		if (inc < 0)
 7735			dev->flags &= ~IFF_ALLMULTI;
 7736		else {
 7737			dev->allmulti -= inc;
 7738			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
 7739				dev->name);
 7740			return -EOVERFLOW;
 7741		}
 7742	}
 7743	if (dev->flags ^ old_flags) {
 7744		dev_change_rx_flags(dev, IFF_ALLMULTI);
 7745		dev_set_rx_mode(dev);
 7746		if (notify)
 7747			__dev_notify_flags(dev, old_flags,
 7748					   dev->gflags ^ old_gflags);
 7749	}
 7750	return 0;
 7751}
 7752
 7753/**
 7754 *	dev_set_allmulti	- update allmulti count on a device
 7755 *	@dev: device
 7756 *	@inc: modifier
 7757 *
 7758 *	Add or remove reception of all multicast frames to a device. While the
 7759 *	count in the device remains above zero the interface remains listening
 7760 *	to all multicast frames. Once it hits zero the device reverts to normal
 7761 *	filtering operation. A negative @inc value is used to drop the counter
 7762 *	when releasing a resource needing all multicasts.
 7763 *	Return 0 if successful or a negative errno code on error.
 7764 */
 7765
 7766int dev_set_allmulti(struct net_device *dev, int inc)
 7767{
 7768	return __dev_set_allmulti(dev, inc, true);
 7769}
 7770EXPORT_SYMBOL(dev_set_allmulti);
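/* A minimal usage sketch: allmulti is reference counted like promiscuity;
 * a caller holding RTNL takes a reference while it needs every multicast
 * frame:
 *
 *	err = dev_set_allmulti(dev, 1);
 *
 * and drops it again once the resource is released:
 *
 *	dev_set_allmulti(dev, -1);
 */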
 7771
 7772/*
 7773 *	Upload unicast and multicast address lists to device and
 7774 *	configure RX filtering. When the device doesn't support unicast
 7775 *	filtering it is put in promiscuous mode while unicast addresses
 7776 *	are present.
 7777 */
 7778void __dev_set_rx_mode(struct net_device *dev)
 7779{
 7780	const struct net_device_ops *ops = dev->netdev_ops;
 7781
 7782	/* dev_open will call this function so the list will stay sane. */
 7783	if (!(dev->flags&IFF_UP))
 7784		return;
 7785
 7786	if (!netif_device_present(dev))
 7787		return;
 7788
 7789	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 7790		/* Unicast address changes may only happen under the rtnl,
 7791		 * therefore calling __dev_set_promiscuity here is safe.
 7792		 */
 7793		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 7794			__dev_set_promiscuity(dev, 1, false);
 7795			dev->uc_promisc = true;
 7796		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 7797			__dev_set_promiscuity(dev, -1, false);
 7798			dev->uc_promisc = false;
 7799		}
 7800	}
 7801
 7802	if (ops->ndo_set_rx_mode)
 7803		ops->ndo_set_rx_mode(dev);
 7804}
 7805
 7806void dev_set_rx_mode(struct net_device *dev)
 7807{
 7808	netif_addr_lock_bh(dev);
 7809	__dev_set_rx_mode(dev);
 7810	netif_addr_unlock_bh(dev);
 7811}
 7812
 7813/**
 7814 *	dev_get_flags - get flags reported to userspace
 7815 *	@dev: device
 7816 *
 7817 *	Get the combination of flag bits exported through APIs to userspace.
 7818 */
 7819unsigned int dev_get_flags(const struct net_device *dev)
 7820{
 7821	unsigned int flags;
 7822
 7823	flags = (dev->flags & ~(IFF_PROMISC |
 7824				IFF_ALLMULTI |
 7825				IFF_RUNNING |
 7826				IFF_LOWER_UP |
 7827				IFF_DORMANT)) |
 7828		(dev->gflags & (IFF_PROMISC |
 7829				IFF_ALLMULTI));
 7830
 7831	if (netif_running(dev)) {
 7832		if (netif_oper_up(dev))
 7833			flags |= IFF_RUNNING;
 7834		if (netif_carrier_ok(dev))
 7835			flags |= IFF_LOWER_UP;
 7836		if (netif_dormant(dev))
 7837			flags |= IFF_DORMANT;
 7838	}
 7839
 7840	return flags;
 7841}
 7842EXPORT_SYMBOL(dev_get_flags);
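/* A minimal usage sketch: IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are
 * derived from carrier and operstate rather than stored in dev->flags, so
 * userspace-visible state should be read through this helper:
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *		pr_debug("%s is up and running\n", dev->name);
 */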
 7843
 7844int __dev_change_flags(struct net_device *dev, unsigned int flags,
 7845		       struct netlink_ext_ack *extack)
 7846{
 7847	unsigned int old_flags = dev->flags;
 7848	int ret;
 7849
 7850	ASSERT_RTNL();
 7851
 7852	/*
 7853	 *	Set the flags on our device.
 7854	 */
 7855
 7856	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 7857			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 7858			       IFF_AUTOMEDIA)) |
 7859		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 7860				    IFF_ALLMULTI));
 7861
 7862	/*
 7863	 *	Load in the correct multicast list now the flags have changed.
 7864	 */
 7865
 7866	if ((old_flags ^ flags) & IFF_MULTICAST)
 7867		dev_change_rx_flags(dev, IFF_MULTICAST);
 7868
 7869	dev_set_rx_mode(dev);
 7870
 7871	/*
 7872	 *	Have we downed the interface? We handle IFF_UP ourselves
 7873	 *	according to user attempts to set it, rather than blindly
 7874	 *	setting it.
 7875	 */
 7876
 7877	ret = 0;
 7878	if ((old_flags ^ flags) & IFF_UP) {
 7879		if (old_flags & IFF_UP)
 7880			__dev_close(dev);
 7881		else
 7882			ret = __dev_open(dev, extack);
 7883	}
 7884
 7885	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 7886		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 7887		unsigned int old_flags = dev->flags;
 7888
 7889		dev->gflags ^= IFF_PROMISC;
 7890
 7891		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 7892			if (dev->flags != old_flags)
 7893				dev_set_rx_mode(dev);
 7894	}
 7895
 7896	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 7897	 * is important. Some (broken) drivers set IFF_PROMISC when
 7898	 * IFF_ALLMULTI is requested, without asking us and without reporting it.
 7899	 */
 7900	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 7901		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 7902
 7903		dev->gflags ^= IFF_ALLMULTI;
 7904		__dev_set_allmulti(dev, inc, false);
 7905	}
 7906
 7907	return ret;
 7908}
 7909
 7910void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 7911			unsigned int gchanges)
 7912{
 7913	unsigned int changes = dev->flags ^ old_flags;
 7914
 7915	if (gchanges)
 7916		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
 7917
 7918	if (changes & IFF_UP) {
 7919		if (dev->flags & IFF_UP)
 7920			call_netdevice_notifiers(NETDEV_UP, dev);
 7921		else
 7922			call_netdevice_notifiers(NETDEV_DOWN, dev);
 7923	}
 7924
 7925	if (dev->flags & IFF_UP &&
 7926	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 7927		struct netdev_notifier_change_info change_info = {
 7928			.info = {
 7929				.dev = dev,
 7930			},
 7931			.flags_changed = changes,
 7932		};
 7933
 7934		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 7935	}
 7936}
 7937
 7938/**
 7939 *	dev_change_flags - change device settings
 7940 *	@dev: device
 7941 *	@flags: device state flags
 7942 *	@extack: netlink extended ack
 7943 *
 7944 *	Change settings on a device based on state flags. The flags are
 7945 *	in the userspace exported format.
 7946 */
 7947int dev_change_flags(struct net_device *dev, unsigned int flags,
 7948		     struct netlink_ext_ack *extack)
 7949{
 7950	int ret;
 7951	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 7952
 7953	ret = __dev_change_flags(dev, flags, extack);
 7954	if (ret < 0)
 7955		return ret;
 7956
 7957	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 7958	__dev_notify_flags(dev, old_flags, changes);
 7959	return ret;
 7960}
 7961EXPORT_SYMBOL(dev_change_flags);
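/* A minimal usage sketch: bringing an interface up the way SIOCSIFFLAGS
 * would, by setting IFF_UP in the userspace-format flags while holding
 * RTNL (a NULL extack is acceptable here):
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP, NULL);
 *	rtnl_unlock();
 */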
 7962
 7963int __dev_set_mtu(struct net_device *dev, int new_mtu)
 7964{
 7965	const struct net_device_ops *ops = dev->netdev_ops;
 7966
 7967	if (ops->ndo_change_mtu)
 7968		return ops->ndo_change_mtu(dev, new_mtu);
 7969
 7970	dev->mtu = new_mtu;
 7971	return 0;
 7972}
 7973EXPORT_SYMBOL(__dev_set_mtu);
 7974
 7975/**
 7976 *	dev_set_mtu_ext - Change maximum transfer unit
 7977 *	@dev: device
 7978 *	@new_mtu: new transfer unit
 7979 *	@extack: netlink extended ack
 7980 *
 7981 *	Change the maximum transfer size of the network device.
 7982 */
 7983int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 7984		    struct netlink_ext_ack *extack)
 7985{
 7986	int err, orig_mtu;
 7987
 7988	if (new_mtu == dev->mtu)
 7989		return 0;
 7990
 7991	/* MTU must be positive, and in range */
 7992	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 7993		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 7994		return -EINVAL;
 7995	}
 7996
 7997	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 7998		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 7999		return -EINVAL;
 8000	}
 8001
 8002	if (!netif_device_present(dev))
 8003		return -ENODEV;
 8004
 8005	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8006	err = notifier_to_errno(err);
 8007	if (err)
 8008		return err;
 8009
 8010	orig_mtu = dev->mtu;
 8011	err = __dev_set_mtu(dev, new_mtu);
 8012
 8013	if (!err) {
 8014		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8015						   orig_mtu);
 8016		err = notifier_to_errno(err);
 8017		if (err) {
 8018			/* setting mtu back and notifying everyone again,
 8019			 * so that they have a chance to revert changes.
 8020			 */
 8021			__dev_set_mtu(dev, orig_mtu);
 8022			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8023						     new_mtu);
 8024		}
 8025	}
 8026	return err;
 8027}
 8028
 8029int dev_set_mtu(struct net_device *dev, int new_mtu)
 8030{
 8031	struct netlink_ext_ack extack;
 8032	int err;
 8033
 8034	memset(&extack, 0, sizeof(extack));
 8035	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8036	if (err && extack._msg)
 8037		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8038	return err;
 8039}
 8040EXPORT_SYMBOL(dev_set_mtu);
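/* A minimal usage sketch: dev_set_mtu() checks the new value against
 * dev->min_mtu/dev->max_mtu and lets notifier consumers veto or revert the
 * change, so a caller holding RTNL only needs:
 *
 *	err = dev_set_mtu(dev, 9000);
 */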
 8041
 8042/**
 8043 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8044 *	@dev: device
 8045 *	@new_len: new tx queue length
 8046 */
 8047int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8048{
 8049	unsigned int orig_len = dev->tx_queue_len;
 8050	int res;
 8051
 8052	if (new_len != (unsigned int)new_len)
 8053		return -ERANGE;
 8054
 8055	if (new_len != orig_len) {
 8056		dev->tx_queue_len = new_len;
 8057		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8058		res = notifier_to_errno(res);
 8059		if (res)
 8060			goto err_rollback;
 8061		res = dev_qdisc_change_tx_queue_len(dev);
 8062		if (res)
 8063			goto err_rollback;
 8064	}
 8065
 8066	return 0;
 8067
 8068err_rollback:
 8069	netdev_err(dev, "refused to change device tx_queue_len\n");
 8070	dev->tx_queue_len = orig_len;
 8071	return res;
 8072}
 8073
 8074/**
 8075 *	dev_set_group - Change group this device belongs to
 8076 *	@dev: device
 8077 *	@new_group: group this device should belong to
 8078 */
 8079void dev_set_group(struct net_device *dev, int new_group)
 8080{
 8081	dev->group = new_group;
 8082}
 8083EXPORT_SYMBOL(dev_set_group);
 8084
 8085/**
 8086 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8087 *	@dev: device
 8088 *	@addr: new address
 8089 *	@extack: netlink extended ack
 8090 */
 8091int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8092			      struct netlink_ext_ack *extack)
 8093{
 8094	struct netdev_notifier_pre_changeaddr_info info = {
 8095		.info.dev = dev,
 8096		.info.extack = extack,
 8097		.dev_addr = addr,
 8098	};
 8099	int rc;
 8100
 8101	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8102	return notifier_to_errno(rc);
 8103}
 8104EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8105
 8106/**
 8107 *	dev_set_mac_address - Change Media Access Control Address
 8108 *	@dev: device
 8109 *	@sa: new address
 8110 *	@extack: netlink extended ack
 8111 *
 8112 *	Change the hardware (MAC) address of the device.
 8113 */
 8114int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8115			struct netlink_ext_ack *extack)
 8116{
 8117	const struct net_device_ops *ops = dev->netdev_ops;
 8118	int err;
 8119
 8120	if (!ops->ndo_set_mac_address)
 8121		return -EOPNOTSUPP;
 8122	if (sa->sa_family != dev->type)
 8123		return -EINVAL;
 8124	if (!netif_device_present(dev))
 8125		return -ENODEV;
 8126	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8127	if (err)
 8128		return err;
 8129	err = ops->ndo_set_mac_address(dev, sa);
 8130	if (err)
 8131		return err;
 8132	dev->addr_assign_type = NET_ADDR_SET;
 8133	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8134	add_device_randomness(dev->dev_addr, dev->addr_len);
 8135	return 0;
 8136}
 8137EXPORT_SYMBOL(dev_set_mac_address);
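/* A minimal usage sketch, assuming new_mac points at dev->addr_len bytes
 * (e.g. a u8[ETH_ALEN] buffer) and the caller holds RTNL; sa_family must
 * match dev->type or the call fails with -EINVAL:
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa, NULL);
 */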
 8138
 8139/**
 8140 *	dev_change_carrier - Change device carrier
 8141 *	@dev: device
 8142 *	@new_carrier: new value
 8143 *
 8144 *	Change device carrier
 8145 */
 8146int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8147{
 8148	const struct net_device_ops *ops = dev->netdev_ops;
 8149
 8150	if (!ops->ndo_change_carrier)
 8151		return -EOPNOTSUPP;
 8152	if (!netif_device_present(dev))
 8153		return -ENODEV;
 8154	return ops->ndo_change_carrier(dev, new_carrier);
 8155}
 8156EXPORT_SYMBOL(dev_change_carrier);
 8157
 8158/**
 8159 *	dev_get_phys_port_id - Get device physical port ID
 8160 *	@dev: device
 8161 *	@ppid: port ID
 8162 *
 8163 *	Get device physical port ID
 8164 */
 8165int dev_get_phys_port_id(struct net_device *dev,
 8166			 struct netdev_phys_item_id *ppid)
 8167{
 8168	const struct net_device_ops *ops = dev->netdev_ops;
 8169
 8170	if (!ops->ndo_get_phys_port_id)
 8171		return -EOPNOTSUPP;
 8172	return ops->ndo_get_phys_port_id(dev, ppid);
 8173}
 8174EXPORT_SYMBOL(dev_get_phys_port_id);
 8175
 8176/**
 8177 *	dev_get_phys_port_name - Get device physical port name
 8178 *	@dev: device
 8179 *	@name: port name
 8180 *	@len: limit of bytes to copy to name
 8181 *
 8182 *	Get device physical port name
 8183 */
 8184int dev_get_phys_port_name(struct net_device *dev,
 8185			   char *name, size_t len)
 8186{
 8187	const struct net_device_ops *ops = dev->netdev_ops;
 8188	int err;
 8189
 8190	if (ops->ndo_get_phys_port_name) {
 8191		err = ops->ndo_get_phys_port_name(dev, name, len);
 8192		if (err != -EOPNOTSUPP)
 8193			return err;
 8194	}
 8195	return devlink_compat_phys_port_name_get(dev, name, len);
 8196}
 8197EXPORT_SYMBOL(dev_get_phys_port_name);
 8198
 8199/**
 8200 *	dev_get_port_parent_id - Get the device's port parent identifier
 8201 *	@dev: network device
 8202 *	@ppid: pointer to a storage for the port's parent identifier
 8203 *	@recurse: allow/disallow recursion to lower devices
 8204 *
 8205 *	Get the device's port parent identifier
 8206 */
 8207int dev_get_port_parent_id(struct net_device *dev,
 8208			   struct netdev_phys_item_id *ppid,
 8209			   bool recurse)
 8210{
 8211	const struct net_device_ops *ops = dev->netdev_ops;
 8212	struct netdev_phys_item_id first = { };
 8213	struct net_device *lower_dev;
 8214	struct list_head *iter;
 8215	int err;
 8216
 8217	if (ops->ndo_get_port_parent_id) {
 8218		err = ops->ndo_get_port_parent_id(dev, ppid);
 8219		if (err != -EOPNOTSUPP)
 8220			return err;
 8221	}
 8222
 8223	err = devlink_compat_switch_id_get(dev, ppid);
 8224	if (!err || err != -EOPNOTSUPP)
 8225		return err;
 8226
 8227	if (!recurse)
 8228		return -EOPNOTSUPP;
 8229
 8230	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8231		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
 8232		if (err)
 8233			break;
 8234		if (!first.id_len)
 8235			first = *ppid;
 8236		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8237			return -ENODATA;
 8238	}
 8239
 8240	return err;
 8241}
 8242EXPORT_SYMBOL(dev_get_port_parent_id);
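/* A minimal usage sketch: with @recurse true the helper walks lower
 * devices, so a stacked device (e.g. a bond over switch ports) reports a
 * parent ID only when all of its lowers agree on one:
 *
 *	struct netdev_phys_item_id ppid;
 *
 *	if (!dev_get_port_parent_id(dev, &ppid, true))
 *		pr_debug("%s: parent id of %d bytes\n", dev->name,
 *			 ppid.id_len);
 */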
 8243
 8244/**
 8245 *	netdev_port_same_parent_id - Indicate if two network devices have
 8246 *	the same port parent identifier
 8247 *	@a: first network device
 8248 *	@b: second network device
 8249 */
 8250bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8251{
 8252	struct netdev_phys_item_id a_id = { };
 8253	struct netdev_phys_item_id b_id = { };
 8254
 8255	if (dev_get_port_parent_id(a, &a_id, true) ||
 8256	    dev_get_port_parent_id(b, &b_id, true))
 8257		return false;
 8258
 8259	return netdev_phys_item_id_same(&a_id, &b_id);
 8260}
 8261EXPORT_SYMBOL(netdev_port_same_parent_id);
 8262
 8263/**
 8264 *	dev_change_proto_down - update protocol port state information
 8265 *	@dev: device
 8266 *	@proto_down: new value
 8267 *
 8268 *	This info can be used by switch drivers to set the phys state of the
 8269 *	port.
 8270 */
 8271int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8272{
 8273	const struct net_device_ops *ops = dev->netdev_ops;
 8274
 8275	if (!ops->ndo_change_proto_down)
 8276		return -EOPNOTSUPP;
 8277	if (!netif_device_present(dev))
 8278		return -ENODEV;
 8279	return ops->ndo_change_proto_down(dev, proto_down);
 8280}
 8281EXPORT_SYMBOL(dev_change_proto_down);
 8282
 8283/**
 8284 *	dev_change_proto_down_generic - generic implementation for
 8285 * 	ndo_change_proto_down that sets carrier according to
 8286 * 	proto_down.
 8287 *
 8288 *	@dev: device
 8289 *	@proto_down: new value
 8290 */
 8291int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 8292{
 8293	if (proto_down)
 8294		netif_carrier_off(dev);
 8295	else
 8296		netif_carrier_on(dev);
 8297	dev->proto_down = proto_down;
 8298	return 0;
 8299}
 8300EXPORT_SYMBOL(dev_change_proto_down_generic);
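/* A minimal usage sketch, assuming a hypothetical foo driver with no
 * special proto_down handling; wiring in the generic helper makes
 * proto_down simply toggle the carrier:
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_change_proto_down = dev_change_proto_down_generic,
 *	};
 */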
 8301
 8302u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
 8303		    enum bpf_netdev_command cmd)
 8304{
 8305	struct netdev_bpf xdp;
 8306
 8307	if (!bpf_op)
 8308		return 0;
 8309
 8310	memset(&xdp, 0, sizeof(xdp));
 8311	xdp.command = cmd;
 8312
 8313	/* Query must always succeed. */
 8314	WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
 8315
 8316	return xdp.prog_id;
 8317}
 8318
 8319static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 8320			   struct netlink_ext_ack *extack, u32 flags,
 8321			   struct bpf_prog *prog)
 8322{
 8323	struct netdev_bpf xdp;
 8324
 8325	memset(&xdp, 0, sizeof(xdp));
 8326	if (flags & XDP_FLAGS_HW_MODE)
 8327		xdp.command = XDP_SETUP_PROG_HW;
 8328	else
 8329		xdp.command = XDP_SETUP_PROG;
 8330	xdp.extack = extack;
 8331	xdp.flags = flags;
 8332	xdp.prog = prog;
 8333
 8334	return bpf_op(dev, &xdp);
 8335}
 8336
 8337static void dev_xdp_uninstall(struct net_device *dev)
 8338{
 8339	struct netdev_bpf xdp;
 8340	bpf_op_t ndo_bpf;
 8341
 8342	/* Remove generic XDP */
 8343	WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
 8344
 8345	/* Remove from the driver */
 8346	ndo_bpf = dev->netdev_ops->ndo_bpf;
 8347	if (!ndo_bpf)
 8348		return;
 8349
 8350	memset(&xdp, 0, sizeof(xdp));
 8351	xdp.command = XDP_QUERY_PROG;
 8352	WARN_ON(ndo_bpf(dev, &xdp));
 8353	if (xdp.prog_id)
 8354		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
 8355					NULL));
 8356
 8357	/* Remove HW offload */
 8358	memset(&xdp, 0, sizeof(xdp));
 8359	xdp.command = XDP_QUERY_PROG_HW;
 8360	if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
 8361		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
 8362					NULL));
 8363}
 8364
 8365/**
 8366 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 8367 *	@dev: device
 8368 *	@extack: netlink extended ack
 8369 *	@fd: new program fd or negative value to clear
 8370 *	@flags: xdp-related flags
 8371 *
 8372 *	Set or clear a bpf program for a device
 8373 */
 8374int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 8375		      int fd, u32 flags)
 8376{
 8377	const struct net_device_ops *ops = dev->netdev_ops;
 8378	enum bpf_netdev_command query;
 8379	struct bpf_prog *prog = NULL;
 8380	bpf_op_t bpf_op, bpf_chk;
 8381	bool offload;
 8382	int err;
 8383
 8384	ASSERT_RTNL();
 8385
 8386	offload = flags & XDP_FLAGS_HW_MODE;
 8387	query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
 8388
 8389	bpf_op = bpf_chk = ops->ndo_bpf;
 8390	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
 8391		NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
 8392		return -EOPNOTSUPP;
 8393	}
 8394	if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
 8395		bpf_op = generic_xdp_install;
 8396	if (bpf_op == bpf_chk)
 8397		bpf_chk = generic_xdp_install;
 8398
 8399	if (fd >= 0) {
 8400		u32 prog_id;
 8401
 8402		if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
 8403			NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
 8404			return -EEXIST;
 8405		}
 8406
 8407		prog_id = __dev_xdp_query(dev, bpf_op, query);
 8408		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
 8409			NL_SET_ERR_MSG(extack, "XDP program already attached");
 8410			return -EBUSY;
 8411		}
 8412
 8413		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 8414					     bpf_op == ops->ndo_bpf);
 8415		if (IS_ERR(prog))
 8416			return PTR_ERR(prog);
 8417
 8418		if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
 8419			NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
 8420			bpf_prog_put(prog);
 8421			return -EINVAL;
 8422		}
 8423
 8424		/* prog->aux->id may be 0 for orphaned device-bound progs */
 8425		if (prog->aux->id && prog->aux->id == prog_id) {
 8426			bpf_prog_put(prog);
 8427			return 0;
 8428		}
 8429	} else {
 8430		if (!__dev_xdp_query(dev, bpf_op, query))
 8431			return 0;
 8432	}
 8433
 8434	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
 8435	if (err < 0 && prog)
 8436		bpf_prog_put(prog);
 8437
 8438	return err;
 8439}
 8440
 8441/**
 8442 *	dev_new_index	-	allocate an ifindex
 8443 *	@net: the applicable net namespace
 8444 *
 8445 *	Returns a suitable unique value for a new device interface
 8446 *	number.  The caller must hold the rtnl semaphore or the
 8447 *	dev_base_lock to be sure it remains unique.
 8448 */
 8449static int dev_new_index(struct net *net)
 8450{
 8451	int ifindex = net->ifindex;
 8452
 8453	for (;;) {
 8454		if (++ifindex <= 0)
 8455			ifindex = 1;
 8456		if (!__dev_get_by_index(net, ifindex))
 8457			return net->ifindex = ifindex;
 8458	}
 8459}
 8460
 8461/* Delayed registration/unregisteration */
 8462static LIST_HEAD(net_todo_list);
 8463DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 8464
 8465static void net_set_todo(struct net_device *dev)
 8466{
 8467	list_add_tail(&dev->todo_list, &net_todo_list);
 8468	dev_net(dev)->dev_unreg_count++;
 8469}
 8470
 8471static void rollback_registered_many(struct list_head *head)
 8472{
 8473	struct net_device *dev, *tmp;
 8474	LIST_HEAD(close_head);
 8475
 8476	BUG_ON(dev_boot_phase);
 8477	ASSERT_RTNL();
 8478
 8479	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 8480		/* Some devices call without registering
 8481		 * for initialization unwind. Remove those
 8482		 * devices and proceed with the remaining.
 8483		 */
 8484		if (dev->reg_state == NETREG_UNINITIALIZED) {
 8485			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 8486				 dev->name, dev);
 8487
 8488			WARN_ON(1);
 8489			list_del(&dev->unreg_list);
 8490			continue;
 8491		}
 8492		dev->dismantle = true;
 8493		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 8494	}
 8495
 8496	/* If device is running, close it first. */
 8497	list_for_each_entry(dev, head, unreg_list)
 8498		list_add_tail(&dev->close_list, &close_head);
 8499	dev_close_many(&close_head, true);
 8500
 8501	list_for_each_entry(dev, head, unreg_list) {
 8502		/* And unlink it from device chain. */
 8503		unlist_netdevice(dev);
 8504
 8505		dev->reg_state = NETREG_UNREGISTERING;
 8506	}
 8507	flush_all_backlogs();
 8508
 8509	synchronize_net();
 8510
 8511	list_for_each_entry(dev, head, unreg_list) {
 8512		struct sk_buff *skb = NULL;
 8513
 8514		/* Shutdown queueing discipline. */
 8515		dev_shutdown(dev);
 8516
 8517		dev_xdp_uninstall(dev);
 8518
 8519		/* Notify protocols that we are about to destroy
 8520		 * this device. They should clean all the things.
 8521		 */
 8522		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 8523
 8524		if (!dev->rtnl_link_ops ||
 8525		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 8526			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 8527						     GFP_KERNEL, NULL, 0);
 8528
 8529		/*
 8530		 *	Flush the unicast and multicast chains
 8531		 */
 8532		dev_uc_flush(dev);
 8533		dev_mc_flush(dev);
 8534
 8535		if (dev->netdev_ops->ndo_uninit)
 8536			dev->netdev_ops->ndo_uninit(dev);
 8537
 8538		if (skb)
 8539			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 8540
 8541		/* Notifier chain MUST detach all upper devices from us. */
 8542		WARN_ON(netdev_has_any_upper_dev(dev));
 8543		WARN_ON(netdev_has_any_lower_dev(dev));
 8544
 8545		/* Remove entries from kobject tree */
 8546		netdev_unregister_kobject(dev);
 8547#ifdef CONFIG_XPS
 8548		/* Remove XPS queueing entries */
 8549		netif_reset_xps_queues_gt(dev, 0);
 8550#endif
 8551	}
 8552
 8553	synchronize_net();
 8554
 8555	list_for_each_entry(dev, head, unreg_list)
 8556		dev_put(dev);
 8557}
 8558
 8559static void rollback_registered(struct net_device *dev)
 8560{
 8561	LIST_HEAD(single);
 8562
 8563	list_add(&dev->unreg_list, &single);
 8564	rollback_registered_many(&single);
 8565	list_del(&single);
 8566}
 8567
 8568static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 8569	struct net_device *upper, netdev_features_t features)
 8570{
 8571	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 8572	netdev_features_t feature;
 8573	int feature_bit;
 8574
 8575	for_each_netdev_feature(upper_disables, feature_bit) {
 8576		feature = __NETIF_F_BIT(feature_bit);
 8577		if (!(upper->wanted_features & feature)
 8578		    && (features & feature)) {
 8579			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 8580				   &feature, upper->name);
 8581			features &= ~feature;
 8582		}
 8583	}
 8584
 8585	return features;
 8586}
 8587
 8588static void netdev_sync_lower_features(struct net_device *upper,
 8589	struct net_device *lower, netdev_features_t features)
 8590{
 8591	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 8592	netdev_features_t feature;
 8593	int feature_bit;
 8594
 8595	for_each_netdev_feature(upper_disables, feature_bit) {
 8596		feature = __NETIF_F_BIT(feature_bit);
 8597		if (!(features & feature) && (lower->features & feature)) {
 8598			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 8599				   &feature, lower->name);
 8600			lower->wanted_features &= ~feature;
 8601			netdev_update_features(lower);
 8602
 8603			if (unlikely(lower->features & feature))
 8604				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 8605					    &feature, lower->name);
 8606		}
 8607	}
 8608}
 8609
 8610static netdev_features_t netdev_fix_features(struct net_device *dev,
 8611	netdev_features_t features)
 8612{
 8613	/* Fix illegal checksum combinations */
 8614	if ((features & NETIF_F_HW_CSUM) &&
 8615	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 8616		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 8617		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 8618	}
 8619
 8620	/* TSO requires that SG is present as well. */
 8621	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 8622		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 8623		features &= ~NETIF_F_ALL_TSO;
 8624	}
 8625
 8626	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 8627					!(features & NETIF_F_IP_CSUM)) {
 8628		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 8629		features &= ~NETIF_F_TSO;
 8630		features &= ~NETIF_F_TSO_ECN;
 8631	}
 8632
 8633	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 8634					 !(features & NETIF_F_IPV6_CSUM)) {
 8635		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 8636		features &= ~NETIF_F_TSO6;
 8637	}
 8638
 8639	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 8640	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 8641		features &= ~NETIF_F_TSO_MANGLEID;
 8642
 8643	/* TSO ECN requires that TSO is present as well. */
 8644	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 8645		features &= ~NETIF_F_TSO_ECN;
 8646
 8647	/* Software GSO depends on SG. */
 8648	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 8649		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 8650		features &= ~NETIF_F_GSO;
 8651	}
 8652
 8653	/* GSO partial features require GSO partial be set */
 8654	if ((features & dev->gso_partial_features) &&
 8655	    !(features & NETIF_F_GSO_PARTIAL)) {
 8656		netdev_dbg(dev,
 8657			   "Dropping partially supported GSO features since no GSO partial.\n");
 8658		features &= ~dev->gso_partial_features;
 8659	}
 8660
 8661	if (!(features & NETIF_F_RXCSUM)) {
 8662		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 8663		 * successfully merged by hardware must also have the
 8664		 * checksum verified by hardware.  If the user does not
 8665		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 8666		 */
 8667		if (features & NETIF_F_GRO_HW) {
 8668			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 8669			features &= ~NETIF_F_GRO_HW;
 8670		}
 8671	}
 8672
 8673	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 8674	if (features & NETIF_F_RXFCS) {
 8675		if (features & NETIF_F_LRO) {
 8676			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 8677			features &= ~NETIF_F_LRO;
 8678		}
 8679
 8680		if (features & NETIF_F_GRO_HW) {
 8681			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 8682			features &= ~NETIF_F_GRO_HW;
 8683		}
 8684	}
 8685
 8686	return features;
 8687}
 8688
 8689int __netdev_update_features(struct net_device *dev)
 8690{
 8691	struct net_device *upper, *lower;
 8692	netdev_features_t features;
 8693	struct list_head *iter;
 8694	int err = -1;
 8695
 8696	ASSERT_RTNL();
 8697
 8698	features = netdev_get_wanted_features(dev);
 8699
 8700	if (dev->netdev_ops->ndo_fix_features)
 8701		features = dev->netdev_ops->ndo_fix_features(dev, features);
 8702
 8703	/* driver might be less strict about feature dependencies */
 8704	features = netdev_fix_features(dev, features);
 8705
 8706	/* some features can't be enabled if they're off on an upper device */
 8707	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 8708		features = netdev_sync_upper_features(dev, upper, features);
 8709
 8710	if (dev->features == features)
 8711		goto sync_lower;
 8712
 8713	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 8714		&dev->features, &features);
 8715
 8716	if (dev->netdev_ops->ndo_set_features)
 8717		err = dev->netdev_ops->ndo_set_features(dev, features);
 8718	else
 8719		err = 0;
 8720
 8721	if (unlikely(err < 0)) {
 8722		netdev_err(dev,
 8723			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 8724			err, &features, &dev->features);
 8725		/* return non-0 since some features might have changed and
 8726		 * it's better to fire a spurious notification than miss it
 8727		 */
 8728		return -1;
 8729	}
 8730
 8731sync_lower:
 8732	/* some features must be disabled on lower devices when disabled
 8733	 * on an upper device (think: bonding master or bridge)
 8734	 */
 8735	netdev_for_each_lower_dev(dev, lower, iter)
 8736		netdev_sync_lower_features(dev, lower, features);
 8737
 8738	if (!err) {
 8739		netdev_features_t diff = features ^ dev->features;
 8740
 8741		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 8742			/* udp_tunnel_{get,drop}_rx_info both need
 8743			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 8744			 * device, or they won't do anything.
 8745			 * Thus we need to update dev->features
 8746			 * *before* calling udp_tunnel_get_rx_info,
 8747			 * but *after* calling udp_tunnel_drop_rx_info.
 8748			 */
 8749			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 8750				dev->features = features;
 8751				udp_tunnel_get_rx_info(dev);
 8752			} else {
 8753				udp_tunnel_drop_rx_info(dev);
 8754			}
 8755		}
 8756
 8757		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
 8758			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
 8759				dev->features = features;
 8760				err |= vlan_get_rx_ctag_filter_info(dev);
 8761			} else {
 8762				vlan_drop_rx_ctag_filter_info(dev);
 8763			}
 8764		}
 8765
 8766		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
 8767			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
 8768				dev->features = features;
 8769				err |= vlan_get_rx_stag_filter_info(dev);
 8770			} else {
 8771				vlan_drop_rx_stag_filter_info(dev);
 8772			}
 8773		}
 8774
 8775		dev->features = features;
 8776	}
 8777
 8778	return err < 0 ? 0 : 1;
 8779}
 8780
 8781/**
 8782 *	netdev_update_features - recalculate device features
 8783 *	@dev: the device to check
 8784 *
 8785 *	Recalculate dev->features set and send notifications if it
 8786 *	has changed. Should be called after driver or hardware dependent
 8787 *	conditions might have changed that influence the features.
 8788 */
 8789void netdev_update_features(struct net_device *dev)
 8790{
 8791	if (__netdev_update_features(dev))
 8792		netdev_features_change(dev);
 8793}
 8794EXPORT_SYMBOL(netdev_update_features);
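/* A minimal usage sketch, assuming a driver whose offload capabilities
 * change at runtime (e.g. after a firmware reload) and that holds RTNL:
 *
 *	dev->hw_features &= ~NETIF_F_LRO;
 *	netdev_update_features(dev);
 */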
 8795
 8796/**
 8797 *	netdev_change_features - recalculate device features
 8798 *	@dev: the device to check
 8799 *
 8800 *	Recalculate dev->features set and send notifications even
 8801 *	if they have not changed. Should be called instead of
 8802 *	netdev_update_features() if also dev->vlan_features might
 8803 *	have changed to allow the changes to be propagated to stacked
 8804 *	VLAN devices.
 8805 */
 8806void netdev_change_features(struct net_device *dev)
 8807{
 8808	__netdev_update_features(dev);
 8809	netdev_features_change(dev);
 8810}
 8811EXPORT_SYMBOL(netdev_change_features);
 8812
 8813/**
 8814 *	netif_stacked_transfer_operstate -	transfer operstate
 8815 *	@rootdev: the root or lower level device to transfer state from
 8816 *	@dev: the device to transfer operstate to
 8817 *
 8818 *	Transfer operational state from root to device. This is normally
 8819 *	called when a stacking relationship exists between the root
 8820 *	device and the device(a leaf device).
 8821 *	device and the device (a leaf device).
 8822void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 8823					struct net_device *dev)
 8824{
 8825	if (rootdev->operstate == IF_OPER_DORMANT)
 8826		netif_dormant_on(dev);
 8827	else
 8828		netif_dormant_off(dev);
 8829
 8830	if (netif_carrier_ok(rootdev))
 8831		netif_carrier_on(dev);
 8832	else
 8833		netif_carrier_off(dev);
 8834}
 8835EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 8836
 8837static int netif_alloc_rx_queues(struct net_device *dev)
 8838{
 8839	unsigned int i, count = dev->num_rx_queues;
 8840	struct netdev_rx_queue *rx;
 8841	size_t sz = count * sizeof(*rx);
 8842	int err = 0;
 8843
 8844	BUG_ON(count < 1);
 8845
 8846	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 8847	if (!rx)
 8848		return -ENOMEM;
 8849
 8850	dev->_rx = rx;
 8851
 8852	for (i = 0; i < count; i++) {
 8853		rx[i].dev = dev;
 8854
 8855		/* XDP RX-queue setup */
 8856		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
 8857		if (err < 0)
 8858			goto err_rxq_info;
 8859	}
 8860	return 0;
 8861
 8862err_rxq_info:
 8863	/* Rollback successful reg's and free other resources */
 8864	while (i--)
 8865		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 8866	kvfree(dev->_rx);
 8867	dev->_rx = NULL;
 8868	return err;
 8869}
 8870
 8871static void netif_free_rx_queues(struct net_device *dev)
 8872{
 8873	unsigned int i, count = dev->num_rx_queues;
 8874
 8875	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 8876	if (!dev->_rx)
 8877		return;
 8878
 8879	for (i = 0; i < count; i++)
 8880		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 8881
 8882	kvfree(dev->_rx);
 8883}
 8884
 8885static void netdev_init_one_queue(struct net_device *dev,
 8886				  struct netdev_queue *queue, void *_unused)
 8887{
 8888	/* Initialize queue lock */
 8889	spin_lock_init(&queue->_xmit_lock);
 8890	lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key);
 8891	queue->xmit_lock_owner = -1;
 8892	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 8893	queue->dev = dev;
 8894#ifdef CONFIG_BQL
 8895	dql_init(&queue->dql, HZ);
 8896#endif
 8897}
 8898
 8899static void netif_free_tx_queues(struct net_device *dev)
 8900{
 8901	kvfree(dev->_tx);
 8902}
 8903
 8904static int netif_alloc_netdev_queues(struct net_device *dev)
 8905{
 8906	unsigned int count = dev->num_tx_queues;
 8907	struct netdev_queue *tx;
 8908	size_t sz = count * sizeof(*tx);
 8909
 8910	if (count < 1 || count > 0xffff)
 8911		return -EINVAL;
 8912
 8913	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 8914	if (!tx)
 8915		return -ENOMEM;
 8916
 8917	dev->_tx = tx;
 8918
 8919	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 8920	spin_lock_init(&dev->tx_global_lock);
 8921
 8922	return 0;
 8923}
 8924
 8925void netif_tx_stop_all_queues(struct net_device *dev)
 8926{
 8927	unsigned int i;
 8928
 8929	for (i = 0; i < dev->num_tx_queues; i++) {
 8930		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 8931
 8932		netif_tx_stop_queue(txq);
 8933	}
 8934}
 8935EXPORT_SYMBOL(netif_tx_stop_all_queues);
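/* A minimal usage sketch, assuming a hypothetical foo_hw_reset() teardown
 * step: stop all TX queues so the stack stops submitting skbs, reset the
 * hardware, then wake the queues again:
 *
 *	netif_tx_stop_all_queues(dev);
 *	foo_hw_reset(priv);
 *	netif_tx_wake_all_queues(dev);
 */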
 8936
 8937static void netdev_register_lockdep_key(struct net_device *dev)
 8938{
 8939	lockdep_register_key(&dev->qdisc_tx_busylock_key);
 8940	lockdep_register_key(&dev->qdisc_running_key);
 8941	lockdep_register_key(&dev->qdisc_xmit_lock_key);
 8942	lockdep_register_key(&dev->addr_list_lock_key);
 8943}
 8944
 8945static void netdev_unregister_lockdep_key(struct net_device *dev)
 8946{
 8947	lockdep_unregister_key(&dev->qdisc_tx_busylock_key);
 8948	lockdep_unregister_key(&dev->qdisc_running_key);
 8949	lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
 8950	lockdep_unregister_key(&dev->addr_list_lock_key);
 8951}
 8952
 8953void netdev_update_lockdep_key(struct net_device *dev)
 8954{
 8955	struct netdev_queue *queue;
 8956	int i;
 8957
 8958	lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
 8959	lockdep_unregister_key(&dev->addr_list_lock_key);
 8960
 8961	lockdep_register_key(&dev->qdisc_xmit_lock_key);
 8962	lockdep_register_key(&dev->addr_list_lock_key);
 8963
 8964	lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
 8965	for (i = 0; i < dev->num_tx_queues; i++) {
 8966		queue = netdev_get_tx_queue(dev, i);
 8967
 8968		lockdep_set_class(&queue->_xmit_lock,
 8969				  &dev->qdisc_xmit_lock_key);
 8970	}
 8971}
 8972EXPORT_SYMBOL(netdev_update_lockdep_key);
 8973
 8974/**
 8975 *	register_netdevice	- register a network device
 8976 *	@dev: device to register
 8977 *
 8978 *	Take a completed network device structure and add it to the kernel
 8979 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 8980 *	chain. 0 is returned on success. A negative errno code is returned
 8981 *	on a failure to set up the device, or if the name is a duplicate.
 8982 *
 8983 *	Callers must hold the rtnl semaphore. You may want
 8984 *	register_netdev() instead of this.
 8985 *
 8986 *	BUGS:
 8987 *	The locking appears insufficient to guarantee two parallel registers
 8988 *	will not get the same name.
 8989 */
 8990
 8991int register_netdevice(struct net_device *dev)
 8992{
 8993	int ret;
 8994	struct net *net = dev_net(dev);
 8995
 8996	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
 8997		     NETDEV_FEATURE_COUNT);
 8998	BUG_ON(dev_boot_phase);
 8999	ASSERT_RTNL();
 9000
 9001	might_sleep();
 9002
 9003	/* When net_devices are persistent, this will be fatal. */
 9004	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 9005	BUG_ON(!net);
 9006
 9007	spin_lock_init(&dev->addr_list_lock);
 9008	lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
 9009
 9010	ret = dev_get_valid_name(net, dev, dev->name);
 9011	if (ret < 0)
 9012		goto out;
 9013
 9014	/* Init, if this function is available */
 9015	if (dev->netdev_ops->ndo_init) {
 9016		ret = dev->netdev_ops->ndo_init(dev);
 9017		if (ret) {
 9018			if (ret > 0)
 9019				ret = -EIO;
 9020			goto out;
 9021		}
 9022	}
 9023
 9024	if (((dev->hw_features | dev->features) &
 9025	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
 9026	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
 9027	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
 9028		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
 9029		ret = -EINVAL;
 9030		goto err_uninit;
 9031	}
 9032
 9033	ret = -EBUSY;
 9034	if (!dev->ifindex)
 9035		dev->ifindex = dev_new_index(net);
 9036	else if (__dev_get_by_index(net, dev->ifindex))
 9037		goto err_uninit;
 9038
 9039	/* Transfer changeable features to wanted_features and enable
 9040	 * software offloads (GSO and GRO).
 9041	 */
 9042	dev->hw_features |= NETIF_F_SOFT_FEATURES;
 9043	dev->features |= NETIF_F_SOFT_FEATURES;
 9044
 9045	if (dev->netdev_ops->ndo_udp_tunnel_add) {
 9046		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9047		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9048	}
 9049
 9050	dev->wanted_features = dev->features & dev->hw_features;
 9051
 9052	if (!(dev->flags & IFF_LOOPBACK))
 9053		dev->hw_features |= NETIF_F_NOCACHE_COPY;
 9054
 9055	/* If IPv4 TCP segmentation offload is supported we should also
 9056	 * allow the device to enable segmenting the frame with the option
 9057	 * of ignoring a static IP ID value.  This doesn't enable the
 9058	 * feature itself but allows the user to enable it later.
 9059	 */
 9060	if (dev->hw_features & NETIF_F_TSO)
 9061		dev->hw_features |= NETIF_F_TSO_MANGLEID;
 9062	if (dev->vlan_features & NETIF_F_TSO)
 9063		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
 9064	if (dev->mpls_features & NETIF_F_TSO)
 9065		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
 9066	if (dev->hw_enc_features & NETIF_F_TSO)
 9067		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 9068
 9069	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
 9070	 */
 9071	dev->vlan_features |= NETIF_F_HIGHDMA;
 9072
 9073	/* Make NETIF_F_SG inheritable to tunnel devices.
 9074	 */
 9075	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
 9076
 9077	/* Make NETIF_F_SG inheritable to MPLS.
 9078	 */
 9079	dev->mpls_features |= NETIF_F_SG;
 9080
 9081	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 9082	ret = notifier_to_errno(ret);
 9083	if (ret)
 9084		goto err_uninit;
 9085
 9086	ret = netdev_register_kobject(dev);
 9087	if (ret)
 9088		goto err_uninit;
 9089	dev->reg_state = NETREG_REGISTERED;
 9090
 9091	__netdev_update_features(dev);
 9092
 9093	/*
 9094	 *	Default initial state at registry is that the
 9095	 *	device is present.
 9096	 */
 9097
 9098	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9099
 9100	linkwatch_init_dev(dev);
 9101
 9102	dev_init_scheduler(dev);
 9103	dev_hold(dev);
 9104	list_netdevice(dev);
 9105	add_device_randomness(dev->dev_addr, dev->addr_len);
 9106
 9107	/* If the device has a permanent device address, the driver should
 9108	 * set dev_addr, and addr_assign_type should be set to
 9109	 * NET_ADDR_PERM (the default value).
 9110	 */
 9111	if (dev->addr_assign_type == NET_ADDR_PERM)
 9112		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 9113
 9114	/* Notify protocols that a new device appeared. */
 9115	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
 9116	ret = notifier_to_errno(ret);
 9117	if (ret) {
 9118		rollback_registered(dev);
 9119		rcu_barrier();
 9120
 9121		dev->reg_state = NETREG_UNREGISTERED;
 9122	}
 9123	/*
 9124	 *	Prevent userspace races by waiting until the network
 9125	 *	device is fully set up before sending notifications.
 9126	 */
 9127	if (!dev->rtnl_link_ops ||
 9128	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 9129		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
 9130
 9131out:
 9132	return ret;
 9133
 9134err_uninit:
 9135	if (dev->netdev_ops->ndo_uninit)
 9136		dev->netdev_ops->ndo_uninit(dev);
 9137	if (dev->priv_destructor)
 9138		dev->priv_destructor(dev);
 9139	goto out;
 9140}
 9141EXPORT_SYMBOL(register_netdevice);
 9142
 9143/**
 9144 *	init_dummy_netdev	- init a dummy network device for NAPI
 9145 *	@dev: device to init
 9146 *
 9147 *	This takes a network device structure and initializes the minimum
 9148 *	number of fields so it can be used to schedule NAPI polls without
 9149 *	registering a full blown interface. This is to be used by drivers
 9150 *	that need to tie several hardware interfaces to a single NAPI
 9151 *	poll scheduler due to HW limitations.
 9152 */
 9153int init_dummy_netdev(struct net_device *dev)
 9154{
 9155	/* Clear everything. Note we don't initialize spinlocks
 9156	 * as they aren't supposed to be taken by any of the
 9157	 * NAPI code and this dummy netdev is supposed to be
 9158	 * only ever used for NAPI polls
 9159	 */
 9160	memset(dev, 0, sizeof(struct net_device));
 9161
 9162	/* make sure we BUG if trying to hit standard
 9163	 * register/unregister code path
 9164	 */
 9165	dev->reg_state = NETREG_DUMMY;
 9166
 9167	/* NAPI wants this */
 9168	INIT_LIST_HEAD(&dev->napi_list);
 9169
 9170	/* a dummy interface is started by default */
 9171	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9172	set_bit(__LINK_STATE_START, &dev->state);
 9173
 9174	/* napi_busy_loop stats accounting wants this */
 9175	dev_net_set(dev, &init_net);
 9176
 9177	/* Note: We don't allocate pcpu_refcnt for dummy devices,
 9178	 * because users of this 'device' don't need to change
 9179	 * its refcount.
 9180	 */
 9181
 9182	return 0;
 9183}
 9184EXPORT_SYMBOL_GPL(init_dummy_netdev);
 9185
 9186
 9187/**
 9188 *	register_netdev	- register a network device
 9189 *	@dev: device to register
 9190 *
 9191 *	Take a completed network device structure and add it to the kernel
 9192 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 9193 *	chain. 0 is returned on success. A negative errno code is returned
 9194 *	on a failure to set up the device, or if the name is a duplicate.
 9195 *
 9196 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 9197 *	and expands the device name if you passed a format string to
 9198 *	alloc_netdev.
 9199 */
 9200int register_netdev(struct net_device *dev)
 9201{
 9202	int err;
 9203
 9204	if (rtnl_lock_killable())
 9205		return -EINTR;
 9206	err = register_netdevice(dev);
 9207	rtnl_unlock();
 9208	return err;
 9209}
 9210EXPORT_SYMBOL(register_netdev);
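/* A minimal probe-path sketch, assuming hypothetical foo_* driver names;
 * register_netdev() takes the rtnl semaphore itself, so the caller must
 * not already hold it:
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &foo_netdev_ops;
 *
 *	err = register_netdev(dev);
 *	if (err)
 *		free_netdev(dev);
 *	return err;
 */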
 9211
 9212int netdev_refcnt_read(const struct net_device *dev)
 9213{
 9214	int i, refcnt = 0;
 9215
 9216	for_each_possible_cpu(i)
 9217		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
 9218	return refcnt;
 9219}
 9220EXPORT_SYMBOL(netdev_refcnt_read);
 9221
 9222/**
 9223 * netdev_wait_allrefs - wait until all references are gone.
 9224 * @dev: target net_device
 9225 *
 9226 * This is called when unregistering network devices.
 9227 *
 9228 * Any protocol or device that holds a reference should register
 9229 * for netdevice notification, and cleanup and put back the
 9230 * reference if they receive an UNREGISTER event.
 9231 * We can get stuck here if buggy protocols don't correctly
 9232 * call dev_put.
 9233 */
 9234static void netdev_wait_allrefs(struct net_device *dev)
 9235{
 9236	unsigned long rebroadcast_time, warning_time;
 9237	int refcnt;
 9238
 9239	linkwatch_forget_dev(dev);
 9240
 9241	rebroadcast_time = warning_time = jiffies;
 9242	refcnt = netdev_refcnt_read(dev);
 9243
 9244	while (refcnt != 0) {
 9245		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 9246			rtnl_lock();
 9247
 9248			/* Rebroadcast unregister notification */
 9249			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 9250
 9251			__rtnl_unlock();
 9252			rcu_barrier();
 9253			rtnl_lock();
 9254			rtnl_lock();
 9255			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
 9256				     &dev->state)) {
 9257				/* We must not have linkwatch events
 9258				 * pending on unregister. If this
 9259				 * happens, we simply run the queue
 9260				 * unscheduled, resulting in a noop
 9261				 * for this device.
 9262				 */
 9263				linkwatch_run_queue();
 9264			}
 9265
 9266			__rtnl_unlock();
 9267
 9268			rebroadcast_time = jiffies;
 9269		}
 9270
 9271		msleep(250);
 9272
 9273		refcnt = netdev_refcnt_read(dev);
 9274
 9275		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
 9276			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
 9277				 dev->name, refcnt);
 9278			warning_time = jiffies;
 9279		}
 9280	}
 9281}
 9282
 9283/* The sequence is:
 9284 *
 9285 *	rtnl_lock();
 9286 *	...
 9287 *	register_netdevice(x1);
 9288 *	register_netdevice(x2);
 9289 *	...
 9290 *	unregister_netdevice(y1);
 9291 *	unregister_netdevice(y2);
 9292 *      ...
 9293 *	rtnl_unlock();
 9294 *	free_netdev(y1);
 9295 *	free_netdev(y2);
 9296 *
 9297 * We are invoked by rtnl_unlock().
 9298 * This allows us to deal with problems:
 9299 * 1) We can delete sysfs objects which invoke hotplug
 9300 *    without deadlocking with linkwatch via keventd.
 9301 * 2) Since we run with the RTNL semaphore not held, we can sleep
 9302 *    safely in order to wait for the netdev refcnt to drop to zero.
 9303 *
 9304 * We must not return until all unregister events added during
 9305 * the interval the lock was held have been completed.
 9306 */
 9307void netdev_run_todo(void)
 9308{
 9309	struct list_head list;
 9310
 9311	/* Snapshot list, allow later requests */
 9312	list_replace_init(&net_todo_list, &list);
 9313
 9314	__rtnl_unlock();
 9315
 9316
 9317	/* Wait for rcu callbacks to finish before next phase */
 9318	if (!list_empty(&list))
 9319		rcu_barrier();
 9320
 9321	while (!list_empty(&list)) {
 9322		struct net_device *dev
 9323			= list_first_entry(&list, struct net_device, todo_list);
 9324		list_del(&dev->todo_list);
 9325		list_del(&dev->todo_list);
 9326		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
 9327			pr_err("network todo '%s' but state %d\n",
 9328			       dev->name, dev->reg_state);
 9329			dump_stack();
 9330			continue;
 9331		}
 9332
 9333		dev->reg_state = NETREG_UNREGISTERED;
 9334
 9335		netdev_wait_allrefs(dev);
 9336
 9337		/* paranoia */
 9338		BUG_ON(netdev_refcnt_read(dev));
 9339		BUG_ON(!list_empty(&dev->ptype_all));
 9340		BUG_ON(!list_empty(&dev->ptype_specific));
 9341		WARN_ON(rcu_access_pointer(dev->ip_ptr));
 9342		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 9343#if IS_ENABLED(CONFIG_DECNET)
 9344		WARN_ON(dev->dn_ptr);
 9345#endif
 9346		if (dev->priv_destructor)
 9347			dev->priv_destructor(dev);
 9348		if (dev->needs_free_netdev)
 9349			free_netdev(dev);
 9350
 9351		/* Report a network device has been unregistered */
 9352		rtnl_lock();
 9353		dev_net(dev)->dev_unreg_count--;
 9354		__rtnl_unlock();
 9355		wake_up(&netdev_unregistering_wq);
 9356
 9357		/* Free network device */
 9358		kobject_put(&dev->dev.kobj);
 9359	}
 9360}
 9361
 9362/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 9363 * all the same fields in the same order as net_device_stats, with only
 9364 * the type differing, but rtnl_link_stats64 may have additional fields
 9365 * at the end for newer counters.
 9366 */
 9367void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 9368			     const struct net_device_stats *netdev_stats)
 9369{
 9370#if BITS_PER_LONG == 64
 9371	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
 9372	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
 9373	/* zero out counters that only exist in rtnl_link_stats64 */
 9374	memset((char *)stats64 + sizeof(*netdev_stats), 0,
 9375	       sizeof(*stats64) - sizeof(*netdev_stats));
 9376#else
 9377	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
 9378	const unsigned long *src = (const unsigned long *)netdev_stats;
 9379	u64 *dst = (u64 *)stats64;
 9380
 9381	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
 9382	for (i = 0; i < n; i++)
 9383		dst[i] = src[i];
 9384	/* zero out counters that only exist in rtnl_link_stats64 */
 9385	memset((char *)stats64 + n * sizeof(u64), 0,
 9386	       sizeof(*stats64) - n * sizeof(u64));
 9387#endif
 9388}
 9389EXPORT_SYMBOL(netdev_stats_to_stats64);
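/* Example: because rtnl_link_stats64 shares its leading fields with
 * net_device_stats, a legacy driver that only bumps the unsigned long
 * counters in dev->stats still reports sane 64-bit values:
 *
 *	struct rtnl_link_stats64 s;
 *
 *	dev->stats.rx_packets++;
 *	netdev_stats_to_stats64(&s, &dev->stats);
 *
 * Afterwards s.rx_packets equals dev->stats.rx_packets (widened to
 * u64) and the counters that exist only in rtnl_link_stats64 read 0.
 */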
 9390
 9391/**
 9392 *	dev_get_stats	- get network device statistics
 9393 *	@dev: device to get statistics from
 9394 *	@storage: place to store stats
 9395 *
 9396 *	Get network statistics from device. Return @storage.
 9397 *	The device driver may provide its own method by setting
 9398 *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
 9399 *	otherwise the internal statistics structure is used.
 9400 */
 9401struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 9402					struct rtnl_link_stats64 *storage)
 9403{
 9404	const struct net_device_ops *ops = dev->netdev_ops;
 9405
 9406	if (ops->ndo_get_stats64) {
 9407		memset(storage, 0, sizeof(*storage));
 9408		ops->ndo_get_stats64(dev, storage);
 9409	} else if (ops->ndo_get_stats) {
 9410		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
 9411	} else {
 9412		netdev_stats_to_stats64(storage, &dev->stats);
 9413	}
 9414	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
 9415	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
 9416	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
 9417	return storage;
 9418}
 9419EXPORT_SYMBOL(dev_get_stats);
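/* Usage sketch: callers pass scratch storage and read through the
 * returned pointer, e.g. a hypothetical debug dump:
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *	pr_info("%s: rx_packets %llu tx_packets %llu\n",
 *		dev->name, stats->rx_packets, stats->tx_packets);
 */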
 9420
 9421struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 9422{
 9423	struct netdev_queue *queue = dev_ingress_queue(dev);
 9424
 9425#ifdef CONFIG_NET_CLS_ACT
 9426	if (queue)
 9427		return queue;
 9428	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
 9429	if (!queue)
 9430		return NULL;
 9431	netdev_init_one_queue(dev, queue, NULL);
 9432	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
 9433	queue->qdisc_sleeping = &noop_qdisc;
 9434	rcu_assign_pointer(dev->ingress_queue, queue);
 9435#endif
 9436	return queue;
 9437}
 9438
 9439static const struct ethtool_ops default_ethtool_ops;
 9440
 9441void netdev_set_default_ethtool_ops(struct net_device *dev,
 9442				    const struct ethtool_ops *ops)
 9443{
 9444	if (dev->ethtool_ops == &default_ethtool_ops)
 9445		dev->ethtool_ops = ops;
 9446}
 9447EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
 9448
 9449void netdev_freemem(struct net_device *dev)
 9450{
 9451	char *addr = (char *)dev - dev->padded;
 9452
 9453	kvfree(addr);
 9454}
 9455
 9456/**
 9457 * alloc_netdev_mqs - allocate network device
 9458 * @sizeof_priv: size of private data to allocate space for
 9459 * @name: device name format string
 9460 * @name_assign_type: origin of device name
 9461 * @setup: callback to initialize device
 9462 * @txqs: the number of TX subqueues to allocate
 9463 * @rxqs: the number of RX subqueues to allocate
 9464 *
 9465 * Allocates a struct net_device with private data area for driver use
 9466 * and performs basic initialization.  Also allocates subqueue structs
 9467 * for each queue on the device.
 9468 */
 9469struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 9470		unsigned char name_assign_type,
 9471		void (*setup)(struct net_device *),
 9472		unsigned int txqs, unsigned int rxqs)
 9473{
 9474	struct net_device *dev;
 9475	unsigned int alloc_size;
 9476	struct net_device *p;
 9477
 9478	BUG_ON(strlen(name) >= sizeof(dev->name));
 9479
 9480	if (txqs < 1) {
 9481		pr_err("alloc_netdev: Unable to allocate device with zero TX queues\n");
 9482		return NULL;
 9483	}
 9484
 9485	if (rxqs < 1) {
 9486		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 9487		return NULL;
 9488	}
 9489
 9490	alloc_size = sizeof(struct net_device);
 9491	if (sizeof_priv) {
 9492		/* ensure 32-byte alignment of private area */
 9493		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
 9494		alloc_size += sizeof_priv;
 9495	}
 9496	/* ensure 32-byte alignment of whole construct */
 9497	alloc_size += NETDEV_ALIGN - 1;
 9498
 9499	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9500	if (!p)
 9501		return NULL;
 9502
 9503	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 9504	dev->padded = (char *)dev - (char *)p;
 9505
 9506	dev->pcpu_refcnt = alloc_percpu(int);
 9507	if (!dev->pcpu_refcnt)
 9508		goto free_dev;
 9509
 9510	if (dev_addr_init(dev))
 9511		goto free_pcpu;
 9512
 9513	dev_mc_init(dev);
 9514	dev_uc_init(dev);
 9515
 9516	dev_net_set(dev, &init_net);
 9517
 9518	netdev_register_lockdep_key(dev);
 9519
 9520	dev->gso_max_size = GSO_MAX_SIZE;
 9521	dev->gso_max_segs = GSO_MAX_SEGS;
 9522	dev->upper_level = 1;
 9523	dev->lower_level = 1;
 9524
 9525	INIT_LIST_HEAD(&dev->napi_list);
 9526	INIT_LIST_HEAD(&dev->unreg_list);
 9527	INIT_LIST_HEAD(&dev->close_list);
 9528	INIT_LIST_HEAD(&dev->link_watch_list);
 9529	INIT_LIST_HEAD(&dev->adj_list.upper);
 9530	INIT_LIST_HEAD(&dev->adj_list.lower);
 9531	INIT_LIST_HEAD(&dev->ptype_all);
 9532	INIT_LIST_HEAD(&dev->ptype_specific);
 9533#ifdef CONFIG_NET_SCHED
 9534	hash_init(dev->qdisc_hash);
 9535#endif
 9536	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 9537	setup(dev);
 9538
 9539	if (!dev->tx_queue_len) {
 9540		dev->priv_flags |= IFF_NO_QUEUE;
 9541		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
 9542	}
 9543
 9544	dev->num_tx_queues = txqs;
 9545	dev->real_num_tx_queues = txqs;
 9546	if (netif_alloc_netdev_queues(dev))
 9547		goto free_all;
 9548
 9549	dev->num_rx_queues = rxqs;
 9550	dev->real_num_rx_queues = rxqs;
 9551	if (netif_alloc_rx_queues(dev))
 9552		goto free_all;
 9553
 9554	strcpy(dev->name, name);
 9555	dev->name_assign_type = name_assign_type;
 9556	dev->group = INIT_NETDEV_GROUP;
 9557	if (!dev->ethtool_ops)
 9558		dev->ethtool_ops = &default_ethtool_ops;
 9559
 9560	nf_hook_ingress_init(dev);
 9561
 9562	return dev;
 9563
 9564free_all:
 9565	free_netdev(dev);
 9566	return NULL;
 9567
 9568free_pcpu:
 9569	free_percpu(dev->pcpu_refcnt);
 9570free_dev:
 9571	netdev_freemem(dev);
 9572	return NULL;
 9573}
 9574EXPORT_SYMBOL(alloc_netdev_mqs);
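/* Usage sketch: a hypothetical multiqueue Ethernet driver with a
 * private struct my_priv and 4 TX / 4 RX queues might call:
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "eth%d",
 *			       NET_NAME_ENUM, ether_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 *
 * Single-queue callers normally go through the alloc_netdev() or
 * alloc_etherdev() wrappers, which end up here with txqs = rxqs = 1.
 */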
 9575
 9576/**
 9577 * free_netdev - free network device
 9578 * @dev: device
 9579 *
 9580 * This function does the last stage of destroying an allocated device
 9581 * interface. The reference to the device object is released. If this
 9582 * is the last reference then it will be freed. Must be called in
 9583 * process context.
 9584 */
 9585void free_netdev(struct net_device *dev)
 9586{
 9587	struct napi_struct *p, *n;
 9588
 9589	might_sleep();
 9590	netif_free_tx_queues(dev);
 9591	netif_free_rx_queues(dev);
 9592
 9593	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
 9594
 9595	/* Flush device addresses */
 9596	dev_addr_flush(dev);
 9597
 9598	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 9599		netif_napi_del(p);
 9600
 9601	free_percpu(dev->pcpu_refcnt);
 9602	dev->pcpu_refcnt = NULL;
 9603
 9604	netdev_unregister_lockdep_key(dev);
 9605
 9606	/*  Compatibility with error handling in drivers */
 9607	if (dev->reg_state == NETREG_UNINITIALIZED) {
 9608		netdev_freemem(dev);
 9609		return;
 9610	}
 9611
 9612	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
 9613	dev->reg_state = NETREG_RELEASED;
 9614
 9615	/* will free via device release */
 9616	put_device(&dev->dev);
 9617}
 9618EXPORT_SYMBOL(free_netdev);
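/* Typical error-path pairing, per the NETREG_UNINITIALIZED branch
 * above: a device that was allocated but never successfully
 * registered is released with free_netdev() directly:
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *
 * (struct my_priv is a placeholder for the driver's private data.)
 */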
 9619
 9620/**
 9621 *	synchronize_net -  Synchronize with packet receive processing
 9622 *
 9623 *	Wait for packets currently being received to be done.
 9624 *	Does not block later packets from starting.
 9625 */
 9626void synchronize_net(void)
 9627{
 9628	might_sleep();
 9629	if (rtnl_is_locked())
 9630		synchronize_rcu_expedited();
 9631	else
 9632		synchronize_rcu();
 9633}
 9634EXPORT_SYMBOL(synchronize_net);
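/* Usage sketch: the classic pattern is to unlink an RCU-protected
 * handler, wait out in-flight receivers, then free or reuse it:
 *
 *	list_del_rcu(&pt->list);
 *	synchronize_net();
 *	kfree(pt);
 *
 * dev_remove_pack() implements exactly this for packet handlers;
 * pt here stands for any RCU-published object of the caller's.
 */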
 9635
 9636/**
 9637 *	unregister_netdevice_queue - remove device from the kernel
 9638 *	@dev: device
 9639 *	@head: list
 9640 *
 9641 *	This function shuts down a device interface and removes it
 9642 *	from the kernel tables.
 9643 *	If @head is not NULL, the device is queued to be unregistered later.
 9644 *
 9645 *	Callers must hold the rtnl semaphore.  You may want
 9646 *	unregister_netdev() instead of this.
 9647 */
 9648
 9649void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 9650{
 9651	ASSERT_RTNL();
 9652
 9653	if (head) {
 9654		list_move_tail(&dev->unreg_list, head);
 9655	} else {
 9656		rollback_registered(dev);
 9657		/* Finish processing unregister after unlock */
 9658		net_set_todo(dev);
 9659	}
 9660}
 9661EXPORT_SYMBOL(unregister_netdevice_queue);
 9662
 9663/**
 9664 *	unregister_netdevice_many - unregister many devices
 9665 *	@head: list of devices
 9666 *
 9667 *  Note: As most callers use a stack-allocated list_head,
 9668 *  we force a list_del() to make sure the stack won't be corrupted later.
 9669 */
 9670void unregister_netdevice_many(struct list_head *head)
 9671{
 9672	struct net_device *dev;
 9673
 9674	if (!list_empty(head)) {
 9675		rollback_registered_many(head);
 9676		list_for_each_entry(dev, head, unreg_list)
 9677			net_set_todo(dev);
 9678		list_del(head);
 9679	}
 9680}
 9681EXPORT_SYMBOL(unregister_netdevice_many);
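/* Usage sketch: batching several unregistrations under one RTNL hold
 * amortizes the expensive RCU grace periods:
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 *
 * dev1/dev2 are placeholders; default_device_exit_batch() below uses
 * the same scheme to tear down a whole namespace in one batch.
 */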
 9682
 9683/**
 9684 *	unregister_netdev - remove device from the kernel
 9685 *	@dev: device
 9686 *
 9687 *	This function shuts down a device interface and removes it
 9688 *	from the kernel tables.
 9689 *
 9690 *	This is just a wrapper for unregister_netdevice that takes
 9691 *	the rtnl semaphore.  In general you want to use this and not
 9692 *	unregister_netdevice.
 9693 */
 9694void unregister_netdev(struct net_device *dev)
 9695{
 9696	rtnl_lock();
 9697	unregister_netdevice(dev);
 9698	rtnl_unlock();
 9699}
 9700EXPORT_SYMBOL(unregister_netdev);
 9701
 9702/**
 9703 *	dev_change_net_namespace - move device to a different network namespace
 9704 *	@dev: device
 9705 *	@net: network namespace
 9706 *	@pat: If not NULL, name pattern to try if the current device name
 9707 *	      is already taken in the destination network namespace.
 9708 *
 9709 *	This function shuts down a device interface and moves it
 9710 *	to a new network namespace. On success 0 is returned, on
 9711 *	a failure a negative errno code is returned.
 9712 *
 9713 *	Callers must hold the rtnl semaphore.
 9714 */
 9715
 9716int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 9717{
 9718	int err, new_nsid, new_ifindex;
 9719
 9720	ASSERT_RTNL();
 9721
 9722	/* Don't allow namespace local devices to be moved. */
 9723	err = -EINVAL;
 9724	if (dev->features & NETIF_F_NETNS_LOCAL)
 9725		goto out;
 9726
 9727	/* Ensure the device has been registered */
 9728	if (dev->reg_state != NETREG_REGISTERED)
 9729		goto out;
 9730
 9731	/* Get out if there is nothing to do */
 9732	err = 0;
 9733	if (net_eq(dev_net(dev), net))
 9734		goto out;
 9735
 9736	/* Pick the destination device name, and ensure
 9737	 * we can use it in the destination network namespace.
 9738	 */
 9739	err = -EEXIST;
 9740	if (__dev_get_by_name(net, dev->name)) {
 9741		/* We get here if we can't use the current device name */
 9742		if (!pat)
 9743			goto out;
 9744		err = dev_get_valid_name(net, dev, pat);
 9745		if (err < 0)
 9746			goto out;
 9747	}
 9748
 9749	/*
 9750	 * And now a mini version of register_netdevice() and unregister_netdevice().
 9751	 */
 9752
 9753	/* If device is running close it first. */
 9754	dev_close(dev);
 9755
 9756	/* And unlink it from device chain */
 9757	unlist_netdevice(dev);
 9758
 9759	synchronize_net();
 9760
 9761	/* Shutdown queueing discipline. */
 9762	dev_shutdown(dev);
 9763
 9764	/* Notify protocols that we are about to destroy
 9765	 * this device. They should clean up all their state.
 9766	 *
 9767	 * Note that dev->reg_state stays at NETREG_REGISTERED.
 9768	 * This is wanted because this way 8021q and macvlan know
 9769	 * the device is just moving and can keep their slaves up.
 9770	 */
 9771	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 9772	rcu_barrier();
 9773
 9774	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
 9775	/* If there is an ifindex conflict assign a new one */
 9776	if (__dev_get_by_index(net, dev->ifindex))
 9777		new_ifindex = dev_new_index(net);
 9778	else
 9779		new_ifindex = dev->ifindex;
 9780
 9781	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
 9782			    new_ifindex);
 9783
 9784	/*
 9785	 *	Flush the unicast and multicast chains
 9786	 */
 9787	dev_uc_flush(dev);
 9788	dev_mc_flush(dev);
 9789
 9790	/* Send a netdev-removed uevent to the old namespace */
 9791	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
 9792	netdev_adjacent_del_links(dev);
 9793
 9794	/* Actually switch the network namespace */
 9795	dev_net_set(dev, net);
 9796	dev->ifindex = new_ifindex;
 9797
 9798	/* Send a netdev-add uevent to the new namespace */
 9799	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
 9800	netdev_adjacent_add_links(dev);
 9801
 9802	/* Fixup kobjects */
 9803	err = device_rename(&dev->dev, dev->name);
 9804	WARN_ON(err);
 9805
 9806	/* Add the device back in the hashes */
 9807	list_netdevice(dev);
 9808
 9809	/* Notify protocols that a new device has appeared. */
 9810	call_netdevice_notifiers(NETDEV_REGISTER, dev);
 9811
 9812	/*
 9813	 *	Prevent userspace races by waiting until the network
 9814	 *	device is fully set up before sending notifications.
 9815	 */
 9816	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
 9817
 9818	synchronize_net();
 9819	err = 0;
 9820out:
 9821	return err;
 9822}
 9823EXPORT_SYMBOL_GPL(dev_change_net_namespace);
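/* Usage sketch: move a device into another namespace under RTNL,
 * falling back to a "dev%d" style name on collision, just as
 * default_device_exit() below does when a namespace is dismantled:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "dev%d");
 *	rtnl_unlock();
 *
 * target_net is a hypothetical struct net pointer owned by the caller.
 */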
 9824
 9825static int dev_cpu_dead(unsigned int oldcpu)
 9826{
 9827	struct sk_buff **list_skb;
 9828	struct sk_buff *skb;
 9829	unsigned int cpu;
 9830	struct softnet_data *sd, *oldsd, *remsd = NULL;
 9831
 9832	local_irq_disable();
 9833	cpu = smp_processor_id();
 9834	sd = &per_cpu(softnet_data, cpu);
 9835	oldsd = &per_cpu(softnet_data, oldcpu);
 9836
 9837	/* Find end of our completion_queue. */
 9838	list_skb = &sd->completion_queue;
 9839	while (*list_skb)
 9840		list_skb = &(*list_skb)->next;
 9841	/* Append completion queue from offline CPU. */
 9842	*list_skb = oldsd->completion_queue;
 9843	oldsd->completion_queue = NULL;
 9844
 9845	/* Append output queue from offline CPU. */
 9846	if (oldsd->output_queue) {
 9847		*sd->output_queue_tailp = oldsd->output_queue;
 9848		sd->output_queue_tailp = oldsd->output_queue_tailp;
 9849		oldsd->output_queue = NULL;
 9850		oldsd->output_queue_tailp = &oldsd->output_queue;
 9851	}
 9852	/* Append NAPI poll list from offline CPU, with one exception:
 9853	 * process_backlog() must be called by the CPU owning the percpu backlog.
 9854	 * We properly handle process_queue & input_pkt_queue later.
 9855	 */
 9856	while (!list_empty(&oldsd->poll_list)) {
 9857		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
 9858							    struct napi_struct,
 9859							    poll_list);
 9860
 9861		list_del_init(&napi->poll_list);
 9862		if (napi->poll == process_backlog)
 9863			napi->state = 0;
 9864		else
 9865			____napi_schedule(sd, napi);
 9866	}
 9867
 9868	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 9869	local_irq_enable();
 9870
 9871#ifdef CONFIG_RPS
 9872	remsd = oldsd->rps_ipi_list;
 9873	oldsd->rps_ipi_list = NULL;
 9874#endif
 9875	/* send out pending IPIs on offline CPU */
 9876	net_rps_send_ipi(remsd);
 9877
 9878	/* Process offline CPU's input_pkt_queue */
 9879	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 9880		netif_rx_ni(skb);
 9881		input_queue_head_incr(oldsd);
 9882	}
 9883	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
 9884		netif_rx_ni(skb);
 9885		input_queue_head_incr(oldsd);
 9886	}
 9887
 9888	return 0;
 9889}
 9890
 9891/**
 9892 *	netdev_increment_features - increment feature set by one
 9893 *	@all: current feature set
 9894 *	@one: new feature set
 9895 *	@mask: mask feature set
 9896 *
 9897 *	Computes a new feature set after adding a device with feature set
 9898 *	@one to the master device with current feature set @all.  Will not
 9899 *	enable anything that is off in @mask. Returns the new feature set.
 9900 */
 9901netdev_features_t netdev_increment_features(netdev_features_t all,
 9902	netdev_features_t one, netdev_features_t mask)
 9903{
 9904	if (mask & NETIF_F_HW_CSUM)
 9905		mask |= NETIF_F_CSUM_MASK;
 9906	mask |= NETIF_F_VLAN_CHALLENGED;
 9907
 9908	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
 9909	all &= one | ~NETIF_F_ALL_FOR_ALL;
 9910
 9911	/* If one device supports hw checksumming, set for all. */
 9912	if (all & NETIF_F_HW_CSUM)
 9913		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
 9914
 9915	return all;
 9916}
 9917EXPORT_SYMBOL(netdev_increment_features);
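/* Usage sketch: an aggregating driver (bonding, team, bridge) folds
 * each slave's features into the master's set, starting from
 * NETIF_F_ALL_FOR_ALL so that the first slave can only narrow it:
 *
 *	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
 *
 *	for_each_slave(bond, slave)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *
 * for_each_slave() and mask stand in for the driver's own iteration
 * primitive and feature mask (e.g. bonding's BOND_VLAN_FEATURES).
 */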
 9918
 9919static struct hlist_head * __net_init netdev_create_hash(void)
 9920{
 9921	int i;
 9922	struct hlist_head *hash;
 9923
 9924	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
 9925	if (hash != NULL)
 9926		for (i = 0; i < NETDEV_HASHENTRIES; i++)
 9927			INIT_HLIST_HEAD(&hash[i]);
 9928
 9929	return hash;
 9930}
 9931
 9932/* Initialize per network namespace state */
 9933static int __net_init netdev_init(struct net *net)
 9934{
 9935	BUILD_BUG_ON(GRO_HASH_BUCKETS >
 9936		     8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
 9937
 9938	if (net != &init_net)
 9939		INIT_LIST_HEAD(&net->dev_base_head);
 9940
 9941	net->dev_name_head = netdev_create_hash();
 9942	if (net->dev_name_head == NULL)
 9943		goto err_name;
 9944
 9945	net->dev_index_head = netdev_create_hash();
 9946	if (net->dev_index_head == NULL)
 9947		goto err_idx;
 9948
 9949	return 0;
 9950
 9951err_idx:
 9952	kfree(net->dev_name_head);
 9953err_name:
 9954	return -ENOMEM;
 9955}
 9956
 9957/**
 9958 *	netdev_drivername - network driver for the device
 9959 *	@dev: network device
 9960 *
 9961 *	Determine network driver for device.
 9962 */
 9963const char *netdev_drivername(const struct net_device *dev)
 9964{
 9965	const struct device_driver *driver;
 9966	const struct device *parent;
 9967	const char *empty = "";
 9968
 9969	parent = dev->dev.parent;
 9970	if (!parent)
 9971		return empty;
 9972
 9973	driver = parent->driver;
 9974	if (driver && driver->name)
 9975		return driver->name;
 9976	return empty;
 9977}
 9978
 9979static void __netdev_printk(const char *level, const struct net_device *dev,
 9980			    struct va_format *vaf)
 9981{
 9982	if (dev && dev->dev.parent) {
 9983		dev_printk_emit(level[1] - '0',
 9984				dev->dev.parent,
 9985				"%s %s %s%s: %pV",
 9986				dev_driver_string(dev->dev.parent),
 9987				dev_name(dev->dev.parent),
 9988				netdev_name(dev), netdev_reg_state(dev),
 9989				vaf);
 9990	} else if (dev) {
 9991		printk("%s%s%s: %pV",
 9992		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
 9993	} else {
 9994		printk("%s(NULL net_device): %pV", level, vaf);
 9995	}
 9996}
 9997
 9998void netdev_printk(const char *level, const struct net_device *dev,
 9999		   const char *format, ...)
10000{
10001	struct va_format vaf;
10002	va_list args;
10003
10004	va_start(args, format);
10005
10006	vaf.fmt = format;
10007	vaf.va = &args;
10008
10009	__netdev_printk(level, dev, &vaf);
10010
10011	va_end(args);
10012}
10013EXPORT_SYMBOL(netdev_printk);
10014
10015#define define_netdev_printk_level(func, level)			\
10016void func(const struct net_device *dev, const char *fmt, ...)	\
10017{								\
10018	struct va_format vaf;					\
10019	va_list args;						\
10020								\
10021	va_start(args, fmt);					\
10022								\
10023	vaf.fmt = fmt;						\
10024	vaf.va = &args;						\
10025								\
10026	__netdev_printk(level, dev, &vaf);			\
10027								\
10028	va_end(args);						\
10029}								\
10030EXPORT_SYMBOL(func);
10031
10032define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10033define_netdev_printk_level(netdev_alert, KERN_ALERT);
10034define_netdev_printk_level(netdev_crit, KERN_CRIT);
10035define_netdev_printk_level(netdev_err, KERN_ERR);
10036define_netdev_printk_level(netdev_warn, KERN_WARNING);
10037define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10038define_netdev_printk_level(netdev_info, KERN_INFO);
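/* Usage sketch: these wrappers prefix messages with the driver, bus
 * address and interface name, so callers just write:
 *
 *	netdev_warn(dev, "link speed %d not supported\n", speed);
 *	netdev_info(dev, "renamed from %s\n", oldname);
 *
 * which __netdev_printk() above turns into something like
 * "e1000e 0000:00:19.0 eth0: link speed 10 not supported".
 */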
10039
10040static void __net_exit netdev_exit(struct net *net)
10041{
10042	kfree(net->dev_name_head);
10043	kfree(net->dev_index_head);
10044	if (net != &init_net)
10045		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10046}
10047
10048static struct pernet_operations __net_initdata netdev_net_ops = {
10049	.init = netdev_init,
10050	.exit = netdev_exit,
10051};
10052
10053static void __net_exit default_device_exit(struct net *net)
10054{
10055	struct net_device *dev, *aux;
10056	/*
10057	 * Push all migratable network devices back to the
10058	 * initial network namespace
10059	 */
10060	rtnl_lock();
10061	for_each_netdev_safe(net, dev, aux) {
10062		int err;
10063		char fb_name[IFNAMSIZ];
10064
10065		/* Ignore unmovable devices (e.g. loopback) */
10066		if (dev->features & NETIF_F_NETNS_LOCAL)
10067			continue;
10068
10069		/* Leave virtual devices for the generic cleanup */
10070		if (dev->rtnl_link_ops)
10071			continue;
10072
10073		/* Push remaining network devices to init_net */
10074		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10075		if (__dev_get_by_name(&init_net, fb_name))
10076			snprintf(fb_name, IFNAMSIZ, "dev%%d");
10077		err = dev_change_net_namespace(dev, &init_net, fb_name);
10078		if (err) {
10079			pr_emerg("%s: failed to move %s to init_net: %d\n",
10080				 __func__, dev->name, err);
10081			BUG();
10082		}
10083	}
10084	rtnl_unlock();
10085}
10086
10087static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10088{
10089	/* Return with the rtnl_lock held when there are no network
10090	 * devices unregistering in any network namespace in net_list.
10091	 */
10092	struct net *net;
10093	bool unregistering;
10094	DEFINE_WAIT_FUNC(wait, woken_wake_function);
10095
10096	add_wait_queue(&netdev_unregistering_wq, &wait);
10097	for (;;) {
10098		unregistering = false;
10099		rtnl_lock();
10100		list_for_each_entry(net, net_list, exit_list) {
10101			if (net->dev_unreg_count > 0) {
10102				unregistering = true;
10103				break;
10104			}
10105		}
10106		if (!unregistering)
10107			break;
10108		__rtnl_unlock();
10109
10110		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10111	}
10112	remove_wait_queue(&netdev_unregistering_wq, &wait);
10113}
10114
10115static void __net_exit default_device_exit_batch(struct list_head *net_list)
10116{
10117	/* At exit all network devices must be removed from a network
10118	 * namespace.  Do this in the reverse order of registration.
10119	 * Do this across as many network namespaces as possible to
10120	 * improve batching efficiency.
10121	 */
10122	struct net_device *dev;
10123	struct net *net;
10124	LIST_HEAD(dev_kill_list);
10125
10126	/* To prevent network device cleanup code from dereferencing
10127	 * loopback devices or network devices that have been freed,
10128	 * wait here for all pending unregistrations to complete
10129	 * before unregistering the loopback device and allowing the
10130	 * network namespace to be freed.
10131	 *
10132	 * The netdev todo list containing all network devices
10133	 * unregistrations that happen in default_device_exit_batch
10134	 * will run in the rtnl_unlock() at the end of
10135	 * default_device_exit_batch.
10136	 */
10137	rtnl_lock_unregistering(net_list);
10138	list_for_each_entry(net, net_list, exit_list) {
10139		for_each_netdev_reverse(net, dev) {
10140			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10141				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10142			else
10143				unregister_netdevice_queue(dev, &dev_kill_list);
10144		}
10145	}
10146	unregister_netdevice_many(&dev_kill_list);
10147	rtnl_unlock();
10148}
10149
10150static struct pernet_operations __net_initdata default_device_ops = {
10151	.exit = default_device_exit,
10152	.exit_batch = default_device_exit_batch,
10153};
10154
10155/*
10156 *	Initialize the DEV module. At boot time this walks the device list and
10157 *	unhooks any devices that fail to initialise (normally hardware not
10158 *	present) and leaves us with a valid list of present and active devices.
10159 *
10160 */
10161
10162/*
10163 *       This is called single-threaded during boot, so no need
10164 *       to take the rtnl semaphore.
10165 */
10166static int __init net_dev_init(void)
10167{
10168	int i, rc = -ENOMEM;
10169
10170	BUG_ON(!dev_boot_phase);
10171
10172	if (dev_proc_init())
10173		goto out;
10174
10175	if (netdev_kobject_init())
10176		goto out;
10177
10178	INIT_LIST_HEAD(&ptype_all);
10179	for (i = 0; i < PTYPE_HASH_SIZE; i++)
10180		INIT_LIST_HEAD(&ptype_base[i]);
10181
10182	INIT_LIST_HEAD(&offload_base);
10183
10184	if (register_pernet_subsys(&netdev_net_ops))
10185		goto out;
10186
10187	/*
10188	 *	Initialise the packet receive queues.
10189	 */
10190
10191	for_each_possible_cpu(i) {
10192		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
10193		struct softnet_data *sd = &per_cpu(softnet_data, i);
10194
10195		INIT_WORK(flush, flush_backlog);
10196
10197		skb_queue_head_init(&sd->input_pkt_queue);
10198		skb_queue_head_init(&sd->process_queue);
10199#ifdef CONFIG_XFRM_OFFLOAD
10200		skb_queue_head_init(&sd->xfrm_backlog);
10201#endif
10202		INIT_LIST_HEAD(&sd->poll_list);
10203		sd->output_queue_tailp = &sd->output_queue;
10204#ifdef CONFIG_RPS
10205		sd->csd.func = rps_trigger_softirq;
10206		sd->csd.info = sd;
10207		sd->cpu = i;
10208#endif
10209
10210		init_gro_hash(&sd->backlog);
10211		sd->backlog.poll = process_backlog;
10212		sd->backlog.weight = weight_p;
10213	}
10214
10215	dev_boot_phase = 0;
10216
10217	/* The loopback device is special: if any other network device
10218	 * is present in a network namespace, the loopback device must
10219	 * be present too. Since we now dynamically allocate and free
10220	 * the loopback device, ensure this invariant is maintained by
10221	 * keeping the loopback device as the first device on the
10222	 * list of network devices, ensuring that the loopback device
10223	 * is the first device that appears and the last network device
10224	 * that disappears.
10225	 */
10226	if (register_pernet_device(&loopback_net_ops))
10227		goto out;
10228
10229	if (register_pernet_device(&default_device_ops))
10230		goto out;
10231
10232	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
10233	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
10234
10235	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
10236				       NULL, dev_cpu_dead);
10237	WARN_ON(rc < 0);
10238	rc = 0;
10239out:
10240	return rc;
10241}
10242
10243subsys_initcall(net_dev_init);
v4.6
 
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
 
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
 
 
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <net/busy_poll.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/stat.h>
 102#include <net/dst.h>
 103#include <net/dst_metadata.h>
 104#include <net/pkt_sched.h>
 
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/module.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 113#include <net/iw_handler.h>
 114#include <asm/current.h>
 115#include <linux/audit.h>
 116#include <linux/dmaengine.h>
 117#include <linux/err.h>
 118#include <linux/ctype.h>
 119#include <linux/if_arp.h>
 120#include <linux/if_vlan.h>
 121#include <linux/ip.h>
 122#include <net/ip.h>
 123#include <net/mpls.h>
 124#include <linux/ipv6.h>
 125#include <linux/in.h>
 126#include <linux/jhash.h>
 127#include <linux/random.h>
 128#include <trace/events/napi.h>
 129#include <trace/events/net.h>
 130#include <trace/events/skb.h>
 131#include <linux/pci.h>
 132#include <linux/inetdevice.h>
 133#include <linux/cpu_rmap.h>
 134#include <linux/static_key.h>
 135#include <linux/hashtable.h>
 136#include <linux/vmalloc.h>
 137#include <linux/if_macvlan.h>
 138#include <linux/errqueue.h>
 139#include <linux/hrtimer.h>
 140#include <linux/netfilter_ingress.h>
 
 141#include <linux/sctp.h>
 
 
 
 
 142
 143#include "net-sysfs.h"
 144
 145/* Instead of increasing this, you should create a hash table. */
 146#define MAX_GRO_SKBS 8
 
 147
 148/* This should be increased if a protocol with a bigger head is added. */
 149#define GRO_MAX_HEAD (MAX_HEADER + 128)
 150
 151static DEFINE_SPINLOCK(ptype_lock);
 152static DEFINE_SPINLOCK(offload_lock);
 153struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 154struct list_head ptype_all __read_mostly;	/* Taps */
 155static struct list_head offload_base __read_mostly;
 156
 157static int netif_rx_internal(struct sk_buff *skb);
 158static int call_netdevice_notifiers_info(unsigned long val,
 159					 struct net_device *dev,
 160					 struct netdev_notifier_info *info);
 
 
 
 
 161
 162/*
 163 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 164 * semaphore.
 165 *
 166 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 167 *
 168 * Writers must hold the rtnl semaphore while they loop through the
 169 * dev_base_head list, and hold dev_base_lock for writing when they do the
 170 * actual updates.  This allows pure readers to access the list even
 171 * while a writer is preparing to update it.
 172 *
 173 * To put it another way, dev_base_lock is held for writing only to
 174 * protect against pure readers; the rtnl semaphore provides the
 175 * protection against other writers.
 176 *
 177 * See, for example usages, register_netdevice() and
 178 * unregister_netdevice(), which must be called with the rtnl
 179 * semaphore held.
 180 */
 181DEFINE_RWLOCK(dev_base_lock);
 182EXPORT_SYMBOL(dev_base_lock);
 183
 
 
 184/* protects napi_hash addition/deletion and napi_gen_id */
 185static DEFINE_SPINLOCK(napi_hash_lock);
 186
 187static unsigned int napi_gen_id = NR_CPUS;
 188static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 189
 190static seqcount_t devnet_rename_seq;
 191
 192static inline void dev_base_seq_inc(struct net *net)
 193{
 194	while (++net->dev_base_seq == 0);
 
 195}
 196
 197static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 198{
 199	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200
 201	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 202}
 203
 204static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 205{
 206	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 207}
 208
 209static inline void rps_lock(struct softnet_data *sd)
 210{
 211#ifdef CONFIG_RPS
 212	spin_lock(&sd->input_pkt_queue.lock);
 213#endif
 214}
 215
 216static inline void rps_unlock(struct softnet_data *sd)
 217{
 218#ifdef CONFIG_RPS
 219	spin_unlock(&sd->input_pkt_queue.lock);
 220#endif
 221}
 222
 223/* Device list insertion */
 224static void list_netdevice(struct net_device *dev)
 225{
 226	struct net *net = dev_net(dev);
 227
 228	ASSERT_RTNL();
 229
 230	write_lock_bh(&dev_base_lock);
 231	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 232	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 233	hlist_add_head_rcu(&dev->index_hlist,
 234			   dev_index_hash(net, dev->ifindex));
 235	write_unlock_bh(&dev_base_lock);
 236
 237	dev_base_seq_inc(net);
 238}
 239
 240/* Device list removal
 241 * caller must respect a RCU grace period before freeing/reusing dev
 242 */
 243static void unlist_netdevice(struct net_device *dev)
 244{
 245	ASSERT_RTNL();
 246
 247	/* Unlink dev from the device chain */
 248	write_lock_bh(&dev_base_lock);
 249	list_del_rcu(&dev->dev_list);
 250	hlist_del_rcu(&dev->name_hlist);
 251	hlist_del_rcu(&dev->index_hlist);
 252	write_unlock_bh(&dev_base_lock);
 253
 254	dev_base_seq_inc(dev_net(dev));
 255}
 256
 257/*
 258 *	Our notifier list
 259 */
 260
 261static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263/*
 264 *	Device drivers call our routines to queue packets here. We empty the
 265 *	queue in the local softnet handler.
 266 */
 267
 268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271#ifdef CONFIG_LOCKDEP
 272/*
 273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274 * according to dev->type
 275 */
 276static const unsigned short netdev_lock_type[] =
 277	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 290	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 291	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 292
 293static const char *const netdev_lock_name[] =
 294	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 295	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 296	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 297	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 298	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 299	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 300	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 301	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 302	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 303	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 304	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 305	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 306	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 307	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 308	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 309
 310static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 311static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312
 313static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 314{
 315	int i;
 316
 317	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 318		if (netdev_lock_type[i] == dev_type)
 319			return i;
 320	/* the last key is used by default */
 321	return ARRAY_SIZE(netdev_lock_type) - 1;
 322}
 323
 324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325						 unsigned short dev_type)
 326{
 327	int i;
 328
 329	i = netdev_lock_pos(dev_type);
 330	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 331				   netdev_lock_name[i]);
 332}
 333
 334static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335{
 336	int i;
 337
 338	i = netdev_lock_pos(dev->type);
 339	lockdep_set_class_and_name(&dev->addr_list_lock,
 340				   &netdev_addr_lock_key[i],
 341				   netdev_lock_name[i]);
 342}
 343#else
 344static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 345						 unsigned short dev_type)
 346{
 347}
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350}
 351#endif
 352
 353/*******************************************************************************
 
 
 
 
 354
 355		Protocol management and registration routines
 356
 357*******************************************************************************/
 358
 359/*
 360 *	Add a protocol ID to the list. Now that the input handler is
 361 *	smarter we can dispense with all the messy stuff that used to be
 362 *	here.
 363 *
 364 *	BEWARE!!! Protocol handlers, mangling input packets,
 365 *	MUST BE last in hash buckets and checking protocol handlers
 366 *	MUST start from promiscuous ptype_all chain in net_bh.
 367 *	It is true now, do not change it.
 368 *	Explanation follows: if protocol handler, mangling packet, will
 369 *	be the first on list, it is not able to sense, that packet
 370 *	is cloned and should be copied-on-write, so that it will
 371 *	change it and subsequent readers will get broken packet.
 372 *							--ANK (980803)
 373 */
 374
 375static inline struct list_head *ptype_head(const struct packet_type *pt)
 376{
 377	if (pt->type == htons(ETH_P_ALL))
 378		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 379	else
 380		return pt->dev ? &pt->dev->ptype_specific :
 381				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 382}
 383
 384/**
 385 *	dev_add_pack - add packet handler
 386 *	@pt: packet type declaration
 387 *
 388 *	Add a protocol handler to the networking stack. The passed &packet_type
 389 *	is linked into kernel lists and may not be freed until it has been
 390 *	removed from the kernel lists.
 391 *
 392 *	This call does not sleep therefore it can not
 393 *	guarantee all CPU's that are in middle of receiving packets
 394 *	will see the new packet type (until the next received packet).
 395 */
 396
 397void dev_add_pack(struct packet_type *pt)
 398{
 399	struct list_head *head = ptype_head(pt);
 400
 401	spin_lock(&ptype_lock);
 402	list_add_rcu(&pt->list, head);
 403	spin_unlock(&ptype_lock);
 404}
 405EXPORT_SYMBOL(dev_add_pack);
 406
 407/**
 408 *	__dev_remove_pack	 - remove packet handler
 409 *	@pt: packet type declaration
 410 *
 411 *	Remove a protocol handler that was previously added to the kernel
 412 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 413 *	from the kernel lists and can be freed or reused once this function
 414 *	returns.
 415 *
 416 *      The packet type might still be in use by receivers
 417 *	and must not be freed until after all the CPU's have gone
 418 *	through a quiescent state.
 419 */
 420void __dev_remove_pack(struct packet_type *pt)
 421{
 422	struct list_head *head = ptype_head(pt);
 423	struct packet_type *pt1;
 424
 425	spin_lock(&ptype_lock);
 426
 427	list_for_each_entry(pt1, head, list) {
 428		if (pt == pt1) {
 429			list_del_rcu(&pt->list);
 430			goto out;
 431		}
 432	}
 433
 434	pr_warn("dev_remove_pack: %p not found\n", pt);
 435out:
 436	spin_unlock(&ptype_lock);
 437}
 438EXPORT_SYMBOL(__dev_remove_pack);
 439
 440/**
 441 *	dev_remove_pack	 - remove packet handler
 442 *	@pt: packet type declaration
 443 *
 444 *	Remove a protocol handler that was previously added to the kernel
 445 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 446 *	from the kernel lists and can be freed or reused once this function
 447 *	returns.
 448 *
 449 *	This call sleeps to guarantee that no CPU is looking at the packet
 450 *	type after return.
 451 */
 452void dev_remove_pack(struct packet_type *pt)
 453{
 454	__dev_remove_pack(pt);
 455
 456	synchronize_net();
 457}
 458EXPORT_SYMBOL(dev_remove_pack);
 459
 460
 461/**
 462 *	dev_add_offload - register offload handlers
 463 *	@po: protocol offload declaration
 464 *
 465 *	Add protocol offload handlers to the networking stack. The passed
 466 *	&proto_offload is linked into kernel lists and may not be freed until
 467 *	it has been removed from the kernel lists.
 468 *
 469 *	This call does not sleep therefore it can not
 470 *	guarantee all CPU's that are in middle of receiving packets
 471 *	will see the new offload handlers (until the next received packet).
 472 */
 473void dev_add_offload(struct packet_offload *po)
 474{
 475	struct packet_offload *elem;
 476
 477	spin_lock(&offload_lock);
 478	list_for_each_entry(elem, &offload_base, list) {
 479		if (po->priority < elem->priority)
 480			break;
 481	}
 482	list_add_rcu(&po->list, elem->list.prev);
 483	spin_unlock(&offload_lock);
 484}
 485EXPORT_SYMBOL(dev_add_offload);
 486
 487/**
 488 *	__dev_remove_offload	 - remove offload handler
 489 *	@po: packet offload declaration
 490 *
 491 *	Remove a protocol offload handler that was previously added to the
 492 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 493 *	is removed from the kernel lists and can be freed or reused once this
 494 *	function returns.
 495 *
 496 *      The packet type might still be in use by receivers
 497 *	and must not be freed until after all the CPU's have gone
 498 *	through a quiescent state.
 499 */
 500static void __dev_remove_offload(struct packet_offload *po)
 501{
 502	struct list_head *head = &offload_base;
 503	struct packet_offload *po1;
 504
 505	spin_lock(&offload_lock);
 506
 507	list_for_each_entry(po1, head, list) {
 508		if (po == po1) {
 509			list_del_rcu(&po->list);
 510			goto out;
 511		}
 512	}
 513
 514	pr_warn("dev_remove_offload: %p not found\n", po);
 515out:
 516	spin_unlock(&offload_lock);
 517}
 518
 519/**
 520 *	dev_remove_offload	 - remove packet offload handler
 521 *	@po: packet offload declaration
 522 *
 523 *	Remove a packet offload handler that was previously added to the kernel
 524 *	offload handlers by dev_add_offload(). The passed &offload_type is
 525 *	removed from the kernel lists and can be freed or reused once this
 526 *	function returns.
 527 *
 528 *	This call sleeps to guarantee that no CPU is looking at the packet
 529 *	type after return.
 530 */
 531void dev_remove_offload(struct packet_offload *po)
 532{
 533	__dev_remove_offload(po);
 534
 535	synchronize_net();
 536}
 537EXPORT_SYMBOL(dev_remove_offload);
 538
 539/******************************************************************************
 540
 541		      Device Boot-time Settings Routines
 542
 543*******************************************************************************/
 544
 545/* Boot time configuration table */
 546static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 547
 548/**
 549 *	netdev_boot_setup_add	- add new setup entry
 550 *	@name: name of the device
 551 *	@map: configured settings for the device
 552 *
 553 *	Adds new setup entry to the dev_boot_setup list.  The function
 554 *	returns 0 on error and 1 on success.  This is a generic routine to
 555 *	all netdevices.
 556 */
 557static int netdev_boot_setup_add(char *name, struct ifmap *map)
 558{
 559	struct netdev_boot_setup *s;
 560	int i;
 561
 562	s = dev_boot_setup;
 563	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 564		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 565			memset(s[i].name, 0, sizeof(s[i].name));
 566			strlcpy(s[i].name, name, IFNAMSIZ);
 567			memcpy(&s[i].map, map, sizeof(s[i].map));
 568			break;
 569		}
 570	}
 571
 572	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 573}
 574
 575/**
 576 *	netdev_boot_setup_check	- check boot time settings
 577 *	@dev: the netdevice
 578 *
 579 * 	Check boot time settings for the device.
 580 *	The found settings are set for the device to be used
 581 *	later in the device probing.
 582 *	Returns 0 if no settings found, 1 if they are.
 583 */
 584int netdev_boot_setup_check(struct net_device *dev)
 585{
 586	struct netdev_boot_setup *s = dev_boot_setup;
 587	int i;
 588
 589	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 590		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 591		    !strcmp(dev->name, s[i].name)) {
 592			dev->irq 	= s[i].map.irq;
 593			dev->base_addr 	= s[i].map.base_addr;
 594			dev->mem_start 	= s[i].map.mem_start;
 595			dev->mem_end 	= s[i].map.mem_end;
 596			return 1;
 597		}
 598	}
 599	return 0;
 600}
 601EXPORT_SYMBOL(netdev_boot_setup_check);
 602
 603
 604/**
 605 *	netdev_boot_base	- get address from boot time settings
 606 *	@prefix: prefix for network device
 607 *	@unit: id for network device
 608 *
 609 * 	Check boot time settings for the base address of device.
 610 *	The found settings are set for the device to be used
 611 *	later in the device probing.
 612 *	Returns 0 if no settings found.
 613 */
 614unsigned long netdev_boot_base(const char *prefix, int unit)
 615{
 616	const struct netdev_boot_setup *s = dev_boot_setup;
 617	char name[IFNAMSIZ];
 618	int i;
 619
 620	sprintf(name, "%s%d", prefix, unit);
 621
 622	/*
 623	 * If device already registered then return base of 1
 624	 * to indicate not to probe for this interface
 625	 */
 626	if (__dev_get_by_name(&init_net, name))
 627		return 1;
 628
 629	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 630		if (!strcmp(name, s[i].name))
 631			return s[i].map.base_addr;
 632	return 0;
 633}
 634
 635/*
 636 * Saves at boot time configured settings for any netdevice.
 637 */
 638int __init netdev_boot_setup(char *str)
 639{
 640	int ints[5];
 641	struct ifmap map;
 642
 643	str = get_options(str, ARRAY_SIZE(ints), ints);
 644	if (!str || !*str)
 645		return 0;
 646
 647	/* Save settings */
 648	memset(&map, 0, sizeof(map));
 649	if (ints[0] > 0)
 650		map.irq = ints[1];
 651	if (ints[0] > 1)
 652		map.base_addr = ints[2];
 653	if (ints[0] > 2)
 654		map.mem_start = ints[3];
 655	if (ints[0] > 3)
 656		map.mem_end = ints[4];
 657
 658	/* Add new entry to the list */
 659	return netdev_boot_setup_add(str, &map);
 660}
 661
 662__setup("netdev=", netdev_boot_setup);
 663
 664/*******************************************************************************
 665
 666			    Device Interface Subroutines
 667
 668*******************************************************************************/
 669
 670/**
 671 *	dev_get_iflink	- get 'iflink' value of a interface
 672 *	@dev: targeted interface
 673 *
 674 *	Indicates the ifindex the interface is linked to.
 675 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 676 */
 677
 678int dev_get_iflink(const struct net_device *dev)
 679{
 680	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 681		return dev->netdev_ops->ndo_get_iflink(dev);
 682
 683	return dev->ifindex;
 684}
 685EXPORT_SYMBOL(dev_get_iflink);
 686
 687/**
 688 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 689 *	@dev: targeted interface
 690 *	@skb: The packet.
 691 *
 692 *	For better visibility of tunnel traffic OVS needs to retrieve
 693 *	egress tunnel information for a packet. Following API allows
 694 *	user to get this info.
 695 */
 696int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 697{
 698	struct ip_tunnel_info *info;
 699
 700	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 701		return -EINVAL;
 702
 703	info = skb_tunnel_info_unclone(skb);
 704	if (!info)
 705		return -ENOMEM;
 706	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 707		return -EINVAL;
 708
 709	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 710}
 711EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 712
 713/**
 714 *	__dev_get_by_name	- find a device by its name
 715 *	@net: the applicable net namespace
 716 *	@name: name to find
 717 *
 718 *	Find an interface by name. Must be called under RTNL semaphore
 719 *	or @dev_base_lock. If the name is found a pointer to the device
 720 *	is returned. If the name is not found then %NULL is returned. The
 721 *	reference counters are not incremented so the caller must be
 722 *	careful with locks.
 723 */
 724
 725struct net_device *__dev_get_by_name(struct net *net, const char *name)
 726{
 727	struct net_device *dev;
 728	struct hlist_head *head = dev_name_hash(net, name);
 729
 730	hlist_for_each_entry(dev, head, name_hlist)
 731		if (!strncmp(dev->name, name, IFNAMSIZ))
 732			return dev;
 733
 734	return NULL;
 735}
 736EXPORT_SYMBOL(__dev_get_by_name);
 737
 738/**
 739 *	dev_get_by_name_rcu	- find a device by its name
 740 *	@net: the applicable net namespace
 741 *	@name: name to find
 742 *
 743 *	Find an interface by name.
 744 *	If the name is found a pointer to the device is returned.
 745 * 	If the name is not found then %NULL is returned.
 746 *	The reference counters are not incremented so the caller must be
 747 *	careful with locks. The caller must hold RCU lock.
 748 */
 749
 750struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 751{
 752	struct net_device *dev;
 753	struct hlist_head *head = dev_name_hash(net, name);
 754
 755	hlist_for_each_entry_rcu(dev, head, name_hlist)
 756		if (!strncmp(dev->name, name, IFNAMSIZ))
 757			return dev;
 758
 759	return NULL;
 760}
 761EXPORT_SYMBOL(dev_get_by_name_rcu);
 762
 763/**
 764 *	dev_get_by_name		- find a device by its name
 765 *	@net: the applicable net namespace
 766 *	@name: name to find
 767 *
 768 *	Find an interface by name. This can be called from any
 769 *	context and does its own locking. The returned handle has
 770 *	the usage count incremented and the caller must use dev_put() to
 771 *	release it when it is no longer needed. %NULL is returned if no
 772 *	matching device is found.
 773 */
 774
 775struct net_device *dev_get_by_name(struct net *net, const char *name)
 776{
 777	struct net_device *dev;
 778
 779	rcu_read_lock();
 780	dev = dev_get_by_name_rcu(net, name);
 781	if (dev)
 782		dev_hold(dev);
 783	rcu_read_unlock();
 784	return dev;
 785}
 786EXPORT_SYMBOL(dev_get_by_name);
 787
 788/**
 789 *	__dev_get_by_index - find a device by its ifindex
 790 *	@net: the applicable net namespace
 791 *	@ifindex: index of device
 792 *
 793 *	Search for an interface by index. Returns %NULL if the device
 794 *	is not found or a pointer to the device. The device has not
 795 *	had its reference counter increased so the caller must be careful
 796 *	about locking. The caller must hold either the RTNL semaphore
 797 *	or @dev_base_lock.
 798 */
 799
 800struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 801{
 802	struct net_device *dev;
 803	struct hlist_head *head = dev_index_hash(net, ifindex);
 804
 805	hlist_for_each_entry(dev, head, index_hlist)
 806		if (dev->ifindex == ifindex)
 807			return dev;
 808
 809	return NULL;
 810}
 811EXPORT_SYMBOL(__dev_get_by_index);
 812
 813/**
 814 *	dev_get_by_index_rcu - find a device by its ifindex
 815 *	@net: the applicable net namespace
 816 *	@ifindex: index of device
 817 *
 818 *	Search for an interface by index. Returns %NULL if the device
 819 *	is not found or a pointer to the device. The device has not
 820 *	had its reference counter increased so the caller must be careful
 821 *	about locking. The caller must hold RCU lock.
 822 */
 823
 824struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 825{
 826	struct net_device *dev;
 827	struct hlist_head *head = dev_index_hash(net, ifindex);
 828
 829	hlist_for_each_entry_rcu(dev, head, index_hlist)
 830		if (dev->ifindex == ifindex)
 831			return dev;
 832
 833	return NULL;
 834}
 835EXPORT_SYMBOL(dev_get_by_index_rcu);
 836
 837
 838/**
 839 *	dev_get_by_index - find a device by its ifindex
 840 *	@net: the applicable net namespace
 841 *	@ifindex: index of device
 842 *
 843 *	Search for an interface by index. Returns NULL if the device
 844 *	is not found or a pointer to the device. The device returned has
 845 *	had a reference added and the pointer is safe until the user calls
 846 *	dev_put to indicate they have finished with it.
 847 */
 848
 849struct net_device *dev_get_by_index(struct net *net, int ifindex)
 850{
 851	struct net_device *dev;
 852
 853	rcu_read_lock();
 854	dev = dev_get_by_index_rcu(net, ifindex);
 855	if (dev)
 856		dev_hold(dev);
 857	rcu_read_unlock();
 858	return dev;
 859}
 860EXPORT_SYMBOL(dev_get_by_index);
 861
 862/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 863 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 864 *	@net: network namespace
 865 *	@name: a pointer to the buffer where the name will be stored.
 866 *	@ifindex: the ifindex of the interface to get the name from.
 867 *
 868 *	The use of raw_seqcount_begin() and cond_resched() before
 869 *	retrying is required as we want to give the writers a chance
 870 *	to complete when CONFIG_PREEMPT is not set.
 871 */
 872int netdev_get_name(struct net *net, char *name, int ifindex)
 873{
 874	struct net_device *dev;
 875	unsigned int seq;
 876
 877retry:
 878	seq = raw_seqcount_begin(&devnet_rename_seq);
 879	rcu_read_lock();
 880	dev = dev_get_by_index_rcu(net, ifindex);
 881	if (!dev) {
 882		rcu_read_unlock();
 883		return -ENODEV;
 884	}
 885
 886	strcpy(name, dev->name);
 887	rcu_read_unlock();
 888	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 889		cond_resched();
 890		goto retry;
 891	}
 892
 893	return 0;
 894}
 895
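/*
 * Example usage (sketch; the caller is illustrative): resolving an
 * ifindex to a name. The destination buffer must hold at least
 * IFNAMSIZ bytes.
 */
static void example_print_name(struct net *net, int ifindex)
{
	char name[IFNAMSIZ];

	if (!netdev_get_name(net, name, ifindex))
		pr_info("ifindex %d is %s\n", ifindex, name);
}
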
 896/**
 897 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 898 *	@net: the applicable net namespace
 899 *	@type: media type of device
 900 *	@ha: hardware address
 901 *
  902 *	Search for an interface by MAC address. Returns a pointer to the
  903 *	device, or NULL if the device is not found.
  904 *	The caller must hold RCU or RTNL.
  905 *	The returned device has not had its ref count increased
  906 *	and the caller must therefore be careful about locking.
 907 *
 908 */
 909
 910struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 911				       const char *ha)
 912{
 913	struct net_device *dev;
 914
 915	for_each_netdev_rcu(net, dev)
 916		if (dev->type == type &&
 917		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 918			return dev;
 919
 920	return NULL;
 921}
 922EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 923
 924struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 925{
 926	struct net_device *dev;
 927
 928	ASSERT_RTNL();
 929	for_each_netdev(net, dev)
 930		if (dev->type == type)
 931			return dev;
 932
 933	return NULL;
 934}
 935EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 936
 937struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 938{
 939	struct net_device *dev, *ret = NULL;
 940
 941	rcu_read_lock();
 942	for_each_netdev_rcu(net, dev)
 943		if (dev->type == type) {
 944			dev_hold(dev);
 945			ret = dev;
 946			break;
 947		}
 948	rcu_read_unlock();
 949	return ret;
 950}
 951EXPORT_SYMBOL(dev_getfirstbyhwtype);
 952
 953/**
 954 *	__dev_get_by_flags - find any device with given flags
 955 *	@net: the applicable net namespace
 956 *	@if_flags: IFF_* values
 957 *	@mask: bitmask of bits in if_flags to check
 958 *
  959 *	Search for any interface with the given flags. Returns a pointer to
  960 *	the device, or NULL if none is found. Must be called inside
  961 *	rtnl_lock(), and the result's refcount is unchanged.
 962 */
 963
 964struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 965				      unsigned short mask)
 966{
 967	struct net_device *dev, *ret;
 968
 969	ASSERT_RTNL();
 970
 971	ret = NULL;
 972	for_each_netdev(net, dev) {
 973		if (((dev->flags ^ if_flags) & mask) == 0) {
 974			ret = dev;
 975			break;
 976		}
 977	}
 978	return ret;
 979}
 980EXPORT_SYMBOL(__dev_get_by_flags);
 981
 982/**
 983 *	dev_valid_name - check if name is okay for network device
 984 *	@name: name string
 985 *
  986 *	Network device names need to be valid file names
  987 *	to allow sysfs to work.  We also disallow any kind of
 988 *	whitespace.
 989 */
 990bool dev_valid_name(const char *name)
 991{
 992	if (*name == '\0')
 993		return false;
 994	if (strlen(name) >= IFNAMSIZ)
 995		return false;
 996	if (!strcmp(name, ".") || !strcmp(name, ".."))
 997		return false;
 998
 999	while (*name) {
1000		if (*name == '/' || *name == ':' || isspace(*name))
1001			return false;
1002		name++;
1003	}
1004	return true;
1005}
1006EXPORT_SYMBOL(dev_valid_name);
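
/*
 * A few illustrative checks against the rules above (sketch only):
 */
static void example_name_checks(void)
{
	bool a = dev_valid_name("eth0");	/* true */
	bool b = dev_valid_name("");		/* false: empty */
	bool c = dev_valid_name("..");		/* false: reserved */
	bool d = dev_valid_name("a b");		/* false: whitespace */
	bool e = dev_valid_name("a/b");		/* false: '/' */

	pr_debug("%d %d %d %d %d\n", a, b, c, d, e);
}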
1007
1008/**
1009 *	__dev_alloc_name - allocate a name for a device
1010 *	@net: network namespace to allocate the device name in
1011 *	@name: name format string
1012 *	@buf:  scratch buffer and result name string
1013 *
 1014 *	Passed a format string - eg "lt%d" - it will try and find a suitable
 1015 *	id. It scans the list of devices to build up a free map, then chooses
1016 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1017 *	while allocating the name and adding the device in order to avoid
1018 *	duplicates.
1019 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1020 *	Returns the number of the unit assigned or a negative errno code.
1021 */
1022
1023static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1024{
1025	int i = 0;
1026	const char *p;
1027	const int max_netdevices = 8*PAGE_SIZE;
1028	unsigned long *inuse;
1029	struct net_device *d;
1030
1031	p = strnchr(name, IFNAMSIZ-1, '%');
1032	if (p) {
1033		/*
1034		 * Verify the string as this thing may have come from
1035		 * the user.  There must be either one "%d" and no other "%"
1036		 * characters.
1037		 */
1038		if (p[1] != 'd' || strchr(p + 2, '%'))
1039			return -EINVAL;
1040
1041		/* Use one page as a bit array of possible slots */
1042		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1043		if (!inuse)
1044			return -ENOMEM;
1045
1046		for_each_netdev(net, d) {
1047			if (!sscanf(d->name, name, &i))
1048				continue;
1049			if (i < 0 || i >= max_netdevices)
1050				continue;
1051
1052			/*  avoid cases where sscanf is not exact inverse of printf */
1053			snprintf(buf, IFNAMSIZ, name, i);
1054			if (!strncmp(buf, d->name, IFNAMSIZ))
1055				set_bit(i, inuse);
1056		}
1057
1058		i = find_first_zero_bit(inuse, max_netdevices);
1059		free_page((unsigned long) inuse);
1060	}
1061
1062	if (buf != name)
1063		snprintf(buf, IFNAMSIZ, name, i);
1064	if (!__dev_get_by_name(net, buf))
1065		return i;
1066
1067	/* It is possible to run out of possible slots
1068	 * when the name is long and there isn't enough space left
1069	 * for the digits, or if all bits are used.
1070	 */
1071	return -ENFILE;
1072}
1073
1074/**
1075 *	dev_alloc_name - allocate a name for a device
1076 *	@dev: device
1077 *	@name: name format string
1078 *
 1079 *	Passed a format string - eg "lt%d" - it will try and find a suitable
 1080 *	id. It scans the list of devices to build up a free map, then chooses
1081 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1082 *	while allocating the name and adding the device in order to avoid
1083 *	duplicates.
1084 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085 *	Returns the number of the unit assigned or a negative errno code.
1086 */
1087
1088int dev_alloc_name(struct net_device *dev, const char *name)
1089{
1090	char buf[IFNAMSIZ];
1091	struct net *net;
1092	int ret;
1093
1094	BUG_ON(!dev_net(dev));
1095	net = dev_net(dev);
1096	ret = __dev_alloc_name(net, name, buf);
1097	if (ret >= 0)
1098		strlcpy(dev->name, buf, IFNAMSIZ);
1099	return ret;
1100}
1101EXPORT_SYMBOL(dev_alloc_name);
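
/*
 * Example usage (sketch; the "veth%d" format and helper are illustrative):
 * with RTNL held, the first free unit for the format is assigned and
 * written into dev->name.
 */
static int example_name_device(struct net_device *dev)
{
	int unit;

	unit = dev_alloc_name(dev, "veth%d");	/* dev->name becomes e.g. "veth0" */
	if (unit < 0)
		return unit;			/* -EINVAL or -ENFILE */
	return 0;
}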
1102
1103static int dev_alloc_name_ns(struct net *net,
1104			     struct net_device *dev,
1105			     const char *name)
1106{
1107	char buf[IFNAMSIZ];
1108	int ret;
1109
1110	ret = __dev_alloc_name(net, name, buf);
1111	if (ret >= 0)
1112		strlcpy(dev->name, buf, IFNAMSIZ);
1113	return ret;
1114}
1115
1116static int dev_get_valid_name(struct net *net,
1117			      struct net_device *dev,
1118			      const char *name)
1119{
1120	BUG_ON(!net);
1121
1122	if (!dev_valid_name(name))
1123		return -EINVAL;
1124
1125	if (strchr(name, '%'))
1126		return dev_alloc_name_ns(net, dev, name);
1127	else if (__dev_get_by_name(net, name))
1128		return -EEXIST;
1129	else if (dev->name != name)
1130		strlcpy(dev->name, name, IFNAMSIZ);
1131
1132	return 0;
1133}
1134
1135/**
1136 *	dev_change_name - change name of a device
1137 *	@dev: device
1138 *	@newname: name (or format string) must be at least IFNAMSIZ
1139 *
 1140 *	Change name of a device; can pass format strings "eth%d"
 1141 *	for wildcarding.
1142 */
1143int dev_change_name(struct net_device *dev, const char *newname)
1144{
1145	unsigned char old_assign_type;
1146	char oldname[IFNAMSIZ];
1147	int err = 0;
1148	int ret;
1149	struct net *net;
1150
1151	ASSERT_RTNL();
1152	BUG_ON(!dev_net(dev));
1153
1154	net = dev_net(dev);
1155	if (dev->flags & IFF_UP)
1156		return -EBUSY;
1157
1158	write_seqcount_begin(&devnet_rename_seq);
1159
1160	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1161		write_seqcount_end(&devnet_rename_seq);
1162		return 0;
1163	}
1164
1165	memcpy(oldname, dev->name, IFNAMSIZ);
1166
1167	err = dev_get_valid_name(net, dev, newname);
1168	if (err < 0) {
1169		write_seqcount_end(&devnet_rename_seq);
1170		return err;
1171	}
1172
1173	if (oldname[0] && !strchr(oldname, '%'))
1174		netdev_info(dev, "renamed from %s\n", oldname);
1175
1176	old_assign_type = dev->name_assign_type;
1177	dev->name_assign_type = NET_NAME_RENAMED;
1178
1179rollback:
1180	ret = device_rename(&dev->dev, dev->name);
1181	if (ret) {
1182		memcpy(dev->name, oldname, IFNAMSIZ);
1183		dev->name_assign_type = old_assign_type;
1184		write_seqcount_end(&devnet_rename_seq);
1185		return ret;
1186	}
1187
1188	write_seqcount_end(&devnet_rename_seq);
1189
1190	netdev_adjacent_rename_links(dev, oldname);
1191
1192	write_lock_bh(&dev_base_lock);
1193	hlist_del_rcu(&dev->name_hlist);
1194	write_unlock_bh(&dev_base_lock);
1195
1196	synchronize_rcu();
1197
1198	write_lock_bh(&dev_base_lock);
1199	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1200	write_unlock_bh(&dev_base_lock);
1201
1202	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1203	ret = notifier_to_errno(ret);
1204
1205	if (ret) {
1206		/* err >= 0 after dev_alloc_name() or stores the first errno */
1207		if (err >= 0) {
1208			err = ret;
1209			write_seqcount_begin(&devnet_rename_seq);
1210			memcpy(dev->name, oldname, IFNAMSIZ);
1211			memcpy(oldname, newname, IFNAMSIZ);
1212			dev->name_assign_type = old_assign_type;
1213			old_assign_type = NET_NAME_RENAMED;
1214			goto rollback;
1215		} else {
1216			pr_err("%s: name change rollback failed: %d\n",
1217			       dev->name, ret);
1218		}
1219	}
1220
1221	return err;
1222}
1223
1224/**
1225 *	dev_set_alias - change ifalias of a device
1226 *	@dev: device
1227 *	@alias: name up to IFALIASZ
 1228 *	@len: limit of bytes to copy from @alias
 1229 *
 1230 *	Set ifalias for a device.
1231 */
1232int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233{
1234	char *new_ifalias;
1235
1236	ASSERT_RTNL();
1237
1238	if (len >= IFALIASZ)
1239		return -EINVAL;
1240
1241	if (!len) {
1242		kfree(dev->ifalias);
1243		dev->ifalias = NULL;
1244		return 0;
1245	}
1246
1247	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248	if (!new_ifalias)
1249		return -ENOMEM;
1250	dev->ifalias = new_ifalias;
1251
1252	strlcpy(dev->ifalias, alias, len+1);
1253	return len;
1254}
1255
1256
1257/**
1258 *	netdev_features_change - device changes features
1259 *	@dev: device to cause notification
1260 *
1261 *	Called to indicate a device has changed features.
1262 */
1263void netdev_features_change(struct net_device *dev)
1264{
1265	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266}
1267EXPORT_SYMBOL(netdev_features_change);
1268
1269/**
1270 *	netdev_state_change - device changes state
1271 *	@dev: device to cause notification
1272 *
1273 *	Called to indicate a device has changed state. This function calls
1274 *	the notifier chains for netdev_chain and sends a NEWLINK message
1275 *	to the routing socket.
1276 */
1277void netdev_state_change(struct net_device *dev)
1278{
1279	if (dev->flags & IFF_UP) {
1280		struct netdev_notifier_change_info change_info;
1281
1282		change_info.flags_changed = 0;
1283		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284					      &change_info.info);
1285		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286	}
1287}
1288EXPORT_SYMBOL(netdev_state_change);
1289
1290/**
1291 * 	netdev_notify_peers - notify network peers about existence of @dev
1292 * 	@dev: network device
1293 *
1294 * Generate traffic such that interested network peers are aware of
1295 * @dev, such as by generating a gratuitous ARP. This may be used when
1296 * a device wants to inform the rest of the network about some sort of
1297 * reconfiguration such as a failover event or virtual machine
1298 * migration.
1299 */
1300void netdev_notify_peers(struct net_device *dev)
1301{
1302	rtnl_lock();
1303	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304	rtnl_unlock();
1305}
1306EXPORT_SYMBOL(netdev_notify_peers);
1307
1308static int __dev_open(struct net_device *dev)
1309{
1310	const struct net_device_ops *ops = dev->netdev_ops;
1311	int ret;
1312
1313	ASSERT_RTNL();
1314
1315	if (!netif_device_present(dev))
1316		return -ENODEV;
1317
1318	/* Block netpoll from trying to do any rx path servicing.
1319	 * If we don't do this there is a chance ndo_poll_controller
1320	 * or ndo_poll may be running while we open the device
1321	 */
1322	netpoll_poll_disable(dev);
1323
1324	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325	ret = notifier_to_errno(ret);
1326	if (ret)
1327		return ret;
1328
1329	set_bit(__LINK_STATE_START, &dev->state);
1330
1331	if (ops->ndo_validate_addr)
1332		ret = ops->ndo_validate_addr(dev);
1333
1334	if (!ret && ops->ndo_open)
1335		ret = ops->ndo_open(dev);
1336
1337	netpoll_poll_enable(dev);
1338
1339	if (ret)
1340		clear_bit(__LINK_STATE_START, &dev->state);
1341	else {
1342		dev->flags |= IFF_UP;
1343		dev_set_rx_mode(dev);
1344		dev_activate(dev);
1345		add_device_randomness(dev->dev_addr, dev->addr_len);
1346	}
1347
1348	return ret;
1349}
1350
1351/**
1352 *	dev_open	- prepare an interface for use.
1353 *	@dev:	device to open
1354 *
1355 *	Takes a device from down to up state. The device's private open
1356 *	function is invoked and then the multicast lists are loaded. Finally
1357 *	the device is moved into the up state and a %NETDEV_UP message is
1358 *	sent to the netdev notifier chain.
1359 *
1360 *	Calling this function on an active interface is a nop. On a failure
1361 *	a negative errno code is returned.
1362 */
1363int dev_open(struct net_device *dev)
1364{
1365	int ret;
1366
1367	if (dev->flags & IFF_UP)
1368		return 0;
1369
1370	ret = __dev_open(dev);
1371	if (ret < 0)
1372		return ret;
1373
1374	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375	call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377	return ret;
1378}
1379EXPORT_SYMBOL(dev_open);
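
/*
 * Example usage (sketch; the caller is illustrative): dev_open() must run
 * under rtnl_lock(), which is also what makes the lock-free
 * __dev_get_by_name() safe here.
 */
static int example_bring_up(struct net *net, const char *ifname)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, ifname);
	if (dev)
		err = dev_open(dev);
	rtnl_unlock();
	return err;
}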
1380
1381static int __dev_close_many(struct list_head *head)
1382{
1383	struct net_device *dev;
1384
1385	ASSERT_RTNL();
1386	might_sleep();
1387
1388	list_for_each_entry(dev, head, close_list) {
1389		/* Temporarily disable netpoll until the interface is down */
1390		netpoll_poll_disable(dev);
1391
1392		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1393
1394		clear_bit(__LINK_STATE_START, &dev->state);
1395
1396		/* Synchronize to scheduled poll. We cannot touch poll list, it
 1397		 * may even be on a different cpu. So just clear netif_running().
1398		 *
 1399		 * dev->stop() will invoke napi_disable() on all of its
1400		 * napi_struct instances on this device.
1401		 */
1402		smp_mb__after_atomic(); /* Commit netif_running(). */
1403	}
1404
1405	dev_deactivate_many(head);
1406
1407	list_for_each_entry(dev, head, close_list) {
1408		const struct net_device_ops *ops = dev->netdev_ops;
1409
1410		/*
 1411		 *	Call the device specific close. This cannot fail.
 1412		 *	It is only called if the device is UP.
1413		 *
1414		 *	We allow it to be called even after a DETACH hot-plug
1415		 *	event.
1416		 */
1417		if (ops->ndo_stop)
1418			ops->ndo_stop(dev);
1419
1420		dev->flags &= ~IFF_UP;
1421		netpoll_poll_enable(dev);
1422	}
1423
1424	return 0;
1425}
1426
1427static int __dev_close(struct net_device *dev)
1428{
1429	int retval;
1430	LIST_HEAD(single);
1431
1432	list_add(&dev->close_list, &single);
1433	retval = __dev_close_many(&single);
1434	list_del(&single);
1435
1436	return retval;
1437}
1438
1439int dev_close_many(struct list_head *head, bool unlink)
1440{
1441	struct net_device *dev, *tmp;
1442
1443	/* Remove the devices that don't need to be closed */
1444	list_for_each_entry_safe(dev, tmp, head, close_list)
1445		if (!(dev->flags & IFF_UP))
1446			list_del_init(&dev->close_list);
1447
1448	__dev_close_many(head);
1449
1450	list_for_each_entry_safe(dev, tmp, head, close_list) {
1451		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452		call_netdevice_notifiers(NETDEV_DOWN, dev);
1453		if (unlink)
1454			list_del_init(&dev->close_list);
1455	}
1456
1457	return 0;
1458}
1459EXPORT_SYMBOL(dev_close_many);
1460
1461/**
1462 *	dev_close - shutdown an interface.
1463 *	@dev: device to shutdown
1464 *
1465 *	This function moves an active device into down state. A
1466 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468 *	chain.
1469 */
1470int dev_close(struct net_device *dev)
1471{
1472	if (dev->flags & IFF_UP) {
1473		LIST_HEAD(single);
1474
1475		list_add(&dev->close_list, &single);
1476		dev_close_many(&single, true);
1477		list_del(&single);
1478	}
1479	return 0;
1480}
1481EXPORT_SYMBOL(dev_close);
1482
1483
1484/**
1485 *	dev_disable_lro - disable Large Receive Offload on a device
1486 *	@dev: device
1487 *
1488 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1489 *	called under RTNL.  This is needed if received packets may be
1490 *	forwarded to another interface.
1491 */
1492void dev_disable_lro(struct net_device *dev)
1493{
1494	struct net_device *lower_dev;
1495	struct list_head *iter;
1496
1497	dev->wanted_features &= ~NETIF_F_LRO;
1498	netdev_update_features(dev);
1499
1500	if (unlikely(dev->features & NETIF_F_LRO))
1501		netdev_WARN(dev, "failed to disable LRO!\n");
1502
1503	netdev_for_each_lower_dev(dev, lower_dev, iter)
1504		dev_disable_lro(lower_dev);
1505}
1506EXPORT_SYMBOL(dev_disable_lro);
1507
1508static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509				   struct net_device *dev)
1510{
1511	struct netdev_notifier_info info;
1512
1513	netdev_notifier_info_init(&info, dev);
1514	return nb->notifier_call(nb, val, &info);
1515}
1516
1517static int dev_boot_phase = 1;
1518
1519/**
1520 *	register_netdevice_notifier - register a network notifier block
1521 *	@nb: notifier
1522 *
1523 *	Register a notifier to be called when network device events occur.
1524 *	The notifier passed is linked into the kernel structures and must
1525 *	not be reused until it has been unregistered. A negative errno code
1526 *	is returned on a failure.
1527 *
 1528 * 	When registered, all registration and up events are replayed
 1529 *	to the new notifier to allow it to have a race-free
 1530 *	view of the network device list.
1531 */
1532
1533int register_netdevice_notifier(struct notifier_block *nb)
1534{
1535	struct net_device *dev;
1536	struct net_device *last;
1537	struct net *net;
1538	int err;
1539
1540	rtnl_lock();
1541	err = raw_notifier_chain_register(&netdev_chain, nb);
1542	if (err)
1543		goto unlock;
1544	if (dev_boot_phase)
1545		goto unlock;
1546	for_each_net(net) {
1547		for_each_netdev(net, dev) {
1548			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1549			err = notifier_to_errno(err);
1550			if (err)
1551				goto rollback;
1552
1553			if (!(dev->flags & IFF_UP))
1554				continue;
1555
1556			call_netdevice_notifier(nb, NETDEV_UP, dev);
1557		}
1558	}
1559
1560unlock:
1561	rtnl_unlock();
1562	return err;
1563
1564rollback:
1565	last = dev;
1566	for_each_net(net) {
1567		for_each_netdev(net, dev) {
1568			if (dev == last)
1569				goto outroll;
1570
1571			if (dev->flags & IFF_UP) {
1572				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1573							dev);
1574				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1575			}
1576			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1577		}
1578	}
1579
1580outroll:
1581	raw_notifier_chain_unregister(&netdev_chain, nb);
1582	goto unlock;
1583}
1584EXPORT_SYMBOL(register_netdevice_notifier);
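
/*
 * Example notifier (sketch; all names hypothetical): the device is
 * recovered from the opaque info pointer with
 * netdev_notifier_info_to_dev().
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		netdev_info(dev, "interface is up\n");
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};
/* Registered with register_netdevice_notifier(&example_nb); */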
1585
1586/**
1587 *	unregister_netdevice_notifier - unregister a network notifier block
1588 *	@nb: notifier
1589 *
1590 *	Unregister a notifier previously registered by
 1591 *	register_netdevice_notifier(). The notifier is unlinked from the
1592 *	kernel structures and may then be reused. A negative errno code
1593 *	is returned on a failure.
1594 *
 1595 * 	After unregistering, unregister and down device events are synthesized
 1596 *	for all devices on the device list and sent to the removed notifier,
 1597 *	removing the need for special-case cleanup code.
1598 */
1599
1600int unregister_netdevice_notifier(struct notifier_block *nb)
1601{
1602	struct net_device *dev;
1603	struct net *net;
1604	int err;
1605
1606	rtnl_lock();
1607	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608	if (err)
1609		goto unlock;
1610
1611	for_each_net(net) {
1612		for_each_netdev(net, dev) {
1613			if (dev->flags & IFF_UP) {
1614				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615							dev);
1616				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617			}
1618			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1619		}
1620	}
1621unlock:
1622	rtnl_unlock();
1623	return err;
1624}
1625EXPORT_SYMBOL(unregister_netdevice_notifier);
1626
1627/**
1628 *	call_netdevice_notifiers_info - call all network notifier blocks
1629 *	@val: value passed unmodified to notifier function
1630 *	@dev: net_device pointer passed unmodified to notifier function
1631 *	@info: notifier information data
1632 *
1633 *	Call all network notifier blocks.  Parameters and return value
1634 *	are as for raw_notifier_call_chain().
1635 */
1636
1637static int call_netdevice_notifiers_info(unsigned long val,
1638					 struct net_device *dev,
1639					 struct netdev_notifier_info *info)
1640{
1641	ASSERT_RTNL();
1642	netdev_notifier_info_init(info, dev);
1643	return raw_notifier_call_chain(&netdev_chain, val, info);
1644}
1645
1646/**
1647 *	call_netdevice_notifiers - call all network notifier blocks
1648 *      @val: value passed unmodified to notifier function
1649 *      @dev: net_device pointer passed unmodified to notifier function
1650 *
1651 *	Call all network notifier blocks.  Parameters and return value
1652 *	are as for raw_notifier_call_chain().
1653 */
1654
1655int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1656{
1657	struct netdev_notifier_info info;
1658
1659	return call_netdevice_notifiers_info(val, dev, &info);
1660}
1661EXPORT_SYMBOL(call_netdevice_notifiers);
1662
1663#ifdef CONFIG_NET_INGRESS
1664static struct static_key ingress_needed __read_mostly;
1665
1666void net_inc_ingress_queue(void)
1667{
1668	static_key_slow_inc(&ingress_needed);
1669}
1670EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1671
1672void net_dec_ingress_queue(void)
1673{
1674	static_key_slow_dec(&ingress_needed);
1675}
1676EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1677#endif
1678
1679#ifdef CONFIG_NET_EGRESS
1680static struct static_key egress_needed __read_mostly;
1681
1682void net_inc_egress_queue(void)
1683{
1684	static_key_slow_inc(&egress_needed);
1685}
1686EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1687
1688void net_dec_egress_queue(void)
1689{
1690	static_key_slow_dec(&egress_needed);
1691}
1692EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1693#endif
1694
1695static struct static_key netstamp_needed __read_mostly;
1696#ifdef HAVE_JUMP_LABEL
1697/* We are not allowed to call static_key_slow_dec() from irq context
1698 * If net_disable_timestamp() is called from irq context, defer the
1699 * static_key_slow_dec() calls.
1700 */
1701static atomic_t netstamp_needed_deferred;
1702#endif
1703
1704void net_enable_timestamp(void)
1705{
1706#ifdef HAVE_JUMP_LABEL
1707	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1708
1709	if (deferred) {
1710		while (--deferred)
1711			static_key_slow_dec(&netstamp_needed);
1712		return;
1713	}
1714#endif
1715	static_key_slow_inc(&netstamp_needed);
1716}
1717EXPORT_SYMBOL(net_enable_timestamp);
1718
1719void net_disable_timestamp(void)
1720{
1721#ifdef HAVE_JUMP_LABEL
1722	if (in_interrupt()) {
1723		atomic_inc(&netstamp_needed_deferred);
1724		return;
1725	}
1726#endif
1727	static_key_slow_dec(&netstamp_needed);
1728}
1729EXPORT_SYMBOL(net_disable_timestamp);
1730
1731static inline void net_timestamp_set(struct sk_buff *skb)
1732{
1733	skb->tstamp.tv64 = 0;
1734	if (static_key_false(&netstamp_needed))
1735		__net_timestamp(skb);
1736}
1737
1738#define net_timestamp_check(COND, SKB)			\
1739	if (static_key_false(&netstamp_needed)) {		\
1740		if ((COND) && !(SKB)->tstamp.tv64)	\
1741			__net_timestamp(SKB);		\
1742	}						\
1743
1744bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1745{
1746	unsigned int len;
1747
1748	if (!(dev->flags & IFF_UP))
1749		return false;
1750
1751	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1752	if (skb->len <= len)
1753		return true;
1754
1755	/* if TSO is enabled, we don't care about the length as the packet
 1756	 * could be forwarded without being segmented beforehand
1757	 */
1758	if (skb_is_gso(skb))
1759		return true;
1760
1761	return false;
1762}
1763EXPORT_SYMBOL_GPL(is_skb_forwardable);
1764
1765int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1766{
1767	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1768	    unlikely(!is_skb_forwardable(dev, skb))) {
1769		atomic_long_inc(&dev->rx_dropped);
1770		kfree_skb(skb);
1771		return NET_RX_DROP;
1772	}
1773
1774	skb_scrub_packet(skb, true);
1775	skb->priority = 0;
1776	skb->protocol = eth_type_trans(skb, dev);
1777	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1778
1779	return 0;
1780}
1781EXPORT_SYMBOL_GPL(__dev_forward_skb);
1782
1783/**
1784 * dev_forward_skb - loopback an skb to another netif
1785 *
1786 * @dev: destination network device
1787 * @skb: buffer to forward
1788 *
1789 * return values:
1790 *	NET_RX_SUCCESS	(no congestion)
1791 *	NET_RX_DROP     (packet was dropped, but freed)
1792 *
1793 * dev_forward_skb can be used for injecting an skb from the
1794 * start_xmit function of one device into the receive queue
1795 * of another device.
1796 *
1797 * The receiving device may be in another namespace, so
1798 * we have to clear all information in the skb that could
1799 * impact namespace isolation.
1800 */
1801int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1802{
1803	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1804}
1805EXPORT_SYMBOL_GPL(dev_forward_skb);
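
/*
 * Example (sketch): a virtual device's transmit path injecting frames
 * into a peer with dev_forward_skb(). The peer is passed in directly for
 * brevity; a real driver would resolve it itself.
 */
static netdev_tx_t example_xmit_to_peer(struct sk_buff *skb,
					struct net_device *peer)
{
	/* The skb is consumed on both success and drop. */
	dev_forward_skb(peer, skb);
	return NETDEV_TX_OK;
}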
1806
1807static inline int deliver_skb(struct sk_buff *skb,
1808			      struct packet_type *pt_prev,
1809			      struct net_device *orig_dev)
1810{
1811	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1812		return -ENOMEM;
1813	atomic_inc(&skb->users);
1814	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1815}
1816
1817static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1818					  struct packet_type **pt,
1819					  struct net_device *orig_dev,
1820					  __be16 type,
1821					  struct list_head *ptype_list)
1822{
1823	struct packet_type *ptype, *pt_prev = *pt;
1824
1825	list_for_each_entry_rcu(ptype, ptype_list, list) {
1826		if (ptype->type != type)
1827			continue;
1828		if (pt_prev)
1829			deliver_skb(skb, pt_prev, orig_dev);
1830		pt_prev = ptype;
1831	}
1832	*pt = pt_prev;
1833}
1834
1835static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1836{
1837	if (!ptype->af_packet_priv || !skb->sk)
1838		return false;
1839
1840	if (ptype->id_match)
1841		return ptype->id_match(ptype, skb->sk);
1842	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1843		return true;
1844
1845	return false;
1846}
1847
1848/*
1849 *	Support routine. Sends outgoing frames to any network
1850 *	taps currently in use.
1851 */
1852
1853static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1854{
1855	struct packet_type *ptype;
1856	struct sk_buff *skb2 = NULL;
1857	struct packet_type *pt_prev = NULL;
1858	struct list_head *ptype_list = &ptype_all;
1859
1860	rcu_read_lock();
1861again:
1862	list_for_each_entry_rcu(ptype, ptype_list, list) {
1863		/* Never send packets back to the socket
1864		 * they originated from - MvS (miquels@drinkel.ow.org)
1865		 */
1866		if (skb_loop_sk(ptype, skb))
1867			continue;
1868
1869		if (pt_prev) {
1870			deliver_skb(skb2, pt_prev, skb->dev);
1871			pt_prev = ptype;
1872			continue;
1873		}
1874
1875		/* need to clone skb, done only once */
1876		skb2 = skb_clone(skb, GFP_ATOMIC);
1877		if (!skb2)
1878			goto out_unlock;
1879
1880		net_timestamp_set(skb2);
1881
1882		/* skb->nh should be correctly
 1883		 * set by the sender, so that the second statement is
1884		 * just protection against buggy protocols.
1885		 */
1886		skb_reset_mac_header(skb2);
1887
1888		if (skb_network_header(skb2) < skb2->data ||
1889		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1890			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1891					     ntohs(skb2->protocol),
1892					     dev->name);
1893			skb_reset_network_header(skb2);
1894		}
1895
1896		skb2->transport_header = skb2->network_header;
1897		skb2->pkt_type = PACKET_OUTGOING;
1898		pt_prev = ptype;
1899	}
1900
1901	if (ptype_list == &ptype_all) {
1902		ptype_list = &dev->ptype_all;
1903		goto again;
1904	}
1905out_unlock:
1906	if (pt_prev)
1907		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1908	rcu_read_unlock();
1909}
1910
1911/**
1912 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1913 * @dev: Network device
1914 * @txq: number of queues available
1915 *
1916 * If real_num_tx_queues is changed the tc mappings may no longer be
1917 * valid. To resolve this verify the tc mapping remains valid and if
 1918 * not, null the mapping. With no priorities mapping to this
 1919 * offset/count pair it will no longer be used. In the worst case, if TC0
 1920 * is invalid nothing can be done, so disable priority mappings. It is
 1921 * expected that drivers will fix this mapping if they can before
1922 * calling netif_set_real_num_tx_queues.
1923 */
1924static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1925{
1926	int i;
1927	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1928
1929	/* If TC0 is invalidated disable TC mapping */
1930	if (tc->offset + tc->count > txq) {
1931		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1932		dev->num_tc = 0;
1933		return;
1934	}
1935
1936	/* Invalidated prio to tc mappings set to TC0 */
1937	for (i = 1; i < TC_BITMASK + 1; i++) {
1938		int q = netdev_get_prio_tc_map(dev, i);
1939
1940		tc = &dev->tc_to_txq[q];
1941		if (tc->offset + tc->count > txq) {
1942			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1943				i, q);
1944			netdev_set_prio_tc_map(dev, i, 0);
1945		}
1946	}
1947}
1948
1949#ifdef CONFIG_XPS
1950static DEFINE_MUTEX(xps_map_mutex);
1951#define xmap_dereference(P)		\
1952	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1953
1954static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1955					int cpu, u16 index)
1956{
1957	struct xps_map *map = NULL;
1958	int pos;
1959
1960	if (dev_maps)
1961		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1962
1963	for (pos = 0; map && pos < map->len; pos++) {
1964		if (map->queues[pos] == index) {
1965			if (map->len > 1) {
1966				map->queues[pos] = map->queues[--map->len];
1967			} else {
1968				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1969				kfree_rcu(map, rcu);
1970				map = NULL;
1971			}
1972			break;
1973		}
1974	}
1975
1976	return map;
1977}
1978
1979static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1980{
1981	struct xps_dev_maps *dev_maps;
1982	int cpu, i;
1983	bool active = false;
1984
1985	mutex_lock(&xps_map_mutex);
1986	dev_maps = xmap_dereference(dev->xps_maps);
1987
1988	if (!dev_maps)
1989		goto out_no_maps;
1990
1991	for_each_possible_cpu(cpu) {
1992		for (i = index; i < dev->num_tx_queues; i++) {
1993			if (!remove_xps_queue(dev_maps, cpu, i))
1994				break;
1995		}
1996		if (i == dev->num_tx_queues)
1997			active = true;
1998	}
1999
2000	if (!active) {
2001		RCU_INIT_POINTER(dev->xps_maps, NULL);
2002		kfree_rcu(dev_maps, rcu);
2003	}
2004
2005	for (i = index; i < dev->num_tx_queues; i++)
2006		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2007					     NUMA_NO_NODE);
2008
2009out_no_maps:
2010	mutex_unlock(&xps_map_mutex);
2011}
2012
2013static struct xps_map *expand_xps_map(struct xps_map *map,
2014				      int cpu, u16 index)
2015{
2016	struct xps_map *new_map;
2017	int alloc_len = XPS_MIN_MAP_ALLOC;
2018	int i, pos;
2019
2020	for (pos = 0; map && pos < map->len; pos++) {
2021		if (map->queues[pos] != index)
2022			continue;
2023		return map;
2024	}
2025
2026	/* Need to add queue to this CPU's existing map */
2027	if (map) {
2028		if (pos < map->alloc_len)
2029			return map;
2030
2031		alloc_len = map->alloc_len * 2;
2032	}
2033
2034	/* Need to allocate new map to store queue on this CPU's map */
2035	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2036			       cpu_to_node(cpu));
2037	if (!new_map)
2038		return NULL;
2039
2040	for (i = 0; i < pos; i++)
2041		new_map->queues[i] = map->queues[i];
2042	new_map->alloc_len = alloc_len;
2043	new_map->len = pos;
2044
2045	return new_map;
2046}
2047
2048int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2049			u16 index)
2050{
2051	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2052	struct xps_map *map, *new_map;
2053	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2054	int cpu, numa_node_id = -2;
2055	bool active = false;
2056
2057	mutex_lock(&xps_map_mutex);
2058
2059	dev_maps = xmap_dereference(dev->xps_maps);
2060
2061	/* allocate memory for queue storage */
2062	for_each_online_cpu(cpu) {
2063		if (!cpumask_test_cpu(cpu, mask))
2064			continue;
2065
2066		if (!new_dev_maps)
2067			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2068		if (!new_dev_maps) {
2069			mutex_unlock(&xps_map_mutex);
2070			return -ENOMEM;
2071		}
2072
2073		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2074				 NULL;
2075
2076		map = expand_xps_map(map, cpu, index);
2077		if (!map)
2078			goto error;
2079
2080		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2081	}
2082
2083	if (!new_dev_maps)
2084		goto out_no_new_maps;
2085
2086	for_each_possible_cpu(cpu) {
2087		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2088			/* add queue to CPU maps */
2089			int pos = 0;
2090
2091			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2092			while ((pos < map->len) && (map->queues[pos] != index))
2093				pos++;
2094
2095			if (pos == map->len)
2096				map->queues[map->len++] = index;
2097#ifdef CONFIG_NUMA
2098			if (numa_node_id == -2)
2099				numa_node_id = cpu_to_node(cpu);
2100			else if (numa_node_id != cpu_to_node(cpu))
2101				numa_node_id = -1;
2102#endif
2103		} else if (dev_maps) {
2104			/* fill in the new device map from the old device map */
2105			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2106			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2107		}
2108
2109	}
2110
2111	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2112
2113	/* Cleanup old maps */
2114	if (dev_maps) {
2115		for_each_possible_cpu(cpu) {
2116			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2117			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2118			if (map && map != new_map)
2119				kfree_rcu(map, rcu);
2120		}
2121
2122		kfree_rcu(dev_maps, rcu);
2123	}
2124
2125	dev_maps = new_dev_maps;
2126	active = true;
2127
2128out_no_new_maps:
2129	/* update Tx queue numa node */
2130	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2131				     (numa_node_id >= 0) ? numa_node_id :
2132				     NUMA_NO_NODE);
2133
2134	if (!dev_maps)
2135		goto out_no_maps;
2136
2137	/* removes queue from unused CPUs */
2138	for_each_possible_cpu(cpu) {
2139		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2140			continue;
2141
2142		if (remove_xps_queue(dev_maps, cpu, index))
2143			active = true;
2144	}
2145
2146	/* free map if not active */
2147	if (!active) {
2148		RCU_INIT_POINTER(dev->xps_maps, NULL);
2149		kfree_rcu(dev_maps, rcu);
2150	}
2151
2152out_no_maps:
2153	mutex_unlock(&xps_map_mutex);
2154
2155	return 0;
2156error:
2157	/* remove any maps that we added */
2158	for_each_possible_cpu(cpu) {
2159		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2160		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2161				 NULL;
2162		if (new_map && new_map != map)
2163			kfree(new_map);
2164	}
2165
2166	mutex_unlock(&xps_map_mutex);
2167
2168	kfree(new_dev_maps);
2169	return -ENOMEM;
2170}
2171EXPORT_SYMBOL(netif_set_xps_queue);
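
/*
 * Example usage (sketch; names illustrative): a driver pinning transmit
 * queue @qid to a single CPU.
 */
static int example_pin_tx_queue(struct net_device *dev, u16 qid, int cpu)
{
	cpumask_var_t mask;
	int err;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_set_cpu(cpu, mask);
	err = netif_set_xps_queue(dev, mask, qid);
	free_cpumask_var(mask);
	return err;
}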
2172
2173#endif
2174/*
2175 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2176 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2177 */
2178int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2179{
2180	int rc;
2181
2182	if (txq < 1 || txq > dev->num_tx_queues)
2183		return -EINVAL;
2184
2185	if (dev->reg_state == NETREG_REGISTERED ||
2186	    dev->reg_state == NETREG_UNREGISTERING) {
2187		ASSERT_RTNL();
2188
2189		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2190						  txq);
2191		if (rc)
2192			return rc;
2193
2194		if (dev->num_tc)
2195			netif_setup_tc(dev, txq);
2196
2197		if (txq < dev->real_num_tx_queues) {
2198			qdisc_reset_all_tx_gt(dev, txq);
2199#ifdef CONFIG_XPS
2200			netif_reset_xps_queues_gt(dev, txq);
2201#endif
2202		}
2203	}
2204
2205	dev->real_num_tx_queues = txq;
2206	return 0;
2207}
2208EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2209
2210#ifdef CONFIG_SYSFS
2211/**
2212 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2213 *	@dev: Network device
2214 *	@rxq: Actual number of RX queues
2215 *
2216 *	This must be called either with the rtnl_lock held or before
2217 *	registration of the net device.  Returns 0 on success, or a
2218 *	negative error code.  If called before registration, it always
2219 *	succeeds.
2220 */
2221int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2222{
2223	int rc;
2224
2225	if (rxq < 1 || rxq > dev->num_rx_queues)
2226		return -EINVAL;
2227
2228	if (dev->reg_state == NETREG_REGISTERED) {
2229		ASSERT_RTNL();
2230
2231		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2232						  rxq);
2233		if (rc)
2234			return rc;
2235	}
2236
2237	dev->real_num_rx_queues = rxq;
2238	return 0;
2239}
2240EXPORT_SYMBOL(netif_set_real_num_rx_queues);
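
/*
 * Example usage (sketch; helper illustrative): a multiqueue driver
 * shrinking both active queue counts at runtime, under RTNL as required
 * above.
 */
static int example_resize_queues(struct net_device *dev, unsigned int n)
{
	int err;

	ASSERT_RTNL();
	err = netif_set_real_num_tx_queues(dev, n);
	if (err)
		return err;
	return netif_set_real_num_rx_queues(dev, n);
}
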
2241#endif
2242
2243/**
2244 * netif_get_num_default_rss_queues - default number of RSS queues
2245 *
2246 * This routine should set an upper limit on the number of RSS queues
2247 * used by default by multiqueue devices.
2248 */
2249int netif_get_num_default_rss_queues(void)
2250{
2251	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2252}
2253EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2254
2255static inline void __netif_reschedule(struct Qdisc *q)
2256{
2257	struct softnet_data *sd;
2258	unsigned long flags;
2259
2260	local_irq_save(flags);
2261	sd = this_cpu_ptr(&softnet_data);
2262	q->next_sched = NULL;
2263	*sd->output_queue_tailp = q;
2264	sd->output_queue_tailp = &q->next_sched;
2265	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2266	local_irq_restore(flags);
2267}
2268
2269void __netif_schedule(struct Qdisc *q)
2270{
2271	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2272		__netif_reschedule(q);
2273}
2274EXPORT_SYMBOL(__netif_schedule);
2275
2276struct dev_kfree_skb_cb {
2277	enum skb_free_reason reason;
2278};
2279
2280static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2281{
2282	return (struct dev_kfree_skb_cb *)skb->cb;
2283}
2284
2285void netif_schedule_queue(struct netdev_queue *txq)
2286{
2287	rcu_read_lock();
2288	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2289		struct Qdisc *q = rcu_dereference(txq->qdisc);
2290
2291		__netif_schedule(q);
2292	}
2293	rcu_read_unlock();
2294}
2295EXPORT_SYMBOL(netif_schedule_queue);
2296
2297/**
2298 *	netif_wake_subqueue - allow sending packets on subqueue
2299 *	@dev: network device
2300 *	@queue_index: sub queue index
2301 *
2302 * Resume individual transmit queue of a device with multiple transmit queues.
2303 */
2304void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2305{
2306	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2307
2308	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2309		struct Qdisc *q;
2310
2311		rcu_read_lock();
2312		q = rcu_dereference(txq->qdisc);
2313		__netif_schedule(q);
2314		rcu_read_unlock();
2315	}
2316}
2317EXPORT_SYMBOL(netif_wake_subqueue);
2318
2319void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2320{
2321	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2322		struct Qdisc *q;
2323
2324		rcu_read_lock();
2325		q = rcu_dereference(dev_queue->qdisc);
2326		__netif_schedule(q);
2327		rcu_read_unlock();
2328	}
2329}
2330EXPORT_SYMBOL(netif_tx_wake_queue);
2331
2332void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2333{
2334	unsigned long flags;
2335
2336	if (likely(atomic_read(&skb->users) == 1)) {
2337		smp_rmb();
2338		atomic_set(&skb->users, 0);
2339	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2340		return;
2341	}
2342	get_kfree_skb_cb(skb)->reason = reason;
2343	local_irq_save(flags);
2344	skb->next = __this_cpu_read(softnet_data.completion_queue);
2345	__this_cpu_write(softnet_data.completion_queue, skb);
2346	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2347	local_irq_restore(flags);
2348}
2349EXPORT_SYMBOL(__dev_kfree_skb_irq);
2350
2351void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2352{
2353	if (in_irq() || irqs_disabled())
2354		__dev_kfree_skb_irq(skb, reason);
2355	else
2356		dev_kfree_skb(skb);
2357}
2358EXPORT_SYMBOL(__dev_kfree_skb_any);
2359
2360
2361/**
2362 * netif_device_detach - mark device as removed
2363 * @dev: network device
2364 *
2365 * Mark device as removed from system and therefore no longer available.
2366 */
2367void netif_device_detach(struct net_device *dev)
2368{
2369	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2370	    netif_running(dev)) {
2371		netif_tx_stop_all_queues(dev);
2372	}
2373}
2374EXPORT_SYMBOL(netif_device_detach);
2375
2376/**
2377 * netif_device_attach - mark device as attached
2378 * @dev: network device
2379 *
 2380 * Mark device as attached to the system and restart if needed.
2381 */
2382void netif_device_attach(struct net_device *dev)
2383{
2384	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2385	    netif_running(dev)) {
2386		netif_tx_wake_all_queues(dev);
2387		__netdev_watchdog_up(dev);
2388	}
2389}
2390EXPORT_SYMBOL(netif_device_attach);
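
/*
 * Example (sketch; the suspend/resume callbacks are hypothetical): the
 * pairing these helpers are designed for.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops TX queues if dev was running */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	netif_device_attach(dev);	/* restarts queues and watchdog */
	return 0;
}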
2391
2392/*
 2393 * Returns a Tx hash based on the given packet descriptor and a Tx queues'
 2394 * number, to be used as a distribution range.
2395 */
2396u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2397		  unsigned int num_tx_queues)
2398{
2399	u32 hash;
2400	u16 qoffset = 0;
2401	u16 qcount = num_tx_queues;
2402
2403	if (skb_rx_queue_recorded(skb)) {
2404		hash = skb_get_rx_queue(skb);
2405		while (unlikely(hash >= num_tx_queues))
2406			hash -= num_tx_queues;
2407		return hash;
2408	}
2409
2410	if (dev->num_tc) {
2411		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2412		qoffset = dev->tc_to_txq[tc].offset;
2413		qcount = dev->tc_to_txq[tc].count;
2414	}
2415
2416	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2417}
2418EXPORT_SYMBOL(__skb_tx_hash);
2419
2420static void skb_warn_bad_offload(const struct sk_buff *skb)
2421{
2422	static const netdev_features_t null_features = 0;
2423	struct net_device *dev = skb->dev;
2424	const char *name = "";
2425
2426	if (!net_ratelimit())
2427		return;
2428
2429	if (dev) {
2430		if (dev->dev.parent)
2431			name = dev_driver_string(dev->dev.parent);
2432		else
2433			name = netdev_name(dev);
2434	}
2435	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2436	     "gso_type=%d ip_summed=%d\n",
2437	     name, dev ? &dev->features : &null_features,
2438	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2439	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2440	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2441}
2442
2443/*
2444 * Invalidate hardware checksum when packet is to be mangled, and
2445 * complete checksum manually on outgoing path.
2446 */
2447int skb_checksum_help(struct sk_buff *skb)
2448{
2449	__wsum csum;
2450	int ret = 0, offset;
2451
2452	if (skb->ip_summed == CHECKSUM_COMPLETE)
2453		goto out_set_summed;
2454
2455	if (unlikely(skb_shinfo(skb)->gso_size)) {
2456		skb_warn_bad_offload(skb);
2457		return -EINVAL;
2458	}
2459
2460	/* Before computing a checksum, we should make sure no frag could
2461	 * be modified by an external entity : checksum could be wrong.
2462	 */
2463	if (skb_has_shared_frag(skb)) {
2464		ret = __skb_linearize(skb);
2465		if (ret)
2466			goto out;
2467	}
2468
2469	offset = skb_checksum_start_offset(skb);
2470	BUG_ON(offset >= skb_headlen(skb));
2471	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2472
2473	offset += skb->csum_offset;
2474	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2475
2476	if (skb_cloned(skb) &&
2477	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2478		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2479		if (ret)
2480			goto out;
2481	}
2482
2483	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2484out_set_summed:
2485	skb->ip_summed = CHECKSUM_NONE;
2486out:
2487	return ret;
2488}
2489EXPORT_SYMBOL(skb_checksum_help);
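
/*
 * Example (sketch): a transmit path falling back to software
 * checksumming when the hardware cannot offload a packet. The capability
 * flag is a stand-in for a real device check (hypothetical).
 */
static int example_tx_csum(struct sk_buff *skb, bool hw_can_offload)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_offload)
		return skb_checksum_help(skb);	/* resolve in software */
	return 0;
}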
2490
 2491/* __skb_csum_offload_chk - Driver helper function to determine if a device
2492 * with limited checksum offload capabilities is able to offload the checksum
2493 * for a given packet.
2494 *
2495 * Arguments:
2496 *   skb - sk_buff for the packet in question
2497 *   spec - contains the description of what device can offload
2498 *   csum_encapped - returns true if the checksum being offloaded is
 2499 *	      encapsulated. That is, it is the checksum for the transport header
2500 *	      in the inner headers.
2501 *   checksum_help - when set indicates that helper function should
2502 *	      call skb_checksum_help if offload checks fail
2503 *
2504 * Returns:
2505 *   true: Packet has passed the checksum checks and should be offloadable to
2506 *	   the device (a driver may still need to check for additional
2507 *	   restrictions of its device)
2508 *   false: Checksum is not offloadable. If checksum_help was set then
2509 *	   skb_checksum_help was called to resolve checksum for non-GSO
2510 *	   packets and when IP protocol is not SCTP
2511 */
2512bool __skb_csum_offload_chk(struct sk_buff *skb,
2513			    const struct skb_csum_offl_spec *spec,
2514			    bool *csum_encapped,
2515			    bool csum_help)
2516{
2517	struct iphdr *iph;
2518	struct ipv6hdr *ipv6;
2519	void *nhdr;
2520	int protocol;
2521	u8 ip_proto;
2522
2523	if (skb->protocol == htons(ETH_P_8021Q) ||
2524	    skb->protocol == htons(ETH_P_8021AD)) {
2525		if (!spec->vlan_okay)
2526			goto need_help;
2527	}
2528
2529	/* We check whether the checksum refers to a transport layer checksum in
2530	 * the outermost header or an encapsulated transport layer checksum that
2531	 * corresponds to the inner headers of the skb. If the checksum is for
2532	 * something else in the packet we need help.
2533	 */
2534	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2535		/* Non-encapsulated checksum */
2536		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2537		nhdr = skb_network_header(skb);
2538		*csum_encapped = false;
2539		if (spec->no_not_encapped)
2540			goto need_help;
2541	} else if (skb->encapsulation && spec->encap_okay &&
2542		   skb_checksum_start_offset(skb) ==
2543		   skb_inner_transport_offset(skb)) {
2544		/* Encapsulated checksum */
2545		*csum_encapped = true;
2546		switch (skb->inner_protocol_type) {
2547		case ENCAP_TYPE_ETHER:
2548			protocol = eproto_to_ipproto(skb->inner_protocol);
2549			break;
2550		case ENCAP_TYPE_IPPROTO:
2551			protocol = skb->inner_protocol;
2552			break;
2553		}
2554		nhdr = skb_inner_network_header(skb);
2555	} else {
2556		goto need_help;
2557	}
2558
2559	switch (protocol) {
2560	case IPPROTO_IP:
2561		if (!spec->ipv4_okay)
2562			goto need_help;
2563		iph = nhdr;
2564		ip_proto = iph->protocol;
2565		if (iph->ihl != 5 && !spec->ip_options_okay)
2566			goto need_help;
2567		break;
2568	case IPPROTO_IPV6:
2569		if (!spec->ipv6_okay)
2570			goto need_help;
2571		if (spec->no_encapped_ipv6 && *csum_encapped)
2572			goto need_help;
2573		ipv6 = nhdr;
2574		nhdr += sizeof(*ipv6);
2575		ip_proto = ipv6->nexthdr;
2576		break;
2577	default:
2578		goto need_help;
2579	}
2580
2581ip_proto_again:
2582	switch (ip_proto) {
2583	case IPPROTO_TCP:
2584		if (!spec->tcp_okay ||
2585		    skb->csum_offset != offsetof(struct tcphdr, check))
2586			goto need_help;
2587		break;
2588	case IPPROTO_UDP:
2589		if (!spec->udp_okay ||
2590		    skb->csum_offset != offsetof(struct udphdr, check))
2591			goto need_help;
2592		break;
2593	case IPPROTO_SCTP:
2594		if (!spec->sctp_okay ||
2595		    skb->csum_offset != offsetof(struct sctphdr, checksum))
2596			goto cant_help;
2597		break;
2598	case NEXTHDR_HOP:
2599	case NEXTHDR_ROUTING:
2600	case NEXTHDR_DEST: {
2601		u8 *opthdr = nhdr;
2602
2603		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2604			goto need_help;
2605
2606		ip_proto = opthdr[0];
2607		nhdr += (opthdr[1] + 1) << 3;
2608
2609		goto ip_proto_again;
2610	}
2611	default:
2612		goto need_help;
2613	}
2614
2615	/* Passed the tests for offloading checksum */
2616	return true;
2617
2618need_help:
2619	if (csum_help && !skb_shinfo(skb)->gso_size)
2620		skb_checksum_help(skb);
2621cant_help:
2622	return false;
2623}
2624EXPORT_SYMBOL(__skb_csum_offload_chk);
2625
2626__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2627{
2628	__be16 type = skb->protocol;
2629
2630	/* Tunnel gso handlers can set protocol to ethernet. */
2631	if (type == htons(ETH_P_TEB)) {
2632		struct ethhdr *eth;
2633
2634		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2635			return 0;
2636
2637		eth = (struct ethhdr *)skb_mac_header(skb);
2638		type = eth->h_proto;
2639	}
2640
2641	return __vlan_get_protocol(skb, type, depth);
2642}
2643
2644/**
2645 *	skb_mac_gso_segment - mac layer segmentation handler.
2646 *	@skb: buffer to segment
2647 *	@features: features for the output path (see dev->features)
2648 */
2649struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2650				    netdev_features_t features)
2651{
2652	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2653	struct packet_offload *ptype;
2654	int vlan_depth = skb->mac_len;
2655	__be16 type = skb_network_protocol(skb, &vlan_depth);
2656
2657	if (unlikely(!type))
2658		return ERR_PTR(-EINVAL);
2659
2660	__skb_pull(skb, vlan_depth);
2661
2662	rcu_read_lock();
2663	list_for_each_entry_rcu(ptype, &offload_base, list) {
2664		if (ptype->type == type && ptype->callbacks.gso_segment) {
2665			segs = ptype->callbacks.gso_segment(skb, features);
2666			break;
2667		}
2668	}
2669	rcu_read_unlock();
2670
2671	__skb_push(skb, skb->data - skb_mac_header(skb));
2672
2673	return segs;
2674}
2675EXPORT_SYMBOL(skb_mac_gso_segment);
2676
2677
 2678/* openvswitch calls this on the rx path, so we need a different check.
2679 */
2680static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2681{
2682	if (tx_path)
2683		return skb->ip_summed != CHECKSUM_PARTIAL;
2684	else
2685		return skb->ip_summed == CHECKSUM_NONE;
2686}
2687
2688/**
2689 *	__skb_gso_segment - Perform segmentation on skb.
2690 *	@skb: buffer to segment
2691 *	@features: features for the output path (see dev->features)
2692 *	@tx_path: whether it is called in TX path
2693 *
2694 *	This function segments the given skb and returns a list of segments.
2695 *
2696 *	It may return NULL if the skb requires no segmentation.  This is
2697 *	only possible when GSO is used for verifying header integrity.
2698 *
2699 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2700 */
2701struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2702				  netdev_features_t features, bool tx_path)
2703{
2704	if (unlikely(skb_needs_check(skb, tx_path))) {
2705		int err;
2706
2707		skb_warn_bad_offload(skb);
2708
2709		err = skb_cow_head(skb, 0);
2710		if (err < 0)
2711			return ERR_PTR(err);
2712	}
2713
2714	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2715		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2716
2717	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2718	SKB_GSO_CB(skb)->encap_level = 0;
2719
2720	skb_reset_mac_header(skb);
2721	skb_reset_mac_len(skb);
2722
2723	return skb_mac_gso_segment(skb, features);
2724}
2725EXPORT_SYMBOL(__skb_gso_segment);
2726
2727/* Take action when hardware reception checksum errors are detected. */
2728#ifdef CONFIG_BUG
2729void netdev_rx_csum_fault(struct net_device *dev)
2730{
2731	if (net_ratelimit()) {
2732		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2733		dump_stack();
2734	}
2735}
2736EXPORT_SYMBOL(netdev_rx_csum_fault);
2737#endif
2738
 2739/* Actually, we should eliminate this check as soon as we know that:
 2740 * 1. IOMMU is present and can map all the memory.
 2741 * 2. No high memory really exists on this machine.
2742 */
2743
2744static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2745{
2746#ifdef CONFIG_HIGHMEM
2747	int i;
2748	if (!(dev->features & NETIF_F_HIGHDMA)) {
2749		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2750			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2751			if (PageHighMem(skb_frag_page(frag)))
2752				return 1;
2753		}
2754	}
2755
2756	if (PCI_DMA_BUS_IS_PHYS) {
2757		struct device *pdev = dev->dev.parent;
2758
2759		if (!pdev)
2760			return 0;
2761		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2762			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2763			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2764			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2765				return 1;
2766		}
2767	}
2768#endif
2769	return 0;
2770}
2771
2772/* If MPLS offload request, verify we are testing hardware MPLS features
2773 * instead of standard features for the netdev.
2774 */
2775#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2776static netdev_features_t net_mpls_features(struct sk_buff *skb,
2777					   netdev_features_t features,
2778					   __be16 type)
2779{
2780	if (eth_p_mpls(type))
2781		features &= skb->dev->mpls_features;
2782
2783	return features;
2784}
2785#else
2786static netdev_features_t net_mpls_features(struct sk_buff *skb,
2787					   netdev_features_t features,
2788					   __be16 type)
2789{
2790	return features;
2791}
2792#endif
2793
2794static netdev_features_t harmonize_features(struct sk_buff *skb,
2795	netdev_features_t features)
2796{
2797	int tmp;
2798	__be16 type;
2799
2800	type = skb_network_protocol(skb, &tmp);
2801	features = net_mpls_features(skb, features, type);
2802
2803	if (skb->ip_summed != CHECKSUM_NONE &&
2804	    !can_checksum_protocol(features, type)) {
2805		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2806	} else if (illegal_highdma(skb->dev, skb)) {
2807		features &= ~NETIF_F_SG;
2808	}
2809
2810	return features;
2811}
2812
2813netdev_features_t passthru_features_check(struct sk_buff *skb,
2814					  struct net_device *dev,
2815					  netdev_features_t features)
2816{
2817	return features;
2818}
2819EXPORT_SYMBOL(passthru_features_check);
2820
2821static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2822					     struct net_device *dev,
2823					     netdev_features_t features)
2824{
2825	return vlan_features_check(skb, features);
2826}
2827
2828netdev_features_t netif_skb_features(struct sk_buff *skb)
2829{
2830	struct net_device *dev = skb->dev;
2831	netdev_features_t features = dev->features;
2832	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2833
2834	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2835		features &= ~NETIF_F_GSO_MASK;
2836
2837	/* For an encapsulation offload request, verify that we test
2838	 * the hardware encapsulation features instead of the
2839	 * netdev's standard features.
2840	 */
2841	if (skb->encapsulation)
2842		features &= dev->hw_enc_features;
2843
2844	if (skb_vlan_tagged(skb))
2845		features = netdev_intersect_features(features,
2846						     dev->vlan_features |
2847						     NETIF_F_HW_VLAN_CTAG_TX |
2848						     NETIF_F_HW_VLAN_STAG_TX);
2849
2850	if (dev->netdev_ops->ndo_features_check)
2851		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2852								features);
2853	else
2854		features &= dflt_features_check(skb, dev, features);
2855
2856	return harmonize_features(skb, features);
2857}
2858EXPORT_SYMBOL(netif_skb_features);
2859
2860static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2861		    struct netdev_queue *txq, bool more)
2862{
2863	unsigned int len;
2864	int rc;
2865
2866	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2867		dev_queue_xmit_nit(skb, dev);
2868
2869	len = skb->len;
2870	trace_net_dev_start_xmit(skb, dev);
2871	rc = netdev_start_xmit(skb, dev, txq, more);
2872	trace_net_dev_xmit(skb, rc, dev, len);
2873
2874	return rc;
2875}
2876
2877struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2878				    struct netdev_queue *txq, int *ret)
2879{
2880	struct sk_buff *skb = first;
2881	int rc = NETDEV_TX_OK;
2882
2883	while (skb) {
2884		struct sk_buff *next = skb->next;
2885
2886		skb->next = NULL;
2887		rc = xmit_one(skb, dev, txq, next != NULL);
2888		if (unlikely(!dev_xmit_complete(rc))) {
2889			skb->next = next;
2890			goto out;
2891		}
2892
2893		skb = next;
2894		if (netif_xmit_stopped(txq) && skb) {
2895			rc = NETDEV_TX_BUSY;
2896			break;
2897		}
2898	}
2899
2900out:
2901	*ret = rc;
2902	return skb;
2903}
2904
2905static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2906					  netdev_features_t features)
2907{
2908	if (skb_vlan_tag_present(skb) &&
2909	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2910		skb = __vlan_hwaccel_push_inside(skb);
2911	return skb;
2912}
2913
2914static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2915{
2916	netdev_features_t features;
2917
2918	if (skb->next)
2919		return skb;
2920
2921	features = netif_skb_features(skb);
2922	skb = validate_xmit_vlan(skb, features);
2923	if (unlikely(!skb))
2924		goto out_null;
2925
2926	if (netif_needs_gso(skb, features)) {
2927		struct sk_buff *segs;
2928
2929		segs = skb_gso_segment(skb, features);
2930		if (IS_ERR(segs)) {
2931			goto out_kfree_skb;
2932		} else if (segs) {
2933			consume_skb(skb);
2934			skb = segs;
2935		}
2936	} else {
2937		if (skb_needs_linearize(skb, features) &&
2938		    __skb_linearize(skb))
2939			goto out_kfree_skb;
2940
2941		/* If packet is not checksummed and device does not
2942		 * support checksumming for this protocol, complete
2943		 * checksumming here.
2944		 */
2945		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2946			if (skb->encapsulation)
2947				skb_set_inner_transport_header(skb,
2948							       skb_checksum_start_offset(skb));
2949			else
2950				skb_set_transport_header(skb,
2951							 skb_checksum_start_offset(skb));
2952			if (!(features & NETIF_F_CSUM_MASK) &&
2953			    skb_checksum_help(skb))
2954				goto out_kfree_skb;
2955		}
2956	}
2957
2958	return skb;
2959
2960out_kfree_skb:
2961	kfree_skb(skb);
2962out_null:
2963	return NULL;
2964}
2965
2966struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2967{
2968	struct sk_buff *next, *head = NULL, *tail;
2969
2970	for (; skb != NULL; skb = next) {
2971		next = skb->next;
2972		skb->next = NULL;
2973
2974		/* in case the skb won't be segmented, point to itself */
2975		skb->prev = skb;
2976
2977		skb = validate_xmit_skb(skb, dev);
2978		if (!skb)
2979			continue;
2980
2981		if (!head)
2982			head = skb;
2983		else
2984			tail->next = skb;
2985		/* If skb was segmented, skb->prev points to
2986		 * the last segment. If not, it still contains skb.
2987		 */
2988		tail = skb->prev;
2989	}
2990	return head;
2991}
2992
2993static void qdisc_pkt_len_init(struct sk_buff *skb)
2994{
2995	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2996
2997	qdisc_skb_cb(skb)->pkt_len = skb->len;
2998
2999	/* To get a more precise estimate of the bytes sent on the wire,
3000	 * we add the header size of every segment to pkt_len.
3001	 */
3002	if (shinfo->gso_size)  {
3003		unsigned int hdr_len;
3004		u16 gso_segs = shinfo->gso_segs;
3005
3006		/* mac layer + network layer */
3007		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3008
3009		/* + transport layer */
3010		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3011			hdr_len += tcp_hdrlen(skb);
3012		else
3013			hdr_len += sizeof(struct udphdr);
3014
3015		if (shinfo->gso_type & SKB_GSO_DODGY)
3016			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3017						shinfo->gso_size);
3018
3019		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3020	}
3021}
3022
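/* Worked example (illustrative): a TCP GSO skb with skb->len = 6066,
 * a 66 byte mac+ip+tcp header and gso_segs = 4 goes out as four
 * segments, three of which repeat the 66 header bytes; the estimate
 * above therefore becomes 6066 + (4 - 1) * 66 = 6264 bytes on the wire.
 */
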
3023static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3024				 struct net_device *dev,
3025				 struct netdev_queue *txq)
3026{
3027	spinlock_t *root_lock = qdisc_lock(q);
3028	bool contended;
3029	int rc;
3030
3031	qdisc_calculate_pkt_len(skb, q);
3032	/*
3033	 * Heuristic to force contended enqueues to serialize on a
3034	 * separate lock before trying to take the qdisc's main lock.
3035	 * This permits the __QDISC___STATE_RUNNING owner to take the lock
3036	 * more often and dequeue packets faster.
3037	 */
3038	contended = qdisc_is_running(q);
3039	if (unlikely(contended))
3040		spin_lock(&q->busylock);
3041
3042	spin_lock(root_lock);
3043	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3044		kfree_skb(skb);
3045		rc = NET_XMIT_DROP;
3046	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3047		   qdisc_run_begin(q)) {
3048		/*
3049		 * This is a work-conserving queue; there are no old skbs
3050		 * waiting to be sent out; and the qdisc is not running -
3051		 * xmit the skb directly.
3052		 */
3053
3054		qdisc_bstats_update(q, skb);
3055
3056		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3057			if (unlikely(contended)) {
3058				spin_unlock(&q->busylock);
3059				contended = false;
3060			}
3061			__qdisc_run(q);
3062		} else
3063			qdisc_run_end(q);
3064
3065		rc = NET_XMIT_SUCCESS;
3066	} else {
3067		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3068		if (qdisc_run_begin(q)) {
3069			if (unlikely(contended)) {
3070				spin_unlock(&q->busylock);
3071				contended = false;
3072			}
3073			__qdisc_run(q);
3074		}
3075	}
3076	spin_unlock(root_lock);
3077	if (unlikely(contended))
3078		spin_unlock(&q->busylock);
3079	return rc;
3080}
3081
3082#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3083static void skb_update_prio(struct sk_buff *skb)
3084{
3085	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3086
3087	if (!skb->priority && skb->sk && map) {
3088		unsigned int prioidx =
3089			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3090
3091		if (prioidx < map->priomap_len)
3092			skb->priority = map->priomap[prioidx];
3093	}
3094}
3095#else
3096#define skb_update_prio(skb)
3097#endif
3098
3099DEFINE_PER_CPU(int, xmit_recursion);
3100EXPORT_SYMBOL(xmit_recursion);
3101
3102#define RECURSION_LIMIT 10
3103
3104/**
3105 *	dev_loopback_xmit - loop back @skb
3106 *	@net: network namespace this loopback is happening in
3107 *	@sk:  socket; needed so this function can serve as a netfilter okfn
3108 *	@skb: buffer to transmit
3109 */
3110int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3111{
3112	skb_reset_mac_header(skb);
3113	__skb_pull(skb, skb_network_offset(skb));
3114	skb->pkt_type = PACKET_LOOPBACK;
3115	skb->ip_summed = CHECKSUM_UNNECESSARY;
3116	WARN_ON(!skb_dst(skb));
3117	skb_dst_force(skb);
3118	netif_rx_ni(skb);
3119	return 0;
3120}
3121EXPORT_SYMBOL(dev_loopback_xmit);
3122
3123#ifdef CONFIG_NET_EGRESS
3124static struct sk_buff *
3125sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3126{
3127	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3128	struct tcf_result cl_res;
3129
3130	if (!cl)
3131		return skb;
3132
3133	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3134	 * earlier by the caller.
3135	 */
3136	qdisc_bstats_cpu_update(cl->q, skb);
3137
3138	switch (tc_classify(skb, cl, &cl_res, false)) {
3139	case TC_ACT_OK:
3140	case TC_ACT_RECLASSIFY:
3141		skb->tc_index = TC_H_MIN(cl_res.classid);
3142		break;
3143	case TC_ACT_SHOT:
3144		qdisc_qstats_cpu_drop(cl->q);
3145		*ret = NET_XMIT_DROP;
3146		goto drop;
3147	case TC_ACT_STOLEN:
3148	case TC_ACT_QUEUED:
3149		*ret = NET_XMIT_SUCCESS;
3150drop:
3151		kfree_skb(skb);
3152		return NULL;
3153	case TC_ACT_REDIRECT:
3154		/* No need to push/pop skb's mac_header here on egress! */
3155		skb_do_redirect(skb);
3156		*ret = NET_XMIT_SUCCESS;
3157		return NULL;
3158	default:
3159		break;
3160	}
3161
3162	return skb;
3163}
3164#endif /* CONFIG_NET_EGRESS */
3165
3166static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3167{
3168#ifdef CONFIG_XPS
3169	struct xps_dev_maps *dev_maps;
3170	struct xps_map *map;
3171	int queue_index = -1;
3172
3173	rcu_read_lock();
3174	dev_maps = rcu_dereference(dev->xps_maps);
3175	if (dev_maps) {
3176		map = rcu_dereference(
3177		    dev_maps->cpu_map[skb->sender_cpu - 1]);
3178		if (map) {
3179			if (map->len == 1)
3180				queue_index = map->queues[0];
3181			else
3182				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3183									   map->len)];
3184			if (unlikely(queue_index >= dev->real_num_tx_queues))
3185				queue_index = -1;
3186		}
3187	}
3188	rcu_read_unlock();
3189
3190	return queue_index;
3191#else
3192	return -1;
3193#endif
3194}
3195
3196static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3197{
3198	struct sock *sk = skb->sk;
3199	int queue_index = sk_tx_queue_get(sk);
3200
3201	if (queue_index < 0 || skb->ooo_okay ||
3202	    queue_index >= dev->real_num_tx_queues) {
3203		int new_index = get_xps_queue(dev, skb);
3204		if (new_index < 0)
3205			new_index = skb_tx_hash(dev, skb);
3206
3207		if (queue_index != new_index && sk &&
3208		    sk_fullsock(sk) &&
3209		    rcu_access_pointer(sk->sk_dst_cache))
3210			sk_tx_queue_set(sk, new_index);
3211
3212		queue_index = new_index;
3213	}
3214
3215	return queue_index;
3216}
3217
3218struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3219				    struct sk_buff *skb,
3220				    void *accel_priv)
3221{
3222	int queue_index = 0;
3223
3224#ifdef CONFIG_XPS
3225	u32 sender_cpu = skb->sender_cpu - 1;
3226
3227	if (sender_cpu >= (u32)NR_CPUS)
3228		skb->sender_cpu = raw_smp_processor_id() + 1;
3229#endif
3230
3231	if (dev->real_num_tx_queues != 1) {
3232		const struct net_device_ops *ops = dev->netdev_ops;
3233		if (ops->ndo_select_queue)
3234			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3235							    __netdev_pick_tx);
3236		else
3237			queue_index = __netdev_pick_tx(dev, skb);
3238
3239		if (!accel_priv)
3240			queue_index = netdev_cap_txqueue(dev, queue_index);
3241	}
3242
3243	skb_set_queue_mapping(skb, queue_index);
3244	return netdev_get_tx_queue(dev, queue_index);
3245}
3246
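/* Illustrative sketch (not part of this file): a multiqueue driver can
 * steer traffic itself through ndo_select_queue() while keeping
 * __netdev_pick_tx() -- handed in as @fallback by netdev_pick_tx()
 * above -- for everything it does not special-case.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv,
				select_queue_fallback_t fallback)
{
	/* pin control-priority frames to the last queue, else fall back */
	if (skb->priority == TC_PRIO_CONTROL)
		return dev->real_num_tx_queues - 1;
	return fallback(dev, skb);
}
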
3247/**
3248 *	__dev_queue_xmit - transmit a buffer
3249 *	@skb: buffer to transmit
3250 *	@accel_priv: private data used for L2 forwarding offload
3251 *
3252 *	Queue a buffer for transmission to a network device. The caller must
3253 *	have set the device and priority and built the buffer before calling
3254 *	this function. The function can be called from an interrupt.
3255 *
3256 *	A negative errno code is returned on a failure. A success does not
3257 *	guarantee the frame will be transmitted as it may be dropped due
3258 *	to congestion or traffic shaping.
3259 *
3260 * -----------------------------------------------------------------------------------
3261 *      I notice this method can also return errors from the queue disciplines,
3262 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3263 *      be positive.
3264 *
3265 *      Regardless of the return value, the skb is consumed, so it is currently
3266 *      difficult to retry a send to this method.  (You can bump the ref count
3267 *      before sending to hold a reference for retry if you are careful.)
3268 *
3269 *      When calling this method, interrupts MUST be enabled.  This is because
3270 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3271 *          --BLG
3272 */
3273static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3274{
3275	struct net_device *dev = skb->dev;
3276	struct netdev_queue *txq;
3277	struct Qdisc *q;
3278	int rc = -ENOMEM;
3279
3280	skb_reset_mac_header(skb);
3281
3282	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3283		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3284
3285	/* Disable soft irqs for various locks below. Also
3286	 * stops preemption for RCU.
3287	 */
3288	rcu_read_lock_bh();
3289
3290	skb_update_prio(skb);
3291
3292	qdisc_pkt_len_init(skb);
3293#ifdef CONFIG_NET_CLS_ACT
3294	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3295# ifdef CONFIG_NET_EGRESS
3296	if (static_key_false(&egress_needed)) {
3297		skb = sch_handle_egress(skb, &rc, dev);
3298		if (!skb)
3299			goto out;
3300	}
3301# endif
3302#endif
3303	/* If device/qdisc don't need skb->dst, release it right now while
3304	 * it's hot in this cpu's cache.
3305	 */
3306	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3307		skb_dst_drop(skb);
3308	else
3309		skb_dst_force(skb);
3310
3311#ifdef CONFIG_NET_SWITCHDEV
3312	/* Don't forward if offload device already forwarded */
3313	if (skb->offload_fwd_mark &&
3314	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
3315		consume_skb(skb);
3316		rc = NET_XMIT_SUCCESS;
3317		goto out;
3318	}
3319#endif
3320
3321	txq = netdev_pick_tx(dev, skb, accel_priv);
3322	q = rcu_dereference_bh(txq->qdisc);
3323
3324	trace_net_dev_queue(skb);
3325	if (q->enqueue) {
3326		rc = __dev_xmit_skb(skb, q, dev, txq);
3327		goto out;
3328	}
3329
3330	/* The device has no queue. Common case for software devices:
3331	   loopback, all sorts of tunnels...
3332
3333	   Really, it is unlikely that netif_tx_lock protection is necessary
3334	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
3335	   counters.)
3336	   However, it is possible that they rely on the protection
3337	   made by us here.
3338
3339	   Check this and take the lock. It is not prone to deadlocks.
3340	   Or just use the noqueue qdisc, which is even simpler 8)
3341	 */
3342	if (dev->flags & IFF_UP) {
3343		int cpu = smp_processor_id(); /* ok because BHs are off */
3344
3345		if (txq->xmit_lock_owner != cpu) {
3346
3347			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3348				goto recursion_alert;
3349
3350			skb = validate_xmit_skb(skb, dev);
3351			if (!skb)
3352				goto drop;
3353
3354			HARD_TX_LOCK(dev, txq, cpu);
3355
3356			if (!netif_xmit_stopped(txq)) {
3357				__this_cpu_inc(xmit_recursion);
3358				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3359				__this_cpu_dec(xmit_recursion);
3360				if (dev_xmit_complete(rc)) {
3361					HARD_TX_UNLOCK(dev, txq);
3362					goto out;
3363				}
3364			}
3365			HARD_TX_UNLOCK(dev, txq);
3366			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3367					     dev->name);
3368		} else {
3369			/* Recursion is detected! It is possible,
3370			 * unfortunately
3371			 */
3372recursion_alert:
3373			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3374					     dev->name);
3375		}
3376	}
3377
3378	rc = -ENETDOWN;
3379drop:
3380	rcu_read_unlock_bh();
3381
3382	atomic_long_inc(&dev->tx_dropped);
3383	kfree_skb_list(skb);
3384	return rc;
3385out:
3386	rcu_read_unlock_bh();
3387	return rc;
3388}
3389
3390int dev_queue_xmit(struct sk_buff *skb)
3391{
3392	return __dev_queue_xmit(skb, NULL);
3393}
3394EXPORT_SYMBOL(dev_queue_xmit);
3395
3396int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3397{
3398	return __dev_queue_xmit(skb, accel_priv);
3399}
3400EXPORT_SYMBOL(dev_queue_xmit_accel);
3401
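/* Illustrative sketch (not part of this file): a tunnel or other
 * virtual device typically hands a fully built skb to the core like
 * this.  Note that dev_queue_xmit() consumes the skb even on error,
 * and that positive NET_XMIT_* values can come back from the qdisc.
 */
static netdev_tx_t example_tunnel_xmit(struct sk_buff *skb,
				       struct net_device *lower_dev)
{
	skb->dev = lower_dev;	/* hand off to the underlying device */
	dev_queue_xmit(skb);
	return NETDEV_TX_OK;
}
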
3402
3403/*=======================================================================
3404			Receiver routines
3405  =======================================================================*/
3406
3407int netdev_max_backlog __read_mostly = 1000;
3408EXPORT_SYMBOL(netdev_max_backlog);
3409
3410int netdev_tstamp_prequeue __read_mostly = 1;
3411int netdev_budget __read_mostly = 300;
3412int weight_p __read_mostly = 64;            /* old backlog weight */
3413
3414/* Called with irq disabled */
3415static inline void ____napi_schedule(struct softnet_data *sd,
3416				     struct napi_struct *napi)
3417{
3418	list_add_tail(&napi->poll_list, &sd->poll_list);
3419	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3420}
3421
3422#ifdef CONFIG_RPS
3423
3424/* One global table that all flow-based protocols share. */
3425struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3426EXPORT_SYMBOL(rps_sock_flow_table);
3427u32 rps_cpu_mask __read_mostly;
3428EXPORT_SYMBOL(rps_cpu_mask);
3429
3430struct static_key rps_needed __read_mostly;
3431
3432static struct rps_dev_flow *
3433set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3434	    struct rps_dev_flow *rflow, u16 next_cpu)
3435{
3436	if (next_cpu < nr_cpu_ids) {
3437#ifdef CONFIG_RFS_ACCEL
3438		struct netdev_rx_queue *rxqueue;
3439		struct rps_dev_flow_table *flow_table;
3440		struct rps_dev_flow *old_rflow;
3441		u32 flow_id;
3442		u16 rxq_index;
3443		int rc;
3444
3445		/* Should we steer this flow to a different hardware queue? */
3446		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3447		    !(dev->features & NETIF_F_NTUPLE))
3448			goto out;
3449		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3450		if (rxq_index == skb_get_rx_queue(skb))
3451			goto out;
3452
3453		rxqueue = dev->_rx + rxq_index;
3454		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3455		if (!flow_table)
3456			goto out;
3457		flow_id = skb_get_hash(skb) & flow_table->mask;
3458		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3459							rxq_index, flow_id);
3460		if (rc < 0)
3461			goto out;
3462		old_rflow = rflow;
3463		rflow = &flow_table->flows[flow_id];
3464		rflow->filter = rc;
3465		if (old_rflow->filter == rflow->filter)
3466			old_rflow->filter = RPS_NO_FILTER;
3467	out:
3468#endif
3469		rflow->last_qtail =
3470			per_cpu(softnet_data, next_cpu).input_queue_head;
3471	}
3472
3473	rflow->cpu = next_cpu;
3474	return rflow;
3475}
3476
3477/*
3478 * get_rps_cpu is called from netif_receive_skb and returns the target
3479 * CPU from the RPS map of the receiving queue for a given skb.
3480 * rcu_read_lock must be held on entry.
3481 */
3482static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3483		       struct rps_dev_flow **rflowp)
3484{
3485	const struct rps_sock_flow_table *sock_flow_table;
3486	struct netdev_rx_queue *rxqueue = dev->_rx;
3487	struct rps_dev_flow_table *flow_table;
3488	struct rps_map *map;
3489	int cpu = -1;
3490	u32 tcpu;
3491	u32 hash;
3492
3493	if (skb_rx_queue_recorded(skb)) {
3494		u16 index = skb_get_rx_queue(skb);
3495
3496		if (unlikely(index >= dev->real_num_rx_queues)) {
3497			WARN_ONCE(dev->real_num_rx_queues > 1,
3498				  "%s received packet on queue %u, but number "
3499				  "of RX queues is %u\n",
3500				  dev->name, index, dev->real_num_rx_queues);
3501			goto done;
3502		}
3503		rxqueue += index;
3504	}
3505
3506	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3507
3508	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3509	map = rcu_dereference(rxqueue->rps_map);
3510	if (!flow_table && !map)
3511		goto done;
3512
3513	skb_reset_network_header(skb);
3514	hash = skb_get_hash(skb);
3515	if (!hash)
3516		goto done;
3517
3518	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3519	if (flow_table && sock_flow_table) {
3520		struct rps_dev_flow *rflow;
3521		u32 next_cpu;
3522		u32 ident;
3523
3524		/* First check into global flow table if there is a match */
3525		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3526		if ((ident ^ hash) & ~rps_cpu_mask)
3527			goto try_rps;
3528
3529		next_cpu = ident & rps_cpu_mask;
3530
3531		/* OK, now we know there is a match,
3532		 * we can look at the local (per receive queue) flow table
3533		 */
3534		rflow = &flow_table->flows[hash & flow_table->mask];
3535		tcpu = rflow->cpu;
3536
3537		/*
3538		 * If the desired CPU (where last recvmsg was done) is
3539		 * different from current CPU (one in the rx-queue flow
3540		 * table entry), switch if one of the following holds:
3541		 *   - Current CPU is unset (>= nr_cpu_ids).
3542		 *   - Current CPU is offline.
3543		 *   - The current CPU's queue tail has advanced beyond the
3544		 *     last packet that was enqueued using this table entry.
3545		 *     This guarantees that all previous packets for the flow
3546		 *     have been dequeued, thus preserving in order delivery.
3547		 */
3548		if (unlikely(tcpu != next_cpu) &&
3549		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3550		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3551		      rflow->last_qtail)) >= 0)) {
3552			tcpu = next_cpu;
3553			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3554		}
3555
3556		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3557			*rflowp = rflow;
3558			cpu = tcpu;
3559			goto done;
3560		}
3561	}
3562
3563try_rps:
3564
3565	if (map) {
3566		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3567		if (cpu_online(tcpu)) {
3568			cpu = tcpu;
3569			goto done;
3570		}
3571	}
3572
3573done:
3574	return cpu;
3575}
3576
3577#ifdef CONFIG_RFS_ACCEL
3578
3579/**
3580 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3581 * @dev: Device on which the filter was set
3582 * @rxq_index: RX queue index
3583 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3584 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3585 *
3586 * Drivers that implement ndo_rx_flow_steer() should periodically call
3587 * this function for each installed filter and remove the filters for
3588 * which it returns %true.
3589 */
3590bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3591			 u32 flow_id, u16 filter_id)
3592{
3593	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3594	struct rps_dev_flow_table *flow_table;
3595	struct rps_dev_flow *rflow;
3596	bool expire = true;
3597	unsigned int cpu;
3598
3599	rcu_read_lock();
3600	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3601	if (flow_table && flow_id <= flow_table->mask) {
3602		rflow = &flow_table->flows[flow_id];
3603		cpu = ACCESS_ONCE(rflow->cpu);
3604		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3605		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3606			   rflow->last_qtail) <
3607		     (int)(10 * flow_table->mask)))
3608			expire = false;
3609	}
3610	rcu_read_unlock();
3611	return expire;
3612}
3613EXPORT_SYMBOL(rps_may_expire_flow);
3614
3615#endif /* CONFIG_RFS_ACCEL */
3616
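/* Illustrative sketch (not part of this file): a driver implementing
 * ndo_rx_flow_steer() might scan its filter table from a periodic
 * worker, using the table index as the filter_id it returned earlier.
 * The example_filter table layout is made up for this sketch.
 */
struct example_filter {
	bool	in_use;
	u16	rxq_index;
	u32	flow_id;
};

static void example_expire_rfs_filters(struct net_device *dev,
				       struct example_filter *tbl,
				       unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (tbl[i].in_use &&
		    rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, i))
			tbl[i].in_use = false;	/* free the hw filter slot */
	}
}
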
3617/* Called from hardirq (IPI) context */
3618static void rps_trigger_softirq(void *data)
3619{
3620	struct softnet_data *sd = data;
3621
3622	____napi_schedule(sd, &sd->backlog);
3623	sd->received_rps++;
3624}
3625
3626#endif /* CONFIG_RPS */
3627
3628/*
3629 * Check whether this softnet_data structure belongs to another cpu.
3630 * If so, queue it on our IPI list and return 1;
3631 * otherwise return 0.
3632 */
3633static int rps_ipi_queued(struct softnet_data *sd)
3634{
3635#ifdef CONFIG_RPS
3636	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3637
3638	if (sd != mysd) {
3639		sd->rps_ipi_next = mysd->rps_ipi_list;
3640		mysd->rps_ipi_list = sd;
3641
3642		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3643		return 1;
3644	}
3645#endif /* CONFIG_RPS */
3646	return 0;
3647}
3648
3649#ifdef CONFIG_NET_FLOW_LIMIT
3650int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3651#endif
3652
3653static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3654{
3655#ifdef CONFIG_NET_FLOW_LIMIT
3656	struct sd_flow_limit *fl;
3657	struct softnet_data *sd;
3658	unsigned int old_flow, new_flow;
3659
3660	if (qlen < (netdev_max_backlog >> 1))
3661		return false;
3662
3663	sd = this_cpu_ptr(&softnet_data);
3664
3665	rcu_read_lock();
3666	fl = rcu_dereference(sd->flow_limit);
3667	if (fl) {
3668		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3669		old_flow = fl->history[fl->history_head];
3670		fl->history[fl->history_head] = new_flow;
3671
3672		fl->history_head++;
3673		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3674
3675		if (likely(fl->buckets[old_flow]))
3676			fl->buckets[old_flow]--;
3677
3678		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3679			fl->count++;
3680			rcu_read_unlock();
3681			return true;
3682		}
3683	}
3684	rcu_read_unlock();
3685#endif
3686	return false;
3687}
3688
3689/*
3690 * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3691 * queue (may be a remote CPU queue).
3692 */
3693static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3694			      unsigned int *qtail)
3695{
3696	struct softnet_data *sd;
3697	unsigned long flags;
3698	unsigned int qlen;
3699
3700	sd = &per_cpu(softnet_data, cpu);
3701
3702	local_irq_save(flags);
3703
3704	rps_lock(sd);
3705	if (!netif_running(skb->dev))
3706		goto drop;
3707	qlen = skb_queue_len(&sd->input_pkt_queue);
3708	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3709		if (qlen) {
3710enqueue:
3711			__skb_queue_tail(&sd->input_pkt_queue, skb);
3712			input_queue_tail_incr_save(sd, qtail);
3713			rps_unlock(sd);
3714			local_irq_restore(flags);
3715			return NET_RX_SUCCESS;
3716		}
3717
3718		/* Schedule NAPI for the backlog device.
3719		 * We can use a non-atomic operation since we own the queue lock.
3720		 */
3721		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3722			if (!rps_ipi_queued(sd))
3723				____napi_schedule(sd, &sd->backlog);
3724		}
3725		goto enqueue;
3726	}
3727
3728drop:
3729	sd->dropped++;
3730	rps_unlock(sd);
3731
3732	local_irq_restore(flags);
3733
3734	atomic_long_inc(&skb->dev->rx_dropped);
3735	kfree_skb(skb);
3736	return NET_RX_DROP;
3737}
3738
3739static int netif_rx_internal(struct sk_buff *skb)
3740{
3741	int ret;
3742
3743	net_timestamp_check(netdev_tstamp_prequeue, skb);
3744
3745	trace_netif_rx(skb);
3746#ifdef CONFIG_RPS
3747	if (static_key_false(&rps_needed)) {
3748		struct rps_dev_flow voidflow, *rflow = &voidflow;
3749		int cpu;
3750
3751		preempt_disable();
3752		rcu_read_lock();
3753
3754		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3755		if (cpu < 0)
3756			cpu = smp_processor_id();
3757
3758		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3759
3760		rcu_read_unlock();
3761		preempt_enable();
3762	} else
3763#endif
3764	{
3765		unsigned int qtail;
3766		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3767		put_cpu();
3768	}
3769	return ret;
3770}
3771
3772/**
3773 *	netif_rx	-	post buffer to the network code
3774 *	@skb: buffer to post
3775 *
3776 *	This function receives a packet from a device driver and queues it for
3777 *	the upper (protocol) levels to process.  It always succeeds. The buffer
3778 *	may be dropped during processing for congestion control or by the
3779 *	protocol layers.
3780 *
3781 *	return values:
3782 *	NET_RX_SUCCESS	(no congestion)
3783 *	NET_RX_DROP     (packet was dropped)
3784 *
3785 */
3786
3787int netif_rx(struct sk_buff *skb)
3788{
3789	trace_netif_rx_entry(skb);
3790
3791	return netif_rx_internal(skb);
3792}
3793EXPORT_SYMBOL(netif_rx);
3794
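/* Illustrative sketch (not part of this file): a non-NAPI driver posts
 * frames from its interrupt handler with netif_rx(); code running in
 * process context should prefer netif_rx_ni() (below) so that pending
 * softirqs get a chance to run.  example_dequeue_rx() is hypothetical.
 */
struct sk_buff *example_dequeue_rx(struct net_device *dev);	/* hypothetical */

static irqreturn_t example_rx_irq(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct sk_buff *skb;

	while ((skb = example_dequeue_rx(dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, dev);
		netif_rx(skb);	/* enqueues to a per-cpu backlog, never blocks */
	}
	return IRQ_HANDLED;
}
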
3795int netif_rx_ni(struct sk_buff *skb)
3796{
3797	int err;
3798
3799	trace_netif_rx_ni_entry(skb);
3800
3801	preempt_disable();
3802	err = netif_rx_internal(skb);
3803	if (local_softirq_pending())
3804		do_softirq();
3805	preempt_enable();
3806
3807	return err;
3808}
3809EXPORT_SYMBOL(netif_rx_ni);
3810
3811static void net_tx_action(struct softirq_action *h)
3812{
3813	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3814
3815	if (sd->completion_queue) {
3816		struct sk_buff *clist;
3817
3818		local_irq_disable();
3819		clist = sd->completion_queue;
3820		sd->completion_queue = NULL;
3821		local_irq_enable();
3822
3823		while (clist) {
3824			struct sk_buff *skb = clist;
3825			clist = clist->next;
3826
3827			WARN_ON(atomic_read(&skb->users));
3828			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3829				trace_consume_skb(skb);
3830			else
3831				trace_kfree_skb(skb, net_tx_action);
3832
3833			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3834				__kfree_skb(skb);
3835			else
3836				__kfree_skb_defer(skb);
3837		}
3838
3839		__kfree_skb_flush();
3840	}
3841
3842	if (sd->output_queue) {
3843		struct Qdisc *head;
3844
3845		local_irq_disable();
3846		head = sd->output_queue;
3847		sd->output_queue = NULL;
3848		sd->output_queue_tailp = &sd->output_queue;
3849		local_irq_enable();
3850
3851		while (head) {
3852			struct Qdisc *q = head;
3853			spinlock_t *root_lock;
3854
3855			head = head->next_sched;
3856
3857			root_lock = qdisc_lock(q);
3858			if (spin_trylock(root_lock)) {
3859				smp_mb__before_atomic();
3860				clear_bit(__QDISC_STATE_SCHED,
3861					  &q->state);
3862				qdisc_run(q);
3863				spin_unlock(root_lock);
3864			} else {
3865				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3866					      &q->state)) {
3867					__netif_reschedule(q);
3868				} else {
3869					smp_mb__before_atomic();
3870					clear_bit(__QDISC_STATE_SCHED,
3871						  &q->state);
3872				}
3873			}
3874		}
3875	}
3876}
3877
3878#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3879    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3880/* This hook is defined here for ATM LANE */
3881int (*br_fdb_test_addr_hook)(struct net_device *dev,
3882			     unsigned char *addr) __read_mostly;
3883EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3884#endif
3885
3886static inline struct sk_buff *
3887sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3888		   struct net_device *orig_dev)
3889{
3890#ifdef CONFIG_NET_CLS_ACT
3891	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3892	struct tcf_result cl_res;
3893
3894	/* If there's at least one ingress present somewhere (so
3895	 * we get here via enabled static key), remaining devices
3896	 * that are not configured with an ingress qdisc will bail
3897	 * out here.
3898	 */
3899	if (!cl)
3900		return skb;
3901	if (*pt_prev) {
3902		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3903		*pt_prev = NULL;
3904	}
3905
3906	qdisc_skb_cb(skb)->pkt_len = skb->len;
3907	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3908	qdisc_bstats_cpu_update(cl->q, skb);
3909
3910	switch (tc_classify(skb, cl, &cl_res, false)) {
3911	case TC_ACT_OK:
3912	case TC_ACT_RECLASSIFY:
3913		skb->tc_index = TC_H_MIN(cl_res.classid);
3914		break;
3915	case TC_ACT_SHOT:
3916		qdisc_qstats_cpu_drop(cl->q);
3917	case TC_ACT_STOLEN:
3918	case TC_ACT_QUEUED:
3919		kfree_skb(skb);
3920		return NULL;
3921	case TC_ACT_REDIRECT:
3922		/* skb_mac_header check was done by cls/act_bpf, so
3923		 * we can safely push the L2 header back before
3924		 * redirecting to another netdev
3925		 */
3926		__skb_push(skb, skb->mac_len);
3927		skb_do_redirect(skb);
3928		return NULL;
3929	default:
3930		break;
3931	}
3932#endif /* CONFIG_NET_CLS_ACT */
3933	return skb;
3934}
3935
3936/**
3937 *	netdev_rx_handler_register - register receive handler
3938 *	@dev: device to register a handler for
3939 *	@rx_handler: receive handler to register
3940 *	@rx_handler_data: data pointer that is used by rx handler
3941 *
3942 *	Register a receive handler for a device. This handler will then be
3943 *	called from __netif_receive_skb. A negative errno code is returned
3944 *	on a failure.
3945 *
3946 *	The caller must hold the rtnl_mutex.
3947 *
3948 *	For a general description of rx_handler, see enum rx_handler_result.
3949 */
3950int netdev_rx_handler_register(struct net_device *dev,
3951			       rx_handler_func_t *rx_handler,
3952			       void *rx_handler_data)
3953{
3954	ASSERT_RTNL();
3955
3956	if (dev->rx_handler)
3957		return -EBUSY;
3958
3959	/* Note: rx_handler_data must be set before rx_handler */
3960	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3961	rcu_assign_pointer(dev->rx_handler, rx_handler);
3962
3963	return 0;
3964}
3965EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3966
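/* Illustrative sketch (not part of this file): an upper device such as
 * a bridge or macvlan hooks a lower device's receive path this way.
 * The per-packet callback tells __netif_receive_skb_core() whether to
 * stop, rerun, or continue; both example_* helpers are hypothetical.
 */
bool example_steal_packet(struct sk_buff *skb);	/* hypothetical */
void example_deliver(struct sk_buff *skb);	/* hypothetical */

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (example_steal_packet(skb)) {
		example_deliver(skb);
		return RX_HANDLER_CONSUMED;	/* skb is ours now */
	}
	return RX_HANDLER_PASS;			/* continue normal processing */
}

static int example_attach(struct net_device *lower, void *priv)
{
	ASSERT_RTNL();	/* registration requires the rtnl_mutex */
	return netdev_rx_handler_register(lower, example_handle_frame, priv);
}
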
3967/**
3968 *	netdev_rx_handler_unregister - unregister receive handler
3969 *	@dev: device to unregister a handler from
3970 *
3971 *	Unregister a receive handler from a device.
3972 *
3973 *	The caller must hold the rtnl_mutex.
3974 */
3975void netdev_rx_handler_unregister(struct net_device *dev)
3976{
3977
3978	ASSERT_RTNL();
3979	RCU_INIT_POINTER(dev->rx_handler, NULL);
3980	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3981	 * section is guaranteed to also see a non-NULL
3982	 * rx_handler_data.
3983	 */
3984	synchronize_net();
3985	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3986}
3987EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3988
3989/*
3990 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3991 * the special handling of PFMEMALLOC skbs.
3992 */
3993static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3994{
3995	switch (skb->protocol) {
3996	case htons(ETH_P_ARP):
3997	case htons(ETH_P_IP):
3998	case htons(ETH_P_IPV6):
3999	case htons(ETH_P_8021Q):
4000	case htons(ETH_P_8021AD):
4001		return true;
4002	default:
4003		return false;
4004	}
4005}
4006
4007static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4008			     int *ret, struct net_device *orig_dev)
4009{
4010#ifdef CONFIG_NETFILTER_INGRESS
4011	if (nf_hook_ingress_active(skb)) {
4012		if (*pt_prev) {
4013			*ret = deliver_skb(skb, *pt_prev, orig_dev);
4014			*pt_prev = NULL;
4015		}
4016
4017		return nf_hook_ingress(skb);
4018	}
4019#endif /* CONFIG_NETFILTER_INGRESS */
4020	return 0;
4021}
4022
4023static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4024{
4025	struct packet_type *ptype, *pt_prev;
4026	rx_handler_func_t *rx_handler;
4027	struct net_device *orig_dev;
4028	bool deliver_exact = false;
4029	int ret = NET_RX_DROP;
4030	__be16 type;
4031
4032	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4033
4034	trace_netif_receive_skb(skb);
4035
4036	orig_dev = skb->dev;
4037
4038	skb_reset_network_header(skb);
4039	if (!skb_transport_header_was_set(skb))
4040		skb_reset_transport_header(skb);
4041	skb_reset_mac_len(skb);
4042
4043	pt_prev = NULL;
4044
4045another_round:
4046	skb->skb_iif = skb->dev->ifindex;
4047
4048	__this_cpu_inc(softnet_data.processed);
4049
4050	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4051	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4052		skb = skb_vlan_untag(skb);
4053		if (unlikely(!skb))
4054			goto out;
4055	}
4056
4057#ifdef CONFIG_NET_CLS_ACT
4058	if (skb->tc_verd & TC_NCLS) {
4059		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4060		goto ncls;
4061	}
4062#endif
4063
4064	if (pfmemalloc)
4065		goto skip_taps;
4066
4067	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4068		if (pt_prev)
4069			ret = deliver_skb(skb, pt_prev, orig_dev);
4070		pt_prev = ptype;
4071	}
4072
4073	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4074		if (pt_prev)
4075			ret = deliver_skb(skb, pt_prev, orig_dev);
4076		pt_prev = ptype;
4077	}
4078
4079skip_taps:
4080#ifdef CONFIG_NET_INGRESS
4081	if (static_key_false(&ingress_needed)) {
4082		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4083		if (!skb)
4084			goto out;
4085
4086		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4087			goto out;
4088	}
4089#endif
4090#ifdef CONFIG_NET_CLS_ACT
4091	skb->tc_verd = 0;
4092ncls:
4093#endif
4094	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4095		goto drop;
4096
4097	if (skb_vlan_tag_present(skb)) {
4098		if (pt_prev) {
4099			ret = deliver_skb(skb, pt_prev, orig_dev);
4100			pt_prev = NULL;
4101		}
4102		if (vlan_do_receive(&skb))
4103			goto another_round;
4104		else if (unlikely(!skb))
4105			goto out;
4106	}
4107
4108	rx_handler = rcu_dereference(skb->dev->rx_handler);
4109	if (rx_handler) {
4110		if (pt_prev) {
4111			ret = deliver_skb(skb, pt_prev, orig_dev);
4112			pt_prev = NULL;
4113		}
4114		switch (rx_handler(&skb)) {
4115		case RX_HANDLER_CONSUMED:
4116			ret = NET_RX_SUCCESS;
4117			goto out;
4118		case RX_HANDLER_ANOTHER:
4119			goto another_round;
4120		case RX_HANDLER_EXACT:
4121			deliver_exact = true;
4122		case RX_HANDLER_PASS:
4123			break;
4124		default:
4125			BUG();
4126		}
4127	}
4128
4129	if (unlikely(skb_vlan_tag_present(skb))) {
4130		if (skb_vlan_tag_get_id(skb))
4131			skb->pkt_type = PACKET_OTHERHOST;
4132		/* Note: we might in the future use prio bits
4133		 * and set skb->priority like in vlan_do_receive().
4134		 * For the time being, just ignore the Priority Code Point.
4135		 */
4136		skb->vlan_tci = 0;
4137	}
4138
4139	type = skb->protocol;
4140
4141	/* deliver only exact match when indicated */
4142	if (likely(!deliver_exact)) {
4143		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4144				       &ptype_base[ntohs(type) &
4145						   PTYPE_HASH_MASK]);
4146	}
4147
4148	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4149			       &orig_dev->ptype_specific);
4150
4151	if (unlikely(skb->dev != orig_dev)) {
4152		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4153				       &skb->dev->ptype_specific);
4154	}
4155
4156	if (pt_prev) {
4157		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4158			goto drop;
4159		else
4160			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4161	} else {
4162drop:
4163		if (!deliver_exact)
4164			atomic_long_inc(&skb->dev->rx_dropped);
4165		else
4166			atomic_long_inc(&skb->dev->rx_nohandler);
4167		kfree_skb(skb);
4168		/* Jamal, now you will not be able to escape explaining
4169		 * to me how you were going to use this. :-)
4170		 */
4171		ret = NET_RX_DROP;
4172	}
4173
4174out:
4175	return ret;
4176}
4177
4178static int __netif_receive_skb(struct sk_buff *skb)
4179{
4180	int ret;
4181
4182	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4183		unsigned long pflags = current->flags;
4184
4185		/*
4186		 * PFMEMALLOC skbs are special, they should
4187		 * - be delivered to SOCK_MEMALLOC sockets only
4188		 * - stay away from userspace
4189		 * - have bounded memory usage
4190		 *
4191		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4192		 * context down to all allocation sites.
4193		 */
4194		current->flags |= PF_MEMALLOC;
4195		ret = __netif_receive_skb_core(skb, true);
4196		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4197	} else
4198		ret = __netif_receive_skb_core(skb, false);
4199
4200	return ret;
4201}
4202
4203static int netif_receive_skb_internal(struct sk_buff *skb)
4204{
4205	int ret;
4206
4207	net_timestamp_check(netdev_tstamp_prequeue, skb);
4208
4209	if (skb_defer_rx_timestamp(skb))
4210		return NET_RX_SUCCESS;
4211
4212	rcu_read_lock();
4213
4214#ifdef CONFIG_RPS
4215	if (static_key_false(&rps_needed)) {
4216		struct rps_dev_flow voidflow, *rflow = &voidflow;
4217		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4218
4219		if (cpu >= 0) {
4220			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4221			rcu_read_unlock();
4222			return ret;
4223		}
4224	}
4225#endif
4226	ret = __netif_receive_skb(skb);
4227	rcu_read_unlock();
4228	return ret;
4229}
4230
4231/**
4232 *	netif_receive_skb - process receive buffer from network
4233 *	@skb: buffer to process
4234 *
4235 *	netif_receive_skb() is the main receive data processing function.
4236 *	It always succeeds. The buffer may be dropped during processing
4237 *	for congestion control or by the protocol layers.
4238 *
4239 *	This function may only be called from softirq context and interrupts
4240 *	should be enabled.
4241 *
4242 *	Return values (usually ignored):
4243 *	NET_RX_SUCCESS: no congestion
4244 *	NET_RX_DROP: packet was dropped
4245 */
4246int netif_receive_skb(struct sk_buff *skb)
4247{
4248	trace_netif_receive_skb_entry(skb);
4249
4250	return netif_receive_skb_internal(skb);
4251}
4252EXPORT_SYMBOL(netif_receive_skb);
4253
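/* Illustrative sketch (not part of this file): netif_receive_skb()
 * delivers synchronously, so unlike netif_rx() it may only be called
 * from softirq context, typically a NAPI poll handler.
 * example_dequeue_rx_ring() is hypothetical.
 */
struct sk_buff *example_dequeue_rx_ring(struct napi_struct *napi); /* hypothetical */

static int example_poll_deliver_one(struct napi_struct *napi)
{
	struct sk_buff *skb = example_dequeue_rx_ring(napi);

	if (!skb)
		return 0;
	skb->protocol = eth_type_trans(skb, napi->dev);
	netif_receive_skb(skb);	/* fully processed before this returns */
	return 1;
}
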
4254/* Network device is going away, flush any packets still pending
4255 * Called with irqs disabled.
4256 */
4257static void flush_backlog(void *arg)
4258{
4259	struct net_device *dev = arg;
4260	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4261	struct sk_buff *skb, *tmp;
4262
4263	rps_lock(sd);
4264	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4265		if (skb->dev == dev) {
4266			__skb_unlink(skb, &sd->input_pkt_queue);
4267			kfree_skb(skb);
4268			input_queue_head_incr(sd);
4269		}
4270	}
4271	rps_unlock(sd);
4272
4273	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4274		if (skb->dev == dev) {
4275			__skb_unlink(skb, &sd->process_queue);
4276			kfree_skb(skb);
4277			input_queue_head_incr(sd);
4278		}
4279	}
4280}
4281
4282static int napi_gro_complete(struct sk_buff *skb)
4283{
4284	struct packet_offload *ptype;
4285	__be16 type = skb->protocol;
4286	struct list_head *head = &offload_base;
4287	int err = -ENOENT;
4288
4289	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4290
4291	if (NAPI_GRO_CB(skb)->count == 1) {
4292		skb_shinfo(skb)->gso_size = 0;
4293		goto out;
4294	}
4295
4296	rcu_read_lock();
4297	list_for_each_entry_rcu(ptype, head, list) {
4298		if (ptype->type != type || !ptype->callbacks.gro_complete)
4299			continue;
4300
4301		err = ptype->callbacks.gro_complete(skb, 0);
4302		break;
4303	}
4304	rcu_read_unlock();
4305
4306	if (err) {
4307		WARN_ON(&ptype->list == head);
4308		kfree_skb(skb);
4309		return NET_RX_SUCCESS;
4310	}
4311
4312out:
4313	return netif_receive_skb_internal(skb);
4314}
4315
4316/* napi->gro_list contains packets ordered by age, with the
4317 * youngest packets at the head of it.
4318 * Complete skbs in reverse order to reduce latencies.
4319 */
4320void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4321{
4322	struct sk_buff *skb, *prev = NULL;
4323
4324	/* scan list and build reverse chain */
4325	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4326		skb->prev = prev;
4327		prev = skb;
4328	}
4329
4330	for (skb = prev; skb; skb = prev) {
4331		skb->next = NULL;
4332
4333		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4334			return;
4335
4336		prev = skb->prev;
4337		napi_gro_complete(skb);
4338		napi->gro_count--;
4339	}
4340
4341	napi->gro_list = NULL;
4342}
4343EXPORT_SYMBOL(napi_gro_flush);
4344
4345static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4346{
4347	struct sk_buff *p;
4348	unsigned int maclen = skb->dev->hard_header_len;
4349	u32 hash = skb_get_hash_raw(skb);
4350
4351	for (p = napi->gro_list; p; p = p->next) {
4352		unsigned long diffs;
4353
4354		NAPI_GRO_CB(p)->flush = 0;
4355
4356		if (hash != skb_get_hash_raw(p)) {
4357			NAPI_GRO_CB(p)->same_flow = 0;
4358			continue;
4359		}
4360
4361		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4362		diffs |= p->vlan_tci ^ skb->vlan_tci;
4363		diffs |= skb_metadata_dst_cmp(p, skb);
4364		if (maclen == ETH_HLEN)
4365			diffs |= compare_ether_header(skb_mac_header(p),
4366						      skb_mac_header(skb));
4367		else if (!diffs)
4368			diffs = memcmp(skb_mac_header(p),
4369				       skb_mac_header(skb),
4370				       maclen);
4371		NAPI_GRO_CB(p)->same_flow = !diffs;
4372	}
4373}
4374
4375static void skb_gro_reset_offset(struct sk_buff *skb)
4376{
4377	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4378	const skb_frag_t *frag0 = &pinfo->frags[0];
4379
4380	NAPI_GRO_CB(skb)->data_offset = 0;
4381	NAPI_GRO_CB(skb)->frag0 = NULL;
4382	NAPI_GRO_CB(skb)->frag0_len = 0;
4383
4384	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4385	    pinfo->nr_frags &&
4386	    !PageHighMem(skb_frag_page(frag0))) {
4387		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4388		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4389	}
4390}
4391
4392static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4393{
4394	struct skb_shared_info *pinfo = skb_shinfo(skb);
4395
4396	BUG_ON(skb->end - skb->tail < grow);
4397
4398	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4399
4400	skb->data_len -= grow;
4401	skb->tail += grow;
4402
4403	pinfo->frags[0].page_offset += grow;
4404	skb_frag_size_sub(&pinfo->frags[0], grow);
4405
4406	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4407		skb_frag_unref(skb, 0);
4408		memmove(pinfo->frags, pinfo->frags + 1,
4409			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4410	}
4411}
4412
4413static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4414{
4415	struct sk_buff **pp = NULL;
4416	struct packet_offload *ptype;
4417	__be16 type = skb->protocol;
4418	struct list_head *head = &offload_base;
4419	int same_flow;
4420	enum gro_result ret;
4421	int grow;
4422
4423	if (!(skb->dev->features & NETIF_F_GRO))
4424		goto normal;
4425
4426	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4427		goto normal;
4428
4429	gro_list_prepare(napi, skb);
4430
4431	rcu_read_lock();
4432	list_for_each_entry_rcu(ptype, head, list) {
4433		if (ptype->type != type || !ptype->callbacks.gro_receive)
4434			continue;
4435
4436		skb_set_network_header(skb, skb_gro_offset(skb));
4437		skb_reset_mac_len(skb);
4438		NAPI_GRO_CB(skb)->same_flow = 0;
4439		NAPI_GRO_CB(skb)->flush = 0;
4440		NAPI_GRO_CB(skb)->free = 0;
4441		NAPI_GRO_CB(skb)->encap_mark = 0;
4442		NAPI_GRO_CB(skb)->is_fou = 0;
4443		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4444
4445		/* Setup for GRO checksum validation */
4446		switch (skb->ip_summed) {
4447		case CHECKSUM_COMPLETE:
4448			NAPI_GRO_CB(skb)->csum = skb->csum;
4449			NAPI_GRO_CB(skb)->csum_valid = 1;
4450			NAPI_GRO_CB(skb)->csum_cnt = 0;
4451			break;
4452		case CHECKSUM_UNNECESSARY:
4453			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4454			NAPI_GRO_CB(skb)->csum_valid = 0;
4455			break;
4456		default:
4457			NAPI_GRO_CB(skb)->csum_cnt = 0;
4458			NAPI_GRO_CB(skb)->csum_valid = 0;
4459		}
4460
4461		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4462		break;
4463	}
4464	rcu_read_unlock();
4465
4466	if (&ptype->list == head)
4467		goto normal;
4468
4469	same_flow = NAPI_GRO_CB(skb)->same_flow;
4470	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4471
4472	if (pp) {
4473		struct sk_buff *nskb = *pp;
4474
4475		*pp = nskb->next;
4476		nskb->next = NULL;
4477		napi_gro_complete(nskb);
4478		napi->gro_count--;
4479	}
4480
4481	if (same_flow)
4482		goto ok;
4483
4484	if (NAPI_GRO_CB(skb)->flush)
4485		goto normal;
4486
4487	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4488		struct sk_buff *nskb = napi->gro_list;
4489
4490		/* locate the end of the list to select the 'oldest' flow */
4491		while (nskb->next) {
4492			pp = &nskb->next;
4493			nskb = *pp;
4494		}
4495		*pp = NULL;
4496		nskb->next = NULL;
4497		napi_gro_complete(nskb);
4498	} else {
4499		napi->gro_count++;
4500	}
4501	NAPI_GRO_CB(skb)->count = 1;
4502	NAPI_GRO_CB(skb)->age = jiffies;
4503	NAPI_GRO_CB(skb)->last = skb;
4504	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4505	skb->next = napi->gro_list;
4506	napi->gro_list = skb;
4507	ret = GRO_HELD;
4508
4509pull:
4510	grow = skb_gro_offset(skb) - skb_headlen(skb);
4511	if (grow > 0)
4512		gro_pull_from_frag0(skb, grow);
4513ok:
4514	return ret;
4515
4516normal:
4517	ret = GRO_NORMAL;
4518	goto pull;
4519}
4520
4521struct packet_offload *gro_find_receive_by_type(__be16 type)
4522{
4523	struct list_head *offload_head = &offload_base;
4524	struct packet_offload *ptype;
4525
4526	list_for_each_entry_rcu(ptype, offload_head, list) {
4527		if (ptype->type != type || !ptype->callbacks.gro_receive)
4528			continue;
4529		return ptype;
4530	}
4531	return NULL;
4532}
4533EXPORT_SYMBOL(gro_find_receive_by_type);
4534
4535struct packet_offload *gro_find_complete_by_type(__be16 type)
4536{
4537	struct list_head *offload_head = &offload_base;
4538	struct packet_offload *ptype;
4539
4540	list_for_each_entry_rcu(ptype, offload_head, list) {
4541		if (ptype->type != type || !ptype->callbacks.gro_complete)
4542			continue;
4543		return ptype;
4544	}
4545	return NULL;
4546}
4547EXPORT_SYMBOL(gro_find_complete_by_type);
4548
4549static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4550{
4551	switch (ret) {
4552	case GRO_NORMAL:
4553		if (netif_receive_skb_internal(skb))
4554			ret = GRO_DROP;
4555		break;
4556
4557	case GRO_DROP:
4558		kfree_skb(skb);
4559		break;
4560
4561	case GRO_MERGED_FREE:
4562		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4563			skb_dst_drop(skb);
4564			kmem_cache_free(skbuff_head_cache, skb);
4565		} else {
4566			__kfree_skb(skb);
4567		}
4568		break;
4569
4570	case GRO_HELD:
4571	case GRO_MERGED:
4572		break;
4573	}
4574
4575	return ret;
4576}
4577
4578gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4579{
4580	skb_mark_napi_id(skb, napi);
4581	trace_napi_gro_receive_entry(skb);
4582
4583	skb_gro_reset_offset(skb);
4584
4585	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4586}
4587EXPORT_SYMBOL(napi_gro_receive);
4588
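/* Illustrative sketch (not part of this file): enabling GRO in a
 * driver amounts to substituting napi_gro_receive() for
 * netif_receive_skb() in its poll loop; the skb may then be merged
 * into an existing flow on napi->gro_list instead of being delivered
 * immediately.
 */
static void example_gro_deliver(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);	/* consumed: delivered, held or merged */
}
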
4589static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4590{
4591	if (unlikely(skb->pfmemalloc)) {
4592		consume_skb(skb);
4593		return;
4594	}
4595	__skb_pull(skb, skb_headlen(skb));
4596	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4597	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4598	skb->vlan_tci = 0;
4599	skb->dev = napi->dev;
4600	skb->skb_iif = 0;
4601	skb->encapsulation = 0;
4602	skb_shinfo(skb)->gso_type = 0;
4603	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4604
4605	napi->skb = skb;
4606}
4607
4608struct sk_buff *napi_get_frags(struct napi_struct *napi)
4609{
4610	struct sk_buff *skb = napi->skb;
4611
4612	if (!skb) {
4613		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4614		if (skb) {
4615			napi->skb = skb;
4616			skb_mark_napi_id(skb, napi);
4617		}
4618	}
4619	return skb;
4620}
4621EXPORT_SYMBOL(napi_get_frags);
4622
4623static gro_result_t napi_frags_finish(struct napi_struct *napi,
4624				      struct sk_buff *skb,
4625				      gro_result_t ret)
4626{
4627	switch (ret) {
4628	case GRO_NORMAL:
4629	case GRO_HELD:
4630		__skb_push(skb, ETH_HLEN);
4631		skb->protocol = eth_type_trans(skb, skb->dev);
4632		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4633			ret = GRO_DROP;
4634		break;
4635
4636	case GRO_DROP:
4637	case GRO_MERGED_FREE:
4638		napi_reuse_skb(napi, skb);
4639		break;
4640
4641	case GRO_MERGED:
4642		break;
4643	}
4644
4645	return ret;
4646}
4647
4648/* The upper GRO stack assumes the network header starts at gro_offset=0.
4649 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4650 * we copy the ethernet header into skb->data to have a common layout.
4651 */
4652static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4653{
4654	struct sk_buff *skb = napi->skb;
4655	const struct ethhdr *eth;
4656	unsigned int hlen = sizeof(*eth);
4657
4658	napi->skb = NULL;
4659
4660	skb_reset_mac_header(skb);
4661	skb_gro_reset_offset(skb);
4662
4663	eth = skb_gro_header_fast(skb, 0);
4664	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4665		eth = skb_gro_header_slow(skb, hlen, 0);
4666		if (unlikely(!eth)) {
4667			napi_reuse_skb(napi, skb);
4668			return NULL;
4669		}
4670	} else {
4671		gro_pull_from_frag0(skb, hlen);
4672		NAPI_GRO_CB(skb)->frag0 += hlen;
4673		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4674	}
4675	__skb_pull(skb, hlen);
4676
4677	/*
4678	 * This works because the only protocols we care about don't require
4679	 * special handling.
4680	 * We'll fix it up properly in napi_frags_finish()
4681	 */
4682	skb->protocol = eth->h_proto;
4683
4684	return skb;
4685}
4686
4687gro_result_t napi_gro_frags(struct napi_struct *napi)
4688{
4689	struct sk_buff *skb = napi_frags_skb(napi);
4690
4691	if (!skb)
4692		return GRO_DROP;
4693
4694	trace_napi_gro_frags_entry(skb);
4695
4696	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4697}
4698EXPORT_SYMBOL(napi_gro_frags);
4699
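/* Illustrative sketch (not part of this file): the frag-based GRO API
 * spares the driver from building a linear header copy.  The skb is
 * borrowed from napi_get_frags(), the received page is attached, and
 * napi_frags_skb() above pulls the ethernet header once the skb is
 * handed back through napi_gro_frags().
 */
static int example_rx_frag(struct napi_struct *napi, struct page *page,
			   unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return -ENOMEM;

	/* attach the buffer; len/data_len/truesize are updated for us */
	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);

	napi_gro_frags(napi);	/* consumes (or recycles) the skb */
	return 0;
}
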
4700/* Compute the checksum from gro_offset and return the folded value
4701 * after adding in any pseudo checksum.
4702 */
4703__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4704{
4705	__wsum wsum;
4706	__sum16 sum;
4707
4708	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4709
4710	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4711	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4712	if (likely(!sum)) {
4713		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4714		    !skb->csum_complete_sw)
4715			netdev_rx_csum_fault(skb->dev);
4716	}
4717
4718	NAPI_GRO_CB(skb)->csum = wsum;
4719	NAPI_GRO_CB(skb)->csum_valid = 1;
4720
4721	return sum;
4722}
4723EXPORT_SYMBOL(__skb_gro_checksum_complete);
4724
4725/*
4726 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4727 * Note: called with local irq disabled, but exits with local irq enabled.
4728 */
4729static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4730{
4731#ifdef CONFIG_RPS
4732	struct softnet_data *remsd = sd->rps_ipi_list;
4733
4734	if (remsd) {
4735		sd->rps_ipi_list = NULL;
4736
4737		local_irq_enable();
4738
4739		/* Send pending IPIs to kick RPS processing on remote cpus. */
4740		while (remsd) {
4741			struct softnet_data *next = remsd->rps_ipi_next;
4742
4743			if (cpu_online(remsd->cpu))
4744				smp_call_function_single_async(remsd->cpu,
4745							   &remsd->csd);
4746			remsd = next;
4747		}
4748	} else
4749#endif
4750		local_irq_enable();
4751}
4752
4753static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4754{
4755#ifdef CONFIG_RPS
4756	return sd->rps_ipi_list != NULL;
4757#else
4758	return false;
4759#endif
4760}
4761
4762static int process_backlog(struct napi_struct *napi, int quota)
4763{
4764	int work = 0;
4765	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4766
4767	/* Check if we have pending IPIs; it's better to send them now
4768	 * than to wait for net_rx_action() to end.
4769	 */
4770	if (sd_has_rps_ipi_waiting(sd)) {
4771		local_irq_disable();
4772		net_rps_action_and_irq_enable(sd);
4773	}
4774
4775	napi->weight = weight_p;
4776	local_irq_disable();
4777	while (1) {
4778		struct sk_buff *skb;
4779
4780		while ((skb = __skb_dequeue(&sd->process_queue))) {
4781			rcu_read_lock();
4782			local_irq_enable();
4783			__netif_receive_skb(skb);
4784			rcu_read_unlock();
4785			local_irq_disable();
4786			input_queue_head_incr(sd);
4787			if (++work >= quota) {
4788				local_irq_enable();
4789				return work;
4790			}
4791		}
4792
4793		rps_lock(sd);
4794		if (skb_queue_empty(&sd->input_pkt_queue)) {
4795			/*
4796			 * Inline a custom version of __napi_complete().
4797			 * Only the current cpu owns and manipulates this napi,
4798			 * and NAPI_STATE_SCHED is the only possible flag set
4799			 * on backlog.
4800			 * We can use a plain write instead of clear_bit(),
4801			 * and we don't need an smp_mb() memory barrier.
4802			 */
4803			napi->state = 0;
4804			rps_unlock(sd);
4805
4806			break;
4807		}
4808
4809		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4810					   &sd->process_queue);
4811		rps_unlock(sd);
4812	}
4813	local_irq_enable();
4814
4815	return work;
4816}
4817
4818/**
4819 * __napi_schedule - schedule for receive
4820 * @n: entry to schedule
4821 *
4822 * The entry's receive function will be scheduled to run.
4823 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4824 */
4825void __napi_schedule(struct napi_struct *n)
4826{
4827	unsigned long flags;
4828
4829	local_irq_save(flags);
4830	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4831	local_irq_restore(flags);
4832}
4833EXPORT_SYMBOL(__napi_schedule);
4834
4835/**
4836 * __napi_schedule_irqoff - schedule for receive
4837 * @n: entry to schedule
4838 *
4839 * Variant of __napi_schedule() assuming hard irqs are masked
4840 */
4841void __napi_schedule_irqoff(struct napi_struct *n)
4842{
4843	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4844}
4845EXPORT_SYMBOL(__napi_schedule_irqoff);
4846
4847void __napi_complete(struct napi_struct *n)
4848{
4849	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4850
4851	list_del_init(&n->poll_list);
4852	smp_mb__before_atomic();
4853	clear_bit(NAPI_STATE_SCHED, &n->state);
4854}
4855EXPORT_SYMBOL(__napi_complete);
4856
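/**
 * napi_complete_done - NAPI processing complete
 * @n: NAPI context
 * @work_done: number of packets processed in this NAPI poll
 *
 * Marks NAPI processing as complete: flushes the GRO lists (or arms the
 * gro_flush_timeout hrtimer when the device requests it) and clears
 * NAPI_STATE_SCHED so the instance can be rescheduled.
 */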
4857void napi_complete_done(struct napi_struct *n, int work_done)
4858{
4859	unsigned long flags;
4860
4861	/*
4862	 * don't let napi dequeue from the cpu poll list
4863	 * just in case it's running on a different cpu
4864	 */
4865	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4866		return;
4867
4868	if (n->gro_list) {
4869		unsigned long timeout = 0;
4870
4871		if (work_done)
4872			timeout = n->dev->gro_flush_timeout;
4873
4874		if (timeout)
4875			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4876				      HRTIMER_MODE_REL_PINNED);
4877		else
4878			napi_gro_flush(n, false);
4879	}
4880	if (likely(list_empty(&n->poll_list))) {
4881		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4882	} else {
4883		/* If n->poll_list is not empty, we need to mask irqs */
4884		local_irq_save(flags);
4885		__napi_complete(n);
4886		local_irq_restore(flags);
4887	}
4888}
4889EXPORT_SYMBOL(napi_complete_done);
4890
4891/* must be called under rcu_read_lock(), as we don't take a reference */
4892static struct napi_struct *napi_by_id(unsigned int napi_id)
4893{
4894	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4895	struct napi_struct *napi;
4896
4897	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4898		if (napi->napi_id == napi_id)
4899			return napi;
4900
4901	return NULL;
4902}
4903
4904#if defined(CONFIG_NET_RX_BUSY_POLL)
4905#define BUSY_POLL_BUDGET 8
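/* Busy-poll the napi instance associated with @sk until a packet lands
 * on sk->sk_receive_queue, the busy-poll window expires or a reschedule
 * is needed.  Returns true if the receive queue is non-empty.
 */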
4906bool sk_busy_loop(struct sock *sk, int nonblock)
4907{
4908	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4909	int (*busy_poll)(struct napi_struct *dev);
4910	struct napi_struct *napi;
4911	int rc = false;
4912
4913	rcu_read_lock();
4914
4915	napi = napi_by_id(sk->sk_napi_id);
4916	if (!napi)
4917		goto out;
4918
4919	/* Note: ndo_busy_poll method is optional in linux-4.5 */
4920	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4921
4922	do {
4923		rc = 0;
4924		local_bh_disable();
4925		if (busy_poll) {
4926			rc = busy_poll(napi);
4927		} else if (napi_schedule_prep(napi)) {
4928			void *have = netpoll_poll_lock(napi);
4929
4930			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4931				rc = napi->poll(napi, BUSY_POLL_BUDGET);
4932				trace_napi_poll(napi);
4933				if (rc == BUSY_POLL_BUDGET) {
4934					napi_complete_done(napi, rc);
4935					napi_schedule(napi);
4936				}
4937			}
4938			netpoll_poll_unlock(have);
4939		}
4940		if (rc > 0)
4941			NET_ADD_STATS_BH(sock_net(sk),
4942					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4943		local_bh_enable();
4944
4945		if (rc == LL_FLUSH_FAILED)
4946			break; /* permanent failure */
4947
4948		cpu_relax();
4949	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4950		 !need_resched() && !busy_loop_timeout(end_time));
4951
4952	rc = !skb_queue_empty(&sk->sk_receive_queue);
4953out:
4954	rcu_read_unlock();
4955	return rc;
4956}
4957EXPORT_SYMBOL(sk_busy_loop);
4958
4959#endif /* CONFIG_NET_RX_BUSY_POLL */
4960
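/* Publish @napi in the global napi_hash so busy polling can look it up
 * by id.  Ids in the 0..NR_CPUS+1 range are skipped because that range
 * is reserved for sender_cpu use.
 */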
4961void napi_hash_add(struct napi_struct *napi)
4962{
4963	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
4964	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4965		return;
4966
4967	spin_lock(&napi_hash_lock);
4968
4969	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4970	do {
4971		if (unlikely(++napi_gen_id < NR_CPUS + 1))
4972			napi_gen_id = NR_CPUS + 1;
4973	} while (napi_by_id(napi_gen_id));
4974	napi->napi_id = napi_gen_id;
4975
4976	hlist_add_head_rcu(&napi->napi_hash_node,
4977			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4978
4979	spin_unlock(&napi_hash_lock);
4980}
4981EXPORT_SYMBOL_GPL(napi_hash_add);
4982
4983/* Warning: the caller is responsible for making sure an rcu grace
4984 * period is respected before freeing the memory containing @napi.
4985 */
4986bool napi_hash_del(struct napi_struct *napi)
4987{
4988	bool rcu_sync_needed = false;
4989
4990	spin_lock(&napi_hash_lock);
4991
4992	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
4993		rcu_sync_needed = true;
4994		hlist_del_rcu(&napi->napi_hash_node);
4995	}
4996	spin_unlock(&napi_hash_lock);
4997	return rcu_sync_needed;
4998}
4999EXPORT_SYMBOL_GPL(napi_hash_del);
5000
5001static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5002{
5003	struct napi_struct *napi;
5004
5005	napi = container_of(timer, struct napi_struct, timer);
5006	if (napi->gro_list)
5007		napi_schedule(napi);
5008
5009	return HRTIMER_NORESTART;
5010}
5011
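/**
 * netif_napi_add - initialize a NAPI context
 * @dev: network device
 * @napi: NAPI context
 * @poll: polling function
 * @weight: default weight
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to
 * calling *any* of the other NAPI-related functions.
 *
 * Typical driver usage (an illustrative sketch only; the mydrv_* names
 * are hypothetical and not part of this file):
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct mydrv_priv *priv =
 *			container_of(napi, struct mydrv_priv, napi);
 *		int work_done = mydrv_clean_rx_ring(priv, budget);
 *
 *		if (work_done < budget) {
 *			napi_complete_done(napi, work_done);
 *			mydrv_enable_rx_irq(priv);
 *		}
 *		return work_done;
 *	}
 *
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
 *
 * with napi_schedule(&priv->napi) then called from the rx interrupt
 * handler.
 */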
5012void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5013		    int (*poll)(struct napi_struct *, int), int weight)
5014{
5015	INIT_LIST_HEAD(&napi->poll_list);
5016	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5017	napi->timer.function = napi_watchdog;
5018	napi->gro_count = 0;
5019	napi->gro_list = NULL;
5020	napi->skb = NULL;
5021	napi->poll = poll;
5022	if (weight > NAPI_POLL_WEIGHT)
5023		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5024			    weight, dev->name);
5025	napi->weight = weight;
5026	list_add(&napi->dev_list, &dev->napi_list);
5027	napi->dev = dev;
5028#ifdef CONFIG_NETPOLL
5029	spin_lock_init(&napi->poll_lock);
5030	napi->poll_owner = -1;
5031#endif
5032	set_bit(NAPI_STATE_SCHED, &napi->state);
5033	napi_hash_add(napi);
5034}
5035EXPORT_SYMBOL(netif_napi_add);
5036
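/**
 * napi_disable - prevent NAPI from scheduling
 * @n: NAPI context
 *
 * Stop NAPI from being scheduled on this context.
 * Waits till any outstanding processing completes.
 */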
5037void napi_disable(struct napi_struct *n)
5038{
5039	might_sleep();
5040	set_bit(NAPI_STATE_DISABLE, &n->state);
5041
5042	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5043		msleep(1);
5044	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5045		msleep(1);
5046
5047	hrtimer_cancel(&n->timer);
5048
5049	clear_bit(NAPI_STATE_DISABLE, &n->state);
5050}
5051EXPORT_SYMBOL(napi_disable);
5052
5053/* Must be called in process context */
5054void netif_napi_del(struct napi_struct *napi)
5055{
5056	might_sleep();
5057	if (napi_hash_del(napi))
5058		synchronize_net();
5059	list_del_init(&napi->dev_list);
5060	napi_free_frags(napi);
5061
5062	kfree_skb_list(napi->gro_list);
5063	napi->gro_list = NULL;
5064	napi->gro_count = 0;
5065}
5066EXPORT_SYMBOL(netif_napi_del);
5067
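/* Run one poll cycle for @n: invoke ->poll() with the instance's weight
 * and either complete the instance or, when the whole weight was
 * consumed, move it to @repoll.  Returns the amount of work done.
 */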
5068static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5069{
5070	void *have;
5071	int work, weight;
5072
5073	list_del_init(&n->poll_list);
5074
5075	have = netpoll_poll_lock(n);
5076
5077	weight = n->weight;
5078
5079	/* This NAPI_STATE_SCHED test is for avoiding a race
5080	 * with netpoll's poll_napi().  Only the entity which
5081	 * obtains the lock and sees NAPI_STATE_SCHED set will
5082	 * actually make the ->poll() call.  Therefore we avoid
5083	 * accidentally calling ->poll() when NAPI is not scheduled.
5084	 */
5085	work = 0;
5086	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5087		work = n->poll(n, weight);
5088		trace_napi_poll(n);
5089	}
5090
5091	WARN_ON_ONCE(work > weight);
5092
5093	if (likely(work < weight))
5094		goto out_unlock;
5095
5096	/* Drivers must not modify the NAPI state if they
5097	 * consume the entire weight.  In such cases this code
5098	 * still "owns" the NAPI instance and therefore can
5099	 * move the instance around on the list at-will.
5100	 */
5101	if (unlikely(napi_disable_pending(n))) {
5102		napi_complete(n);
5103		goto out_unlock;
5104	}
5105
5106	if (n->gro_list) {
5107		/* flush too old packets
5108		 * If HZ < 1000, flush all packets.
5109		 */
5110		napi_gro_flush(n, HZ >= 1000);
5111	}
5112
5113	/* Some drivers may have called napi_schedule
5114	 * prior to exhausting their budget.
5115	 */
5116	if (unlikely(!list_empty(&n->poll_list))) {
5117		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5118			     n->dev ? n->dev->name : "backlog");
5119		goto out_unlock;
5120	}
5121
5122	list_add_tail(&n->poll_list, repoll);
5123
5124out_unlock:
5125	netpoll_poll_unlock(have);
5126
5127	return work;
5128}
5129
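/* NET_RX_SOFTIRQ handler: poll every scheduled NAPI instance until
 * netdev_budget or the two-jiffies time limit is exhausted, then
 * reschedule the softirq for whatever is left over.
 */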
5130static void net_rx_action(struct softirq_action *h)
5131{
5132	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5133	unsigned long time_limit = jiffies + 2;
5134	int budget = netdev_budget;
5135	LIST_HEAD(list);
5136	LIST_HEAD(repoll);
5137
5138	local_irq_disable();
5139	list_splice_init(&sd->poll_list, &list);
5140	local_irq_enable();
5141
5142	for (;;) {
5143		struct napi_struct *n;
5144
5145		if (list_empty(&list)) {
5146			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5147				return;
5148			break;
5149		}
5150
5151		n = list_first_entry(&list, struct napi_struct, poll_list);
5152		budget -= napi_poll(n, &repoll);
5153
5154		/* If the softirq window is exhausted then punt.
5155		 * Allow this to run for 2 jiffies, which allows
5156		 * an average latency of 1.5/HZ.
5157		 */
5158		if (unlikely(budget <= 0 ||
5159			     time_after_eq(jiffies, time_limit))) {
5160			sd->time_squeeze++;
5161			break;
5162		}
5163	}
5164
5165	__kfree_skb_flush();
5166	local_irq_disable();
5167
5168	list_splice_tail_init(&sd->poll_list, &list);
5169	list_splice_tail(&repoll, &list);
5170	list_splice(&list, &sd->poll_list);
5171	if (!list_empty(&sd->poll_list))
5172		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5173
5174	net_rps_action_and_irq_enable(sd);
5175}
5176
5177struct netdev_adjacent {
5178	struct net_device *dev;
5179
5180	/* upper master flag, there can only be one master device per list */
5181	bool master;
5182
5183	/* counter for the number of times this device was added to us */
5184	u16 ref_nr;
5185
5186	/* private field for the users */
5187	void *private;
5188
5189	struct list_head list;
5190	struct rcu_head rcu;
5191};
5192
5193static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5194						 struct list_head *adj_list)
5195{
5196	struct netdev_adjacent *adj;
5197
5198	list_for_each_entry(adj, adj_list, list) {
5199		if (adj->dev == adj_dev)
5200			return adj;
5201	}
5202	return NULL;
5203}
5204
5205/**
5206 * netdev_has_upper_dev - Check if device is linked to an upper device
5207 * @dev: device
5208 * @upper_dev: upper device to check
5209 *
5210 * Find out if a device is linked to the specified upper device and return
5211 * true in case it is. Note that this checks only the immediate upper device,
5212 * not through a complete stack of devices. The caller must hold the RTNL lock.
5213 */
5214bool netdev_has_upper_dev(struct net_device *dev,
5215			  struct net_device *upper_dev)
5216{
5217	ASSERT_RTNL();
5218
5219	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5220}
5221EXPORT_SYMBOL(netdev_has_upper_dev);
5222
5223/**
5224 * netdev_has_any_upper_dev - Check if device is linked to some device
5225 * @dev: device
5226 *
5227 * Find out if a device is linked to an upper device and return true in case
5228 * it is. The caller must hold the RTNL lock.
5229 */
5230static bool netdev_has_any_upper_dev(struct net_device *dev)
5231{
5232	ASSERT_RTNL();
5233
5234	return !list_empty(&dev->all_adj_list.upper);
5235}
5236
5237/**
5238 * netdev_master_upper_dev_get - Get master upper device
5239 * @dev: device
5240 *
5241 * Find a master upper device and return pointer to it or NULL in case
5242 * it's not there. The caller must hold the RTNL lock.
5243 */
5244struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5245{
5246	struct netdev_adjacent *upper;
5247
5248	ASSERT_RTNL();
5249
5250	if (list_empty(&dev->adj_list.upper))
5251		return NULL;
5252
5253	upper = list_first_entry(&dev->adj_list.upper,
5254				 struct netdev_adjacent, list);
5255	if (likely(upper->master))
5256		return upper->dev;
5257	return NULL;
5258}
5259EXPORT_SYMBOL(netdev_master_upper_dev_get);
5260
5261void *netdev_adjacent_get_private(struct list_head *adj_list)
5262{
5263	struct netdev_adjacent *adj;
5264
5265	adj = list_entry(adj_list, struct netdev_adjacent, list);
5266
5267	return adj->private;
5268}
5269EXPORT_SYMBOL(netdev_adjacent_get_private);
5270
5271/**
5272 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5273 * @dev: device
5274 * @iter: list_head ** of the current position
5275 *
5276 * Gets the next device from the dev's upper list, starting from iter
5277 * position. The caller must hold RCU read lock.
5278 */
5279struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5280						 struct list_head **iter)
5281{
5282	struct netdev_adjacent *upper;
5283
5284	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5285
5286	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5287
5288	if (&upper->list == &dev->adj_list.upper)
5289		return NULL;
5290
5291	*iter = &upper->list;
5292
5293	return upper->dev;
5294}
5295EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5296
5297/**
5298 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5299 * @dev: device
5300 * @iter: list_head ** of the current position
5301 *
5302 * Gets the next device from the dev's upper list, starting from iter
5303 * position. The caller must hold RCU read lock.
5304 */
5305struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5306						     struct list_head **iter)
5307{
5308	struct netdev_adjacent *upper;
5309
5310	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5311
5312	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5313
5314	if (&upper->list == &dev->all_adj_list.upper)
5315		return NULL;
5316
5317	*iter = &upper->list;
5318
5319	return upper->dev;
5320}
5321EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5322
5323/**
5324 * netdev_lower_get_next_private - Get the next ->private from the
5325 *				   lower neighbour list
5326 * @dev: device
5327 * @iter: list_head ** of the current position
5328 *
5329 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5330 * list, starting from iter position. The caller must either hold the
5331 * RTNL lock or its own locking that guarantees that the neighbour lower
5332 * list will remain unchanged.
5333 */
5334void *netdev_lower_get_next_private(struct net_device *dev,
5335				    struct list_head **iter)
5336{
5337	struct netdev_adjacent *lower;
5338
5339	lower = list_entry(*iter, struct netdev_adjacent, list);
5340
5341	if (&lower->list == &dev->adj_list.lower)
5342		return NULL;
5343
5344	*iter = lower->list.next;
5345
5346	return lower->private;
5347}
5348EXPORT_SYMBOL(netdev_lower_get_next_private);
5349
5350/**
5351 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5352 *				       lower neighbour list, RCU
5353 *				       variant
5354 * @dev: device
5355 * @iter: list_head ** of the current position
5356 *
5357 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5358 * list, starting from iter position. The caller must hold RCU read lock.
5359 */
5360void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5361					struct list_head **iter)
5362{
5363	struct netdev_adjacent *lower;
5364
5365	WARN_ON_ONCE(!rcu_read_lock_held());
5366
5367	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5368
5369	if (&lower->list == &dev->adj_list.lower)
5370		return NULL;
5371
5372	*iter = &lower->list;
5373
5374	return lower->private;
5375}
5376EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5377
5378/**
5379 * netdev_lower_get_next - Get the next device from the lower neighbour
5380 *                         list
5381 * @dev: device
5382 * @iter: list_head ** of the current position
5383 *
5384 * Gets the next netdev_adjacent from the dev's lower neighbour
5385 * list, starting from iter position. The caller must hold RTNL lock or
5386 * its own locking that guarantees that the neighbour lower
5387 * list will remain unchanged.
5388 */
5389void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5390{
5391	struct netdev_adjacent *lower;
5392
5393	lower = list_entry(*iter, struct netdev_adjacent, list);
5394
5395	if (&lower->list == &dev->adj_list.lower)
5396		return NULL;
5397
5398	*iter = lower->list.next;
5399
5400	return lower->dev;
5401}
5402EXPORT_SYMBOL(netdev_lower_get_next);
5403
5404/**
5405 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5406 *				       lower neighbour list, RCU
5407 *				       variant
5408 * @dev: device
5409 *
5410 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5411 * list. The caller must hold RCU read lock.
5412 */
5413void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5414{
5415	struct netdev_adjacent *lower;
5416
5417	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5418			struct netdev_adjacent, list);
5419	if (lower)
5420		return lower->private;
5421	return NULL;
5422}
5423EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5424
5425/**
5426 * netdev_master_upper_dev_get_rcu - Get master upper device
5427 * @dev: device
5428 *
5429 * Find a master upper device and return pointer to it or NULL in case
5430 * it's not there. The caller must hold the RCU read lock.
5431 */
5432struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5433{
5434	struct netdev_adjacent *upper;
5435
5436	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5437				       struct netdev_adjacent, list);
5438	if (upper && likely(upper->master))
5439		return upper->dev;
5440	return NULL;
5441}
5442EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5443
5444static int netdev_adjacent_sysfs_add(struct net_device *dev,
5445			      struct net_device *adj_dev,
5446			      struct list_head *dev_list)
5447{
5448	char linkname[IFNAMSIZ+7];
5449	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5450		"upper_%s" : "lower_%s", adj_dev->name);
5451	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5452				 linkname);
5453}
5454static void netdev_adjacent_sysfs_del(struct net_device *dev,
5455			       char *name,
5456			       struct list_head *dev_list)
5457{
5458	char linkname[IFNAMSIZ+7];
5459	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5460		"upper_%s" : "lower_%s", name);
5461	sysfs_remove_link(&(dev->dev.kobj), linkname);
5462}
5463
5464static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5465						 struct net_device *adj_dev,
5466						 struct list_head *dev_list)
5467{
5468	return (dev_list == &dev->adj_list.upper ||
5469		dev_list == &dev->adj_list.lower) &&
5470		net_eq(dev_net(dev), dev_net(adj_dev));
5471}
5472
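/* Insert @adj_dev into @dev's @dev_list, holding a reference on the
 * device and creating the matching sysfs links; if the adjacency
 * already exists, only its ref_nr is bumped.
 */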
5473static int __netdev_adjacent_dev_insert(struct net_device *dev,
5474					struct net_device *adj_dev,
5475					struct list_head *dev_list,
5476					void *private, bool master)
5477{
5478	struct netdev_adjacent *adj;
5479	int ret;
5480
5481	adj = __netdev_find_adj(adj_dev, dev_list);
5482
5483	if (adj) {
5484		adj->ref_nr++;
5485		return 0;
5486	}
5487
5488	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5489	if (!adj)
5490		return -ENOMEM;
5491
5492	adj->dev = adj_dev;
5493	adj->master = master;
5494	adj->ref_nr = 1;
5495	adj->private = private;
5496	dev_hold(adj_dev);
5497
5498	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5499		 adj_dev->name, dev->name, adj_dev->name);
5500
5501	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5502		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5503		if (ret)
5504			goto free_adj;
5505	}
5506
5507	/* Ensure that master link is always the first item in list. */
5508	if (master) {
5509		ret = sysfs_create_link(&(dev->dev.kobj),
5510					&(adj_dev->dev.kobj), "master");
5511		if (ret)
5512			goto remove_symlinks;
5513
5514		list_add_rcu(&adj->list, dev_list);
5515	} else {
5516		list_add_tail_rcu(&adj->list, dev_list);
5517	}
5518
5519	return 0;
5520
5521remove_symlinks:
5522	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5523		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5524free_adj:
5525	kfree(adj);
5526	dev_put(adj_dev);
5527
5528	return ret;
5529}
5530
5531static void __netdev_adjacent_dev_remove(struct net_device *dev,
5532					 struct net_device *adj_dev,
5533					 struct list_head *dev_list)
5534{
5535	struct netdev_adjacent *adj;
5536
5537	adj = __netdev_find_adj(adj_dev, dev_list);
5538
5539	if (!adj) {
5540		pr_err("tried to remove device %s from %s\n",
5541		       dev->name, adj_dev->name);
5542		BUG();
5543	}
5544
5545	if (adj->ref_nr > 1) {
5546		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5547			 adj->ref_nr-1);
5548		adj->ref_nr--;
5549		return;
5550	}
5551
5552	if (adj->master)
5553		sysfs_remove_link(&(dev->dev.kobj), "master");
5554
5555	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5556		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5557
5558	list_del_rcu(&adj->list);
5559	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5560		 adj_dev->name, dev->name, adj_dev->name);
5561	dev_put(adj_dev);
5562	kfree_rcu(adj, rcu);
5563}
5564
5565static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5566					    struct net_device *upper_dev,
5567					    struct list_head *up_list,
5568					    struct list_head *down_list,
5569					    void *private, bool master)
5570{
5571	int ret;
5572
5573	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5574					   master);
5575	if (ret)
5576		return ret;
5577
5578	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5579					   false);
5580	if (ret) {
5581		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5582		return ret;
5583	}
5584
5585	return 0;
5586}
5587
5588static int __netdev_adjacent_dev_link(struct net_device *dev,
5589				      struct net_device *upper_dev)
5590{
5591	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5592						&dev->all_adj_list.upper,
5593						&upper_dev->all_adj_list.lower,
5594						NULL, false);
5595}
5596
5597static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5598					       struct net_device *upper_dev,
5599					       struct list_head *up_list,
5600					       struct list_head *down_list)
5601{
5602	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5603	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5604}
5605
5606static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5607					 struct net_device *upper_dev)
5608{
5609	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5610					   &dev->all_adj_list.upper,
5611					   &upper_dev->all_adj_list.lower);
5612}
5613
5614static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5615						struct net_device *upper_dev,
5616						void *private, bool master)
5617{
5618	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5619
5620	if (ret)
5621		return ret;
5622
5623	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5624					       &dev->adj_list.upper,
5625					       &upper_dev->adj_list.lower,
5626					       private, master);
5627	if (ret) {
5628		__netdev_adjacent_dev_unlink(dev, upper_dev);
5629		return ret;
5630	}
5631
5632	return 0;
5633}
5634
5635static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5636						   struct net_device *upper_dev)
5637{
5638	__netdev_adjacent_dev_unlink(dev, upper_dev);
5639	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5640					   &dev->adj_list.upper,
5641					   &upper_dev->adj_list.lower);
5642}
5643
5644static int __netdev_upper_dev_link(struct net_device *dev,
5645				   struct net_device *upper_dev, bool master,
5646				   void *upper_priv, void *upper_info)
5647{
5648	struct netdev_notifier_changeupper_info changeupper_info;
5649	struct netdev_adjacent *i, *j, *to_i, *to_j;
5650	int ret = 0;
5651
5652	ASSERT_RTNL();
5653
5654	if (dev == upper_dev)
5655		return -EBUSY;
5656
5657	/* To prevent loops, check if dev is not upper device to upper_dev. */
5658	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5659		return -EBUSY;
5660
5661	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5662		return -EEXIST;
5663
5664	if (master && netdev_master_upper_dev_get(dev))
5665		return -EBUSY;
5666
5667	changeupper_info.upper_dev = upper_dev;
5668	changeupper_info.master = master;
5669	changeupper_info.linking = true;
5670	changeupper_info.upper_info = upper_info;
5671
5672	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5673					    &changeupper_info.info);
5674	ret = notifier_to_errno(ret);
5675	if (ret)
5676		return ret;
5677
5678	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5679						   master);
5680	if (ret)
5681		return ret;
5682
5683	/* Now that we linked these devs, make all the upper_dev's
5684	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5685	 * vice versa, and don't forget the devices themselves. All of these
5686	 * links are non-neighbours.
5687	 */
5688	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5689		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5690			pr_debug("Interlinking %s with %s, non-neighbour\n",
5691				 i->dev->name, j->dev->name);
5692			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5693			if (ret)
5694				goto rollback_mesh;
5695		}
5696	}
5697
5698	/* add dev to every upper_dev's upper device */
5699	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5700		pr_debug("linking %s's upper device %s with %s\n",
5701			 upper_dev->name, i->dev->name, dev->name);
5702		ret = __netdev_adjacent_dev_link(dev, i->dev);
5703		if (ret)
5704			goto rollback_upper_mesh;
5705	}
5706
5707	/* add upper_dev to every dev's lower device */
5708	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5709		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5710			 i->dev->name, upper_dev->name);
5711		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5712		if (ret)
5713			goto rollback_lower_mesh;
5714	}
5715
5716	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5717					    &changeupper_info.info);
5718	ret = notifier_to_errno(ret);
5719	if (ret)
5720		goto rollback_lower_mesh;
5721
5722	return 0;
5723
5724rollback_lower_mesh:
5725	to_i = i;
5726	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5727		if (i == to_i)
5728			break;
5729		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5730	}
5731
5732	i = NULL;
5733
5734rollback_upper_mesh:
5735	to_i = i;
5736	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5737		if (i == to_i)
5738			break;
5739		__netdev_adjacent_dev_unlink(dev, i->dev);
5740	}
5741
5742	i = j = NULL;
5743
5744rollback_mesh:
5745	to_i = i;
5746	to_j = j;
5747	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5748		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5749			if (i == to_i && j == to_j)
5750				break;
5751			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5752		}
5753		if (i == to_i)
5754			break;
5755	}
5756
5757	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5758
5759	return ret;
5760}
5761
5762/**
5763 * netdev_upper_dev_link - Add a link to the upper device
5764 * @dev: device
5765 * @upper_dev: new upper device
5766 *
5767 * Adds a link to device which is upper to this one. The caller must hold
5768 * the RTNL lock. On a failure a negative errno code is returned.
5769 * On success the reference counts are adjusted and the function
5770 * returns zero.
5771 */
5772int netdev_upper_dev_link(struct net_device *dev,
5773			  struct net_device *upper_dev)
5774{
5775	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5776}
5777EXPORT_SYMBOL(netdev_upper_dev_link);
5778
5779/**
5780 * netdev_master_upper_dev_link - Add a master link to the upper device
5781 * @dev: device
5782 * @upper_dev: new upper device
5783 * @upper_priv: upper device private
5784 * @upper_info: upper info to be passed down via notifier
5785 *
5786 * Adds a link to device which is upper to this one. In this case, only
5787 * one master upper device can be linked, although other non-master devices
5788 * might be linked as well. The caller must hold the RTNL lock.
5789 * On a failure a negative errno code is returned. On success the reference
5790 * counts are adjusted and the function returns zero.
5791 */
5792int netdev_master_upper_dev_link(struct net_device *dev,
5793				 struct net_device *upper_dev,
5794				 void *upper_priv, void *upper_info)
5795{
5796	return __netdev_upper_dev_link(dev, upper_dev, true,
5797				       upper_priv, upper_info);
5798}
5799EXPORT_SYMBOL(netdev_master_upper_dev_link);
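
/*
 * Example (an illustrative sketch, not taken from this file;
 * bond_dev/slave_dev/slave_priv are hypothetical): a bonding-style
 * master links each slave beneath itself under the RTNL lock:
 *
 *	rtnl_lock();
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev,
 *					   slave_priv, NULL);
 *	if (!err)
 *		WARN_ON(netdev_master_upper_dev_get(slave_dev) != bond_dev);
 *	rtnl_unlock();
 */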
5800
5801/**
5802 * netdev_upper_dev_unlink - Removes a link to upper device
5803 * @dev: device
5804 * @upper_dev: new upper device
5805 *
5806 * Removes a link to device which is upper to this one. The caller must hold
5807 * the RTNL lock.
5808 */
5809void netdev_upper_dev_unlink(struct net_device *dev,
5810			     struct net_device *upper_dev)
5811{
5812	struct netdev_notifier_changeupper_info changeupper_info;
5813	struct netdev_adjacent *i, *j;
5814	ASSERT_RTNL();
5815
5816	changeupper_info.upper_dev = upper_dev;
5817	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5818	changeupper_info.linking = false;
5819
5820	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5821				      &changeupper_info.info);
5822
5823	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5824
5825	/* Here is the tricky part. We must remove all dev's lower
5826	 * devices from all upper_dev's upper devices and vice
5827	 * versa, to maintain the graph relationship.
5828	 */
5829	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5830		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5831			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5832
5833	/* Also remove the devices themselves from the lower/upper device
5834	 * lists.
5835	 */
5836	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5837		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5838
5839	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5840		__netdev_adjacent_dev_unlink(dev, i->dev);
5841
5842	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5843				      &changeupper_info.info);
5844}
5845EXPORT_SYMBOL(netdev_upper_dev_unlink);
5846
5847/**
5848 * netdev_bonding_info_change - Dispatch event about slave change
5849 * @dev: device
5850 * @bonding_info: info to dispatch
5851 *
5852 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5853 * The caller must hold the RTNL lock.
5854 */
5855void netdev_bonding_info_change(struct net_device *dev,
5856				struct netdev_bonding_info *bonding_info)
5857{
5858	struct netdev_notifier_bonding_info	info;
5859
5860	memcpy(&info.bonding_info, bonding_info,
5861	       sizeof(struct netdev_bonding_info));
5862	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5863				      &info.info);
5864}
5865EXPORT_SYMBOL(netdev_bonding_info_change);
5866
5867static void netdev_adjacent_add_links(struct net_device *dev)
5868{
5869	struct netdev_adjacent *iter;
5870
5871	struct net *net = dev_net(dev);
5872
5873	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5874		if (!net_eq(net, dev_net(iter->dev)))
5875			continue;
5876		netdev_adjacent_sysfs_add(iter->dev, dev,
5877					  &iter->dev->adj_list.lower);
5878		netdev_adjacent_sysfs_add(dev, iter->dev,
5879					  &dev->adj_list.upper);
5880	}
5881
5882	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5883		if (!net_eq(net, dev_net(iter->dev)))
5884			continue;
5885		netdev_adjacent_sysfs_add(iter->dev, dev,
5886					  &iter->dev->adj_list.upper);
5887		netdev_adjacent_sysfs_add(dev, iter->dev,
5888					  &dev->adj_list.lower);
5889	}
5890}
5891
5892static void netdev_adjacent_del_links(struct net_device *dev)
5893{
5894	struct netdev_adjacent *iter;
5895
5896	struct net *net = dev_net(dev);
5897
5898	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5899		if (!net_eq(net, dev_net(iter->dev)))
5900			continue;
5901		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5902					  &iter->dev->adj_list.lower);
5903		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5904					  &dev->adj_list.upper);
5905	}
5906
5907	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5908		if (!net_eq(net, dev_net(iter->dev)))
5909			continue;
5910		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5911					  &iter->dev->adj_list.upper);
5912		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5913					  &dev->adj_list.lower);
5914	}
5915}
5916
5917void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5918{
5919	struct netdev_adjacent *iter;
5920
5921	struct net *net = dev_net(dev);
5922
5923	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5924		if (!net_eq(net, dev_net(iter->dev)))
5925			continue;
5926		netdev_adjacent_sysfs_del(iter->dev, oldname,
5927					  &iter->dev->adj_list.lower);
5928		netdev_adjacent_sysfs_add(iter->dev, dev,
5929					  &iter->dev->adj_list.lower);
5930	}
5931
5932	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5933		if (!net_eq(net, dev_net(iter->dev)))
5934			continue;
5935		netdev_adjacent_sysfs_del(iter->dev, oldname,
5936					  &iter->dev->adj_list.upper);
5937		netdev_adjacent_sysfs_add(iter->dev, dev,
5938					  &iter->dev->adj_list.upper);
5939	}
5940}
5941
5942void *netdev_lower_dev_get_private(struct net_device *dev,
5943				   struct net_device *lower_dev)
5944{
5945	struct netdev_adjacent *lower;
5946
5947	if (!lower_dev)
5948		return NULL;
5949	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5950	if (!lower)
5951		return NULL;
5952
5953	return lower->private;
5954}
5955EXPORT_SYMBOL(netdev_lower_dev_get_private);
5956
5957
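/* Recursively walk @dev's lower devices and return the deepest nesting
 * level of devices matching @type_check, counting @dev itself when it
 * matches; typically used to pick lock nesting subclasses for stacked
 * devices.
 */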
5958int dev_get_nest_level(struct net_device *dev,
5959		       bool (*type_check)(const struct net_device *dev))
5960{
5961	struct net_device *lower = NULL;
5962	struct list_head *iter;
5963	int max_nest = -1;
5964	int nest;
5965
5966	ASSERT_RTNL();
5967
5968	netdev_for_each_lower_dev(dev, lower, iter) {
5969		nest = dev_get_nest_level(lower, type_check);
5970		if (max_nest < nest)
5971			max_nest = nest;
5972	}
5973
5974	if (type_check(dev))
5975		max_nest++;
5976
5977	return max_nest;
5978}
5979EXPORT_SYMBOL(dev_get_nest_level);
5980
5981/**
5982 * netdev_lower_state_changed - Dispatch event about lower device state change
5983 * @lower_dev: device
5984 * @lower_state_info: state to dispatch
5985 *
5986 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
5987 * The caller must hold the RTNL lock.
5988 */
5989void netdev_lower_state_changed(struct net_device *lower_dev,
5990				void *lower_state_info)
5991{
5992	struct netdev_notifier_changelowerstate_info changelowerstate_info;
5993
5994	ASSERT_RTNL();
5995	changelowerstate_info.lower_state_info = lower_state_info;
5996	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
5997				      &changelowerstate_info.info);
5998}
5999EXPORT_SYMBOL(netdev_lower_state_changed);
6000
6001static void dev_change_rx_flags(struct net_device *dev, int flags)
6002{
6003	const struct net_device_ops *ops = dev->netdev_ops;
6004
6005	if (ops->ndo_change_rx_flags)
6006		ops->ndo_change_rx_flags(dev, flags);
6007}
6008
6009static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6010{
6011	unsigned int old_flags = dev->flags;
6012	kuid_t uid;
6013	kgid_t gid;
6014
6015	ASSERT_RTNL();
6016
6017	dev->flags |= IFF_PROMISC;
6018	dev->promiscuity += inc;
6019	if (dev->promiscuity == 0) {
6020		/*
6021		 * Avoid overflow.
6022		 * If inc causes overflow, untouch promisc and return error.
6023		 */
6024		if (inc < 0)
6025			dev->flags &= ~IFF_PROMISC;
6026		else {
6027			dev->promiscuity -= inc;
6028			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6029				dev->name);
6030			return -EOVERFLOW;
6031		}
6032	}
6033	if (dev->flags != old_flags) {
6034		pr_info("device %s %s promiscuous mode\n",
6035			dev->name,
6036			dev->flags & IFF_PROMISC ? "entered" : "left");
6037		if (audit_enabled) {
6038			current_uid_gid(&uid, &gid);
6039			audit_log(current->audit_context, GFP_ATOMIC,
6040				AUDIT_ANOM_PROMISCUOUS,
6041				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6042				dev->name, (dev->flags & IFF_PROMISC),
6043				(old_flags & IFF_PROMISC),
6044				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6045				from_kuid(&init_user_ns, uid),
6046				from_kgid(&init_user_ns, gid),
6047				audit_get_sessionid(current));
6048		}
6049
6050		dev_change_rx_flags(dev, IFF_PROMISC);
6051	}
6052	if (notify)
6053		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6054	return 0;
6055}
6056
6057/**
6058 *	dev_set_promiscuity	- update promiscuity count on a device
6059 *	@dev: device
6060 *	@inc: modifier
6061 *
6062 *	Add or remove promiscuity from a device. While the count in the device
6063 *	remains above zero the interface remains promiscuous. Once it hits zero
6064 *	the device reverts back to normal filtering operation. A negative inc
6065 *	value is used to drop promiscuity on the device.
6066 *	Return 0 if successful or a negative errno code on error.
6067 */
6068int dev_set_promiscuity(struct net_device *dev, int inc)
6069{
6070	unsigned int old_flags = dev->flags;
6071	int err;
6072
6073	err = __dev_set_promiscuity(dev, inc, true);
6074	if (err < 0)
6075		return err;
6076	if (dev->flags != old_flags)
6077		dev_set_rx_mode(dev);
6078	return err;
6079}
6080EXPORT_SYMBOL(dev_set_promiscuity);
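
/*
 * Example (an illustrative sketch, not taken from this file): a
 * capture-style user bumps the count while sniffing and drops it again
 * when done, under the RTNL lock:
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */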
6081
6082static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6083{
6084	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6085
6086	ASSERT_RTNL();
6087
6088	dev->flags |= IFF_ALLMULTI;
6089	dev->allmulti += inc;
6090	if (dev->allmulti == 0) {
6091		/*
6092		 * Avoid overflow.
6093		 * If inc causes overflow, untouch allmulti and return error.
6094		 */
6095		if (inc < 0)
6096			dev->flags &= ~IFF_ALLMULTI;
6097		else {
6098			dev->allmulti -= inc;
6099			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6100				dev->name);
6101			return -EOVERFLOW;
6102		}
6103	}
6104	if (dev->flags ^ old_flags) {
6105		dev_change_rx_flags(dev, IFF_ALLMULTI);
6106		dev_set_rx_mode(dev);
6107		if (notify)
6108			__dev_notify_flags(dev, old_flags,
6109					   dev->gflags ^ old_gflags);
6110	}
6111	return 0;
6112}
6113
6114/**
6115 *	dev_set_allmulti	- update allmulti count on a device
6116 *	@dev: device
6117 *	@inc: modifier
6118 *
6119 *	Add or remove reception of all multicast frames on a device. While the
6120 *	count in the device remains above zero the interface keeps listening
6121 *	to all multicast addresses. Once it hits zero the device reverts back
6122 *	to normal filtering operation. A negative @inc value is used to drop
6123 *	the counter when releasing a resource needing all multicasts.
6124 *	Return 0 if successful or a negative errno code on error.
6125 */
6126
6127int dev_set_allmulti(struct net_device *dev, int inc)
6128{
6129	return __dev_set_allmulti(dev, inc, true);
6130}
6131EXPORT_SYMBOL(dev_set_allmulti);
6132
6133/*
6134 *	Upload unicast and multicast address lists to device and
6135 *	configure RX filtering. When the device doesn't support unicast
6136 *	filtering it is put in promiscuous mode while unicast addresses
6137 *	are present.
6138 */
6139void __dev_set_rx_mode(struct net_device *dev)
6140{
6141	const struct net_device_ops *ops = dev->netdev_ops;
6142
6143	/* dev_open will call this function so the list will stay sane. */
6144	if (!(dev->flags&IFF_UP))
6145		return;
6146
6147	if (!netif_device_present(dev))
6148		return;
6149
6150	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6151		/* Unicast address changes may only happen under the rtnl,
6152		 * therefore calling __dev_set_promiscuity here is safe.
6153		 */
6154		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6155			__dev_set_promiscuity(dev, 1, false);
6156			dev->uc_promisc = true;
6157		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6158			__dev_set_promiscuity(dev, -1, false);
6159			dev->uc_promisc = false;
6160		}
6161	}
6162
6163	if (ops->ndo_set_rx_mode)
6164		ops->ndo_set_rx_mode(dev);
6165}
6166
6167void dev_set_rx_mode(struct net_device *dev)
6168{
6169	netif_addr_lock_bh(dev);
6170	__dev_set_rx_mode(dev);
6171	netif_addr_unlock_bh(dev);
6172}
6173
6174/**
6175 *	dev_get_flags - get flags reported to userspace
6176 *	@dev: device
6177 *
6178 *	Get the combination of flag bits exported through APIs to userspace.
6179 */
6180unsigned int dev_get_flags(const struct net_device *dev)
6181{
6182	unsigned int flags;
6183
6184	flags = (dev->flags & ~(IFF_PROMISC |
6185				IFF_ALLMULTI |
6186				IFF_RUNNING |
6187				IFF_LOWER_UP |
6188				IFF_DORMANT)) |
6189		(dev->gflags & (IFF_PROMISC |
6190				IFF_ALLMULTI));
6191
6192	if (netif_running(dev)) {
6193		if (netif_oper_up(dev))
6194			flags |= IFF_RUNNING;
6195		if (netif_carrier_ok(dev))
6196			flags |= IFF_LOWER_UP;
6197		if (netif_dormant(dev))
6198			flags |= IFF_DORMANT;
6199	}
6200
6201	return flags;
6202}
6203EXPORT_SYMBOL(dev_get_flags);
6204
6205int __dev_change_flags(struct net_device *dev, unsigned int flags)
6206{
6207	unsigned int old_flags = dev->flags;
6208	int ret;
6209
6210	ASSERT_RTNL();
6211
6212	/*
6213	 *	Set the flags on our device.
6214	 */
6215
6216	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6217			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6218			       IFF_AUTOMEDIA)) |
6219		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6220				    IFF_ALLMULTI));
6221
6222	/*
6223	 *	Load in the correct multicast list now the flags have changed.
6224	 */
6225
6226	if ((old_flags ^ flags) & IFF_MULTICAST)
6227		dev_change_rx_flags(dev, IFF_MULTICAST);
6228
6229	dev_set_rx_mode(dev);
6230
6231	/*
6232	 *	Have we downed the interface? We handle IFF_UP ourselves
6233	 *	according to user attempts to set it, rather than blindly
6234	 *	setting it.
6235	 */
6236
6237	ret = 0;
6238	if ((old_flags ^ flags) & IFF_UP)
6239		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6240
6241	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6242		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6243		unsigned int old_flags = dev->flags;
6244
6245		dev->gflags ^= IFF_PROMISC;
6246
6247		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6248			if (dev->flags != old_flags)
6249				dev_set_rx_mode(dev);
6250	}
6251
6252	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6253	   is important. Some (broken) drivers set IFF_PROMISC when
6254	   IFF_ALLMULTI is requested, without asking us and without reporting.
6255	 */
6256	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6257		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6258
6259		dev->gflags ^= IFF_ALLMULTI;
6260		__dev_set_allmulti(dev, inc, false);
6261	}
6262
6263	return ret;
6264}
6265
6266void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6267			unsigned int gchanges)
6268{
6269	unsigned int changes = dev->flags ^ old_flags;
6270
6271	if (gchanges)
6272		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6273
6274	if (changes & IFF_UP) {
6275		if (dev->flags & IFF_UP)
6276			call_netdevice_notifiers(NETDEV_UP, dev);
6277		else
6278			call_netdevice_notifiers(NETDEV_DOWN, dev);
6279	}
6280
6281	if (dev->flags & IFF_UP &&
6282	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6283		struct netdev_notifier_change_info change_info;
6284
6285		change_info.flags_changed = changes;
6286		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6287					      &change_info.info);
6288	}
6289}
6290
6291/**
6292 *	dev_change_flags - change device settings
6293 *	@dev: device
6294 *	@flags: device state flags
6295 *
6296 *	Change settings on device based state flags. The flags are
6297 *	in the userspace exported format.
6298 */
6299int dev_change_flags(struct net_device *dev, unsigned int flags)
6300{
6301	int ret;
6302	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6303
6304	ret = __dev_change_flags(dev, flags);
6305	if (ret < 0)
6306		return ret;
6307
6308	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6309	__dev_notify_flags(dev, old_flags, changes);
6310	return ret;
6311}
6312EXPORT_SYMBOL(dev_change_flags);
6313
6314static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6315{
6316	const struct net_device_ops *ops = dev->netdev_ops;
6317
6318	if (ops->ndo_change_mtu)
6319		return ops->ndo_change_mtu(dev, new_mtu);
6320
6321	dev->mtu = new_mtu;
6322	return 0;
6323}
6324
6325/**
6326 *	dev_set_mtu - Change maximum transfer unit
6327 *	@dev: device
6328 *	@new_mtu: new transfer unit
6329 *
6330 *	Change the maximum transfer size of the network device.
6331 */
6332int dev_set_mtu(struct net_device *dev, int new_mtu)
6333{
6334	int err, orig_mtu;
6335
6336	if (new_mtu == dev->mtu)
6337		return 0;
6338
6339	/*	MTU must be positive.	 */
6340	if (new_mtu < 0)
6341		return -EINVAL;
6342
6343	if (!netif_device_present(dev))
6344		return -ENODEV;
6345
6346	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6347	err = notifier_to_errno(err);
6348	if (err)
6349		return err;
6350
6351	orig_mtu = dev->mtu;
6352	err = __dev_set_mtu(dev, new_mtu);
6353
6354	if (!err) {
6355		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6356		err = notifier_to_errno(err);
6357		if (err) {
6358			/* setting mtu back and notifying everyone again,
6359			 * so that they have a chance to revert changes.
6360			 */
6361			__dev_set_mtu(dev, orig_mtu);
6362			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6363		}
6364	}
6365	return err;
6366}
6367EXPORT_SYMBOL(dev_set_mtu);
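
/*
 * Example (an illustrative sketch, not taken from this file): callers
 * hold the RTNL lock and must be prepared for the notifier chain to
 * veto the change:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("jumbo MTU rejected: %d\n", err);
 */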
6368
6369/**
6370 *	dev_set_group - Change group this device belongs to
6371 *	@dev: device
6372 *	@new_group: group this device should belong to
6373 */
6374void dev_set_group(struct net_device *dev, int new_group)
6375{
6376	dev->group = new_group;
6377}
6378EXPORT_SYMBOL(dev_set_group);
6379
6380/**
6381 *	dev_set_mac_address - Change Media Access Control Address
6382 *	@dev: device
6383 *	@sa: new address
6384 *
6385 *	Change the hardware (MAC) address of the device
6386 */
6387int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6388{
6389	const struct net_device_ops *ops = dev->netdev_ops;
6390	int err;
6391
6392	if (!ops->ndo_set_mac_address)
6393		return -EOPNOTSUPP;
6394	if (sa->sa_family != dev->type)
6395		return -EINVAL;
6396	if (!netif_device_present(dev))
6397		return -ENODEV;
6398	err = ops->ndo_set_mac_address(dev, sa);
6399	if (err)
6400		return err;
6401	dev->addr_assign_type = NET_ADDR_SET;
6402	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6403	add_device_randomness(dev->dev_addr, dev->addr_len);
6404	return 0;
6405}
6406EXPORT_SYMBOL(dev_set_mac_address);
6407
6408/**
6409 *	dev_change_carrier - Change device carrier
6410 *	@dev: device
6411 *	@new_carrier: new value
6412 *
6413 *	Change device carrier
6414 */
6415int dev_change_carrier(struct net_device *dev, bool new_carrier)
6416{
6417	const struct net_device_ops *ops = dev->netdev_ops;
6418
6419	if (!ops->ndo_change_carrier)
6420		return -EOPNOTSUPP;
6421	if (!netif_device_present(dev))
6422		return -ENODEV;
6423	return ops->ndo_change_carrier(dev, new_carrier);
6424}
6425EXPORT_SYMBOL(dev_change_carrier);
6426
6427/**
6428 *	dev_get_phys_port_id - Get device physical port ID
6429 *	@dev: device
6430 *	@ppid: port ID
6431 *
6432 *	Get device physical port ID
6433 */
6434int dev_get_phys_port_id(struct net_device *dev,
6435			 struct netdev_phys_item_id *ppid)
6436{
6437	const struct net_device_ops *ops = dev->netdev_ops;
6438
6439	if (!ops->ndo_get_phys_port_id)
6440		return -EOPNOTSUPP;
6441	return ops->ndo_get_phys_port_id(dev, ppid);
6442}
6443EXPORT_SYMBOL(dev_get_phys_port_id);
6444
6445/**
6446 *	dev_get_phys_port_name - Get device physical port name
6447 *	@dev: device
6448 *	@name: port name
6449 *	@len: limit of bytes to copy to name
6450 *
6451 *	Get device physical port name
6452 */
6453int dev_get_phys_port_name(struct net_device *dev,
6454			   char *name, size_t len)
6455{
6456	const struct net_device_ops *ops = dev->netdev_ops;
6457
6458	if (!ops->ndo_get_phys_port_name)
6459		return -EOPNOTSUPP;
6460	return ops->ndo_get_phys_port_name(dev, name, len);
6461}
6462EXPORT_SYMBOL(dev_get_phys_port_name);
6463
6464/**
6465 *	dev_change_proto_down - update protocol port state information
6466 *	@dev: device
6467 *	@proto_down: new value
6468 *
6469 *	This info can be used by switch drivers to set the phys state of the
6470 *	port.
6471 */
6472int dev_change_proto_down(struct net_device *dev, bool proto_down)
6473{
6474	const struct net_device_ops *ops = dev->netdev_ops;
6475
6476	if (!ops->ndo_change_proto_down)
6477		return -EOPNOTSUPP;
6478	if (!netif_device_present(dev))
6479		return -ENODEV;
6480	return ops->ndo_change_proto_down(dev, proto_down);
6481}
6482EXPORT_SYMBOL(dev_change_proto_down);
6483
6484/**
6485 *	dev_new_index	-	allocate an ifindex
6486 *	@net: the applicable net namespace
6487 *
6488 *	Returns a suitable unique value for a new device interface
6489 *	number.  The caller must hold the rtnl semaphore or the
6490 *	dev_base_lock to be sure it remains unique.
6491 */
6492static int dev_new_index(struct net *net)
6493{
6494	int ifindex = net->ifindex;
6495	for (;;) {
6496		if (++ifindex <= 0)
6497			ifindex = 1;
6498		if (!__dev_get_by_index(net, ifindex))
6499			return net->ifindex = ifindex;
6500	}
6501}
6502
6503/* Delayed registration/unregistration */
6504static LIST_HEAD(net_todo_list);
6505DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6506
6507static void net_set_todo(struct net_device *dev)
6508{
6509	list_add_tail(&dev->todo_list, &net_todo_list);
6510	dev_net(dev)->dev_unreg_count++;
6511}
6512
6513static void rollback_registered_many(struct list_head *head)
6514{
6515	struct net_device *dev, *tmp;
6516	LIST_HEAD(close_head);
6517
6518	BUG_ON(dev_boot_phase);
6519	ASSERT_RTNL();
6520
6521	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6522		/* Some devices call without registering
6523		 * for initialization unwind. Remove those
6524		 * devices and proceed with the remaining.
6525		 */
6526		if (dev->reg_state == NETREG_UNINITIALIZED) {
6527			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6528				 dev->name, dev);
6529
6530			WARN_ON(1);
6531			list_del(&dev->unreg_list);
6532			continue;
6533		}
6534		dev->dismantle = true;
6535		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6536	}
6537
6538	/* If device is running, close it first. */
6539	list_for_each_entry(dev, head, unreg_list)
6540		list_add_tail(&dev->close_list, &close_head);
6541	dev_close_many(&close_head, true);
6542
6543	list_for_each_entry(dev, head, unreg_list) {
6544		/* And unlink it from device chain. */
6545		unlist_netdevice(dev);
6546
6547		dev->reg_state = NETREG_UNREGISTERING;
6548		on_each_cpu(flush_backlog, dev, 1);
6549	}
6550
6551	synchronize_net();
6552
6553	list_for_each_entry(dev, head, unreg_list) {
6554		struct sk_buff *skb = NULL;
6555
6556		/* Shutdown queueing discipline. */
6557		dev_shutdown(dev);
6558
6559
6560		/* Notify protocols that we are about to destroy
6561		   this device. They should clean up all of their state.
6562		*/
6563		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6564
6565		if (!dev->rtnl_link_ops ||
6566		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6567			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6568						     GFP_KERNEL);
6569
6570		/*
6571		 *	Flush the unicast and multicast chains
6572		 */
6573		dev_uc_flush(dev);
6574		dev_mc_flush(dev);
6575
6576		if (dev->netdev_ops->ndo_uninit)
6577			dev->netdev_ops->ndo_uninit(dev);
6578
6579		if (skb)
6580			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6581
6582		/* Notifier chain MUST detach all of our upper devices. */
6583		WARN_ON(netdev_has_any_upper_dev(dev));
6584
6585		/* Remove entries from kobject tree */
6586		netdev_unregister_kobject(dev);
6587#ifdef CONFIG_XPS
6588		/* Remove XPS queueing entries */
6589		netif_reset_xps_queues_gt(dev, 0);
6590#endif
6591	}
6592
6593	synchronize_net();
6594
6595	list_for_each_entry(dev, head, unreg_list)
6596		dev_put(dev);
6597}
6598
6599static void rollback_registered(struct net_device *dev)
6600{
6601	LIST_HEAD(single);
6602
6603	list_add(&dev->unreg_list, &single);
6604	rollback_registered_many(&single);
6605	list_del(&single);
6606}
6607
6608static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6609	struct net_device *upper, netdev_features_t features)
6610{
6611	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6612	netdev_features_t feature;
6613	int feature_bit;
6614
6615	for_each_netdev_feature(&upper_disables, feature_bit) {
6616		feature = __NETIF_F_BIT(feature_bit);
6617		if (!(upper->wanted_features & feature)
6618		    && (features & feature)) {
6619			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6620				   &feature, upper->name);
6621			features &= ~feature;
6622		}
6623	}
6624
6625	return features;
6626}
6627
6628static void netdev_sync_lower_features(struct net_device *upper,
6629	struct net_device *lower, netdev_features_t features)
6630{
6631	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6632	netdev_features_t feature;
6633	int feature_bit;
6634
6635	for_each_netdev_feature(&upper_disables, feature_bit) {
6636		feature = __NETIF_F_BIT(feature_bit);
6637		if (!(features & feature) && (lower->features & feature)) {
6638			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6639				   &feature, lower->name);
6640			lower->wanted_features &= ~feature;
6641			netdev_update_features(lower);
6642
6643			if (unlikely(lower->features & feature))
6644				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6645					    &feature, lower->name);
6646		}
6647	}
6648}
6649
6650static netdev_features_t netdev_fix_features(struct net_device *dev,
6651	netdev_features_t features)
6652{
6653	/* Fix illegal checksum combinations */
6654	if ((features & NETIF_F_HW_CSUM) &&
6655	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6656		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6657		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6658	}
6659
6660	/* TSO requires that SG is present as well. */
6661	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6662		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6663		features &= ~NETIF_F_ALL_TSO;
6664	}
6665
6666	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6667					!(features & NETIF_F_IP_CSUM)) {
6668		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6669		features &= ~NETIF_F_TSO;
6670		features &= ~NETIF_F_TSO_ECN;
6671	}
6672
6673	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6674					 !(features & NETIF_F_IPV6_CSUM)) {
6675		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6676		features &= ~NETIF_F_TSO6;
6677	}
6678
6679	/* TSO ECN requires that TSO is present as well. */
6680	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6681		features &= ~NETIF_F_TSO_ECN;
6682
6683	/* Software GSO depends on SG. */
6684	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6685		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6686		features &= ~NETIF_F_GSO;
6687	}
6688
6689	/* UFO needs SG and checksumming */
6690	if (features & NETIF_F_UFO) {
6691		/* maybe split UFO into V4 and V6? */
6692		if (!(features & NETIF_F_HW_CSUM) &&
6693		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6694		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6695			netdev_dbg(dev,
6696				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6697			features &= ~NETIF_F_UFO;
6698		}
6699
6700		if (!(features & NETIF_F_SG)) {
6701			netdev_dbg(dev,
6702				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6703			features &= ~NETIF_F_UFO;
6704		}
6705	}
6706
6707#ifdef CONFIG_NET_RX_BUSY_POLL
6708	if (dev->netdev_ops->ndo_busy_poll)
6709		features |= NETIF_F_BUSY_POLL;
6710	else
6711#endif
6712		features &= ~NETIF_F_BUSY_POLL;
6713
6714	return features;
6715}
6716
6717int __netdev_update_features(struct net_device *dev)
6718{
6719	struct net_device *upper, *lower;
6720	netdev_features_t features;
6721	struct list_head *iter;
6722	int err = -1;
6723
6724	ASSERT_RTNL();
6725
6726	features = netdev_get_wanted_features(dev);
6727
6728	if (dev->netdev_ops->ndo_fix_features)
6729		features = dev->netdev_ops->ndo_fix_features(dev, features);
6730
6731	/* driver might be less strict about feature dependencies */
6732	features = netdev_fix_features(dev, features);
6733
6734	/* some features can't be enabled if they're off on an upper device */
6735	netdev_for_each_upper_dev_rcu(dev, upper, iter)
6736		features = netdev_sync_upper_features(dev, upper, features);
6737
6738	if (dev->features == features)
6739		goto sync_lower;
6740
6741	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6742		&dev->features, &features);
6743
6744	if (dev->netdev_ops->ndo_set_features)
6745		err = dev->netdev_ops->ndo_set_features(dev, features);
6746	else
6747		err = 0;
6748
6749	if (unlikely(err < 0)) {
6750		netdev_err(dev,
6751			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6752			err, &features, &dev->features);
6753		/* return non-0 since some features might have changed and
6754		 * it's better to fire a spurious notification than miss it
6755		 */
6756		return -1;
6757	}
6758
6759sync_lower:
6760	/* some features must be disabled on lower devices when disabled
6761	 * on an upper device (think: bonding master or bridge)
6762	 */
6763	netdev_for_each_lower_dev(dev, lower, iter)
6764		netdev_sync_lower_features(dev, lower, features);
6765
6766	if (!err)
6767		dev->features = features;
6768
6769	return err < 0 ? 0 : 1;
6770}
6771
6772/**
6773 *	netdev_update_features - recalculate device features
6774 *	@dev: the device to check
6775 *
6776 *	Recalculate dev->features set and send notifications if it
6777 *	has changed. Should be called whenever driver- or hardware-dependent
6778 *	conditions that influence the features might have changed.
6779 */
6780void netdev_update_features(struct net_device *dev)
6781{
6782	if (__netdev_update_features(dev))
6783		netdev_features_change(dev);
6784}
6785EXPORT_SYMBOL(netdev_update_features);
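/* Illustrative sketch, not part of dev.c: a driver whose hardware loses
 * checksum/TSO offload above some MTU can mask those bits in its
 * ndo_fix_features() hook (invoked from __netdev_update_features() above)
 * and re-trigger the recalculation from ndo_change_mtu(). The "foo"
 * names and the 4000-byte limit are hypothetical.
 */
static netdev_features_t foo_fix_features(struct net_device *dev,
					  netdev_features_t features)
{
	if (dev->mtu > 4000)
		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_ALL_TSO);
	return features;
}

static int foo_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;
	netdev_update_features(dev);	/* RTNL is already held here */
	return 0;
}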
6786
6787/**
6788 *	netdev_change_features - recalculate device features
6789 *	@dev: the device to check
6790 *
6791 *	Recalculate dev->features set and send notifications even
6792 *	if they have not changed. Should be called instead of
6793 *	netdev_update_features() if dev->vlan_features might also
6794 *	have changed, so that the changes can be propagated to stacked
6795 *	VLAN devices.
6796 */
6797void netdev_change_features(struct net_device *dev)
6798{
6799	__netdev_update_features(dev);
6800	netdev_features_change(dev);
6801}
6802EXPORT_SYMBOL(netdev_change_features);
6803
6804/**
6805 *	netif_stacked_transfer_operstate -	transfer operstate
6806 *	@rootdev: the root or lower level device to transfer state from
6807 *	@dev: the device to transfer operstate to
6808 *
6809 *	Transfer operational state from root to device. This is normally
6810 *	called when a stacking relationship exists between the root
6811 *	device and the device (a leaf device).
6812 */
6813void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6814					struct net_device *dev)
6815{
6816	if (rootdev->operstate == IF_OPER_DORMANT)
6817		netif_dormant_on(dev);
6818	else
6819		netif_dormant_off(dev);
6820
6821	if (netif_carrier_ok(rootdev)) {
6822		if (!netif_carrier_ok(dev))
6823			netif_carrier_on(dev);
6824	} else {
6825		if (netif_carrier_ok(dev))
6826			netif_carrier_off(dev);
6827	}
6828}
6829EXPORT_SYMBOL(netif_stacked_transfer_operstate);
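/* Illustrative sketch, not part of dev.c: a stacking driver typically
 * mirrors carrier/dormant state from its lower device whenever that
 * device changes state, via a netdevice notifier. foo_get_upper() is a
 * hypothetical lookup from lower to upper device.
 */
static struct net_device *foo_get_upper(struct net_device *lower);

static int foo_device_event(struct notifier_block *unused,
			    unsigned long event, void *ptr)
{
	struct net_device *lower = netdev_notifier_info_to_dev(ptr);
	struct net_device *upper = foo_get_upper(lower);

	if (upper && (event == NETDEV_CHANGE || event == NETDEV_UP))
		netif_stacked_transfer_operstate(lower, upper);
	return NOTIFY_DONE;
}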
6830
6831#ifdef CONFIG_SYSFS
6832static int netif_alloc_rx_queues(struct net_device *dev)
6833{
6834	unsigned int i, count = dev->num_rx_queues;
6835	struct netdev_rx_queue *rx;
6836	size_t sz = count * sizeof(*rx);
6837
6838	BUG_ON(count < 1);
6839
6840	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6841	if (!rx) {
6842		rx = vzalloc(sz);
6843		if (!rx)
6844			return -ENOMEM;
6845	}
6846	dev->_rx = rx;
6847
6848	for (i = 0; i < count; i++)
6849		rx[i].dev = dev;
6850	return 0;
6851}
6852#endif
6853
6854static void netdev_init_one_queue(struct net_device *dev,
6855				  struct netdev_queue *queue, void *_unused)
6856{
6857	/* Initialize queue lock */
6858	spin_lock_init(&queue->_xmit_lock);
6859	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6860	queue->xmit_lock_owner = -1;
6861	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6862	queue->dev = dev;
6863#ifdef CONFIG_BQL
6864	dql_init(&queue->dql, HZ);
6865#endif
6866}
6867
6868static void netif_free_tx_queues(struct net_device *dev)
6869{
6870	kvfree(dev->_tx);
6871}
6872
6873static int netif_alloc_netdev_queues(struct net_device *dev)
6874{
6875	unsigned int count = dev->num_tx_queues;
6876	struct netdev_queue *tx;
6877	size_t sz = count * sizeof(*tx);
6878
6879	if (count < 1 || count > 0xffff)
6880		return -EINVAL;
6881
6882	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6883	if (!tx) {
6884		tx = vzalloc(sz);
6885		if (!tx)
6886			return -ENOMEM;
6887	}
6888	dev->_tx = tx;
6889
6890	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6891	spin_lock_init(&dev->tx_global_lock);
6892
6893	return 0;
6894}
6895
6896void netif_tx_stop_all_queues(struct net_device *dev)
6897{
6898	unsigned int i;
6899
6900	for (i = 0; i < dev->num_tx_queues; i++) {
6901		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6902		netif_tx_stop_queue(txq);
6903	}
6904}
6905EXPORT_SYMBOL(netif_tx_stop_all_queues);
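/* Illustrative sketch, not part of dev.c: drivers usually pair
 * netif_tx_stop_all_queues() with carrier changes so the stack stops
 * feeding them packets while the link is down. "foo" is hypothetical.
 */
static void foo_link_change(struct net_device *dev, bool up)
{
	if (up) {
		netif_carrier_on(dev);
		netif_tx_wake_all_queues(dev);
	} else {
		netif_carrier_off(dev);
		netif_tx_stop_all_queues(dev);
	}
}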
6906
6907/**
6908 *	register_netdevice	- register a network device
6909 *	@dev: device to register
6910 *
6911 *	Take a completed network device structure and add it to the kernel
6912 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6913 *	chain. 0 is returned on success. A negative errno code is returned
6914 *	on a failure to set up the device, or if the name is a duplicate.
6915 *
6916 *	Callers must hold the rtnl semaphore. You may want
6917 *	register_netdev() instead of this.
6918 *
6919 *	BUGS:
6920 *	The locking appears insufficient to guarantee two parallel registers
6921 *	will not get the same name.
6922 */
6923
6924int register_netdevice(struct net_device *dev)
6925{
6926	int ret;
6927	struct net *net = dev_net(dev);
6928
6929	BUG_ON(dev_boot_phase);
6930	ASSERT_RTNL();
6931
6932	might_sleep();
6933
6934	/* When net_devices are persistent, this will be fatal. */
6935	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6936	BUG_ON(!net);
6937
6938	spin_lock_init(&dev->addr_list_lock);
6939	netdev_set_addr_lockdep_class(dev);
6940
6941	ret = dev_get_valid_name(net, dev, dev->name);
6942	if (ret < 0)
6943		goto out;
6944
6945	/* Init, if this function is available */
6946	if (dev->netdev_ops->ndo_init) {
6947		ret = dev->netdev_ops->ndo_init(dev);
6948		if (ret) {
6949			if (ret > 0)
6950				ret = -EIO;
6951			goto out;
6952		}
6953	}
6954
6955	if (((dev->hw_features | dev->features) &
6956	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6957	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6958	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6959		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6960		ret = -EINVAL;
6961		goto err_uninit;
6962	}
6963
6964	ret = -EBUSY;
6965	if (!dev->ifindex)
6966		dev->ifindex = dev_new_index(net);
6967	else if (__dev_get_by_index(net, dev->ifindex))
6968		goto err_uninit;
6969
6970	/* Transfer changeable features to wanted_features and enable
6971	 * software offloads (GSO and GRO).
6972	 */
6973	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6974	dev->features |= NETIF_F_SOFT_FEATURES;
6975	dev->wanted_features = dev->features & dev->hw_features;
6976
6977	if (!(dev->flags & IFF_LOOPBACK)) {
6978		dev->hw_features |= NETIF_F_NOCACHE_COPY;
6979	}
6980
6981	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6982	 */
6983	dev->vlan_features |= NETIF_F_HIGHDMA;
6984
6985	/* Make NETIF_F_SG inheritable to tunnel devices.
6986	 */
6987	dev->hw_enc_features |= NETIF_F_SG;
6988
6989	/* Make NETIF_F_SG inheritable to MPLS.
6990	 */
6991	dev->mpls_features |= NETIF_F_SG;
6992
6993	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6994	ret = notifier_to_errno(ret);
6995	if (ret)
6996		goto err_uninit;
6997
6998	ret = netdev_register_kobject(dev);
6999	if (ret)
7000		goto err_uninit;
7001	dev->reg_state = NETREG_REGISTERED;
7002
7003	__netdev_update_features(dev);
7004
7005	/*
7006	 *	Default initial state at registration is that the
7007	 *	device is present.
7008	 */
7009
7010	set_bit(__LINK_STATE_PRESENT, &dev->state);
7011
7012	linkwatch_init_dev(dev);
7013
7014	dev_init_scheduler(dev);
7015	dev_hold(dev);
7016	list_netdevice(dev);
7017	add_device_randomness(dev->dev_addr, dev->addr_len);
7018
7019	/* If the device has a permanent device address, the driver should
7020	 * set dev_addr, and addr_assign_type should be left at
7021	 * NET_ADDR_PERM (the default value).
7022	 */
7023	if (dev->addr_assign_type == NET_ADDR_PERM)
7024		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7025
7026	/* Notify protocols that a new device appeared. */
7027	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7028	ret = notifier_to_errno(ret);
7029	if (ret) {
7030		rollback_registered(dev);
7031		dev->reg_state = NETREG_UNREGISTERED;
7032	}
7033	/*
7034	 *	Prevent userspace races by waiting until the network
7035	 *	device is fully setup before sending notifications.
7036	 */
7037	if (!dev->rtnl_link_ops ||
7038	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7039		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7040
7041out:
7042	return ret;
7043
7044err_uninit:
7045	if (dev->netdev_ops->ndo_uninit)
7046		dev->netdev_ops->ndo_uninit(dev);
7047	goto out;
7048}
7049EXPORT_SYMBOL(register_netdevice);
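/* Illustrative sketch, not part of dev.c: register_netdevice() is the
 * right call when RTNL is already held, e.g. from an rtnl_link_ops
 * ->newlink() handler, which the rtnetlink core runs under rtnl_lock().
 * "foo" is hypothetical.
 */
static int foo_newlink(struct net *src_net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[])
{
	/* setup of dev based on tb[]/data[] omitted */
	return register_netdevice(dev);
}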
7050
7051/**
7052 *	init_dummy_netdev	- init a dummy network device for NAPI
7053 *	@dev: device to init
7054 *
7055 *	This takes a network device structure and initializes the minimum
7056 *	set of fields so it can be used to schedule NAPI polls without
7057 *	registering a full blown interface. This is to be used by drivers
7058 *	that need to tie several hardware interfaces to a single NAPI
7059 *	poll scheduler due to HW limitations.
7060 */
7061int init_dummy_netdev(struct net_device *dev)
7062{
7063	/* Clear everything. Note we don't initialize spinlocks
7064	 * as they aren't supposed to be taken by any of the
7065	 * NAPI code and this dummy netdev is supposed to be
7066	 * only ever used for NAPI polls.
7067	 */
7068	memset(dev, 0, sizeof(struct net_device));
7069
7070	/* make sure we BUG if trying to hit standard
7071	 * register/unregister code path
7072	 */
7073	dev->reg_state = NETREG_DUMMY;
7074
7075	/* NAPI wants this */
7076	INIT_LIST_HEAD(&dev->napi_list);
7077
7078	/* a dummy interface is started by default */
7079	set_bit(__LINK_STATE_PRESENT, &dev->state);
7080	set_bit(__LINK_STATE_START, &dev->state);
7081
7082	/* Note: We don't allocate pcpu_refcnt for dummy devices,
7083	 * because users of this 'device' don't need to change
7084	 * its refcount.
7085	 */
7086
7087	return 0;
7088}
7089EXPORT_SYMBOL_GPL(init_dummy_netdev);
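/* Illustrative sketch, not part of dev.c: a driver with several hardware
 * channels but no real net_device of its own can hang its NAPI context
 * off an embedded dummy netdev, as the comment above describes. "foo"
 * names are hypothetical.
 */
struct foo_adapter {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
	/* drain hardware events here, then: */
	napi_complete(napi);
	return 0;
}

static void foo_setup_napi(struct foo_adapter *ad)
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, foo_poll, NAPI_POLL_WEIGHT);
	napi_enable(&ad->napi);
}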
7090
7091
7092/**
7093 *	register_netdev	- register a network device
7094 *	@dev: device to register
7095 *
7096 *	Take a completed network device structure and add it to the kernel
7097 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7098 *	chain. 0 is returned on success. A negative errno code is returned
7099 *	on a failure to set up the device, or if the name is a duplicate.
7100 *
7101 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
7102 *	and expands the device name if you passed a format string to
7103 *	alloc_netdev.
7104 */
7105int register_netdev(struct net_device *dev)
7106{
7107	int err;
7108
7109	rtnl_lock();
7110	err = register_netdevice(dev);
7111	rtnl_unlock();
7112	return err;
7113}
7114EXPORT_SYMBOL(register_netdev);
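/* Illustrative sketch, not part of dev.c: the canonical probe sequence.
 * The "eth%d" format name set up by alloc_etherdev() is expanded to a
 * free name (e.g. "eth0") inside register_netdev() via
 * dev_get_valid_name(). The "foo" names and priv struct are hypothetical.
 */
struct foo_priv { void __iomem *regs; };

static const struct net_device_ops foo_netdev_ops; /* hypothetical, filled in elsewhere */

static int foo_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct foo_priv));	/* name = "eth%d" */
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &foo_netdev_ops;
	err = register_netdev(dev);	/* takes rtnl_lock() itself */
	if (err)
		free_netdev(dev);	/* safe while still unregistered */
	return err;
}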
7115
7116int netdev_refcnt_read(const struct net_device *dev)
7117{
7118	int i, refcnt = 0;
7119
7120	for_each_possible_cpu(i)
7121		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7122	return refcnt;
7123}
7124EXPORT_SYMBOL(netdev_refcnt_read);
7125
7126/**
7127 * netdev_wait_allrefs - wait until all references are gone.
7128 * @dev: target net_device
7129 *
7130 * This is called when unregistering network devices.
7131 *
7132 * Any protocol or device that holds a reference should register
7133 * for netdevice notification, and clean up and put back the
7134 * reference if they receive an UNREGISTER event.
7135 * We can get stuck here if buggy protocols don't correctly
7136 * call dev_put.
7137 */
7138static void netdev_wait_allrefs(struct net_device *dev)
7139{
7140	unsigned long rebroadcast_time, warning_time;
7141	int refcnt;
7142
7143	linkwatch_forget_dev(dev);
7144
7145	rebroadcast_time = warning_time = jiffies;
7146	refcnt = netdev_refcnt_read(dev);
7147
7148	while (refcnt != 0) {
7149		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7150			rtnl_lock();
7151
7152			/* Rebroadcast unregister notification */
7153			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7154
7155			__rtnl_unlock();
7156			rcu_barrier();
7157			rtnl_lock();
7158
7159			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7160			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7161				     &dev->state)) {
7162				/* We must not have linkwatch events
7163				 * pending on unregister. If this
7164				 * happens, we simply run the queue
7165				 * unscheduled, resulting in a noop
7166				 * for this device.
7167				 */
7168				linkwatch_run_queue();
7169			}
7170
7171			__rtnl_unlock();
7172
7173			rebroadcast_time = jiffies;
7174		}
7175
7176		msleep(250);
7177
7178		refcnt = netdev_refcnt_read(dev);
7179
7180		if (time_after(jiffies, warning_time + 10 * HZ)) {
7181			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7182				 dev->name, refcnt);
7183			warning_time = jiffies;
7184		}
7185	}
7186}
7187
7188/* The sequence is:
7189 *
7190 *	rtnl_lock();
7191 *	...
7192 *	register_netdevice(x1);
7193 *	register_netdevice(x2);
7194 *	...
7195 *	unregister_netdevice(y1);
7196 *	unregister_netdevice(y2);
7197 *      ...
7198 *	rtnl_unlock();
7199 *	free_netdev(y1);
7200 *	free_netdev(y2);
7201 *
7202 * We are invoked by rtnl_unlock().
7203 * This allows us to deal with problems:
7204 * 1) We can delete sysfs objects which invoke hotplug
7205 *    without deadlocking with linkwatch via keventd.
7206 * 2) Since we run with the RTNL semaphore not held, we can sleep
7207 *    safely in order to wait for the netdev refcnt to drop to zero.
7208 *
7209 * We must not return until all unregister events added during
7210 * the interval the lock was held have been completed.
7211 */
7212void netdev_run_todo(void)
7213{
7214	struct list_head list;
7215
7216	/* Snapshot list, allow later requests */
7217	list_replace_init(&net_todo_list, &list);
7218
7219	__rtnl_unlock();
7220
7221
7222	/* Wait for rcu callbacks to finish before next phase */
7223	if (!list_empty(&list))
7224		rcu_barrier();
7225
7226	while (!list_empty(&list)) {
7227		struct net_device *dev
7228			= list_first_entry(&list, struct net_device, todo_list);
7229		list_del(&dev->todo_list);
7230
7231		rtnl_lock();
7232		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7233		__rtnl_unlock();
7234
7235		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7236			pr_err("network todo '%s' but state %d\n",
7237			       dev->name, dev->reg_state);
7238			dump_stack();
7239			continue;
7240		}
7241
7242		dev->reg_state = NETREG_UNREGISTERED;
7243
7244		netdev_wait_allrefs(dev);
7245
7246		/* paranoia */
7247		BUG_ON(netdev_refcnt_read(dev));
7248		BUG_ON(!list_empty(&dev->ptype_all));
7249		BUG_ON(!list_empty(&dev->ptype_specific));
7250		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7251		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7252		WARN_ON(dev->dn_ptr);
7253
7254		if (dev->destructor)
7255			dev->destructor(dev);
7256
7257		/* Report a network device has been unregistered */
7258		rtnl_lock();
7259		dev_net(dev)->dev_unreg_count--;
7260		__rtnl_unlock();
7261		wake_up(&netdev_unregistering_wq);
7262
7263		/* Free network device */
7264		kobject_put(&dev->dev.kobj);
7265	}
7266}
7267
7268/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7269 * all the same fields in the same order as net_device_stats, with only
7270 * the type differing, but rtnl_link_stats64 may have additional fields
7271 * at the end for newer counters.
7272 */
7273void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7274			     const struct net_device_stats *netdev_stats)
7275{
7276#if BITS_PER_LONG == 64
7277	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7278	memcpy(stats64, netdev_stats, sizeof(*stats64));
7279	/* zero out counters that only exist in rtnl_link_stats64 */
7280	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7281	       sizeof(*stats64) - sizeof(*netdev_stats));
7282#else
7283	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7284	const unsigned long *src = (const unsigned long *)netdev_stats;
7285	u64 *dst = (u64 *)stats64;
7286
7287	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7288	for (i = 0; i < n; i++)
7289		dst[i] = src[i];
7290	/* zero out counters that only exist in rtnl_link_stats64 */
7291	memset((char *)stats64 + n * sizeof(u64), 0,
7292	       sizeof(*stats64) - n * sizeof(u64));
7293#endif
7294}
7295EXPORT_SYMBOL(netdev_stats_to_stats64);
7296
7297/**
7298 *	dev_get_stats	- get network device statistics
7299 *	@dev: device to get statistics from
7300 *	@storage: place to store stats
7301 *
7302 *	Get network statistics from device. Return @storage.
7303 *	The device driver may provide its own method by setting
7304 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7305 *	otherwise the internal statistics structure is used.
7306 */
7307struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7308					struct rtnl_link_stats64 *storage)
7309{
7310	const struct net_device_ops *ops = dev->netdev_ops;
7311
7312	if (ops->ndo_get_stats64) {
7313		memset(storage, 0, sizeof(*storage));
7314		ops->ndo_get_stats64(dev, storage);
7315	} else if (ops->ndo_get_stats) {
7316		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7317	} else {
7318		netdev_stats_to_stats64(storage, &dev->stats);
7319	}
7320	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7321	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7322	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7323	return storage;
7324}
7325EXPORT_SYMBOL(dev_get_stats);
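/* Illustrative sketch, not part of dev.c: a driver with its own 64-bit
 * counters implements ndo_get_stats64(); dev_get_stats() above has
 * already zeroed @stats. This assumes the pointer-returning prototype of
 * this kernel (later kernels make it void), and a real driver would also
 * need u64_stats_sync on 32-bit systems. "foo" names are hypothetical.
 */
struct foo_stats_priv {
	u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
};

static struct rtnl_link_stats64 *
foo_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
	const struct foo_stats_priv *p = netdev_priv(dev);

	stats->rx_packets = p->rx_packets;
	stats->rx_bytes = p->rx_bytes;
	stats->tx_packets = p->tx_packets;
	stats->tx_bytes = p->tx_bytes;
	return stats;
}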
7326
7327struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7328{
7329	struct netdev_queue *queue = dev_ingress_queue(dev);
7330
7331#ifdef CONFIG_NET_CLS_ACT
7332	if (queue)
7333		return queue;
7334	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7335	if (!queue)
7336		return NULL;
7337	netdev_init_one_queue(dev, queue, NULL);
7338	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7339	queue->qdisc_sleeping = &noop_qdisc;
7340	rcu_assign_pointer(dev->ingress_queue, queue);
7341#endif
7342	return queue;
7343}
7344
7345static const struct ethtool_ops default_ethtool_ops;
7346
7347void netdev_set_default_ethtool_ops(struct net_device *dev,
7348				    const struct ethtool_ops *ops)
7349{
7350	if (dev->ethtool_ops == &default_ethtool_ops)
7351		dev->ethtool_ops = ops;
7352}
7353EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7354
7355void netdev_freemem(struct net_device *dev)
7356{
7357	char *addr = (char *)dev - dev->padded;
7358
7359	kvfree(addr);
7360}
7361
7362/**
7363 *	alloc_netdev_mqs - allocate network device
7364 *	@sizeof_priv:		size of private data to allocate space for
7365 *	@name:			device name format string
7366 *	@name_assign_type:	origin of device name
7367 *	@setup:			callback to initialize device
7368 *	@txqs:			the number of TX subqueues to allocate
7369 *	@rxqs:			the number of RX subqueues to allocate
7370 *
7371 *	Allocates a struct net_device with private data area for driver use
7372 *	and performs basic initialization.  Also allocates subqueue structs
7373 *	for each queue on the device.
7374 */
7375struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7376		unsigned char name_assign_type,
7377		void (*setup)(struct net_device *),
7378		unsigned int txqs, unsigned int rxqs)
7379{
7380	struct net_device *dev;
7381	size_t alloc_size;
7382	struct net_device *p;
7383
7384	BUG_ON(strlen(name) >= sizeof(dev->name));
7385
7386	if (txqs < 1) {
7387		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7388		return NULL;
7389	}
7390
7391#ifdef CONFIG_SYSFS
7392	if (rxqs < 1) {
7393		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7394		return NULL;
7395	}
7396#endif
7397
7398	alloc_size = sizeof(struct net_device);
7399	if (sizeof_priv) {
7400		/* ensure 32-byte alignment of private area */
7401		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7402		alloc_size += sizeof_priv;
7403	}
7404	/* ensure 32-byte alignment of whole construct */
7405	alloc_size += NETDEV_ALIGN - 1;
7406
7407	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7408	if (!p)
7409		p = vzalloc(alloc_size);
7410	if (!p)
7411		return NULL;
7412
7413	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7414	dev->padded = (char *)dev - (char *)p;
7415
7416	dev->pcpu_refcnt = alloc_percpu(int);
7417	if (!dev->pcpu_refcnt)
7418		goto free_dev;
7419
7420	if (dev_addr_init(dev))
7421		goto free_pcpu;
7422
7423	dev_mc_init(dev);
7424	dev_uc_init(dev);
7425
7426	dev_net_set(dev, &init_net);
7427
7428	dev->gso_max_size = GSO_MAX_SIZE;
7429	dev->gso_max_segs = GSO_MAX_SEGS;
7430	dev->gso_min_segs = 0;
7431
7432	INIT_LIST_HEAD(&dev->napi_list);
7433	INIT_LIST_HEAD(&dev->unreg_list);
7434	INIT_LIST_HEAD(&dev->close_list);
7435	INIT_LIST_HEAD(&dev->link_watch_list);
7436	INIT_LIST_HEAD(&dev->adj_list.upper);
7437	INIT_LIST_HEAD(&dev->adj_list.lower);
7438	INIT_LIST_HEAD(&dev->all_adj_list.upper);
7439	INIT_LIST_HEAD(&dev->all_adj_list.lower);
7440	INIT_LIST_HEAD(&dev->ptype_all);
7441	INIT_LIST_HEAD(&dev->ptype_specific);
7442	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7443	setup(dev);
7444
7445	if (!dev->tx_queue_len) {
7446		dev->priv_flags |= IFF_NO_QUEUE;
7447		dev->tx_queue_len = 1;
7448	}
7449
7450	dev->num_tx_queues = txqs;
7451	dev->real_num_tx_queues = txqs;
7452	if (netif_alloc_netdev_queues(dev))
7453		goto free_all;
7454
7455#ifdef CONFIG_SYSFS
7456	dev->num_rx_queues = rxqs;
7457	dev->real_num_rx_queues = rxqs;
7458	if (netif_alloc_rx_queues(dev))
7459		goto free_all;
7460#endif
7461
7462	strcpy(dev->name, name);
7463	dev->name_assign_type = name_assign_type;
7464	dev->group = INIT_NETDEV_GROUP;
7465	if (!dev->ethtool_ops)
7466		dev->ethtool_ops = &default_ethtool_ops;
7467
7468	nf_hook_ingress_init(dev);
7469
7470	return dev;
7471
7472free_all:
7473	free_netdev(dev);
7474	return NULL;
7475
7476free_pcpu:
7477	free_percpu(dev->pcpu_refcnt);
7478free_dev:
7479	netdev_freemem(dev);
7480	return NULL;
7481}
7482EXPORT_SYMBOL(alloc_netdev_mqs);
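/* Illustrative sketch, not part of dev.c: allocating an 8x8 multiqueue
 * Ethernet device straight through alloc_netdev_mqs(); most drivers use
 * the alloc_etherdev()/alloc_etherdev_mq() convenience wrappers instead.
 * The "bar" names are hypothetical.
 */
struct bar_priv { int dummy; };

static struct net_device *bar_alloc(void)
{
	return alloc_netdev_mqs(sizeof(struct bar_priv), "bar%d",
				NET_NAME_UNKNOWN, ether_setup, 8, 8);
}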
7483
7484/**
7485 *	free_netdev - free network device
7486 *	@dev: device
7487 *
7488 *	This function does the last stage of destroying an allocated device
7489 *	interface. The reference to the device object is released.
7490 *	If this is the last reference then it will be freed.
7491 *	Must be called in process context.
7492 */
7493void free_netdev(struct net_device *dev)
7494{
7495	struct napi_struct *p, *n;
7496
7497	might_sleep();
7498	netif_free_tx_queues(dev);
7499#ifdef CONFIG_SYSFS
7500	kvfree(dev->_rx);
7501#endif
7502
7503	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7504
7505	/* Flush device addresses */
7506	dev_addr_flush(dev);
7507
7508	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7509		netif_napi_del(p);
7510
7511	free_percpu(dev->pcpu_refcnt);
7512	dev->pcpu_refcnt = NULL;
7513
7514	/* Compatibility with error handling in drivers */
7515	if (dev->reg_state == NETREG_UNINITIALIZED) {
7516		netdev_freemem(dev);
7517		return;
7518	}
7519
7520	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7521	dev->reg_state = NETREG_RELEASED;
7522
7523	/* will free via device release */
7524	put_device(&dev->dev);
7525}
7526EXPORT_SYMBOL(free_netdev);
7527
7528/**
7529 *	synchronize_net -  Synchronize with packet receive processing
7530 *
7531 *	Wait for packets currently being received to be done.
7532 *	Does not block later packets from starting.
7533 */
7534void synchronize_net(void)
7535{
7536	might_sleep();
7537	if (rtnl_is_locked())
7538		synchronize_rcu_expedited();
7539	else
7540		synchronize_rcu();
7541}
7542EXPORT_SYMBOL(synchronize_net);
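/* Illustrative sketch, not part of dev.c: the unpublish-then-wait pattern
 * synchronize_net() exists for. Once it returns, no receive-path reader
 * can still be using the old object. "foo" names are hypothetical;
 * writers are assumed to run under RTNL.
 */
struct foo_cfg { u32 flags; };
struct foo_cfg_priv { struct foo_cfg __rcu *cfg; };

static void foo_swap_cfg(struct foo_cfg_priv *p, struct foo_cfg *new_cfg)
{
	struct foo_cfg *old = rtnl_dereference(p->cfg);

	rcu_assign_pointer(p->cfg, new_cfg);
	synchronize_net();	/* wait out packet-processing readers */
	kfree(old);
}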
7543
7544/**
7545 *	unregister_netdevice_queue - remove device from the kernel
7546 *	@dev: device
7547 *	@head: list
7548 *
7549 *	This function shuts down a device interface and removes it
7550 *	from the kernel tables.
7551 *	If head is not NULL, the device is queued to be unregistered later.
7552 *
7553 *	Callers must hold the rtnl semaphore.  You may want
7554 *	unregister_netdev() instead of this.
7555 */
7556
7557void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7558{
7559	ASSERT_RTNL();
7560
7561	if (head) {
7562		list_move_tail(&dev->unreg_list, head);
7563	} else {
7564		rollback_registered(dev);
7565		/* Finish processing unregister after unlock */
7566		net_set_todo(dev);
7567	}
7568}
7569EXPORT_SYMBOL(unregister_netdevice_queue);
7570
7571/**
7572 *	unregister_netdevice_many - unregister many devices
7573 *	@head: list of devices
7574 *
7575 *  Note: As most callers use a stack-allocated list_head,
7576 *  we force a list_del() to make sure the stack won't be corrupted later.
7577 */
7578void unregister_netdevice_many(struct list_head *head)
7579{
7580	struct net_device *dev;
7581
7582	if (!list_empty(head)) {
7583		rollback_registered_many(head);
7584		list_for_each_entry(dev, head, unreg_list)
7585			net_set_todo(dev);
7586		list_del(head);
7587	}
7588}
7589EXPORT_SYMBOL(unregister_netdevice_many);
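/* Illustrative sketch, not part of dev.c: batching several
 * unregistrations in one RTNL section amortizes the synchronize_net()
 * calls performed by rollback_registered_many() above. foo_devs[] is
 * hypothetical.
 */
static void foo_remove_all(struct net_device *foo_devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(foo_devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}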
7590
7591/**
7592 *	unregister_netdev - remove device from the kernel
7593 *	@dev: device
7594 *
7595 *	This function shuts down a device interface and removes it
7596 *	from the kernel tables.
7597 *
7598 *	This is just a wrapper for unregister_netdevice that takes
7599 *	the rtnl semaphore.  In general you want to use this and not
7600 *	unregister_netdevice.
7601 */
7602void unregister_netdev(struct net_device *dev)
7603{
7604	rtnl_lock();
7605	unregister_netdevice(dev);
7606	rtnl_unlock();
7607}
7608EXPORT_SYMBOL(unregister_netdev);
7609
7610/**
7611 *	dev_change_net_namespace - move device to a different network namespace
7612 *	@dev: device
7613 *	@net: network namespace
7614 *	@pat: If not NULL name pattern to try if the current device name
7615 *	      is already taken in the destination network namespace.
7616 *
7617 *	This function shuts down a device interface and moves it
7618 *	to a new network namespace. On success 0 is returned, on
7619 *	a failure a negative errno code is returned.
7620 *
7621 *	Callers must hold the rtnl semaphore.
7622 */
7623
7624int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7625{
7626	int err;
7627
7628	ASSERT_RTNL();
7629
7630	/* Don't allow namespace local devices to be moved. */
7631	err = -EINVAL;
7632	if (dev->features & NETIF_F_NETNS_LOCAL)
7633		goto out;
7634
7635	/* Ensure the device has been registered */
7636	if (dev->reg_state != NETREG_REGISTERED)
7637		goto out;
7638
7639	/* Get out if there is nothing to do */
7640	err = 0;
7641	if (net_eq(dev_net(dev), net))
7642		goto out;
7643
7644	/* Pick the destination device name, and ensure
7645	 * we can use it in the destination network namespace.
7646	 */
7647	err = -EEXIST;
7648	if (__dev_get_by_name(net, dev->name)) {
7649		/* We get here if we can't use the current device name */
7650		if (!pat)
7651			goto out;
7652		if (dev_get_valid_name(net, dev, pat) < 0)
7653			goto out;
7654	}
7655
7656	/*
7657	 * And now a mini version of register_netdevice and unregister_netdevice.
7658	 */
7659
7660	/* If device is running close it first. */
7661	dev_close(dev);
7662
7663	/* And unlink it from device chain */
7664	err = -ENODEV;
7665	unlist_netdevice(dev);
7666
7667	synchronize_net();
7668
7669	/* Shutdown queueing discipline. */
7670	dev_shutdown(dev);
7671
7672	/* Notify protocols that we are about to destroy
7673	   this device. They should clean up all their state.
7674
7675	   Note that dev->reg_state stays at NETREG_REGISTERED.
7676	   This is wanted because this way 8021q and macvlan know
7677	   the device is just moving and can keep their slaves up.
7678	*/
7679	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7680	rcu_barrier();
7681	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7682	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7683
7684	/*
7685	 *	Flush the unicast and multicast chains
7686	 */
7687	dev_uc_flush(dev);
7688	dev_mc_flush(dev);
7689
7690	/* Send a netdev-removed uevent to the old namespace */
7691	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7692	netdev_adjacent_del_links(dev);
7693
7694	/* Actually switch the network namespace */
7695	dev_net_set(dev, net);
7696
7697	/* If there is an ifindex conflict assign a new one */
7698	if (__dev_get_by_index(net, dev->ifindex))
7699		dev->ifindex = dev_new_index(net);
7700
7701	/* Send a netdev-add uevent to the new namespace */
7702	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7703	netdev_adjacent_add_links(dev);
7704
7705	/* Fixup kobjects */
7706	err = device_rename(&dev->dev, dev->name);
7707	WARN_ON(err);
7708
7709	/* Add the device back in the hashes */
7710	list_netdevice(dev);
7711
7712	/* Notify protocols, that a new device appeared. */
7713	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7714
7715	/*
7716	 *	Prevent userspace races by waiting until the network
7717	 *	device is fully setup before sending notifications.
7718	 */
7719	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7720
7721	synchronize_net();
7722	err = 0;
7723out:
7724	return err;
7725}
7726EXPORT_SYMBOL_GPL(dev_change_net_namespace);
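/* Illustrative sketch, not part of dev.c: moving a device into the
 * namespace behind an open netns file descriptor. The "dev%d" fallback
 * pattern mirrors what default_device_exit() below uses. "foo" is
 * hypothetical.
 */
static int foo_move_to_netns(struct net_device *dev, int netns_fd)
{
	struct net *net = get_net_ns_by_fd(netns_fd);
	int err;

	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();
	put_net(net);
	return err;
}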
7727
7728static int dev_cpu_callback(struct notifier_block *nfb,
7729			    unsigned long action,
7730			    void *ocpu)
7731{
7732	struct sk_buff **list_skb;
7733	struct sk_buff *skb;
7734	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7735	struct softnet_data *sd, *oldsd;
7736
7737	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7738		return NOTIFY_OK;
7739
7740	local_irq_disable();
7741	cpu = smp_processor_id();
7742	sd = &per_cpu(softnet_data, cpu);
7743	oldsd = &per_cpu(softnet_data, oldcpu);
7744
7745	/* Find end of our completion_queue. */
7746	list_skb = &sd->completion_queue;
7747	while (*list_skb)
7748		list_skb = &(*list_skb)->next;
7749	/* Append completion queue from offline CPU. */
7750	*list_skb = oldsd->completion_queue;
7751	oldsd->completion_queue = NULL;
7752
7753	/* Append output queue from offline CPU. */
7754	if (oldsd->output_queue) {
7755		*sd->output_queue_tailp = oldsd->output_queue;
7756		sd->output_queue_tailp = oldsd->output_queue_tailp;
7757		oldsd->output_queue = NULL;
7758		oldsd->output_queue_tailp = &oldsd->output_queue;
7759	}
7760	/* Append NAPI poll list from offline CPU, with one exception:
7761	 * process_backlog() must be called by the cpu owning the percpu backlog.
7762	 * We properly handle process_queue & input_pkt_queue later.
7763	 */
7764	while (!list_empty(&oldsd->poll_list)) {
7765		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7766							    struct napi_struct,
7767							    poll_list);
7768
7769		list_del_init(&napi->poll_list);
7770		if (napi->poll == process_backlog)
7771			napi->state = 0;
7772		else
7773			____napi_schedule(sd, napi);
7774	}
7775
7776	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7777	local_irq_enable();
7778
7779	/* Process offline CPU's input_pkt_queue */
7780	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7781		netif_rx_ni(skb);
7782		input_queue_head_incr(oldsd);
7783	}
7784	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7785		netif_rx_ni(skb);
7786		input_queue_head_incr(oldsd);
7787	}
7788
7789	return NOTIFY_OK;
7790}
7791
7792
7793/**
7794 *	netdev_increment_features - increment feature set by one
7795 *	@all: current feature set
7796 *	@one: new feature set
7797 *	@mask: mask feature set
7798 *
7799 *	Computes a new feature set after adding a device with feature set
7800 *	@one to the master device with current feature set @all.  Will not
7801 *	enable anything that is off in @mask. Returns the new feature set.
7802 */
7803netdev_features_t netdev_increment_features(netdev_features_t all,
7804	netdev_features_t one, netdev_features_t mask)
7805{
7806	if (mask & NETIF_F_HW_CSUM)
7807		mask |= NETIF_F_CSUM_MASK;
7808	mask |= NETIF_F_VLAN_CHALLENGED;
7809
7810	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7811	all &= one | ~NETIF_F_ALL_FOR_ALL;
7812
7813	/* If one device supports hw checksumming, set for all. */
7814	if (all & NETIF_F_HW_CSUM)
7815		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7816
7817	return all;
7818}
7819EXPORT_SYMBOL(netdev_increment_features);
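/* Illustrative sketch, not part of dev.c: how a bonding-style master can
 * fold the feature sets of its lower devices together, in the spirit of
 * bond_compute_features(). The starting set and @mask choice are
 * hypothetical; netdev_for_each_lower_dev() requires RTNL.
 */
static netdev_features_t foo_fold_lower_features(struct net_device *master,
						 netdev_features_t mask)
{
	netdev_features_t features = mask & ~NETIF_F_ONE_FOR_ALL;
	struct net_device *lower;
	struct list_head *iter;

	features |= NETIF_F_ALL_FOR_ALL;
	netdev_for_each_lower_dev(master, lower, iter)
		features = netdev_increment_features(features,
						     lower->features, mask);
	return features;
}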
7820
7821static struct hlist_head * __net_init netdev_create_hash(void)
7822{
7823	int i;
7824	struct hlist_head *hash;
7825
7826	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7827	if (hash != NULL)
7828		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7829			INIT_HLIST_HEAD(&hash[i]);
7830
7831	return hash;
7832}
7833
7834/* Initialize per network namespace state */
7835static int __net_init netdev_init(struct net *net)
7836{
{
7837	if (net != &init_net)
7838		INIT_LIST_HEAD(&net->dev_base_head);
7839
7840	net->dev_name_head = netdev_create_hash();
7841	if (net->dev_name_head == NULL)
7842		goto err_name;
7843
7844	net->dev_index_head = netdev_create_hash();
7845	if (net->dev_index_head == NULL)
7846		goto err_idx;
7847
7848	return 0;
7849
7850err_idx:
7851	kfree(net->dev_name_head);
7852err_name:
7853	return -ENOMEM;
7854}
7855
7856/**
7857 *	netdev_drivername - network driver for the device
7858 *	@dev: network device
7859 *
7860 *	Determine network driver for device.
7861 */
7862const char *netdev_drivername(const struct net_device *dev)
7863{
7864	const struct device_driver *driver;
7865	const struct device *parent;
7866	const char *empty = "";
7867
7868	parent = dev->dev.parent;
7869	if (!parent)
7870		return empty;
7871
7872	driver = parent->driver;
7873	if (driver && driver->name)
7874		return driver->name;
7875	return empty;
7876}
7877
7878static void __netdev_printk(const char *level, const struct net_device *dev,
7879			    struct va_format *vaf)
7880{
7881	if (dev && dev->dev.parent) {
7882		dev_printk_emit(level[1] - '0',
7883				dev->dev.parent,
7884				"%s %s %s%s: %pV",
7885				dev_driver_string(dev->dev.parent),
7886				dev_name(dev->dev.parent),
7887				netdev_name(dev), netdev_reg_state(dev),
7888				vaf);
7889	} else if (dev) {
7890		printk("%s%s%s: %pV",
7891		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7892	} else {
7893		printk("%s(NULL net_device): %pV", level, vaf);
7894	}
7895}
7896
7897void netdev_printk(const char *level, const struct net_device *dev,
7898		   const char *format, ...)
7899{
7900	struct va_format vaf;
7901	va_list args;
7902
7903	va_start(args, format);
7904
7905	vaf.fmt = format;
7906	vaf.va = &args;
7907
7908	__netdev_printk(level, dev, &vaf);
7909
7910	va_end(args);
7911}
7912EXPORT_SYMBOL(netdev_printk);
7913
7914#define define_netdev_printk_level(func, level)			\
7915void func(const struct net_device *dev, const char *fmt, ...)	\
7916{								\
7917	struct va_format vaf;					\
7918	va_list args;						\
7919								\
7920	va_start(args, fmt);					\
7921								\
7922	vaf.fmt = fmt;						\
7923	vaf.va = &args;						\
7924								\
7925	__netdev_printk(level, dev, &vaf);			\
7926								\
7927	va_end(args);						\
7928}								\
7929EXPORT_SYMBOL(func);
7930
7931define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7932define_netdev_printk_level(netdev_alert, KERN_ALERT);
7933define_netdev_printk_level(netdev_crit, KERN_CRIT);
7934define_netdev_printk_level(netdev_err, KERN_ERR);
7935define_netdev_printk_level(netdev_warn, KERN_WARNING);
7936define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7937define_netdev_printk_level(netdev_info, KERN_INFO);
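/* Illustrative sketch, not part of dev.c: the helpers generated above are
 * used like dev_info() and friends, but prefix each message with the
 * driver, bus and interface names (plus registration state, see
 * __netdev_printk()). "foo" is hypothetical.
 */
static void foo_report_link(struct net_device *dev, unsigned int mbps)
{
	if (netif_carrier_ok(dev))
		netdev_info(dev, "link up, %u Mbps\n", mbps);
	else
		netdev_warn(dev, "link down\n");
}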
7938
7939static void __net_exit netdev_exit(struct net *net)
7940{
7941	kfree(net->dev_name_head);
7942	kfree(net->dev_index_head);
7943}
7944
7945static struct pernet_operations __net_initdata netdev_net_ops = {
7946	.init = netdev_init,
7947	.exit = netdev_exit,
7948};
7949
7950static void __net_exit default_device_exit(struct net *net)
7951{
7952	struct net_device *dev, *aux;
7953	/*
7954	 * Push all migratable network devices back to the
7955	 * initial network namespace
7956	 */
7957	rtnl_lock();
7958	for_each_netdev_safe(net, dev, aux) {
7959		int err;
7960		char fb_name[IFNAMSIZ];
7961
7962		/* Ignore unmovable devices (e.g. loopback) */
7963		if (dev->features & NETIF_F_NETNS_LOCAL)
7964			continue;
7965
7966		/* Leave virtual devices for the generic cleanup */
7967		if (dev->rtnl_link_ops)
7968			continue;
7969
7970		/* Push remaining network devices to init_net */
7971		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7972		err = dev_change_net_namespace(dev, &init_net, fb_name);
7973		if (err) {
7974			pr_emerg("%s: failed to move %s to init_net: %d\n",
7975				 __func__, dev->name, err);
7976			BUG();
7977		}
7978	}
7979	rtnl_unlock();
7980}
7981
7982static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7983{
7984	/* Return with the rtnl_lock held when there are no network
7985	 * devices unregistering in any network namespace in net_list.
7986	 */
7987	struct net *net;
7988	bool unregistering;
7989	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7990
7991	add_wait_queue(&netdev_unregistering_wq, &wait);
7992	for (;;) {
7993		unregistering = false;
7994		rtnl_lock();
7995		list_for_each_entry(net, net_list, exit_list) {
7996			if (net->dev_unreg_count > 0) {
7997				unregistering = true;
7998				break;
7999			}
8000		}
8001		if (!unregistering)
8002			break;
8003		__rtnl_unlock();
8004
8005		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8006	}
8007	remove_wait_queue(&netdev_unregistering_wq, &wait);
8008}
8009
8010static void __net_exit default_device_exit_batch(struct list_head *net_list)
8011{
8012	/* At exit all network devices must be removed from a network
8013	 * namespace.  Do this in the reverse order of registration.
8014	 * Do this across as many network namespaces as possible to
8015	 * improve batching efficiency.
8016	 */
8017	struct net_device *dev;
8018	struct net *net;
8019	LIST_HEAD(dev_kill_list);
8020
8021	/* To prevent network device cleanup code from dereferencing
8022	 * loopback devices or network devices that have been freed,
8023	 * wait here for all pending unregistrations to complete
8024	 * before unregistering the loopback device and allowing the
8025	 * network namespace to be freed.
8026	 *
8027	 * The netdev todo list containing all network devices
8028	 * unregistrations that happen in default_device_exit_batch
8029	 * will run in the rtnl_unlock() at the end of
8030	 * default_device_exit_batch.
8031	 */
8032	rtnl_lock_unregistering(net_list);
8033	list_for_each_entry(net, net_list, exit_list) {
8034		for_each_netdev_reverse(net, dev) {
8035			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8036				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8037			else
8038				unregister_netdevice_queue(dev, &dev_kill_list);
8039		}
8040	}
8041	unregister_netdevice_many(&dev_kill_list);
8042	rtnl_unlock();
8043}
8044
8045static struct pernet_operations __net_initdata default_device_ops = {
8046	.exit = default_device_exit,
8047	.exit_batch = default_device_exit_batch,
8048};
8049
8050/*
8051 *	Initialize the DEV module. At boot time this walks the device list and
8052 *	unhooks any devices that fail to initialise (normally hardware not
8053 *	present) and leaves us with a valid list of present and active devices.
8054 *
8055 */
8056
8057/*
8058 *       This is called single-threaded during boot, so no need
8059 *       to take the rtnl semaphore.
8060 */
8061static int __init net_dev_init(void)
8062{
8063	int i, rc = -ENOMEM;
8064
8065	BUG_ON(!dev_boot_phase);
8066
8067	if (dev_proc_init())
8068		goto out;
8069
8070	if (netdev_kobject_init())
8071		goto out;
8072
8073	INIT_LIST_HEAD(&ptype_all);
8074	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8075		INIT_LIST_HEAD(&ptype_base[i]);
8076
8077	INIT_LIST_HEAD(&offload_base);
8078
8079	if (register_pernet_subsys(&netdev_net_ops))
8080		goto out;
8081
8082	/*
8083	 *	Initialise the packet receive queues.
8084	 */
8085
8086	for_each_possible_cpu(i) {
8087		struct softnet_data *sd = &per_cpu(softnet_data, i);
8088
8089		skb_queue_head_init(&sd->input_pkt_queue);
8090		skb_queue_head_init(&sd->process_queue);
8091		INIT_LIST_HEAD(&sd->poll_list);
8092		sd->output_queue_tailp = &sd->output_queue;
8093#ifdef CONFIG_RPS
8094		sd->csd.func = rps_trigger_softirq;
8095		sd->csd.info = sd;
8096		sd->cpu = i;
8097#endif
8098
8099		sd->backlog.poll = process_backlog;
8100		sd->backlog.weight = weight_p;
8101	}
8102
8103	dev_boot_phase = 0;
8104
8105	/* The loopback device is special: if any other network device
8106	 * is present in a network namespace, the loopback device must
8107	 * be present too. Since we now dynamically allocate and free the
8108	 * loopback device, ensure this invariant is maintained by
8109	 * keeping the loopback device as the first device on the
8110	 * list of network devices, ensuring the loopback device
8111	 * is the first device that appears and the last network device
8112	 * that disappears.
8113	 */
8114	if (register_pernet_device(&loopback_net_ops))
8115		goto out;
8116
8117	if (register_pernet_device(&default_device_ops))
8118		goto out;
8119
8120	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8121	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8122
8123	hotcpu_notifier(dev_cpu_callback, 0);
8124	dst_subsys_init();
8125	rc = 0;
8126out:
8127	return rc;
8128}
8129
8130subsys_initcall(net_dev_init);