    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/string.h>
   83#include <linux/mm.h>
   84#include <linux/socket.h>
   85#include <linux/sockios.h>
   86#include <linux/errno.h>
   87#include <linux/interrupt.h>
   88#include <linux/if_ether.h>
   89#include <linux/netdevice.h>
   90#include <linux/etherdevice.h>
   91#include <linux/ethtool.h>
   92#include <linux/skbuff.h>
   93#include <linux/bpf.h>
   94#include <linux/bpf_trace.h>
   95#include <net/net_namespace.h>
   96#include <net/sock.h>
   97#include <net/busy_poll.h>
   98#include <linux/rtnetlink.h>
   99#include <linux/stat.h>
  100#include <net/dst.h>
  101#include <net/dst_metadata.h>
  102#include <net/pkt_sched.h>
  103#include <net/pkt_cls.h>
  104#include <net/checksum.h>
  105#include <net/xfrm.h>
  106#include <linux/highmem.h>
  107#include <linux/init.h>
  108#include <linux/module.h>
  109#include <linux/netpoll.h>
  110#include <linux/rcupdate.h>
  111#include <linux/delay.h>
  112#include <net/iw_handler.h>
  113#include <asm/current.h>
  114#include <linux/audit.h>
  115#include <linux/dmaengine.h>
  116#include <linux/err.h>
  117#include <linux/ctype.h>
  118#include <linux/if_arp.h>
  119#include <linux/if_vlan.h>
  120#include <linux/ip.h>
  121#include <net/ip.h>
  122#include <net/mpls.h>
  123#include <linux/ipv6.h>
  124#include <linux/in.h>
  125#include <linux/jhash.h>
  126#include <linux/random.h>
  127#include <trace/events/napi.h>
  128#include <trace/events/net.h>
  129#include <trace/events/skb.h>
  130#include <linux/inetdevice.h>
  131#include <linux/cpu_rmap.h>
  132#include <linux/static_key.h>
  133#include <linux/hashtable.h>
  134#include <linux/vmalloc.h>
  135#include <linux/if_macvlan.h>
  136#include <linux/errqueue.h>
  137#include <linux/hrtimer.h>
  138#include <linux/netfilter_ingress.h>
  139#include <linux/crash_dump.h>
  140#include <linux/sctp.h>
  141#include <net/udp_tunnel.h>
  142#include <linux/net_namespace.h>
  143#include <linux/indirect_call_wrapper.h>
  144#include <net/devlink.h>
  145
  146#include "net-sysfs.h"
  147
  148#define MAX_GRO_SKBS 8
  149#define MAX_NEST_DEV 8
  150
  151/* This should be increased if a protocol with a bigger head is added. */
  152#define GRO_MAX_HEAD (MAX_HEADER + 128)
  153
  154static DEFINE_SPINLOCK(ptype_lock);
  155static DEFINE_SPINLOCK(offload_lock);
  156struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  157struct list_head ptype_all __read_mostly;	/* Taps */
  158static struct list_head offload_base __read_mostly;
  159
  160static int netif_rx_internal(struct sk_buff *skb);
  161static int call_netdevice_notifiers_info(unsigned long val,
  162					 struct netdev_notifier_info *info);
  163static int call_netdevice_notifiers_extack(unsigned long val,
  164					   struct net_device *dev,
  165					   struct netlink_ext_ack *extack);
  166static struct napi_struct *napi_by_id(unsigned int napi_id);
  167
  168/*
  169 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  170 * semaphore.
  171 *
  172 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  173 *
  174 * Writers must hold the rtnl semaphore while they loop through the
  175 * dev_base_head list, and hold dev_base_lock for writing when they do the
  176 * actual updates.  This allows pure readers to access the list even
  177 * while a writer is preparing to update it.
  178 *
  179 * To put it another way, dev_base_lock is held for writing only to
  180 * protect against pure readers; the rtnl semaphore provides the
  181 * protection against other writers.
  182 *
  183 * See, for example usages, register_netdevice() and
  184 * unregister_netdevice(), which must be called with the rtnl
  185 * semaphore held.
  186 */
  187DEFINE_RWLOCK(dev_base_lock);
  188EXPORT_SYMBOL(dev_base_lock);
  189
  190static DEFINE_MUTEX(ifalias_mutex);
  191
  192/* protects napi_hash addition/deletion and napi_gen_id */
  193static DEFINE_SPINLOCK(napi_hash_lock);
  194
  195static unsigned int napi_gen_id = NR_CPUS;
  196static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  197
  198static seqcount_t devnet_rename_seq;
  199
  200static inline void dev_base_seq_inc(struct net *net)
  201{
  202	while (++net->dev_base_seq == 0)
  203		;
  204}
  205
  206static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  207{
  208	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  209
  210	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  211}
  212
  213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  214{
  215	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  216}
  217
  218static inline void rps_lock(struct softnet_data *sd)
  219{
  220#ifdef CONFIG_RPS
  221	spin_lock(&sd->input_pkt_queue.lock);
  222#endif
  223}
  224
  225static inline void rps_unlock(struct softnet_data *sd)
  226{
  227#ifdef CONFIG_RPS
  228	spin_unlock(&sd->input_pkt_queue.lock);
  229#endif
  230}
  231
  232/* Device list insertion */
  233static void list_netdevice(struct net_device *dev)
  234{
  235	struct net *net = dev_net(dev);
  236
  237	ASSERT_RTNL();
  238
  239	write_lock_bh(&dev_base_lock);
  240	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  241	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
  242	hlist_add_head_rcu(&dev->index_hlist,
  243			   dev_index_hash(net, dev->ifindex));
  244	write_unlock_bh(&dev_base_lock);
  245
  246	dev_base_seq_inc(net);
  247}
  248
  249/* Device list removal
  250 * caller must respect a RCU grace period before freeing/reusing dev
  251 */
  252static void unlist_netdevice(struct net_device *dev)
  253{
  254	ASSERT_RTNL();
  255
  256	/* Unlink dev from the device chain */
  257	write_lock_bh(&dev_base_lock);
  258	list_del_rcu(&dev->dev_list);
  259	hlist_del_rcu(&dev->name_hlist);
  260	hlist_del_rcu(&dev->index_hlist);
  261	write_unlock_bh(&dev_base_lock);
  262
  263	dev_base_seq_inc(dev_net(dev));
  264}
  265
  266/*
  267 *	Our notifier list
  268 */
  269
  270static RAW_NOTIFIER_HEAD(netdev_chain);
  271
  272/*
  273 *	Device drivers call our routines to queue packets here. We empty the
  274 *	queue in the local softnet handler.
  275 */
  276
  277DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  278EXPORT_PER_CPU_SYMBOL(softnet_data);
  279
  280/*******************************************************************************
  281 *
  282 *		Protocol management and registration routines
  283 *
  284 *******************************************************************************/
  285
  286
  287/*
  288 *	Add a protocol ID to the list. Now that the input handler is
  289 *	smarter we can dispense with all the messy stuff that used to be
  290 *	here.
  291 *
  292 *	BEWARE!!! Protocol handlers, mangling input packets,
  293 *	MUST BE last in hash buckets and checking protocol handlers
  294 *	MUST start from promiscuous ptype_all chain in net_bh.
  295 *	It is true now, do not change it.
  296 *	Explanation follows: if protocol handler, mangling packet, will
  297 *	be the first on list, it is not able to sense, that packet
  298 *	is cloned and should be copied-on-write, so that it will
  299 *	change it and subsequent readers will get broken packet.
  300 *							--ANK (980803)
  301 */
  302
  303static inline struct list_head *ptype_head(const struct packet_type *pt)
  304{
  305	if (pt->type == htons(ETH_P_ALL))
  306		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  307	else
  308		return pt->dev ? &pt->dev->ptype_specific :
  309				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  310}
  311
  312/**
  313 *	dev_add_pack - add packet handler
  314 *	@pt: packet type declaration
  315 *
  316 *	Add a protocol handler to the networking stack. The passed &packet_type
  317 *	is linked into kernel lists and may not be freed until it has been
  318 *	removed from the kernel lists.
  319 *
   320 *	This call does not sleep, therefore it cannot
   321 *	guarantee that all CPUs that are in the middle of receiving packets
  322 *	will see the new packet type (until the next received packet).
  323 */
  324
  325void dev_add_pack(struct packet_type *pt)
  326{
  327	struct list_head *head = ptype_head(pt);
  328
  329	spin_lock(&ptype_lock);
  330	list_add_rcu(&pt->list, head);
  331	spin_unlock(&ptype_lock);
  332}
  333EXPORT_SYMBOL(dev_add_pack);
  334
  335/**
  336 *	__dev_remove_pack	 - remove packet handler
  337 *	@pt: packet type declaration
  338 *
  339 *	Remove a protocol handler that was previously added to the kernel
  340 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  341 *	from the kernel lists and can be freed or reused once this function
  342 *	returns.
  343 *
  344 *      The packet type might still be in use by receivers
   345 *	and must not be freed until after all the CPUs have gone
  346 *	through a quiescent state.
  347 */
  348void __dev_remove_pack(struct packet_type *pt)
  349{
  350	struct list_head *head = ptype_head(pt);
  351	struct packet_type *pt1;
  352
  353	spin_lock(&ptype_lock);
  354
  355	list_for_each_entry(pt1, head, list) {
  356		if (pt == pt1) {
  357			list_del_rcu(&pt->list);
  358			goto out;
  359		}
  360	}
  361
  362	pr_warn("dev_remove_pack: %p not found\n", pt);
  363out:
  364	spin_unlock(&ptype_lock);
  365}
  366EXPORT_SYMBOL(__dev_remove_pack);
  367
  368/**
  369 *	dev_remove_pack	 - remove packet handler
  370 *	@pt: packet type declaration
  371 *
  372 *	Remove a protocol handler that was previously added to the kernel
  373 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  374 *	from the kernel lists and can be freed or reused once this function
  375 *	returns.
  376 *
  377 *	This call sleeps to guarantee that no CPU is looking at the packet
  378 *	type after return.
  379 */
  380void dev_remove_pack(struct packet_type *pt)
  381{
  382	__dev_remove_pack(pt);
  383
  384	synchronize_net();
  385}
  386EXPORT_SYMBOL(dev_remove_pack);
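
/*
 * Editorial illustration, not part of the original file: a minimal sketch
 * of how a module might use dev_add_pack()/dev_remove_pack().  The names
 * my_tap_rcv, my_tap and the init/exit hooks are hypothetical.
 */
#if 0	/* example only */
static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler owns this reference to the skb and must release it. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type my_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* tap: see every protocol */
	.func = my_tap_rcv,
};

static int __init my_tap_init(void)
{
	dev_add_pack(&my_tap);
	return 0;
}

static void __exit my_tap_exit(void)
{
	/* Sleeps in synchronize_net() so no CPU still sees &my_tap. */
	dev_remove_pack(&my_tap);
}
#endif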
  387
  388
  389/**
  390 *	dev_add_offload - register offload handlers
  391 *	@po: protocol offload declaration
  392 *
  393 *	Add protocol offload handlers to the networking stack. The passed
  394 *	&proto_offload is linked into kernel lists and may not be freed until
  395 *	it has been removed from the kernel lists.
  396 *
   397 *	This call does not sleep, therefore it cannot
   398 *	guarantee that all CPUs that are in the middle of receiving packets
  399 *	will see the new offload handlers (until the next received packet).
  400 */
  401void dev_add_offload(struct packet_offload *po)
  402{
  403	struct packet_offload *elem;
  404
  405	spin_lock(&offload_lock);
  406	list_for_each_entry(elem, &offload_base, list) {
  407		if (po->priority < elem->priority)
  408			break;
  409	}
  410	list_add_rcu(&po->list, elem->list.prev);
  411	spin_unlock(&offload_lock);
  412}
  413EXPORT_SYMBOL(dev_add_offload);
  414
  415/**
  416 *	__dev_remove_offload	 - remove offload handler
  417 *	@po: packet offload declaration
  418 *
  419 *	Remove a protocol offload handler that was previously added to the
  420 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
  421 *	is removed from the kernel lists and can be freed or reused once this
  422 *	function returns.
  423 *
  424 *      The packet type might still be in use by receivers
   425 *	and must not be freed until after all the CPUs have gone
  426 *	through a quiescent state.
  427 */
  428static void __dev_remove_offload(struct packet_offload *po)
  429{
  430	struct list_head *head = &offload_base;
  431	struct packet_offload *po1;
  432
  433	spin_lock(&offload_lock);
  434
  435	list_for_each_entry(po1, head, list) {
  436		if (po == po1) {
  437			list_del_rcu(&po->list);
  438			goto out;
  439		}
  440	}
  441
  442	pr_warn("dev_remove_offload: %p not found\n", po);
  443out:
  444	spin_unlock(&offload_lock);
  445}
  446
  447/**
  448 *	dev_remove_offload	 - remove packet offload handler
  449 *	@po: packet offload declaration
  450 *
  451 *	Remove a packet offload handler that was previously added to the kernel
  452 *	offload handlers by dev_add_offload(). The passed &offload_type is
  453 *	removed from the kernel lists and can be freed or reused once this
  454 *	function returns.
  455 *
  456 *	This call sleeps to guarantee that no CPU is looking at the packet
  457 *	type after return.
  458 */
  459void dev_remove_offload(struct packet_offload *po)
  460{
  461	__dev_remove_offload(po);
  462
  463	synchronize_net();
  464}
  465EXPORT_SYMBOL(dev_remove_offload);
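
/*
 * Editorial illustration, not part of the original file: a sketch of how a
 * protocol might register GRO callbacks with dev_add_offload().  The stub
 * handlers and the my_offload object are hypothetical; the callback
 * signatures are assumed to follow struct offload_callbacks in this
 * kernel version.
 */
#if 0	/* example only */
static struct sk_buff *my_gro_receive(struct list_head *head,
				      struct sk_buff *skb)
{
	/* Look for a flow on @head to merge with; NULL = no decision yet. */
	return NULL;
}

static int my_gro_complete(struct sk_buff *skb, int nhoff)
{
	/* Fix up the headers of the merged packet starting at @nhoff. */
	return 0;
}

static struct packet_offload my_offload __read_mostly = {
	.type     = cpu_to_be16(ETH_P_IP),
	.priority = 10,			/* after the built-in IPv4 offload */
	.callbacks = {
		.gro_receive  = my_gro_receive,
		.gro_complete = my_gro_complete,
	},
};

static int __init my_offload_init(void)
{
	dev_add_offload(&my_offload);
	return 0;
}

static void __exit my_offload_exit(void)
{
	dev_remove_offload(&my_offload);	/* sleeps in synchronize_net() */
}
#endif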
  466
  467/******************************************************************************
  468 *
  469 *		      Device Boot-time Settings Routines
  470 *
  471 ******************************************************************************/
  472
  473/* Boot time configuration table */
  474static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
  475
  476/**
  477 *	netdev_boot_setup_add	- add new setup entry
  478 *	@name: name of the device
  479 *	@map: configured settings for the device
  480 *
  481 *	Adds new setup entry to the dev_boot_setup list.  The function
   482 *	returns 0 on error and 1 on success.  This is a generic routine for
  483 *	all netdevices.
  484 */
  485static int netdev_boot_setup_add(char *name, struct ifmap *map)
  486{
  487	struct netdev_boot_setup *s;
  488	int i;
  489
  490	s = dev_boot_setup;
  491	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  492		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
  493			memset(s[i].name, 0, sizeof(s[i].name));
  494			strlcpy(s[i].name, name, IFNAMSIZ);
  495			memcpy(&s[i].map, map, sizeof(s[i].map));
  496			break;
  497		}
  498	}
  499
  500	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
  501}
  502
  503/**
  504 * netdev_boot_setup_check	- check boot time settings
  505 * @dev: the netdevice
  506 *
  507 * Check boot time settings for the device.
  508 * The found settings are set for the device to be used
  509 * later in the device probing.
  510 * Returns 0 if no settings found, 1 if they are.
  511 */
  512int netdev_boot_setup_check(struct net_device *dev)
  513{
  514	struct netdev_boot_setup *s = dev_boot_setup;
  515	int i;
  516
  517	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  518		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
  519		    !strcmp(dev->name, s[i].name)) {
  520			dev->irq = s[i].map.irq;
  521			dev->base_addr = s[i].map.base_addr;
  522			dev->mem_start = s[i].map.mem_start;
  523			dev->mem_end = s[i].map.mem_end;
  524			return 1;
  525		}
  526	}
  527	return 0;
  528}
  529EXPORT_SYMBOL(netdev_boot_setup_check);
  530
  531
  532/**
  533 * netdev_boot_base	- get address from boot time settings
  534 * @prefix: prefix for network device
  535 * @unit: id for network device
  536 *
  537 * Check boot time settings for the base address of device.
  538 * The found settings are set for the device to be used
  539 * later in the device probing.
  540 * Returns 0 if no settings found.
  541 */
  542unsigned long netdev_boot_base(const char *prefix, int unit)
  543{
  544	const struct netdev_boot_setup *s = dev_boot_setup;
  545	char name[IFNAMSIZ];
  546	int i;
  547
  548	sprintf(name, "%s%d", prefix, unit);
  549
  550	/*
  551	 * If device already registered then return base of 1
  552	 * to indicate not to probe for this interface
  553	 */
  554	if (__dev_get_by_name(&init_net, name))
  555		return 1;
  556
  557	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
  558		if (!strcmp(name, s[i].name))
  559			return s[i].map.base_addr;
  560	return 0;
  561}
  562
  563/*
  564 * Saves at boot time configured settings for any netdevice.
  565 */
  566int __init netdev_boot_setup(char *str)
  567{
  568	int ints[5];
  569	struct ifmap map;
  570
  571	str = get_options(str, ARRAY_SIZE(ints), ints);
  572	if (!str || !*str)
  573		return 0;
  574
  575	/* Save settings */
  576	memset(&map, 0, sizeof(map));
  577	if (ints[0] > 0)
  578		map.irq = ints[1];
  579	if (ints[0] > 1)
  580		map.base_addr = ints[2];
  581	if (ints[0] > 2)
  582		map.mem_start = ints[3];
  583	if (ints[0] > 3)
  584		map.mem_end = ints[4];
  585
  586	/* Add new entry to the list */
  587	return netdev_boot_setup_add(str, &map);
  588}
  589
  590__setup("netdev=", netdev_boot_setup);
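
/*
 * Editorial illustration, not part of the original file: with the parsing
 * above, a (hypothetical) command line such as
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth0
 *
 * records irq=9, base_addr=0x300, mem_start=0xd0000 and mem_end=0xd4000
 * for "eth0"; fewer integers may be supplied, and the trailing string is
 * the name handed to netdev_boot_setup_add().
 */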
  591
  592/*******************************************************************************
  593 *
  594 *			    Device Interface Subroutines
  595 *
  596 *******************************************************************************/
  597
  598/**
   599 *	dev_get_iflink	- get 'iflink' value of an interface
  600 *	@dev: targeted interface
  601 *
  602 *	Indicates the ifindex the interface is linked to.
  603 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  604 */
  605
  606int dev_get_iflink(const struct net_device *dev)
  607{
  608	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  609		return dev->netdev_ops->ndo_get_iflink(dev);
  610
  611	return dev->ifindex;
  612}
  613EXPORT_SYMBOL(dev_get_iflink);
  614
  615/**
  616 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  617 *	@dev: targeted interface
  618 *	@skb: The packet.
  619 *
  620 *	For better visibility of tunnel traffic OVS needs to retrieve
   621 *	egress tunnel information for a packet. The following API allows
   622 *	the user to get this info.
  623 */
  624int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  625{
  626	struct ip_tunnel_info *info;
  627
  628	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  629		return -EINVAL;
  630
  631	info = skb_tunnel_info_unclone(skb);
  632	if (!info)
  633		return -ENOMEM;
  634	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  635		return -EINVAL;
  636
  637	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  638}
  639EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  640
  641/**
  642 *	__dev_get_by_name	- find a device by its name
  643 *	@net: the applicable net namespace
  644 *	@name: name to find
  645 *
  646 *	Find an interface by name. Must be called under RTNL semaphore
  647 *	or @dev_base_lock. If the name is found a pointer to the device
  648 *	is returned. If the name is not found then %NULL is returned. The
  649 *	reference counters are not incremented so the caller must be
  650 *	careful with locks.
  651 */
  652
  653struct net_device *__dev_get_by_name(struct net *net, const char *name)
  654{
  655	struct net_device *dev;
  656	struct hlist_head *head = dev_name_hash(net, name);
  657
  658	hlist_for_each_entry(dev, head, name_hlist)
  659		if (!strncmp(dev->name, name, IFNAMSIZ))
  660			return dev;
  661
  662	return NULL;
  663}
  664EXPORT_SYMBOL(__dev_get_by_name);
  665
  666/**
  667 * dev_get_by_name_rcu	- find a device by its name
  668 * @net: the applicable net namespace
  669 * @name: name to find
  670 *
  671 * Find an interface by name.
  672 * If the name is found a pointer to the device is returned.
  673 * If the name is not found then %NULL is returned.
  674 * The reference counters are not incremented so the caller must be
  675 * careful with locks. The caller must hold RCU lock.
  676 */
  677
  678struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  679{
  680	struct net_device *dev;
  681	struct hlist_head *head = dev_name_hash(net, name);
  682
  683	hlist_for_each_entry_rcu(dev, head, name_hlist)
  684		if (!strncmp(dev->name, name, IFNAMSIZ))
  685			return dev;
  686
  687	return NULL;
  688}
  689EXPORT_SYMBOL(dev_get_by_name_rcu);
  690
  691/**
  692 *	dev_get_by_name		- find a device by its name
  693 *	@net: the applicable net namespace
  694 *	@name: name to find
  695 *
  696 *	Find an interface by name. This can be called from any
  697 *	context and does its own locking. The returned handle has
  698 *	the usage count incremented and the caller must use dev_put() to
  699 *	release it when it is no longer needed. %NULL is returned if no
  700 *	matching device is found.
  701 */
  702
  703struct net_device *dev_get_by_name(struct net *net, const char *name)
  704{
  705	struct net_device *dev;
  706
  707	rcu_read_lock();
  708	dev = dev_get_by_name_rcu(net, name);
  709	if (dev)
  710		dev_hold(dev);
  711	rcu_read_unlock();
  712	return dev;
  713}
  714EXPORT_SYMBOL(dev_get_by_name);
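
/*
 * Editorial illustration, not part of the original file: the lookup/release
 * pattern for dev_get_by_name().  The interface name "eth0" and the
 * example_lookup_by_name() helper are hypothetical.
 */
#if 0	/* example only */
static void example_lookup_by_name(void)
{
	struct net_device *dev;

	dev = dev_get_by_name(&init_net, "eth0");	/* takes a reference */
	if (!dev)
		return;

	pr_info("%s: ifindex %d\n", dev->name, dev->ifindex);
	dev_put(dev);					/* release it */
}
#endif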
  715
  716/**
  717 *	__dev_get_by_index - find a device by its ifindex
  718 *	@net: the applicable net namespace
  719 *	@ifindex: index of device
  720 *
  721 *	Search for an interface by index. Returns %NULL if the device
  722 *	is not found or a pointer to the device. The device has not
  723 *	had its reference counter increased so the caller must be careful
  724 *	about locking. The caller must hold either the RTNL semaphore
  725 *	or @dev_base_lock.
  726 */
  727
  728struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  729{
  730	struct net_device *dev;
  731	struct hlist_head *head = dev_index_hash(net, ifindex);
  732
  733	hlist_for_each_entry(dev, head, index_hlist)
  734		if (dev->ifindex == ifindex)
  735			return dev;
  736
  737	return NULL;
  738}
  739EXPORT_SYMBOL(__dev_get_by_index);
  740
  741/**
  742 *	dev_get_by_index_rcu - find a device by its ifindex
  743 *	@net: the applicable net namespace
  744 *	@ifindex: index of device
  745 *
  746 *	Search for an interface by index. Returns %NULL if the device
  747 *	is not found or a pointer to the device. The device has not
  748 *	had its reference counter increased so the caller must be careful
  749 *	about locking. The caller must hold RCU lock.
  750 */
  751
  752struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  753{
  754	struct net_device *dev;
  755	struct hlist_head *head = dev_index_hash(net, ifindex);
  756
  757	hlist_for_each_entry_rcu(dev, head, index_hlist)
  758		if (dev->ifindex == ifindex)
  759			return dev;
  760
  761	return NULL;
  762}
  763EXPORT_SYMBOL(dev_get_by_index_rcu);
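
/*
 * Editorial illustration, not part of the original file: the RCU-protected
 * lookup pattern for dev_get_by_index_rcu().  example_lookup_by_index() is
 * a hypothetical caller.
 */
#if 0	/* example only */
static void example_lookup_by_index(int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(&init_net, ifindex);
	if (dev)
		pr_info("ifindex %d is %s\n", ifindex, dev->name);
	rcu_read_unlock();
	/* dev may not be used past this point unless dev_hold() was taken. */
}
#endif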
  764
  765
  766/**
  767 *	dev_get_by_index - find a device by its ifindex
  768 *	@net: the applicable net namespace
  769 *	@ifindex: index of device
  770 *
  771 *	Search for an interface by index. Returns NULL if the device
  772 *	is not found or a pointer to the device. The device returned has
  773 *	had a reference added and the pointer is safe until the user calls
  774 *	dev_put to indicate they have finished with it.
  775 */
  776
  777struct net_device *dev_get_by_index(struct net *net, int ifindex)
  778{
  779	struct net_device *dev;
  780
  781	rcu_read_lock();
  782	dev = dev_get_by_index_rcu(net, ifindex);
  783	if (dev)
  784		dev_hold(dev);
  785	rcu_read_unlock();
  786	return dev;
  787}
  788EXPORT_SYMBOL(dev_get_by_index);
  789
  790/**
  791 *	dev_get_by_napi_id - find a device by napi_id
  792 *	@napi_id: ID of the NAPI struct
  793 *
  794 *	Search for an interface by NAPI ID. Returns %NULL if the device
  795 *	is not found or a pointer to the device. The device has not had
  796 *	its reference counter increased so the caller must be careful
  797 *	about locking. The caller must hold RCU lock.
  798 */
  799
  800struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  801{
  802	struct napi_struct *napi;
  803
  804	WARN_ON_ONCE(!rcu_read_lock_held());
  805
  806	if (napi_id < MIN_NAPI_ID)
  807		return NULL;
  808
  809	napi = napi_by_id(napi_id);
  810
  811	return napi ? napi->dev : NULL;
  812}
  813EXPORT_SYMBOL(dev_get_by_napi_id);
  814
  815/**
  816 *	netdev_get_name - get a netdevice name, knowing its ifindex.
  817 *	@net: network namespace
  818 *	@name: a pointer to the buffer where the name will be stored.
  819 *	@ifindex: the ifindex of the interface to get the name from.
  820 *
  821 *	The use of raw_seqcount_begin() and cond_resched() before
  822 *	retrying is required as we want to give the writers a chance
  823 *	to complete when CONFIG_PREEMPT is not set.
  824 */
  825int netdev_get_name(struct net *net, char *name, int ifindex)
  826{
  827	struct net_device *dev;
  828	unsigned int seq;
  829
  830retry:
  831	seq = raw_seqcount_begin(&devnet_rename_seq);
  832	rcu_read_lock();
  833	dev = dev_get_by_index_rcu(net, ifindex);
  834	if (!dev) {
  835		rcu_read_unlock();
  836		return -ENODEV;
  837	}
  838
  839	strcpy(name, dev->name);
  840	rcu_read_unlock();
  841	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
  842		cond_resched();
  843		goto retry;
  844	}
  845
  846	return 0;
  847}
  848
  849/**
  850 *	dev_getbyhwaddr_rcu - find a device by its hardware address
  851 *	@net: the applicable net namespace
  852 *	@type: media type of device
  853 *	@ha: hardware address
  854 *
  855 *	Search for an interface by MAC address. Returns NULL if the device
  856 *	is not found or a pointer to the device.
  857 *	The caller must hold RCU or RTNL.
  858 *	The returned device has not had its ref count increased
  859 *	and the caller must therefore be careful about locking
  860 *
  861 */
  862
  863struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
  864				       const char *ha)
  865{
  866	struct net_device *dev;
  867
  868	for_each_netdev_rcu(net, dev)
  869		if (dev->type == type &&
  870		    !memcmp(dev->dev_addr, ha, dev->addr_len))
  871			return dev;
  872
  873	return NULL;
  874}
  875EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
  876
  877struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
  878{
  879	struct net_device *dev;
  880
  881	ASSERT_RTNL();
  882	for_each_netdev(net, dev)
  883		if (dev->type == type)
  884			return dev;
  885
  886	return NULL;
  887}
  888EXPORT_SYMBOL(__dev_getfirstbyhwtype);
  889
  890struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
  891{
  892	struct net_device *dev, *ret = NULL;
  893
  894	rcu_read_lock();
  895	for_each_netdev_rcu(net, dev)
  896		if (dev->type == type) {
  897			dev_hold(dev);
  898			ret = dev;
  899			break;
  900		}
  901	rcu_read_unlock();
  902	return ret;
  903}
  904EXPORT_SYMBOL(dev_getfirstbyhwtype);
  905
  906/**
  907 *	__dev_get_by_flags - find any device with given flags
  908 *	@net: the applicable net namespace
  909 *	@if_flags: IFF_* values
  910 *	@mask: bitmask of bits in if_flags to check
  911 *
  912 *	Search for any interface with the given flags. Returns NULL if a device
  913 *	is not found or a pointer to the device. Must be called inside
  914 *	rtnl_lock(), and result refcount is unchanged.
  915 */
  916
  917struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
  918				      unsigned short mask)
  919{
  920	struct net_device *dev, *ret;
  921
  922	ASSERT_RTNL();
  923
  924	ret = NULL;
  925	for_each_netdev(net, dev) {
  926		if (((dev->flags ^ if_flags) & mask) == 0) {
  927			ret = dev;
  928			break;
  929		}
  930	}
  931	return ret;
  932}
  933EXPORT_SYMBOL(__dev_get_by_flags);
  934
  935/**
  936 *	dev_valid_name - check if name is okay for network device
  937 *	@name: name string
  938 *
  939 *	Network device names need to be valid file names to
   940 *	allow sysfs to work.  We also disallow any kind of
  941 *	whitespace.
  942 */
  943bool dev_valid_name(const char *name)
  944{
  945	if (*name == '\0')
  946		return false;
  947	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
  948		return false;
  949	if (!strcmp(name, ".") || !strcmp(name, ".."))
  950		return false;
  951
  952	while (*name) {
  953		if (*name == '/' || *name == ':' || isspace(*name))
  954			return false;
  955		name++;
  956	}
  957	return true;
  958}
  959EXPORT_SYMBOL(dev_valid_name);
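
/*
 * Editorial illustration, not part of the original file: per the checks
 * above, names like "eth0" or "br-lan" are accepted, while "" (empty),
 * ".", "..", "eth 0" (whitespace), "a/b" ('/'), "a:b" (':') and any name
 * of IFNAMSIZ (16) characters or more are rejected.
 */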
  960
  961/**
  962 *	__dev_alloc_name - allocate a name for a device
  963 *	@net: network namespace to allocate the device name in
  964 *	@name: name format string
  965 *	@buf:  scratch buffer and result name string
  966 *
  967 *	Passed a format string - eg "lt%d" it will try and find a suitable
  968 *	id. It scans list of devices to build up a free map, then chooses
  969 *	the first empty slot. The caller must hold the dev_base or rtnl lock
  970 *	while allocating the name and adding the device in order to avoid
  971 *	duplicates.
  972 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
  973 *	Returns the number of the unit assigned or a negative errno code.
  974 */
  975
  976static int __dev_alloc_name(struct net *net, const char *name, char *buf)
  977{
  978	int i = 0;
  979	const char *p;
  980	const int max_netdevices = 8*PAGE_SIZE;
  981	unsigned long *inuse;
  982	struct net_device *d;
  983
  984	if (!dev_valid_name(name))
  985		return -EINVAL;
  986
  987	p = strchr(name, '%');
  988	if (p) {
  989		/*
  990		 * Verify the string as this thing may have come from
  991		 * the user.  There must be either one "%d" and no other "%"
  992		 * characters.
  993		 */
  994		if (p[1] != 'd' || strchr(p + 2, '%'))
  995			return -EINVAL;
  996
  997		/* Use one page as a bit array of possible slots */
  998		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
  999		if (!inuse)
 1000			return -ENOMEM;
 1001
 1002		for_each_netdev(net, d) {
 1003			if (!sscanf(d->name, name, &i))
 1004				continue;
 1005			if (i < 0 || i >= max_netdevices)
 1006				continue;
 1007
 1008			/*  avoid cases where sscanf is not exact inverse of printf */
 1009			snprintf(buf, IFNAMSIZ, name, i);
 1010			if (!strncmp(buf, d->name, IFNAMSIZ))
 1011				set_bit(i, inuse);
 1012		}
 1013
 1014		i = find_first_zero_bit(inuse, max_netdevices);
 1015		free_page((unsigned long) inuse);
 1016	}
 1017
 1018	snprintf(buf, IFNAMSIZ, name, i);
 1019	if (!__dev_get_by_name(net, buf))
 1020		return i;
 1021
 1022	/* It is possible to run out of possible slots
 1023	 * when the name is long and there isn't enough space left
 1024	 * for the digits, or if all bits are used.
 1025	 */
 1026	return -ENFILE;
 1027}
 1028
 1029static int dev_alloc_name_ns(struct net *net,
 1030			     struct net_device *dev,
 1031			     const char *name)
 1032{
 1033	char buf[IFNAMSIZ];
 1034	int ret;
 1035
 1036	BUG_ON(!net);
 1037	ret = __dev_alloc_name(net, name, buf);
 1038	if (ret >= 0)
 1039		strlcpy(dev->name, buf, IFNAMSIZ);
 1040	return ret;
 1041}
 1042
 1043/**
 1044 *	dev_alloc_name - allocate a name for a device
 1045 *	@dev: device
 1046 *	@name: name format string
 1047 *
 1048 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1049 *	id. It scans list of devices to build up a free map, then chooses
 1050 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1051 *	while allocating the name and adding the device in order to avoid
 1052 *	duplicates.
 1053 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1054 *	Returns the number of the unit assigned or a negative errno code.
 1055 */
 1056
 1057int dev_alloc_name(struct net_device *dev, const char *name)
 1058{
 1059	return dev_alloc_name_ns(dev_net(dev), dev, name);
 1060}
 1061EXPORT_SYMBOL(dev_alloc_name);
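
/*
 * Editorial illustration, not part of the original file: a sketch of a
 * driver asking for the next free unit, typically under RTNL.  The format
 * string "mydev%d" and example_pick_name() are hypothetical.
 */
#if 0	/* example only */
static int example_pick_name(struct net_device *dev)
{
	int unit = dev_alloc_name(dev, "mydev%d");

	if (unit < 0)
		return unit;	/* -EINVAL, -ENFILE or -ENOMEM */
	/* dev->name now reads e.g. "mydev0"; unit is the number chosen. */
	return 0;
}
#endif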
 1062
 1063int dev_get_valid_name(struct net *net, struct net_device *dev,
 1064		       const char *name)
 1065{
 1066	BUG_ON(!net);
 1067
 1068	if (!dev_valid_name(name))
 1069		return -EINVAL;
 1070
 1071	if (strchr(name, '%'))
 1072		return dev_alloc_name_ns(net, dev, name);
 1073	else if (__dev_get_by_name(net, name))
 1074		return -EEXIST;
 1075	else if (dev->name != name)
 1076		strlcpy(dev->name, name, IFNAMSIZ);
 1077
 1078	return 0;
 1079}
 1080EXPORT_SYMBOL(dev_get_valid_name);
 1081
 1082/**
 1083 *	dev_change_name - change name of a device
 1084 *	@dev: device
 1085 *	@newname: name (or format string) must be at least IFNAMSIZ
 1086 *
  1087 *	Change name of a device, can pass format strings "eth%d"
 1088 *	for wildcarding.
 1089 */
 1090int dev_change_name(struct net_device *dev, const char *newname)
 1091{
 1092	unsigned char old_assign_type;
 1093	char oldname[IFNAMSIZ];
 1094	int err = 0;
 1095	int ret;
 1096	struct net *net;
 1097
 1098	ASSERT_RTNL();
 1099	BUG_ON(!dev_net(dev));
 1100
 1101	net = dev_net(dev);
 1102
 1103	/* Some auto-enslaved devices e.g. failover slaves are
 1104	 * special, as userspace might rename the device after
 1105	 * the interface had been brought up and running since
 1106	 * the point kernel initiated auto-enslavement. Allow
 1107	 * live name change even when these slave devices are
 1108	 * up and running.
 1109	 *
 1110	 * Typically, users of these auto-enslaving devices
 1111	 * don't actually care about slave name change, as
 1112	 * they are supposed to operate on master interface
 1113	 * directly.
 1114	 */
 1115	if (dev->flags & IFF_UP &&
 1116	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
 1117		return -EBUSY;
 1118
 1119	write_seqcount_begin(&devnet_rename_seq);
 1120
 1121	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1122		write_seqcount_end(&devnet_rename_seq);
 1123		return 0;
 1124	}
 1125
 1126	memcpy(oldname, dev->name, IFNAMSIZ);
 1127
 1128	err = dev_get_valid_name(net, dev, newname);
 1129	if (err < 0) {
 1130		write_seqcount_end(&devnet_rename_seq);
 1131		return err;
 1132	}
 1133
 1134	if (oldname[0] && !strchr(oldname, '%'))
 1135		netdev_info(dev, "renamed from %s\n", oldname);
 1136
 1137	old_assign_type = dev->name_assign_type;
 1138	dev->name_assign_type = NET_NAME_RENAMED;
 1139
 1140rollback:
 1141	ret = device_rename(&dev->dev, dev->name);
 1142	if (ret) {
 1143		memcpy(dev->name, oldname, IFNAMSIZ);
 1144		dev->name_assign_type = old_assign_type;
 1145		write_seqcount_end(&devnet_rename_seq);
 1146		return ret;
 1147	}
 1148
 1149	write_seqcount_end(&devnet_rename_seq);
 1150
 1151	netdev_adjacent_rename_links(dev, oldname);
 1152
 1153	write_lock_bh(&dev_base_lock);
 1154	hlist_del_rcu(&dev->name_hlist);
 1155	write_unlock_bh(&dev_base_lock);
 1156
 1157	synchronize_rcu();
 1158
 1159	write_lock_bh(&dev_base_lock);
 1160	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 1161	write_unlock_bh(&dev_base_lock);
 1162
 1163	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1164	ret = notifier_to_errno(ret);
 1165
 1166	if (ret) {
 1167		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1168		if (err >= 0) {
 1169			err = ret;
 1170			write_seqcount_begin(&devnet_rename_seq);
 1171			memcpy(dev->name, oldname, IFNAMSIZ);
 1172			memcpy(oldname, newname, IFNAMSIZ);
 1173			dev->name_assign_type = old_assign_type;
 1174			old_assign_type = NET_NAME_RENAMED;
 1175			goto rollback;
 1176		} else {
 1177			pr_err("%s: name change rollback failed: %d\n",
 1178			       dev->name, ret);
 1179		}
 1180	}
 1181
 1182	return err;
 1183}
 1184
 1185/**
 1186 *	dev_set_alias - change ifalias of a device
 1187 *	@dev: device
 1188 *	@alias: name up to IFALIASZ
 1189 *	@len: limit of bytes to copy from info
 1190 *
  1191 *	Set ifalias for a device.
 1192 */
 1193int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1194{
 1195	struct dev_ifalias *new_alias = NULL;
 1196
 1197	if (len >= IFALIASZ)
 1198		return -EINVAL;
 1199
 1200	if (len) {
 1201		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1202		if (!new_alias)
 1203			return -ENOMEM;
 1204
 1205		memcpy(new_alias->ifalias, alias, len);
 1206		new_alias->ifalias[len] = 0;
 1207	}
 1208
 1209	mutex_lock(&ifalias_mutex);
 1210	rcu_swap_protected(dev->ifalias, new_alias,
 1211			   mutex_is_locked(&ifalias_mutex));
 1212	mutex_unlock(&ifalias_mutex);
 1213
 1214	if (new_alias)
 1215		kfree_rcu(new_alias, rcuhead);
 1216
 1217	return len;
 1218}
 1219EXPORT_SYMBOL(dev_set_alias);
 1220
 1221/**
 1222 *	dev_get_alias - get ifalias of a device
 1223 *	@dev: device
 1224 *	@name: buffer to store name of ifalias
 1225 *	@len: size of buffer
 1226 *
 1227 *	get ifalias for a device.  Caller must make sure dev cannot go
 1228 *	away,  e.g. rcu read lock or own a reference count to device.
 1229 */
 1230int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1231{
 1232	const struct dev_ifalias *alias;
 1233	int ret = 0;
 1234
 1235	rcu_read_lock();
 1236	alias = rcu_dereference(dev->ifalias);
 1237	if (alias)
 1238		ret = snprintf(name, len, "%s", alias->ifalias);
 1239	rcu_read_unlock();
 1240
 1241	return ret;
 1242}
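
/*
 * Editorial illustration, not part of the original file: setting and then
 * reading back an ifalias.  example_alias() and the description string are
 * hypothetical; the caller is assumed to hold a reference on @dev and to
 * be on a path where sleeping is allowed (dev_set_alias() allocates with
 * GFP_KERNEL).
 */
#if 0	/* example only */
static void example_alias(struct net_device *dev)
{
	static const char desc[] = "uplink to core switch";
	char buf[IFALIASZ];

	if (dev_set_alias(dev, desc, strlen(desc)) < 0)
		return;

	if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
		pr_info("%s: alias \"%s\"\n", dev->name, buf);
}
#endif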
 1243
 1244/**
 1245 *	netdev_features_change - device changes features
 1246 *	@dev: device to cause notification
 1247 *
 1248 *	Called to indicate a device has changed features.
 1249 */
 1250void netdev_features_change(struct net_device *dev)
 1251{
 1252	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1253}
 1254EXPORT_SYMBOL(netdev_features_change);
 1255
 1256/**
 1257 *	netdev_state_change - device changes state
 1258 *	@dev: device to cause notification
 1259 *
 1260 *	Called to indicate a device has changed state. This function calls
 1261 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1262 *	to the routing socket.
 1263 */
 1264void netdev_state_change(struct net_device *dev)
 1265{
 1266	if (dev->flags & IFF_UP) {
 1267		struct netdev_notifier_change_info change_info = {
 1268			.info.dev = dev,
 1269		};
 1270
 1271		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1272					      &change_info.info);
 1273		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 1274	}
 1275}
 1276EXPORT_SYMBOL(netdev_state_change);
 1277
 1278/**
 1279 * netdev_notify_peers - notify network peers about existence of @dev
 1280 * @dev: network device
 1281 *
 1282 * Generate traffic such that interested network peers are aware of
 1283 * @dev, such as by generating a gratuitous ARP. This may be used when
 1284 * a device wants to inform the rest of the network about some sort of
 1285 * reconfiguration such as a failover event or virtual machine
 1286 * migration.
 1287 */
 1288void netdev_notify_peers(struct net_device *dev)
 1289{
 1290	rtnl_lock();
 1291	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1292	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1293	rtnl_unlock();
 1294}
 1295EXPORT_SYMBOL(netdev_notify_peers);
 1296
 1297static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1298{
 1299	const struct net_device_ops *ops = dev->netdev_ops;
 1300	int ret;
 1301
 1302	ASSERT_RTNL();
 1303
 1304	if (!netif_device_present(dev))
 1305		return -ENODEV;
 1306
 1307	/* Block netpoll from trying to do any rx path servicing.
 1308	 * If we don't do this there is a chance ndo_poll_controller
 1309	 * or ndo_poll may be running while we open the device
 1310	 */
 1311	netpoll_poll_disable(dev);
 1312
 1313	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1314	ret = notifier_to_errno(ret);
 1315	if (ret)
 1316		return ret;
 1317
 1318	set_bit(__LINK_STATE_START, &dev->state);
 1319
 1320	if (ops->ndo_validate_addr)
 1321		ret = ops->ndo_validate_addr(dev);
 1322
 1323	if (!ret && ops->ndo_open)
 1324		ret = ops->ndo_open(dev);
 1325
 1326	netpoll_poll_enable(dev);
 1327
 1328	if (ret)
 1329		clear_bit(__LINK_STATE_START, &dev->state);
 1330	else {
 1331		dev->flags |= IFF_UP;
 1332		dev_set_rx_mode(dev);
 1333		dev_activate(dev);
 1334		add_device_randomness(dev->dev_addr, dev->addr_len);
 1335	}
 1336
 1337	return ret;
 1338}
 1339
 1340/**
 1341 *	dev_open	- prepare an interface for use.
 1342 *	@dev: device to open
 1343 *	@extack: netlink extended ack
 1344 *
 1345 *	Takes a device from down to up state. The device's private open
 1346 *	function is invoked and then the multicast lists are loaded. Finally
 1347 *	the device is moved into the up state and a %NETDEV_UP message is
 1348 *	sent to the netdev notifier chain.
 1349 *
 1350 *	Calling this function on an active interface is a nop. On a failure
 1351 *	a negative errno code is returned.
 1352 */
 1353int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1354{
 1355	int ret;
 1356
 1357	if (dev->flags & IFF_UP)
 1358		return 0;
 1359
 1360	ret = __dev_open(dev, extack);
 1361	if (ret < 0)
 1362		return ret;
 1363
 1364	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1365	call_netdevice_notifiers(NETDEV_UP, dev);
 1366
 1367	return ret;
 1368}
 1369EXPORT_SYMBOL(dev_open);
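
/*
 * Editorial illustration, not part of the original file: bringing an
 * interface up from kernel code.  example_bring_up() is hypothetical;
 * RTNL must be held across dev_open(), and a NULL extack simply skips
 * extended error reporting.
 */
#if 0	/* example only */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev, NULL);
	rtnl_unlock();

	return err;	/* 0 if it came up (or was already IFF_UP) */
}
#endif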
 1370
 1371static void __dev_close_many(struct list_head *head)
 1372{
 1373	struct net_device *dev;
 1374
 1375	ASSERT_RTNL();
 1376	might_sleep();
 1377
 1378	list_for_each_entry(dev, head, close_list) {
 1379		/* Temporarily disable netpoll until the interface is down */
 1380		netpoll_poll_disable(dev);
 1381
 1382		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1383
 1384		clear_bit(__LINK_STATE_START, &dev->state);
 1385
 1386		/* Synchronize to scheduled poll. We cannot touch poll list, it
 1387		 * can be even on different cpu. So just clear netif_running().
 1388		 *
  1389		 * dev->stop() will invoke napi_disable() on all of its
 1390		 * napi_struct instances on this device.
 1391		 */
 1392		smp_mb__after_atomic(); /* Commit netif_running(). */
 1393	}
 1394
 1395	dev_deactivate_many(head);
 1396
 1397	list_for_each_entry(dev, head, close_list) {
 1398		const struct net_device_ops *ops = dev->netdev_ops;
 1399
 1400		/*
 1401		 *	Call the device specific close. This cannot fail.
 1402		 *	Only if device is UP
 1403		 *
 1404		 *	We allow it to be called even after a DETACH hot-plug
 1405		 *	event.
 1406		 */
 1407		if (ops->ndo_stop)
 1408			ops->ndo_stop(dev);
 1409
 1410		dev->flags &= ~IFF_UP;
 1411		netpoll_poll_enable(dev);
 1412	}
 1413}
 1414
 1415static void __dev_close(struct net_device *dev)
 1416{
 1417	LIST_HEAD(single);
 1418
 1419	list_add(&dev->close_list, &single);
 1420	__dev_close_many(&single);
 1421	list_del(&single);
 1422}
 1423
 1424void dev_close_many(struct list_head *head, bool unlink)
 1425{
 1426	struct net_device *dev, *tmp;
 1427
 1428	/* Remove the devices that don't need to be closed */
 1429	list_for_each_entry_safe(dev, tmp, head, close_list)
 1430		if (!(dev->flags & IFF_UP))
 1431			list_del_init(&dev->close_list);
 1432
 1433	__dev_close_many(head);
 1434
 1435	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1436		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1437		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1438		if (unlink)
 1439			list_del_init(&dev->close_list);
 1440	}
 1441}
 1442EXPORT_SYMBOL(dev_close_many);
 1443
 1444/**
 1445 *	dev_close - shutdown an interface.
 1446 *	@dev: device to shutdown
 1447 *
 1448 *	This function moves an active device into down state. A
 1449 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1450 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1451 *	chain.
 1452 */
 1453void dev_close(struct net_device *dev)
 1454{
 1455	if (dev->flags & IFF_UP) {
 1456		LIST_HEAD(single);
 1457
 1458		list_add(&dev->close_list, &single);
 1459		dev_close_many(&single, true);
 1460		list_del(&single);
 1461	}
 1462}
 1463EXPORT_SYMBOL(dev_close);
 1464
 1465
 1466/**
 1467 *	dev_disable_lro - disable Large Receive Offload on a device
 1468 *	@dev: device
 1469 *
 1470 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1471 *	called under RTNL.  This is needed if received packets may be
 1472 *	forwarded to another interface.
 1473 */
 1474void dev_disable_lro(struct net_device *dev)
 1475{
 1476	struct net_device *lower_dev;
 1477	struct list_head *iter;
 1478
 1479	dev->wanted_features &= ~NETIF_F_LRO;
 1480	netdev_update_features(dev);
 1481
 1482	if (unlikely(dev->features & NETIF_F_LRO))
 1483		netdev_WARN(dev, "failed to disable LRO!\n");
 1484
 1485	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1486		dev_disable_lro(lower_dev);
 1487}
 1488EXPORT_SYMBOL(dev_disable_lro);
 1489
 1490/**
 1491 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1492 *	@dev: device
 1493 *
 1494 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1495 *	called under RTNL.  This is needed if Generic XDP is installed on
 1496 *	the device.
 1497 */
 1498static void dev_disable_gro_hw(struct net_device *dev)
 1499{
 1500	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1501	netdev_update_features(dev);
 1502
 1503	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1504		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1505}
 1506
 1507const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1508{
 1509#define N(val) 						\
 1510	case NETDEV_##val:				\
 1511		return "NETDEV_" __stringify(val);
 1512	switch (cmd) {
 1513	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1514	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1515	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1516	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
 1517	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 1518	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 1519	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1520	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1521	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1522	N(PRE_CHANGEADDR)
 1523	}
 1524#undef N
 1525	return "UNKNOWN_NETDEV_EVENT";
 1526}
 1527EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1528
 1529static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1530				   struct net_device *dev)
 1531{
 1532	struct netdev_notifier_info info = {
 1533		.dev = dev,
 1534	};
 1535
 1536	return nb->notifier_call(nb, val, &info);
 1537}
 1538
 1539static int dev_boot_phase = 1;
 1540
 1541/**
 1542 * register_netdevice_notifier - register a network notifier block
 1543 * @nb: notifier
 1544 *
 1545 * Register a notifier to be called when network device events occur.
 1546 * The notifier passed is linked into the kernel structures and must
 1547 * not be reused until it has been unregistered. A negative errno code
 1548 * is returned on a failure.
 1549 *
 1550 * When registered all registration and up events are replayed
  1551 * to the new notifier to allow it to have a race-free
 1552 * view of the network device list.
 1553 */
 1554
 1555int register_netdevice_notifier(struct notifier_block *nb)
 1556{
 1557	struct net_device *dev;
 1558	struct net_device *last;
 1559	struct net *net;
 1560	int err;
 1561
 1562	/* Close race with setup_net() and cleanup_net() */
 1563	down_write(&pernet_ops_rwsem);
 1564	rtnl_lock();
 1565	err = raw_notifier_chain_register(&netdev_chain, nb);
 1566	if (err)
 1567		goto unlock;
 1568	if (dev_boot_phase)
 1569		goto unlock;
 1570	for_each_net(net) {
 1571		for_each_netdev(net, dev) {
 1572			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1573			err = notifier_to_errno(err);
 1574			if (err)
 1575				goto rollback;
 1576
 1577			if (!(dev->flags & IFF_UP))
 1578				continue;
 1579
 1580			call_netdevice_notifier(nb, NETDEV_UP, dev);
 1581		}
 1582	}
 1583
 1584unlock:
 1585	rtnl_unlock();
 1586	up_write(&pernet_ops_rwsem);
 1587	return err;
 1588
 1589rollback:
 1590	last = dev;
 1591	for_each_net(net) {
 1592		for_each_netdev(net, dev) {
 1593			if (dev == last)
 1594				goto outroll;
 1595
 1596			if (dev->flags & IFF_UP) {
 1597				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1598							dev);
 1599				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1600			}
 1601			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1602		}
 1603	}
 1604
 1605outroll:
 1606	raw_notifier_chain_unregister(&netdev_chain, nb);
 1607	goto unlock;
 1608}
 1609EXPORT_SYMBOL(register_netdevice_notifier);
 1610
 1611/**
 1612 * unregister_netdevice_notifier - unregister a network notifier block
 1613 * @nb: notifier
 1614 *
 1615 * Unregister a notifier previously registered by
  1616 * register_netdevice_notifier(). The notifier is unlinked from the
 1617 * kernel structures and may then be reused. A negative errno code
 1618 * is returned on a failure.
 1619 *
 1620 * After unregistering unregister and down device events are synthesized
 1621 * for all devices on the device list to the removed notifier to remove
 1622 * the need for special case cleanup code.
 1623 */
 1624
 1625int unregister_netdevice_notifier(struct notifier_block *nb)
 1626{
 1627	struct net_device *dev;
 1628	struct net *net;
 1629	int err;
 1630
 1631	/* Close race with setup_net() and cleanup_net() */
 1632	down_write(&pernet_ops_rwsem);
 1633	rtnl_lock();
 1634	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1635	if (err)
 1636		goto unlock;
 1637
 1638	for_each_net(net) {
 1639		for_each_netdev(net, dev) {
 1640			if (dev->flags & IFF_UP) {
 1641				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1642							dev);
 1643				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1644			}
 1645			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1646		}
 1647	}
 1648unlock:
 1649	rtnl_unlock();
 1650	up_write(&pernet_ops_rwsem);
 1651	return err;
 1652}
 1653EXPORT_SYMBOL(unregister_netdevice_notifier);
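
/*
 * Editorial illustration, not part of the original file: a sketch of a
 * netdevice notifier as registered above.  my_netdev_event() and
 * my_netdev_nb are hypothetical; netdev_notifier_info_to_dev() recovers
 * the device from the opaque pointer.
 */
#if 0	/* example only */
static int my_netdev_event(struct notifier_block *nb, unsigned long event,
			   void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	pr_debug("%s: %s\n", dev->name, netdev_cmd_to_name(event));

	switch (event) {
	case NETDEV_UP:
		/* interface went up: start tracking it */
		break;
	case NETDEV_UNREGISTER:
		/* interface is going away: drop any state referring to it */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_netdev_nb = {
	.notifier_call = my_netdev_event,
};

static int __init my_notifier_init(void)
{
	/* Replays NETDEV_REGISTER/NETDEV_UP for already-present devices. */
	return register_netdevice_notifier(&my_netdev_nb);
}

static void __exit my_notifier_exit(void)
{
	unregister_netdevice_notifier(&my_netdev_nb);
}
#endif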
 1654
 1655/**
 1656 *	call_netdevice_notifiers_info - call all network notifier blocks
 1657 *	@val: value passed unmodified to notifier function
 1658 *	@info: notifier information data
 1659 *
 1660 *	Call all network notifier blocks.  Parameters and return value
 1661 *	are as for raw_notifier_call_chain().
 1662 */
 1663
 1664static int call_netdevice_notifiers_info(unsigned long val,
 1665					 struct netdev_notifier_info *info)
 1666{
 1667	ASSERT_RTNL();
 1668	return raw_notifier_call_chain(&netdev_chain, val, info);
 1669}
 1670
 1671static int call_netdevice_notifiers_extack(unsigned long val,
 1672					   struct net_device *dev,
 1673					   struct netlink_ext_ack *extack)
 1674{
 1675	struct netdev_notifier_info info = {
 1676		.dev = dev,
 1677		.extack = extack,
 1678	};
 1679
 1680	return call_netdevice_notifiers_info(val, &info);
 1681}
 1682
 1683/**
 1684 *	call_netdevice_notifiers - call all network notifier blocks
 1685 *      @val: value passed unmodified to notifier function
 1686 *      @dev: net_device pointer passed unmodified to notifier function
 1687 *
 1688 *	Call all network notifier blocks.  Parameters and return value
 1689 *	are as for raw_notifier_call_chain().
 1690 */
 1691
 1692int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 1693{
 1694	return call_netdevice_notifiers_extack(val, dev, NULL);
 1695}
 1696EXPORT_SYMBOL(call_netdevice_notifiers);
 1697
 1698/**
 1699 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 1700 *	@val: value passed unmodified to notifier function
 1701 *	@dev: net_device pointer passed unmodified to notifier function
 1702 *	@arg: additional u32 argument passed to the notifier function
 1703 *
 1704 *	Call all network notifier blocks.  Parameters and return value
 1705 *	are as for raw_notifier_call_chain().
 1706 */
 1707static int call_netdevice_notifiers_mtu(unsigned long val,
 1708					struct net_device *dev, u32 arg)
 1709{
 1710	struct netdev_notifier_info_ext info = {
 1711		.info.dev = dev,
 1712		.ext.mtu = arg,
 1713	};
 1714
 1715	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 1716
 1717	return call_netdevice_notifiers_info(val, &info.info);
 1718}
 1719
 1720#ifdef CONFIG_NET_INGRESS
 1721static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 1722
 1723void net_inc_ingress_queue(void)
 1724{
 1725	static_branch_inc(&ingress_needed_key);
 1726}
 1727EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 1728
 1729void net_dec_ingress_queue(void)
 1730{
 1731	static_branch_dec(&ingress_needed_key);
 1732}
 1733EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 1734#endif
 1735
 1736#ifdef CONFIG_NET_EGRESS
 1737static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 1738
 1739void net_inc_egress_queue(void)
 1740{
 1741	static_branch_inc(&egress_needed_key);
 1742}
 1743EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 1744
 1745void net_dec_egress_queue(void)
 1746{
 1747	static_branch_dec(&egress_needed_key);
 1748}
 1749EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 1750#endif
 1751
 1752static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 1753#ifdef CONFIG_JUMP_LABEL
 1754static atomic_t netstamp_needed_deferred;
 1755static atomic_t netstamp_wanted;
 1756static void netstamp_clear(struct work_struct *work)
 1757{
 1758	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 1759	int wanted;
 1760
 1761	wanted = atomic_add_return(deferred, &netstamp_wanted);
 1762	if (wanted > 0)
 1763		static_branch_enable(&netstamp_needed_key);
 1764	else
 1765		static_branch_disable(&netstamp_needed_key);
 1766}
 1767static DECLARE_WORK(netstamp_work, netstamp_clear);
 1768#endif
 1769
 1770void net_enable_timestamp(void)
 1771{
 1772#ifdef CONFIG_JUMP_LABEL
 1773	int wanted;
 1774
 1775	while (1) {
 1776		wanted = atomic_read(&netstamp_wanted);
 1777		if (wanted <= 0)
 1778			break;
 1779		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
 1780			return;
 1781	}
 1782	atomic_inc(&netstamp_needed_deferred);
 1783	schedule_work(&netstamp_work);
 1784#else
 1785	static_branch_inc(&netstamp_needed_key);
 1786#endif
 1787}
 1788EXPORT_SYMBOL(net_enable_timestamp);
 1789
 1790void net_disable_timestamp(void)
 1791{
 1792#ifdef CONFIG_JUMP_LABEL
 1793	int wanted;
 1794
 1795	while (1) {
 1796		wanted = atomic_read(&netstamp_wanted);
 1797		if (wanted <= 1)
 1798			break;
 1799		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
 1800			return;
 1801	}
 1802	atomic_dec(&netstamp_needed_deferred);
 1803	schedule_work(&netstamp_work);
 1804#else
 1805	static_branch_dec(&netstamp_needed_key);
 1806#endif
 1807}
 1808EXPORT_SYMBOL(net_disable_timestamp);
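
/*
 * Editor's note -- illustrative sketch, not part of dev.c: the expected
 * pairing of net_enable_timestamp()/net_disable_timestamp(). A feature
 * that wants software RX timestamps holds a reference while it is
 * active so that net_timestamp_set()/net_timestamp_check() below really
 * stamp skbs; "example_capture_active" is an assumption for the sketch.
 */
static bool example_capture_active;

static void example_start_capture(void)
{
	if (!example_capture_active) {
		example_capture_active = true;
		net_enable_timestamp();		/* must be balanced below */
	}
}

static void example_stop_capture(void)
{
	if (example_capture_active) {
		example_capture_active = false;
		net_disable_timestamp();
	}
}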
 1809
 1810static inline void net_timestamp_set(struct sk_buff *skb)
 1811{
 1812	skb->tstamp = 0;
 1813	if (static_branch_unlikely(&netstamp_needed_key))
 1814		__net_timestamp(skb);
 1815}
 1816
 1817#define net_timestamp_check(COND, SKB)				\
 1818	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 1819		if ((COND) && !(SKB)->tstamp)			\
 1820			__net_timestamp(SKB);			\
 1821	}							\
 1822
 1823bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 1824{
 1825	unsigned int len;
 1826
 1827	if (!(dev->flags & IFF_UP))
 1828		return false;
 1829
 1830	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
 1831	if (skb->len <= len)
 1832		return true;
 1833
 1834	/* if TSO is enabled, we don't care about the length as the packet
 1835	 * could be forwarded without being segmented before
 1836	 */
 1837	if (skb_is_gso(skb))
 1838		return true;
 1839
 1840	return false;
 1841}
 1842EXPORT_SYMBOL_GPL(is_skb_forwardable);
 1843
 1844int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 1845{
 1846	int ret = ____dev_forward_skb(dev, skb);
 1847
 1848	if (likely(!ret)) {
 1849		skb->protocol = eth_type_trans(skb, dev);
 1850		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 1851	}
 1852
 1853	return ret;
 1854}
 1855EXPORT_SYMBOL_GPL(__dev_forward_skb);
 1856
 1857/**
 1858 * dev_forward_skb - loopback an skb to another netif
 1859 *
 1860 * @dev: destination network device
 1861 * @skb: buffer to forward
 1862 *
 1863 * return values:
 1864 *	NET_RX_SUCCESS	(no congestion)
 1865 *	NET_RX_DROP     (packet was dropped, but freed)
 1866 *
 1867 * dev_forward_skb can be used for injecting an skb from the
 1868 * start_xmit function of one device into the receive queue
 1869 * of another device.
 1870 *
 1871 * The receiving device may be in another namespace, so
 1872 * we have to clear all information in the skb that could
 1873 * impact namespace isolation.
 1874 */
 1875int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 1876{
 1877	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 1878}
 1879EXPORT_SYMBOL_GPL(dev_forward_skb);
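
/*
 * Editor's note -- illustrative sketch, not part of dev.c: how a
 * veth-style pair device could use dev_forward_skb() from its
 * ndo_start_xmit to hand frames to its peer's RX path. The
 * example_pair_priv layout and its RCU-protected peer pointer are
 * assumptions made only for this sketch.
 */
struct example_pair_priv {
	struct net_device __rcu *peer;
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct example_pair_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (!peer) {
		rcu_read_unlock();
		kfree_skb(skb);
		return NETDEV_TX_OK;
	}

	/* dev_forward_skb() consumes the skb in both outcomes. */
	if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	rcu_read_unlock();
	return NETDEV_TX_OK;
}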
 1880
 1881static inline int deliver_skb(struct sk_buff *skb,
 1882			      struct packet_type *pt_prev,
 1883			      struct net_device *orig_dev)
 1884{
 1885	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 1886		return -ENOMEM;
 1887	refcount_inc(&skb->users);
 1888	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 1889}
 1890
 1891static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 1892					  struct packet_type **pt,
 1893					  struct net_device *orig_dev,
 1894					  __be16 type,
 1895					  struct list_head *ptype_list)
 1896{
 1897	struct packet_type *ptype, *pt_prev = *pt;
 1898
 1899	list_for_each_entry_rcu(ptype, ptype_list, list) {
 1900		if (ptype->type != type)
 1901			continue;
 1902		if (pt_prev)
 1903			deliver_skb(skb, pt_prev, orig_dev);
 1904		pt_prev = ptype;
 1905	}
 1906	*pt = pt_prev;
 1907}
 1908
 1909static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 1910{
 1911	if (!ptype->af_packet_priv || !skb->sk)
 1912		return false;
 1913
 1914	if (ptype->id_match)
 1915		return ptype->id_match(ptype, skb->sk);
 1916	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 1917		return true;
 1918
 1919	return false;
 1920}
 1921
 1922/**
 1923 * dev_nit_active - return true if any network interface taps are in use
 1924 *
 1925 * @dev: network device to check for the presence of taps
 1926 */
 1927bool dev_nit_active(struct net_device *dev)
 1928{
 1929	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 1930}
 1931EXPORT_SYMBOL_GPL(dev_nit_active);
 1932
 1933/*
 1934 *	Support routine. Sends outgoing frames to any network
 1935 *	taps currently in use.
 1936 */
 1937
 1938void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 1939{
 1940	struct packet_type *ptype;
 1941	struct sk_buff *skb2 = NULL;
 1942	struct packet_type *pt_prev = NULL;
 1943	struct list_head *ptype_list = &ptype_all;
 1944
 1945	rcu_read_lock();
 1946again:
 1947	list_for_each_entry_rcu(ptype, ptype_list, list) {
 1948		if (ptype->ignore_outgoing)
 1949			continue;
 1950
 1951		/* Never send packets back to the socket
 1952		 * they originated from - MvS (miquels@drinkel.ow.org)
 1953		 */
 1954		if (skb_loop_sk(ptype, skb))
 1955			continue;
 1956
 1957		if (pt_prev) {
 1958			deliver_skb(skb2, pt_prev, skb->dev);
 1959			pt_prev = ptype;
 1960			continue;
 1961		}
 1962
 1963		/* need to clone skb, done only once */
 1964		skb2 = skb_clone(skb, GFP_ATOMIC);
 1965		if (!skb2)
 1966			goto out_unlock;
 1967
 1968		net_timestamp_set(skb2);
 1969
 1970		/* skb->nh should be correctly
 1971		 * set by sender, so that the second statement is
 1972		 * just protection against buggy protocols.
 1973		 */
 1974		skb_reset_mac_header(skb2);
 1975
 1976		if (skb_network_header(skb2) < skb2->data ||
 1977		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 1978			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 1979					     ntohs(skb2->protocol),
 1980					     dev->name);
 1981			skb_reset_network_header(skb2);
 1982		}
 1983
 1984		skb2->transport_header = skb2->network_header;
 1985		skb2->pkt_type = PACKET_OUTGOING;
 1986		pt_prev = ptype;
 1987	}
 1988
 1989	if (ptype_list == &ptype_all) {
 1990		ptype_list = &dev->ptype_all;
 1991		goto again;
 1992	}
 1993out_unlock:
 1994	if (pt_prev) {
 1995		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 1996			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 1997		else
 1998			kfree_skb(skb2);
 1999	}
 2000	rcu_read_unlock();
 2001}
 2002EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2003
 2004/**
 2005 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2006 * @dev: Network device
 2007 * @txq: number of queues available
 2008 *
 2009 * If real_num_tx_queues is changed the tc mappings may no longer be
 2010 * valid. To resolve this, verify that each tc mapping remains valid and,
 2011 * if not, NULL the mapping. With no priorities mapping to this
 2012 * offset/count pair it will no longer be used. In the worst case, if
 2013 * TC0 is invalid, nothing can be done, so priority mappings are disabled
 2014 * entirely. It is expected that drivers will fix this mapping if they
 2015 * can before calling netif_set_real_num_tx_queues.
 2016 */
 2017static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2018{
 2019	int i;
 2020	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2021
 2022	/* If TC0 is invalidated disable TC mapping */
 2023	if (tc->offset + tc->count > txq) {
 2024		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2025		dev->num_tc = 0;
 2026		return;
 2027	}
 2028
 2029	/* Invalidated prio to tc mappings set to TC0 */
 2030	for (i = 1; i < TC_BITMASK + 1; i++) {
 2031		int q = netdev_get_prio_tc_map(dev, i);
 2032
 2033		tc = &dev->tc_to_txq[q];
 2034		if (tc->offset + tc->count > txq) {
 2035			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2036				i, q);
 2037			netdev_set_prio_tc_map(dev, i, 0);
 2038		}
 2039	}
 2040}
 2041
 2042int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2043{
 2044	if (dev->num_tc) {
 2045		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2046		int i;
 2047
 2048		/* walk through the TCs and see if it falls into any of them */
 2049		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2050			if ((txq - tc->offset) < tc->count)
 2051				return i;
 2052		}
 2053
 2054		/* didn't find it, just return -1 to indicate no match */
 2055		return -1;
 2056	}
 2057
 2058	return 0;
 2059}
 2060EXPORT_SYMBOL(netdev_txq_to_tc);
 2061
 2062#ifdef CONFIG_XPS
 2063struct static_key xps_needed __read_mostly;
 2064EXPORT_SYMBOL(xps_needed);
 2065struct static_key xps_rxqs_needed __read_mostly;
 2066EXPORT_SYMBOL(xps_rxqs_needed);
 2067static DEFINE_MUTEX(xps_map_mutex);
 2068#define xmap_dereference(P)		\
 2069	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2070
 2071static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2072			     int tci, u16 index)
 2073{
 2074	struct xps_map *map = NULL;
 2075	int pos;
 2076
 2077	if (dev_maps)
 2078		map = xmap_dereference(dev_maps->attr_map[tci]);
 2079	if (!map)
 2080		return false;
 2081
 2082	for (pos = map->len; pos--;) {
 2083		if (map->queues[pos] != index)
 2084			continue;
 2085
 2086		if (map->len > 1) {
 2087			map->queues[pos] = map->queues[--map->len];
 2088			break;
 2089		}
 2090
 2091		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2092		kfree_rcu(map, rcu);
 2093		return false;
 2094	}
 2095
 2096	return true;
 2097}
 2098
 2099static bool remove_xps_queue_cpu(struct net_device *dev,
 2100				 struct xps_dev_maps *dev_maps,
 2101				 int cpu, u16 offset, u16 count)
 2102{
 2103	int num_tc = dev->num_tc ? : 1;
 2104	bool active = false;
 2105	int tci;
 2106
 2107	for (tci = cpu * num_tc; num_tc--; tci++) {
 2108		int i, j;
 2109
 2110		for (i = count, j = offset; i--; j++) {
 2111			if (!remove_xps_queue(dev_maps, tci, j))
 2112				break;
 2113		}
 2114
 2115		active |= i < 0;
 2116	}
 2117
 2118	return active;
 2119}
 2120
 2121static void reset_xps_maps(struct net_device *dev,
 2122			   struct xps_dev_maps *dev_maps,
 2123			   bool is_rxqs_map)
 2124{
 2125	if (is_rxqs_map) {
 2126		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2127		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 2128	} else {
 2129		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 2130	}
 2131	static_key_slow_dec_cpuslocked(&xps_needed);
 2132	kfree_rcu(dev_maps, rcu);
 2133}
 2134
 2135static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 2136			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 2137			   u16 offset, u16 count, bool is_rxqs_map)
 2138{
 2139	bool active = false;
 2140	int i, j;
 2141
 2142	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 2143	     j < nr_ids;)
 2144		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 2145					       count);
 2146	if (!active)
 2147		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2148
 2149	if (!is_rxqs_map) {
 2150		for (i = offset + (count - 1); count--; i--) {
 2151			netdev_queue_numa_node_write(
 2152				netdev_get_tx_queue(dev, i),
 2153				NUMA_NO_NODE);
 2154		}
 2155	}
 2156}
 2157
 2158static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2159				   u16 count)
 2160{
 2161	const unsigned long *possible_mask = NULL;
 2162	struct xps_dev_maps *dev_maps;
 2163	unsigned int nr_ids;
 2164
 2165	if (!static_key_false(&xps_needed))
 2166		return;
 2167
 2168	cpus_read_lock();
 2169	mutex_lock(&xps_map_mutex);
 2170
 2171	if (static_key_false(&xps_rxqs_needed)) {
 2172		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2173		if (dev_maps) {
 2174			nr_ids = dev->num_rx_queues;
 2175			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 2176				       offset, count, true);
 2177		}
 2178	}
 2179
 2180	dev_maps = xmap_dereference(dev->xps_cpus_map);
 2181	if (!dev_maps)
 2182		goto out_no_maps;
 2183
 2184	if (num_possible_cpus() > 1)
 2185		possible_mask = cpumask_bits(cpu_possible_mask);
 2186	nr_ids = nr_cpu_ids;
 2187	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 2188		       false);
 2189
 2190out_no_maps:
 2191	mutex_unlock(&xps_map_mutex);
 2192	cpus_read_unlock();
 2193}
 2194
 2195static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2196{
 2197	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2198}
 2199
 2200static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2201				      u16 index, bool is_rxqs_map)
 2202{
 2203	struct xps_map *new_map;
 2204	int alloc_len = XPS_MIN_MAP_ALLOC;
 2205	int i, pos;
 2206
 2207	for (pos = 0; map && pos < map->len; pos++) {
 2208		if (map->queues[pos] != index)
 2209			continue;
 2210		return map;
 2211	}
 2212
 2213	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2214	if (map) {
 2215		if (pos < map->alloc_len)
 2216			return map;
 2217
 2218		alloc_len = map->alloc_len * 2;
 2219	}
 2220
 2221	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2222	 *  map
 2223	 */
 2224	if (is_rxqs_map)
 2225		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2226	else
 2227		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2228				       cpu_to_node(attr_index));
 2229	if (!new_map)
 2230		return NULL;
 2231
 2232	for (i = 0; i < pos; i++)
 2233		new_map->queues[i] = map->queues[i];
 2234	new_map->alloc_len = alloc_len;
 2235	new_map->len = pos;
 2236
 2237	return new_map;
 2238}
 2239
 2240/* Must be called under cpus_read_lock */
 2241int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 2242			  u16 index, bool is_rxqs_map)
 2243{
 2244	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 2245	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
 2246	int i, j, tci, numa_node_id = -2;
 2247	int maps_sz, num_tc = 1, tc = 0;
 2248	struct xps_map *map, *new_map;
 2249	bool active = false;
 2250	unsigned int nr_ids;
 2251
 2252	if (dev->num_tc) {
 2253		/* Do not allow XPS on subordinate device directly */
 2254		num_tc = dev->num_tc;
 2255		if (num_tc < 0)
 2256			return -EINVAL;
 2257
 2258		/* If queue belongs to subordinate dev use its map */
 2259		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 2260
 2261		tc = netdev_txq_to_tc(dev, index);
 2262		if (tc < 0)
 2263			return -EINVAL;
 2264	}
 2265
 2266	mutex_lock(&xps_map_mutex);
 2267	if (is_rxqs_map) {
 2268		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 2269		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2270		nr_ids = dev->num_rx_queues;
 2271	} else {
 2272		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 2273		if (num_possible_cpus() > 1) {
 2274			online_mask = cpumask_bits(cpu_online_mask);
 2275			possible_mask = cpumask_bits(cpu_possible_mask);
 2276		}
 2277		dev_maps = xmap_dereference(dev->xps_cpus_map);
 2278		nr_ids = nr_cpu_ids;
 2279	}
 2280
 2281	if (maps_sz < L1_CACHE_BYTES)
 2282		maps_sz = L1_CACHE_BYTES;
 2283
 2284	/* allocate memory for queue storage */
 2285	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
 2286	     j < nr_ids;) {
 2287		if (!new_dev_maps)
 2288			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 2289		if (!new_dev_maps) {
 2290			mutex_unlock(&xps_map_mutex);
 2291			return -ENOMEM;
 2292		}
 2293
 2294		tci = j * num_tc + tc;
 2295		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 2296				 NULL;
 2297
 2298		map = expand_xps_map(map, j, index, is_rxqs_map);
 2299		if (!map)
 2300			goto error;
 2301
 2302		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2303	}
 2304
 2305	if (!new_dev_maps)
 2306		goto out_no_new_maps;
 2307
 2308	if (!dev_maps) {
 2309		/* Increment static keys at most once per type */
 2310		static_key_slow_inc_cpuslocked(&xps_needed);
 2311		if (is_rxqs_map)
 2312			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 2313	}
 2314
 2315	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2316	     j < nr_ids;) {
 2317		/* copy maps belonging to foreign traffic classes */
 2318		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 2319			/* fill in the new device map from the old device map */
 2320			map = xmap_dereference(dev_maps->attr_map[tci]);
 2321			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2322		}
 2323
 2324		/* We need to explicitly update tci as the previous loop
 2325		 * could break out early if dev_maps is NULL.
 2326		 */
 2327		tci = j * num_tc + tc;
 2328
 2329		if (netif_attr_test_mask(j, mask, nr_ids) &&
 2330		    netif_attr_test_online(j, online_mask, nr_ids)) {
 2331			/* add tx-queue to CPU/rx-queue maps */
 2332			int pos = 0;
 2333
 2334			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2335			while ((pos < map->len) && (map->queues[pos] != index))
 2336				pos++;
 2337
 2338			if (pos == map->len)
 2339				map->queues[map->len++] = index;
 2340#ifdef CONFIG_NUMA
 2341			if (!is_rxqs_map) {
 2342				if (numa_node_id == -2)
 2343					numa_node_id = cpu_to_node(j);
 2344				else if (numa_node_id != cpu_to_node(j))
 2345					numa_node_id = -1;
 2346			}
 2347#endif
 2348		} else if (dev_maps) {
 2349			/* fill in the new device map from the old device map */
 2350			map = xmap_dereference(dev_maps->attr_map[tci]);
 2351			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2352		}
 2353
 2354		/* copy maps belonging to foreign traffic classes */
 2355		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 2356			/* fill in the new device map from the old device map */
 2357			map = xmap_dereference(dev_maps->attr_map[tci]);
 2358			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2359		}
 2360	}
 2361
 2362	if (is_rxqs_map)
 2363		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
 2364	else
 2365		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 2366
 2367	/* Cleanup old maps */
 2368	if (!dev_maps)
 2369		goto out_no_old_maps;
 2370
 2371	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2372	     j < nr_ids;) {
 2373		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2374			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2375			map = xmap_dereference(dev_maps->attr_map[tci]);
 2376			if (map && map != new_map)
 2377				kfree_rcu(map, rcu);
 2378		}
 2379	}
 2380
 2381	kfree_rcu(dev_maps, rcu);
 2382
 2383out_no_old_maps:
 2384	dev_maps = new_dev_maps;
 2385	active = true;
 2386
 2387out_no_new_maps:
 2388	if (!is_rxqs_map) {
 2389		/* update Tx queue numa node */
 2390		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
 2391					     (numa_node_id >= 0) ?
 2392					     numa_node_id : NUMA_NO_NODE);
 2393	}
 2394
 2395	if (!dev_maps)
 2396		goto out_no_maps;
 2397
 2398	/* removes tx-queue from unused CPUs/rx-queues */
 2399	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2400	     j < nr_ids;) {
 2401		for (i = tc, tci = j * num_tc; i--; tci++)
 2402			active |= remove_xps_queue(dev_maps, tci, index);
 2403		if (!netif_attr_test_mask(j, mask, nr_ids) ||
 2404		    !netif_attr_test_online(j, online_mask, nr_ids))
 2405			active |= remove_xps_queue(dev_maps, tci, index);
 2406		for (i = num_tc - tc, tci++; --i; tci++)
 2407			active |= remove_xps_queue(dev_maps, tci, index);
 2408	}
 2409
 2410	/* free map if not active */
 2411	if (!active)
 2412		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2413
 2414out_no_maps:
 2415	mutex_unlock(&xps_map_mutex);
 2416
 2417	return 0;
 2418error:
 2419	/* remove any maps that we added */
 2420	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2421	     j < nr_ids;) {
 2422		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2423			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2424			map = dev_maps ?
 2425			      xmap_dereference(dev_maps->attr_map[tci]) :
 2426			      NULL;
 2427			if (new_map && new_map != map)
 2428				kfree(new_map);
 2429		}
 2430	}
 2431
 2432	mutex_unlock(&xps_map_mutex);
 2433
 2434	kfree(new_dev_maps);
 2435	return -ENOMEM;
 2436}
 2437EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2438
 2439int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2440			u16 index)
 2441{
 2442	int ret;
 2443
 2444	cpus_read_lock();
 2445	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 2446	cpus_read_unlock();
 2447
 2448	return ret;
 2449}
 2450EXPORT_SYMBOL(netif_set_xps_queue);
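
/*
 * Editor's note -- illustrative sketch, not part of dev.c: a driver
 * suggesting a default XPS affinity of one CPU per TX queue, e.g. from
 * its open routine. Real drivers normally derive this from their IRQ
 * affinity; errors are ignored because XPS is only a transmit hint.
 */
static void example_set_default_xps(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		int cpu = cpumask_local_spread(i, NUMA_NO_NODE);

		netif_set_xps_queue(dev, cpumask_of(cpu), i);
	}
}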
 2451
 2452#endif
 2453static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2454{
 2455	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2456
 2457	/* Unbind any subordinate channels */
 2458	while (txq-- != &dev->_tx[0]) {
 2459		if (txq->sb_dev)
 2460			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2461	}
 2462}
 2463
 2464void netdev_reset_tc(struct net_device *dev)
 2465{
 2466#ifdef CONFIG_XPS
 2467	netif_reset_xps_queues_gt(dev, 0);
 2468#endif
 2469	netdev_unbind_all_sb_channels(dev);
 2470
 2471	/* Reset TC configuration of device */
 2472	dev->num_tc = 0;
 2473	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2474	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2475}
 2476EXPORT_SYMBOL(netdev_reset_tc);
 2477
 2478int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2479{
 2480	if (tc >= dev->num_tc)
 2481		return -EINVAL;
 2482
 2483#ifdef CONFIG_XPS
 2484	netif_reset_xps_queues(dev, offset, count);
 2485#endif
 2486	dev->tc_to_txq[tc].count = count;
 2487	dev->tc_to_txq[tc].offset = offset;
 2488	return 0;
 2489}
 2490EXPORT_SYMBOL(netdev_set_tc_queue);
 2491
 2492int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2493{
 2494	if (num_tc > TC_MAX_QUEUE)
 2495		return -EINVAL;
 2496
 2497#ifdef CONFIG_XPS
 2498	netif_reset_xps_queues_gt(dev, 0);
 2499#endif
 2500	netdev_unbind_all_sb_channels(dev);
 2501
 2502	dev->num_tc = num_tc;
 2503	return 0;
 2504}
 2505EXPORT_SYMBOL(netdev_set_num_tc);
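
/*
 * Editor's note -- illustrative sketch, not part of dev.c: the order a
 * driver typically follows when applying an mqprio-style configuration
 * with the helpers above. The equal queues_per_tc split is an
 * assumption for the sketch; real handlers take counts and offsets from
 * the mqprio request.
 */
static int example_apply_tc_config(struct net_device *dev, u8 num_tc,
				   u16 queues_per_tc)
{
	u8 prio, tc;
	int err;

	netdev_reset_tc(dev);

	err = netdev_set_num_tc(dev, num_tc);
	if (err)
		return err;

	for (tc = 0; tc < num_tc; tc++) {
		err = netdev_set_tc_queue(dev, tc, queues_per_tc,
					  tc * queues_per_tc);
		if (err)
			return err;
	}

	/* Start with every priority mapped to TC0; refine as needed. */
	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, 0);

	return 0;
}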
 2506
 2507void netdev_unbind_sb_channel(struct net_device *dev,
 2508			      struct net_device *sb_dev)
 2509{
 2510	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2511
 2512#ifdef CONFIG_XPS
 2513	netif_reset_xps_queues_gt(sb_dev, 0);
 2514#endif
 2515	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2516	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2517
 2518	while (txq-- != &dev->_tx[0]) {
 2519		if (txq->sb_dev == sb_dev)
 2520			txq->sb_dev = NULL;
 2521	}
 2522}
 2523EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2524
 2525int netdev_bind_sb_channel_queue(struct net_device *dev,
 2526				 struct net_device *sb_dev,
 2527				 u8 tc, u16 count, u16 offset)
 2528{
 2529	/* Make certain the sb_dev and dev are already configured */
 2530	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2531		return -EINVAL;
 2532
 2533	/* We cannot hand out queues we don't have */
 2534	if ((offset + count) > dev->real_num_tx_queues)
 2535		return -EINVAL;
 2536
 2537	/* Record the mapping */
 2538	sb_dev->tc_to_txq[tc].count = count;
 2539	sb_dev->tc_to_txq[tc].offset = offset;
 2540
 2541	/* Provide a way for Tx queue to find the tc_to_txq map or
 2542	 * XPS map for itself.
 2543	 */
 2544	while (count--)
 2545		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2546
 2547	return 0;
 2548}
 2549EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2550
 2551int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2552{
 2553	/* Do not use a multiqueue device to represent a subordinate channel */
 2554	if (netif_is_multiqueue(dev))
 2555		return -ENODEV;
 2556
 2557	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2558	 * Channel 0 is meant to be "native" mode and used only to represent
 2559	 * the main root device. We allow writing 0 to reset the device back
 2560	 * to normal mode after being used as a subordinate channel.
 2561	 */
 2562	if (channel > S16_MAX)
 2563		return -EINVAL;
 2564
 2565	dev->num_tc = -channel;
 2566
 2567	return 0;
 2568}
 2569EXPORT_SYMBOL(netdev_set_sb_channel);
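
/*
 * Editor's note -- illustrative sketch, not part of dev.c: a lower
 * device offloading a single-queue upper device (macvlan-style) by
 * claiming a subordinate channel for it and then handing it a slice of
 * TX queues with netdev_bind_sb_channel_queue(). The channel number,
 * queue count and offset are arbitrary, and the lower device is assumed
 * to already have at least one traffic class configured.
 */
static int example_offload_upper(struct net_device *lower,
				 struct net_device *upper)
{
	int err;

	/* Mark the upper device as subordinate channel 1. */
	err = netdev_set_sb_channel(upper, 1);
	if (err)
		return err;

	/* Give TC0 of the upper device 4 queues starting at offset 8. */
	return netdev_bind_sb_channel_queue(lower, upper, 0, 4, 8);
}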
 2570
 2571/*
 2572 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2573 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 2574 */
 2575int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2576{
 2577	bool disabling;
 2578	int rc;
 2579
 2580	disabling = txq < dev->real_num_tx_queues;
 2581
 2582	if (txq < 1 || txq > dev->num_tx_queues)
 2583		return -EINVAL;
 2584
 2585	if (dev->reg_state == NETREG_REGISTERED ||
 2586	    dev->reg_state == NETREG_UNREGISTERING) {
 2587		ASSERT_RTNL();
 2588
 2589		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2590						  txq);
 2591		if (rc)
 2592			return rc;
 2593
 2594		if (dev->num_tc)
 2595			netif_setup_tc(dev, txq);
 2596
 2597		dev->real_num_tx_queues = txq;
 2598
 2599		if (disabling) {
 2600			synchronize_net();
 2601			qdisc_reset_all_tx_gt(dev, txq);
 2602#ifdef CONFIG_XPS
 2603			netif_reset_xps_queues_gt(dev, txq);
 2604#endif
 2605		}
 2606	} else {
 2607		dev->real_num_tx_queues = txq;
 2608	}
 2609
 2610	return 0;
 2611}
 2612EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2613
 2614#ifdef CONFIG_SYSFS
 2615/**
 2616 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2617 *	@dev: Network device
 2618 *	@rxq: Actual number of RX queues
 2619 *
 2620 *	This must be called either with the rtnl_lock held or before
 2621 *	registration of the net device.  Returns 0 on success, or a
 2622 *	negative error code.  If called before registration, it always
 2623 *	succeeds.
 2624 */
 2625int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2626{
 2627	int rc;
 2628
 2629	if (rxq < 1 || rxq > dev->num_rx_queues)
 2630		return -EINVAL;
 2631
 2632	if (dev->reg_state == NETREG_REGISTERED) {
 2633		ASSERT_RTNL();
 2634
 2635		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 2636						  rxq);
 2637		if (rc)
 2638			return rc;
 2639	}
 2640
 2641	dev->real_num_rx_queues = rxq;
 2642	return 0;
 2643}
 2644EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 2645#endif
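
/*
 * Editor's note -- illustrative sketch, not part of dev.c: resizing the
 * active queue set of a registered device with the two helpers above,
 * e.g. from an ethtool ->set_channels() handler where the rtnl lock is
 * already held and the driver has already re-allocated its rings for
 * "count" channels.
 */
static int example_apply_channel_count(struct net_device *dev,
				       unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, count);
}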
 2646
 2647/**
 2648 * netif_get_num_default_rss_queues - default number of RSS queues
 2649 *
 2650 * This routine should set an upper limit on the number of RSS queues
 2651 * used by default by multiqueue devices.
 2652 */
 2653int netif_get_num_default_rss_queues(void)
 2654{
 2655	return is_kdump_kernel() ?
 2656		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 2657}
 2658EXPORT_SYMBOL(netif_get_num_default_rss_queues);
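
/*
 * Editor's note -- illustrative sketch, not part of dev.c: the usual
 * way drivers consume this helper, capping the default channel count by
 * a hardware limit; the limit of 64 queues is an arbitrary stand-in.
 */
static unsigned int example_default_channel_count(void)
{
	return min_t(unsigned int, netif_get_num_default_rss_queues(), 64);
}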
 2659
 2660static void __netif_reschedule(struct Qdisc *q)
 2661{
 2662	struct softnet_data *sd;
 2663	unsigned long flags;
 2664
 2665	local_irq_save(flags);
 2666	sd = this_cpu_ptr(&softnet_data);
 2667	q->next_sched = NULL;
 2668	*sd->output_queue_tailp = q;
 2669	sd->output_queue_tailp = &q->next_sched;
 2670	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 2671	local_irq_restore(flags);
 2672}
 2673
 2674void __netif_schedule(struct Qdisc *q)
 2675{
 2676	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 2677		__netif_reschedule(q);
 2678}
 2679EXPORT_SYMBOL(__netif_schedule);
 2680
 2681struct dev_kfree_skb_cb {
 2682	enum skb_free_reason reason;
 2683};
 2684
 2685static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 2686{
 2687	return (struct dev_kfree_skb_cb *)skb->cb;
 2688}
 2689
 2690void netif_schedule_queue(struct netdev_queue *txq)
 2691{
 2692	rcu_read_lock();
 2693	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
 2694		struct Qdisc *q = rcu_dereference(txq->qdisc);
 2695
 2696		__netif_schedule(q);
 2697	}
 2698	rcu_read_unlock();
 2699}
 2700EXPORT_SYMBOL(netif_schedule_queue);
 2701
 2702void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 2703{
 2704	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 2705		struct Qdisc *q;
 2706
 2707		rcu_read_lock();
 2708		q = rcu_dereference(dev_queue->qdisc);
 2709		__netif_schedule(q);
 2710		rcu_read_unlock();
 2711	}
 2712}
 2713EXPORT_SYMBOL(netif_tx_wake_queue);
 2714
 2715void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 2716{
 2717	unsigned long flags;
 2718
 2719	if (unlikely(!skb))
 2720		return;
 2721
 2722	if (likely(refcount_read(&skb->users) == 1)) {
 2723		smp_rmb();
 2724		refcount_set(&skb->users, 0);
 2725	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 2726		return;
 2727	}
 2728	get_kfree_skb_cb(skb)->reason = reason;
 2729	local_irq_save(flags);
 2730	skb->next = __this_cpu_read(softnet_data.completion_queue);
 2731	__this_cpu_write(softnet_data.completion_queue, skb);
 2732	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 2733	local_irq_restore(flags);
 2734}
 2735EXPORT_SYMBOL(__dev_kfree_skb_irq);
 2736
 2737void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 2738{
 2739	if (in_irq() || irqs_disabled())
 2740		__dev_kfree_skb_irq(skb, reason);
 2741	else
 2742		dev_kfree_skb(skb);
 2743}
 2744EXPORT_SYMBOL(__dev_kfree_skb_any);
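
/*
 * Editor's note -- illustrative sketch, not part of dev.c: the intended
 * callers of the helpers above. A TX-completion path that can run in
 * hard-IRQ context uses the _any variants so the skb is deferred to
 * NET_TX_SOFTIRQ rather than freed with IRQs off.
 */
static void example_tx_complete(struct sk_buff *skb, bool transmitted)
{
	if (transmitted)
		dev_consume_skb_any(skb);	/* successful TX, not a drop */
	else
		dev_kfree_skb_any(skb);		/* accounted as a drop */
}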
 2745
 2746
 2747/**
 2748 * netif_device_detach - mark device as removed
 2749 * @dev: network device
 2750 *
 2751 * Mark device as removed from system and therefore no longer available.
 2752 */
 2753void netif_device_detach(struct net_device *dev)
 2754{
 2755	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 2756	    netif_running(dev)) {
 2757		netif_tx_stop_all_queues(dev);
 2758	}
 2759}
 2760EXPORT_SYMBOL(netif_device_detach);
 2761
 2762/**
 2763 * netif_device_attach - mark device as attached
 2764 * @dev: network device
 2765 *
 2766 * Mark device as attached to the system and restart it if needed.
 2767 */
 2768void netif_device_attach(struct net_device *dev)
 2769{
 2770	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 2771	    netif_running(dev)) {
 2772		netif_tx_wake_all_queues(dev);
 2773		__netdev_watchdog_up(dev);
 2774	}
 2775}
 2776EXPORT_SYMBOL(netif_device_attach);
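
/*
 * Editor's note -- illustrative sketch, not part of dev.c: the usual
 * detach/attach pairing in a driver's suspend/resume (or reset) path;
 * the hardware-specific work is only indicated by comments.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... quiesce and power down the hardware here ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... power up and re-initialise the hardware here ... */
	netif_device_attach(dev);	/* wakes queues, restarts watchdog */
	return 0;
}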
 2777
 2778/*
 2779 * Returns a Tx hash based on the given packet descriptor and a Tx queue
 2780 * count to be used as a distribution range.
 2781 */
 2782static u16 skb_tx_hash(const struct net_device *dev,
 2783		       const struct net_device *sb_dev,
 2784		       struct sk_buff *skb)
 2785{
 2786	u32 hash;
 2787	u16 qoffset = 0;
 2788	u16 qcount = dev->real_num_tx_queues;
 2789
 2790	if (dev->num_tc) {
 2791		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 2792
 2793		qoffset = sb_dev->tc_to_txq[tc].offset;
 2794		qcount = sb_dev->tc_to_txq[tc].count;
 2795	}
 2796
 2797	if (skb_rx_queue_recorded(skb)) {
 2798		hash = skb_get_rx_queue(skb);
 2799		while (unlikely(hash >= qcount))
 2800			hash -= qcount;
 2801		return hash + qoffset;
 2802	}
 2803
 2804	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 2805}
 2806
 2807static void skb_warn_bad_offload(const struct sk_buff *skb)
 2808{
 2809	static const netdev_features_t null_features;
 2810	struct net_device *dev = skb->dev;
 2811	const char *name = "";
 2812
 2813	if (!net_ratelimit())
 2814		return;
 2815
 2816	if (dev) {
 2817		if (dev->dev.parent)
 2818			name = dev_driver_string(dev->dev.parent);
 2819		else
 2820			name = netdev_name(dev);
 2821	}
 2822	skb_dump(KERN_WARNING, skb, false);
 2823	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 2824	     name, dev ? &dev->features : &null_features,
 2825	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 2826}
 2827
 2828/*
 2829 * Invalidate hardware checksum when packet is to be mangled, and
 2830 * complete checksum manually on outgoing path.
 2831 */
 2832int skb_checksum_help(struct sk_buff *skb)
 2833{
 2834	__wsum csum;
 2835	int ret = 0, offset;
 2836
 2837	if (skb->ip_summed == CHECKSUM_COMPLETE)
 2838		goto out_set_summed;
 2839
 2840	if (unlikely(skb_shinfo(skb)->gso_size)) {
 2841		skb_warn_bad_offload(skb);
 2842		return -EINVAL;
 2843	}
 2844
 2845	/* Before computing a checksum, we should make sure no frag could
 2846	 * be modified by an external entity: the checksum could be wrong.
 2847	 */
 2848	if (skb_has_shared_frag(skb)) {
 2849		ret = __skb_linearize(skb);
 2850		if (ret)
 2851			goto out;
 2852	}
 2853
 2854	offset = skb_checksum_start_offset(skb);
 2855	BUG_ON(offset >= skb_headlen(skb));
 2856	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 2857
 2858	offset += skb->csum_offset;
 2859	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
 2860
 2861	if (skb_cloned(skb) &&
 2862	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
 2863		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
 2864		if (ret)
 2865			goto out;
 2866	}
 2867
 2868	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 2869out_set_summed:
 2870	skb->ip_summed = CHECKSUM_NONE;
 2871out:
 2872	return ret;
 2873}
 2874EXPORT_SYMBOL(skb_checksum_help);
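
/*
 * Editor's note -- illustrative sketch, not part of dev.c: a driver
 * xmit path resolving CHECKSUM_PARTIAL in software when its hardware
 * cannot offload the checksum for this packet; the hw_can_offload
 * predicate is an assumption standing in for a device capability check.
 */
static int example_tx_checksum(struct sk_buff *skb, bool hw_can_offload)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL || hw_can_offload)
		return 0;			/* nothing to do in software */

	return skb_checksum_help(skb);		/* fold the checksum now */
}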
 2875
 2876int skb_crc32c_csum_help(struct sk_buff *skb)
 2877{
 2878	__le32 crc32c_csum;
 2879	int ret = 0, offset, start;
 2880
 2881	if (skb->ip_summed != CHECKSUM_PARTIAL)
 2882		goto out;
 2883
 2884	if (unlikely(skb_is_gso(skb)))
 2885		goto out;
 2886
 2887	/* Before computing a checksum, we should make sure no frag could
 2888	 * be modified by an external entity: the checksum could be wrong.
 2889	 */
 2890	if (unlikely(skb_has_shared_frag(skb))) {
 2891		ret = __skb_linearize(skb);
 2892		if (ret)
 2893			goto out;
 2894	}
 2895	start = skb_checksum_start_offset(skb);
 2896	offset = start + offsetof(struct sctphdr, checksum);
 2897	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 2898		ret = -EINVAL;
 2899		goto out;
 2900	}
 2901	if (skb_cloned(skb) &&
 2902	    !skb_clone_writable(skb, offset + sizeof(__le32))) {
 2903		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
 2904		if (ret)
 2905			goto out;
 2906	}
 2907	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 2908						  skb->len - start, ~(__u32)0,
 2909						  crc32c_csum_stub));
 2910	*(__le32 *)(skb->data + offset) = crc32c_csum;
 2911	skb->ip_summed = CHECKSUM_NONE;
 2912	skb->csum_not_inet = 0;
 2913out:
 2914	return ret;
 2915}
 2916
 2917__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 2918{
 2919	__be16 type = skb->protocol;
 2920
 2921	/* Tunnel gso handlers can set protocol to ethernet. */
 2922	if (type == htons(ETH_P_TEB)) {
 2923		struct ethhdr *eth;
 2924
 2925		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 2926			return 0;
 2927
 2928		eth = (struct ethhdr *)skb->data;
 2929		type = eth->h_proto;
 2930	}
 2931
 2932	return __vlan_get_protocol(skb, type, depth);
 2933}
 2934
 2935/**
 2936 *	skb_mac_gso_segment - mac layer segmentation handler.
 2937 *	@skb: buffer to segment
 2938 *	@features: features for the output path (see dev->features)
 2939 */
 2940struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 2941				    netdev_features_t features)
 2942{
 2943	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 2944	struct packet_offload *ptype;
 2945	int vlan_depth = skb->mac_len;
 2946	__be16 type = skb_network_protocol(skb, &vlan_depth);
 2947
 2948	if (unlikely(!type))
 2949		return ERR_PTR(-EINVAL);
 2950
 2951	__skb_pull(skb, vlan_depth);
 2952
 2953	rcu_read_lock();
 2954	list_for_each_entry_rcu(ptype, &offload_base, list) {
 2955		if (ptype->type == type && ptype->callbacks.gso_segment) {
 2956			segs = ptype->callbacks.gso_segment(skb, features);
 2957			break;
 2958		}
 2959	}
 2960	rcu_read_unlock();
 2961
 2962	__skb_push(skb, skb->data - skb_mac_header(skb));
 2963
 2964	return segs;
 2965}
 2966EXPORT_SYMBOL(skb_mac_gso_segment);
 2967
 2968
 2969/* openvswitch calls this on rx path, so we need a different check.
 2970 */
 2971static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 2972{
 2973	if (tx_path)
 2974		return skb->ip_summed != CHECKSUM_PARTIAL &&
 2975		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 2976
 2977	return skb->ip_summed == CHECKSUM_NONE;
 2978}
 2979
 2980/**
 2981 *	__skb_gso_segment - Perform segmentation on skb.
 2982 *	@skb: buffer to segment
 2983 *	@features: features for the output path (see dev->features)
 2984 *	@tx_path: whether it is called in TX path
 2985 *
 2986 *	This function segments the given skb and returns a list of segments.
 2987 *
 2988 *	It may return NULL if the skb requires no segmentation.  This is
 2989 *	only possible when GSO is used for verifying header integrity.
 2990 *
 2991 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 2992 */
 2993struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 2994				  netdev_features_t features, bool tx_path)
 2995{
 2996	struct sk_buff *segs;
 2997
 2998	if (unlikely(skb_needs_check(skb, tx_path))) {
 2999		int err;
 3000
 3001		/* We're going to init ->check field in TCP or UDP header */
 3002		err = skb_cow_head(skb, 0);
 3003		if (err < 0)
 3004			return ERR_PTR(err);
 3005	}
 3006
 3007	/* Only report GSO partial support if it will enable us to
 3008	 * support segmentation on this frame without needing additional
 3009	 * work.
 3010	 */
 3011	if (features & NETIF_F_GSO_PARTIAL) {
 3012		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3013		struct net_device *dev = skb->dev;
 3014
 3015		partial_features |= dev->features & dev->gso_partial_features;
 3016		if (!skb_gso_ok(skb, features | partial_features))
 3017			features &= ~NETIF_F_GSO_PARTIAL;
 3018	}
 3019
 3020	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
 3021		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 3022
 3023	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3024	SKB_GSO_CB(skb)->encap_level = 0;
 3025
 3026	skb_reset_mac_header(skb);
 3027	skb_reset_mac_len(skb);
 3028
 3029	segs = skb_mac_gso_segment(skb, features);
 3030
 3031	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3032		skb_warn_bad_offload(skb);
 3033
 3034	return segs;
 3035}
 3036EXPORT_SYMBOL(__skb_gso_segment);
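
/*
 * Editor's note -- illustrative sketch, not part of dev.c: consuming
 * the list returned by skb_gso_segment() (the common wrapper around
 * __skb_gso_segment()). A real caller would transmit each segment; here
 * the segments are simply counted and freed to keep the sketch
 * self-contained.
 */
static int example_segment_and_count(struct sk_buff *skb,
				     netdev_features_t features)
{
	struct sk_buff *segs, *next;
	int count = 0;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return 0;		/* no segmentation was required */

	consume_skb(skb);		/* the original skb is now redundant */
	for (; segs; segs = next) {
		next = segs->next;
		skb_mark_not_on_list(segs);
		count++;
		kfree_skb(segs);	/* stand-in for actually transmitting */
	}
	return count;
}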
 3037
 3038/* Take action when hardware reception checksum errors are detected. */
 3039#ifdef CONFIG_BUG
 3040void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3041{
 3042	if (net_ratelimit()) {
 3043		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 3044		skb_dump(KERN_ERR, skb, true);
 3045		dump_stack();
 3046	}
 3047}
 3048EXPORT_SYMBOL(netdev_rx_csum_fault);
 3049#endif
 3050
 3051/* XXX: check that highmem exists at all on the given machine. */
 3052static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3053{
 3054#ifdef CONFIG_HIGHMEM
 3055	int i;
 3056
 3057	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3058		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3059			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3060
 3061			if (PageHighMem(skb_frag_page(frag)))
 3062				return 1;
 3063		}
 3064	}
 3065#endif
 3066	return 0;
 3067}
 3068
 3069/* If MPLS offload request, verify we are testing hardware MPLS features
 3070 * instead of standard features for the netdev.
 3071 */
 3072#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3073static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3074					   netdev_features_t features,
 3075					   __be16 type)
 3076{
 3077	if (eth_p_mpls(type))
 3078		features &= skb->dev->mpls_features;
 3079
 3080	return features;
 3081}
 3082#else
 3083static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3084					   netdev_features_t features,
 3085					   __be16 type)
 3086{
 3087	return features;
 3088}
 3089#endif
 3090
 3091static netdev_features_t harmonize_features(struct sk_buff *skb,
 3092	netdev_features_t features)
 3093{
 3094	int tmp;
 3095	__be16 type;
 3096
 3097	type = skb_network_protocol(skb, &tmp);
 3098	features = net_mpls_features(skb, features, type);
 3099
 3100	if (skb->ip_summed != CHECKSUM_NONE &&
 3101	    !can_checksum_protocol(features, type)) {
 3102		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3103	}
 3104	if (illegal_highdma(skb->dev, skb))
 3105		features &= ~NETIF_F_SG;
 3106
 3107	return features;
 3108}
 3109
 3110netdev_features_t passthru_features_check(struct sk_buff *skb,
 3111					  struct net_device *dev,
 3112					  netdev_features_t features)
 3113{
 3114	return features;
 3115}
 3116EXPORT_SYMBOL(passthru_features_check);
 3117
 3118static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3119					     struct net_device *dev,
 3120					     netdev_features_t features)
 3121{
 3122	return vlan_features_check(skb, features);
 3123}
 3124
 3125static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3126					    struct net_device *dev,
 3127					    netdev_features_t features)
 3128{
 3129	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3130
 3131	if (gso_segs > dev->gso_max_segs)
 3132		return features & ~NETIF_F_GSO_MASK;
 3133
 3134	/* Support for GSO partial features requires software
 3135	 * intervention before we can actually process the packets
 3136	 * so we need to strip support for any partial features now
 3137	 * and we can pull them back in after we have partially
 3138	 * segmented the frame.
 3139	 */
 3140	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3141		features &= ~dev->gso_partial_features;
 3142
 3143	/* Make sure to clear the IPv4 ID mangling feature if the
 3144	 * IPv4 header has the potential to be fragmented.
 3145	 */
 3146	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3147		struct iphdr *iph = skb->encapsulation ?
 3148				    inner_ip_hdr(skb) : ip_hdr(skb);
 3149
 3150		if (!(iph->frag_off & htons(IP_DF)))
 3151			features &= ~NETIF_F_TSO_MANGLEID;
 3152	}
 3153
 3154	return features;
 3155}
 3156
 3157netdev_features_t netif_skb_features(struct sk_buff *skb)
 3158{
 3159	struct net_device *dev = skb->dev;
 3160	netdev_features_t features = dev->features;
 3161
 3162	if (skb_is_gso(skb))
 3163		features = gso_features_check(skb, dev, features);
 3164
 3165	/* If encapsulation offload request, verify we are testing
 3166	 * hardware encapsulation features instead of standard
 3167	 * features for the netdev
 3168	 */
 3169	if (skb->encapsulation)
 3170		features &= dev->hw_enc_features;
 3171
 3172	if (skb_vlan_tagged(skb))
 3173		features = netdev_intersect_features(features,
 3174						     dev->vlan_features |
 3175						     NETIF_F_HW_VLAN_CTAG_TX |
 3176						     NETIF_F_HW_VLAN_STAG_TX);
 3177
 3178	if (dev->netdev_ops->ndo_features_check)
 3179		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3180								features);
 3181	else
 3182		features &= dflt_features_check(skb, dev, features);
 3183
 3184	return harmonize_features(skb, features);
 3185}
 3186EXPORT_SYMBOL(netif_skb_features);
 3187
 3188static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3189		    struct netdev_queue *txq, bool more)
 3190{
 3191	unsigned int len;
 3192	int rc;
 3193
 3194	if (dev_nit_active(dev))
 3195		dev_queue_xmit_nit(skb, dev);
 3196
 3197	len = skb->len;
 3198	trace_net_dev_start_xmit(skb, dev);
 3199	rc = netdev_start_xmit(skb, dev, txq, more);
 3200	trace_net_dev_xmit(skb, rc, dev, len);
 3201
 3202	return rc;
 3203}
 3204
 3205struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3206				    struct netdev_queue *txq, int *ret)
 3207{
 3208	struct sk_buff *skb = first;
 3209	int rc = NETDEV_TX_OK;
 3210
 3211	while (skb) {
 3212		struct sk_buff *next = skb->next;
 3213
 3214		skb_mark_not_on_list(skb);
 3215		rc = xmit_one(skb, dev, txq, next != NULL);
 3216		if (unlikely(!dev_xmit_complete(rc))) {
 3217			skb->next = next;
 3218			goto out;
 3219		}
 3220
 3221		skb = next;
 3222		if (netif_tx_queue_stopped(txq) && skb) {
 3223			rc = NETDEV_TX_BUSY;
 3224			break;
 3225		}
 3226	}
 3227
 3228out:
 3229	*ret = rc;
 3230	return skb;
 3231}
 3232
 3233static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3234					  netdev_features_t features)
 3235{
 3236	if (skb_vlan_tag_present(skb) &&
 3237	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3238		skb = __vlan_hwaccel_push_inside(skb);
 3239	return skb;
 3240}
 3241
 3242int skb_csum_hwoffload_help(struct sk_buff *skb,
 3243			    const netdev_features_t features)
 3244{
 3245	if (unlikely(skb->csum_not_inet))
 3246		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3247			skb_crc32c_csum_help(skb);
 3248
 3249	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 3250}
 3251EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3252
 3253static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3254{
 3255	netdev_features_t features;
 3256
 3257	features = netif_skb_features(skb);
 3258	skb = validate_xmit_vlan(skb, features);
 3259	if (unlikely(!skb))
 3260		goto out_null;
 3261
 3262	skb = sk_validate_xmit_skb(skb, dev);
 3263	if (unlikely(!skb))
 3264		goto out_null;
 3265
 3266	if (netif_needs_gso(skb, features)) {
 3267		struct sk_buff *segs;
 3268
 3269		segs = skb_gso_segment(skb, features);
 3270		if (IS_ERR(segs)) {
 3271			goto out_kfree_skb;
 3272		} else if (segs) {
 3273			consume_skb(skb);
 3274			skb = segs;
 3275		}
 3276	} else {
 3277		if (skb_needs_linearize(skb, features) &&
 3278		    __skb_linearize(skb))
 3279			goto out_kfree_skb;
 3280
 3281		/* If packet is not checksummed and device does not
 3282		 * support checksumming for this protocol, complete
 3283		 * checksumming here.
 3284		 */
 3285		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3286			if (skb->encapsulation)
 3287				skb_set_inner_transport_header(skb,
 3288							       skb_checksum_start_offset(skb));
 3289			else
 3290				skb_set_transport_header(skb,
 3291							 skb_checksum_start_offset(skb));
 3292			if (skb_csum_hwoffload_help(skb, features))
 3293				goto out_kfree_skb;
 3294		}
 3295	}
 3296
 3297	skb = validate_xmit_xfrm(skb, features, again);
 3298
 3299	return skb;
 3300
 3301out_kfree_skb:
 3302	kfree_skb(skb);
 3303out_null:
 3304	atomic_long_inc(&dev->tx_dropped);
 3305	return NULL;
 3306}
 3307
 3308struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3309{
 3310	struct sk_buff *next, *head = NULL, *tail;
 3311
 3312	for (; skb != NULL; skb = next) {
 3313		next = skb->next;
 3314		skb_mark_not_on_list(skb);
 3315
 3316		/* in case the skb won't be segmented, point it to itself */
 3317		skb->prev = skb;
 3318
 3319		skb = validate_xmit_skb(skb, dev, again);
 3320		if (!skb)
 3321			continue;
 3322
 3323		if (!head)
 3324			head = skb;
 3325		else
 3326			tail->next = skb;
 3327		/* If skb was segmented, skb->prev points to
 3328		 * the last segment. If not, it still contains skb.
 3329		 */
 3330		tail = skb->prev;
 3331	}
 3332	return head;
 3333}
 3334EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3335
 3336static void qdisc_pkt_len_init(struct sk_buff *skb)
 3337{
 3338	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3339
 3340	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3341
 3342	/* To get a more precise estimate of bytes sent on the wire,
 3343	 * we add the header size of all segments to pkt_len
 3344	 */
 3345	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3346		unsigned int hdr_len;
 3347		u16 gso_segs = shinfo->gso_segs;
 3348
 3349		/* mac layer + network layer */
 3350		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3351
 3352		/* + transport layer */
 3353		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3354			const struct tcphdr *th;
 3355			struct tcphdr _tcphdr;
 3356
 3357			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3358						sizeof(_tcphdr), &_tcphdr);
 3359			if (likely(th))
 3360				hdr_len += __tcp_hdrlen(th);
 3361		} else {
 3362			struct udphdr _udphdr;
 3363
 3364			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3365					       sizeof(_udphdr), &_udphdr))
 3366				hdr_len += sizeof(struct udphdr);
 3367		}
 3368
 3369		if (shinfo->gso_type & SKB_GSO_DODGY)
 3370			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3371						shinfo->gso_size);
 3372
 3373		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3374	}
 3375}
 3376
 3377static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 3378				 struct net_device *dev,
 3379				 struct netdev_queue *txq)
 3380{
 3381	spinlock_t *root_lock = qdisc_lock(q);
 3382	struct sk_buff *to_free = NULL;
 3383	bool contended;
 3384	int rc;
 3385
 3386	qdisc_calculate_pkt_len(skb, q);
 3387
 3388	if (q->flags & TCQ_F_NOLOCK) {
 3389		if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty &&
 3390		    qdisc_run_begin(q)) {
 3391			if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
 3392					      &q->state))) {
 3393				__qdisc_drop(skb, &to_free);
 3394				rc = NET_XMIT_DROP;
 3395				goto end_run;
 3396			}
 3397			qdisc_bstats_cpu_update(q, skb);
 3398
 3399			rc = NET_XMIT_SUCCESS;
 3400			if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
 3401				__qdisc_run(q);
 3402
 3403end_run:
 3404			qdisc_run_end(q);
 3405		} else {
 3406			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 3407			qdisc_run(q);
 3408		}
 3409
 3410		if (unlikely(to_free))
 3411			kfree_skb_list(to_free);
 3412		return rc;
 3413	}
 3414
 3415	/*
 3416	 * Heuristic to force contended enqueues to serialize on a
 3417	 * separate lock before trying to get qdisc main lock.
 3418	 * This permits qdisc->running owner to get the lock more
 3419	 * often and dequeue packets faster.
 3420	 */
 3421	contended = qdisc_is_running(q);
 3422	if (unlikely(contended))
 3423		spin_lock(&q->busylock);
 3424
 3425	spin_lock(root_lock);
 3426	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 3427		__qdisc_drop(skb, &to_free);
 3428		rc = NET_XMIT_DROP;
 3429	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 3430		   qdisc_run_begin(q)) {
 3431		/*
 3432		 * This is a work-conserving queue; there are no old skbs
 3433		 * waiting to be sent out; and the qdisc is not running -
 3434		 * xmit the skb directly.
 3435		 */
 3436
 3437		qdisc_bstats_update(q, skb);
 3438
 3439		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
 3440			if (unlikely(contended)) {
 3441				spin_unlock(&q->busylock);
 3442				contended = false;
 3443			}
 3444			__qdisc_run(q);
 3445		}
 3446
 3447		qdisc_run_end(q);
 3448		rc = NET_XMIT_SUCCESS;
 3449	} else {
 3450		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 3451		if (qdisc_run_begin(q)) {
 3452			if (unlikely(contended)) {
 3453				spin_unlock(&q->busylock);
 3454				contended = false;
 3455			}
 3456			__qdisc_run(q);
 3457			qdisc_run_end(q);
 3458		}
 3459	}
 3460	spin_unlock(root_lock);
 3461	if (unlikely(to_free))
 3462		kfree_skb_list(to_free);
 3463	if (unlikely(contended))
 3464		spin_unlock(&q->busylock);
 3465	return rc;
 3466}
 3467
 3468#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3469static void skb_update_prio(struct sk_buff *skb)
 3470{
 3471	const struct netprio_map *map;
 3472	const struct sock *sk;
 3473	unsigned int prioidx;
 3474
 3475	if (skb->priority)
 3476		return;
 3477	map = rcu_dereference_bh(skb->dev->priomap);
 3478	if (!map)
 3479		return;
 3480	sk = skb_to_full_sk(skb);
 3481	if (!sk)
 3482		return;
 3483
 3484	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3485
 3486	if (prioidx < map->priomap_len)
 3487		skb->priority = map->priomap[prioidx];
 3488}
 3489#else
 3490#define skb_update_prio(skb)
 3491#endif
 3492
 3493/**
 3494 *	dev_loopback_xmit - loop back @skb
 3495 *	@net: network namespace this loopback is happening in
 3496 *	@sk:  sk needed to be a netfilter okfn
 3497 *	@skb: buffer to transmit
 3498 */
 3499int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 3500{
 3501	skb_reset_mac_header(skb);
 3502	__skb_pull(skb, skb_network_offset(skb));
 3503	skb->pkt_type = PACKET_LOOPBACK;
 3504	skb->ip_summed = CHECKSUM_UNNECESSARY;
 3505	WARN_ON(!skb_dst(skb));
 3506	skb_dst_force(skb);
 3507	netif_rx_ni(skb);
 3508	return 0;
 3509}
 3510EXPORT_SYMBOL(dev_loopback_xmit);
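
/*
 * Editor's note -- illustrative sketch, not part of dev.c: the typical
 * caller pattern, looping a clone of an outgoing multicast packet back
 * through the POST_ROUTING hook with dev_loopback_xmit() as the okfn,
 * much as the IPv4 multicast output path does. Assumes the clone keeps
 * a valid dst (dev_loopback_xmit() warns otherwise) and that
 * <linux/netfilter.h> is available for NF_HOOK().
 */
static void example_loop_multicast_copy(struct net *net, struct sock *sk,
					struct sk_buff *skb)
{
	struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);

	if (copy)
		NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, copy,
			NULL, copy->dev, dev_loopback_xmit);
}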
 3511
 3512#ifdef CONFIG_NET_EGRESS
 3513static struct sk_buff *
 3514sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 3515{
 3516	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 3517	struct tcf_result cl_res;
 3518
 3519	if (!miniq)
 3520		return skb;
 3521
 3522	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 3523	mini_qdisc_bstats_cpu_update(miniq, skb);
 3524
 3525	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 3526	case TC_ACT_OK:
 3527	case TC_ACT_RECLASSIFY:
 3528		skb->tc_index = TC_H_MIN(cl_res.classid);
 3529		break;
 3530	case TC_ACT_SHOT:
 3531		mini_qdisc_qstats_cpu_drop(miniq);
 3532		*ret = NET_XMIT_DROP;
 3533		kfree_skb(skb);
 3534		return NULL;
 3535	case TC_ACT_STOLEN:
 3536	case TC_ACT_QUEUED:
 3537	case TC_ACT_TRAP:
 3538		*ret = NET_XMIT_SUCCESS;
 3539		consume_skb(skb);
 3540		return NULL;
 3541	case TC_ACT_REDIRECT:
 3542		/* No need to push/pop skb's mac_header here on egress! */
 3543		skb_do_redirect(skb);
 3544		*ret = NET_XMIT_SUCCESS;
 3545		return NULL;
 3546	default:
 3547		break;
 3548	}
 3549
 3550	return skb;
 3551}
 3552#endif /* CONFIG_NET_EGRESS */
 3553
 3554#ifdef CONFIG_XPS
 3555static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3556			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3557{
 3558	struct xps_map *map;
 3559	int queue_index = -1;
 3560
 3561	if (dev->num_tc) {
 3562		tci *= dev->num_tc;
 3563		tci += netdev_get_prio_tc_map(dev, skb->priority);
 3564	}
 3565
 3566	map = rcu_dereference(dev_maps->attr_map[tci]);
 3567	if (map) {
 3568		if (map->len == 1)
 3569			queue_index = map->queues[0];
 3570		else
 3571			queue_index = map->queues[reciprocal_scale(
 3572						skb_get_hash(skb), map->len)];
 3573		if (unlikely(queue_index >= dev->real_num_tx_queues))
 3574			queue_index = -1;
 3575	}
 3576	return queue_index;
 3577}
 3578#endif
 3579
 3580static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 3581			 struct sk_buff *skb)
 3582{
 3583#ifdef CONFIG_XPS
 3584	struct xps_dev_maps *dev_maps;
 3585	struct sock *sk = skb->sk;
 3586	int queue_index = -1;
 3587
 3588	if (!static_key_false(&xps_needed))
 3589		return -1;
 3590
 3591	rcu_read_lock();
 3592	if (!static_key_false(&xps_rxqs_needed))
 3593		goto get_cpus_map;
 3594
 3595	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 3596	if (dev_maps) {
 3597		int tci = sk_rx_queue_get(sk);
 3598
 3599		if (tci >= 0 && tci < dev->num_rx_queues)
 3600			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3601							  tci);
 3602	}
 3603
 3604get_cpus_map:
 3605	if (queue_index < 0) {
 3606		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 3607		if (dev_maps) {
 3608			unsigned int tci = skb->sender_cpu - 1;
 3609
 3610			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3611							  tci);
 3612		}
 3613	}
 3614	rcu_read_unlock();
 3615
 3616	return queue_index;
 3617#else
 3618	return -1;
 3619#endif
 3620}
 3621
 3622u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 3623		     struct net_device *sb_dev)
 3624{
 3625	return 0;
 3626}
 3627EXPORT_SYMBOL(dev_pick_tx_zero);
 3628
 3629u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 3630		       struct net_device *sb_dev)
 3631{
 3632	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 3633}
 3634EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 3635
 3636u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 3637		     struct net_device *sb_dev)
 3638{
 3639	struct sock *sk = skb->sk;
 3640	int queue_index = sk_tx_queue_get(sk);
 3641
 3642	sb_dev = sb_dev ? : dev;
 3643
 3644	if (queue_index < 0 || skb->ooo_okay ||
 3645	    queue_index >= dev->real_num_tx_queues) {
 3646		int new_index = get_xps_queue(dev, sb_dev, skb);
 3647
 3648		if (new_index < 0)
 3649			new_index = skb_tx_hash(dev, sb_dev, skb);
 3650
 3651		if (queue_index != new_index && sk &&
 3652		    sk_fullsock(sk) &&
 3653		    rcu_access_pointer(sk->sk_dst_cache))
 3654			sk_tx_queue_set(sk, new_index);
 3655
 3656		queue_index = new_index;
 3657	}
 3658
 3659	return queue_index;
 3660}
 3661EXPORT_SYMBOL(netdev_pick_tx);
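
/*
 * Editor's note -- illustrative sketch, not part of dev.c: a driver's
 * ndo_select_queue() that pins one traffic type to a dedicated queue
 * (here, PTP-over-L2 frames to the last TX queue, an arbitrary choice)
 * and defers every other packet to netdev_pick_tx().
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				struct net_device *sb_dev)
{
	if (unlikely(skb->protocol == htons(ETH_P_1588)))
		return dev->real_num_tx_queues - 1;

	return netdev_pick_tx(dev, skb, sb_dev);
}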
 3662
 3663struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 3664					 struct sk_buff *skb,
 3665					 struct net_device *sb_dev)
 3666{
 3667	int queue_index = 0;
 3668
 3669#ifdef CONFIG_XPS
 3670	u32 sender_cpu = skb->sender_cpu - 1;
 3671
 3672	if (sender_cpu >= (u32)NR_CPUS)
 3673		skb->sender_cpu = raw_smp_processor_id() + 1;
 3674#endif
 3675
 3676	if (dev->real_num_tx_queues != 1) {
 3677		const struct net_device_ops *ops = dev->netdev_ops;
 3678
 3679		if (ops->ndo_select_queue)
 3680			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 3681		else
 3682			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 3683
 3684		queue_index = netdev_cap_txqueue(dev, queue_index);
 3685	}
 3686
 3687	skb_set_queue_mapping(skb, queue_index);
 3688	return netdev_get_tx_queue(dev, queue_index);
 3689}
 3690
 3691/**
 3692 *	__dev_queue_xmit - transmit a buffer
 3693 *	@skb: buffer to transmit
 3694 *	@sb_dev: subordinate device used for L2 forwarding offload
 3695 *
 3696 *	Queue a buffer for transmission to a network device. The caller must
 3697 *	have set the device and priority and built the buffer before calling
 3698 *	this function. The function can be called from an interrupt.
 3699 *
 3700 *	A negative errno code is returned on a failure. A success does not
 3701 *	guarantee the frame will be transmitted as it may be dropped due
 3702 *	to congestion or traffic shaping.
 3703 *
 3704 * -----------------------------------------------------------------------------------
 3705 *      I notice this method can also return errors from the queue disciplines,
 3706 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 3707 *      be positive.
 3708 *
 3709 *      Regardless of the return value, the skb is consumed, so it is currently
 3710 *      difficult to retry a send to this method.  (You can bump the ref count
 3711 *      before sending to hold a reference for retry if you are careful.)
 3712 *
 3713 *      When calling this method, interrupts MUST be enabled.  This is because
 3714 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 3715 *          --BLG
 3716 */
 3717static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 3718{
 3719	struct net_device *dev = skb->dev;
 3720	struct netdev_queue *txq;
 3721	struct Qdisc *q;
 3722	int rc = -ENOMEM;
 3723	bool again = false;
 3724
 3725	skb_reset_mac_header(skb);
 3726
 3727	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 3728		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
 3729
 3730	/* Disable soft irqs for various locks below. Also
 3731	 * stops preemption for RCU.
 3732	 */
 3733	rcu_read_lock_bh();
 3734
 3735	skb_update_prio(skb);
 3736
 3737	qdisc_pkt_len_init(skb);
 3738#ifdef CONFIG_NET_CLS_ACT
 3739	skb->tc_at_ingress = 0;
 3740# ifdef CONFIG_NET_EGRESS
 3741	if (static_branch_unlikely(&egress_needed_key)) {
 3742		skb = sch_handle_egress(skb, &rc, dev);
 3743		if (!skb)
 3744			goto out;
 3745	}
 3746# endif
 3747#endif
 3748	/* If device/qdisc don't need skb->dst, release it right now while
3749	 * it's hot in this cpu cache.
 3750	 */
 3751	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 3752		skb_dst_drop(skb);
 3753	else
 3754		skb_dst_force(skb);
 3755
 3756	txq = netdev_core_pick_tx(dev, skb, sb_dev);
 3757	q = rcu_dereference_bh(txq->qdisc);
 3758
 3759	trace_net_dev_queue(skb);
 3760	if (q->enqueue) {
 3761		rc = __dev_xmit_skb(skb, q, dev, txq);
 3762		goto out;
 3763	}
 3764
 3765	/* The device has no queue. Common case for software devices:
 3766	 * loopback, all the sorts of tunnels...
3767	 *
3768	 * Really, it is unlikely that netif_tx_lock protection is necessary
3769	 * here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
3770	 * counters.)
3771	 * However, it is possible that they rely on the protection
3772	 * we provide here.
3773	 *
3774	 * Check this and take the lock; it is not prone to deadlocks.
3775	 * Either that, or take the noqueue qdisc path, which is even simpler 8)
 3776	 */
 3777	if (dev->flags & IFF_UP) {
 3778		int cpu = smp_processor_id(); /* ok because BHs are off */
 3779
 3780		if (txq->xmit_lock_owner != cpu) {
 3781			if (dev_xmit_recursion())
 3782				goto recursion_alert;
 3783
 3784			skb = validate_xmit_skb(skb, dev, &again);
 3785			if (!skb)
 3786				goto out;
 3787
 3788			HARD_TX_LOCK(dev, txq, cpu);
 3789
 3790			if (!netif_xmit_stopped(txq)) {
 3791				dev_xmit_recursion_inc();
 3792				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
 3793				dev_xmit_recursion_dec();
 3794				if (dev_xmit_complete(rc)) {
 3795					HARD_TX_UNLOCK(dev, txq);
 3796					goto out;
 3797				}
 3798			}
 3799			HARD_TX_UNLOCK(dev, txq);
 3800			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
 3801					     dev->name);
 3802		} else {
 3803			/* Recursion is detected! It is possible,
 3804			 * unfortunately
 3805			 */
 3806recursion_alert:
 3807			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
 3808					     dev->name);
 3809		}
 3810	}
 3811
 3812	rc = -ENETDOWN;
 3813	rcu_read_unlock_bh();
 3814
 3815	atomic_long_inc(&dev->tx_dropped);
 3816	kfree_skb_list(skb);
 3817	return rc;
 3818out:
 3819	rcu_read_unlock_bh();
 3820	return rc;
 3821}
 3822
 3823int dev_queue_xmit(struct sk_buff *skb)
 3824{
 3825	return __dev_queue_xmit(skb, NULL);
 3826}
 3827EXPORT_SYMBOL(dev_queue_xmit);
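
/*
 * Editor's note: an illustrative sketch (not part of this file) of handing a
 * pre-built frame to dev_queue_xmit(), per the rules documented above: the
 * caller sets skb->dev and builds the buffer first, interrupts are enabled,
 * and the skb is consumed whatever the return value. The helper name and the
 * ETH_P_802_EX1 protocol choice are assumptions for illustration only.
 */
static int __maybe_unused example_xmit_frame(struct net_device *dev,
					     const void *payload,
					     unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_put_data(skb, payload, len);
	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_EX1);

	/* A negative errno or a positive NET_XMIT_* code may come back. */
	return dev_queue_xmit(skb);
}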
 3828
 3829int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 3830{
 3831	return __dev_queue_xmit(skb, sb_dev);
 3832}
 3833EXPORT_SYMBOL(dev_queue_xmit_accel);
 3834
 3835int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 3836{
 3837	struct net_device *dev = skb->dev;
 3838	struct sk_buff *orig_skb = skb;
 3839	struct netdev_queue *txq;
 3840	int ret = NETDEV_TX_BUSY;
 3841	bool again = false;
 3842
 3843	if (unlikely(!netif_running(dev) ||
 3844		     !netif_carrier_ok(dev)))
 3845		goto drop;
 3846
 3847	skb = validate_xmit_skb_list(skb, dev, &again);
 3848	if (skb != orig_skb)
 3849		goto drop;
 3850
 3851	skb_set_queue_mapping(skb, queue_id);
 3852	txq = skb_get_tx_queue(dev, skb);
 3853
 3854	local_bh_disable();
 3855
 3856	HARD_TX_LOCK(dev, txq, smp_processor_id());
 3857	if (!netif_xmit_frozen_or_drv_stopped(txq))
 3858		ret = netdev_start_xmit(skb, dev, txq, false);
 3859	HARD_TX_UNLOCK(dev, txq);
 3860
 3861	local_bh_enable();
 3862
 3863	if (!dev_xmit_complete(ret))
 3864		kfree_skb(skb);
 3865
 3866	return ret;
 3867drop:
 3868	atomic_long_inc(&dev->tx_dropped);
 3869	kfree_skb_list(skb);
 3870	return NET_XMIT_DROP;
 3871}
 3872EXPORT_SYMBOL(dev_direct_xmit);
 3873
 3874/*************************************************************************
 3875 *			Receiver routines
 3876 *************************************************************************/
 3877
 3878int netdev_max_backlog __read_mostly = 1000;
 3879EXPORT_SYMBOL(netdev_max_backlog);
 3880
 3881int netdev_tstamp_prequeue __read_mostly = 1;
 3882int netdev_budget __read_mostly = 300;
 3883unsigned int __read_mostly netdev_budget_usecs = 2000;
 3884int weight_p __read_mostly = 64;           /* old backlog weight */
 3885int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 3886int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 3887int dev_rx_weight __read_mostly = 64;
 3888int dev_tx_weight __read_mostly = 64;
 3889/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
 3890int gro_normal_batch __read_mostly = 8;
 3891
 3892/* Called with irq disabled */
 3893static inline void ____napi_schedule(struct softnet_data *sd,
 3894				     struct napi_struct *napi)
 3895{
 3896	list_add_tail(&napi->poll_list, &sd->poll_list);
 3897	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 3898}
 3899
 3900#ifdef CONFIG_RPS
 3901
 3902/* One global table that all flow-based protocols share. */
 3903struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 3904EXPORT_SYMBOL(rps_sock_flow_table);
 3905u32 rps_cpu_mask __read_mostly;
 3906EXPORT_SYMBOL(rps_cpu_mask);
 3907
 3908struct static_key_false rps_needed __read_mostly;
 3909EXPORT_SYMBOL(rps_needed);
 3910struct static_key_false rfs_needed __read_mostly;
 3911EXPORT_SYMBOL(rfs_needed);
 3912
 3913static struct rps_dev_flow *
 3914set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 3915	    struct rps_dev_flow *rflow, u16 next_cpu)
 3916{
 3917	if (next_cpu < nr_cpu_ids) {
 3918#ifdef CONFIG_RFS_ACCEL
 3919		struct netdev_rx_queue *rxqueue;
 3920		struct rps_dev_flow_table *flow_table;
 3921		struct rps_dev_flow *old_rflow;
 3922		u32 flow_id;
 3923		u16 rxq_index;
 3924		int rc;
 3925
 3926		/* Should we steer this flow to a different hardware queue? */
 3927		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 3928		    !(dev->features & NETIF_F_NTUPLE))
 3929			goto out;
 3930		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 3931		if (rxq_index == skb_get_rx_queue(skb))
 3932			goto out;
 3933
 3934		rxqueue = dev->_rx + rxq_index;
 3935		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 3936		if (!flow_table)
 3937			goto out;
 3938		flow_id = skb_get_hash(skb) & flow_table->mask;
 3939		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 3940							rxq_index, flow_id);
 3941		if (rc < 0)
 3942			goto out;
 3943		old_rflow = rflow;
 3944		rflow = &flow_table->flows[flow_id];
 3945		rflow->filter = rc;
 3946		if (old_rflow->filter == rflow->filter)
 3947			old_rflow->filter = RPS_NO_FILTER;
 3948	out:
 3949#endif
 3950		rflow->last_qtail =
 3951			per_cpu(softnet_data, next_cpu).input_queue_head;
 3952	}
 3953
 3954	rflow->cpu = next_cpu;
 3955	return rflow;
 3956}
 3957
 3958/*
 3959 * get_rps_cpu is called from netif_receive_skb and returns the target
 3960 * CPU from the RPS map of the receiving queue for a given skb.
 3961 * rcu_read_lock must be held on entry.
 3962 */
 3963static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 3964		       struct rps_dev_flow **rflowp)
 3965{
 3966	const struct rps_sock_flow_table *sock_flow_table;
 3967	struct netdev_rx_queue *rxqueue = dev->_rx;
 3968	struct rps_dev_flow_table *flow_table;
 3969	struct rps_map *map;
 3970	int cpu = -1;
 3971	u32 tcpu;
 3972	u32 hash;
 3973
 3974	if (skb_rx_queue_recorded(skb)) {
 3975		u16 index = skb_get_rx_queue(skb);
 3976
 3977		if (unlikely(index >= dev->real_num_rx_queues)) {
 3978			WARN_ONCE(dev->real_num_rx_queues > 1,
 3979				  "%s received packet on queue %u, but number "
 3980				  "of RX queues is %u\n",
 3981				  dev->name, index, dev->real_num_rx_queues);
 3982			goto done;
 3983		}
 3984		rxqueue += index;
 3985	}
 3986
 3987	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 3988
 3989	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 3990	map = rcu_dereference(rxqueue->rps_map);
 3991	if (!flow_table && !map)
 3992		goto done;
 3993
 3994	skb_reset_network_header(skb);
 3995	hash = skb_get_hash(skb);
 3996	if (!hash)
 3997		goto done;
 3998
 3999	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 4000	if (flow_table && sock_flow_table) {
 4001		struct rps_dev_flow *rflow;
 4002		u32 next_cpu;
 4003		u32 ident;
 4004
 4005		/* First check into global flow table if there is a match */
 4006		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
 4007		if ((ident ^ hash) & ~rps_cpu_mask)
 4008			goto try_rps;
 4009
 4010		next_cpu = ident & rps_cpu_mask;
 4011
 4012		/* OK, now we know there is a match,
 4013		 * we can look at the local (per receive queue) flow table
 4014		 */
 4015		rflow = &flow_table->flows[hash & flow_table->mask];
 4016		tcpu = rflow->cpu;
 4017
 4018		/*
 4019		 * If the desired CPU (where last recvmsg was done) is
 4020		 * different from current CPU (one in the rx-queue flow
 4021		 * table entry), switch if one of the following holds:
 4022		 *   - Current CPU is unset (>= nr_cpu_ids).
 4023		 *   - Current CPU is offline.
 4024		 *   - The current CPU's queue tail has advanced beyond the
 4025		 *     last packet that was enqueued using this table entry.
 4026		 *     This guarantees that all previous packets for the flow
 4027		 *     have been dequeued, thus preserving in order delivery.
 4028		 */
 4029		if (unlikely(tcpu != next_cpu) &&
 4030		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4031		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4032		      rflow->last_qtail)) >= 0)) {
 4033			tcpu = next_cpu;
 4034			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4035		}
 4036
 4037		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4038			*rflowp = rflow;
 4039			cpu = tcpu;
 4040			goto done;
 4041		}
 4042	}
 4043
 4044try_rps:
 4045
 4046	if (map) {
 4047		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4048		if (cpu_online(tcpu)) {
 4049			cpu = tcpu;
 4050			goto done;
 4051		}
 4052	}
 4053
 4054done:
 4055	return cpu;
 4056}
 4057
 4058#ifdef CONFIG_RFS_ACCEL
 4059
 4060/**
 4061 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4062 * @dev: Device on which the filter was set
 4063 * @rxq_index: RX queue index
 4064 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4065 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4066 *
 4067 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4068 * this function for each installed filter and remove the filters for
 4069 * which it returns %true.
 4070 */
 4071bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4072			 u32 flow_id, u16 filter_id)
 4073{
 4074	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4075	struct rps_dev_flow_table *flow_table;
 4076	struct rps_dev_flow *rflow;
 4077	bool expire = true;
 4078	unsigned int cpu;
 4079
 4080	rcu_read_lock();
 4081	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4082	if (flow_table && flow_id <= flow_table->mask) {
 4083		rflow = &flow_table->flows[flow_id];
 4084		cpu = READ_ONCE(rflow->cpu);
 4085		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4086		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4087			   rflow->last_qtail) <
 4088		     (int)(10 * flow_table->mask)))
 4089			expire = false;
 4090	}
 4091	rcu_read_unlock();
 4092	return expire;
 4093}
 4094EXPORT_SYMBOL(rps_may_expire_flow);
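
/*
 * Editor's note: an illustrative sketch (not part of this file) of the
 * periodic expiry scan that the kernel-doc above expects from drivers using
 * ndo_rx_flow_steer(). The example_rfs_filter table layout and helper name
 * are hypothetical; a real driver would also free the hardware filter.
 */
struct example_rfs_filter {
	bool in_use;
	u16 rxq_index;
	u16 filter_id;
};

static void __maybe_unused example_expire_rfs_filters(struct net_device *dev,
						       struct example_rfs_filter *tbl,
						       u32 nr_filters)
{
	u32 flow_id;

	for (flow_id = 0; flow_id < nr_filters; flow_id++) {
		if (!tbl[flow_id].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[flow_id].rxq_index,
					flow_id, tbl[flow_id].filter_id)) {
			/* Hardware filter removal would go here. */
			tbl[flow_id].in_use = false;
		}
	}
}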
 4095
 4096#endif /* CONFIG_RFS_ACCEL */
 4097
 4098/* Called from hardirq (IPI) context */
 4099static void rps_trigger_softirq(void *data)
 4100{
 4101	struct softnet_data *sd = data;
 4102
 4103	____napi_schedule(sd, &sd->backlog);
 4104	sd->received_rps++;
 4105}
 4106
 4107#endif /* CONFIG_RPS */
 4108
 4109/*
 4110 * Check if this softnet_data structure is another cpu one
 4111 * If yes, queue it to our IPI list and return 1
 4112 * If no, return 0
 4113 */
 4114static int rps_ipi_queued(struct softnet_data *sd)
 4115{
 4116#ifdef CONFIG_RPS
 4117	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4118
 4119	if (sd != mysd) {
 4120		sd->rps_ipi_next = mysd->rps_ipi_list;
 4121		mysd->rps_ipi_list = sd;
 4122
 4123		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4124		return 1;
 4125	}
 4126#endif /* CONFIG_RPS */
 4127	return 0;
 4128}
 4129
 4130#ifdef CONFIG_NET_FLOW_LIMIT
 4131int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 4132#endif
 4133
 4134static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4135{
 4136#ifdef CONFIG_NET_FLOW_LIMIT
 4137	struct sd_flow_limit *fl;
 4138	struct softnet_data *sd;
 4139	unsigned int old_flow, new_flow;
 4140
 4141	if (qlen < (netdev_max_backlog >> 1))
 4142		return false;
 4143
 4144	sd = this_cpu_ptr(&softnet_data);
 4145
 4146	rcu_read_lock();
 4147	fl = rcu_dereference(sd->flow_limit);
 4148	if (fl) {
 4149		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4150		old_flow = fl->history[fl->history_head];
 4151		fl->history[fl->history_head] = new_flow;
 4152
 4153		fl->history_head++;
 4154		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4155
 4156		if (likely(fl->buckets[old_flow]))
 4157			fl->buckets[old_flow]--;
 4158
 4159		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4160			fl->count++;
 4161			rcu_read_unlock();
 4162			return true;
 4163		}
 4164	}
 4165	rcu_read_unlock();
 4166#endif
 4167	return false;
 4168}
 4169
 4170/*
 4171 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4172 * queue (may be a remote CPU queue).
 4173 */
 4174static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4175			      unsigned int *qtail)
 4176{
 4177	struct softnet_data *sd;
 4178	unsigned long flags;
 4179	unsigned int qlen;
 4180
 4181	sd = &per_cpu(softnet_data, cpu);
 4182
 4183	local_irq_save(flags);
 4184
 4185	rps_lock(sd);
 4186	if (!netif_running(skb->dev))
 4187		goto drop;
 4188	qlen = skb_queue_len(&sd->input_pkt_queue);
 4189	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 4190		if (qlen) {
 4191enqueue:
 4192			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4193			input_queue_tail_incr_save(sd, qtail);
 4194			rps_unlock(sd);
 4195			local_irq_restore(flags);
 4196			return NET_RX_SUCCESS;
 4197		}
 4198
4199		/* Schedule NAPI for the backlog device.
4200		 * We can use a non-atomic operation since we own the queue lock.
 4201		 */
 4202		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
 4203			if (!rps_ipi_queued(sd))
 4204				____napi_schedule(sd, &sd->backlog);
 4205		}
 4206		goto enqueue;
 4207	}
 4208
 4209drop:
 4210	sd->dropped++;
 4211	rps_unlock(sd);
 4212
 4213	local_irq_restore(flags);
 4214
 4215	atomic_long_inc(&skb->dev->rx_dropped);
 4216	kfree_skb(skb);
 4217	return NET_RX_DROP;
 4218}
 4219
 4220static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4221{
 4222	struct net_device *dev = skb->dev;
 4223	struct netdev_rx_queue *rxqueue;
 4224
 4225	rxqueue = dev->_rx;
 4226
 4227	if (skb_rx_queue_recorded(skb)) {
 4228		u16 index = skb_get_rx_queue(skb);
 4229
 4230		if (unlikely(index >= dev->real_num_rx_queues)) {
 4231			WARN_ONCE(dev->real_num_rx_queues > 1,
 4232				  "%s received packet on queue %u, but number "
 4233				  "of RX queues is %u\n",
 4234				  dev->name, index, dev->real_num_rx_queues);
 4235
 4236			return rxqueue; /* Return first rxqueue */
 4237		}
 4238		rxqueue += index;
 4239	}
 4240	return rxqueue;
 4241}
 4242
 4243static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 4244				     struct xdp_buff *xdp,
 4245				     struct bpf_prog *xdp_prog)
 4246{
 4247	struct netdev_rx_queue *rxqueue;
 4248	void *orig_data, *orig_data_end;
 4249	u32 metalen, act = XDP_DROP;
 4250	__be16 orig_eth_type;
 4251	struct ethhdr *eth;
 4252	bool orig_bcast;
 4253	int hlen, off;
 4254	u32 mac_len;
 4255
 4256	/* Reinjected packets coming from act_mirred or similar should
 4257	 * not get XDP generic processing.
 4258	 */
 4259	if (skb_cloned(skb) || skb_is_tc_redirected(skb))
 4260		return XDP_PASS;
 4261
 4262	/* XDP packets must be linear and must have sufficient headroom
 4263	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 4264	 * native XDP provides, thus we need to do it here as well.
 4265	 */
 4266	if (skb_is_nonlinear(skb) ||
 4267	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4268		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4269		int troom = skb->tail + skb->data_len - skb->end;
 4270
 4271		/* In case we have to go down the path and also linearize,
4272		 * then let's do the pskb_expand_head() work just once here.
 4273		 */
 4274		if (pskb_expand_head(skb,
 4275				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4276				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 4277			goto do_drop;
 4278		if (skb_linearize(skb))
 4279			goto do_drop;
 4280	}
 4281
 4282	/* The XDP program wants to see the packet starting at the MAC
 4283	 * header.
 4284	 */
 4285	mac_len = skb->data - skb_mac_header(skb);
 4286	hlen = skb_headlen(skb) + mac_len;
 4287	xdp->data = skb->data - mac_len;
 4288	xdp->data_meta = xdp->data;
 4289	xdp->data_end = xdp->data + hlen;
 4290	xdp->data_hard_start = skb->data - skb_headroom(skb);
 4291	orig_data_end = xdp->data_end;
 4292	orig_data = xdp->data;
 4293	eth = (struct ethhdr *)xdp->data;
 4294	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4295	orig_eth_type = eth->h_proto;
 4296
 4297	rxqueue = netif_get_rxqueue(skb);
 4298	xdp->rxq = &rxqueue->xdp_rxq;
 4299
 4300	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4301
 4302	/* check if bpf_xdp_adjust_head was used */
 4303	off = xdp->data - orig_data;
 4304	if (off) {
 4305		if (off > 0)
 4306			__skb_pull(skb, off);
 4307		else if (off < 0)
 4308			__skb_push(skb, -off);
 4309
 4310		skb->mac_header += off;
 4311		skb_reset_network_header(skb);
 4312	}
 4313
4314	/* check if bpf_xdp_adjust_tail was used; it can only "shrink"
4315	 * the packet.
 4316	 */
 4317	off = orig_data_end - xdp->data_end;
 4318	if (off != 0) {
 4319		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4320		skb->len -= off;
 4321
 4322	}
 4323
4324	/* check if XDP changed the eth hdr such that the SKB needs an update */
 4325	eth = (struct ethhdr *)xdp->data;
 4326	if ((orig_eth_type != eth->h_proto) ||
 4327	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 4328		__skb_push(skb, ETH_HLEN);
 4329		skb->protocol = eth_type_trans(skb, skb->dev);
 4330	}
 4331
 4332	switch (act) {
 4333	case XDP_REDIRECT:
 4334	case XDP_TX:
 4335		__skb_push(skb, mac_len);
 4336		break;
 4337	case XDP_PASS:
 4338		metalen = xdp->data - xdp->data_meta;
 4339		if (metalen)
 4340			skb_metadata_set(skb, metalen);
 4341		break;
 4342	default:
 4343		bpf_warn_invalid_xdp_action(act);
 4344		/* fall through */
 4345	case XDP_ABORTED:
 4346		trace_xdp_exception(skb->dev, xdp_prog, act);
 4347		/* fall through */
 4348	case XDP_DROP:
 4349	do_drop:
 4350		kfree_skb(skb);
 4351		break;
 4352	}
 4353
 4354	return act;
 4355}
 4356
 4357/* When doing generic XDP we have to bypass the qdisc layer and the
 4358 * network taps in order to match in-driver-XDP behavior.
 4359 */
 4360void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4361{
 4362	struct net_device *dev = skb->dev;
 4363	struct netdev_queue *txq;
 4364	bool free_skb = true;
 4365	int cpu, rc;
 4366
 4367	txq = netdev_core_pick_tx(dev, skb, NULL);
 4368	cpu = smp_processor_id();
 4369	HARD_TX_LOCK(dev, txq, cpu);
 4370	if (!netif_xmit_stopped(txq)) {
 4371		rc = netdev_start_xmit(skb, dev, txq, 0);
 4372		if (dev_xmit_complete(rc))
 4373			free_skb = false;
 4374	}
 4375	HARD_TX_UNLOCK(dev, txq);
 4376	if (free_skb) {
 4377		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4378		kfree_skb(skb);
 4379	}
 4380}
 4381EXPORT_SYMBOL_GPL(generic_xdp_tx);
 4382
 4383static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4384
 4385int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4386{
 4387	if (xdp_prog) {
 4388		struct xdp_buff xdp;
 4389		u32 act;
 4390		int err;
 4391
 4392		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4393		if (act != XDP_PASS) {
 4394			switch (act) {
 4395			case XDP_REDIRECT:
 4396				err = xdp_do_generic_redirect(skb->dev, skb,
 4397							      &xdp, xdp_prog);
 4398				if (err)
 4399					goto out_redir;
 4400				break;
 4401			case XDP_TX:
 4402				generic_xdp_tx(skb, xdp_prog);
 4403				break;
 4404			}
 4405			return XDP_DROP;
 4406		}
 4407	}
 4408	return XDP_PASS;
 4409out_redir:
 4410	kfree_skb(skb);
 4411	return XDP_DROP;
 4412}
 4413EXPORT_SYMBOL_GPL(do_xdp_generic);
 4414
 4415static int netif_rx_internal(struct sk_buff *skb)
 4416{
 4417	int ret;
 4418
 4419	net_timestamp_check(netdev_tstamp_prequeue, skb);
 4420
 4421	trace_netif_rx(skb);
 4422
 4423#ifdef CONFIG_RPS
 4424	if (static_branch_unlikely(&rps_needed)) {
 4425		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4426		int cpu;
 4427
 4428		preempt_disable();
 4429		rcu_read_lock();
 4430
 4431		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4432		if (cpu < 0)
 4433			cpu = smp_processor_id();
 4434
 4435		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4436
 4437		rcu_read_unlock();
 4438		preempt_enable();
 4439	} else
 4440#endif
 4441	{
 4442		unsigned int qtail;
 4443
 4444		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 4445		put_cpu();
 4446	}
 4447	return ret;
 4448}
 4449
 4450/**
 4451 *	netif_rx	-	post buffer to the network code
 4452 *	@skb: buffer to post
 4453 *
 4454 *	This function receives a packet from a device driver and queues it for
 4455 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 4456 *	may be dropped during processing for congestion control or by the
 4457 *	protocol layers.
 4458 *
 4459 *	return values:
 4460 *	NET_RX_SUCCESS	(no congestion)
 4461 *	NET_RX_DROP     (packet was dropped)
 4462 *
 4463 */
 4464
 4465int netif_rx(struct sk_buff *skb)
 4466{
 4467	int ret;
 4468
 4469	trace_netif_rx_entry(skb);
 4470
 4471	ret = netif_rx_internal(skb);
 4472	trace_netif_rx_exit(ret);
 4473
 4474	return ret;
 4475}
 4476EXPORT_SYMBOL(netif_rx);
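
/*
 * Editor's note: an illustrative sketch (not part of this file) of a driver
 * receive path handing a frame to netif_rx(), as described in the kernel-doc
 * above. The helper name is hypothetical and statistics/error accounting is
 * omitted for brevity.
 */
static void __maybe_unused example_deliver_rx(struct net_device *dev,
					      const void *frame,
					      unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb)
		return;

	skb_put_data(skb, frame, len);
	skb->protocol = eth_type_trans(skb, dev);

	/* netif_rx() always consumes the skb; it may still be dropped later. */
	netif_rx(skb);
}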
 4477
 4478int netif_rx_ni(struct sk_buff *skb)
 4479{
 4480	int err;
 4481
 4482	trace_netif_rx_ni_entry(skb);
 4483
 4484	preempt_disable();
 4485	err = netif_rx_internal(skb);
 4486	if (local_softirq_pending())
 4487		do_softirq();
 4488	preempt_enable();
 4489	trace_netif_rx_ni_exit(err);
 4490
 4491	return err;
 4492}
 4493EXPORT_SYMBOL(netif_rx_ni);
 4494
 4495static __latent_entropy void net_tx_action(struct softirq_action *h)
 4496{
 4497	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 4498
 4499	if (sd->completion_queue) {
 4500		struct sk_buff *clist;
 4501
 4502		local_irq_disable();
 4503		clist = sd->completion_queue;
 4504		sd->completion_queue = NULL;
 4505		local_irq_enable();
 4506
 4507		while (clist) {
 4508			struct sk_buff *skb = clist;
 4509
 4510			clist = clist->next;
 4511
 4512			WARN_ON(refcount_read(&skb->users));
 4513			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
 4514				trace_consume_skb(skb);
 4515			else
 4516				trace_kfree_skb(skb, net_tx_action);
 4517
 4518			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 4519				__kfree_skb(skb);
 4520			else
 4521				__kfree_skb_defer(skb);
 4522		}
 4523
 4524		__kfree_skb_flush();
 4525	}
 4526
 4527	if (sd->output_queue) {
 4528		struct Qdisc *head;
 4529
 4530		local_irq_disable();
 4531		head = sd->output_queue;
 4532		sd->output_queue = NULL;
 4533		sd->output_queue_tailp = &sd->output_queue;
 4534		local_irq_enable();
 4535
 4536		while (head) {
 4537			struct Qdisc *q = head;
 4538			spinlock_t *root_lock = NULL;
 4539
 4540			head = head->next_sched;
 4541
 4542			if (!(q->flags & TCQ_F_NOLOCK)) {
 4543				root_lock = qdisc_lock(q);
 4544				spin_lock(root_lock);
 4545			}
 4546			/* We need to make sure head->next_sched is read
 4547			 * before clearing __QDISC_STATE_SCHED
 4548			 */
 4549			smp_mb__before_atomic();
 4550			clear_bit(__QDISC_STATE_SCHED, &q->state);
 4551			qdisc_run(q);
 4552			if (root_lock)
 4553				spin_unlock(root_lock);
 4554		}
 4555	}
 4556
 4557	xfrm_dev_backlog(sd);
 4558}
 4559
 4560#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
 4561/* This hook is defined here for ATM LANE */
 4562int (*br_fdb_test_addr_hook)(struct net_device *dev,
 4563			     unsigned char *addr) __read_mostly;
 4564EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 4565#endif
 4566
 4567static inline struct sk_buff *
 4568sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4569		   struct net_device *orig_dev)
 4570{
 4571#ifdef CONFIG_NET_CLS_ACT
 4572	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
 4573	struct tcf_result cl_res;
 4574
 4575	/* If there's at least one ingress present somewhere (so
 4576	 * we get here via enabled static key), remaining devices
 4577	 * that are not configured with an ingress qdisc will bail
 4578	 * out here.
 4579	 */
 4580	if (!miniq)
 4581		return skb;
 4582
 4583	if (*pt_prev) {
 4584		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4585		*pt_prev = NULL;
 4586	}
 4587
 4588	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4589	skb->tc_at_ingress = 1;
 4590	mini_qdisc_bstats_cpu_update(miniq, skb);
 4591
 4592	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 4593	case TC_ACT_OK:
 4594	case TC_ACT_RECLASSIFY:
 4595		skb->tc_index = TC_H_MIN(cl_res.classid);
 4596		break;
 4597	case TC_ACT_SHOT:
 4598		mini_qdisc_qstats_cpu_drop(miniq);
 4599		kfree_skb(skb);
 4600		return NULL;
 4601	case TC_ACT_STOLEN:
 4602	case TC_ACT_QUEUED:
 4603	case TC_ACT_TRAP:
 4604		consume_skb(skb);
 4605		return NULL;
 4606	case TC_ACT_REDIRECT:
 4607		/* skb_mac_header check was done by cls/act_bpf, so
 4608		 * we can safely push the L2 header back before
 4609		 * redirecting to another netdev
 4610		 */
 4611		__skb_push(skb, skb->mac_len);
 4612		skb_do_redirect(skb);
 4613		return NULL;
 4614	case TC_ACT_CONSUMED:
 4615		return NULL;
 4616	default:
 4617		break;
 4618	}
 4619#endif /* CONFIG_NET_CLS_ACT */
 4620	return skb;
 4621}
 4622
 4623/**
 4624 *	netdev_is_rx_handler_busy - check if receive handler is registered
 4625 *	@dev: device to check
 4626 *
 4627 *	Check if a receive handler is already registered for a given device.
4628 *	Return true if there is one.
 4629 *
 4630 *	The caller must hold the rtnl_mutex.
 4631 */
 4632bool netdev_is_rx_handler_busy(struct net_device *dev)
 4633{
 4634	ASSERT_RTNL();
 4635	return dev && rtnl_dereference(dev->rx_handler);
 4636}
 4637EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 4638
 4639/**
 4640 *	netdev_rx_handler_register - register receive handler
 4641 *	@dev: device to register a handler for
 4642 *	@rx_handler: receive handler to register
 4643 *	@rx_handler_data: data pointer that is used by rx handler
 4644 *
 4645 *	Register a receive handler for a device. This handler will then be
 4646 *	called from __netif_receive_skb. A negative errno code is returned
 4647 *	on a failure.
 4648 *
 4649 *	The caller must hold the rtnl_mutex.
 4650 *
 4651 *	For a general description of rx_handler, see enum rx_handler_result.
 4652 */
 4653int netdev_rx_handler_register(struct net_device *dev,
 4654			       rx_handler_func_t *rx_handler,
 4655			       void *rx_handler_data)
 4656{
 4657	if (netdev_is_rx_handler_busy(dev))
 4658		return -EBUSY;
 4659
 4660	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 4661		return -EINVAL;
 4662
 4663	/* Note: rx_handler_data must be set before rx_handler */
 4664	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 4665	rcu_assign_pointer(dev->rx_handler, rx_handler);
 4666
 4667	return 0;
 4668}
 4669EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
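
/*
 * Editor's note: an illustrative sketch (not part of this file) of a minimal
 * rx_handler and its registration under rtnl_lock(), as the kernel-doc above
 * requires. The handler and helper names are hypothetical; bridge/bond-style
 * users stash their per-port state in rx_handler_data instead of NULL.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	/* Let every frame continue up the stack unchanged. */
	return RX_HANDLER_PASS;
}

static int __maybe_unused example_attach_rx_handler(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_handle_frame, NULL);
	rtnl_unlock();

	return err;
}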
 4670
 4671/**
 4672 *	netdev_rx_handler_unregister - unregister receive handler
 4673 *	@dev: device to unregister a handler from
 4674 *
 4675 *	Unregister a receive handler from a device.
 4676 *
 4677 *	The caller must hold the rtnl_mutex.
 4678 */
 4679void netdev_rx_handler_unregister(struct net_device *dev)
 4680{
 4681
 4682	ASSERT_RTNL();
 4683	RCU_INIT_POINTER(dev->rx_handler, NULL);
4684	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4685	 * section is guaranteed to see a non-NULL rx_handler_data
4686	 * as well.
 4687	 */
 4688	synchronize_net();
 4689	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 4690}
 4691EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 4692
 4693/*
 4694 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 4695 * the special handling of PFMEMALLOC skbs.
 4696 */
 4697static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 4698{
 4699	switch (skb->protocol) {
 4700	case htons(ETH_P_ARP):
 4701	case htons(ETH_P_IP):
 4702	case htons(ETH_P_IPV6):
 4703	case htons(ETH_P_8021Q):
 4704	case htons(ETH_P_8021AD):
 4705		return true;
 4706	default:
 4707		return false;
 4708	}
 4709}
 4710
 4711static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 4712			     int *ret, struct net_device *orig_dev)
 4713{
 4714#ifdef CONFIG_NETFILTER_INGRESS
 4715	if (nf_hook_ingress_active(skb)) {
 4716		int ingress_retval;
 4717
 4718		if (*pt_prev) {
 4719			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4720			*pt_prev = NULL;
 4721		}
 4722
 4723		rcu_read_lock();
 4724		ingress_retval = nf_hook_ingress(skb);
 4725		rcu_read_unlock();
 4726		return ingress_retval;
 4727	}
 4728#endif /* CONFIG_NETFILTER_INGRESS */
 4729	return 0;
 4730}
 4731
 4732static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
 4733				    struct packet_type **ppt_prev)
 4734{
 4735	struct packet_type *ptype, *pt_prev;
 4736	rx_handler_func_t *rx_handler;
 4737	struct net_device *orig_dev;
 4738	bool deliver_exact = false;
 4739	int ret = NET_RX_DROP;
 4740	__be16 type;
 4741
 4742	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 4743
 4744	trace_netif_receive_skb(skb);
 4745
 4746	orig_dev = skb->dev;
 4747
 4748	skb_reset_network_header(skb);
 4749	if (!skb_transport_header_was_set(skb))
 4750		skb_reset_transport_header(skb);
 4751	skb_reset_mac_len(skb);
 4752
 4753	pt_prev = NULL;
 4754
 4755another_round:
 4756	skb->skb_iif = skb->dev->ifindex;
 4757
 4758	__this_cpu_inc(softnet_data.processed);
 4759
 4760	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 4761		int ret2;
 4762
 4763		preempt_disable();
 4764		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 4765		preempt_enable();
 4766
 4767		if (ret2 != XDP_PASS)
 4768			return NET_RX_DROP;
 4769		skb_reset_mac_len(skb);
 4770	}
 4771
 4772	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 4773	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 4774		skb = skb_vlan_untag(skb);
 4775		if (unlikely(!skb))
 4776			goto out;
 4777	}
 4778
 4779	if (skb_skip_tc_classify(skb))
 4780		goto skip_classify;
 4781
 4782	if (pfmemalloc)
 4783		goto skip_taps;
 4784
 4785	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 4786		if (pt_prev)
 4787			ret = deliver_skb(skb, pt_prev, orig_dev);
 4788		pt_prev = ptype;
 4789	}
 4790
 4791	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 4792		if (pt_prev)
 4793			ret = deliver_skb(skb, pt_prev, orig_dev);
 4794		pt_prev = ptype;
 4795	}
 4796
 4797skip_taps:
 4798#ifdef CONFIG_NET_INGRESS
 4799	if (static_branch_unlikely(&ingress_needed_key)) {
 4800		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 4801		if (!skb)
 4802			goto out;
 4803
 4804		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 4805			goto out;
 4806	}
 4807#endif
 4808	skb_reset_tc(skb);
 4809skip_classify:
 4810	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 4811		goto drop;
 4812
 4813	if (skb_vlan_tag_present(skb)) {
 4814		if (pt_prev) {
 4815			ret = deliver_skb(skb, pt_prev, orig_dev);
 4816			pt_prev = NULL;
 4817		}
 4818		if (vlan_do_receive(&skb))
 4819			goto another_round;
 4820		else if (unlikely(!skb))
 4821			goto out;
 4822	}
 4823
 4824	rx_handler = rcu_dereference(skb->dev->rx_handler);
 4825	if (rx_handler) {
 4826		if (pt_prev) {
 4827			ret = deliver_skb(skb, pt_prev, orig_dev);
 4828			pt_prev = NULL;
 4829		}
 4830		switch (rx_handler(&skb)) {
 4831		case RX_HANDLER_CONSUMED:
 4832			ret = NET_RX_SUCCESS;
 4833			goto out;
 4834		case RX_HANDLER_ANOTHER:
 4835			goto another_round;
 4836		case RX_HANDLER_EXACT:
 4837			deliver_exact = true;
 4838		case RX_HANDLER_PASS:
 4839			break;
 4840		default:
 4841			BUG();
 4842		}
 4843	}
 4844
 4845	if (unlikely(skb_vlan_tag_present(skb))) {
 4846check_vlan_id:
 4847		if (skb_vlan_tag_get_id(skb)) {
4848			/* The vlan id is non-zero and vlan_do_receive() above couldn't
4849			 * find the vlan device.
 4850			 */
 4851			skb->pkt_type = PACKET_OTHERHOST;
 4852		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 4853			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 4854			/* Outer header is 802.1P with vlan 0, inner header is
 4855			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 4856			 * not find vlan dev for vlan id 0.
 4857			 */
 4858			__vlan_hwaccel_clear_tag(skb);
 4859			skb = skb_vlan_untag(skb);
 4860			if (unlikely(!skb))
 4861				goto out;
 4862			if (vlan_do_receive(&skb))
4863				/* After stripping off the 802.1P header with vlan 0,
4864				 * a vlan dev was found for the inner header.
4865				 */
 4866				goto another_round;
 4867			else if (unlikely(!skb))
 4868				goto out;
 4869			else
4870				/* We have stripped the outer 802.1P vlan 0 header
4871				 * but could not find a vlan dev.
4872				 * Check the vlan id again to set OTHERHOST.
4873				 */
 4874				goto check_vlan_id;
 4875		}
 4876		/* Note: we might in the future use prio bits
4877		 * and set skb->priority like in vlan_do_receive().
4878		 * For the time being, just ignore the Priority Code Point.
 4879		 */
 4880		__vlan_hwaccel_clear_tag(skb);
 4881	}
 4882
 4883	type = skb->protocol;
 4884
 4885	/* deliver only exact match when indicated */
 4886	if (likely(!deliver_exact)) {
 4887		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 4888				       &ptype_base[ntohs(type) &
 4889						   PTYPE_HASH_MASK]);
 4890	}
 4891
 4892	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 4893			       &orig_dev->ptype_specific);
 4894
 4895	if (unlikely(skb->dev != orig_dev)) {
 4896		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 4897				       &skb->dev->ptype_specific);
 4898	}
 4899
 4900	if (pt_prev) {
 4901		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 4902			goto drop;
 4903		*ppt_prev = pt_prev;
 4904	} else {
 4905drop:
 4906		if (!deliver_exact)
 4907			atomic_long_inc(&skb->dev->rx_dropped);
 4908		else
 4909			atomic_long_inc(&skb->dev->rx_nohandler);
 4910		kfree_skb(skb);
4911		/* Jamal, now you will not be able to escape explaining
4912		 * to me how you were going to use this. :-)
 4913		 */
 4914		ret = NET_RX_DROP;
 4915	}
 4916
 4917out:
 4918	return ret;
 4919}
 4920
 4921static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 4922{
 4923	struct net_device *orig_dev = skb->dev;
 4924	struct packet_type *pt_prev = NULL;
 4925	int ret;
 4926
 4927	ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
 4928	if (pt_prev)
 4929		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 4930					 skb->dev, pt_prev, orig_dev);
 4931	return ret;
 4932}
 4933
 4934/**
 4935 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 4936 *	@skb: buffer to process
 4937 *
 4938 *	More direct receive version of netif_receive_skb().  It should
 4939 *	only be used by callers that have a need to skip RPS and Generic XDP.
 4940 *	Caller must also take care of handling if (page_is_)pfmemalloc.
 4941 *
 4942 *	This function may only be called from softirq context and interrupts
 4943 *	should be enabled.
 4944 *
 4945 *	Return values (usually ignored):
 4946 *	NET_RX_SUCCESS: no congestion
 4947 *	NET_RX_DROP: packet was dropped
 4948 */
 4949int netif_receive_skb_core(struct sk_buff *skb)
 4950{
 4951	int ret;
 4952
 4953	rcu_read_lock();
 4954	ret = __netif_receive_skb_one_core(skb, false);
 4955	rcu_read_unlock();
 4956
 4957	return ret;
 4958}
 4959EXPORT_SYMBOL(netif_receive_skb_core);
 4960
 4961static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 4962						  struct packet_type *pt_prev,
 4963						  struct net_device *orig_dev)
 4964{
 4965	struct sk_buff *skb, *next;
 4966
 4967	if (!pt_prev)
 4968		return;
 4969	if (list_empty(head))
 4970		return;
 4971	if (pt_prev->list_func != NULL)
 4972		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 4973				   ip_list_rcv, head, pt_prev, orig_dev);
 4974	else
 4975		list_for_each_entry_safe(skb, next, head, list) {
 4976			skb_list_del_init(skb);
 4977			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 4978		}
 4979}
 4980
 4981static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 4982{
 4983	/* Fast-path assumptions:
 4984	 * - There is no RX handler.
 4985	 * - Only one packet_type matches.
 4986	 * If either of these fails, we will end up doing some per-packet
 4987	 * processing in-line, then handling the 'last ptype' for the whole
 4988	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 4989	 * because the 'last ptype' must be constant across the sublist, and all
 4990	 * other ptypes are handled per-packet.
 4991	 */
 4992	/* Current (common) ptype of sublist */
 4993	struct packet_type *pt_curr = NULL;
 4994	/* Current (common) orig_dev of sublist */
 4995	struct net_device *od_curr = NULL;
 4996	struct list_head sublist;
 4997	struct sk_buff *skb, *next;
 4998
 4999	INIT_LIST_HEAD(&sublist);
 5000	list_for_each_entry_safe(skb, next, head, list) {
 5001		struct net_device *orig_dev = skb->dev;
 5002		struct packet_type *pt_prev = NULL;
 5003
 5004		skb_list_del_init(skb);
 5005		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
 5006		if (!pt_prev)
 5007			continue;
 5008		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5009			/* dispatch old sublist */
 5010			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5011			/* start new sublist */
 5012			INIT_LIST_HEAD(&sublist);
 5013			pt_curr = pt_prev;
 5014			od_curr = orig_dev;
 5015		}
 5016		list_add_tail(&skb->list, &sublist);
 5017	}
 5018
 5019	/* dispatch final sublist */
 5020	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5021}
 5022
 5023static int __netif_receive_skb(struct sk_buff *skb)
 5024{
 5025	int ret;
 5026
 5027	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5028		unsigned int noreclaim_flag;
 5029
 5030		/*
 5031		 * PFMEMALLOC skbs are special, they should
 5032		 * - be delivered to SOCK_MEMALLOC sockets only
 5033		 * - stay away from userspace
 5034		 * - have bounded memory usage
 5035		 *
 5036		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5037		 * context down to all allocation sites.
 5038		 */
 5039		noreclaim_flag = memalloc_noreclaim_save();
 5040		ret = __netif_receive_skb_one_core(skb, true);
 5041		memalloc_noreclaim_restore(noreclaim_flag);
 5042	} else
 5043		ret = __netif_receive_skb_one_core(skb, false);
 5044
 5045	return ret;
 5046}
 5047
 5048static void __netif_receive_skb_list(struct list_head *head)
 5049{
 5050	unsigned long noreclaim_flag = 0;
 5051	struct sk_buff *skb, *next;
 5052	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5053
 5054	list_for_each_entry_safe(skb, next, head, list) {
 5055		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5056			struct list_head sublist;
 5057
 5058			/* Handle the previous sublist */
 5059			list_cut_before(&sublist, head, &skb->list);
 5060			if (!list_empty(&sublist))
 5061				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5062			pfmemalloc = !pfmemalloc;
 5063			/* See comments in __netif_receive_skb */
 5064			if (pfmemalloc)
 5065				noreclaim_flag = memalloc_noreclaim_save();
 5066			else
 5067				memalloc_noreclaim_restore(noreclaim_flag);
 5068		}
 5069	}
 5070	/* Handle the remaining sublist */
 5071	if (!list_empty(head))
 5072		__netif_receive_skb_list_core(head, pfmemalloc);
 5073	/* Restore pflags */
 5074	if (pfmemalloc)
 5075		memalloc_noreclaim_restore(noreclaim_flag);
 5076}
 5077
 5078static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5079{
 5080	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5081	struct bpf_prog *new = xdp->prog;
 5082	int ret = 0;
 5083
 5084	switch (xdp->command) {
 5085	case XDP_SETUP_PROG:
 5086		rcu_assign_pointer(dev->xdp_prog, new);
 5087		if (old)
 5088			bpf_prog_put(old);
 5089
 5090		if (old && !new) {
 5091			static_branch_dec(&generic_xdp_needed_key);
 5092		} else if (new && !old) {
 5093			static_branch_inc(&generic_xdp_needed_key);
 5094			dev_disable_lro(dev);
 5095			dev_disable_gro_hw(dev);
 5096		}
 5097		break;
 5098
 5099	case XDP_QUERY_PROG:
 5100		xdp->prog_id = old ? old->aux->id : 0;
 5101		break;
 5102
 5103	default:
 5104		ret = -EINVAL;
 5105		break;
 5106	}
 5107
 5108	return ret;
 5109}
 5110
 5111static int netif_receive_skb_internal(struct sk_buff *skb)
 5112{
 5113	int ret;
 5114
 5115	net_timestamp_check(netdev_tstamp_prequeue, skb);
 5116
 5117	if (skb_defer_rx_timestamp(skb))
 5118		return NET_RX_SUCCESS;
 5119
 5120	rcu_read_lock();
 5121#ifdef CONFIG_RPS
 5122	if (static_branch_unlikely(&rps_needed)) {
 5123		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5124		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5125
 5126		if (cpu >= 0) {
 5127			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5128			rcu_read_unlock();
 5129			return ret;
 5130		}
 5131	}
 5132#endif
 5133	ret = __netif_receive_skb(skb);
 5134	rcu_read_unlock();
 5135	return ret;
 5136}
 5137
 5138static void netif_receive_skb_list_internal(struct list_head *head)
 5139{
 5140	struct sk_buff *skb, *next;
 5141	struct list_head sublist;
 5142
 5143	INIT_LIST_HEAD(&sublist);
 5144	list_for_each_entry_safe(skb, next, head, list) {
 5145		net_timestamp_check(netdev_tstamp_prequeue, skb);
 5146		skb_list_del_init(skb);
 5147		if (!skb_defer_rx_timestamp(skb))
 5148			list_add_tail(&skb->list, &sublist);
 5149	}
 5150	list_splice_init(&sublist, head);
 5151
 5152	rcu_read_lock();
 5153#ifdef CONFIG_RPS
 5154	if (static_branch_unlikely(&rps_needed)) {
 5155		list_for_each_entry_safe(skb, next, head, list) {
 5156			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5157			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5158
 5159			if (cpu >= 0) {
 5160				/* Will be handled, remove from list */
 5161				skb_list_del_init(skb);
 5162				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5163			}
 5164		}
 5165	}
 5166#endif
 5167	__netif_receive_skb_list(head);
 5168	rcu_read_unlock();
 5169}
 5170
 5171/**
 5172 *	netif_receive_skb - process receive buffer from network
 5173 *	@skb: buffer to process
 5174 *
 5175 *	netif_receive_skb() is the main receive data processing function.
 5176 *	It always succeeds. The buffer may be dropped during processing
 5177 *	for congestion control or by the protocol layers.
 5178 *
 5179 *	This function may only be called from softirq context and interrupts
 5180 *	should be enabled.
 5181 *
 5182 *	Return values (usually ignored):
 5183 *	NET_RX_SUCCESS: no congestion
 5184 *	NET_RX_DROP: packet was dropped
 5185 */
 5186int netif_receive_skb(struct sk_buff *skb)
 5187{
 5188	int ret;
 5189
 5190	trace_netif_receive_skb_entry(skb);
 5191
 5192	ret = netif_receive_skb_internal(skb);
 5193	trace_netif_receive_skb_exit(ret);
 5194
 5195	return ret;
 5196}
 5197EXPORT_SYMBOL(netif_receive_skb);
 5198
 5199/**
 5200 *	netif_receive_skb_list - process many receive buffers from network
 5201 *	@head: list of skbs to process.
 5202 *
 5203 *	Since return value of netif_receive_skb() is normally ignored, and
 5204 *	wouldn't be meaningful for a list, this function returns void.
 5205 *
 5206 *	This function may only be called from softirq context and interrupts
 5207 *	should be enabled.
 5208 */
 5209void netif_receive_skb_list(struct list_head *head)
 5210{
 5211	struct sk_buff *skb;
 5212
 5213	if (list_empty(head))
 5214		return;
 5215	if (trace_netif_receive_skb_list_entry_enabled()) {
 5216		list_for_each_entry(skb, head, list)
 5217			trace_netif_receive_skb_list_entry(skb);
 5218	}
 5219	netif_receive_skb_list_internal(head);
 5220	trace_netif_receive_skb_list_exit(0);
 5221}
 5222EXPORT_SYMBOL(netif_receive_skb_list);
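
/*
 * Editor's note: an illustrative sketch (not part of this file) of batching
 * frames and delivering them with netif_receive_skb_list(), which amortizes
 * per-packet overhead compared with calling netif_receive_skb() in a loop.
 * The helper name is hypothetical; it must run in softirq context with
 * interrupts enabled, like netif_receive_skb() itself.
 */
static void __maybe_unused example_deliver_batch(struct sk_buff **skbs,
						 unsigned int count)
{
	LIST_HEAD(batch);
	unsigned int i;

	for (i = 0; i < count; i++)
		list_add_tail(&skbs[i]->list, &batch);

	netif_receive_skb_list(&batch);
}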
 5223
 5224DEFINE_PER_CPU(struct work_struct, flush_works);
 5225
 5226/* Network device is going away, flush any packets still pending */
 5227static void flush_backlog(struct work_struct *work)
 5228{
 5229	struct sk_buff *skb, *tmp;
 5230	struct softnet_data *sd;
 5231
 5232	local_bh_disable();
 5233	sd = this_cpu_ptr(&softnet_data);
 5234
 5235	local_irq_disable();
 5236	rps_lock(sd);
 5237	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5238		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5239			__skb_unlink(skb, &sd->input_pkt_queue);
 5240			kfree_skb(skb);
 5241			input_queue_head_incr(sd);
 5242		}
 5243	}
 5244	rps_unlock(sd);
 5245	local_irq_enable();
 5246
 5247	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5248		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5249			__skb_unlink(skb, &sd->process_queue);
 5250			kfree_skb(skb);
 5251			input_queue_head_incr(sd);
 5252		}
 5253	}
 5254	local_bh_enable();
 5255}
 5256
 5257static void flush_all_backlogs(void)
 5258{
 5259	unsigned int cpu;
 5260
 5261	get_online_cpus();
 5262
 5263	for_each_online_cpu(cpu)
 5264		queue_work_on(cpu, system_highpri_wq,
 5265			      per_cpu_ptr(&flush_works, cpu));
 5266
 5267	for_each_online_cpu(cpu)
 5268		flush_work(per_cpu_ptr(&flush_works, cpu));
 5269
 5270	put_online_cpus();
 5271}
 5272
 5273INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 5274INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
 5275static int napi_gro_complete(struct sk_buff *skb)
 5276{
 5277	struct packet_offload *ptype;
 5278	__be16 type = skb->protocol;
 5279	struct list_head *head = &offload_base;
 5280	int err = -ENOENT;
 5281
 5282	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 5283
 5284	if (NAPI_GRO_CB(skb)->count == 1) {
 5285		skb_shinfo(skb)->gso_size = 0;
 5286		goto out;
 5287	}
 5288
 5289	rcu_read_lock();
 5290	list_for_each_entry_rcu(ptype, head, list) {
 5291		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5292			continue;
 5293
 5294		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
 5295					 ipv6_gro_complete, inet_gro_complete,
 5296					 skb, 0);
 5297		break;
 5298	}
 5299	rcu_read_unlock();
 5300
 5301	if (err) {
 5302		WARN_ON(&ptype->list == head);
 5303		kfree_skb(skb);
 5304		return NET_RX_SUCCESS;
 5305	}
 5306
 5307out:
 5308	return netif_receive_skb_internal(skb);
 5309}
 5310
 5311static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 5312				   bool flush_old)
 5313{
 5314	struct list_head *head = &napi->gro_hash[index].list;
 5315	struct sk_buff *skb, *p;
 5316
 5317	list_for_each_entry_safe_reverse(skb, p, head, list) {
 5318		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 5319			return;
 5320		skb_list_del_init(skb);
 5321		napi_gro_complete(skb);
 5322		napi->gro_hash[index].count--;
 5323	}
 5324
 5325	if (!napi->gro_hash[index].count)
 5326		__clear_bit(index, &napi->gro_bitmask);
 5327}
 5328
5329/* napi->gro_hash[].list contains packets ordered by age,
5330 * with the youngest packets at the head of it.
 5331 * Complete skbs in reverse order to reduce latencies.
 5332 */
 5333void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 5334{
 5335	unsigned long bitmask = napi->gro_bitmask;
 5336	unsigned int i, base = ~0U;
 5337
 5338	while ((i = ffs(bitmask)) != 0) {
 5339		bitmask >>= i;
 5340		base += i;
 5341		__napi_gro_flush_chain(napi, base, flush_old);
 5342	}
 5343}
 5344EXPORT_SYMBOL(napi_gro_flush);
 5345
 5346static struct list_head *gro_list_prepare(struct napi_struct *napi,
 5347					  struct sk_buff *skb)
 5348{
 5349	unsigned int maclen = skb->dev->hard_header_len;
 5350	u32 hash = skb_get_hash_raw(skb);
 5351	struct list_head *head;
 5352	struct sk_buff *p;
 5353
 5354	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 5355	list_for_each_entry(p, head, list) {
 5356		unsigned long diffs;
 5357
 5358		NAPI_GRO_CB(p)->flush = 0;
 5359
 5360		if (hash != skb_get_hash_raw(p)) {
 5361			NAPI_GRO_CB(p)->same_flow = 0;
 5362			continue;
 5363		}
 5364
 5365		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 5366		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
 5367		if (skb_vlan_tag_present(p))
 5368			diffs |= p->vlan_tci ^ skb->vlan_tci;
 5369		diffs |= skb_metadata_dst_cmp(p, skb);
 5370		diffs |= skb_metadata_differs(p, skb);
 5371		if (maclen == ETH_HLEN)
 5372			diffs |= compare_ether_header(skb_mac_header(p),
 5373						      skb_mac_header(skb));
 5374		else if (!diffs)
 5375			diffs = memcmp(skb_mac_header(p),
 5376				       skb_mac_header(skb),
 5377				       maclen);
 5378		NAPI_GRO_CB(p)->same_flow = !diffs;
 5379	}
 5380
 5381	return head;
 5382}
 5383
 5384static void skb_gro_reset_offset(struct sk_buff *skb)
 5385{
 5386	const struct skb_shared_info *pinfo = skb_shinfo(skb);
 5387	const skb_frag_t *frag0 = &pinfo->frags[0];
 5388
 5389	NAPI_GRO_CB(skb)->data_offset = 0;
 5390	NAPI_GRO_CB(skb)->frag0 = NULL;
 5391	NAPI_GRO_CB(skb)->frag0_len = 0;
 5392
 5393	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
 5394	    pinfo->nr_frags &&
 5395	    !PageHighMem(skb_frag_page(frag0))) {
 5396		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 5397		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
 5398						    skb_frag_size(frag0),
 5399						    skb->end - skb->tail);
 5400	}
 5401}
 5402
 5403static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 5404{
 5405	struct skb_shared_info *pinfo = skb_shinfo(skb);
 5406
 5407	BUG_ON(skb->end - skb->tail < grow);
 5408
 5409	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
 5410
 5411	skb->data_len -= grow;
 5412	skb->tail += grow;
 5413
 5414	skb_frag_off_add(&pinfo->frags[0], grow);
 5415	skb_frag_size_sub(&pinfo->frags[0], grow);
 5416
 5417	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
 5418		skb_frag_unref(skb, 0);
 5419		memmove(pinfo->frags, pinfo->frags + 1,
 5420			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
 5421	}
 5422}
 5423
 5424static void gro_flush_oldest(struct list_head *head)
 5425{
 5426	struct sk_buff *oldest;
 5427
 5428	oldest = list_last_entry(head, struct sk_buff, list);
 5429
 5430	/* We are called with head length >= MAX_GRO_SKBS, so this is
 5431	 * impossible.
 5432	 */
 5433	if (WARN_ON_ONCE(!oldest))
 5434		return;
 5435
 5436	/* Do not adjust napi->gro_hash[].count, caller is adding a new
 5437	 * SKB to the chain.
 5438	 */
 5439	skb_list_del_init(oldest);
 5440	napi_gro_complete(oldest);
 5441}
 5442
 5443INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 5444							   struct sk_buff *));
 5445INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 5446							   struct sk_buff *));
 5447static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5448{
 5449	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 5450	struct list_head *head = &offload_base;
 5451	struct packet_offload *ptype;
 5452	__be16 type = skb->protocol;
 5453	struct list_head *gro_head;
 5454	struct sk_buff *pp = NULL;
 5455	enum gro_result ret;
 5456	int same_flow;
 5457	int grow;
 5458
 5459	if (netif_elide_gro(skb->dev))
 5460		goto normal;
 5461
 5462	gro_head = gro_list_prepare(napi, skb);
 5463
 5464	rcu_read_lock();
 5465	list_for_each_entry_rcu(ptype, head, list) {
 5466		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5467			continue;
 5468
 5469		skb_set_network_header(skb, skb_gro_offset(skb));
 5470		skb_reset_mac_len(skb);
 5471		NAPI_GRO_CB(skb)->same_flow = 0;
 5472		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
 5473		NAPI_GRO_CB(skb)->free = 0;
 5474		NAPI_GRO_CB(skb)->encap_mark = 0;
 5475		NAPI_GRO_CB(skb)->recursion_counter = 0;
 5476		NAPI_GRO_CB(skb)->is_fou = 0;
 5477		NAPI_GRO_CB(skb)->is_atomic = 1;
 5478		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 5479
 5480		/* Setup for GRO checksum validation */
 5481		switch (skb->ip_summed) {
 5482		case CHECKSUM_COMPLETE:
 5483			NAPI_GRO_CB(skb)->csum = skb->csum;
 5484			NAPI_GRO_CB(skb)->csum_valid = 1;
 5485			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5486			break;
 5487		case CHECKSUM_UNNECESSARY:
 5488			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
 5489			NAPI_GRO_CB(skb)->csum_valid = 0;
 5490			break;
 5491		default:
 5492			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5493			NAPI_GRO_CB(skb)->csum_valid = 0;
 5494		}
 5495
 5496		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
 5497					ipv6_gro_receive, inet_gro_receive,
 5498					gro_head, skb);
 5499		break;
 5500	}
 5501	rcu_read_unlock();
 5502
 5503	if (&ptype->list == head)
 5504		goto normal;
 5505
 5506	if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
 5507		ret = GRO_CONSUMED;
 5508		goto ok;
 5509	}
 5510
 5511	same_flow = NAPI_GRO_CB(skb)->same_flow;
 5512	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 5513
 5514	if (pp) {
 5515		skb_list_del_init(pp);
 5516		napi_gro_complete(pp);
 5517		napi->gro_hash[hash].count--;
 5518	}
 5519
 5520	if (same_flow)
 5521		goto ok;
 5522
 5523	if (NAPI_GRO_CB(skb)->flush)
 5524		goto normal;
 5525
 5526	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 5527		gro_flush_oldest(gro_head);
 5528	} else {
 5529		napi->gro_hash[hash].count++;
 5530	}
 5531	NAPI_GRO_CB(skb)->count = 1;
 5532	NAPI_GRO_CB(skb)->age = jiffies;
 5533	NAPI_GRO_CB(skb)->last = skb;
 5534	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 5535	list_add(&skb->list, gro_head);
 5536	ret = GRO_HELD;
 5537
 5538pull:
 5539	grow = skb_gro_offset(skb) - skb_headlen(skb);
 5540	if (grow > 0)
 5541		gro_pull_from_frag0(skb, grow);
 5542ok:
 5543	if (napi->gro_hash[hash].count) {
 5544		if (!test_bit(hash, &napi->gro_bitmask))
 5545			__set_bit(hash, &napi->gro_bitmask);
 5546	} else if (test_bit(hash, &napi->gro_bitmask)) {
 5547		__clear_bit(hash, &napi->gro_bitmask);
 5548	}
 5549
 5550	return ret;
 5551
 5552normal:
 5553	ret = GRO_NORMAL;
 5554	goto pull;
 5555}
 5556
 5557struct packet_offload *gro_find_receive_by_type(__be16 type)
 5558{
 5559	struct list_head *offload_head = &offload_base;
 5560	struct packet_offload *ptype;
 5561
 5562	list_for_each_entry_rcu(ptype, offload_head, list) {
 5563		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5564			continue;
 5565		return ptype;
 5566	}
 5567	return NULL;
 5568}
 5569EXPORT_SYMBOL(gro_find_receive_by_type);
 5570
 5571struct packet_offload *gro_find_complete_by_type(__be16 type)
 5572{
 5573	struct list_head *offload_head = &offload_base;
 5574	struct packet_offload *ptype;
 5575
 5576	list_for_each_entry_rcu(ptype, offload_head, list) {
 5577		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5578			continue;
 5579		return ptype;
 5580	}
 5581	return NULL;
 5582}
 5583EXPORT_SYMBOL(gro_find_complete_by_type);
 5584
 5585static void napi_skb_free_stolen_head(struct sk_buff *skb)
 5586{
 5587	skb_dst_drop(skb);
 5588	skb_ext_put(skb);
 5589	kmem_cache_free(skbuff_head_cache, skb);
 5590}
 5591
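/* Translate the verdict from dev_gro_receive() into an action on @skb:
 * hand it to the stack (GRO_NORMAL), free it (GRO_DROP, GRO_MERGED_FREE)
 * or do nothing because GRO still owns it (GRO_HELD, GRO_MERGED,
 * GRO_CONSUMED).
 */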
 5592static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 5593{
 5594	switch (ret) {
 5595	case GRO_NORMAL:
 5596		if (netif_receive_skb_internal(skb))
 5597			ret = GRO_DROP;
 5598		break;
 5599
 5600	case GRO_DROP:
 5601		kfree_skb(skb);
 5602		break;
 5603
 5604	case GRO_MERGED_FREE:
 5605		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 5606			napi_skb_free_stolen_head(skb);
 5607		else
 5608			__kfree_skb(skb);
 5609		break;
 5610
 5611	case GRO_HELD:
 5612	case GRO_MERGED:
 5613	case GRO_CONSUMED:
 5614		break;
 5615	}
 5616
 5617	return ret;
 5618}
 5619
 5620gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5621{
 5622	gro_result_t ret;
 5623
 5624	skb_mark_napi_id(skb, napi);
 5625	trace_napi_gro_receive_entry(skb);
 5626
 5627	skb_gro_reset_offset(skb);
 5628
 5629	ret = napi_skb_finish(dev_gro_receive(napi, skb), skb);
 5630	trace_napi_gro_receive_exit(ret);
 5631
 5632	return ret;
 5633}
 5634EXPORT_SYMBOL(napi_gro_receive);
 5635
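/* Recycle an skb handed out by napi_get_frags() once GRO is done with it:
 * strip the metadata the driver and GRO added and park it in napi->skb so
 * the next napi_get_frags() call can reuse it.  pfmemalloc skbs are freed
 * instead of being recycled.
 */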
 5636static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 5637{
 5638	if (unlikely(skb->pfmemalloc)) {
 5639		consume_skb(skb);
 5640		return;
 5641	}
 5642	__skb_pull(skb, skb_headlen(skb));
 5643	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
 5644	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
 5645	__vlan_hwaccel_clear_tag(skb);
 5646	skb->dev = napi->dev;
 5647	skb->skb_iif = 0;
 5648
 5649	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
 5650	skb->pkt_type = PACKET_HOST;
 5651
 5652	skb->encapsulation = 0;
 5653	skb_shinfo(skb)->gso_type = 0;
 5654	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 5655	skb_ext_reset(skb);
 5656
 5657	napi->skb = skb;
 5658}
 5659
 5660struct sk_buff *napi_get_frags(struct napi_struct *napi)
 5661{
 5662	struct sk_buff *skb = napi->skb;
 5663
 5664	if (!skb) {
 5665		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 5666		if (skb) {
 5667			napi->skb = skb;
 5668			skb_mark_napi_id(skb, napi);
 5669		}
 5670	}
 5671	return skb;
 5672}
 5673EXPORT_SYMBOL(napi_get_frags);
 5674
 5675/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 5676static void gro_normal_list(struct napi_struct *napi)
 5677{
 5678	if (!napi->rx_count)
 5679		return;
 5680	netif_receive_skb_list_internal(&napi->rx_list);
 5681	INIT_LIST_HEAD(&napi->rx_list);
 5682	napi->rx_count = 0;
 5683}
 5684
 5685/* Queue one GRO_NORMAL SKB up for list processing.  If the batch size is
 5686 * exceeded, pass the whole batch up to the stack.
 5687 */
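/* The batch size is controlled by the net.core.gro_normal_batch sysctl;
 * batching amortizes the per-packet cost of
 * netif_receive_skb_list_internal() over several GRO_NORMAL skbs.
 */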
 5688static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 5689{
 5690	list_add_tail(&skb->list, &napi->rx_list);
 5691	if (++napi->rx_count >= gro_normal_batch)
 5692		gro_normal_list(napi);
 5693}
 5694
 5695static gro_result_t napi_frags_finish(struct napi_struct *napi,
 5696				      struct sk_buff *skb,
 5697				      gro_result_t ret)
 5698{
 5699	switch (ret) {
 5700	case GRO_NORMAL:
 5701	case GRO_HELD:
 5702		__skb_push(skb, ETH_HLEN);
 5703		skb->protocol = eth_type_trans(skb, skb->dev);
 5704		if (ret == GRO_NORMAL)
 5705			gro_normal_one(napi, skb);
 5706		break;
 5707
 5708	case GRO_DROP:
 5709		napi_reuse_skb(napi, skb);
 5710		break;
 5711
 5712	case GRO_MERGED_FREE:
 5713		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 5714			napi_skb_free_stolen_head(skb);
 5715		else
 5716			napi_reuse_skb(napi, skb);
 5717		break;
 5718
 5719	case GRO_MERGED:
 5720	case GRO_CONSUMED:
 5721		break;
 5722	}
 5723
 5724	return ret;
 5725}
 5726
 5727/* The upper GRO stack assumes the network header starts at gro_offset=0.
 5728 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
 5729 * we copy the ethernet header into skb->data to have a common layout.
 5730 */
 5731static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 5732{
 5733	struct sk_buff *skb = napi->skb;
 5734	const struct ethhdr *eth;
 5735	unsigned int hlen = sizeof(*eth);
 5736
 5737	napi->skb = NULL;
 5738
 5739	skb_reset_mac_header(skb);
 5740	skb_gro_reset_offset(skb);
 5741
 5742	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 5743		eth = skb_gro_header_slow(skb, hlen, 0);
 5744		if (unlikely(!eth)) {
 5745			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
 5746					     __func__, napi->dev->name);
 5747			napi_reuse_skb(napi, skb);
 5748			return NULL;
 5749		}
 5750	} else {
 5751		eth = (const struct ethhdr *)skb->data;
 5752		gro_pull_from_frag0(skb, hlen);
 5753		NAPI_GRO_CB(skb)->frag0 += hlen;
 5754		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 5755	}
 5756	__skb_pull(skb, hlen);
 5757
 5758	/*
 5759	 * This works because the only protocols we care about don't require
 5760	 * special handling.
 5761	 * We'll fix it up properly in napi_frags_finish()
 5762	 */
 5763	skb->protocol = eth->h_proto;
 5764
 5765	return skb;
 5766}
 5767
 5768gro_result_t napi_gro_frags(struct napi_struct *napi)
 5769{
 5770	gro_result_t ret;
 5771	struct sk_buff *skb = napi_frags_skb(napi);
 5772
 5773	if (!skb)
 5774		return GRO_DROP;
 5775
 5776	trace_napi_gro_frags_entry(skb);
 5777
 5778	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 5779	trace_napi_gro_frags_exit(ret);
 5780
 5781	return ret;
 5782}
 5783EXPORT_SYMBOL(napi_gro_frags);
 5784
 5785/* Compute the checksum from gro_offset and return the folded value
 5786 * after adding in any pseudo checksum.
 5787 */
 5788__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 5789{
 5790	__wsum wsum;
 5791	__sum16 sum;
 5792
 5793	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
 5794
 5795	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
 5796	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
 5797	/* See comments in __skb_checksum_complete(). */
 5798	if (likely(!sum)) {
 5799		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
 5800		    !skb->csum_complete_sw)
 5801			netdev_rx_csum_fault(skb->dev, skb);
 5802	}
 5803
 5804	NAPI_GRO_CB(skb)->csum = wsum;
 5805	NAPI_GRO_CB(skb)->csum_valid = 1;
 5806
 5807	return sum;
 5808}
 5809EXPORT_SYMBOL(__skb_gro_checksum_complete);
 5810
 5811static void net_rps_send_ipi(struct softnet_data *remsd)
 5812{
 5813#ifdef CONFIG_RPS
 5814	while (remsd) {
 5815		struct softnet_data *next = remsd->rps_ipi_next;
 5816
 5817		if (cpu_online(remsd->cpu))
 5818			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 5819		remsd = next;
 5820	}
 5821#endif
 5822}
 5823
 5824/*
 5825 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 5826 * Note: called with local irq disabled, but exits with local irq enabled.
 5827 */
 5828static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 5829{
 5830#ifdef CONFIG_RPS
 5831	struct softnet_data *remsd = sd->rps_ipi_list;
 5832
 5833	if (remsd) {
 5834		sd->rps_ipi_list = NULL;
 5835
 5836		local_irq_enable();
 5837
 5838		/* Send pending IPIs to kick RPS processing on remote CPUs. */
 5839		net_rps_send_ipi(remsd);
 5840	} else
 5841#endif
 5842		local_irq_enable();
 5843}
 5844
 5845static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 5846{
 5847#ifdef CONFIG_RPS
 5848	return sd->rps_ipi_list != NULL;
 5849#else
 5850	return false;
 5851#endif
 5852}
 5853
 5854static int process_backlog(struct napi_struct *napi, int quota)
 5855{
 5856	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 5857	bool again = true;
 5858	int work = 0;
 5859
 5860	/* Check if we have pending IPIs; it's better to send them now
 5861	 * than to wait for net_rx_action() to end.
 5862	 */
 5863	if (sd_has_rps_ipi_waiting(sd)) {
 5864		local_irq_disable();
 5865		net_rps_action_and_irq_enable(sd);
 5866	}
 5867
 5868	napi->weight = dev_rx_weight;
 5869	while (again) {
 5870		struct sk_buff *skb;
 5871
 5872		while ((skb = __skb_dequeue(&sd->process_queue))) {
 5873			rcu_read_lock();
 5874			__netif_receive_skb(skb);
 5875			rcu_read_unlock();
 5876			input_queue_head_incr(sd);
 5877			if (++work >= quota)
 5878				return work;
 5879
 5880		}
 5881
 5882		local_irq_disable();
 5883		rps_lock(sd);
 5884		if (skb_queue_empty(&sd->input_pkt_queue)) {
 5885			/*
 5886			 * Inline a custom version of __napi_complete().
 5887			 * Only the current CPU owns and manipulates this napi,
 5888			 * and NAPI_STATE_SCHED is the only possible flag set
 5889			 * on backlog.
 5890			 * We can use a plain write instead of clear_bit(),
 5891			 * and we don't need an smp_mb() memory barrier.
 5892			 */
 5893			napi->state = 0;
 5894			again = false;
 5895		} else {
 5896			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 5897						   &sd->process_queue);
 5898		}
 5899		rps_unlock(sd);
 5900		local_irq_enable();
 5901	}
 5902
 5903	return work;
 5904}
 5905
 5906/**
 5907 * __napi_schedule - schedule for receive
 5908 * @n: entry to schedule
 5909 *
 5910 * The entry's receive function will be scheduled to run.
 5911 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 5912 */
 5913void __napi_schedule(struct napi_struct *n)
 5914{
 5915	unsigned long flags;
 5916
 5917	local_irq_save(flags);
 5918	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 5919	local_irq_restore(flags);
 5920}
 5921EXPORT_SYMBOL(__napi_schedule);
 5922
 5923/**
 5924 *	napi_schedule_prep - check if napi can be scheduled
 5925 *	@n: napi context
 5926 *
 5927 * Test if NAPI routine is already running, and if not mark
 5928 * it as running.  This is used as a condition variable
 5929 * to ensure that only one NAPI poll instance runs.  We also make
 5930 * sure there is no pending NAPI disable.
 5931 */
 5932bool napi_schedule_prep(struct napi_struct *n)
 5933{
 5934	unsigned long val, new;
 5935
 5936	do {
 5937		val = READ_ONCE(n->state);
 5938		if (unlikely(val & NAPIF_STATE_DISABLE))
 5939			return false;
 5940		new = val | NAPIF_STATE_SCHED;
 5941
 5942		/* Sets STATE_MISSED bit if STATE_SCHED was already set
 5943		 * This was suggested by Alexander Duyck, as compiler
 5944		 * emits better code than :
 5945		 * if (val & NAPIF_STATE_SCHED)
 5946		 *     new |= NAPIF_STATE_MISSED;
 5947		 */
 5948		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 5949						   NAPIF_STATE_MISSED;
 5950	} while (cmpxchg(&n->state, val, new) != val);
 5951
 5952	return !(val & NAPIF_STATE_SCHED);
 5953}
 5954EXPORT_SYMBOL(napi_schedule_prep);
 5955
 5956/**
 5957 * __napi_schedule_irqoff - schedule for receive
 5958 * @n: entry to schedule
 5959 *
 5960 * Variant of __napi_schedule() assuming hard irqs are masked
 5961 */
 5962void __napi_schedule_irqoff(struct napi_struct *n)
 5963{
 5964	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 5965}
 5966EXPORT_SYMBOL(__napi_schedule_irqoff);
 5967
 5968bool napi_complete_done(struct napi_struct *n, int work_done)
 5969{
 5970	unsigned long flags, val, new;
 5971
 5972	/*
 5973	 * 1) Don't let napi dequeue from the cpu poll list
 5974	 *    just in case it's running on a different CPU.
 5975	 * 2) If we are busy polling, do nothing here, we have
 5976	 *    the guarantee we will be called later.
 5977	 */
 5978	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 5979				 NAPIF_STATE_IN_BUSY_POLL)))
 5980		return false;
 5981
 5982	gro_normal_list(n);
 5983
 5984	if (n->gro_bitmask) {
 5985		unsigned long timeout = 0;
 5986
 5987		if (work_done)
 5988			timeout = n->dev->gro_flush_timeout;
 5989
 5990		/* When the NAPI instance uses a timeout and keeps postponing
 5991		 * it, we need to somehow bound the time packets are kept in
 5992		 * the GRO layer.
 5993		 */
 5994		napi_gro_flush(n, !!timeout);
 5995		if (timeout)
 5996			hrtimer_start(&n->timer, ns_to_ktime(timeout),
 5997				      HRTIMER_MODE_REL_PINNED);
 5998	}
 5999	if (unlikely(!list_empty(&n->poll_list))) {
 6000		/* If n->poll_list is not empty, we need to mask irqs */
 6001		local_irq_save(flags);
 6002		list_del_init(&n->poll_list);
 6003		local_irq_restore(flags);
 6004	}
 6005
 6006	do {
 6007		val = READ_ONCE(n->state);
 6008
 6009		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 6010
 6011		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
 6012
 6013		/* If STATE_MISSED was set, leave STATE_SCHED set,
 6014		 * because we will call napi->poll() one more time.
 6015		 * This C code was suggested by Alexander Duyck to help gcc.
 6016		 */
 6017		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
 6018						    NAPIF_STATE_SCHED;
 6019	} while (cmpxchg(&n->state, val, new) != val);
 6020
 6021	if (unlikely(val & NAPIF_STATE_MISSED)) {
 6022		__napi_schedule(n);
 6023		return false;
 6024	}
 6025
 6026	return true;
 6027}
 6028EXPORT_SYMBOL(napi_complete_done);
 6029
 6030/* must be called under rcu_read_lock(), as we dont take a reference */
 6031static struct napi_struct *napi_by_id(unsigned int napi_id)
 6032{
 6033	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6034	struct napi_struct *napi;
 6035
 6036	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6037		if (napi->napi_id == napi_id)
 6038			return napi;
 6039
 6040	return NULL;
 6041}
 6042
 6043#if defined(CONFIG_NET_RX_BUSY_POLL)
 6044
 6045#define BUSY_POLL_BUDGET 8
 6046
 6047static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 6048{
 6049	int rc;
 6050
 6051	/* Busy polling means there is a high chance device driver hard irq
 6052	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6053	 * set in napi_schedule_prep().
 6054	 * Since we are about to call napi->poll() once more, we can safely
 6055	 * clear NAPI_STATE_MISSED.
 6056	 *
 6057	 * Note: x86 could use a single "lock and ..." instruction
 6058	 * to perform these two clear_bit()
 6059	 */
 6060	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6061	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6062
 6063	local_bh_disable();
 6064
 6065	/* All we really want here is to re-enable device interrupts.
 6066	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6067	 */
 6068	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 6069	/* We can't gro_normal_list() here, because napi->poll() might have
 6070	 * rearmed the napi (napi_complete_done()) in which case it could
 6071	 * already be running on another CPU.
 6072	 */
 6073	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
 6074	netpoll_poll_unlock(have_poll_lock);
 6075	if (rc == BUSY_POLL_BUDGET) {
 6076		/* As the whole budget was spent, we still own the napi so can
 6077		 * safely handle the rx_list.
 6078		 */
 6079		gro_normal_list(napi);
 6080		__napi_schedule(napi);
 6081	}
 6082	local_bh_enable();
 6083}
 6084
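/* Busy-poll on the NAPI instance identified by @napi_id until @loop_end
 * says to stop.  The socket layer is the typical caller: sk_busy_loop()
 * normally passes sk_busy_loop_end() as @loop_end so that it spins on the
 * NAPI context that delivered the last packet for that socket.
 */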
 6085void napi_busy_loop(unsigned int napi_id,
 6086		    bool (*loop_end)(void *, unsigned long),
 6087		    void *loop_end_arg)
 6088{
 6089	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6090	int (*napi_poll)(struct napi_struct *napi, int budget);
 6091	void *have_poll_lock = NULL;
 6092	struct napi_struct *napi;
 6093
 6094restart:
 6095	napi_poll = NULL;
 6096
 6097	rcu_read_lock();
 6098
 6099	napi = napi_by_id(napi_id);
 6100	if (!napi)
 6101		goto out;
 6102
 6103	preempt_disable();
 6104	for (;;) {
 6105		int work = 0;
 6106
 6107		local_bh_disable();
 6108		if (!napi_poll) {
 6109			unsigned long val = READ_ONCE(napi->state);
 6110
 6111			/* If multiple threads are competing for this napi,
 6112			 * we avoid dirtying napi->state as much as we can.
 6113			 */
 6114			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6115				   NAPIF_STATE_IN_BUSY_POLL))
 6116				goto count;
 6117			if (cmpxchg(&napi->state, val,
 6118				    val | NAPIF_STATE_IN_BUSY_POLL |
 6119					  NAPIF_STATE_SCHED) != val)
 6120				goto count;
 6121			have_poll_lock = netpoll_poll_lock(napi);
 6122			napi_poll = napi->poll;
 6123		}
 6124		work = napi_poll(napi, BUSY_POLL_BUDGET);
 6125		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 6126		gro_normal_list(napi);
 6127count:
 6128		if (work > 0)
 6129			__NET_ADD_STATS(dev_net(napi->dev),
 6130					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6131		local_bh_enable();
 6132
 6133		if (!loop_end || loop_end(loop_end_arg, start_time))
 6134			break;
 6135
 6136		if (unlikely(need_resched())) {
 6137			if (napi_poll)
 6138				busy_poll_stop(napi, have_poll_lock);
 6139			preempt_enable();
 6140			rcu_read_unlock();
 6141			cond_resched();
 6142			if (loop_end(loop_end_arg, start_time))
 6143				return;
 6144			goto restart;
 6145		}
 6146		cpu_relax();
 6147	}
 6148	if (napi_poll)
 6149		busy_poll_stop(napi, have_poll_lock);
 6150	preempt_enable();
 6151out:
 6152	rcu_read_unlock();
 6153}
 6154EXPORT_SYMBOL(napi_busy_loop);
 6155
 6156#endif /* CONFIG_NET_RX_BUSY_POLL */
 6157
 6158static void napi_hash_add(struct napi_struct *napi)
 6159{
 6160	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
 6161	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
 6162		return;
 6163
 6164	spin_lock(&napi_hash_lock);
 6165
 6166	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6167	do {
 6168		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6169			napi_gen_id = MIN_NAPI_ID;
 6170	} while (napi_by_id(napi_gen_id));
 6171	napi->napi_id = napi_gen_id;
 6172
 6173	hlist_add_head_rcu(&napi->napi_hash_node,
 6174			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6175
 6176	spin_unlock(&napi_hash_lock);
 6177}
 6178
 6179/* Warning: the caller is responsible for making sure an RCU grace period
 6180 * has elapsed before freeing the memory containing @napi.
 6181 */
 6182bool napi_hash_del(struct napi_struct *napi)
 6183{
 6184	bool rcu_sync_needed = false;
 6185
 6186	spin_lock(&napi_hash_lock);
 6187
 6188	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
 6189		rcu_sync_needed = true;
 6190		hlist_del_rcu(&napi->napi_hash_node);
 6191	}
 6192	spin_unlock(&napi_hash_lock);
 6193	return rcu_sync_needed;
 6194}
 6195EXPORT_SYMBOL_GPL(napi_hash_del);
 6196
 6197static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6198{
 6199	struct napi_struct *napi;
 6200
 6201	napi = container_of(timer, struct napi_struct, timer);
 6202
 6203	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6204	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6205	 */
 6206	if (napi->gro_bitmask && !napi_disable_pending(napi) &&
 6207	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 6208		__napi_schedule_irqoff(napi);
 6209
 6210	return HRTIMER_NORESTART;
 6211}
 6212
 6213static void init_gro_hash(struct napi_struct *napi)
 6214{
 6215	int i;
 6216
 6217	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6218		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6219		napi->gro_hash[i].count = 0;
 6220	}
 6221	napi->gro_bitmask = 0;
 6222}
 6223
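/* Illustrative driver usage (a sketch; netdev, priv and my_poll are
 * hypothetical names, not defined in this file):
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	napi_enable(&priv->napi);		(from ndo_open)
 *	napi_schedule(&priv->napi);		(from the RX interrupt handler)
 *	napi_complete_done(&priv->napi, done);	(from my_poll(), once done < budget)
 */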
 6224void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 6225		    int (*poll)(struct napi_struct *, int), int weight)
 6226{
 6227	INIT_LIST_HEAD(&napi->poll_list);
 6228	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6229	napi->timer.function = napi_watchdog;
 6230	init_gro_hash(napi);
 6231	napi->skb = NULL;
 6232	INIT_LIST_HEAD(&napi->rx_list);
 6233	napi->rx_count = 0;
 6234	napi->poll = poll;
 6235	if (weight > NAPI_POLL_WEIGHT)
 6236		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6237				weight);
 6238	napi->weight = weight;
 6239	list_add(&napi->dev_list, &dev->napi_list);
 6240	napi->dev = dev;
 6241#ifdef CONFIG_NETPOLL
 6242	napi->poll_owner = -1;
 6243#endif
 6244	set_bit(NAPI_STATE_SCHED, &napi->state);
 6245	napi_hash_add(napi);
 6246}
 6247EXPORT_SYMBOL(netif_napi_add);
 6248
 6249void napi_disable(struct napi_struct *n)
 6250{
 6251	might_sleep();
 6252	set_bit(NAPI_STATE_DISABLE, &n->state);
 6253
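	/* Spin until this thread owns both the SCHED and NPSVC bits, which
	 * guarantees no napi->poll() or netpoll service pass can still be
	 * running for this instance on another CPU.
	 */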
 6254	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
 6255		msleep(1);
 6256	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
 6257		msleep(1);
 6258
 6259	hrtimer_cancel(&n->timer);
 6260
 6261	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6262}
 6263EXPORT_SYMBOL(napi_disable);
 6264
 6265static void flush_gro_hash(struct napi_struct *napi)
 6266{
 6267	int i;
 6268
 6269	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6270		struct sk_buff *skb, *n;
 6271
 6272		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6273			kfree_skb(skb);
 6274		napi->gro_hash[i].count = 0;
 6275	}
 6276}
 6277
 6278/* Must be called in process context */
 6279void netif_napi_del(struct napi_struct *napi)
 6280{
 6281	might_sleep();
 6282	if (napi_hash_del(napi))
 6283		synchronize_net();
 6284	list_del_init(&napi->dev_list);
 6285	napi_free_frags(napi);
 6286
 6287	flush_gro_hash(napi);
 6288	napi->gro_bitmask = 0;
 6289}
 6290EXPORT_SYMBOL(netif_napi_del);
 6291
 6292static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6293{
 6294	void *have;
 6295	int work, weight;
 6296
 6297	list_del_init(&n->poll_list);
 6298
 6299	have = netpoll_poll_lock(n);
 6300
 6301	weight = n->weight;
 6302
 6303	/* This NAPI_STATE_SCHED test is for avoiding a race
 6304	 * with netpoll's poll_napi().  Only the entity which
 6305	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6306	 * actually make the ->poll() call.  Therefore we avoid
 6307	 * accidentally calling ->poll() when NAPI is not scheduled.
 6308	 */
 6309	work = 0;
 6310	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 6311		work = n->poll(n, weight);
 6312		trace_napi_poll(n, work, weight);
 6313	}
 6314
 6315	WARN_ON_ONCE(work > weight);
 6316
 6317	if (likely(work < weight))
 6318		goto out_unlock;
 6319
 6320	/* Drivers must not modify the NAPI state if they
 6321	 * consume the entire weight.  In such cases this code
 6322	 * still "owns" the NAPI instance and therefore can
 6323	 * move the instance around on the list at-will.
 6324	 */
 6325	if (unlikely(napi_disable_pending(n))) {
 6326		napi_complete(n);
 6327		goto out_unlock;
 6328	}
 6329
 6330	gro_normal_list(n);
 6331
 6332	if (n->gro_bitmask) {
 6333		/* flush too old packets
 6334		 * If HZ < 1000, flush all packets.
 6335		 */
 6336		napi_gro_flush(n, HZ >= 1000);
 6337	}
 6338
 6339	/* Some drivers may have called napi_schedule
 6340	 * prior to exhausting their budget.
 6341	 */
 6342	if (unlikely(!list_empty(&n->poll_list))) {
 6343		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6344			     n->dev ? n->dev->name : "backlog");
 6345		goto out_unlock;
 6346	}
 6347
 6348	list_add_tail(&n->poll_list, repoll);
 6349
 6350out_unlock:
 6351	netpoll_poll_unlock(have);
 6352
 6353	return work;
 6354}
 6355
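/* NET_RX_SOFTIRQ handler: poll the per-cpu list of scheduled NAPI
 * instances, bounded by both a packet budget (the net.core.netdev_budget
 * sysctl) and a time budget (net.core.netdev_budget_usecs).
 */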
 6356static __latent_entropy void net_rx_action(struct softirq_action *h)
 6357{
 6358	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 6359	unsigned long time_limit = jiffies +
 6360		usecs_to_jiffies(netdev_budget_usecs);
 6361	int budget = netdev_budget;
 6362	LIST_HEAD(list);
 6363	LIST_HEAD(repoll);
 6364
 6365	local_irq_disable();
 6366	list_splice_init(&sd->poll_list, &list);
 6367	local_irq_enable();
 6368
 6369	for (;;) {
 6370		struct napi_struct *n;
 6371
 6372		if (list_empty(&list)) {
 6373			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 6374				goto out;
 6375			break;
 6376		}
 6377
 6378		n = list_first_entry(&list, struct napi_struct, poll_list);
 6379		budget -= napi_poll(n, &repoll);
 6380
 6381		/* If softirq window is exhausted then punt.
 6382		 * Allow this to run for 2 jiffies, which allows
 6383		 * an average latency of 1.5/HZ.
 6384		 */
 6385		if (unlikely(budget <= 0 ||
 6386			     time_after_eq(jiffies, time_limit))) {
 6387			sd->time_squeeze++;
 6388			break;
 6389		}
 6390	}
 6391
 6392	local_irq_disable();
 6393
 6394	list_splice_tail_init(&sd->poll_list, &list);
 6395	list_splice_tail(&repoll, &list);
 6396	list_splice(&list, &sd->poll_list);
 6397	if (!list_empty(&sd->poll_list))
 6398		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 6399
 6400	net_rps_action_and_irq_enable(sd);
 6401out:
 6402	__kfree_skb_flush();
 6403}
 6404
 6405struct netdev_adjacent {
 6406	struct net_device *dev;
 6407
 6408	/* upper master flag, there can only be one master device per list */
 6409	bool master;
 6410
 6411	/* lookup ignore flag */
 6412	bool ignore;
 6413
 6414	/* counter for the number of times this device was added to us */
 6415	u16 ref_nr;
 6416
 6417	/* private field for the users */
 6418	void *private;
 6419
 6420	struct list_head list;
 6421	struct rcu_head rcu;
 6422};
 6423
 6424static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6425						 struct list_head *adj_list)
 6426{
 6427	struct netdev_adjacent *adj;
 6428
 6429	list_for_each_entry(adj, adj_list, list) {
 6430		if (adj->dev == adj_dev)
 6431			return adj;
 6432	}
 6433	return NULL;
 6434}
 6435
 6436static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
 6437{
 6438	struct net_device *dev = data;
 6439
 6440	return upper_dev == dev;
 6441}
 6442
 6443/**
 6444 * netdev_has_upper_dev - Check if device is linked to an upper device
 6445 * @dev: device
 6446 * @upper_dev: upper device to check
 6447 *
 6448 * Find out if a device is linked to the specified upper device and return
 6449 * true in case it is. Note that this checks only the immediate upper device,
 6450 * not the complete stack of devices. The caller must hold the RTNL lock.
 6451 */
 6452bool netdev_has_upper_dev(struct net_device *dev,
 6453			  struct net_device *upper_dev)
 6454{
 6455	ASSERT_RTNL();
 6456
 6457	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6458					     upper_dev);
 6459}
 6460EXPORT_SYMBOL(netdev_has_upper_dev);
 6461
 6462/**
 6463 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 6464 * @dev: device
 6465 * @upper_dev: upper device to check
 6466 *
 6467 * Find out if a device is linked to the specified upper device and return
 6468 * true in case it is. Note that this checks the entire upper device chain.
 6469 * The caller must hold the RCU read lock.
 6470 */
 6471
 6472bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6473				  struct net_device *upper_dev)
 6474{
 6475	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6476					       upper_dev);
 6477}
 6478EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6479
 6480/**
 6481 * netdev_has_any_upper_dev - Check if device is linked to some device
 6482 * @dev: device
 6483 *
 6484 * Find out if a device is linked to an upper device and return true in case
 6485 * it is. The caller must hold the RTNL lock.
 6486 */
 6487bool netdev_has_any_upper_dev(struct net_device *dev)
 6488{
 6489	ASSERT_RTNL();
 6490
 6491	return !list_empty(&dev->adj_list.upper);
 6492}
 6493EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6494
 6495/**
 6496 * netdev_master_upper_dev_get - Get master upper device
 6497 * @dev: device
 6498 *
 6499 * Find a master upper device and return pointer to it or NULL in case
 6500 * it's not there. The caller must hold the RTNL lock.
 6501 */
 6502struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6503{
 6504	struct netdev_adjacent *upper;
 6505
 6506	ASSERT_RTNL();
 6507
 6508	if (list_empty(&dev->adj_list.upper))
 6509		return NULL;
 6510
 6511	upper = list_first_entry(&dev->adj_list.upper,
 6512				 struct netdev_adjacent, list);
 6513	if (likely(upper->master))
 6514		return upper->dev;
 6515	return NULL;
 6516}
 6517EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6518
 6519static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6520{
 6521	struct netdev_adjacent *upper;
 6522
 6523	ASSERT_RTNL();
 6524
 6525	if (list_empty(&dev->adj_list.upper))
 6526		return NULL;
 6527
 6528	upper = list_first_entry(&dev->adj_list.upper,
 6529				 struct netdev_adjacent, list);
 6530	if (likely(upper->master) && !upper->ignore)
 6531		return upper->dev;
 6532	return NULL;
 6533}
 6534
 6535/**
 6536 * netdev_has_any_lower_dev - Check if device is linked to some device
 6537 * @dev: device
 6538 *
 6539 * Find out if a device is linked to a lower device and return true in case
 6540 * it is. The caller must hold the RTNL lock.
 6541 */
 6542static bool netdev_has_any_lower_dev(struct net_device *dev)
 6543{
 6544	ASSERT_RTNL();
 6545
 6546	return !list_empty(&dev->adj_list.lower);
 6547}
 6548
 6549void *netdev_adjacent_get_private(struct list_head *adj_list)
 6550{
 6551	struct netdev_adjacent *adj;
 6552
 6553	adj = list_entry(adj_list, struct netdev_adjacent, list);
 6554
 6555	return adj->private;
 6556}
 6557EXPORT_SYMBOL(netdev_adjacent_get_private);
 6558
 6559/**
 6560 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 6561 * @dev: device
 6562 * @iter: list_head ** of the current position
 6563 *
 6564 * Gets the next device from the dev's upper list, starting from iter
 6565 * position. The caller must hold RCU read lock.
 6566 */
 6567struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 6568						 struct list_head **iter)
 6569{
 6570	struct netdev_adjacent *upper;
 6571
 6572	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6573
 6574	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6575
 6576	if (&upper->list == &dev->adj_list.upper)
 6577		return NULL;
 6578
 6579	*iter = &upper->list;
 6580
 6581	return upper->dev;
 6582}
 6583EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 6584
 6585static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 6586						  struct list_head **iter,
 6587						  bool *ignore)
 6588{
 6589	struct netdev_adjacent *upper;
 6590
 6591	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 6592
 6593	if (&upper->list == &dev->adj_list.upper)
 6594		return NULL;
 6595
 6596	*iter = &upper->list;
 6597	*ignore = upper->ignore;
 6598
 6599	return upper->dev;
 6600}
 6601
 6602static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 6603						    struct list_head **iter)
 6604{
 6605	struct netdev_adjacent *upper;
 6606
 6607	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6608
 6609	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6610
 6611	if (&upper->list == &dev->adj_list.upper)
 6612		return NULL;
 6613
 6614	*iter = &upper->list;
 6615
 6616	return upper->dev;
 6617}
 6618
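/* Iterative depth-first walk over the graph of upper devices, using an
 * explicit stack bounded by MAX_NEST_DEV.  @fn is invoked once for every
 * device stacked above @dev, skipping adjacencies marked ->ignore; a
 * non-zero return value from @fn aborts the walk and is propagated.
 */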
 6619static int __netdev_walk_all_upper_dev(struct net_device *dev,
 6620				       int (*fn)(struct net_device *dev,
 6621						 void *data),
 6622				       void *data)
 6623{
 6624	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6625	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6626	int ret, cur = 0;
 6627	bool ignore;
 6628
 6629	now = dev;
 6630	iter = &dev->adj_list.upper;
 6631
 6632	while (1) {
 6633		if (now != dev) {
 6634			ret = fn(now, data);
 6635			if (ret)
 6636				return ret;
 6637		}
 6638
 6639		next = NULL;
 6640		while (1) {
 6641			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 6642			if (!udev)
 6643				break;
 6644			if (ignore)
 6645				continue;
 6646
 6647			next = udev;
 6648			niter = &udev->adj_list.upper;
 6649			dev_stack[cur] = now;
 6650			iter_stack[cur++] = iter;
 6651			break;
 6652		}
 6653
 6654		if (!next) {
 6655			if (!cur)
 6656				return 0;
 6657			next = dev_stack[--cur];
 6658			niter = iter_stack[cur];
 6659		}
 6660
 6661		now = next;
 6662		iter = niter;
 6663	}
 6664
 6665	return 0;
 6666}
 6667
 6668int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 6669				  int (*fn)(struct net_device *dev,
 6670					    void *data),
 6671				  void *data)
 6672{
 6673	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6674	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6675	int ret, cur = 0;
 6676
 6677	now = dev;
 6678	iter = &dev->adj_list.upper;
 6679
 6680	while (1) {
 6681		if (now != dev) {
 6682			ret = fn(now, data);
 6683			if (ret)
 6684				return ret;
 6685		}
 6686
 6687		next = NULL;
 6688		while (1) {
 6689			udev = netdev_next_upper_dev_rcu(now, &iter);
 6690			if (!udev)
 6691				break;
 6692
 6693			next = udev;
 6694			niter = &udev->adj_list.upper;
 6695			dev_stack[cur] = now;
 6696			iter_stack[cur++] = iter;
 6697			break;
 6698		}
 6699
 6700		if (!next) {
 6701			if (!cur)
 6702				return 0;
 6703			next = dev_stack[--cur];
 6704			niter = iter_stack[cur];
 6705		}
 6706
 6707		now = next;
 6708		iter = niter;
 6709	}
 6710
 6711	return 0;
 6712}
 6713EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 6714
 6715static bool __netdev_has_upper_dev(struct net_device *dev,
 6716				   struct net_device *upper_dev)
 6717{
 6718	ASSERT_RTNL();
 6719
 6720	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 6721					   upper_dev);
 6722}
 6723
 6724/**
 6725 * netdev_lower_get_next_private - Get the next ->private from the
 6726 *				   lower neighbour list
 6727 * @dev: device
 6728 * @iter: list_head ** of the current position
 6729 *
 6730 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 6731 * list, starting from iter position. The caller must either hold the
 6732 * RTNL lock or its own locking that guarantees that the neighbour lower
 6733 * list will remain unchanged.
 6734 */
 6735void *netdev_lower_get_next_private(struct net_device *dev,
 6736				    struct list_head **iter)
 6737{
 6738	struct netdev_adjacent *lower;
 6739
 6740	lower = list_entry(*iter, struct netdev_adjacent, list);
 6741
 6742	if (&lower->list == &dev->adj_list.lower)
 6743		return NULL;
 6744
 6745	*iter = lower->list.next;
 6746
 6747	return lower->private;
 6748}
 6749EXPORT_SYMBOL(netdev_lower_get_next_private);
 6750
 6751/**
 6752 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 6753 *				       lower neighbour list, RCU
 6754 *				       variant
 6755 * @dev: device
 6756 * @iter: list_head ** of the current position
 6757 *
 6758 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 6759 * list, starting from iter position. The caller must hold RCU read lock.
 6760 */
 6761void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 6762					struct list_head **iter)
 6763{
 6764	struct netdev_adjacent *lower;
 6765
 6766	WARN_ON_ONCE(!rcu_read_lock_held());
 6767
 6768	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6769
 6770	if (&lower->list == &dev->adj_list.lower)
 6771		return NULL;
 6772
 6773	*iter = &lower->list;
 6774
 6775	return lower->private;
 6776}
 6777EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 6778
 6779/**
 6780 * netdev_lower_get_next - Get the next device from the lower neighbour
 6781 *                         list
 6782 * @dev: device
 6783 * @iter: list_head ** of the current position
 6784 *
 6785 * Gets the next netdev_adjacent from the dev's lower neighbour
 6786 * list, starting from iter position. The caller must hold RTNL lock or
 6787 * its own locking that guarantees that the neighbour lower
 6788 * list will remain unchanged.
 6789 */
 6790void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 6791{
 6792	struct netdev_adjacent *lower;
 6793
 6794	lower = list_entry(*iter, struct netdev_adjacent, list);
 6795
 6796	if (&lower->list == &dev->adj_list.lower)
 6797		return NULL;
 6798
 6799	*iter = lower->list.next;
 6800
 6801	return lower->dev;
 6802}
 6803EXPORT_SYMBOL(netdev_lower_get_next);
 6804
 6805static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 6806						struct list_head **iter)
 6807{
 6808	struct netdev_adjacent *lower;
 6809
 6810	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 6811
 6812	if (&lower->list == &dev->adj_list.lower)
 6813		return NULL;
 6814
 6815	*iter = &lower->list;
 6816
 6817	return lower->dev;
 6818}
 6819
 6820static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 6821						  struct list_head **iter,
 6822						  bool *ignore)
 6823{
 6824	struct netdev_adjacent *lower;
 6825
 6826	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 6827
 6828	if (&lower->list == &dev->adj_list.lower)
 6829		return NULL;
 6830
 6831	*iter = &lower->list;
 6832	*ignore = lower->ignore;
 6833
 6834	return lower->dev;
 6835}
 6836
 6837int netdev_walk_all_lower_dev(struct net_device *dev,
 6838			      int (*fn)(struct net_device *dev,
 6839					void *data),
 6840			      void *data)
 6841{
 6842	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6843	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6844	int ret, cur = 0;
 6845
 6846	now = dev;
 6847	iter = &dev->adj_list.lower;
 6848
 6849	while (1) {
 6850		if (now != dev) {
 6851			ret = fn(now, data);
 6852			if (ret)
 6853				return ret;
 6854		}
 6855
 6856		next = NULL;
 6857		while (1) {
 6858			ldev = netdev_next_lower_dev(now, &iter);
 6859			if (!ldev)
 6860				break;
 6861
 6862			next = ldev;
 6863			niter = &ldev->adj_list.lower;
 6864			dev_stack[cur] = now;
 6865			iter_stack[cur++] = iter;
 6866			break;
 6867		}
 6868
 6869		if (!next) {
 6870			if (!cur)
 6871				return 0;
 6872			next = dev_stack[--cur];
 6873			niter = iter_stack[cur];
 6874		}
 6875
 6876		now = next;
 6877		iter = niter;
 6878	}
 6879
 6880	return 0;
 6881}
 6882EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 6883
 6884static int __netdev_walk_all_lower_dev(struct net_device *dev,
 6885				       int (*fn)(struct net_device *dev,
 6886						 void *data),
 6887				       void *data)
 6888{
 6889	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6890	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6891	int ret, cur = 0;
 6892	bool ignore;
 6893
 6894	now = dev;
 6895	iter = &dev->adj_list.lower;
 6896
 6897	while (1) {
 6898		if (now != dev) {
 6899			ret = fn(now, data);
 6900			if (ret)
 6901				return ret;
 6902		}
 6903
 6904		next = NULL;
 6905		while (1) {
 6906			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 6907			if (!ldev)
 6908				break;
 6909			if (ignore)
 6910				continue;
 6911
 6912			next = ldev;
 6913			niter = &ldev->adj_list.lower;
 6914			dev_stack[cur] = now;
 6915			iter_stack[cur++] = iter;
 6916			break;
 6917		}
 6918
 6919		if (!next) {
 6920			if (!cur)
 6921				return 0;
 6922			next = dev_stack[--cur];
 6923			niter = iter_stack[cur];
 6924		}
 6925
 6926		now = next;
 6927		iter = niter;
 6928	}
 6929
 6930	return 0;
 6931}
 6932
 6933static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 6934						    struct list_head **iter)
 6935{
 6936	struct netdev_adjacent *lower;
 6937
 6938	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6939	if (&lower->list == &dev->adj_list.lower)
 6940		return NULL;
 6941
 6942	*iter = &lower->list;
 6943
 6944	return lower->dev;
 6945}
 6946
 6947static u8 __netdev_upper_depth(struct net_device *dev)
 6948{
 6949	struct net_device *udev;
 6950	struct list_head *iter;
 6951	u8 max_depth = 0;
 6952	bool ignore;
 6953
 6954	for (iter = &dev->adj_list.upper,
 6955	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 6956	     udev;
 6957	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 6958		if (ignore)
 6959			continue;
 6960		if (max_depth < udev->upper_level)
 6961			max_depth = udev->upper_level;
 6962	}
 6963
 6964	return max_depth;
 6965}
 6966
 6967static u8 __netdev_lower_depth(struct net_device *dev)
 6968{
 6969	struct net_device *ldev;
 6970	struct list_head *iter;
 6971	u8 max_depth = 0;
 6972	bool ignore;
 6973
 6974	for (iter = &dev->adj_list.lower,
 6975	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 6976	     ldev;
 6977	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 6978		if (ignore)
 6979			continue;
 6980		if (max_depth < ldev->lower_level)
 6981			max_depth = ldev->lower_level;
 6982	}
 6983
 6984	return max_depth;
 6985}
 6986
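/* dev->upper_level and dev->lower_level cache how deep the stacked-device
 * graph is above and below a device.  They are recomputed after every
 * link/unlink so that __netdev_upper_dev_link() can reject stacks deeper
 * than MAX_NEST_DEV with -EMLINK.
 */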
 6987static int __netdev_update_upper_level(struct net_device *dev, void *data)
 6988{
 6989	dev->upper_level = __netdev_upper_depth(dev) + 1;
 6990	return 0;
 6991}
 6992
 6993static int __netdev_update_lower_level(struct net_device *dev, void *data)
 6994{
 6995	dev->lower_level = __netdev_lower_depth(dev) + 1;
 6996	return 0;
 6997}
 6998
 6999int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7000				  int (*fn)(struct net_device *dev,
 7001					    void *data),
 7002				  void *data)
 7003{
 7004	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7005	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7006	int ret, cur = 0;
 7007
 7008	now = dev;
 7009	iter = &dev->adj_list.lower;
 7010
 7011	while (1) {
 7012		if (now != dev) {
 7013			ret = fn(now, data);
 7014			if (ret)
 7015				return ret;
 7016		}
 7017
 7018		next = NULL;
 7019		while (1) {
 7020			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7021			if (!ldev)
 7022				break;
 7023
 7024			next = ldev;
 7025			niter = &ldev->adj_list.lower;
 7026			dev_stack[cur] = now;
 7027			iter_stack[cur++] = iter;
 7028			break;
 7029		}
 7030
 7031		if (!next) {
 7032			if (!cur)
 7033				return 0;
 7034			next = dev_stack[--cur];
 7035			niter = iter_stack[cur];
 7036		}
 7037
 7038		now = next;
 7039		iter = niter;
 7040	}
 7041
 7042	return 0;
 7043}
 7044EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7045
 7046/**
 7047 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7048 *				       lower neighbour list, RCU
 7049 *				       variant
 7050 * @dev: device
 7051 *
 7052 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7053 * list. The caller must hold RCU read lock.
 7054 */
 7055void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7056{
 7057	struct netdev_adjacent *lower;
 7058
 7059	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7060			struct netdev_adjacent, list);
 7061	if (lower)
 7062		return lower->private;
 7063	return NULL;
 7064}
 7065EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7066
 7067/**
 7068 * netdev_master_upper_dev_get_rcu - Get master upper device
 7069 * @dev: device
 7070 *
 7071 * Find a master upper device and return pointer to it or NULL in case
 7072 * it's not there. The caller must hold the RCU read lock.
 7073 */
 7074struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7075{
 7076	struct netdev_adjacent *upper;
 7077
 7078	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7079				       struct netdev_adjacent, list);
 7080	if (upper && likely(upper->master))
 7081		return upper->dev;
 7082	return NULL;
 7083}
 7084EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7085
 7086static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7087			      struct net_device *adj_dev,
 7088			      struct list_head *dev_list)
 7089{
 7090	char linkname[IFNAMSIZ+7];
 7091
 7092	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7093		"upper_%s" : "lower_%s", adj_dev->name);
 7094	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7095				 linkname);
 7096}
 7097static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7098			       char *name,
 7099			       struct list_head *dev_list)
 7100{
 7101	char linkname[IFNAMSIZ+7];
 7102
 7103	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7104		"upper_%s" : "lower_%s", name);
 7105	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7106}
 7107
 7108static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7109						 struct net_device *adj_dev,
 7110						 struct list_head *dev_list)
 7111{
 7112	return (dev_list == &dev->adj_list.upper ||
 7113		dev_list == &dev->adj_list.lower) &&
 7114		net_eq(dev_net(dev), dev_net(adj_dev));
 7115}
 7116
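/* Record an adjacency from @dev to @adj_dev on @dev_list.  A repeated link
 * between the same pair only bumps ->ref_nr; the first insertion takes a
 * dev_hold() on @adj_dev, creates the "upper_<name>"/"lower_<name>" sysfs
 * links for directly adjacent devices in the same netns, and puts a master
 * link at the head of the list with an extra "master" symlink.
 */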
 7117static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7118					struct net_device *adj_dev,
 7119					struct list_head *dev_list,
 7120					void *private, bool master)
 7121{
 7122	struct netdev_adjacent *adj;
 7123	int ret;
 7124
 7125	adj = __netdev_find_adj(adj_dev, dev_list);
 7126
 7127	if (adj) {
 7128		adj->ref_nr += 1;
 7129		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7130			 dev->name, adj_dev->name, adj->ref_nr);
 7131
 7132		return 0;
 7133	}
 7134
 7135	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7136	if (!adj)
 7137		return -ENOMEM;
 7138
 7139	adj->dev = adj_dev;
 7140	adj->master = master;
 7141	adj->ref_nr = 1;
 7142	adj->private = private;
 7143	adj->ignore = false;
 7144	dev_hold(adj_dev);
 7145
 7146	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7147		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7148
 7149	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7150		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7151		if (ret)
 7152			goto free_adj;
 7153	}
 7154
 7155	/* Ensure that master link is always the first item in list. */
 7156	if (master) {
 7157		ret = sysfs_create_link(&(dev->dev.kobj),
 7158					&(adj_dev->dev.kobj), "master");
 7159		if (ret)
 7160			goto remove_symlinks;
 7161
 7162		list_add_rcu(&adj->list, dev_list);
 7163	} else {
 7164		list_add_tail_rcu(&adj->list, dev_list);
 7165	}
 7166
 7167	return 0;
 7168
 7169remove_symlinks:
 7170	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7171		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7172free_adj:
 7173	kfree(adj);
 7174	dev_put(adj_dev);
 7175
 7176	return ret;
 7177}
 7178
 7179static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7180					 struct net_device *adj_dev,
 7181					 u16 ref_nr,
 7182					 struct list_head *dev_list)
 7183{
 7184	struct netdev_adjacent *adj;
 7185
 7186	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7187		 dev->name, adj_dev->name, ref_nr);
 7188
 7189	adj = __netdev_find_adj(adj_dev, dev_list);
 7190
 7191	if (!adj) {
 7192		pr_err("Adjacency does not exist for device %s from %s\n",
 7193		       dev->name, adj_dev->name);
 7194		WARN_ON(1);
 7195		return;
 7196	}
 7197
 7198	if (adj->ref_nr > ref_nr) {
 7199		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7200			 dev->name, adj_dev->name, ref_nr,
 7201			 adj->ref_nr - ref_nr);
 7202		adj->ref_nr -= ref_nr;
 7203		return;
 7204	}
 7205
 7206	if (adj->master)
 7207		sysfs_remove_link(&(dev->dev.kobj), "master");
 7208
 7209	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7210		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7211
 7212	list_del_rcu(&adj->list);
 7213	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7214		 adj_dev->name, dev->name, adj_dev->name);
 7215	dev_put(adj_dev);
 7216	kfree_rcu(adj, rcu);
 7217}
 7218
 7219static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7220					    struct net_device *upper_dev,
 7221					    struct list_head *up_list,
 7222					    struct list_head *down_list,
 7223					    void *private, bool master)
 7224{
 7225	int ret;
 7226
 7227	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7228					   private, master);
 7229	if (ret)
 7230		return ret;
 7231
 7232	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7233					   private, false);
 7234	if (ret) {
 7235		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7236		return ret;
 7237	}
 7238
 7239	return 0;
 7240}
 7241
 7242static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7243					       struct net_device *upper_dev,
 7244					       u16 ref_nr,
 7245					       struct list_head *up_list,
 7246					       struct list_head *down_list)
 7247{
 7248	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7249	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7250}
 7251
 7252static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7253						struct net_device *upper_dev,
 7254						void *private, bool master)
 7255{
 7256	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7257						&dev->adj_list.upper,
 7258						&upper_dev->adj_list.lower,
 7259						private, master);
 7260}
 7261
 7262static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7263						   struct net_device *upper_dev)
 7264{
 7265	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7266					   &dev->adj_list.upper,
 7267					   &upper_dev->adj_list.lower);
 7268}
 7269
 7270static int __netdev_upper_dev_link(struct net_device *dev,
 7271				   struct net_device *upper_dev, bool master,
 7272				   void *upper_priv, void *upper_info,
 7273				   struct netlink_ext_ack *extack)
 7274{
 7275	struct netdev_notifier_changeupper_info changeupper_info = {
 7276		.info = {
 7277			.dev = dev,
 7278			.extack = extack,
 7279		},
 7280		.upper_dev = upper_dev,
 7281		.master = master,
 7282		.linking = true,
 7283		.upper_info = upper_info,
 7284	};
 7285	struct net_device *master_dev;
 7286	int ret = 0;
 7287
 7288	ASSERT_RTNL();
 7289
 7290	if (dev == upper_dev)
 7291		return -EBUSY;
 7292
 7293	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7294	if (__netdev_has_upper_dev(upper_dev, dev))
 7295		return -EBUSY;
 7296
 7297	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7298		return -EMLINK;
 7299
 7300	if (!master) {
 7301		if (__netdev_has_upper_dev(dev, upper_dev))
 7302			return -EEXIST;
 7303	} else {
 7304		master_dev = __netdev_master_upper_dev_get(dev);
 7305		if (master_dev)
 7306			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7307	}
 7308
 7309	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7310					    &changeupper_info.info);
 7311	ret = notifier_to_errno(ret);
 7312	if (ret)
 7313		return ret;
 7314
 7315	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7316						   master);
 7317	if (ret)
 7318		return ret;
 7319
 7320	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7321					    &changeupper_info.info);
 7322	ret = notifier_to_errno(ret);
 7323	if (ret)
 7324		goto rollback;
 7325
 7326	__netdev_update_upper_level(dev, NULL);
 7327	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7328
 7329	__netdev_update_lower_level(upper_dev, NULL);
 7330	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7331				    NULL);
 7332
 7333	return 0;
 7334
 7335rollback:
 7336	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7337
 7338	return ret;
 7339}
 7340
 7341/**
 7342 * netdev_upper_dev_link - Add a link to the upper device
 7343 * @dev: device
 7344 * @upper_dev: new upper device
 7345 * @extack: netlink extended ack
 7346 *
 7347 * Adds a link to device which is upper to this one. The caller must hold
 7348 * the RTNL lock. On a failure a negative errno code is returned.
 7349 * On success the reference counts are adjusted and the function
 7350 * returns zero.
 7351 */
 7352int netdev_upper_dev_link(struct net_device *dev,
 7353			  struct net_device *upper_dev,
 7354			  struct netlink_ext_ack *extack)
 7355{
 7356	return __netdev_upper_dev_link(dev, upper_dev, false,
 7357				       NULL, NULL, extack);
 7358}
 7359EXPORT_SYMBOL(netdev_upper_dev_link);
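/* Typical users: VLAN and macvlan devices link themselves with
 * netdev_upper_dev_link(), while bonding, team and bridge ports use
 * netdev_master_upper_dev_link() below so that at most one master is
 * recorded per lower device.
 */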
 7360
 7361/**
 7362 * netdev_master_upper_dev_link - Add a master link to the upper device
 7363 * @dev: device
 7364 * @upper_dev: new upper device
 7365 * @upper_priv: upper device private
 7366 * @upper_info: upper info to be passed down via notifier
 7367 * @extack: netlink extended ack
 7368 *
 7369 * Adds a link to device which is upper to this one. In this case, only
 7370 * one master upper device can be linked, although other non-master devices
 7371 * might be linked as well. The caller must hold the RTNL lock.
 7372 * On a failure a negative errno code is returned. On success the reference
 7373 * counts are adjusted and the function returns zero.
 7374 */
 7375int netdev_master_upper_dev_link(struct net_device *dev,
 7376				 struct net_device *upper_dev,
 7377				 void *upper_priv, void *upper_info,
 7378				 struct netlink_ext_ack *extack)
 7379{
 7380	return __netdev_upper_dev_link(dev, upper_dev, true,
 7381				       upper_priv, upper_info, extack);
 7382}
 7383EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7384
 7385/**
 7386 * netdev_upper_dev_unlink - Removes a link to upper device
 7387 * @dev: device
 7388 * @upper_dev: upper device to unlink
 7389 *
 7390 * Removes a link to device which is upper to this one. The caller must hold
 7391 * the RTNL lock.
 7392 */
 7393void netdev_upper_dev_unlink(struct net_device *dev,
 7394			     struct net_device *upper_dev)
 7395{
 7396	struct netdev_notifier_changeupper_info changeupper_info = {
 7397		.info = {
 7398			.dev = dev,
 7399		},
 7400		.upper_dev = upper_dev,
 7401		.linking = false,
 7402	};
 7403
 7404	ASSERT_RTNL();
 7405
 7406	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7407
 7408	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7409				      &changeupper_info.info);
 7410
 7411	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7412
 7413	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7414				      &changeupper_info.info);
 7415
 7416	__netdev_update_upper_level(dev, NULL);
 7417	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7418
 7419	__netdev_update_lower_level(upper_dev, NULL);
 7420	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7421				    NULL);
 7422}
 7423EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7424
 7425static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7426				      struct net_device *lower_dev,
 7427				      bool val)
 7428{
 7429	struct netdev_adjacent *adj;
 7430
 7431	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7432	if (adj)
 7433		adj->ignore = val;
 7434
 7435	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7436	if (adj)
 7437		adj->ignore = val;
 7438}
 7439
 7440static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7441					struct net_device *lower_dev)
 7442{
 7443	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7444}
 7445
 7446static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7447				       struct net_device *lower_dev)
 7448{
 7449	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7450}
 7451
 7452int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7453				   struct net_device *new_dev,
 7454				   struct net_device *dev,
 7455				   struct netlink_ext_ack *extack)
 7456{
 7457	int err;
 7458
 7459	if (!new_dev)
 7460		return 0;
 7461
 7462	if (old_dev && new_dev != old_dev)
 7463		netdev_adjacent_dev_disable(dev, old_dev);
 7464
 7465	err = netdev_upper_dev_link(new_dev, dev, extack);
 7466	if (err) {
 7467		if (old_dev && new_dev != old_dev)
 7468			netdev_adjacent_dev_enable(dev, old_dev);
 7469		return err;
 7470	}
 7471
 7472	return 0;
 7473}
 7474EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7475
 7476void netdev_adjacent_change_commit(struct net_device *old_dev,
 7477				   struct net_device *new_dev,
 7478				   struct net_device *dev)
 7479{
 7480	if (!new_dev || !old_dev)
 7481		return;
 7482
 7483	if (new_dev == old_dev)
 7484		return;
 7485
 7486	netdev_adjacent_dev_enable(dev, old_dev);
 7487	netdev_upper_dev_unlink(old_dev, dev);
 7488}
 7489EXPORT_SYMBOL(netdev_adjacent_change_commit);
 7490
 7491void netdev_adjacent_change_abort(struct net_device *old_dev,
 7492				  struct net_device *new_dev,
 7493				  struct net_device *dev)
 7494{
 7495	if (!new_dev)
 7496		return;
 7497
 7498	if (old_dev && new_dev != old_dev)
 7499		netdev_adjacent_dev_enable(dev, old_dev);
 7500
 7501	netdev_upper_dev_unlink(new_dev, dev);
 7502}
 7503EXPORT_SYMBOL(netdev_adjacent_change_abort);
 7504
 7505/**
 7506 * netdev_bonding_info_change - Dispatch event about slave change
 7507 * @dev: device
 7508 * @bonding_info: info to dispatch
 7509 *
 7510 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 7511 * The caller must hold the RTNL lock.
 7512 */
 7513void netdev_bonding_info_change(struct net_device *dev,
 7514				struct netdev_bonding_info *bonding_info)
 7515{
 7516	struct netdev_notifier_bonding_info info = {
 7517		.info.dev = dev,
 7518	};
 7519
 7520	memcpy(&info.bonding_info, bonding_info,
 7521	       sizeof(struct netdev_bonding_info));
 7522	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 7523				      &info.info);
 7524}
 7525EXPORT_SYMBOL(netdev_bonding_info_change);
 7526
 7527static void netdev_adjacent_add_links(struct net_device *dev)
 7528{
 7529	struct netdev_adjacent *iter;
 7530
 7531	struct net *net = dev_net(dev);
 7532
 7533	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7534		if (!net_eq(net, dev_net(iter->dev)))
 7535			continue;
 7536		netdev_adjacent_sysfs_add(iter->dev, dev,
 7537					  &iter->dev->adj_list.lower);
 7538		netdev_adjacent_sysfs_add(dev, iter->dev,
 7539					  &dev->adj_list.upper);
 7540	}
 7541
 7542	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7543		if (!net_eq(net, dev_net(iter->dev)))
 7544			continue;
 7545		netdev_adjacent_sysfs_add(iter->dev, dev,
 7546					  &iter->dev->adj_list.upper);
 7547		netdev_adjacent_sysfs_add(dev, iter->dev,
 7548					  &dev->adj_list.lower);
 7549	}
 7550}
 7551
 7552static void netdev_adjacent_del_links(struct net_device *dev)
 7553{
 7554	struct netdev_adjacent *iter;
 7555
 7556	struct net *net = dev_net(dev);
 7557
 7558	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7559		if (!net_eq(net, dev_net(iter->dev)))
 7560			continue;
 7561		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 7562					  &iter->dev->adj_list.lower);
 7563		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 7564					  &dev->adj_list.upper);
 7565	}
 7566
 7567	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7568		if (!net_eq(net, dev_net(iter->dev)))
 7569			continue;
 7570		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 7571					  &iter->dev->adj_list.upper);
 7572		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 7573					  &dev->adj_list.lower);
 7574	}
 7575}
 7576
 7577void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 7578{
 7579	struct netdev_adjacent *iter;
 7580
 7581	struct net *net = dev_net(dev);
 7582
 7583	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7584		if (!net_eq(net, dev_net(iter->dev)))
 7585			continue;
 7586		netdev_adjacent_sysfs_del(iter->dev, oldname,
 7587					  &iter->dev->adj_list.lower);
 7588		netdev_adjacent_sysfs_add(iter->dev, dev,
 7589					  &iter->dev->adj_list.lower);
 7590	}
 7591
 7592	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7593		if (!net_eq(net, dev_net(iter->dev)))
 7594			continue;
 7595		netdev_adjacent_sysfs_del(iter->dev, oldname,
 7596					  &iter->dev->adj_list.upper);
 7597		netdev_adjacent_sysfs_add(iter->dev, dev,
 7598					  &iter->dev->adj_list.upper);
 7599	}
 7600}
 7601
 7602void *netdev_lower_dev_get_private(struct net_device *dev,
 7603				   struct net_device *lower_dev)
 7604{
 7605	struct netdev_adjacent *lower;
 7606
 7607	if (!lower_dev)
 7608		return NULL;
 7609	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 7610	if (!lower)
 7611		return NULL;
 7612
 7613	return lower->private;
 7614}
 7615EXPORT_SYMBOL(netdev_lower_dev_get_private);
 7616
 7617
 7618/**
 7619 * netdev_lower_state_changed - Dispatch event about lower device state change
 7620 * @lower_dev: device
 7621 * @lower_state_info: state to dispatch
 7622 *
 7623 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 7624 * The caller must hold the RTNL lock.
 7625 */
 7626void netdev_lower_state_changed(struct net_device *lower_dev,
 7627				void *lower_state_info)
 7628{
 7629	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 7630		.info.dev = lower_dev,
 7631	};
 7632
 7633	ASSERT_RTNL();
 7634	changelowerstate_info.lower_state_info = lower_state_info;
 7635	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 7636				      &changelowerstate_info.info);
 7637}
 7638EXPORT_SYMBOL(netdev_lower_state_changed);
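
/*
 * Editor's sketch (not part of dev.c): a LAG/team style driver
 * reporting one slave's state to the devices stacked on top of it.
 * "example_lag_report_slave" is hypothetical; struct
 * netdev_lag_lower_state_info comes from <linux/netdevice.h>.
 */
static void example_lag_report_slave(struct net_device *slave_dev,
				     bool link_up, bool tx_enabled)
{
	struct netdev_lag_lower_state_info info = {
		.link_up	= link_up,
		.tx_enabled	= tx_enabled,
	};

	/* RTNL must already be held, as documented above. */
	netdev_lower_state_changed(slave_dev, &info);
}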
 7639
 7640static void dev_change_rx_flags(struct net_device *dev, int flags)
 7641{
 7642	const struct net_device_ops *ops = dev->netdev_ops;
 7643
 7644	if (ops->ndo_change_rx_flags)
 7645		ops->ndo_change_rx_flags(dev, flags);
 7646}
 7647
 7648static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 7649{
 7650	unsigned int old_flags = dev->flags;
 7651	kuid_t uid;
 7652	kgid_t gid;
 7653
 7654	ASSERT_RTNL();
 7655
 7656	dev->flags |= IFF_PROMISC;
 7657	dev->promiscuity += inc;
 7658	if (dev->promiscuity == 0) {
 7659		/*
 7660		 * Avoid overflow.
 7661		 * If inc causes overflow, leave promisc untouched and return an error.
 7662		 */
 7663		if (inc < 0)
 7664			dev->flags &= ~IFF_PROMISC;
 7665		else {
 7666			dev->promiscuity -= inc;
 7667			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
 7668				dev->name);
 7669			return -EOVERFLOW;
 7670		}
 7671	}
 7672	if (dev->flags != old_flags) {
 7673		pr_info("device %s %s promiscuous mode\n",
 7674			dev->name,
 7675			dev->flags & IFF_PROMISC ? "entered" : "left");
 7676		if (audit_enabled) {
 7677			current_uid_gid(&uid, &gid);
 7678			audit_log(audit_context(), GFP_ATOMIC,
 7679				  AUDIT_ANOM_PROMISCUOUS,
 7680				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 7681				  dev->name, (dev->flags & IFF_PROMISC),
 7682				  (old_flags & IFF_PROMISC),
 7683				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 7684				  from_kuid(&init_user_ns, uid),
 7685				  from_kgid(&init_user_ns, gid),
 7686				  audit_get_sessionid(current));
 7687		}
 7688
 7689		dev_change_rx_flags(dev, IFF_PROMISC);
 7690	}
 7691	if (notify)
 7692		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
 7693	return 0;
 7694}
 7695
 7696/**
 7697 *	dev_set_promiscuity	- update promiscuity count on a device
 7698 *	@dev: device
 7699 *	@inc: modifier
 7700 *
 7701 *	Add or remove promiscuity from a device. While the count in the device
 7702 *	remains above zero the interface remains promiscuous. Once it hits zero
 7703 *	the device reverts back to normal filtering operation. A negative inc
 7704 *	the device reverts to normal filtering operation. A negative @inc
 7705 *	Return 0 if successful or a negative errno code on error.
 7706 */
 7707int dev_set_promiscuity(struct net_device *dev, int inc)
 7708{
 7709	unsigned int old_flags = dev->flags;
 7710	int err;
 7711
 7712	err = __dev_set_promiscuity(dev, inc, true);
 7713	if (err < 0)
 7714		return err;
 7715	if (dev->flags != old_flags)
 7716		dev_set_rx_mode(dev);
 7717	return err;
 7718}
 7719EXPORT_SYMBOL(dev_set_promiscuity);
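
/*
 * Editor's sketch (not part of dev.c): a capture-style user taking a
 * promiscuity reference for the lifetime of its session and dropping
 * it afterwards.  The "example_*" functions are hypothetical; note the
 * helper is reference counted, so +1 and -1 calls must balance.
 */
static int example_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* +1 while capturing */
	rtnl_unlock();
	return err;
}

static void example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}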
 7720
 7721static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 7722{
 7723	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 7724
 7725	ASSERT_RTNL();
 7726
 7727	dev->flags |= IFF_ALLMULTI;
 7728	dev->allmulti += inc;
 7729	if (dev->allmulti == 0) {
 7730		/*
 7731		 * Avoid overflow.
 7732		 * If inc causes overflow, leave allmulti untouched and return an error.
 7733		 */
 7734		if (inc < 0)
 7735			dev->flags &= ~IFF_ALLMULTI;
 7736		else {
 7737			dev->allmulti -= inc;
 7738			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
 7739				dev->name);
 7740			return -EOVERFLOW;
 7741		}
 7742	}
 7743	if (dev->flags ^ old_flags) {
 7744		dev_change_rx_flags(dev, IFF_ALLMULTI);
 7745		dev_set_rx_mode(dev);
 7746		if (notify)
 7747			__dev_notify_flags(dev, old_flags,
 7748					   dev->gflags ^ old_gflags);
 7749	}
 7750	return 0;
 7751}
 7752
 7753/**
 7754 *	dev_set_allmulti	- update allmulti count on a device
 7755 *	@dev: device
 7756 *	@inc: modifier
 7757 *
 7758 *	Add or remove reception of all multicast frames to a device. While the
 7759 *	count in the device remains above zero the interface keeps listening
 7760 *	to all multicast frames. Once it hits zero the device reverts to normal
 7761 *	filtering operation. A negative @inc value is used to drop the counter
 7762 *	when releasing a resource needing all multicasts.
 7763 *	Return 0 if successful or a negative errno code on error.
 7764 */
 7765
 7766int dev_set_allmulti(struct net_device *dev, int inc)
 7767{
 7768	return __dev_set_allmulti(dev, inc, true);
 7769}
 7770EXPORT_SYMBOL(dev_set_allmulti);
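
/*
 * Editor's sketch (not part of dev.c): a tunnel or routing protocol
 * that needs every multicast frame bumps the allmulti count while it
 * is active.  "example_mcast_listener_set" is a hypothetical helper;
 * the caller is assumed to hold RTNL, as dev_set_allmulti() requires.
 */
static int example_mcast_listener_set(struct net_device *dev, bool enable)
{
	ASSERT_RTNL();
	return dev_set_allmulti(dev, enable ? 1 : -1);
}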
 7771
 7772/*
 7773 *	Upload unicast and multicast address lists to device and
 7774 *	configure RX filtering. When the device doesn't support unicast
 7775 *	filtering it is put in promiscuous mode while unicast addresses
 7776 *	are present.
 7777 */
 7778void __dev_set_rx_mode(struct net_device *dev)
 7779{
 7780	const struct net_device_ops *ops = dev->netdev_ops;
 7781
 7782	/* dev_open will call this function so the list will stay sane. */
 7783	if (!(dev->flags&IFF_UP))
 7784		return;
 7785
 7786	if (!netif_device_present(dev))
 7787		return;
 7788
 7789	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 7790		/* Unicast address changes may only happen under the rtnl,
 7791		 * therefore calling __dev_set_promiscuity here is safe.
 7792		 */
 7793		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 7794			__dev_set_promiscuity(dev, 1, false);
 7795			dev->uc_promisc = true;
 7796		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 7797			__dev_set_promiscuity(dev, -1, false);
 7798			dev->uc_promisc = false;
 7799		}
 7800	}
 7801
 7802	if (ops->ndo_set_rx_mode)
 7803		ops->ndo_set_rx_mode(dev);
 7804}
 7805
 7806void dev_set_rx_mode(struct net_device *dev)
 7807{
 7808	netif_addr_lock_bh(dev);
 7809	__dev_set_rx_mode(dev);
 7810	netif_addr_unlock_bh(dev);
 7811}
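
/*
 * Editor's sketch (not part of dev.c): the driver side of the rx-mode
 * path above.  ndo_set_rx_mode() is invoked with the address list lock
 * held and BHs disabled (see dev_set_rx_mode()), so it must not sleep.
 * "example_ndo_set_rx_mode" and the hardware-programming comments are
 * hypothetical.
 */
static void example_ndo_set_rx_mode(struct net_device *dev)
{
	struct netdev_hw_addr *ha;

	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
		/* open the hardware filter accordingly */
	}

	netdev_for_each_uc_addr(ha, dev) {
		/* program ha->addr into the unicast filter */
	}
	netdev_for_each_mc_addr(ha, dev) {
		/* program ha->addr into the multicast filter */
	}
}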
 7812
 7813/**
 7814 *	dev_get_flags - get flags reported to userspace
 7815 *	@dev: device
 7816 *
 7817 *	Get the combination of flag bits exported through APIs to userspace.
 7818 */
 7819unsigned int dev_get_flags(const struct net_device *dev)
 7820{
 7821	unsigned int flags;
 7822
 7823	flags = (dev->flags & ~(IFF_PROMISC |
 7824				IFF_ALLMULTI |
 7825				IFF_RUNNING |
 7826				IFF_LOWER_UP |
 7827				IFF_DORMANT)) |
 7828		(dev->gflags & (IFF_PROMISC |
 7829				IFF_ALLMULTI));
 7830
 7831	if (netif_running(dev)) {
 7832		if (netif_oper_up(dev))
 7833			flags |= IFF_RUNNING;
 7834		if (netif_carrier_ok(dev))
 7835			flags |= IFF_LOWER_UP;
 7836		if (netif_dormant(dev))
 7837			flags |= IFF_DORMANT;
 7838	}
 7839
 7840	return flags;
 7841}
 7842EXPORT_SYMBOL(dev_get_flags);
 7843
 7844int __dev_change_flags(struct net_device *dev, unsigned int flags,
 7845		       struct netlink_ext_ack *extack)
 7846{
 7847	unsigned int old_flags = dev->flags;
 7848	int ret;
 7849
 7850	ASSERT_RTNL();
 7851
 7852	/*
 7853	 *	Set the flags on our device.
 7854	 */
 7855
 7856	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 7857			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 7858			       IFF_AUTOMEDIA)) |
 7859		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 7860				    IFF_ALLMULTI));
 7861
 7862	/*
 7863	 *	Load in the correct multicast list now the flags have changed.
 7864	 */
 7865
 7866	if ((old_flags ^ flags) & IFF_MULTICAST)
 7867		dev_change_rx_flags(dev, IFF_MULTICAST);
 7868
 7869	dev_set_rx_mode(dev);
 7870
 7871	/*
 7872	 *	Have we downed the interface? We handle IFF_UP ourselves
 7873	 *	according to user attempts to set it, rather than blindly
 7874	 *	setting it.
 7875	 */
 7876
 7877	ret = 0;
 7878	if ((old_flags ^ flags) & IFF_UP) {
 7879		if (old_flags & IFF_UP)
 7880			__dev_close(dev);
 7881		else
 7882			ret = __dev_open(dev, extack);
 7883	}
 7884
 7885	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 7886		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 7887		unsigned int old_flags = dev->flags;
 7888
 7889		dev->gflags ^= IFF_PROMISC;
 7890
 7891		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 7892			if (dev->flags != old_flags)
 7893				dev_set_rx_mode(dev);
 7894	}
 7895
 7896	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 7897	 * is important. Some (broken) drivers set IFF_PROMISC when
 7898	 * IFF_ALLMULTI is requested, without asking us and without reporting it.
 7899	 */
 7900	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 7901		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 7902
 7903		dev->gflags ^= IFF_ALLMULTI;
 7904		__dev_set_allmulti(dev, inc, false);
 7905	}
 7906
 7907	return ret;
 7908}
 7909
 7910void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 7911			unsigned int gchanges)
 7912{
 7913	unsigned int changes = dev->flags ^ old_flags;
 7914
 7915	if (gchanges)
 7916		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
 7917
 7918	if (changes & IFF_UP) {
 7919		if (dev->flags & IFF_UP)
 7920			call_netdevice_notifiers(NETDEV_UP, dev);
 7921		else
 7922			call_netdevice_notifiers(NETDEV_DOWN, dev);
 7923	}
 7924
 7925	if (dev->flags & IFF_UP &&
 7926	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 7927		struct netdev_notifier_change_info change_info = {
 7928			.info = {
 7929				.dev = dev,
 7930			},
 7931			.flags_changed = changes,
 7932		};
 7933
 7934		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 7935	}
 7936}
 7937
 7938/**
 7939 *	dev_change_flags - change device settings
 7940 *	@dev: device
 7941 *	@flags: device state flags
 7942 *	@extack: netlink extended ack
 7943 *
 7944 *	Change settings on device based state flags. The flags are
 7945 *	in the userspace exported format.
 7946 */
 7947int dev_change_flags(struct net_device *dev, unsigned int flags,
 7948		     struct netlink_ext_ack *extack)
 7949{
 7950	int ret;
 7951	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 7952
 7953	ret = __dev_change_flags(dev, flags, extack);
 7954	if (ret < 0)
 7955		return ret;
 7956
 7957	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 7958	__dev_notify_flags(dev, old_flags, changes);
 7959	return ret;
 7960}
 7961EXPORT_SYMBOL(dev_change_flags);
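
/*
 * Editor's sketch (not part of dev.c): administratively bringing an
 * interface up the same way the ioctl/netlink paths do, by editing the
 * userspace-format flags returned by dev_get_flags().
 * "example_bring_up" is hypothetical.
 */
static int example_bring_up(struct net_device *dev,
			    struct netlink_ext_ack *extack)
{
	unsigned int flags;

	ASSERT_RTNL();
	flags = dev_get_flags(dev);		/* userspace-exported format */
	return dev_change_flags(dev, flags | IFF_UP, extack);
}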
 7962
 7963int __dev_set_mtu(struct net_device *dev, int new_mtu)
 7964{
 7965	const struct net_device_ops *ops = dev->netdev_ops;
 7966
 7967	if (ops->ndo_change_mtu)
 7968		return ops->ndo_change_mtu(dev, new_mtu);
 7969
 7970	dev->mtu = new_mtu;
 7971	return 0;
 7972}
 7973EXPORT_SYMBOL(__dev_set_mtu);
 7974
 7975/**
 7976 *	dev_set_mtu_ext - Change maximum transmission unit (MTU)
 7977 *	@dev: device
 7978 *	@new_mtu: new transfer unit
 7979 *	@extack: netlink extended ack
 7980 *
 7981 *	Change the maximum transfer size of the network device.
 7982 */
 7983int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 7984		    struct netlink_ext_ack *extack)
 7985{
 7986	int err, orig_mtu;
 7987
 7988	if (new_mtu == dev->mtu)
 7989		return 0;
 7990
 7991	/* MTU must be positive, and in range */
 7992	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 7993		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 7994		return -EINVAL;
 7995	}
 7996
 7997	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 7998		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 7999		return -EINVAL;
 8000	}
 8001
 8002	if (!netif_device_present(dev))
 8003		return -ENODEV;
 8004
 8005	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8006	err = notifier_to_errno(err);
 8007	if (err)
 8008		return err;
 8009
 8010	orig_mtu = dev->mtu;
 8011	err = __dev_set_mtu(dev, new_mtu);
 8012
 8013	if (!err) {
 8014		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8015						   orig_mtu);
 8016		err = notifier_to_errno(err);
 8017		if (err) {
 8018			/* setting mtu back and notifying everyone again,
 8019			 * so that they have a chance to revert changes.
 8020			 */
 8021			__dev_set_mtu(dev, orig_mtu);
 8022			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8023						     new_mtu);
 8024		}
 8025	}
 8026	return err;
 8027}
 8028
 8029int dev_set_mtu(struct net_device *dev, int new_mtu)
 8030{
 8031	struct netlink_ext_ack extack;
 8032	int err;
 8033
 8034	memset(&extack, 0, sizeof(extack));
 8035	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8036	if (err && extack._msg)
 8037		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8038	return err;
 8039}
 8040EXPORT_SYMBOL(dev_set_mtu);
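
/*
 * Editor's sketch (not part of dev.c): a best-effort MTU change that
 * clamps the request into the device's advertised [min_mtu, max_mtu]
 * window before calling dev_set_mtu(), which would otherwise return
 * -EINVAL (see dev_set_mtu_ext() above).  "example_set_mtu_clamped"
 * is hypothetical; the caller is assumed to hold RTNL.
 */
static int example_set_mtu_clamped(struct net_device *dev, int wanted_mtu)
{
	int mtu = wanted_mtu;

	if (mtu < (int)dev->min_mtu)
		mtu = dev->min_mtu;
	if (dev->max_mtu && mtu > (int)dev->max_mtu)
		mtu = dev->max_mtu;

	ASSERT_RTNL();
	return dev_set_mtu(dev, mtu);
}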
 8041
 8042/**
 8043 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8044 *	@dev: device
 8045 *	@new_len: new tx queue length
 8046 */
 8047int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8048{
 8049	unsigned int orig_len = dev->tx_queue_len;
 8050	int res;
 8051
 8052	if (new_len != (unsigned int)new_len)
 8053		return -ERANGE;
 8054
 8055	if (new_len != orig_len) {
 8056		dev->tx_queue_len = new_len;
 8057		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8058		res = notifier_to_errno(res);
 8059		if (res)
 8060			goto err_rollback;
 8061		res = dev_qdisc_change_tx_queue_len(dev);
 8062		if (res)
 8063			goto err_rollback;
 8064	}
 8065
 8066	return 0;
 8067
 8068err_rollback:
 8069	netdev_err(dev, "refused to change device tx_queue_len\n");
 8070	dev->tx_queue_len = orig_len;
 8071	return res;
 8072}
 8073
 8074/**
 8075 *	dev_set_group - Change group this device belongs to
 8076 *	@dev: device
 8077 *	@new_group: group this device should belong to
 8078 */
 8079void dev_set_group(struct net_device *dev, int new_group)
 8080{
 8081	dev->group = new_group;
 8082}
 8083EXPORT_SYMBOL(dev_set_group);
 8084
 8085/**
 8086 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8087 *	@dev: device
 8088 *	@addr: new address
 8089 *	@extack: netlink extended ack
 8090 */
 8091int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8092			      struct netlink_ext_ack *extack)
 8093{
 8094	struct netdev_notifier_pre_changeaddr_info info = {
 8095		.info.dev = dev,
 8096		.info.extack = extack,
 8097		.dev_addr = addr,
 8098	};
 8099	int rc;
 8100
 8101	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8102	return notifier_to_errno(rc);
 8103}
 8104EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8105
 8106/**
 8107 *	dev_set_mac_address - Change Media Access Control Address
 8108 *	@dev: device
 8109 *	@sa: new address
 8110 *	@extack: netlink extended ack
 8111 *
 8112 *	Change the hardware (MAC) address of the device
 8113 */
 8114int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8115			struct netlink_ext_ack *extack)
 8116{
 8117	const struct net_device_ops *ops = dev->netdev_ops;
 8118	int err;
 8119
 8120	if (!ops->ndo_set_mac_address)
 8121		return -EOPNOTSUPP;
 8122	if (sa->sa_family != dev->type)
 8123		return -EINVAL;
 8124	if (!netif_device_present(dev))
 8125		return -ENODEV;
 8126	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8127	if (err)
 8128		return err;
 8129	err = ops->ndo_set_mac_address(dev, sa);
 8130	if (err)
 8131		return err;
 8132	dev->addr_assign_type = NET_ADDR_SET;
 8133	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8134	add_device_randomness(dev->dev_addr, dev->addr_len);
 8135	return 0;
 8136}
 8137EXPORT_SYMBOL(dev_set_mac_address);
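
/*
 * Editor's sketch (not part of dev.c): assigning a random MAC address
 * through the same path netlink uses.  It assumes an Ethernet-style
 * device (dev->type == ARPHRD_ETHER), since sa_family must match
 * dev->type (checked above), and the caller must hold RTNL.
 * "example_set_random_mac" is hypothetical.
 */
static int example_set_random_mac(struct net_device *dev,
				  struct netlink_ext_ack *extack)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;
	eth_random_addr((u8 *)sa.sa_data);

	ASSERT_RTNL();
	return dev_set_mac_address(dev, &sa, extack);
}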
 8138
 8139/**
 8140 *	dev_change_carrier - Change device carrier
 8141 *	@dev: device
 8142 *	@new_carrier: new value
 8143 *
 8144 *	Change device carrier
 8145 */
 8146int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8147{
 8148	const struct net_device_ops *ops = dev->netdev_ops;
 8149
 8150	if (!ops->ndo_change_carrier)
 8151		return -EOPNOTSUPP;
 8152	if (!netif_device_present(dev))
 8153		return -ENODEV;
 8154	return ops->ndo_change_carrier(dev, new_carrier);
 8155}
 8156EXPORT_SYMBOL(dev_change_carrier);
 8157
 8158/**
 8159 *	dev_get_phys_port_id - Get device physical port ID
 8160 *	@dev: device
 8161 *	@ppid: port ID
 8162 *
 8163 *	Get device physical port ID
 8164 */
 8165int dev_get_phys_port_id(struct net_device *dev,
 8166			 struct netdev_phys_item_id *ppid)
 8167{
 8168	const struct net_device_ops *ops = dev->netdev_ops;
 8169
 8170	if (!ops->ndo_get_phys_port_id)
 8171		return -EOPNOTSUPP;
 8172	return ops->ndo_get_phys_port_id(dev, ppid);
 8173}
 8174EXPORT_SYMBOL(dev_get_phys_port_id);
 8175
 8176/**
 8177 *	dev_get_phys_port_name - Get device physical port name
 8178 *	@dev: device
 8179 *	@name: port name
 8180 *	@len: limit of bytes to copy to name
 8181 *
 8182 *	Get device physical port name
 8183 */
 8184int dev_get_phys_port_name(struct net_device *dev,
 8185			   char *name, size_t len)
 8186{
 8187	const struct net_device_ops *ops = dev->netdev_ops;
 8188	int err;
 8189
 8190	if (ops->ndo_get_phys_port_name) {
 8191		err = ops->ndo_get_phys_port_name(dev, name, len);
 8192		if (err != -EOPNOTSUPP)
 8193			return err;
 8194	}
 8195	return devlink_compat_phys_port_name_get(dev, name, len);
 8196}
 8197EXPORT_SYMBOL(dev_get_phys_port_name);
 8198
 8199/**
 8200 *	dev_get_port_parent_id - Get the device's port parent identifier
 8201 *	@dev: network device
 8202 *	@ppid: pointer to a storage for the port's parent identifier
 8203 *	@recurse: allow/disallow recursion to lower devices
 8204 *
 8205 *	Get the device's port parent identifier
 8206 */
 8207int dev_get_port_parent_id(struct net_device *dev,
 8208			   struct netdev_phys_item_id *ppid,
 8209			   bool recurse)
 8210{
 8211	const struct net_device_ops *ops = dev->netdev_ops;
 8212	struct netdev_phys_item_id first = { };
 8213	struct net_device *lower_dev;
 8214	struct list_head *iter;
 8215	int err;
 8216
 8217	if (ops->ndo_get_port_parent_id) {
 8218		err = ops->ndo_get_port_parent_id(dev, ppid);
 8219		if (err != -EOPNOTSUPP)
 8220			return err;
 8221	}
 8222
 8223	err = devlink_compat_switch_id_get(dev, ppid);
 8224	if (!err || err != -EOPNOTSUPP)
 8225		return err;
 8226
 8227	if (!recurse)
 8228		return -EOPNOTSUPP;
 8229
 8230	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8231		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
 8232		if (err)
 8233			break;
 8234		if (!first.id_len)
 8235			first = *ppid;
 8236		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8237			return -ENODATA;
 8238	}
 8239
 8240	return err;
 8241}
 8242EXPORT_SYMBOL(dev_get_port_parent_id);
 8243
 8244/**
 8245 *	netdev_port_same_parent_id - Indicate if two network devices have
 8246 *	the same port parent identifier
 8247 *	@a: first network device
 8248 *	@b: second network device
 8249 */
 8250bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8251{
 8252	struct netdev_phys_item_id a_id = { };
 8253	struct netdev_phys_item_id b_id = { };
 8254
 8255	if (dev_get_port_parent_id(a, &a_id, true) ||
 8256	    dev_get_port_parent_id(b, &b_id, true))
 8257		return false;
 8258
 8259	return netdev_phys_item_id_same(&a_id, &b_id);
 8260}
 8261EXPORT_SYMBOL(netdev_port_same_parent_id);
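
/*
 * Editor's sketch (not part of dev.c): reading and printing the port
 * parent (switch) ID via the recursive lookup above, formatted in the
 * same "%*phN" style the phys_switch_id sysfs attribute uses.
 * "example_print_switch_id" is hypothetical.
 */
static int example_print_switch_id(struct net_device *dev)
{
	struct netdev_phys_item_id ppid = { };
	int err;

	err = dev_get_port_parent_id(dev, &ppid, true);
	if (err)
		return err;

	netdev_info(dev, "switch id: %*phN\n", ppid.id_len, ppid.id);
	return 0;
}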
 8262
 8263/**
 8264 *	dev_change_proto_down - update protocol port state information
 8265 *	@dev: device
 8266 *	@proto_down: new value
 8267 *
 8268 *	This info can be used by switch drivers to set the phys state of the
 8269 *	port.
 8270 */
 8271int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8272{
 8273	const struct net_device_ops *ops = dev->netdev_ops;
 8274
 8275	if (!ops->ndo_change_proto_down)
 8276		return -EOPNOTSUPP;
 8277	if (!netif_device_present(dev))
 8278		return -ENODEV;
 8279	return ops->ndo_change_proto_down(dev, proto_down);
 8280}
 8281EXPORT_SYMBOL(dev_change_proto_down);
 8282
 8283/**
 8284 *	dev_change_proto_down_generic - generic implementation for
 8285 * 	ndo_change_proto_down that sets carrier according to
 8286 * 	proto_down.
 8287 *
 8288 *	@dev: device
 8289 *	@proto_down: new value
 8290 */
 8291int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 8292{
 8293	if (proto_down)
 8294		netif_carrier_off(dev);
 8295	else
 8296		netif_carrier_on(dev);
 8297	dev->proto_down = proto_down;
 8298	return 0;
 8299}
 8300EXPORT_SYMBOL(dev_change_proto_down_generic);
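
/*
 * Editor's sketch (not part of dev.c): a driver that needs no extra
 * proto_down handling can wrap (or directly assign) the generic helper
 * above in its ndo_change_proto_down.  The "example_*" names are
 * hypothetical.
 */
static int example_ndo_change_proto_down(struct net_device *dev,
					 bool proto_down)
{
	return dev_change_proto_down_generic(dev, proto_down);
}

static const struct net_device_ops example_proto_down_ops = {
	.ndo_change_proto_down	= example_ndo_change_proto_down,
};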
 8301
 8302u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
 8303		    enum bpf_netdev_command cmd)
 8304{
 8305	struct netdev_bpf xdp;
 8306
 8307	if (!bpf_op)
 8308		return 0;
 8309
 8310	memset(&xdp, 0, sizeof(xdp));
 8311	xdp.command = cmd;
 8312
 8313	/* Query must always succeed. */
 8314	WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
 8315
 8316	return xdp.prog_id;
 8317}
 8318
 8319static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 8320			   struct netlink_ext_ack *extack, u32 flags,
 8321			   struct bpf_prog *prog)
 8322{
 8323	struct netdev_bpf xdp;
 8324
 8325	memset(&xdp, 0, sizeof(xdp));
 8326	if (flags & XDP_FLAGS_HW_MODE)
 8327		xdp.command = XDP_SETUP_PROG_HW;
 8328	else
 8329		xdp.command = XDP_SETUP_PROG;
 8330	xdp.extack = extack;
 8331	xdp.flags = flags;
 8332	xdp.prog = prog;
 8333
 8334	return bpf_op(dev, &xdp);
 8335}
 8336
 8337static void dev_xdp_uninstall(struct net_device *dev)
 8338{
 8339	struct netdev_bpf xdp;
 8340	bpf_op_t ndo_bpf;
 8341
 8342	/* Remove generic XDP */
 8343	WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
 8344
 8345	/* Remove from the driver */
 8346	ndo_bpf = dev->netdev_ops->ndo_bpf;
 8347	if (!ndo_bpf)
 8348		return;
 8349
 8350	memset(&xdp, 0, sizeof(xdp));
 8351	xdp.command = XDP_QUERY_PROG;
 8352	WARN_ON(ndo_bpf(dev, &xdp));
 8353	if (xdp.prog_id)
 8354		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
 8355					NULL));
 8356
 8357	/* Remove HW offload */
 8358	memset(&xdp, 0, sizeof(xdp));
 8359	xdp.command = XDP_QUERY_PROG_HW;
 8360	if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
 8361		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
 8362					NULL));
 8363}
 8364
 8365/**
 8366 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 8367 *	@dev: device
 8368 *	@extack: netlink extended ack
 8369 *	@fd: new program fd or negative value to clear
 8370 *	@flags: xdp-related flags
 8371 *
 8372 *	Set or clear a bpf program for a device
 8373 */
 8374int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 8375		      int fd, u32 flags)
 8376{
 8377	const struct net_device_ops *ops = dev->netdev_ops;
 8378	enum bpf_netdev_command query;
 8379	struct bpf_prog *prog = NULL;
 8380	bpf_op_t bpf_op, bpf_chk;
 8381	bool offload;
 8382	int err;
 8383
 8384	ASSERT_RTNL();
 8385
 8386	offload = flags & XDP_FLAGS_HW_MODE;
 8387	query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
 8388
 8389	bpf_op = bpf_chk = ops->ndo_bpf;
 8390	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
 8391		NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
 8392		return -EOPNOTSUPP;
 8393	}
 8394	if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
 8395		bpf_op = generic_xdp_install;
 8396	if (bpf_op == bpf_chk)
 8397		bpf_chk = generic_xdp_install;
 8398
 8399	if (fd >= 0) {
 8400		u32 prog_id;
 8401
 8402		if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
 8403			NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
 8404			return -EEXIST;
 8405		}
 8406
 8407		prog_id = __dev_xdp_query(dev, bpf_op, query);
 8408		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
 8409			NL_SET_ERR_MSG(extack, "XDP program already attached");
 8410			return -EBUSY;
 8411		}
 8412
 8413		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 8414					     bpf_op == ops->ndo_bpf);
 8415		if (IS_ERR(prog))
 8416			return PTR_ERR(prog);
 8417
 8418		if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
 8419			NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
 8420			bpf_prog_put(prog);
 8421			return -EINVAL;
 8422		}
 8423
 8424		/* prog->aux->id may be 0 for orphaned device-bound progs */
 8425		if (prog->aux->id && prog->aux->id == prog_id) {
 8426			bpf_prog_put(prog);
 8427			return 0;
 8428		}
 8429	} else {
 8430		if (!__dev_xdp_query(dev, bpf_op, query))
 8431			return 0;
 8432	}
 8433
 8434	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
 8435	if (err < 0 && prog)
 8436		bpf_prog_put(prog);
 8437
 8438	return err;
 8439}
 8440
 8441/**
 8442 *	dev_new_index	-	allocate an ifindex
 8443 *	@net: the applicable net namespace
 8444 *
 8445 *	Returns a suitable unique value for a new device interface
 8446 *	number.  The caller must hold the rtnl semaphore or the
 8447 *	dev_base_lock to be sure it remains unique.
 8448 */
 8449static int dev_new_index(struct net *net)
 8450{
 8451	int ifindex = net->ifindex;
 8452
 8453	for (;;) {
 8454		if (++ifindex <= 0)
 8455			ifindex = 1;
 8456		if (!__dev_get_by_index(net, ifindex))
 8457			return net->ifindex = ifindex;
 8458	}
 8459}
 8460
 8461/* Delayed registration/unregistration */
 8462static LIST_HEAD(net_todo_list);
 8463DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 8464
 8465static void net_set_todo(struct net_device *dev)
 8466{
 8467	list_add_tail(&dev->todo_list, &net_todo_list);
 8468	dev_net(dev)->dev_unreg_count++;
 8469}
 8470
 8471static void rollback_registered_many(struct list_head *head)
 8472{
 8473	struct net_device *dev, *tmp;
 8474	LIST_HEAD(close_head);
 8475
 8476	BUG_ON(dev_boot_phase);
 8477	ASSERT_RTNL();
 8478
 8479	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 8480		/* Some devices call without registering
 8481		 * for initialization unwind. Remove those
 8482		 * devices and proceed with the remaining.
 8483		 */
 8484		if (dev->reg_state == NETREG_UNINITIALIZED) {
 8485			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 8486				 dev->name, dev);
 8487
 8488			WARN_ON(1);
 8489			list_del(&dev->unreg_list);
 8490			continue;
 8491		}
 8492		dev->dismantle = true;
 8493		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 8494	}
 8495
 8496	/* If device is running, close it first. */
 8497	list_for_each_entry(dev, head, unreg_list)
 8498		list_add_tail(&dev->close_list, &close_head);
 8499	dev_close_many(&close_head, true);
 8500
 8501	list_for_each_entry(dev, head, unreg_list) {
 8502		/* And unlink it from device chain. */
 8503		unlist_netdevice(dev);
 8504
 8505		dev->reg_state = NETREG_UNREGISTERING;
 8506	}
 8507	flush_all_backlogs();
 8508
 8509	synchronize_net();
 8510
 8511	list_for_each_entry(dev, head, unreg_list) {
 8512		struct sk_buff *skb = NULL;
 8513
 8514		/* Shutdown queueing discipline. */
 8515		dev_shutdown(dev);
 8516
 8517		dev_xdp_uninstall(dev);
 8518
 8519		/* Notify protocols that we are about to destroy
 8520		 * this device. They should clean all the things.
 8521		 */
 8522		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 8523
 8524		if (!dev->rtnl_link_ops ||
 8525		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 8526			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 8527						     GFP_KERNEL, NULL, 0);
 8528
 8529		/*
 8530		 *	Flush the unicast and multicast chains
 8531		 */
 8532		dev_uc_flush(dev);
 8533		dev_mc_flush(dev);
 8534
 8535		if (dev->netdev_ops->ndo_uninit)
 8536			dev->netdev_ops->ndo_uninit(dev);
 8537
 8538		if (skb)
 8539			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 8540
 8541		/* Notifier chain MUST detach us all upper devices. */
 8542		WARN_ON(netdev_has_any_upper_dev(dev));
 8543		WARN_ON(netdev_has_any_lower_dev(dev));
 8544
 8545		/* Remove entries from kobject tree */
 8546		netdev_unregister_kobject(dev);
 8547#ifdef CONFIG_XPS
 8548		/* Remove XPS queueing entries */
 8549		netif_reset_xps_queues_gt(dev, 0);
 8550#endif
 8551	}
 8552
 8553	synchronize_net();
 8554
 8555	list_for_each_entry(dev, head, unreg_list)
 8556		dev_put(dev);
 8557}
 8558
 8559static void rollback_registered(struct net_device *dev)
 8560{
 8561	LIST_HEAD(single);
 8562
 8563	list_add(&dev->unreg_list, &single);
 8564	rollback_registered_many(&single);
 8565	list_del(&single);
 8566}
 8567
 8568static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 8569	struct net_device *upper, netdev_features_t features)
 8570{
 8571	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 8572	netdev_features_t feature;
 8573	int feature_bit;
 8574
 8575	for_each_netdev_feature(upper_disables, feature_bit) {
 8576		feature = __NETIF_F_BIT(feature_bit);
 8577		if (!(upper->wanted_features & feature)
 8578		    && (features & feature)) {
 8579			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 8580				   &feature, upper->name);
 8581			features &= ~feature;
 8582		}
 8583	}
 8584
 8585	return features;
 8586}
 8587
 8588static void netdev_sync_lower_features(struct net_device *upper,
 8589	struct net_device *lower, netdev_features_t features)
 8590{
 8591	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 8592	netdev_features_t feature;
 8593	int feature_bit;
 8594
 8595	for_each_netdev_feature(upper_disables, feature_bit) {
 8596		feature = __NETIF_F_BIT(feature_bit);
 8597		if (!(features & feature) && (lower->features & feature)) {
 8598			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 8599				   &feature, lower->name);
 8600			lower->wanted_features &= ~feature;
 8601			netdev_update_features(lower);
 8602
 8603			if (unlikely(lower->features & feature))
 8604				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 8605					    &feature, lower->name);
 8606		}
 8607	}
 8608}
 8609
 8610static netdev_features_t netdev_fix_features(struct net_device *dev,
 8611	netdev_features_t features)
 8612{
 8613	/* Fix illegal checksum combinations */
 8614	if ((features & NETIF_F_HW_CSUM) &&
 8615	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 8616		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 8617		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 8618	}
 8619
 8620	/* TSO requires that SG is present as well. */
 8621	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 8622		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 8623		features &= ~NETIF_F_ALL_TSO;
 8624	}
 8625
 8626	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 8627					!(features & NETIF_F_IP_CSUM)) {
 8628		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 8629		features &= ~NETIF_F_TSO;
 8630		features &= ~NETIF_F_TSO_ECN;
 8631	}
 8632
 8633	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 8634					 !(features & NETIF_F_IPV6_CSUM)) {
 8635		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 8636		features &= ~NETIF_F_TSO6;
 8637	}
 8638
 8639	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 8640	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 8641		features &= ~NETIF_F_TSO_MANGLEID;
 8642
 8643	/* TSO ECN requires that TSO is present as well. */
 8644	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 8645		features &= ~NETIF_F_TSO_ECN;
 8646
 8647	/* Software GSO depends on SG. */
 8648	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 8649		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 8650		features &= ~NETIF_F_GSO;
 8651	}
 8652
 8653	/* GSO partial features require GSO partial be set */
 8654	if ((features & dev->gso_partial_features) &&
 8655	    !(features & NETIF_F_GSO_PARTIAL)) {
 8656		netdev_dbg(dev,
 8657			   "Dropping partially supported GSO features since no GSO partial.\n");
 8658		features &= ~dev->gso_partial_features;
 8659	}
 8660
 8661	if (!(features & NETIF_F_RXCSUM)) {
 8662		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 8663		 * successfully merged by hardware must also have the
 8664		 * checksum verified by hardware.  If the user does not
 8665		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 8666		 */
 8667		if (features & NETIF_F_GRO_HW) {
 8668			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 8669			features &= ~NETIF_F_GRO_HW;
 8670		}
 8671	}
 8672
 8673	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 8674	if (features & NETIF_F_RXFCS) {
 8675		if (features & NETIF_F_LRO) {
 8676			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 8677			features &= ~NETIF_F_LRO;
 8678		}
 8679
 8680		if (features & NETIF_F_GRO_HW) {
 8681			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 8682			features &= ~NETIF_F_GRO_HW;
 8683		}
 8684	}
 8685
 8686	return features;
 8687}
 8688
 8689int __netdev_update_features(struct net_device *dev)
 8690{
 8691	struct net_device *upper, *lower;
 8692	netdev_features_t features;
 8693	struct list_head *iter;
 8694	int err = -1;
 8695
 8696	ASSERT_RTNL();
 8697
 8698	features = netdev_get_wanted_features(dev);
 8699
 8700	if (dev->netdev_ops->ndo_fix_features)
 8701		features = dev->netdev_ops->ndo_fix_features(dev, features);
 8702
 8703	/* driver might be less strict about feature dependencies */
 8704	features = netdev_fix_features(dev, features);
 8705
 8706	/* some features can't be enabled if they're off on an upper device */
 8707	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 8708		features = netdev_sync_upper_features(dev, upper, features);
 8709
 8710	if (dev->features == features)
 8711		goto sync_lower;
 8712
 8713	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 8714		&dev->features, &features);
 8715
 8716	if (dev->netdev_ops->ndo_set_features)
 8717		err = dev->netdev_ops->ndo_set_features(dev, features);
 8718	else
 8719		err = 0;
 8720
 8721	if (unlikely(err < 0)) {
 8722		netdev_err(dev,
 8723			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 8724			err, &features, &dev->features);
 8725		/* return non-0 since some features might have changed and
 8726		 * it's better to fire a spurious notification than miss it
 8727		 */
 8728		return -1;
 8729	}
 8730
 8731sync_lower:
 8732	/* some features must be disabled on lower devices when disabled
 8733	 * on an upper device (think: bonding master or bridge)
 8734	 */
 8735	netdev_for_each_lower_dev(dev, lower, iter)
 8736		netdev_sync_lower_features(dev, lower, features);
 8737
 8738	if (!err) {
 8739		netdev_features_t diff = features ^ dev->features;
 8740
 8741		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 8742			/* udp_tunnel_{get,drop}_rx_info both need
 8743			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 8744			 * device, or they won't do anything.
 8745			 * Thus we need to update dev->features
 8746			 * *before* calling udp_tunnel_get_rx_info,
 8747			 * but *after* calling udp_tunnel_drop_rx_info.
 8748			 */
 8749			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 8750				dev->features = features;
 8751				udp_tunnel_get_rx_info(dev);
 8752			} else {
 8753				udp_tunnel_drop_rx_info(dev);
 8754			}
 8755		}
 8756
 8757		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
 8758			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
 8759				dev->features = features;
 8760				err |= vlan_get_rx_ctag_filter_info(dev);
 8761			} else {
 8762				vlan_drop_rx_ctag_filter_info(dev);
 8763			}
 8764		}
 8765
 8766		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
 8767			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
 8768				dev->features = features;
 8769				err |= vlan_get_rx_stag_filter_info(dev);
 8770			} else {
 8771				vlan_drop_rx_stag_filter_info(dev);
 8772			}
 8773		}
 8774
 8775		dev->features = features;
 8776	}
 8777
 8778	return err < 0 ? 0 : 1;
 8779}
 8780
 8781/**
 8782 *	netdev_update_features - recalculate device features
 8783 *	@dev: the device to check
 8784 *
 8785 *	Recalculate dev->features set and send notifications if it
 8786 *	has changed. Should be called after driver or hardware dependent
 8787 *	conditions that influence the features might have changed.
 8788 */
 8789void netdev_update_features(struct net_device *dev)
 8790{
 8791	if (__netdev_update_features(dev))
 8792		netdev_features_change(dev);
 8793}
 8794EXPORT_SYMBOL(netdev_update_features);
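
/*
 * Editor's sketch (not part of dev.c): the intended calling pattern.
 * A driver expresses feature constraints in ndo_fix_features() and
 * re-runs the feature machinery with netdev_update_features() whenever
 * those constraints change -- here a hypothetical "no LRO above 1500
 * bytes" rule re-evaluated from ndo_change_mtu().  All "example_*"
 * names, the constraint, and the ops wiring are hypothetical.
 */
static netdev_features_t example_fix_features(struct net_device *dev,
					      netdev_features_t features)
{
	if (dev->mtu > 1500)
		features &= ~NETIF_F_LRO;
	return features;
}

static int example_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;
	netdev_update_features(dev);	/* re-runs the fixup above */
	return 0;
}

static const struct net_device_ops example_feature_ops = {
	.ndo_fix_features	= example_fix_features,
	.ndo_change_mtu		= example_change_mtu,
};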
 8795
 8796/**
 8797 *	netdev_change_features - recalculate device features
 8798 *	@dev: the device to check
 8799 *
 8800 *	Recalculate dev->features set and send notifications even
 8801 *	if they have not changed. Should be called instead of
 8802 *	netdev_update_features() if dev->vlan_features might also
 8803 *	have changed to allow the changes to be propagated to stacked
 8804 *	VLAN devices.
 8805 */
 8806void netdev_change_features(struct net_device *dev)
 8807{
 8808	__netdev_update_features(dev);
 8809	netdev_features_change(dev);
 8810}
 8811EXPORT_SYMBOL(netdev_change_features);
 8812
 8813/**
 8814 *	netif_stacked_transfer_operstate -	transfer operstate
 8815 *	@rootdev: the root or lower level device to transfer state from
 8816 *	@dev: the device to transfer operstate to
 8817 *
 8818 *	Transfer operational state from root to device. This is normally
 8819 *	called when a stacking relationship exists between the root
 8820 *	device and the device (a leaf device).
 8821 */
 8822void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 8823					struct net_device *dev)
 8824{
 8825	if (rootdev->operstate == IF_OPER_DORMANT)
 8826		netif_dormant_on(dev);
 8827	else
 8828		netif_dormant_off(dev);
 8829
 8830	if (netif_carrier_ok(rootdev))
 8831		netif_carrier_on(dev);
 8832	else
 8833		netif_carrier_off(dev);
 8834}
 8835EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 8836
 8837static int netif_alloc_rx_queues(struct net_device *dev)
 8838{
 8839	unsigned int i, count = dev->num_rx_queues;
 8840	struct netdev_rx_queue *rx;
 8841	size_t sz = count * sizeof(*rx);
 8842	int err = 0;
 8843
 8844	BUG_ON(count < 1);
 8845
 8846	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 8847	if (!rx)
 8848		return -ENOMEM;
 8849
 8850	dev->_rx = rx;
 8851
 8852	for (i = 0; i < count; i++) {
 8853		rx[i].dev = dev;
 8854
 8855		/* XDP RX-queue setup */
 8856		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
 8857		if (err < 0)
 8858			goto err_rxq_info;
 8859	}
 8860	return 0;
 8861
 8862err_rxq_info:
 8863	/* Roll back successful registrations and free other resources */
 8864	while (i--)
 8865		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 8866	kvfree(dev->_rx);
 8867	dev->_rx = NULL;
 8868	return err;
 8869}
 8870
 8871static void netif_free_rx_queues(struct net_device *dev)
 8872{
 8873	unsigned int i, count = dev->num_rx_queues;
 8874
 8875	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 8876	if (!dev->_rx)
 8877		return;
 8878
 8879	for (i = 0; i < count; i++)
 8880		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 8881
 8882	kvfree(dev->_rx);
 8883}
 8884
 8885static void netdev_init_one_queue(struct net_device *dev,
 8886				  struct netdev_queue *queue, void *_unused)
 8887{
 8888	/* Initialize queue lock */
 8889	spin_lock_init(&queue->_xmit_lock);
 8890	lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key);
 8891	queue->xmit_lock_owner = -1;
 8892	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 8893	queue->dev = dev;
 8894#ifdef CONFIG_BQL
 8895	dql_init(&queue->dql, HZ);
 8896#endif
 8897}
 8898
 8899static void netif_free_tx_queues(struct net_device *dev)
 8900{
 8901	kvfree(dev->_tx);
 8902}
 8903
 8904static int netif_alloc_netdev_queues(struct net_device *dev)
 8905{
 8906	unsigned int count = dev->num_tx_queues;
 8907	struct netdev_queue *tx;
 8908	size_t sz = count * sizeof(*tx);
 8909
 8910	if (count < 1 || count > 0xffff)
 8911		return -EINVAL;
 8912
 8913	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 8914	if (!tx)
 8915		return -ENOMEM;
 8916
 8917	dev->_tx = tx;
 8918
 8919	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 8920	spin_lock_init(&dev->tx_global_lock);
 8921
 8922	return 0;
 8923}
 8924
 8925void netif_tx_stop_all_queues(struct net_device *dev)
 8926{
 8927	unsigned int i;
 8928
 8929	for (i = 0; i < dev->num_tx_queues; i++) {
 8930		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 8931
 8932		netif_tx_stop_queue(txq);
 8933	}
 8934}
 8935EXPORT_SYMBOL(netif_tx_stop_all_queues);
 8936
 8937static void netdev_register_lockdep_key(struct net_device *dev)
 8938{
 8939	lockdep_register_key(&dev->qdisc_tx_busylock_key);
 8940	lockdep_register_key(&dev->qdisc_running_key);
 8941	lockdep_register_key(&dev->qdisc_xmit_lock_key);
 8942	lockdep_register_key(&dev->addr_list_lock_key);
 8943}
 8944
 8945static void netdev_unregister_lockdep_key(struct net_device *dev)
 8946{
 8947	lockdep_unregister_key(&dev->qdisc_tx_busylock_key);
 8948	lockdep_unregister_key(&dev->qdisc_running_key);
 8949	lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
 8950	lockdep_unregister_key(&dev->addr_list_lock_key);
 8951}
 8952
 8953void netdev_update_lockdep_key(struct net_device *dev)
 8954{
 8955	struct netdev_queue *queue;
 8956	int i;
 8957
 8958	lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
 8959	lockdep_unregister_key(&dev->addr_list_lock_key);
 8960
 8961	lockdep_register_key(&dev->qdisc_xmit_lock_key);
 8962	lockdep_register_key(&dev->addr_list_lock_key);
 8963
 8964	lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
 8965	for (i = 0; i < dev->num_tx_queues; i++) {
 8966		queue = netdev_get_tx_queue(dev, i);
 8967
 8968		lockdep_set_class(&queue->_xmit_lock,
 8969				  &dev->qdisc_xmit_lock_key);
 8970	}
 8971}
 8972EXPORT_SYMBOL(netdev_update_lockdep_key);
 8973
 8974/**
 8975 *	register_netdevice	- register a network device
 8976 *	@dev: device to register
 8977 *
 8978 *	Take a completed network device structure and add it to the kernel
 8979 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 8980 *	chain. 0 is returned on success. A negative errno code is returned
 8981 *	on a failure to set up the device, or if the name is a duplicate.
 8982 *
 8983 *	Callers must hold the rtnl semaphore. You may want
 8984 *	register_netdev() instead of this.
 8985 *
 8986 *	BUGS:
 8987 *	The locking appears insufficient to guarantee two parallel registers
 8988 *	will not get the same name.
 8989 */
 8990
 8991int register_netdevice(struct net_device *dev)
 8992{
 8993	int ret;
 8994	struct net *net = dev_net(dev);
 8995
 8996	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
 8997		     NETDEV_FEATURE_COUNT);
 8998	BUG_ON(dev_boot_phase);
 8999	ASSERT_RTNL();
 9000
 9001	might_sleep();
 9002
 9003	/* When net_devices are persistent, this will be fatal. */
 9004	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 9005	BUG_ON(!net);
 9006
 9007	spin_lock_init(&dev->addr_list_lock);
 9008	lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
 9009
 9010	ret = dev_get_valid_name(net, dev, dev->name);
 9011	if (ret < 0)
 9012		goto out;
 9013
 9014	/* Init, if this function is available */
 9015	if (dev->netdev_ops->ndo_init) {
 9016		ret = dev->netdev_ops->ndo_init(dev);
 9017		if (ret) {
 9018			if (ret > 0)
 9019				ret = -EIO;
 9020			goto out;
 9021		}
 9022	}
 9023
 9024	if (((dev->hw_features | dev->features) &
 9025	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
 9026	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
 9027	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
 9028		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
 9029		ret = -EINVAL;
 9030		goto err_uninit;
 9031	}
 9032
 9033	ret = -EBUSY;
 9034	if (!dev->ifindex)
 9035		dev->ifindex = dev_new_index(net);
 9036	else if (__dev_get_by_index(net, dev->ifindex))
 9037		goto err_uninit;
 9038
 9039	/* Transfer changeable features to wanted_features and enable
 9040	 * software offloads (GSO and GRO).
 9041	 */
 9042	dev->hw_features |= NETIF_F_SOFT_FEATURES;
 9043	dev->features |= NETIF_F_SOFT_FEATURES;
 9044
 9045	if (dev->netdev_ops->ndo_udp_tunnel_add) {
 9046		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9047		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9048	}
 9049
 9050	dev->wanted_features = dev->features & dev->hw_features;
 9051
 9052	if (!(dev->flags & IFF_LOOPBACK))
 9053		dev->hw_features |= NETIF_F_NOCACHE_COPY;
 9054
 9055	/* If IPv4 TCP segmentation offload is supported we should also
 9056	 * allow the device to enable segmenting the frame with the option
 9057	 * of ignoring a static IP ID value.  This doesn't enable the
 9058	 * feature itself but allows the user to enable it later.
 9059	 */
 9060	if (dev->hw_features & NETIF_F_TSO)
 9061		dev->hw_features |= NETIF_F_TSO_MANGLEID;
 9062	if (dev->vlan_features & NETIF_F_TSO)
 9063		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
 9064	if (dev->mpls_features & NETIF_F_TSO)
 9065		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
 9066	if (dev->hw_enc_features & NETIF_F_TSO)
 9067		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 9068
 9069	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
 9070	 */
 9071	dev->vlan_features |= NETIF_F_HIGHDMA;
 9072
 9073	/* Make NETIF_F_SG inheritable to tunnel devices.
 9074	 */
 9075	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
 9076
 9077	/* Make NETIF_F_SG inheritable to MPLS.
 9078	 */
 9079	dev->mpls_features |= NETIF_F_SG;
 9080
 9081	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 9082	ret = notifier_to_errno(ret);
 9083	if (ret)
 9084		goto err_uninit;
 9085
 9086	ret = netdev_register_kobject(dev);
 9087	if (ret)
 9088		goto err_uninit;
 9089	dev->reg_state = NETREG_REGISTERED;
 9090
 9091	__netdev_update_features(dev);
 9092
 9093	/*
 9094	 *	Default initial state at registration is that the
 9095	 *	device is present.
 9096	 */
 9097
 9098	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9099
 9100	linkwatch_init_dev(dev);
 9101
 9102	dev_init_scheduler(dev);
 9103	dev_hold(dev);
 9104	list_netdevice(dev);
 9105	add_device_randomness(dev->dev_addr, dev->addr_len);
 9106
 9107	/* If the device has a permanent device address, the driver should
 9108	 * set dev_addr and also addr_assign_type should be set to
 9109	 * NET_ADDR_PERM (default value).
 9110	 */
 9111	if (dev->addr_assign_type == NET_ADDR_PERM)
 9112		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 9113
 9114	/* Notify protocols that a new device appeared. */
 9115	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
 9116	ret = notifier_to_errno(ret);
 9117	if (ret) {
 9118		rollback_registered(dev);
 9119		rcu_barrier();
 9120
 9121		dev->reg_state = NETREG_UNREGISTERED;
 9122	}
 9123	/*
 9124	 *	Prevent userspace races by waiting until the network
 9125	 *	device is fully set up before sending notifications.
 9126	 */
 9127	if (!dev->rtnl_link_ops ||
 9128	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 9129		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
 9130
 9131out:
 9132	return ret;
 9133
 9134err_uninit:
 9135	if (dev->netdev_ops->ndo_uninit)
 9136		dev->netdev_ops->ndo_uninit(dev);
 9137	if (dev->priv_destructor)
 9138		dev->priv_destructor(dev);
 9139	goto out;
 9140}
 9141EXPORT_SYMBOL(register_netdevice);
 9142
 9143/**
 9144 *	init_dummy_netdev	- init a dummy network device for NAPI
 9145 *	@dev: device to init
 9146 *
 9147 *	This takes a network device structure and initializes the minimum
 9148 *	number of fields so it can be used to schedule NAPI polls without
 9149 *	registering a full blown interface. This is to be used by drivers
 9150 *	that need to tie several hardware interfaces to a single NAPI
 9151 *	poll scheduler due to HW limitations.
 9152 */
 9153int init_dummy_netdev(struct net_device *dev)
 9154{
 9155	/* Clear everything. Note we don't initialize spinlocks
 9156	 * as they aren't supposed to be taken by any of the
 9157	 * NAPI code and this dummy netdev is supposed to be
 9158	 * only ever used for NAPI polls
 9159	 */
 9160	memset(dev, 0, sizeof(struct net_device));
 9161
 9162	/* make sure we BUG if trying to hit standard
 9163	 * register/unregister code path
 9164	 */
 9165	dev->reg_state = NETREG_DUMMY;
 9166
 9167	/* NAPI wants this */
 9168	INIT_LIST_HEAD(&dev->napi_list);
 9169
 9170	/* a dummy interface is started by default */
 9171	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9172	set_bit(__LINK_STATE_START, &dev->state);
 9173
 9174	/* napi_busy_loop stats accounting wants this */
 9175	dev_net_set(dev, &init_net);
 9176
 9177	/* Note: We don't allocate pcpu_refcnt for dummy devices,
 9178	 * because users of this 'device' don't need to change
 9179	 * its refcount.
 9180	 */
 9181
 9182	return 0;
 9183}
 9184EXPORT_SYMBOL_GPL(init_dummy_netdev);
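
/*
 * Editor's sketch (not part of dev.c): the usage pattern described
 * above -- a driver funnels several hardware queues into one NAPI
 * context hung off a dummy, never-registered netdev.  The "example_hw"
 * structure and helper are hypothetical; some wireless drivers do
 * something similar.
 */
struct example_hw {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static void example_hw_napi_init(struct example_hw *hw,
				 int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, poll, NAPI_POLL_WEIGHT);
	napi_enable(&hw->napi);
}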
 9185
 9186
 9187/**
 9188 *	register_netdev	- register a network device
 9189 *	@dev: device to register
 9190 *
 9191 *	Take a completed network device structure and add it to the kernel
 9192 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 9193 *	chain. 0 is returned on success. A negative errno code is returned
 9194 *	on a failure to set up the device, or if the name is a duplicate.
 9195 *
 9196 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 9197 *	and expands the device name if you passed a format string to
 9198 *	alloc_netdev.
 9199 */
 9200int register_netdev(struct net_device *dev)
 9201{
 9202	int err;
 9203
 9204	if (rtnl_lock_killable())
 9205		return -EINTR;
 9206	err = register_netdevice(dev);
 9207	rtnl_unlock();
 9208	return err;
 9209}
 9210EXPORT_SYMBOL(register_netdev);
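
/*
 * Editor's sketch (not part of dev.c): the canonical probe-time
 * pattern for a driver that passed a format string ("eth%d") via
 * alloc_etherdev().  The "example_*" names and the empty ops table are
 * hypothetical -- a real driver fills in ndo_open, ndo_start_xmit, etc.
 */
struct example_priv {
	int placeholder;
};

static const struct net_device_ops example_netdev_ops = {
	/* .ndo_open, .ndo_stop, .ndo_start_xmit, ... */
};

static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &example_netdev_ops;

	err = register_netdev(dev);	/* takes RTNL and expands "eth%d" */
	if (err)
		free_netdev(dev);
	return err;
}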
 9211
 9212int netdev_refcnt_read(const struct net_device *dev)
 9213{
 9214	int i, refcnt = 0;
 9215
 9216	for_each_possible_cpu(i)
 9217		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
 9218	return refcnt;
 9219}
 9220EXPORT_SYMBOL(netdev_refcnt_read);
 9221
 9222/**
 9223 * netdev_wait_allrefs - wait until all references are gone.
 9224 * @dev: target net_device
 9225 *
 9226 * This is called when unregistering network devices.
 9227 *
 9228 * Any protocol or device that holds a reference should register
 9229 * for netdevice notification, and clean up and put back the
 9230 * reference if they receive an UNREGISTER event.
 9231 * We can get stuck here if buggy protocols don't correctly
 9232 * call dev_put.
 9233 */
 9234static void netdev_wait_allrefs(struct net_device *dev)
 9235{
 9236	unsigned long rebroadcast_time, warning_time;
 9237	int refcnt;
 9238
 9239	linkwatch_forget_dev(dev);
 9240
 9241	rebroadcast_time = warning_time = jiffies;
 9242	refcnt = netdev_refcnt_read(dev);
 9243
 9244	while (refcnt != 0) {
 9245		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 9246			rtnl_lock();
 9247
 9248			/* Rebroadcast unregister notification */
 9249			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 9250
 9251			__rtnl_unlock();
 9252			rcu_barrier();
 9253			rtnl_lock();
 9254
 9255			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
 9256				     &dev->state)) {
 9257				/* We must not have linkwatch events
 9258				 * pending on unregister. If this
 9259				 * happens, we simply run the queue
 9260				 * unscheduled, resulting in a noop
 9261				 * for this device.
 9262				 */
 9263				linkwatch_run_queue();
 9264			}
 9265
 9266			__rtnl_unlock();
 9267
 9268			rebroadcast_time = jiffies;
 9269		}
 9270
 9271		msleep(250);
 9272
 9273		refcnt = netdev_refcnt_read(dev);
 9274
 9275		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
 9276			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
 9277				 dev->name, refcnt);
 9278			warning_time = jiffies;
 9279		}
 9280	}
 9281}
 9282
 9283/* The sequence is:
 9284 *
 9285 *	rtnl_lock();
 9286 *	...
 9287 *	register_netdevice(x1);
 9288 *	register_netdevice(x2);
 9289 *	...
 9290 *	unregister_netdevice(y1);
 9291 *	unregister_netdevice(y2);
 9292 *      ...
 9293 *	rtnl_unlock();
 9294 *	free_netdev(y1);
 9295 *	free_netdev(y2);
 9296 *
 9297 * We are invoked by rtnl_unlock().
 9298 * This allows us to deal with problems:
 9299 * 1) We can delete sysfs objects which invoke hotplug
 9300 *    without deadlocking with linkwatch via keventd.
 9301 * 2) Since we run with the RTNL semaphore not held, we can sleep
 9302 *    safely in order to wait for the netdev refcnt to drop to zero.
 9303 *
 9304 * We must not return until all unregister events added during
 9305 * the interval the lock was held have been completed.
 9306 */
 9307void netdev_run_todo(void)
 9308{
 9309	struct list_head list;
 9310
 9311	/* Snapshot list, allow later requests */
 9312	list_replace_init(&net_todo_list, &list);
 9313
 9314	__rtnl_unlock();
 9315
 9316
 9317	/* Wait for rcu callbacks to finish before next phase */
 9318	if (!list_empty(&list))
 9319		rcu_barrier();
 9320
 9321	while (!list_empty(&list)) {
 9322		struct net_device *dev
 9323			= list_first_entry(&list, struct net_device, todo_list);
 9324		list_del(&dev->todo_list);
 9325
 9326		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
 9327			pr_err("network todo '%s' but state %d\n",
 9328			       dev->name, dev->reg_state);
 9329			dump_stack();
 9330			continue;
 9331		}
 9332
 9333		dev->reg_state = NETREG_UNREGISTERED;
 9334
 9335		netdev_wait_allrefs(dev);
 9336
 9337		/* paranoia */
 9338		BUG_ON(netdev_refcnt_read(dev));
 9339		BUG_ON(!list_empty(&dev->ptype_all));
 9340		BUG_ON(!list_empty(&dev->ptype_specific));
 9341		WARN_ON(rcu_access_pointer(dev->ip_ptr));
 9342		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 9343#if IS_ENABLED(CONFIG_DECNET)
 9344		WARN_ON(dev->dn_ptr);
 9345#endif
 9346		if (dev->priv_destructor)
 9347			dev->priv_destructor(dev);
 9348		if (dev->needs_free_netdev)
 9349			free_netdev(dev);
 9350
 9351		/* Report a network device has been unregistered */
 9352		rtnl_lock();
 9353		dev_net(dev)->dev_unreg_count--;
 9354		__rtnl_unlock();
 9355		wake_up(&netdev_unregistering_wq);
 9356
 9357		/* Free network device */
 9358		kobject_put(&dev->dev.kobj);
 9359	}
 9360}
 9361
 9362/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 9363 * all the same fields in the same order as net_device_stats, with only
 9364 * the type differing, but rtnl_link_stats64 may have additional fields
 9365 * at the end for newer counters.
 9366 */
 9367void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 9368			     const struct net_device_stats *netdev_stats)
 9369{
 9370#if BITS_PER_LONG == 64
 9371	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
 9372	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
 9373	/* zero out counters that only exist in rtnl_link_stats64 */
 9374	memset((char *)stats64 + sizeof(*netdev_stats), 0,
 9375	       sizeof(*stats64) - sizeof(*netdev_stats));
 9376#else
 9377	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
 9378	const unsigned long *src = (const unsigned long *)netdev_stats;
 9379	u64 *dst = (u64 *)stats64;
 9380
 9381	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
 9382	for (i = 0; i < n; i++)
 9383		dst[i] = src[i];
 9384	/* zero out counters that only exist in rtnl_link_stats64 */
 9385	memset((char *)stats64 + n * sizeof(u64), 0,
 9386	       sizeof(*stats64) - n * sizeof(u64));
 9387#endif
 9388}
 9389EXPORT_SYMBOL(netdev_stats_to_stats64);
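
/* Usage sketch (hypothetical driver, with an assumed private struct and
 * counter): a driver that still maintains the legacy dev->stats counters
 * can forward them from its ndo_get_stats64() callback and then add any
 * 64-bit counters it keeps privately:
 *
 *	static void mydrv_get_stats64(struct net_device *dev,
 *				      struct rtnl_link_stats64 *storage)
 *	{
 *		struct mydrv_priv *priv = netdev_priv(dev);
 *
 *		netdev_stats_to_stats64(storage, &dev->stats);
 *		storage->rx_missed_errors += priv->rx_fifo_overruns;
 *	}
 */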
 9390
 9391/**
 9392 *	dev_get_stats	- get network device statistics
 9393 *	@dev: device to get statistics from
 9394 *	@storage: place to store stats
 9395 *
 9396 *	Get network statistics from device. Return @storage.
 9397 *	The device driver may provide its own method by setting
 9398 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 9399 *	otherwise the internal statistics structure is used.
 9400 */
 9401struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 9402					struct rtnl_link_stats64 *storage)
 9403{
 9404	const struct net_device_ops *ops = dev->netdev_ops;
 9405
 9406	if (ops->ndo_get_stats64) {
 9407		memset(storage, 0, sizeof(*storage));
 9408		ops->ndo_get_stats64(dev, storage);
 9409	} else if (ops->ndo_get_stats) {
 9410		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
 9411	} else {
 9412		netdev_stats_to_stats64(storage, &dev->stats);
 9413	}
 9414	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
 9415	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
 9416	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
 9417	return storage;
 9418}
 9419EXPORT_SYMBOL(dev_get_stats);
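
/* Usage sketch (hypothetical caller; hold RTNL or otherwise keep @dev
 * alive for the duration of the call):
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu packets received\n", dev->name,
 *		(unsigned long long)stats.rx_packets);
 */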
 9420
 9421struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 9422{
 9423	struct netdev_queue *queue = dev_ingress_queue(dev);
 9424
 9425#ifdef CONFIG_NET_CLS_ACT
 9426	if (queue)
 9427		return queue;
 9428	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
 9429	if (!queue)
 9430		return NULL;
 9431	netdev_init_one_queue(dev, queue, NULL);
 9432	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
 9433	queue->qdisc_sleeping = &noop_qdisc;
 9434	rcu_assign_pointer(dev->ingress_queue, queue);
 9435#endif
 9436	return queue;
 9437}
 9438
 9439static const struct ethtool_ops default_ethtool_ops;
 9440
 9441void netdev_set_default_ethtool_ops(struct net_device *dev,
 9442				    const struct ethtool_ops *ops)
 9443{
 9444	if (dev->ethtool_ops == &default_ethtool_ops)
 9445		dev->ethtool_ops = ops;
 9446}
 9447EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
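
/* Usage sketch: a support library that allocates net_devices on behalf of
 * drivers (the ops name below is an assumption) can install fallback
 * ethtool ops; a driver that has already set its own dev->ethtool_ops is
 * left untouched:
 *
 *	netdev_set_default_ethtool_ops(ndev, &mylib_default_ethtool_ops);
 */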
 9448
 9449void netdev_freemem(struct net_device *dev)
 9450{
 9451	char *addr = (char *)dev - dev->padded;
 9452
 9453	kvfree(addr);
 9454}
 9455
 9456/**
 9457 * alloc_netdev_mqs - allocate network device
 9458 * @sizeof_priv: size of private data to allocate space for
 9459 * @name: device name format string
 9460 * @name_assign_type: origin of device name
 9461 * @setup: callback to initialize device
 9462 * @txqs: the number of TX subqueues to allocate
 9463 * @rxqs: the number of RX subqueues to allocate
 9464 *
 9465 * Allocates a struct net_device with private data area for driver use
 9466 * and performs basic initialization.  Also allocates subqueue structs
 9467 * for each queue on the device.
 9468 */
 9469struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 9470		unsigned char name_assign_type,
 9471		void (*setup)(struct net_device *),
 9472		unsigned int txqs, unsigned int rxqs)
 9473{
 9474	struct net_device *dev;
 9475	unsigned int alloc_size;
 9476	struct net_device *p;
 9477
 9478	BUG_ON(strlen(name) >= sizeof(dev->name));
 9479
 9480	if (txqs < 1) {
 9481		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
 9482		return NULL;
 9483	}
 9484
 9485	if (rxqs < 1) {
 9486		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 9487		return NULL;
 9488	}
 9489
 9490	alloc_size = sizeof(struct net_device);
 9491	if (sizeof_priv) {
 9492		/* ensure 32-byte alignment of private area */
 9493		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
 9494		alloc_size += sizeof_priv;
 9495	}
 9496	/* ensure 32-byte alignment of whole construct */
 9497	alloc_size += NETDEV_ALIGN - 1;
 9498
 9499	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9500	if (!p)
 9501		return NULL;
 9502
 9503	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 9504	dev->padded = (char *)dev - (char *)p;
 9505
 9506	dev->pcpu_refcnt = alloc_percpu(int);
 9507	if (!dev->pcpu_refcnt)
 9508		goto free_dev;
 9509
 9510	if (dev_addr_init(dev))
 9511		goto free_pcpu;
 9512
 9513	dev_mc_init(dev);
 9514	dev_uc_init(dev);
 9515
 9516	dev_net_set(dev, &init_net);
 9517
 9518	netdev_register_lockdep_key(dev);
 9519
 9520	dev->gso_max_size = GSO_MAX_SIZE;
 9521	dev->gso_max_segs = GSO_MAX_SEGS;
 9522	dev->upper_level = 1;
 9523	dev->lower_level = 1;
 9524
 9525	INIT_LIST_HEAD(&dev->napi_list);
 9526	INIT_LIST_HEAD(&dev->unreg_list);
 9527	INIT_LIST_HEAD(&dev->close_list);
 9528	INIT_LIST_HEAD(&dev->link_watch_list);
 9529	INIT_LIST_HEAD(&dev->adj_list.upper);
 9530	INIT_LIST_HEAD(&dev->adj_list.lower);
 9531	INIT_LIST_HEAD(&dev->ptype_all);
 9532	INIT_LIST_HEAD(&dev->ptype_specific);
 9533#ifdef CONFIG_NET_SCHED
 9534	hash_init(dev->qdisc_hash);
 9535#endif
 9536	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 9537	setup(dev);
 9538
 9539	if (!dev->tx_queue_len) {
 9540		dev->priv_flags |= IFF_NO_QUEUE;
 9541		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
 9542	}
 9543
 9544	dev->num_tx_queues = txqs;
 9545	dev->real_num_tx_queues = txqs;
 9546	if (netif_alloc_netdev_queues(dev))
 9547		goto free_all;
 9548
 9549	dev->num_rx_queues = rxqs;
 9550	dev->real_num_rx_queues = rxqs;
 9551	if (netif_alloc_rx_queues(dev))
 9552		goto free_all;
 9553
 9554	strcpy(dev->name, name);
 9555	dev->name_assign_type = name_assign_type;
 9556	dev->group = INIT_NETDEV_GROUP;
 9557	if (!dev->ethtool_ops)
 9558		dev->ethtool_ops = &default_ethtool_ops;
 9559
 9560	nf_hook_ingress_init(dev);
 9561
 9562	return dev;
 9563
 9564free_all:
 9565	free_netdev(dev);
 9566	return NULL;
 9567
 9568free_pcpu:
 9569	free_percpu(dev->pcpu_refcnt);
 9570free_dev:
 9571	netdev_freemem(dev);
 9572	return NULL;
 9573}
 9574EXPORT_SYMBOL(alloc_netdev_mqs);
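
/* Usage sketch (hypothetical Ethernet driver probe; mydrv_priv is an
 * assumed private struct): most callers reach this function through the
 * alloc_netdev()/alloc_etherdev_mq() wrappers:
 *
 *	struct net_device *ndev;
 *	struct mydrv_priv *priv;
 *
 *	ndev = alloc_etherdev_mq(sizeof(*priv), 4);	// 4 TX and 4 RX queues
 *	if (!ndev)
 *		return -ENOMEM;
 *	priv = netdev_priv(ndev);
 */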
 9575
 9576/**
 9577 * free_netdev - free network device
 9578 * @dev: device
 9579 *
 9580 * This function does the last stage of destroying an allocated device
 9581 * interface. The reference to the device object is released. If this
 9582 * is the last reference then it will be freed. Must be called in process
 9583 * context.
 9584 */
 9585void free_netdev(struct net_device *dev)
 9586{
 9587	struct napi_struct *p, *n;
 9588
 9589	might_sleep();
 9590	netif_free_tx_queues(dev);
 9591	netif_free_rx_queues(dev);
 9592
 9593	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
 9594
 9595	/* Flush device addresses */
 9596	dev_addr_flush(dev);
 9597
 9598	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 9599		netif_napi_del(p);
 9600
 9601	free_percpu(dev->pcpu_refcnt);
 9602	dev->pcpu_refcnt = NULL;
 9603
 9604	netdev_unregister_lockdep_key(dev);
 9605
 9606	/*  Compatibility with error handling in drivers */
 9607	if (dev->reg_state == NETREG_UNINITIALIZED) {
 9608		netdev_freemem(dev);
 9609		return;
 9610	}
 9611
 9612	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
 9613	dev->reg_state = NETREG_RELEASED;
 9614
 9615	/* will free via device release */
 9616	put_device(&dev->dev);
 9617}
 9618EXPORT_SYMBOL(free_netdev);
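
/* Usage sketch (hypothetical driver): free_netdev() pairs with
 * alloc_netdev*() on error paths taken before registration, and follows
 * unregister_netdev() on normal teardown:
 *
 *	err = register_netdev(ndev);
 *	if (err) {
 *		free_netdev(ndev);	// reg_state is still NETREG_UNINITIALIZED
 *		return err;
 *	}
 */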
 9619
 9620/**
 9621 *	synchronize_net -  Synchronize with packet receive processing
 9622 *
 9623 *	Wait for packets currently being received to be done.
 9624 *	Does not block later packets from starting.
 9625 */
 9626void synchronize_net(void)
 9627{
 9628	might_sleep();
 9629	if (rtnl_is_locked())
 9630		synchronize_rcu_expedited();
 9631	else
 9632		synchronize_rcu();
 9633}
 9634EXPORT_SYMBOL(synchronize_net);
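
/* Usage sketch (hypothetical fields; the __rcu pointer and its backing
 * state are assumptions): after unpublishing an RCU-protected pointer that
 * the receive path may still dereference, wait for in-flight readers
 * before tearing the backing state down:
 *
 *	old = rtnl_dereference(priv->rx_hook_state);
 *	RCU_INIT_POINTER(priv->rx_hook_state, NULL);
 *	synchronize_net();
 *	kfree(old);
 */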
 9635
 9636/**
 9637 *	unregister_netdevice_queue - remove device from the kernel
 9638 *	@dev: device
 9639 *	@head: list
 9640 *
 9641 *	This function shuts down a device interface and removes it
 9642 *	from the kernel tables.
 9643 *	If head is not NULL, the device is queued to be unregistered later.
 9644 *
 9645 *	Callers must hold the rtnl semaphore.  You may want
 9646 *	unregister_netdev() instead of this.
 9647 */
 9648
 9649void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 9650{
 9651	ASSERT_RTNL();
 9652
 9653	if (head) {
 9654		list_move_tail(&dev->unreg_list, head);
 9655	} else {
 9656		rollback_registered(dev);
 9657		/* Finish processing unregister after unlock */
 9658		net_set_todo(dev);
 9659	}
 9660}
 9661EXPORT_SYMBOL(unregister_netdevice_queue);
 9662
 9663/**
 9664 *	unregister_netdevice_many - unregister many devices
 9665 *	@head: list of devices
 9666 *
 9667 *  Note: As most callers use a stack-allocated list_head,
 9668 *  we force a list_del() to make sure the stack won't be corrupted later.
 9669 */
 9670void unregister_netdevice_many(struct list_head *head)
 9671{
 9672	struct net_device *dev;
 9673
 9674	if (!list_empty(head)) {
 9675		rollback_registered_many(head);
 9676		list_for_each_entry(dev, head, unreg_list)
 9677			net_set_todo(dev);
 9678		list_del(head);
 9679	}
 9680}
 9681EXPORT_SYMBOL(unregister_netdevice_many);
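
/* Usage sketch (hypothetical batch teardown, RTNL held): queue several
 * devices on a stack list and unregister them in one pass, much as
 * default_device_exit_batch() below does:
 *
 *	LIST_HEAD(kill_list);
 *
 *	unregister_netdevice_queue(dev_a, &kill_list);
 *	unregister_netdevice_queue(dev_b, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 */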
 9682
 9683/**
 9684 *	unregister_netdev - remove device from the kernel
 9685 *	@dev: device
 9686 *
 9687 *	This function shuts down a device interface and removes it
 9688 *	from the kernel tables.
 9689 *
 9690 *	This is just a wrapper for unregister_netdevice that takes
 9691 *	the rtnl semaphore.  In general you want to use this and not
 9692 *	unregister_netdevice.
 9693 */
 9694void unregister_netdev(struct net_device *dev)
 9695{
 9696	rtnl_lock();
 9697	unregister_netdevice(dev);
 9698	rtnl_unlock();
 9699}
 9700EXPORT_SYMBOL(unregister_netdev);
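
/* Usage sketch (hypothetical platform driver remove callback; RTNL is not
 * held by the caller):
 *
 *	static int mydrv_remove(struct platform_device *pdev)
 *	{
 *		struct net_device *ndev = platform_get_drvdata(pdev);
 *
 *		unregister_netdev(ndev);
 *		free_netdev(ndev);
 *		return 0;
 *	}
 */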
 9701
 9702/**
 9703 *	dev_change_net_namespace - move device to a different network namespace
 9704 *	@dev: device
 9705 *	@net: network namespace
 9706 *	@pat: If not NULL name pattern to try if the current device name
 9707 *	      is already taken in the destination network namespace.
 9708 *
 9709 *	This function shuts down a device interface and moves it
 9710 *	to a new network namespace. On success 0 is returned, on
 9711 *	failure a negative errno code is returned.
 9712 *
 9713 *	Callers must hold the rtnl semaphore.
 9714 */
 9715
 9716int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 9717{
 9718	int err, new_nsid, new_ifindex;
 9719
 9720	ASSERT_RTNL();
 9721
 9722	/* Don't allow namespace local devices to be moved. */
 9723	err = -EINVAL;
 9724	if (dev->features & NETIF_F_NETNS_LOCAL)
 9725		goto out;
 9726
 9727	/* Ensure the device has been registered */
 9728	if (dev->reg_state != NETREG_REGISTERED)
 9729		goto out;
 9730
 9731	/* Get out if there is nothing to do */
 9732	err = 0;
 9733	if (net_eq(dev_net(dev), net))
 9734		goto out;
 9735
 9736	/* Pick the destination device name, and ensure
 9737	 * we can use it in the destination network namespace.
 9738	 */
 9739	err = -EEXIST;
 9740	if (__dev_get_by_name(net, dev->name)) {
 9741		/* We get here if we can't use the current device name */
 9742		if (!pat)
 9743			goto out;
 9744		err = dev_get_valid_name(net, dev, pat);
 9745		if (err < 0)
 9746			goto out;
 9747	}
 9748
 9749	/*
 9750	 * And now a mini version of register_netdevice() and unregister_netdevice().
 9751	 */
 9752
 9753	/* If device is running close it first. */
 9754	dev_close(dev);
 9755
 9756	/* And unlink it from device chain */
 9757	unlist_netdevice(dev);
 9758
 9759	synchronize_net();
 9760
 9761	/* Shutdown queueing discipline. */
 9762	dev_shutdown(dev);
 9763
 9764	/* Notify protocols that we are about to destroy
 9765	 * this device. They should clean up all of their state.
 9766	 *
 9767	 * Note that dev->reg_state stays at NETREG_REGISTERED.
 9768	 * This is wanted because this way 8021q and macvlan know
 9769	 * the device is just moving and can keep their slaves up.
 9770	 */
 9771	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 9772	rcu_barrier();
 9773
 9774	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
 9775	/* If there is an ifindex conflict assign a new one */
 9776	if (__dev_get_by_index(net, dev->ifindex))
 9777		new_ifindex = dev_new_index(net);
 9778	else
 9779		new_ifindex = dev->ifindex;
 9780
 9781	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
 9782			    new_ifindex);
 9783
 9784	/*
 9785	 *	Flush the unicast and multicast chains
 9786	 */
 9787	dev_uc_flush(dev);
 9788	dev_mc_flush(dev);
 9789
 9790	/* Send a netdev-removed uevent to the old namespace */
 9791	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
 9792	netdev_adjacent_del_links(dev);
 9793
 9794	/* Actually switch the network namespace */
 9795	dev_net_set(dev, net);
 9796	dev->ifindex = new_ifindex;
 9797
 9798	/* Send a netdev-add uevent to the new namespace */
 9799	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
 9800	netdev_adjacent_add_links(dev);
 9801
 9802	/* Fixup kobjects */
 9803	err = device_rename(&dev->dev, dev->name);
 9804	WARN_ON(err);
 9805
 9806	/* Add the device back in the hashes */
 9807	list_netdevice(dev);
 9808
 9809	/* Notify protocols that a new device has appeared. */
 9810	call_netdevice_notifiers(NETDEV_REGISTER, dev);
 9811
 9812	/*
 9813	 *	Prevent userspace races by waiting until the network
 9814	 *	device is fully set up before sending notifications.
 9815	 */
 9816	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
 9817
 9818	synchronize_net();
 9819	err = 0;
 9820out:
 9821	return err;
 9822}
 9823EXPORT_SYMBOL_GPL(dev_change_net_namespace);
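
/* Usage sketch (hypothetical caller, RTNL held; @newnet previously obtained,
 * e.g. via get_net_ns_by_fd(), and released by the caller afterwards):
 *
 *	err = dev_change_net_namespace(dev, newnet, "eth%d");
 *	if (err)
 *		netdev_warn(dev, "failed to move to new netns: %d\n", err);
 */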
 9824
 9825static int dev_cpu_dead(unsigned int oldcpu)
 9826{
 9827	struct sk_buff **list_skb;
 9828	struct sk_buff *skb;
 9829	unsigned int cpu;
 9830	struct softnet_data *sd, *oldsd, *remsd = NULL;
 9831
 9832	local_irq_disable();
 9833	cpu = smp_processor_id();
 9834	sd = &per_cpu(softnet_data, cpu);
 9835	oldsd = &per_cpu(softnet_data, oldcpu);
 9836
 9837	/* Find end of our completion_queue. */
 9838	list_skb = &sd->completion_queue;
 9839	while (*list_skb)
 9840		list_skb = &(*list_skb)->next;
 9841	/* Append completion queue from offline CPU. */
 9842	*list_skb = oldsd->completion_queue;
 9843	oldsd->completion_queue = NULL;
 9844
 9845	/* Append output queue from offline CPU. */
 9846	if (oldsd->output_queue) {
 9847		*sd->output_queue_tailp = oldsd->output_queue;
 9848		sd->output_queue_tailp = oldsd->output_queue_tailp;
 9849		oldsd->output_queue = NULL;
 9850		oldsd->output_queue_tailp = &oldsd->output_queue;
 9851	}
 9852	/* Append NAPI poll list from the offline CPU, with one exception:
 9853	 * process_backlog() must be called by the CPU owning the percpu backlog.
 9854	 * We properly handle process_queue & input_pkt_queue later.
 9855	 */
 9856	while (!list_empty(&oldsd->poll_list)) {
 9857		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
 9858							    struct napi_struct,
 9859							    poll_list);
 9860
 9861		list_del_init(&napi->poll_list);
 9862		if (napi->poll == process_backlog)
 9863			napi->state = 0;
 9864		else
 9865			____napi_schedule(sd, napi);
 9866	}
 9867
 9868	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 9869	local_irq_enable();
 9870
 9871#ifdef CONFIG_RPS
 9872	remsd = oldsd->rps_ipi_list;
 9873	oldsd->rps_ipi_list = NULL;
 9874#endif
 9875	/* send out pending IPIs on the offline CPU */
 9876	net_rps_send_ipi(remsd);
 9877
 9878	/* Process offline CPU's input_pkt_queue */
 9879	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 9880		netif_rx_ni(skb);
 9881		input_queue_head_incr(oldsd);
 9882	}
 9883	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
 9884		netif_rx_ni(skb);
 9885		input_queue_head_incr(oldsd);
 9886	}
 9887
 9888	return 0;
 9889}
 9890
 9891/**
 9892 *	netdev_increment_features - increment feature set by one
 9893 *	@all: current feature set
 9894 *	@one: new feature set
 9895 *	@mask: mask feature set
 9896 *
 9897 *	Computes a new feature set after adding a device with feature set
 9898 *	@one to the master device with current feature set @all.  Will not
 9899 *	enable anything that is off in @mask. Returns the new feature set.
 9900 */
 9901netdev_features_t netdev_increment_features(netdev_features_t all,
 9902	netdev_features_t one, netdev_features_t mask)
 9903{
 9904	if (mask & NETIF_F_HW_CSUM)
 9905		mask |= NETIF_F_CSUM_MASK;
 9906	mask |= NETIF_F_VLAN_CHALLENGED;
 9907
 9908	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
 9909	all &= one | ~NETIF_F_ALL_FOR_ALL;
 9910
 9911	/* If one device supports hw checksumming, set for all. */
 9912	if (all & NETIF_F_HW_CSUM)
 9913		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
 9914
 9915	return all;
 9916}
 9917EXPORT_SYMBOL(netdev_increment_features);
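
/* Usage sketch (hypothetical bonding-style aggregation driver; the mask,
 * slave struct and list are assumptions): fold every slave's feature set
 * into one master set, starting from the driver's supported-feature mask,
 * and then apply the result through the driver's usual feature-update path:
 *
 *	netdev_features_t features = MYDRV_FEATURE_MASK;
 *	struct mydrv_slave *s;
 *
 *	list_for_each_entry(s, &priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     s->dev->features,
 *						     MYDRV_FEATURE_MASK);
 */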
 9918
 9919static struct hlist_head * __net_init netdev_create_hash(void)
 9920{
 9921	int i;
 9922	struct hlist_head *hash;
 9923
 9924	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
 9925	if (hash != NULL)
 9926		for (i = 0; i < NETDEV_HASHENTRIES; i++)
 9927			INIT_HLIST_HEAD(&hash[i]);
 9928
 9929	return hash;
 9930}
 9931
 9932/* Initialize per network namespace state */
 9933static int __net_init netdev_init(struct net *net)
 9934{
 9935	BUILD_BUG_ON(GRO_HASH_BUCKETS >
 9936		     8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
 9937
 9938	if (net != &init_net)
 9939		INIT_LIST_HEAD(&net->dev_base_head);
 9940
 9941	net->dev_name_head = netdev_create_hash();
 9942	if (net->dev_name_head == NULL)
 9943		goto err_name;
 9944
 9945	net->dev_index_head = netdev_create_hash();
 9946	if (net->dev_index_head == NULL)
 9947		goto err_idx;
 9948
 9949	return 0;
 9950
 9951err_idx:
 9952	kfree(net->dev_name_head);
 9953err_name:
 9954	return -ENOMEM;
 9955}
 9956
 9957/**
 9958 *	netdev_drivername - network driver for the device
 9959 *	@dev: network device
 9960 *
 9961 *	Determine network driver for device.
 9962 */
 9963const char *netdev_drivername(const struct net_device *dev)
 9964{
 9965	const struct device_driver *driver;
 9966	const struct device *parent;
 9967	const char *empty = "";
 9968
 9969	parent = dev->dev.parent;
 9970	if (!parent)
 9971		return empty;
 9972
 9973	driver = parent->driver;
 9974	if (driver && driver->name)
 9975		return driver->name;
 9976	return empty;
 9977}
 9978
 9979static void __netdev_printk(const char *level, const struct net_device *dev,
 9980			    struct va_format *vaf)
 9981{
 9982	if (dev && dev->dev.parent) {
 9983		dev_printk_emit(level[1] - '0',
 9984				dev->dev.parent,
 9985				"%s %s %s%s: %pV",
 9986				dev_driver_string(dev->dev.parent),
 9987				dev_name(dev->dev.parent),
 9988				netdev_name(dev), netdev_reg_state(dev),
 9989				vaf);
 9990	} else if (dev) {
 9991		printk("%s%s%s: %pV",
 9992		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
 9993	} else {
 9994		printk("%s(NULL net_device): %pV", level, vaf);
 9995	}
 9996}
 9997
 9998void netdev_printk(const char *level, const struct net_device *dev,
 9999		   const char *format, ...)
10000{
10001	struct va_format vaf;
10002	va_list args;
10003
10004	va_start(args, format);
10005
10006	vaf.fmt = format;
10007	vaf.va = &args;
10008
10009	__netdev_printk(level, dev, &vaf);
10010
10011	va_end(args);
10012}
10013EXPORT_SYMBOL(netdev_printk);
10014
10015#define define_netdev_printk_level(func, level)			\
10016void func(const struct net_device *dev, const char *fmt, ...)	\
10017{								\
10018	struct va_format vaf;					\
10019	va_list args;						\
10020								\
10021	va_start(args, fmt);					\
10022								\
10023	vaf.fmt = fmt;						\
10024	vaf.va = &args;						\
10025								\
10026	__netdev_printk(level, dev, &vaf);			\
10027								\
10028	va_end(args);						\
10029}								\
10030EXPORT_SYMBOL(func);
10031
10032define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10033define_netdev_printk_level(netdev_alert, KERN_ALERT);
10034define_netdev_printk_level(netdev_crit, KERN_CRIT);
10035define_netdev_printk_level(netdev_err, KERN_ERR);
10036define_netdev_printk_level(netdev_warn, KERN_WARNING);
10037define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10038define_netdev_printk_level(netdev_info, KERN_INFO);
10039
10040static void __net_exit netdev_exit(struct net *net)
10041{
10042	kfree(net->dev_name_head);
10043	kfree(net->dev_index_head);
10044	if (net != &init_net)
10045		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10046}
10047
10048static struct pernet_operations __net_initdata netdev_net_ops = {
10049	.init = netdev_init,
10050	.exit = netdev_exit,
10051};
10052
10053static void __net_exit default_device_exit(struct net *net)
10054{
10055	struct net_device *dev, *aux;
10056	/*
10057	 * Push all migratable network devices back to the
10058	 * initial network namespace
10059	 */
10060	rtnl_lock();
10061	for_each_netdev_safe(net, dev, aux) {
10062		int err;
10063		char fb_name[IFNAMSIZ];
10064
 10065		/* Ignore unmovable devices (e.g. loopback) */
10066		if (dev->features & NETIF_F_NETNS_LOCAL)
10067			continue;
10068
10069		/* Leave virtual devices for the generic cleanup */
10070		if (dev->rtnl_link_ops)
10071			continue;
10072
10073		/* Push remaining network devices to init_net */
10074		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10075		if (__dev_get_by_name(&init_net, fb_name))
10076			snprintf(fb_name, IFNAMSIZ, "dev%%d");
10077		err = dev_change_net_namespace(dev, &init_net, fb_name);
10078		if (err) {
10079			pr_emerg("%s: failed to move %s to init_net: %d\n",
10080				 __func__, dev->name, err);
10081			BUG();
10082		}
10083	}
10084	rtnl_unlock();
10085}
10086
10087static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10088{
10089	/* Return with the rtnl_lock held when there are no network
10090	 * devices unregistering in any network namespace in net_list.
10091	 */
10092	struct net *net;
10093	bool unregistering;
10094	DEFINE_WAIT_FUNC(wait, woken_wake_function);
10095
10096	add_wait_queue(&netdev_unregistering_wq, &wait);
10097	for (;;) {
10098		unregistering = false;
10099		rtnl_lock();
10100		list_for_each_entry(net, net_list, exit_list) {
10101			if (net->dev_unreg_count > 0) {
10102				unregistering = true;
10103				break;
10104			}
10105		}
10106		if (!unregistering)
10107			break;
10108		__rtnl_unlock();
10109
10110		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10111	}
10112	remove_wait_queue(&netdev_unregistering_wq, &wait);
10113}
10114
10115static void __net_exit default_device_exit_batch(struct list_head *net_list)
10116{
 10117	/* At exit all network devices must be removed from a network
10118	 * namespace.  Do this in the reverse order of registration.
10119	 * Do this across as many network namespaces as possible to
10120	 * improve batching efficiency.
10121	 */
10122	struct net_device *dev;
10123	struct net *net;
10124	LIST_HEAD(dev_kill_list);
10125
10126	/* To prevent network device cleanup code from dereferencing
 10127	 * loopback devices or network devices that have been freed,
 10128	 * wait here for all pending unregistrations to complete
 10129	 * before unregistering the loopback device and allowing the
 10130	 * network namespace to be freed.
10131	 *
10132	 * The netdev todo list containing all network devices
10133	 * unregistrations that happen in default_device_exit_batch
10134	 * will run in the rtnl_unlock() at the end of
10135	 * default_device_exit_batch.
10136	 */
10137	rtnl_lock_unregistering(net_list);
10138	list_for_each_entry(net, net_list, exit_list) {
10139		for_each_netdev_reverse(net, dev) {
10140			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10141				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10142			else
10143				unregister_netdevice_queue(dev, &dev_kill_list);
10144		}
10145	}
10146	unregister_netdevice_many(&dev_kill_list);
10147	rtnl_unlock();
10148}
10149
10150static struct pernet_operations __net_initdata default_device_ops = {
10151	.exit = default_device_exit,
10152	.exit_batch = default_device_exit_batch,
10153};
10154
10155/*
10156 *	Initialize the DEV module. At boot time this walks the device list and
10157 *	unhooks any devices that fail to initialise (normally hardware not
10158 *	present) and leaves us with a valid list of present and active devices.
10159 *
10160 */
10161
10162/*
10163 *       This is called single threaded during boot, so no need
10164 *       to take the rtnl semaphore.
10165 */
10166static int __init net_dev_init(void)
10167{
10168	int i, rc = -ENOMEM;
10169
10170	BUG_ON(!dev_boot_phase);
10171
10172	if (dev_proc_init())
10173		goto out;
10174
10175	if (netdev_kobject_init())
10176		goto out;
10177
10178	INIT_LIST_HEAD(&ptype_all);
10179	for (i = 0; i < PTYPE_HASH_SIZE; i++)
10180		INIT_LIST_HEAD(&ptype_base[i]);
10181
10182	INIT_LIST_HEAD(&offload_base);
10183
10184	if (register_pernet_subsys(&netdev_net_ops))
10185		goto out;
10186
10187	/*
10188	 *	Initialise the packet receive queues.
10189	 */
10190
10191	for_each_possible_cpu(i) {
10192		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
10193		struct softnet_data *sd = &per_cpu(softnet_data, i);
10194
10195		INIT_WORK(flush, flush_backlog);
10196
10197		skb_queue_head_init(&sd->input_pkt_queue);
10198		skb_queue_head_init(&sd->process_queue);
10199#ifdef CONFIG_XFRM_OFFLOAD
10200		skb_queue_head_init(&sd->xfrm_backlog);
10201#endif
10202		INIT_LIST_HEAD(&sd->poll_list);
10203		sd->output_queue_tailp = &sd->output_queue;
10204#ifdef CONFIG_RPS
10205		sd->csd.func = rps_trigger_softirq;
10206		sd->csd.info = sd;
10207		sd->cpu = i;
10208#endif
10209
10210		init_gro_hash(&sd->backlog);
10211		sd->backlog.poll = process_backlog;
10212		sd->backlog.weight = weight_p;
10213	}
10214
10215	dev_boot_phase = 0;
10216
 10217	/* The loopback device is special: if any other network device
 10218	 * is present in a network namespace, the loopback device must
 10219	 * be present too. Since we now dynamically allocate and free the
 10220	 * loopback device, ensure this invariant is maintained by
 10221	 * keeping the loopback device as the first device on the
 10222	 * list of network devices.  This ensures that the loopback device
 10223	 * is the first device that appears and the last network device
 10224	 * that disappears.
 10225	 */
10226	if (register_pernet_device(&loopback_net_ops))
10227		goto out;
10228
10229	if (register_pernet_device(&default_device_ops))
10230		goto out;
10231
10232	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
10233	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
10234
10235	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
10236				       NULL, dev_cpu_dead);
10237	WARN_ON(rc < 0);
10238	rc = 0;
10239out:
10240	return rc;
10241}
10242
10243subsys_initcall(net_dev_init);
v4.17
 
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/sched/mm.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <linux/bpf.h>
  99#include <linux/bpf_trace.h>
 100#include <net/net_namespace.h>
 101#include <net/sock.h>
 102#include <net/busy_poll.h>
 103#include <linux/rtnetlink.h>
 104#include <linux/stat.h>
 105#include <net/dst.h>
 106#include <net/dst_metadata.h>
 107#include <net/pkt_sched.h>
 108#include <net/pkt_cls.h>
 109#include <net/checksum.h>
 110#include <net/xfrm.h>
 111#include <linux/highmem.h>
 112#include <linux/init.h>
 113#include <linux/module.h>
 114#include <linux/netpoll.h>
 115#include <linux/rcupdate.h>
 116#include <linux/delay.h>
 117#include <net/iw_handler.h>
 118#include <asm/current.h>
 119#include <linux/audit.h>
 120#include <linux/dmaengine.h>
 121#include <linux/err.h>
 122#include <linux/ctype.h>
 123#include <linux/if_arp.h>
 124#include <linux/if_vlan.h>
 125#include <linux/ip.h>
 126#include <net/ip.h>
 127#include <net/mpls.h>
 128#include <linux/ipv6.h>
 129#include <linux/in.h>
 130#include <linux/jhash.h>
 131#include <linux/random.h>
 132#include <trace/events/napi.h>
 133#include <trace/events/net.h>
 134#include <trace/events/skb.h>
 135#include <linux/pci.h>
 136#include <linux/inetdevice.h>
 137#include <linux/cpu_rmap.h>
 138#include <linux/static_key.h>
 139#include <linux/hashtable.h>
 140#include <linux/vmalloc.h>
 141#include <linux/if_macvlan.h>
 142#include <linux/errqueue.h>
 143#include <linux/hrtimer.h>
 144#include <linux/netfilter_ingress.h>
 145#include <linux/crash_dump.h>
 146#include <linux/sctp.h>
 147#include <net/udp_tunnel.h>
 148#include <linux/net_namespace.h>
 149
 150#include "net-sysfs.h"
 151
 152/* Instead of increasing this, you should create a hash table. */
 153#define MAX_GRO_SKBS 8
 154
 155/* This should be increased if a protocol with a bigger head is added. */
 156#define GRO_MAX_HEAD (MAX_HEADER + 128)
 157
 158static DEFINE_SPINLOCK(ptype_lock);
 159static DEFINE_SPINLOCK(offload_lock);
 160struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 161struct list_head ptype_all __read_mostly;	/* Taps */
 162static struct list_head offload_base __read_mostly;
 163
 164static int netif_rx_internal(struct sk_buff *skb);
 165static int call_netdevice_notifiers_info(unsigned long val,
 166					 struct netdev_notifier_info *info);
 167static struct napi_struct *napi_by_id(unsigned int napi_id);
 168
 169/*
 170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 171 * semaphore.
 172 *
 173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 174 *
 175 * Writers must hold the rtnl semaphore while they loop through the
 176 * dev_base_head list, and hold dev_base_lock for writing when they do the
 177 * actual updates.  This allows pure readers to access the list even
 178 * while a writer is preparing to update it.
 179 *
 180 * To put it another way, dev_base_lock is held for writing only to
 181 * protect against pure readers; the rtnl semaphore provides the
 182 * protection against other writers.
 183 *
 184 * See, for example usages, register_netdevice() and
 185 * unregister_netdevice(), which must be called with the rtnl
 186 * semaphore held.
 187 */
 188DEFINE_RWLOCK(dev_base_lock);
 189EXPORT_SYMBOL(dev_base_lock);
 190
 191static DEFINE_MUTEX(ifalias_mutex);
 192
 193/* protects napi_hash addition/deletion and napi_gen_id */
 194static DEFINE_SPINLOCK(napi_hash_lock);
 195
 196static unsigned int napi_gen_id = NR_CPUS;
 197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 198
 199static seqcount_t devnet_rename_seq;
 200
 201static inline void dev_base_seq_inc(struct net *net)
 202{
 203	while (++net->dev_base_seq == 0)
 204		;
 205}
 206
 207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208{
 209	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 210
 211	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 212}
 213
 214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 215{
 216	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 217}
 218
 219static inline void rps_lock(struct softnet_data *sd)
 220{
 221#ifdef CONFIG_RPS
 222	spin_lock(&sd->input_pkt_queue.lock);
 223#endif
 224}
 225
 226static inline void rps_unlock(struct softnet_data *sd)
 227{
 228#ifdef CONFIG_RPS
 229	spin_unlock(&sd->input_pkt_queue.lock);
 230#endif
 231}
 232
 233/* Device list insertion */
 234static void list_netdevice(struct net_device *dev)
 235{
 236	struct net *net = dev_net(dev);
 237
 238	ASSERT_RTNL();
 239
 240	write_lock_bh(&dev_base_lock);
 241	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 242	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 243	hlist_add_head_rcu(&dev->index_hlist,
 244			   dev_index_hash(net, dev->ifindex));
 245	write_unlock_bh(&dev_base_lock);
 246
 247	dev_base_seq_inc(net);
 248}
 249
 250/* Device list removal
 251 * caller must respect a RCU grace period before freeing/reusing dev
 252 */
 253static void unlist_netdevice(struct net_device *dev)
 254{
 255	ASSERT_RTNL();
 256
 257	/* Unlink dev from the device chain */
 258	write_lock_bh(&dev_base_lock);
 259	list_del_rcu(&dev->dev_list);
 260	hlist_del_rcu(&dev->name_hlist);
 261	hlist_del_rcu(&dev->index_hlist);
 262	write_unlock_bh(&dev_base_lock);
 263
 264	dev_base_seq_inc(dev_net(dev));
 265}
 266
 267/*
 268 *	Our notifier list
 269 */
 270
 271static RAW_NOTIFIER_HEAD(netdev_chain);
 272
 273/*
 274 *	Device drivers call our routines to queue packets here. We empty the
 275 *	queue in the local softnet handler.
 276 */
 277
 278DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 279EXPORT_PER_CPU_SYMBOL(softnet_data);
 280
 281#ifdef CONFIG_LOCKDEP
 282/*
 283 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 284 * according to dev->type
 285 */
 286static const unsigned short netdev_lock_type[] = {
 287	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 288	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 289	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 290	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 291	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 292	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 293	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 294	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 295	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 296	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 297	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 298	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 299	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 300	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 301	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 302
 303static const char *const netdev_lock_name[] = {
 304	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 305	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 306	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 307	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 308	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 309	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 310	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 311	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 312	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 313	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 314	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 315	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 316	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 317	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 318	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 319
 320static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 321static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 322
 323static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 324{
 325	int i;
 326
 327	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 328		if (netdev_lock_type[i] == dev_type)
 329			return i;
 330	/* the last key is used by default */
 331	return ARRAY_SIZE(netdev_lock_type) - 1;
 332}
 333
 334static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 335						 unsigned short dev_type)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev_type);
 340	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 341				   netdev_lock_name[i]);
 342}
 343
 344static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345{
 346	int i;
 347
 348	i = netdev_lock_pos(dev->type);
 349	lockdep_set_class_and_name(&dev->addr_list_lock,
 350				   &netdev_addr_lock_key[i],
 351				   netdev_lock_name[i]);
 352}
 353#else
 354static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 355						 unsigned short dev_type)
 356{
 357}
 358static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 359{
 360}
 361#endif
 362
 363/*******************************************************************************
 364 *
 365 *		Protocol management and registration routines
 366 *
 367 *******************************************************************************/
 368
 369
 370/*
 371 *	Add a protocol ID to the list. Now that the input handler is
 372 *	smarter we can dispense with all the messy stuff that used to be
 373 *	here.
 374 *
  375 *	BEWARE!!! Protocol handlers that mangle input packets
  376 *	MUST BE last in the hash buckets, and checking protocol handlers
  377 *	MUST start from the promiscuous ptype_all chain in net_bh.
  378 *	It is true now, do not change it.
  379 *	Explanation follows: if a protocol handler that mangles packets
  380 *	were first on the list, it could not sense that the packet
  381 *	is cloned and should be copied-on-write, so it would
  382 *	change it and subsequent readers would get a broken packet.
 383 *							--ANK (980803)
 384 */
 385
 386static inline struct list_head *ptype_head(const struct packet_type *pt)
 387{
 388	if (pt->type == htons(ETH_P_ALL))
 389		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 390	else
 391		return pt->dev ? &pt->dev->ptype_specific :
 392				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 393}
 394
 395/**
 396 *	dev_add_pack - add packet handler
 397 *	@pt: packet type declaration
 398 *
 399 *	Add a protocol handler to the networking stack. The passed &packet_type
 400 *	is linked into kernel lists and may not be freed until it has been
 401 *	removed from the kernel lists.
 402 *
  403 *	This call does not sleep, therefore it cannot
  404 *	guarantee that all CPUs that are in the middle of receiving packets
  405 *	will see the new packet type (until the next received packet).
 406 */
 407
 408void dev_add_pack(struct packet_type *pt)
 409{
 410	struct list_head *head = ptype_head(pt);
 411
 412	spin_lock(&ptype_lock);
 413	list_add_rcu(&pt->list, head);
 414	spin_unlock(&ptype_lock);
 415}
 416EXPORT_SYMBOL(dev_add_pack);
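
/* Usage sketch (hypothetical protocol module): register a handler for one
 * EtherType; the packet_type must stay allocated until dev_remove_pack():
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_rcv,			// assumed receive handler
 *	};
 *
 *	dev_add_pack(&my_ptype);		// typically from module init
 */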
 417
 418/**
 419 *	__dev_remove_pack	 - remove packet handler
 420 *	@pt: packet type declaration
 421 *
 422 *	Remove a protocol handler that was previously added to the kernel
 423 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424 *	from the kernel lists and can be freed or reused once this function
 425 *	returns.
 426 *
 427 *      The packet type might still be in use by receivers
  428 *	and must not be freed until after all the CPUs have gone
 429 *	through a quiescent state.
 430 */
 431void __dev_remove_pack(struct packet_type *pt)
 432{
 433	struct list_head *head = ptype_head(pt);
 434	struct packet_type *pt1;
 435
 436	spin_lock(&ptype_lock);
 437
 438	list_for_each_entry(pt1, head, list) {
 439		if (pt == pt1) {
 440			list_del_rcu(&pt->list);
 441			goto out;
 442		}
 443	}
 444
 445	pr_warn("dev_remove_pack: %p not found\n", pt);
 446out:
 447	spin_unlock(&ptype_lock);
 448}
 449EXPORT_SYMBOL(__dev_remove_pack);
 450
 451/**
 452 *	dev_remove_pack	 - remove packet handler
 453 *	@pt: packet type declaration
 454 *
 455 *	Remove a protocol handler that was previously added to the kernel
 456 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 457 *	from the kernel lists and can be freed or reused once this function
 458 *	returns.
 459 *
 460 *	This call sleeps to guarantee that no CPU is looking at the packet
 461 *	type after return.
 462 */
 463void dev_remove_pack(struct packet_type *pt)
 464{
 465	__dev_remove_pack(pt);
 466
 467	synchronize_net();
 468}
 469EXPORT_SYMBOL(dev_remove_pack);
 470
 471
 472/**
 473 *	dev_add_offload - register offload handlers
 474 *	@po: protocol offload declaration
 475 *
 476 *	Add protocol offload handlers to the networking stack. The passed
 477 *	&proto_offload is linked into kernel lists and may not be freed until
 478 *	it has been removed from the kernel lists.
 479 *
  480 *	This call does not sleep, therefore it cannot
  481 *	guarantee that all CPUs that are in the middle of receiving packets
  482 *	will see the new offload handlers (until the next received packet).
 483 */
 484void dev_add_offload(struct packet_offload *po)
 485{
 486	struct packet_offload *elem;
 487
 488	spin_lock(&offload_lock);
 489	list_for_each_entry(elem, &offload_base, list) {
 490		if (po->priority < elem->priority)
 491			break;
 492	}
 493	list_add_rcu(&po->list, elem->list.prev);
 494	spin_unlock(&offload_lock);
 495}
 496EXPORT_SYMBOL(dev_add_offload);
 497
 498/**
 499 *	__dev_remove_offload	 - remove offload handler
 500 *	@po: packet offload declaration
 501 *
 502 *	Remove a protocol offload handler that was previously added to the
 503 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 504 *	is removed from the kernel lists and can be freed or reused once this
 505 *	function returns.
 506 *
 507 *      The packet type might still be in use by receivers
  508 *	and must not be freed until after all the CPUs have gone
 509 *	through a quiescent state.
 510 */
 511static void __dev_remove_offload(struct packet_offload *po)
 512{
 513	struct list_head *head = &offload_base;
 514	struct packet_offload *po1;
 515
 516	spin_lock(&offload_lock);
 517
 518	list_for_each_entry(po1, head, list) {
 519		if (po == po1) {
 520			list_del_rcu(&po->list);
 521			goto out;
 522		}
 523	}
 524
 525	pr_warn("dev_remove_offload: %p not found\n", po);
 526out:
 527	spin_unlock(&offload_lock);
 528}
 529
 530/**
 531 *	dev_remove_offload	 - remove packet offload handler
 532 *	@po: packet offload declaration
 533 *
 534 *	Remove a packet offload handler that was previously added to the kernel
 535 *	offload handlers by dev_add_offload(). The passed &offload_type is
 536 *	removed from the kernel lists and can be freed or reused once this
 537 *	function returns.
 538 *
 539 *	This call sleeps to guarantee that no CPU is looking at the packet
 540 *	type after return.
 541 */
 542void dev_remove_offload(struct packet_offload *po)
 543{
 544	__dev_remove_offload(po);
 545
 546	synchronize_net();
 547}
 548EXPORT_SYMBOL(dev_remove_offload);
 549
 550/******************************************************************************
 551 *
 552 *		      Device Boot-time Settings Routines
 553 *
 554 ******************************************************************************/
 555
 556/* Boot time configuration table */
 557static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 558
 559/**
 560 *	netdev_boot_setup_add	- add new setup entry
 561 *	@name: name of the device
 562 *	@map: configured settings for the device
 563 *
 564 *	Adds new setup entry to the dev_boot_setup list.  The function
  565 *	returns 0 on error and 1 on success.  This is a generic routine for
  566 *	all netdevices.
 567 */
 568static int netdev_boot_setup_add(char *name, struct ifmap *map)
 569{
 570	struct netdev_boot_setup *s;
 571	int i;
 572
 573	s = dev_boot_setup;
 574	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 575		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 576			memset(s[i].name, 0, sizeof(s[i].name));
 577			strlcpy(s[i].name, name, IFNAMSIZ);
 578			memcpy(&s[i].map, map, sizeof(s[i].map));
 579			break;
 580		}
 581	}
 582
 583	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 584}
 585
 586/**
 587 * netdev_boot_setup_check	- check boot time settings
 588 * @dev: the netdevice
 589 *
 590 * Check boot time settings for the device.
 591 * The found settings are set for the device to be used
 592 * later in the device probing.
  593 * Returns 0 if no settings are found, 1 if they are.
 594 */
 595int netdev_boot_setup_check(struct net_device *dev)
 596{
 597	struct netdev_boot_setup *s = dev_boot_setup;
 598	int i;
 599
 600	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 601		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 602		    !strcmp(dev->name, s[i].name)) {
 603			dev->irq = s[i].map.irq;
 604			dev->base_addr = s[i].map.base_addr;
 605			dev->mem_start = s[i].map.mem_start;
 606			dev->mem_end = s[i].map.mem_end;
 607			return 1;
 608		}
 609	}
 610	return 0;
 611}
 612EXPORT_SYMBOL(netdev_boot_setup_check);
 613
 614
 615/**
 616 * netdev_boot_base	- get address from boot time settings
 617 * @prefix: prefix for network device
 618 * @unit: id for network device
 619 *
 620 * Check boot time settings for the base address of device.
 621 * The found settings are set for the device to be used
 622 * later in the device probing.
 623 * Returns 0 if no settings found.
 624 */
 625unsigned long netdev_boot_base(const char *prefix, int unit)
 626{
 627	const struct netdev_boot_setup *s = dev_boot_setup;
 628	char name[IFNAMSIZ];
 629	int i;
 630
 631	sprintf(name, "%s%d", prefix, unit);
 632
 633	/*
  634	 * If the device is already registered then return a base of 1
  635	 * to indicate not to probe for this interface
 636	 */
 637	if (__dev_get_by_name(&init_net, name))
 638		return 1;
 639
 640	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 641		if (!strcmp(name, s[i].name))
 642			return s[i].map.base_addr;
 643	return 0;
 644}
 645
 646/*
 647 * Saves at boot time configured settings for any netdevice.
 648 */
 649int __init netdev_boot_setup(char *str)
 650{
 651	int ints[5];
 652	struct ifmap map;
 653
 654	str = get_options(str, ARRAY_SIZE(ints), ints);
 655	if (!str || !*str)
 656		return 0;
 657
 658	/* Save settings */
 659	memset(&map, 0, sizeof(map));
 660	if (ints[0] > 0)
 661		map.irq = ints[1];
 662	if (ints[0] > 1)
 663		map.base_addr = ints[2];
 664	if (ints[0] > 2)
 665		map.mem_start = ints[3];
 666	if (ints[0] > 3)
 667		map.mem_end = ints[4];
 668
 669	/* Add new entry to the list */
 670	return netdev_boot_setup_add(str, &map);
 671}
 672
 673__setup("netdev=", netdev_boot_setup);
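
/* Example command line (assumed legacy ISA-style NIC): the four integers
 * parsed above are irq, base_addr, mem_start and mem_end, and the trailing
 * string is the interface name the settings apply to:
 *
 *	netdev=9,0x300,0,0,eth1
 */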
 674
 675/*******************************************************************************
 676 *
 677 *			    Device Interface Subroutines
 678 *
 679 *******************************************************************************/
 680
 681/**
 682 *	dev_get_iflink	- get 'iflink' value of a interface
 683 *	@dev: targeted interface
 684 *
 685 *	Indicates the ifindex the interface is linked to.
 686 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 687 */
 688
 689int dev_get_iflink(const struct net_device *dev)
 690{
 691	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 692		return dev->netdev_ops->ndo_get_iflink(dev);
 693
 694	return dev->ifindex;
 695}
 696EXPORT_SYMBOL(dev_get_iflink);
 697
 698/**
 699 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 700 *	@dev: targeted interface
 701 *	@skb: The packet.
 702 *
 703 *	For better visibility of tunnel traffic OVS needs to retrieve
  704 *	egress tunnel information for a packet. The following API allows
  705 *	the user to get this info.
 706 */
 707int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 708{
 709	struct ip_tunnel_info *info;
 710
 711	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 712		return -EINVAL;
 713
 714	info = skb_tunnel_info_unclone(skb);
 715	if (!info)
 716		return -ENOMEM;
 717	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 718		return -EINVAL;
 719
 720	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 721}
 722EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 723
 724/**
 725 *	__dev_get_by_name	- find a device by its name
 726 *	@net: the applicable net namespace
 727 *	@name: name to find
 728 *
 729 *	Find an interface by name. Must be called under RTNL semaphore
 730 *	or @dev_base_lock. If the name is found a pointer to the device
 731 *	is returned. If the name is not found then %NULL is returned. The
 732 *	reference counters are not incremented so the caller must be
 733 *	careful with locks.
 734 */
 735
 736struct net_device *__dev_get_by_name(struct net *net, const char *name)
 737{
 738	struct net_device *dev;
 739	struct hlist_head *head = dev_name_hash(net, name);
 740
 741	hlist_for_each_entry(dev, head, name_hlist)
 742		if (!strncmp(dev->name, name, IFNAMSIZ))
 743			return dev;
 744
 745	return NULL;
 746}
 747EXPORT_SYMBOL(__dev_get_by_name);
 748
 749/**
 750 * dev_get_by_name_rcu	- find a device by its name
 751 * @net: the applicable net namespace
 752 * @name: name to find
 753 *
 754 * Find an interface by name.
 755 * If the name is found a pointer to the device is returned.
 756 * If the name is not found then %NULL is returned.
 757 * The reference counters are not incremented so the caller must be
 758 * careful with locks. The caller must hold RCU lock.
 759 */
 760
 761struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 762{
 763	struct net_device *dev;
 764	struct hlist_head *head = dev_name_hash(net, name);
 765
 766	hlist_for_each_entry_rcu(dev, head, name_hlist)
 767		if (!strncmp(dev->name, name, IFNAMSIZ))
 768			return dev;
 769
 770	return NULL;
 771}
 772EXPORT_SYMBOL(dev_get_by_name_rcu);
 773
 774/**
 775 *	dev_get_by_name		- find a device by its name
 776 *	@net: the applicable net namespace
 777 *	@name: name to find
 778 *
 779 *	Find an interface by name. This can be called from any
 780 *	context and does its own locking. The returned handle has
 781 *	the usage count incremented and the caller must use dev_put() to
 782 *	release it when it is no longer needed. %NULL is returned if no
 783 *	matching device is found.
 784 */
 785
 786struct net_device *dev_get_by_name(struct net *net, const char *name)
 787{
 788	struct net_device *dev;
 789
 790	rcu_read_lock();
 791	dev = dev_get_by_name_rcu(net, name);
 792	if (dev)
 793		dev_hold(dev);
 794	rcu_read_unlock();
 795	return dev;
 796}
 797EXPORT_SYMBOL(dev_get_by_name);
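
/* Usage sketch: the reference taken here must be released with dev_put()
 * once the caller is done with the device:
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		netdev_info(dev, "mtu is %u\n", dev->mtu);
 *		dev_put(dev);
 *	}
 */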
 798
 799/**
 800 *	__dev_get_by_index - find a device by its ifindex
 801 *	@net: the applicable net namespace
 802 *	@ifindex: index of device
 803 *
 804 *	Search for an interface by index. Returns %NULL if the device
 805 *	is not found or a pointer to the device. The device has not
 806 *	had its reference counter increased so the caller must be careful
 807 *	about locking. The caller must hold either the RTNL semaphore
 808 *	or @dev_base_lock.
 809 */
 810
 811struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 812{
 813	struct net_device *dev;
 814	struct hlist_head *head = dev_index_hash(net, ifindex);
 815
 816	hlist_for_each_entry(dev, head, index_hlist)
 817		if (dev->ifindex == ifindex)
 818			return dev;
 819
 820	return NULL;
 821}
 822EXPORT_SYMBOL(__dev_get_by_index);
 823
 824/**
 825 *	dev_get_by_index_rcu - find a device by its ifindex
 826 *	@net: the applicable net namespace
 827 *	@ifindex: index of device
 828 *
 829 *	Search for an interface by index. Returns %NULL if the device
 830 *	is not found or a pointer to the device. The device has not
 831 *	had its reference counter increased so the caller must be careful
 832 *	about locking. The caller must hold RCU lock.
 833 */
 834
 835struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 836{
 837	struct net_device *dev;
 838	struct hlist_head *head = dev_index_hash(net, ifindex);
 839
 840	hlist_for_each_entry_rcu(dev, head, index_hlist)
 841		if (dev->ifindex == ifindex)
 842			return dev;
 843
 844	return NULL;
 845}
 846EXPORT_SYMBOL(dev_get_by_index_rcu);
 847
 848
 849/**
 850 *	dev_get_by_index - find a device by its ifindex
 851 *	@net: the applicable net namespace
 852 *	@ifindex: index of device
 853 *
 854 *	Search for an interface by index. Returns NULL if the device
 855 *	is not found or a pointer to the device. The device returned has
 856 *	had a reference added and the pointer is safe until the user calls
 857 *	dev_put to indicate they have finished with it.
 858 */
 859
 860struct net_device *dev_get_by_index(struct net *net, int ifindex)
 861{
 862	struct net_device *dev;
 863
 864	rcu_read_lock();
 865	dev = dev_get_by_index_rcu(net, ifindex);
 866	if (dev)
 867		dev_hold(dev);
 868	rcu_read_unlock();
 869	return dev;
 870}
 871EXPORT_SYMBOL(dev_get_by_index);
 872
 873/**
 874 *	dev_get_by_napi_id - find a device by napi_id
 875 *	@napi_id: ID of the NAPI struct
 876 *
 877 *	Search for an interface by NAPI ID. Returns %NULL if the device
 878 *	is not found or a pointer to the device. The device has not had
 879 *	its reference counter increased so the caller must be careful
 880 *	about locking. The caller must hold RCU lock.
 881 */
 882
 883struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 884{
 885	struct napi_struct *napi;
 886
 887	WARN_ON_ONCE(!rcu_read_lock_held());
 888
 889	if (napi_id < MIN_NAPI_ID)
 890		return NULL;
 891
 892	napi = napi_by_id(napi_id);
 893
 894	return napi ? napi->dev : NULL;
 895}
 896EXPORT_SYMBOL(dev_get_by_napi_id);
 897
 898/**
 899 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 900 *	@net: network namespace
 901 *	@name: a pointer to the buffer where the name will be stored.
 902 *	@ifindex: the ifindex of the interface to get the name from.
 903 *
 904 *	The use of raw_seqcount_begin() and cond_resched() before
 905 *	retrying is required as we want to give the writers a chance
 906 *	to complete when CONFIG_PREEMPT is not set.
 907 */
 908int netdev_get_name(struct net *net, char *name, int ifindex)
 909{
 910	struct net_device *dev;
 911	unsigned int seq;
 912
 913retry:
 914	seq = raw_seqcount_begin(&devnet_rename_seq);
 915	rcu_read_lock();
 916	dev = dev_get_by_index_rcu(net, ifindex);
 917	if (!dev) {
 918		rcu_read_unlock();
 919		return -ENODEV;
 920	}
 921
 922	strcpy(name, dev->name);
 923	rcu_read_unlock();
 924	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 925		cond_resched();
 926		goto retry;
 927	}
 928
 929	return 0;
 930}
 931
 932/**
 933 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 934 *	@net: the applicable net namespace
 935 *	@type: media type of device
 936 *	@ha: hardware address
 937 *
 938 *	Search for an interface by MAC address. Returns NULL if the device
 939 *	is not found or a pointer to the device.
 940 *	The caller must hold RCU or RTNL.
 941 *	The returned device has not had its ref count increased
 942 *	and the caller must therefore be careful about locking
 943 *
 944 */
 945
 946struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 947				       const char *ha)
 948{
 949	struct net_device *dev;
 950
 951	for_each_netdev_rcu(net, dev)
 952		if (dev->type == type &&
 953		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 954			return dev;
 955
 956	return NULL;
 957}
 958EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
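/*
 * Illustrative sketch (not part of the upstream file): checking whether an
 * Ethernet MAC address is already in use, under rcu_read_lock() as required
 * above.  example_mac_in_use() is hypothetical; ARPHRD_ETHER comes from
 * <linux/if_arp.h>.
 */
static bool example_mac_in_use(struct net *net, const char *mac)
{
	bool in_use;

	rcu_read_lock();
	in_use = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac) != NULL;
	rcu_read_unlock();

	return in_use;
}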
 959
 960struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 961{
 962	struct net_device *dev;
 963
 964	ASSERT_RTNL();
 965	for_each_netdev(net, dev)
 966		if (dev->type == type)
 967			return dev;
 968
 969	return NULL;
 970}
 971EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 972
 973struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 974{
 975	struct net_device *dev, *ret = NULL;
 976
 977	rcu_read_lock();
 978	for_each_netdev_rcu(net, dev)
 979		if (dev->type == type) {
 980			dev_hold(dev);
 981			ret = dev;
 982			break;
 983		}
 984	rcu_read_unlock();
 985	return ret;
 986}
 987EXPORT_SYMBOL(dev_getfirstbyhwtype);
 988
 989/**
 990 *	__dev_get_by_flags - find any device with given flags
 991 *	@net: the applicable net namespace
 992 *	@if_flags: IFF_* values
 993 *	@mask: bitmask of bits in if_flags to check
 994 *
 995 *	Search for any interface with the given flags. Returns NULL if a device
 996 *	is not found or a pointer to the device. Must be called inside
 997 *	rtnl_lock(), and result refcount is unchanged.
 998 */
 999
1000struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1001				      unsigned short mask)
1002{
1003	struct net_device *dev, *ret;
1004
1005	ASSERT_RTNL();
1006
1007	ret = NULL;
1008	for_each_netdev(net, dev) {
1009		if (((dev->flags ^ if_flags) & mask) == 0) {
1010			ret = dev;
1011			break;
1012		}
1013	}
1014	return ret;
1015}
1016EXPORT_SYMBOL(__dev_get_by_flags);
1017
1018/**
1019 *	dev_valid_name - check if name is okay for network device
1020 *	@name: name string
1021 *
1022 *	Network device names need to be valid file names
1023 *	to allow sysfs to work.  We also disallow any kind of
1024 *	whitespace.
1025 */
1026bool dev_valid_name(const char *name)
1027{
1028	if (*name == '\0')
1029		return false;
1030	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1031		return false;
1032	if (!strcmp(name, ".") || !strcmp(name, ".."))
1033		return false;
1034
1035	while (*name) {
1036		if (*name == '/' || *name == ':' || isspace(*name))
1037			return false;
1038		name++;
1039	}
1040	return true;
1041}
1042EXPORT_SYMBOL(dev_valid_name);
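/*
 * Illustrative sketch (not part of the upstream file): validating a
 * user-supplied name before using it.  example_check_name() is hypothetical.
 */
static int example_check_name(const char *name)
{
	/* rejects "", names >= IFNAMSIZ, ".", "..", '/', ':' and whitespace */
	return dev_valid_name(name) ? 0 : -EINVAL;
}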
1043
1044/**
1045 *	__dev_alloc_name - allocate a name for a device
1046 *	@net: network namespace to allocate the device name in
1047 *	@name: name format string
1048 *	@buf:  scratch buffer and result name string
1049 *
1050 *	Passed a format string - eg "lt%d" it will try and find a suitable
1051 *	id. It scans list of devices to build up a free map, then chooses
1052 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1053 *	while allocating the name and adding the device in order to avoid
1054 *	duplicates.
1055 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1056 *	Returns the number of the unit assigned or a negative errno code.
1057 */
1058
1059static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1060{
1061	int i = 0;
1062	const char *p;
1063	const int max_netdevices = 8*PAGE_SIZE;
1064	unsigned long *inuse;
1065	struct net_device *d;
1066
1067	if (!dev_valid_name(name))
1068		return -EINVAL;
1069
1070	p = strchr(name, '%');
1071	if (p) {
1072		/*
1073		 * Verify the string as this thing may have come from
1074		 * the user.  There must be either one "%d" and no other "%"
1075		 * characters.
1076		 */
1077		if (p[1] != 'd' || strchr(p + 2, '%'))
1078			return -EINVAL;
1079
1080		/* Use one page as a bit array of possible slots */
1081		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1082		if (!inuse)
1083			return -ENOMEM;
1084
1085		for_each_netdev(net, d) {
1086			if (!sscanf(d->name, name, &i))
1087				continue;
1088			if (i < 0 || i >= max_netdevices)
1089				continue;
1090
1091			/*  avoid cases where sscanf is not exact inverse of printf */
1092			snprintf(buf, IFNAMSIZ, name, i);
1093			if (!strncmp(buf, d->name, IFNAMSIZ))
1094				set_bit(i, inuse);
1095		}
1096
1097		i = find_first_zero_bit(inuse, max_netdevices);
1098		free_page((unsigned long) inuse);
1099	}
1100
1101	snprintf(buf, IFNAMSIZ, name, i);
1102	if (!__dev_get_by_name(net, buf))
1103		return i;
1104
1105	/* It is possible to run out of possible slots
1106	 * when the name is long and there isn't enough space left
1107	 * for the digits, or if all bits are used.
1108	 */
1109	return -ENFILE;
1110}
1111
1112static int dev_alloc_name_ns(struct net *net,
1113			     struct net_device *dev,
1114			     const char *name)
1115{
1116	char buf[IFNAMSIZ];
1117	int ret;
1118
1119	BUG_ON(!net);
1120	ret = __dev_alloc_name(net, name, buf);
1121	if (ret >= 0)
1122		strlcpy(dev->name, buf, IFNAMSIZ);
1123	return ret;
1124}
1125
1126/**
1127 *	dev_alloc_name - allocate a name for a device
1128 *	@dev: device
1129 *	@name: name format string
1130 *
1131 *	Passed a format string - eg "lt%d" it will try and find a suitable
1132 *	id. It scans list of devices to build up a free map, then chooses
1133 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1134 *	while allocating the name and adding the device in order to avoid
1135 *	duplicates.
1136 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1137 *	Returns the number of the unit assigned or a negative errno code.
1138 */
1139
1140int dev_alloc_name(struct net_device *dev, const char *name)
1141{
1142	return dev_alloc_name_ns(dev_net(dev), dev, name);
1143}
1144EXPORT_SYMBOL(dev_alloc_name);
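/*
 * Illustrative sketch (not part of the upstream file): a driver picking the
 * next free "foo%d" name before registration.  The "foo%d" format and
 * example_name_device() are made up; the rtnl lock requirement comes from
 * the comment above.
 */
static int example_name_device(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();

	unit = dev_alloc_name(dev, "foo%d");
	if (unit < 0)
		return unit;	/* -EINVAL or -ENFILE */

	pr_info("assigned unit %d -> %s\n", unit, dev->name);
	return 0;
}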
1145
1146int dev_get_valid_name(struct net *net, struct net_device *dev,
1147		       const char *name)
1148{
1149	BUG_ON(!net);
1150
1151	if (!dev_valid_name(name))
1152		return -EINVAL;
1153
1154	if (strchr(name, '%'))
1155		return dev_alloc_name_ns(net, dev, name);
1156	else if (__dev_get_by_name(net, name))
1157		return -EEXIST;
1158	else if (dev->name != name)
1159		strlcpy(dev->name, name, IFNAMSIZ);
1160
1161	return 0;
1162}
1163EXPORT_SYMBOL(dev_get_valid_name);
1164
1165/**
1166 *	dev_change_name - change name of a device
1167 *	@dev: device
1168 *	@newname: name (or format string) must be at least IFNAMSIZ
1169 *
1170 *	Change the name of a device. A format string such as "eth%d"
1171 *	may be passed for wildcarding.
1172 */
1173int dev_change_name(struct net_device *dev, const char *newname)
1174{
1175	unsigned char old_assign_type;
1176	char oldname[IFNAMSIZ];
1177	int err = 0;
1178	int ret;
1179	struct net *net;
1180
1181	ASSERT_RTNL();
1182	BUG_ON(!dev_net(dev));
1183
1184	net = dev_net(dev);
1185	if (dev->flags & IFF_UP)
1186		return -EBUSY;
1187
1188	write_seqcount_begin(&devnet_rename_seq);
1189
1190	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1191		write_seqcount_end(&devnet_rename_seq);
1192		return 0;
1193	}
1194
1195	memcpy(oldname, dev->name, IFNAMSIZ);
1196
1197	err = dev_get_valid_name(net, dev, newname);
1198	if (err < 0) {
1199		write_seqcount_end(&devnet_rename_seq);
1200		return err;
1201	}
1202
1203	if (oldname[0] && !strchr(oldname, '%'))
1204		netdev_info(dev, "renamed from %s\n", oldname);
1205
1206	old_assign_type = dev->name_assign_type;
1207	dev->name_assign_type = NET_NAME_RENAMED;
1208
1209rollback:
1210	ret = device_rename(&dev->dev, dev->name);
1211	if (ret) {
1212		memcpy(dev->name, oldname, IFNAMSIZ);
1213		dev->name_assign_type = old_assign_type;
1214		write_seqcount_end(&devnet_rename_seq);
1215		return ret;
1216	}
1217
1218	write_seqcount_end(&devnet_rename_seq);
1219
1220	netdev_adjacent_rename_links(dev, oldname);
1221
1222	write_lock_bh(&dev_base_lock);
1223	hlist_del_rcu(&dev->name_hlist);
1224	write_unlock_bh(&dev_base_lock);
1225
1226	synchronize_rcu();
1227
1228	write_lock_bh(&dev_base_lock);
1229	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1230	write_unlock_bh(&dev_base_lock);
1231
1232	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1233	ret = notifier_to_errno(ret);
1234
1235	if (ret) {
1236		/* err >= 0 after dev_alloc_name() or stores the first errno */
1237		if (err >= 0) {
1238			err = ret;
1239			write_seqcount_begin(&devnet_rename_seq);
1240			memcpy(dev->name, oldname, IFNAMSIZ);
1241			memcpy(oldname, newname, IFNAMSIZ);
1242			dev->name_assign_type = old_assign_type;
1243			old_assign_type = NET_NAME_RENAMED;
1244			goto rollback;
1245		} else {
1246			pr_err("%s: name change rollback failed: %d\n",
1247			       dev->name, ret);
1248		}
1249	}
1250
1251	return err;
1252}
1253
1254/**
1255 *	dev_set_alias - change ifalias of a device
1256 *	@dev: device
1257 *	@alias: name up to IFALIASZ
1258 *	@len: limit of bytes to copy from info
1259 *
1260 *	Set the ifalias for a device.
1261 */
1262int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1263{
1264	struct dev_ifalias *new_alias = NULL;
1265
1266	if (len >= IFALIASZ)
1267		return -EINVAL;
1268
1269	if (len) {
1270		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1271		if (!new_alias)
1272			return -ENOMEM;
1273
1274		memcpy(new_alias->ifalias, alias, len);
1275		new_alias->ifalias[len] = 0;
1276	}
1277
1278	mutex_lock(&ifalias_mutex);
1279	rcu_swap_protected(dev->ifalias, new_alias,
1280			   mutex_is_locked(&ifalias_mutex));
1281	mutex_unlock(&ifalias_mutex);
1282
1283	if (new_alias)
1284		kfree_rcu(new_alias, rcuhead);
1285
1286	return len;
1287}
1288
1289/**
1290 *	dev_get_alias - get ifalias of a device
1291 *	@dev: device
1292 *	@name: buffer to store name of ifalias
1293 *	@len: size of buffer
1294 *
1295 *	get ifalias for a device.  Caller must make sure dev cannot go
1296 *	away,  e.g. rcu read lock or own a reference count to device.
1297 */
1298int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1299{
1300	const struct dev_ifalias *alias;
1301	int ret = 0;
1302
1303	rcu_read_lock();
1304	alias = rcu_dereference(dev->ifalias);
1305	if (alias)
1306		ret = snprintf(name, len, "%s", alias->ifalias);
1307	rcu_read_unlock();
1308
1309	return ret;
1310}
1311
1312/**
1313 *	netdev_features_change - device changes features
1314 *	@dev: device to cause notification
1315 *
1316 *	Called to indicate a device has changed features.
1317 */
1318void netdev_features_change(struct net_device *dev)
1319{
1320	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1321}
1322EXPORT_SYMBOL(netdev_features_change);
1323
1324/**
1325 *	netdev_state_change - device changes state
1326 *	@dev: device to cause notification
1327 *
1328 *	Called to indicate a device has changed state. This function calls
1329 *	the notifier chains for netdev_chain and sends a NEWLINK message
1330 *	to the routing socket.
1331 */
1332void netdev_state_change(struct net_device *dev)
1333{
1334	if (dev->flags & IFF_UP) {
1335		struct netdev_notifier_change_info change_info = {
1336			.info.dev = dev,
1337		};
1338
1339		call_netdevice_notifiers_info(NETDEV_CHANGE,
1340					      &change_info.info);
1341		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1342	}
1343}
1344EXPORT_SYMBOL(netdev_state_change);
1345
1346/**
1347 * netdev_notify_peers - notify network peers about existence of @dev
1348 * @dev: network device
1349 *
1350 * Generate traffic such that interested network peers are aware of
1351 * @dev, such as by generating a gratuitous ARP. This may be used when
1352 * a device wants to inform the rest of the network about some sort of
1353 * reconfiguration such as a failover event or virtual machine
1354 * migration.
1355 */
1356void netdev_notify_peers(struct net_device *dev)
1357{
1358	rtnl_lock();
1359	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1360	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1361	rtnl_unlock();
1362}
1363EXPORT_SYMBOL(netdev_notify_peers);
1364
1365static int __dev_open(struct net_device *dev)
1366{
1367	const struct net_device_ops *ops = dev->netdev_ops;
1368	int ret;
1369
1370	ASSERT_RTNL();
1371
1372	if (!netif_device_present(dev))
1373		return -ENODEV;
1374
1375	/* Block netpoll from trying to do any rx path servicing.
1376	 * If we don't do this there is a chance ndo_poll_controller
1377	 * or ndo_poll may be running while we open the device
1378	 */
1379	netpoll_poll_disable(dev);
1380
1381	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1382	ret = notifier_to_errno(ret);
1383	if (ret)
1384		return ret;
1385
1386	set_bit(__LINK_STATE_START, &dev->state);
1387
1388	if (ops->ndo_validate_addr)
1389		ret = ops->ndo_validate_addr(dev);
1390
1391	if (!ret && ops->ndo_open)
1392		ret = ops->ndo_open(dev);
1393
1394	netpoll_poll_enable(dev);
1395
1396	if (ret)
1397		clear_bit(__LINK_STATE_START, &dev->state);
1398	else {
1399		dev->flags |= IFF_UP;
1400		dev_set_rx_mode(dev);
1401		dev_activate(dev);
1402		add_device_randomness(dev->dev_addr, dev->addr_len);
1403	}
1404
1405	return ret;
1406}
1407
1408/**
1409 *	dev_open	- prepare an interface for use.
1410 *	@dev:	device to open
1411 *
1412 *	Takes a device from down to up state. The device's private open
1413 *	function is invoked and then the multicast lists are loaded. Finally
1414 *	the device is moved into the up state and a %NETDEV_UP message is
1415 *	sent to the netdev notifier chain.
1416 *
1417 *	Calling this function on an active interface is a nop. On a failure
1418 *	a negative errno code is returned.
1419 */
1420int dev_open(struct net_device *dev)
1421{
1422	int ret;
1423
1424	if (dev->flags & IFF_UP)
1425		return 0;
1426
1427	ret = __dev_open(dev);
1428	if (ret < 0)
1429		return ret;
1430
1431	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1432	call_netdevice_notifiers(NETDEV_UP, dev);
1433
1434	return ret;
1435}
1436EXPORT_SYMBOL(dev_open);
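/*
 * Illustrative sketch (not part of the upstream file): cycling an interface
 * from process context.  Both dev_open() and dev_close() must run under
 * rtnl_lock(); example_cycle_device() is hypothetical.
 */
static int example_cycle_device(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);	/* a nop if the device is already up */
	if (!err)
		dev_close(dev);	/* dev_close() has no return value */
	rtnl_unlock();

	return err;
}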
1437
1438static void __dev_close_many(struct list_head *head)
1439{
1440	struct net_device *dev;
1441
1442	ASSERT_RTNL();
1443	might_sleep();
1444
1445	list_for_each_entry(dev, head, close_list) {
1446		/* Temporarily disable netpoll until the interface is down */
1447		netpoll_poll_disable(dev);
1448
1449		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1450
1451		clear_bit(__LINK_STATE_START, &dev->state);
1452
1453		/* Synchronize to scheduled poll. We cannot touch poll list, it
1454		 * can be even on different cpu. So just clear netif_running().
1455		 *
1456		 * dev->stop() will invoke napi_disable() on all of its
1457		 * napi_struct instances on this device.
1458		 */
1459		smp_mb__after_atomic(); /* Commit netif_running(). */
1460	}
1461
1462	dev_deactivate_many(head);
1463
1464	list_for_each_entry(dev, head, close_list) {
1465		const struct net_device_ops *ops = dev->netdev_ops;
1466
1467		/*
1468		 *	Call the device specific close. This cannot fail.
1469		 *	Only if device is UP
1470		 *
1471		 *	We allow it to be called even after a DETACH hot-plug
1472		 *	event.
1473		 */
1474		if (ops->ndo_stop)
1475			ops->ndo_stop(dev);
1476
1477		dev->flags &= ~IFF_UP;
1478		netpoll_poll_enable(dev);
1479	}
1480}
1481
1482static void __dev_close(struct net_device *dev)
1483{
1484	LIST_HEAD(single);
1485
1486	list_add(&dev->close_list, &single);
1487	__dev_close_many(&single);
1488	list_del(&single);
1489}
1490
1491void dev_close_many(struct list_head *head, bool unlink)
1492{
1493	struct net_device *dev, *tmp;
1494
1495	/* Remove the devices that don't need to be closed */
1496	list_for_each_entry_safe(dev, tmp, head, close_list)
1497		if (!(dev->flags & IFF_UP))
1498			list_del_init(&dev->close_list);
1499
1500	__dev_close_many(head);
1501
1502	list_for_each_entry_safe(dev, tmp, head, close_list) {
1503		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1504		call_netdevice_notifiers(NETDEV_DOWN, dev);
1505		if (unlink)
1506			list_del_init(&dev->close_list);
1507	}
1508}
1509EXPORT_SYMBOL(dev_close_many);
1510
1511/**
1512 *	dev_close - shutdown an interface.
1513 *	@dev: device to shutdown
1514 *
1515 *	This function moves an active device into down state. A
1516 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1517 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1518 *	chain.
1519 */
1520void dev_close(struct net_device *dev)
1521{
1522	if (dev->flags & IFF_UP) {
1523		LIST_HEAD(single);
1524
1525		list_add(&dev->close_list, &single);
1526		dev_close_many(&single, true);
1527		list_del(&single);
1528	}
1529}
1530EXPORT_SYMBOL(dev_close);
1531
1532
1533/**
1534 *	dev_disable_lro - disable Large Receive Offload on a device
1535 *	@dev: device
1536 *
1537 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1538 *	called under RTNL.  This is needed if received packets may be
1539 *	forwarded to another interface.
1540 */
1541void dev_disable_lro(struct net_device *dev)
1542{
1543	struct net_device *lower_dev;
1544	struct list_head *iter;
1545
1546	dev->wanted_features &= ~NETIF_F_LRO;
1547	netdev_update_features(dev);
1548
1549	if (unlikely(dev->features & NETIF_F_LRO))
1550		netdev_WARN(dev, "failed to disable LRO!\n");
1551
1552	netdev_for_each_lower_dev(dev, lower_dev, iter)
1553		dev_disable_lro(lower_dev);
1554}
1555EXPORT_SYMBOL(dev_disable_lro);
1556
1557/**
1558 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1559 *	@dev: device
1560 *
1561 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1562 *	called under RTNL.  This is needed if Generic XDP is installed on
1563 *	the device.
1564 */
1565static void dev_disable_gro_hw(struct net_device *dev)
1566{
1567	dev->wanted_features &= ~NETIF_F_GRO_HW;
1568	netdev_update_features(dev);
1569
1570	if (unlikely(dev->features & NETIF_F_GRO_HW))
1571		netdev_WARN(dev, "failed to disable GRO_HW!\n");
1572}
1573
1574const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1575{
1576#define N(val) 						\
1577	case NETDEV_##val:				\
1578		return "NETDEV_" __stringify(val);
1579	switch (cmd) {
1580	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1581	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1582	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1583	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1584	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1585	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1586	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1587	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1588	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1589	};
1590#undef N
1591	return "UNKNOWN_NETDEV_EVENT";
1592}
1593EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1594
1595static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1596				   struct net_device *dev)
1597{
1598	struct netdev_notifier_info info = {
1599		.dev = dev,
1600	};
1601
1602	return nb->notifier_call(nb, val, &info);
1603}
1604
1605static int dev_boot_phase = 1;
1606
1607/**
1608 * register_netdevice_notifier - register a network notifier block
1609 * @nb: notifier
1610 *
1611 * Register a notifier to be called when network device events occur.
1612 * The notifier passed is linked into the kernel structures and must
1613 * not be reused until it has been unregistered. A negative errno code
1614 * is returned on a failure.
1615 *
1616 * When registered, all registration and up events are replayed
1617 * to the new notifier to allow the device to have a race-free
1618 * view of the network device list.
1619 */
1620
1621int register_netdevice_notifier(struct notifier_block *nb)
1622{
1623	struct net_device *dev;
1624	struct net_device *last;
1625	struct net *net;
1626	int err;
1627
1628	/* Close race with setup_net() and cleanup_net() */
1629	down_write(&pernet_ops_rwsem);
1630	rtnl_lock();
1631	err = raw_notifier_chain_register(&netdev_chain, nb);
1632	if (err)
1633		goto unlock;
1634	if (dev_boot_phase)
1635		goto unlock;
1636	for_each_net(net) {
1637		for_each_netdev(net, dev) {
1638			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1639			err = notifier_to_errno(err);
1640			if (err)
1641				goto rollback;
1642
1643			if (!(dev->flags & IFF_UP))
1644				continue;
1645
1646			call_netdevice_notifier(nb, NETDEV_UP, dev);
1647		}
1648	}
1649
1650unlock:
1651	rtnl_unlock();
1652	up_write(&pernet_ops_rwsem);
1653	return err;
1654
1655rollback:
1656	last = dev;
1657	for_each_net(net) {
1658		for_each_netdev(net, dev) {
1659			if (dev == last)
1660				goto outroll;
1661
1662			if (dev->flags & IFF_UP) {
1663				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1664							dev);
1665				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1666			}
1667			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1668		}
1669	}
1670
1671outroll:
1672	raw_notifier_chain_unregister(&netdev_chain, nb);
1673	goto unlock;
1674}
1675EXPORT_SYMBOL(register_netdevice_notifier);
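/*
 * Illustrative sketch (not part of the upstream file): a minimal notifier
 * that logs NETDEV_UP events.  example_netdev_event() and example_nb are
 * hypothetical; netdev_notifier_info_to_dev() recovers the net_device from
 * the notifier payload.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		pr_info("%s: %s\n", dev->name, netdev_cmd_to_name(event));

	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};

/* Registration and teardown, e.g. from module init/exit:
 *	register_netdevice_notifier(&example_nb);
 *	...
 *	unregister_netdevice_notifier(&example_nb);
 */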
1676
1677/**
1678 * unregister_netdevice_notifier - unregister a network notifier block
1679 * @nb: notifier
1680 *
1681 * Unregister a notifier previously registered by
1682 * register_netdevice_notifier(). The notifier is unlinked from the
1683 * kernel structures and may then be reused. A negative errno code
1684 * is returned on a failure.
1685 *
1686 * After unregistering, unregister and down device events are synthesized
1687 * for all devices on the device list and sent to the removed notifier to remove
1688 * the need for special case cleanup code.
1689 */
1690
1691int unregister_netdevice_notifier(struct notifier_block *nb)
1692{
1693	struct net_device *dev;
1694	struct net *net;
1695	int err;
1696
1697	/* Close race with setup_net() and cleanup_net() */
1698	down_write(&pernet_ops_rwsem);
1699	rtnl_lock();
1700	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1701	if (err)
1702		goto unlock;
1703
1704	for_each_net(net) {
1705		for_each_netdev(net, dev) {
1706			if (dev->flags & IFF_UP) {
1707				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1708							dev);
1709				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1710			}
1711			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1712		}
1713	}
1714unlock:
1715	rtnl_unlock();
1716	up_write(&pernet_ops_rwsem);
1717	return err;
1718}
1719EXPORT_SYMBOL(unregister_netdevice_notifier);
1720
1721/**
1722 *	call_netdevice_notifiers_info - call all network notifier blocks
1723 *	@val: value passed unmodified to notifier function
1724 *	@info: notifier information data
1725 *
1726 *	Call all network notifier blocks.  Parameters and return value
1727 *	are as for raw_notifier_call_chain().
1728 */
1729
1730static int call_netdevice_notifiers_info(unsigned long val,
1731					 struct netdev_notifier_info *info)
1732{
1733	ASSERT_RTNL();
1734	return raw_notifier_call_chain(&netdev_chain, val, info);
1735}
1736
1737/**
1738 *	call_netdevice_notifiers - call all network notifier blocks
1739 *      @val: value passed unmodified to notifier function
1740 *      @dev: net_device pointer passed unmodified to notifier function
1741 *
1742 *	Call all network notifier blocks.  Parameters and return value
1743 *	are as for raw_notifier_call_chain().
1744 */
1745
1746int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1747{
1748	struct netdev_notifier_info info = {
1749		.dev = dev,
1750	};
1751
1752	return call_netdevice_notifiers_info(val, &info);
1753}
1754EXPORT_SYMBOL(call_netdevice_notifiers);
1755
1756#ifdef CONFIG_NET_INGRESS
1757static struct static_key ingress_needed __read_mostly;
1758
1759void net_inc_ingress_queue(void)
1760{
1761	static_key_slow_inc(&ingress_needed);
1762}
1763EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1764
1765void net_dec_ingress_queue(void)
1766{
1767	static_key_slow_dec(&ingress_needed);
1768}
1769EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1770#endif
1771
1772#ifdef CONFIG_NET_EGRESS
1773static struct static_key egress_needed __read_mostly;
1774
1775void net_inc_egress_queue(void)
1776{
1777	static_key_slow_inc(&egress_needed);
1778}
1779EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1780
1781void net_dec_egress_queue(void)
1782{
1783	static_key_slow_dec(&egress_needed);
1784}
1785EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1786#endif
1787
1788static struct static_key netstamp_needed __read_mostly;
1789#ifdef HAVE_JUMP_LABEL
1790static atomic_t netstamp_needed_deferred;
1791static atomic_t netstamp_wanted;
1792static void netstamp_clear(struct work_struct *work)
1793{
1794	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1795	int wanted;
1796
1797	wanted = atomic_add_return(deferred, &netstamp_wanted);
1798	if (wanted > 0)
1799		static_key_enable(&netstamp_needed);
1800	else
1801		static_key_disable(&netstamp_needed);
1802}
1803static DECLARE_WORK(netstamp_work, netstamp_clear);
1804#endif
1805
1806void net_enable_timestamp(void)
1807{
1808#ifdef HAVE_JUMP_LABEL
1809	int wanted;
1810
1811	while (1) {
1812		wanted = atomic_read(&netstamp_wanted);
1813		if (wanted <= 0)
1814			break;
1815		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1816			return;
1817	}
1818	atomic_inc(&netstamp_needed_deferred);
1819	schedule_work(&netstamp_work);
1820#else
1821	static_key_slow_inc(&netstamp_needed);
1822#endif
1823}
1824EXPORT_SYMBOL(net_enable_timestamp);
1825
1826void net_disable_timestamp(void)
1827{
1828#ifdef HAVE_JUMP_LABEL
1829	int wanted;
1830
1831	while (1) {
1832		wanted = atomic_read(&netstamp_wanted);
1833		if (wanted <= 1)
1834			break;
1835		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1836			return;
1837	}
1838	atomic_dec(&netstamp_needed_deferred);
1839	schedule_work(&netstamp_work);
1840#else
1841	static_key_slow_dec(&netstamp_needed);
1842#endif
1843}
1844EXPORT_SYMBOL(net_disable_timestamp);
1845
1846static inline void net_timestamp_set(struct sk_buff *skb)
1847{
1848	skb->tstamp = 0;
1849	if (static_key_false(&netstamp_needed))
1850		__net_timestamp(skb);
1851}
1852
1853#define net_timestamp_check(COND, SKB)			\
1854	if (static_key_false(&netstamp_needed)) {		\
1855		if ((COND) && !(SKB)->tstamp)	\
1856			__net_timestamp(SKB);		\
1857	}						\
1858
1859bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1860{
1861	unsigned int len;
1862
1863	if (!(dev->flags & IFF_UP))
1864		return false;
1865
1866	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1867	if (skb->len <= len)
1868		return true;
1869
1870	/* if TSO is enabled, we don't care about the length as the packet
1871	 * could be forwarded without being segmented before
1872	 */
1873	if (skb_is_gso(skb))
1874		return true;
1875
1876	return false;
1877}
1878EXPORT_SYMBOL_GPL(is_skb_forwardable);
1879
1880int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1881{
1882	int ret = ____dev_forward_skb(dev, skb);
1883
1884	if (likely(!ret)) {
1885		skb->protocol = eth_type_trans(skb, dev);
1886		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1887	}
1888
1889	return ret;
1890}
1891EXPORT_SYMBOL_GPL(__dev_forward_skb);
1892
1893/**
1894 * dev_forward_skb - loopback an skb to another netif
1895 *
1896 * @dev: destination network device
1897 * @skb: buffer to forward
1898 *
1899 * return values:
1900 *	NET_RX_SUCCESS	(no congestion)
1901 *	NET_RX_DROP     (packet was dropped, but freed)
1902 *
1903 * dev_forward_skb can be used for injecting an skb from the
1904 * start_xmit function of one device into the receive queue
1905 * of another device.
1906 *
1907 * The receiving device may be in another namespace, so
1908 * we have to clear all information in the skb that could
1909 * impact namespace isolation.
1910 */
1911int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1912{
1913	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1914}
1915EXPORT_SYMBOL_GPL(dev_forward_skb);
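/*
 * Illustrative sketch (not part of the upstream file): how a veth-style pair
 * device might hand a transmitted skb to its peer's receive path.  The
 * example_pair_xmit() helper and the way the peer is obtained are made up.
 */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *peer)
{
	/* dev_forward_skb() always consumes the skb, even when it drops it */
	dev_forward_skb(peer, skb);

	return NETDEV_TX_OK;
}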
1916
1917static inline int deliver_skb(struct sk_buff *skb,
1918			      struct packet_type *pt_prev,
1919			      struct net_device *orig_dev)
1920{
1921	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1922		return -ENOMEM;
1923	refcount_inc(&skb->users);
1924	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1925}
1926
1927static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1928					  struct packet_type **pt,
1929					  struct net_device *orig_dev,
1930					  __be16 type,
1931					  struct list_head *ptype_list)
1932{
1933	struct packet_type *ptype, *pt_prev = *pt;
1934
1935	list_for_each_entry_rcu(ptype, ptype_list, list) {
1936		if (ptype->type != type)
1937			continue;
1938		if (pt_prev)
1939			deliver_skb(skb, pt_prev, orig_dev);
1940		pt_prev = ptype;
1941	}
1942	*pt = pt_prev;
1943}
1944
1945static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1946{
1947	if (!ptype->af_packet_priv || !skb->sk)
1948		return false;
1949
1950	if (ptype->id_match)
1951		return ptype->id_match(ptype, skb->sk);
1952	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1953		return true;
1954
1955	return false;
1956}
1957
1958/*
1959 *	Support routine. Sends outgoing frames to any network
1960 *	taps currently in use.
1961 */
1962
1963void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1964{
1965	struct packet_type *ptype;
1966	struct sk_buff *skb2 = NULL;
1967	struct packet_type *pt_prev = NULL;
1968	struct list_head *ptype_list = &ptype_all;
1969
1970	rcu_read_lock();
1971again:
1972	list_for_each_entry_rcu(ptype, ptype_list, list) {
1973		/* Never send packets back to the socket
1974		 * they originated from - MvS (miquels@drinkel.ow.org)
1975		 */
1976		if (skb_loop_sk(ptype, skb))
1977			continue;
1978
1979		if (pt_prev) {
1980			deliver_skb(skb2, pt_prev, skb->dev);
1981			pt_prev = ptype;
1982			continue;
1983		}
1984
1985		/* need to clone skb, done only once */
1986		skb2 = skb_clone(skb, GFP_ATOMIC);
1987		if (!skb2)
1988			goto out_unlock;
1989
1990		net_timestamp_set(skb2);
1991
1992		/* skb->nh should be correctly
1993		 * set by sender, so that the second statement is
1994		 * just protection against buggy protocols.
1995		 */
1996		skb_reset_mac_header(skb2);
1997
1998		if (skb_network_header(skb2) < skb2->data ||
1999		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2000			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2001					     ntohs(skb2->protocol),
2002					     dev->name);
2003			skb_reset_network_header(skb2);
2004		}
2005
2006		skb2->transport_header = skb2->network_header;
2007		skb2->pkt_type = PACKET_OUTGOING;
2008		pt_prev = ptype;
2009	}
2010
2011	if (ptype_list == &ptype_all) {
2012		ptype_list = &dev->ptype_all;
2013		goto again;
2014	}
2015out_unlock:
2016	if (pt_prev) {
2017		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2018			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2019		else
2020			kfree_skb(skb2);
2021	}
2022	rcu_read_unlock();
2023}
2024EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2025
2026/**
2027 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2028 * @dev: Network device
2029 * @txq: number of queues available
2030 *
2031 * If real_num_tx_queues is changed, the tc mappings may no longer be
2032 * valid. To resolve this verify the tc mapping remains valid and, if
2033 * not, NULL the mapping. With no priorities mapping to this
2034 * offset/count pair it will no longer be used. In the worst case, if TC0
2035 * is invalid, nothing can be done, so disable priority mappings. It is
2036 * expected that drivers will fix this mapping if they can before
2037 * calling netif_set_real_num_tx_queues.
2038 */
2039static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2040{
2041	int i;
2042	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2043
2044	/* If TC0 is invalidated disable TC mapping */
2045	if (tc->offset + tc->count > txq) {
2046		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2047		dev->num_tc = 0;
2048		return;
2049	}
2050
2051	/* Invalidated prio to tc mappings set to TC0 */
2052	for (i = 1; i < TC_BITMASK + 1; i++) {
2053		int q = netdev_get_prio_tc_map(dev, i);
2054
2055		tc = &dev->tc_to_txq[q];
2056		if (tc->offset + tc->count > txq) {
2057			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2058				i, q);
2059			netdev_set_prio_tc_map(dev, i, 0);
2060		}
2061	}
2062}
2063
2064int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2065{
2066	if (dev->num_tc) {
2067		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2068		int i;
2069
2070		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2071			if ((txq - tc->offset) < tc->count)
2072				return i;
2073		}
2074
2075		return -1;
2076	}
2077
2078	return 0;
2079}
2080EXPORT_SYMBOL(netdev_txq_to_tc);
2081
2082#ifdef CONFIG_XPS
2083static DEFINE_MUTEX(xps_map_mutex);
2084#define xmap_dereference(P)		\
2085	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2086
2087static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2088			     int tci, u16 index)
2089{
2090	struct xps_map *map = NULL;
2091	int pos;
2092
2093	if (dev_maps)
2094		map = xmap_dereference(dev_maps->cpu_map[tci]);
2095	if (!map)
2096		return false;
2097
2098	for (pos = map->len; pos--;) {
2099		if (map->queues[pos] != index)
2100			continue;
2101
2102		if (map->len > 1) {
2103			map->queues[pos] = map->queues[--map->len];
2104			break;
2105		}
2106
2107		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2108		kfree_rcu(map, rcu);
2109		return false;
2110	}
2111
2112	return true;
2113}
2114
2115static bool remove_xps_queue_cpu(struct net_device *dev,
2116				 struct xps_dev_maps *dev_maps,
2117				 int cpu, u16 offset, u16 count)
2118{
2119	int num_tc = dev->num_tc ? : 1;
2120	bool active = false;
2121	int tci;
2122
2123	for (tci = cpu * num_tc; num_tc--; tci++) {
2124		int i, j;
2125
2126		for (i = count, j = offset; i--; j++) {
2127			if (!remove_xps_queue(dev_maps, tci, j))
2128				break;
2129		}
2130
2131		active |= i < 0;
2132	}
2133
2134	return active;
2135}
2136
2137static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2138				   u16 count)
2139{
2140	struct xps_dev_maps *dev_maps;
2141	int cpu, i;
2142	bool active = false;
2143
2144	mutex_lock(&xps_map_mutex);
2145	dev_maps = xmap_dereference(dev->xps_maps);
2146
2147	if (!dev_maps)
2148		goto out_no_maps;
2149
2150	for_each_possible_cpu(cpu)
2151		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2152					       offset, count);
2153
2154	if (!active) {
2155		RCU_INIT_POINTER(dev->xps_maps, NULL);
2156		kfree_rcu(dev_maps, rcu);
2157	}
2158
2159	for (i = offset + (count - 1); count--; i--)
2160		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2161					     NUMA_NO_NODE);
2162
2163out_no_maps:
2164	mutex_unlock(&xps_map_mutex);
2165}
2166
2167static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2168{
2169	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2170}
2171
2172static struct xps_map *expand_xps_map(struct xps_map *map,
2173				      int cpu, u16 index)
2174{
2175	struct xps_map *new_map;
2176	int alloc_len = XPS_MIN_MAP_ALLOC;
2177	int i, pos;
2178
2179	for (pos = 0; map && pos < map->len; pos++) {
2180		if (map->queues[pos] != index)
2181			continue;
2182		return map;
2183	}
2184
2185	/* Need to add queue to this CPU's existing map */
2186	if (map) {
2187		if (pos < map->alloc_len)
2188			return map;
2189
2190		alloc_len = map->alloc_len * 2;
2191	}
2192
2193	/* Need to allocate new map to store queue on this CPU's map */
2194	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2195			       cpu_to_node(cpu));
2196	if (!new_map)
2197		return NULL;
2198
2199	for (i = 0; i < pos; i++)
2200		new_map->queues[i] = map->queues[i];
2201	new_map->alloc_len = alloc_len;
2202	new_map->len = pos;
2203
2204	return new_map;
2205}
2206
2207int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2208			u16 index)
2209{
2210	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2211	int i, cpu, tci, numa_node_id = -2;
2212	int maps_sz, num_tc = 1, tc = 0;
2213	struct xps_map *map, *new_map;
2214	bool active = false;
2215
2216	if (dev->num_tc) {
2217		num_tc = dev->num_tc;
2218		tc = netdev_txq_to_tc(dev, index);
2219		if (tc < 0)
2220			return -EINVAL;
2221	}
2222
2223	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2224	if (maps_sz < L1_CACHE_BYTES)
2225		maps_sz = L1_CACHE_BYTES;
2226
2227	mutex_lock(&xps_map_mutex);
2228
2229	dev_maps = xmap_dereference(dev->xps_maps);
2230
2231	/* allocate memory for queue storage */
2232	for_each_cpu_and(cpu, cpu_online_mask, mask) {
2233		if (!new_dev_maps)
2234			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2235		if (!new_dev_maps) {
2236			mutex_unlock(&xps_map_mutex);
2237			return -ENOMEM;
2238		}
2239
2240		tci = cpu * num_tc + tc;
2241		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2242				 NULL;
2243
2244		map = expand_xps_map(map, cpu, index);
2245		if (!map)
2246			goto error;
2247
2248		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2249	}
2250
2251	if (!new_dev_maps)
2252		goto out_no_new_maps;
2253
2254	for_each_possible_cpu(cpu) {
2255		/* copy maps belonging to foreign traffic classes */
2256		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2257			/* fill in the new device map from the old device map */
2258			map = xmap_dereference(dev_maps->cpu_map[tci]);
2259			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2260		}
2261
2262		/* We need to explicitly update tci as previous loop
2263		 * could break out early if dev_maps is NULL.
2264		 */
2265		tci = cpu * num_tc + tc;
2266
2267		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2268			/* add queue to CPU maps */
2269			int pos = 0;
2270
2271			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2272			while ((pos < map->len) && (map->queues[pos] != index))
2273				pos++;
2274
2275			if (pos == map->len)
2276				map->queues[map->len++] = index;
2277#ifdef CONFIG_NUMA
2278			if (numa_node_id == -2)
2279				numa_node_id = cpu_to_node(cpu);
2280			else if (numa_node_id != cpu_to_node(cpu))
2281				numa_node_id = -1;
2282#endif
2283		} else if (dev_maps) {
2284			/* fill in the new device map from the old device map */
2285			map = xmap_dereference(dev_maps->cpu_map[tci]);
2286			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2287		}
2288
2289		/* copy maps belonging to foreign traffic classes */
2290		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2291			/* fill in the new device map from the old device map */
2292			map = xmap_dereference(dev_maps->cpu_map[tci]);
2293			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2294		}
2295	}
2296
2297	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2298
2299	/* Cleanup old maps */
2300	if (!dev_maps)
2301		goto out_no_old_maps;
2302
2303	for_each_possible_cpu(cpu) {
2304		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2305			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2306			map = xmap_dereference(dev_maps->cpu_map[tci]);
2307			if (map && map != new_map)
2308				kfree_rcu(map, rcu);
2309		}
2310	}
2311
2312	kfree_rcu(dev_maps, rcu);
2313
2314out_no_old_maps:
2315	dev_maps = new_dev_maps;
2316	active = true;
2317
2318out_no_new_maps:
2319	/* update Tx queue numa node */
2320	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2321				     (numa_node_id >= 0) ? numa_node_id :
2322				     NUMA_NO_NODE);
2323
2324	if (!dev_maps)
2325		goto out_no_maps;
2326
2327	/* removes queue from unused CPUs */
2328	for_each_possible_cpu(cpu) {
2329		for (i = tc, tci = cpu * num_tc; i--; tci++)
2330			active |= remove_xps_queue(dev_maps, tci, index);
2331		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2332			active |= remove_xps_queue(dev_maps, tci, index);
2333		for (i = num_tc - tc, tci++; --i; tci++)
2334			active |= remove_xps_queue(dev_maps, tci, index);
2335	}
2336
2337	/* free map if not active */
2338	if (!active) {
2339		RCU_INIT_POINTER(dev->xps_maps, NULL);
2340		kfree_rcu(dev_maps, rcu);
2341	}
2342
2343out_no_maps:
2344	mutex_unlock(&xps_map_mutex);
2345
2346	return 0;
2347error:
2348	/* remove any maps that we added */
2349	for_each_possible_cpu(cpu) {
2350		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2351			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2352			map = dev_maps ?
2353			      xmap_dereference(dev_maps->cpu_map[tci]) :
2354			      NULL;
2355			if (new_map && new_map != map)
2356				kfree(new_map);
2357		}
2358	}
2359
2360	mutex_unlock(&xps_map_mutex);
2361
2362	kfree(new_dev_maps);
2363	return -ENOMEM;
2364}
2365EXPORT_SYMBOL(netif_set_xps_queue);
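/*
 * Illustrative sketch (not part of the upstream file): pinning one Tx queue
 * to a single CPU.  example_pin_queue_to_cpu() is hypothetical;
 * zalloc_cpumask_var()/free_cpumask_var() come from <linux/cpumask.h>.
 */
static int example_pin_queue_to_cpu(struct net_device *dev, u16 queue, int cpu)
{
	cpumask_var_t mask;
	int err;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(cpu, mask);
	err = netif_set_xps_queue(dev, mask, queue);	/* replaces this queue's map */
	free_cpumask_var(mask);

	return err;
}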
2366
2367#endif
2368void netdev_reset_tc(struct net_device *dev)
2369{
2370#ifdef CONFIG_XPS
2371	netif_reset_xps_queues_gt(dev, 0);
2372#endif
2373	dev->num_tc = 0;
2374	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2375	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2376}
2377EXPORT_SYMBOL(netdev_reset_tc);
2378
2379int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2380{
2381	if (tc >= dev->num_tc)
2382		return -EINVAL;
2383
2384#ifdef CONFIG_XPS
2385	netif_reset_xps_queues(dev, offset, count);
2386#endif
2387	dev->tc_to_txq[tc].count = count;
2388	dev->tc_to_txq[tc].offset = offset;
2389	return 0;
2390}
2391EXPORT_SYMBOL(netdev_set_tc_queue);
2392
2393int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2394{
2395	if (num_tc > TC_MAX_QUEUE)
2396		return -EINVAL;
2397
2398#ifdef CONFIG_XPS
2399	netif_reset_xps_queues_gt(dev, 0);
2400#endif
2401	dev->num_tc = num_tc;
2402	return 0;
2403}
2404EXPORT_SYMBOL(netdev_set_num_tc);
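/*
 * Illustrative sketch (not part of the upstream file): carving eight Tx
 * queues into two traffic classes of four queues each.  example_setup_tc()
 * and the particular split are made up.
 */
static int example_setup_tc(struct net_device *dev)
{
	int err;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;

	err = netdev_set_tc_queue(dev, 0, 4, 0);	/* tc 0: queues 0-3 */
	if (!err)
		err = netdev_set_tc_queue(dev, 1, 4, 4);	/* tc 1: queues 4-7 */

	return err;
}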
2405
2406/*
2407 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2408 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2409 */
2410int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2411{
2412	bool disabling;
2413	int rc;
2414
2415	disabling = txq < dev->real_num_tx_queues;
2416
2417	if (txq < 1 || txq > dev->num_tx_queues)
2418		return -EINVAL;
2419
2420	if (dev->reg_state == NETREG_REGISTERED ||
2421	    dev->reg_state == NETREG_UNREGISTERING) {
2422		ASSERT_RTNL();
2423
2424		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2425						  txq);
2426		if (rc)
2427			return rc;
2428
2429		if (dev->num_tc)
2430			netif_setup_tc(dev, txq);
2431
2432		dev->real_num_tx_queues = txq;
2433
2434		if (disabling) {
2435			synchronize_net();
2436			qdisc_reset_all_tx_gt(dev, txq);
2437#ifdef CONFIG_XPS
2438			netif_reset_xps_queues_gt(dev, txq);
2439#endif
2440		}
2441	} else {
2442		dev->real_num_tx_queues = txq;
2443	}
2444
2445	return 0;
2446}
2447EXPORT_SYMBOL(netif_set_real_num_tx_queues);
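/*
 * Illustrative sketch (not part of the upstream file): a driver changing its
 * active Tx queue count at runtime; rtnl_lock() is required once the device
 * is registered.  example_set_tx_queues() is hypothetical.
 */
static int example_set_tx_queues(struct net_device *dev, unsigned int txq)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, txq);
	rtnl_unlock();

	return err;
}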
2448
2449#ifdef CONFIG_SYSFS
2450/**
2451 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2452 *	@dev: Network device
2453 *	@rxq: Actual number of RX queues
2454 *
2455 *	This must be called either with the rtnl_lock held or before
2456 *	registration of the net device.  Returns 0 on success, or a
2457 *	negative error code.  If called before registration, it always
2458 *	succeeds.
2459 */
2460int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2461{
2462	int rc;
2463
2464	if (rxq < 1 || rxq > dev->num_rx_queues)
2465		return -EINVAL;
2466
2467	if (dev->reg_state == NETREG_REGISTERED) {
2468		ASSERT_RTNL();
2469
2470		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2471						  rxq);
2472		if (rc)
2473			return rc;
2474	}
2475
2476	dev->real_num_rx_queues = rxq;
2477	return 0;
2478}
2479EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2480#endif
2481
2482/**
2483 * netif_get_num_default_rss_queues - default number of RSS queues
2484 *
2485 * This routine should set an upper limit on the number of RSS queues
2486 * used by default by multiqueue devices.
2487 */
2488int netif_get_num_default_rss_queues(void)
2489{
2490	return is_kdump_kernel() ?
2491		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2492}
2493EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2494
2495static void __netif_reschedule(struct Qdisc *q)
2496{
2497	struct softnet_data *sd;
2498	unsigned long flags;
2499
2500	local_irq_save(flags);
2501	sd = this_cpu_ptr(&softnet_data);
2502	q->next_sched = NULL;
2503	*sd->output_queue_tailp = q;
2504	sd->output_queue_tailp = &q->next_sched;
2505	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2506	local_irq_restore(flags);
2507}
2508
2509void __netif_schedule(struct Qdisc *q)
2510{
2511	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2512		__netif_reschedule(q);
2513}
2514EXPORT_SYMBOL(__netif_schedule);
2515
2516struct dev_kfree_skb_cb {
2517	enum skb_free_reason reason;
2518};
2519
2520static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2521{
2522	return (struct dev_kfree_skb_cb *)skb->cb;
2523}
2524
2525void netif_schedule_queue(struct netdev_queue *txq)
2526{
2527	rcu_read_lock();
2528	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2529		struct Qdisc *q = rcu_dereference(txq->qdisc);
2530
2531		__netif_schedule(q);
2532	}
2533	rcu_read_unlock();
2534}
2535EXPORT_SYMBOL(netif_schedule_queue);
2536
2537void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2538{
2539	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2540		struct Qdisc *q;
2541
2542		rcu_read_lock();
2543		q = rcu_dereference(dev_queue->qdisc);
2544		__netif_schedule(q);
2545		rcu_read_unlock();
2546	}
2547}
2548EXPORT_SYMBOL(netif_tx_wake_queue);
2549
2550void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2551{
2552	unsigned long flags;
2553
2554	if (unlikely(!skb))
2555		return;
2556
2557	if (likely(refcount_read(&skb->users) == 1)) {
2558		smp_rmb();
2559		refcount_set(&skb->users, 0);
2560	} else if (likely(!refcount_dec_and_test(&skb->users))) {
2561		return;
2562	}
2563	get_kfree_skb_cb(skb)->reason = reason;
2564	local_irq_save(flags);
2565	skb->next = __this_cpu_read(softnet_data.completion_queue);
2566	__this_cpu_write(softnet_data.completion_queue, skb);
2567	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2568	local_irq_restore(flags);
2569}
2570EXPORT_SYMBOL(__dev_kfree_skb_irq);
2571
2572void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2573{
2574	if (in_irq() || irqs_disabled())
2575		__dev_kfree_skb_irq(skb, reason);
2576	else
2577		dev_kfree_skb(skb);
2578}
2579EXPORT_SYMBOL(__dev_kfree_skb_any);
2580
2581
2582/**
2583 * netif_device_detach - mark device as removed
2584 * @dev: network device
2585 *
2586 * Mark device as removed from system and therefore no longer available.
2587 */
2588void netif_device_detach(struct net_device *dev)
2589{
2590	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2591	    netif_running(dev)) {
2592		netif_tx_stop_all_queues(dev);
2593	}
2594}
2595EXPORT_SYMBOL(netif_device_detach);
2596
2597/**
2598 * netif_device_attach - mark device as attached
2599 * @dev: network device
2600 *
2601 * Mark device as attached to the system and restart if needed.
2602 */
2603void netif_device_attach(struct net_device *dev)
2604{
2605	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2606	    netif_running(dev)) {
2607		netif_tx_wake_all_queues(dev);
2608		__netdev_watchdog_up(dev);
2609	}
2610}
2611EXPORT_SYMBOL(netif_device_attach);
2612
2613/*
2614 * Returns a Tx hash based on the given packet descriptor and a Tx queues'
2615 * number to be used as a distribution range.
2616 */
2617u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2618		  unsigned int num_tx_queues)
2619{
2620	u32 hash;
2621	u16 qoffset = 0;
2622	u16 qcount = num_tx_queues;
2623
2624	if (skb_rx_queue_recorded(skb)) {
2625		hash = skb_get_rx_queue(skb);
2626		while (unlikely(hash >= num_tx_queues))
2627			hash -= num_tx_queues;
2628		return hash;
2629	}
2630
2631	if (dev->num_tc) {
2632		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2633
2634		qoffset = dev->tc_to_txq[tc].offset;
2635		qcount = dev->tc_to_txq[tc].count;
2636	}
2637
2638	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2639}
2640EXPORT_SYMBOL(__skb_tx_hash);
2641
2642static void skb_warn_bad_offload(const struct sk_buff *skb)
2643{
2644	static const netdev_features_t null_features;
2645	struct net_device *dev = skb->dev;
2646	const char *name = "";
2647
2648	if (!net_ratelimit())
2649		return;
2650
2651	if (dev) {
2652		if (dev->dev.parent)
2653			name = dev_driver_string(dev->dev.parent);
2654		else
2655			name = netdev_name(dev);
2656	}
2657	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2658	     "gso_type=%d ip_summed=%d\n",
2659	     name, dev ? &dev->features : &null_features,
2660	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2661	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2662	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2663}
2664
2665/*
2666 * Invalidate hardware checksum when packet is to be mangled, and
2667 * complete checksum manually on outgoing path.
2668 */
2669int skb_checksum_help(struct sk_buff *skb)
2670{
2671	__wsum csum;
2672	int ret = 0, offset;
2673
2674	if (skb->ip_summed == CHECKSUM_COMPLETE)
2675		goto out_set_summed;
2676
2677	if (unlikely(skb_shinfo(skb)->gso_size)) {
2678		skb_warn_bad_offload(skb);
2679		return -EINVAL;
2680	}
2681
2682	/* Before computing a checksum, we should make sure no frag could
2683	 * be modified by an external entity : checksum could be wrong.
2684	 */
2685	if (skb_has_shared_frag(skb)) {
2686		ret = __skb_linearize(skb);
2687		if (ret)
2688			goto out;
2689	}
2690
2691	offset = skb_checksum_start_offset(skb);
2692	BUG_ON(offset >= skb_headlen(skb));
2693	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2694
2695	offset += skb->csum_offset;
2696	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2697
2698	if (skb_cloned(skb) &&
2699	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2700		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2701		if (ret)
2702			goto out;
2703	}
2704
2705	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2706out_set_summed:
2707	skb->ip_summed = CHECKSUM_NONE;
2708out:
2709	return ret;
2710}
2711EXPORT_SYMBOL(skb_checksum_help);
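/*
 * Illustrative sketch (not part of the upstream file): falling back to a
 * software checksum before handing a packet to hardware that cannot offload
 * it.  example_tx_csum_fallback() is hypothetical.
 */
static int example_tx_csum_fallback(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return skb_checksum_help(skb);	/* resolve the deferred checksum */

	return 0;
}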
2712
2713int skb_crc32c_csum_help(struct sk_buff *skb)
2714{
2715	__le32 crc32c_csum;
2716	int ret = 0, offset, start;
2717
2718	if (skb->ip_summed != CHECKSUM_PARTIAL)
2719		goto out;
2720
2721	if (unlikely(skb_is_gso(skb)))
2722		goto out;
2723
2724	/* Before computing a checksum, we should make sure no frag could
2725	 * be modified by an external entity : checksum could be wrong.
2726	 */
2727	if (unlikely(skb_has_shared_frag(skb))) {
2728		ret = __skb_linearize(skb);
2729		if (ret)
2730			goto out;
2731	}
2732	start = skb_checksum_start_offset(skb);
2733	offset = start + offsetof(struct sctphdr, checksum);
2734	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
2735		ret = -EINVAL;
2736		goto out;
2737	}
2738	if (skb_cloned(skb) &&
2739	    !skb_clone_writable(skb, offset + sizeof(__le32))) {
2740		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2741		if (ret)
2742			goto out;
2743	}
2744	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
2745						  skb->len - start, ~(__u32)0,
2746						  crc32c_csum_stub));
2747	*(__le32 *)(skb->data + offset) = crc32c_csum;
2748	skb->ip_summed = CHECKSUM_NONE;
2749	skb->csum_not_inet = 0;
2750out:
2751	return ret;
2752}
2753
2754__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2755{
2756	__be16 type = skb->protocol;
2757
2758	/* Tunnel gso handlers can set protocol to ethernet. */
2759	if (type == htons(ETH_P_TEB)) {
2760		struct ethhdr *eth;
2761
2762		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2763			return 0;
2764
2765		eth = (struct ethhdr *)skb->data;
2766		type = eth->h_proto;
2767	}
2768
2769	return __vlan_get_protocol(skb, type, depth);
2770}
2771
2772/**
2773 *	skb_mac_gso_segment - mac layer segmentation handler.
2774 *	@skb: buffer to segment
2775 *	@features: features for the output path (see dev->features)
2776 */
2777struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2778				    netdev_features_t features)
2779{
2780	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2781	struct packet_offload *ptype;
2782	int vlan_depth = skb->mac_len;
2783	__be16 type = skb_network_protocol(skb, &vlan_depth);
2784
2785	if (unlikely(!type))
2786		return ERR_PTR(-EINVAL);
2787
2788	__skb_pull(skb, vlan_depth);
2789
2790	rcu_read_lock();
2791	list_for_each_entry_rcu(ptype, &offload_base, list) {
2792		if (ptype->type == type && ptype->callbacks.gso_segment) {
2793			segs = ptype->callbacks.gso_segment(skb, features);
2794			break;
2795		}
2796	}
2797	rcu_read_unlock();
2798
2799	__skb_push(skb, skb->data - skb_mac_header(skb));
2800
2801	return segs;
2802}
2803EXPORT_SYMBOL(skb_mac_gso_segment);
2804
2805
2806/* openvswitch calls this on rx path, so we need a different check.
2807 */
2808static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2809{
2810	if (tx_path)
2811		return skb->ip_summed != CHECKSUM_PARTIAL &&
2812		       skb->ip_summed != CHECKSUM_UNNECESSARY;
2813
2814	return skb->ip_summed == CHECKSUM_NONE;
2815}
2816
2817/**
2818 *	__skb_gso_segment - Perform segmentation on skb.
2819 *	@skb: buffer to segment
2820 *	@features: features for the output path (see dev->features)
2821 *	@tx_path: whether it is called in TX path
2822 *
2823 *	This function segments the given skb and returns a list of segments.
2824 *
2825 *	It may return NULL if the skb requires no segmentation.  This is
2826 *	only possible when GSO is used for verifying header integrity.
2827 *
2828 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2829 */
2830struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2831				  netdev_features_t features, bool tx_path)
2832{
2833	struct sk_buff *segs;
2834
2835	if (unlikely(skb_needs_check(skb, tx_path))) {
2836		int err;
2837
2838		/* We're going to init ->check field in TCP or UDP header */
2839		err = skb_cow_head(skb, 0);
2840		if (err < 0)
2841			return ERR_PTR(err);
2842	}
2843
2844	/* Only report GSO partial support if it will enable us to
2845	 * support segmentation on this frame without needing additional
2846	 * work.
2847	 */
2848	if (features & NETIF_F_GSO_PARTIAL) {
2849		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2850		struct net_device *dev = skb->dev;
2851
2852		partial_features |= dev->features & dev->gso_partial_features;
2853		if (!skb_gso_ok(skb, features | partial_features))
2854			features &= ~NETIF_F_GSO_PARTIAL;
2855	}
2856
2857	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2858		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2859
2860	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2861	SKB_GSO_CB(skb)->encap_level = 0;
2862
2863	skb_reset_mac_header(skb);
2864	skb_reset_mac_len(skb);
2865
2866	segs = skb_mac_gso_segment(skb, features);
2867
2868	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2869		skb_warn_bad_offload(skb);
2870
2871	return segs;
2872}
2873EXPORT_SYMBOL(__skb_gso_segment);
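
/*
 * Illustrative sketch, not part of this file: how a hypothetical caller
 * might consume the segment list produced by skb_gso_segment(), much as
 * validate_xmit_skb() does further below.  The function name is an
 * assumption for illustration; the usual <linux/skbuff.h> and
 * <linux/netdevice.h> declarations are assumed to be in scope.
 */
static int __maybe_unused example_segment_and_count(struct sk_buff *skb,
						    netdev_features_t features)
{
	struct sk_buff *segs, *seg;
	int count = 0;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);	/* segmentation failed, skb untouched */
	if (!segs)
		return 0;		/* no segmentation was required */

	/* The original skb is no longer needed once the segments exist. */
	consume_skb(skb);

	for (seg = segs; seg; seg = seg->next)
		count++;

	/* A real caller would transmit the segments; here we just free them. */
	kfree_skb_list(segs);
	return count;
}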
2874
2875/* Take action when hardware reception checksum errors are detected. */
2876#ifdef CONFIG_BUG
2877void netdev_rx_csum_fault(struct net_device *dev)
2878{
2879	if (net_ratelimit()) {
2880		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2881		dump_stack();
2882	}
2883}
2884EXPORT_SYMBOL(netdev_rx_csum_fault);
2885#endif
2886
2887/* Actually, we should eliminate this check as soon as we know that:
2888 * 1. An IOMMU is present and is able to map all of the memory.
2889 * 2. No high memory really exists on this machine.
2890 */
2891
2892static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2893{
2894#ifdef CONFIG_HIGHMEM
2895	int i;
2896
2897	if (!(dev->features & NETIF_F_HIGHDMA)) {
2898		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2899			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2900
2901			if (PageHighMem(skb_frag_page(frag)))
2902				return 1;
2903		}
2904	}
2905
2906	if (PCI_DMA_BUS_IS_PHYS) {
2907		struct device *pdev = dev->dev.parent;
2908
2909		if (!pdev)
2910			return 0;
2911		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2912			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2913			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2914
2915			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2916				return 1;
2917		}
2918	}
2919#endif
2920	return 0;
2921}
2922
2923/* If MPLS offload request, verify we are testing hardware MPLS features
2924 * instead of standard features for the netdev.
2925 */
2926#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2927static netdev_features_t net_mpls_features(struct sk_buff *skb,
2928					   netdev_features_t features,
2929					   __be16 type)
2930{
2931	if (eth_p_mpls(type))
2932		features &= skb->dev->mpls_features;
2933
2934	return features;
2935}
2936#else
2937static netdev_features_t net_mpls_features(struct sk_buff *skb,
2938					   netdev_features_t features,
2939					   __be16 type)
2940{
2941	return features;
2942}
2943#endif
2944
2945static netdev_features_t harmonize_features(struct sk_buff *skb,
2946	netdev_features_t features)
2947{
2948	int tmp;
2949	__be16 type;
2950
2951	type = skb_network_protocol(skb, &tmp);
2952	features = net_mpls_features(skb, features, type);
2953
2954	if (skb->ip_summed != CHECKSUM_NONE &&
2955	    !can_checksum_protocol(features, type)) {
2956		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2957	}
2958	if (illegal_highdma(skb->dev, skb))
2959		features &= ~NETIF_F_SG;
2960
2961	return features;
2962}
2963
2964netdev_features_t passthru_features_check(struct sk_buff *skb,
2965					  struct net_device *dev,
2966					  netdev_features_t features)
2967{
2968	return features;
2969}
2970EXPORT_SYMBOL(passthru_features_check);
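
/*
 * Illustrative sketch, not part of this file: a hypothetical virtual
 * device with no per-skb feature restrictions can simply plug
 * passthru_features_check() into its net_device_ops.  The ops structure
 * and the xmit stub below are assumptions for illustration only.
 */
static netdev_tx_t example_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	dev_kfree_skb_any(skb);		/* a real driver would transmit here */
	return NETDEV_TX_OK;
}

static const struct net_device_ops example_netdev_ops __maybe_unused = {
	.ndo_start_xmit		= example_start_xmit,
	/* Accept whatever dev->features already advertises. */
	.ndo_features_check	= passthru_features_check,
};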
2971
2972static netdev_features_t dflt_features_check(struct sk_buff *skb,
2973					     struct net_device *dev,
2974					     netdev_features_t features)
2975{
2976	return vlan_features_check(skb, features);
2977}
2978
2979static netdev_features_t gso_features_check(const struct sk_buff *skb,
2980					    struct net_device *dev,
2981					    netdev_features_t features)
2982{
2983	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2984
2985	if (gso_segs > dev->gso_max_segs)
2986		return features & ~NETIF_F_GSO_MASK;
2987
2988	/* Support for GSO partial features requires software
2989	 * intervention before we can actually process the packets,
2990	 * so we need to strip support for any partial features now
2991	 * and we can pull them back in after we have partially
2992	 * segmented the frame.
2993	 */
2994	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2995		features &= ~dev->gso_partial_features;
2996
2997	/* Make sure to clear the IPv4 ID mangling feature if the
2998	 * IPv4 header has the potential to be fragmented.
2999	 */
3000	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3001		struct iphdr *iph = skb->encapsulation ?
3002				    inner_ip_hdr(skb) : ip_hdr(skb);
3003
3004		if (!(iph->frag_off & htons(IP_DF)))
3005			features &= ~NETIF_F_TSO_MANGLEID;
3006	}
3007
3008	return features;
3009}
3010
3011netdev_features_t netif_skb_features(struct sk_buff *skb)
3012{
3013	struct net_device *dev = skb->dev;
3014	netdev_features_t features = dev->features;
3015
3016	if (skb_is_gso(skb))
3017		features = gso_features_check(skb, dev, features);
3018
3019	/* If encapsulation offload request, verify we are testing
3020	 * hardware encapsulation features instead of standard
3021	 * features for the netdev
3022	 */
3023	if (skb->encapsulation)
3024		features &= dev->hw_enc_features;
3025
3026	if (skb_vlan_tagged(skb))
3027		features = netdev_intersect_features(features,
3028						     dev->vlan_features |
3029						     NETIF_F_HW_VLAN_CTAG_TX |
3030						     NETIF_F_HW_VLAN_STAG_TX);
3031
3032	if (dev->netdev_ops->ndo_features_check)
3033		features &= dev->netdev_ops->ndo_features_check(skb, dev,
3034								features);
3035	else
3036		features &= dflt_features_check(skb, dev, features);
3037
3038	return harmonize_features(skb, features);
3039}
3040EXPORT_SYMBOL(netif_skb_features);
3041
3042static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3043		    struct netdev_queue *txq, bool more)
3044{
3045	unsigned int len;
3046	int rc;
3047
3048	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
3049		dev_queue_xmit_nit(skb, dev);
3050
3051	len = skb->len;
3052	trace_net_dev_start_xmit(skb, dev);
3053	rc = netdev_start_xmit(skb, dev, txq, more);
3054	trace_net_dev_xmit(skb, rc, dev, len);
3055
3056	return rc;
3057}
3058
3059struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3060				    struct netdev_queue *txq, int *ret)
3061{
3062	struct sk_buff *skb = first;
3063	int rc = NETDEV_TX_OK;
3064
3065	while (skb) {
3066		struct sk_buff *next = skb->next;
3067
3068		skb->next = NULL;
3069		rc = xmit_one(skb, dev, txq, next != NULL);
3070		if (unlikely(!dev_xmit_complete(rc))) {
3071			skb->next = next;
3072			goto out;
3073		}
3074
3075		skb = next;
3076		if (netif_xmit_stopped(txq) && skb) {
3077			rc = NETDEV_TX_BUSY;
3078			break;
3079		}
3080	}
3081
3082out:
3083	*ret = rc;
3084	return skb;
3085}
3086
3087static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3088					  netdev_features_t features)
3089{
3090	if (skb_vlan_tag_present(skb) &&
3091	    !vlan_hw_offload_capable(features, skb->vlan_proto))
3092		skb = __vlan_hwaccel_push_inside(skb);
3093	return skb;
3094}
3095
3096int skb_csum_hwoffload_help(struct sk_buff *skb,
3097			    const netdev_features_t features)
3098{
3099	if (unlikely(skb->csum_not_inet))
3100		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3101			skb_crc32c_csum_help(skb);
3102
3103	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3104}
3105EXPORT_SYMBOL(skb_csum_hwoffload_help);
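
/*
 * Illustrative sketch, not part of this file: a hypothetical transmit
 * helper combining netif_skb_features() (defined above) with
 * skb_csum_hwoffload_help(), roughly what validate_xmit_skb() does
 * below for the non-GSO case.  The function name is an assumption.
 */
static int __maybe_unused example_soften_skb(struct sk_buff *skb)
{
	netdev_features_t features = netif_skb_features(skb);

	/* Linearize if the device cannot handle this skb's geometry. */
	if (skb_needs_linearize(skb, features) && __skb_linearize(skb))
		return -ENOMEM;

	/* Resolve the checksum in software if it cannot be offloaded. */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return skb_csum_hwoffload_help(skb, features);

	return 0;
}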
3106
3107static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3108{
3109	netdev_features_t features;
3110
3111	features = netif_skb_features(skb);
3112	skb = validate_xmit_vlan(skb, features);
3113	if (unlikely(!skb))
3114		goto out_null;
3115
3116	if (netif_needs_gso(skb, features)) {
3117		struct sk_buff *segs;
3118
3119		segs = skb_gso_segment(skb, features);
3120		if (IS_ERR(segs)) {
3121			goto out_kfree_skb;
3122		} else if (segs) {
3123			consume_skb(skb);
3124			skb = segs;
3125		}
3126	} else {
3127		if (skb_needs_linearize(skb, features) &&
3128		    __skb_linearize(skb))
3129			goto out_kfree_skb;
3130
3131		/* If packet is not checksummed and device does not
3132		 * support checksumming for this protocol, complete
3133		 * checksumming here.
3134		 */
3135		if (skb->ip_summed == CHECKSUM_PARTIAL) {
3136			if (skb->encapsulation)
3137				skb_set_inner_transport_header(skb,
3138							       skb_checksum_start_offset(skb));
3139			else
3140				skb_set_transport_header(skb,
3141							 skb_checksum_start_offset(skb));
3142			if (skb_csum_hwoffload_help(skb, features))
3143				goto out_kfree_skb;
3144		}
3145	}
3146
3147	skb = validate_xmit_xfrm(skb, features, again);
3148
3149	return skb;
3150
3151out_kfree_skb:
3152	kfree_skb(skb);
3153out_null:
3154	atomic_long_inc(&dev->tx_dropped);
3155	return NULL;
3156}
3157
3158struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3159{
3160	struct sk_buff *next, *head = NULL, *tail;
3161
3162	for (; skb != NULL; skb = next) {
3163		next = skb->next;
3164		skb->next = NULL;
3165
3166		/* in case skb won't be segmented, point to itself */
3167		skb->prev = skb;
3168
3169		skb = validate_xmit_skb(skb, dev, again);
3170		if (!skb)
3171			continue;
3172
3173		if (!head)
3174			head = skb;
3175		else
3176			tail->next = skb;
3177		/* If skb was segmented, skb->prev points to
3178		 * the last segment. If not, it still contains skb.
3179		 */
3180		tail = skb->prev;
3181	}
3182	return head;
3183}
3184EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3185
3186static void qdisc_pkt_len_init(struct sk_buff *skb)
3187{
3188	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3189
3190	qdisc_skb_cb(skb)->pkt_len = skb->len;
3191
3192	/* To get a more precise estimate of the bytes sent on the wire,
3193	 * we add the header size of all segments to pkt_len.
3194	 */
3195	if (shinfo->gso_size)  {
3196		unsigned int hdr_len;
3197		u16 gso_segs = shinfo->gso_segs;
3198
3199		/* mac layer + network layer */
3200		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3201
3202		/* + transport layer */
3203		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3204			const struct tcphdr *th;
3205			struct tcphdr _tcphdr;
3206
3207			th = skb_header_pointer(skb, skb_transport_offset(skb),
3208						sizeof(_tcphdr), &_tcphdr);
3209			if (likely(th))
3210				hdr_len += __tcp_hdrlen(th);
3211		} else {
3212			struct udphdr _udphdr;
3213
3214			if (skb_header_pointer(skb, skb_transport_offset(skb),
3215					       sizeof(_udphdr), &_udphdr))
3216				hdr_len += sizeof(struct udphdr);
3217		}
3218
3219		if (shinfo->gso_type & SKB_GSO_DODGY)
3220			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3221						shinfo->gso_size);
3222
3223		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3224	}
3225}
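
/*
 * Worked example (illustrative only): for a TCP GSO skb with
 * skb->len = 7066, 66 bytes of MAC+IP+TCP headers, gso_size = 1448 and
 * gso_segs = 5, the code above reports
 * pkt_len = 7066 + (5 - 1) * 66 = 7330 bytes on the wire.
 */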
3226
3227static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3228				 struct net_device *dev,
3229				 struct netdev_queue *txq)
3230{
3231	spinlock_t *root_lock = qdisc_lock(q);
3232	struct sk_buff *to_free = NULL;
3233	bool contended;
3234	int rc;
3235
3236	qdisc_calculate_pkt_len(skb, q);
3237
3238	if (q->flags & TCQ_F_NOLOCK) {
3239		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3240			__qdisc_drop(skb, &to_free);
3241			rc = NET_XMIT_DROP;
3242		} else {
3243			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3244			__qdisc_run(q);
3245		}
3246
3247		if (unlikely(to_free))
3248			kfree_skb_list(to_free);
3249		return rc;
3250	}
3251
3252	/*
3253	 * Heuristic to force contended enqueues to serialize on a
3254	 * separate lock before trying to get the qdisc main lock.
3255	 * This permits the qdisc->running owner to get the lock more
3256	 * often and dequeue packets faster.
3257	 */
3258	contended = qdisc_is_running(q);
3259	if (unlikely(contended))
3260		spin_lock(&q->busylock);
3261
3262	spin_lock(root_lock);
3263	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3264		__qdisc_drop(skb, &to_free);
3265		rc = NET_XMIT_DROP;
3266	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3267		   qdisc_run_begin(q)) {
3268		/*
3269		 * This is a work-conserving queue; there are no old skbs
3270		 * waiting to be sent out; and the qdisc is not running -
3271		 * xmit the skb directly.
3272		 */
3273
3274		qdisc_bstats_update(q, skb);
3275
3276		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3277			if (unlikely(contended)) {
3278				spin_unlock(&q->busylock);
3279				contended = false;
3280			}
3281			__qdisc_run(q);
3282		}
3283
3284		qdisc_run_end(q);
3285		rc = NET_XMIT_SUCCESS;
3286	} else {
3287		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3288		if (qdisc_run_begin(q)) {
3289			if (unlikely(contended)) {
3290				spin_unlock(&q->busylock);
3291				contended = false;
3292			}
3293			__qdisc_run(q);
3294			qdisc_run_end(q);
3295		}
3296	}
3297	spin_unlock(root_lock);
3298	if (unlikely(to_free))
3299		kfree_skb_list(to_free);
3300	if (unlikely(contended))
3301		spin_unlock(&q->busylock);
3302	return rc;
3303}
3304
3305#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3306static void skb_update_prio(struct sk_buff *skb)
3307{
3308	const struct netprio_map *map;
3309	const struct sock *sk;
3310	unsigned int prioidx;
3311
3312	if (skb->priority)
3313		return;
3314	map = rcu_dereference_bh(skb->dev->priomap);
3315	if (!map)
3316		return;
3317	sk = skb_to_full_sk(skb);
3318	if (!sk)
3319		return;
3320
3321	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3322
3323	if (prioidx < map->priomap_len)
3324		skb->priority = map->priomap[prioidx];
3325}
3326#else
3327#define skb_update_prio(skb)
3328#endif
3329
3330DEFINE_PER_CPU(int, xmit_recursion);
3331EXPORT_SYMBOL(xmit_recursion);
3332
3333/**
3334 *	dev_loopback_xmit - loop back @skb
3335 *	@net: network namespace this loopback is happening in
3336 *	@sk:  sk needed to be a netfilter okfn
3337 *	@skb: buffer to transmit
3338 */
3339int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3340{
3341	skb_reset_mac_header(skb);
3342	__skb_pull(skb, skb_network_offset(skb));
3343	skb->pkt_type = PACKET_LOOPBACK;
3344	skb->ip_summed = CHECKSUM_UNNECESSARY;
3345	WARN_ON(!skb_dst(skb));
3346	skb_dst_force(skb);
3347	netif_rx_ni(skb);
3348	return 0;
3349}
3350EXPORT_SYMBOL(dev_loopback_xmit);
3351
3352#ifdef CONFIG_NET_EGRESS
3353static struct sk_buff *
3354sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3355{
3356	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3357	struct tcf_result cl_res;
3358
3359	if (!miniq)
3360		return skb;
3361
3362	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3363	mini_qdisc_bstats_cpu_update(miniq, skb);
3364
3365	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3366	case TC_ACT_OK:
3367	case TC_ACT_RECLASSIFY:
3368		skb->tc_index = TC_H_MIN(cl_res.classid);
3369		break;
3370	case TC_ACT_SHOT:
3371		mini_qdisc_qstats_cpu_drop(miniq);
3372		*ret = NET_XMIT_DROP;
3373		kfree_skb(skb);
3374		return NULL;
3375	case TC_ACT_STOLEN:
3376	case TC_ACT_QUEUED:
3377	case TC_ACT_TRAP:
3378		*ret = NET_XMIT_SUCCESS;
3379		consume_skb(skb);
3380		return NULL;
3381	case TC_ACT_REDIRECT:
3382		/* No need to push/pop skb's mac_header here on egress! */
3383		skb_do_redirect(skb);
3384		*ret = NET_XMIT_SUCCESS;
3385		return NULL;
3386	default:
3387		break;
3388	}
3389
3390	return skb;
3391}
3392#endif /* CONFIG_NET_EGRESS */
3393
3394static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3395{
3396#ifdef CONFIG_XPS
3397	struct xps_dev_maps *dev_maps;
3398	struct xps_map *map;
3399	int queue_index = -1;
3400
3401	rcu_read_lock();
3402	dev_maps = rcu_dereference(dev->xps_maps);
3403	if (dev_maps) {
3404		unsigned int tci = skb->sender_cpu - 1;
3405
3406		if (dev->num_tc) {
3407			tci *= dev->num_tc;
3408			tci += netdev_get_prio_tc_map(dev, skb->priority);
3409		}
3410
3411		map = rcu_dereference(dev_maps->cpu_map[tci]);
3412		if (map) {
3413			if (map->len == 1)
3414				queue_index = map->queues[0];
3415			else
3416				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3417									   map->len)];
3418			if (unlikely(queue_index >= dev->real_num_tx_queues))
3419				queue_index = -1;
3420		}
3421	}
3422	rcu_read_unlock();
3423
3424	return queue_index;
3425#else
3426	return -1;
3427#endif
3428}
3429
3430static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3431{
3432	struct sock *sk = skb->sk;
3433	int queue_index = sk_tx_queue_get(sk);
3434
3435	if (queue_index < 0 || skb->ooo_okay ||
3436	    queue_index >= dev->real_num_tx_queues) {
3437		int new_index = get_xps_queue(dev, skb);
3438
3439		if (new_index < 0)
3440			new_index = skb_tx_hash(dev, skb);
3441
3442		if (queue_index != new_index && sk &&
3443		    sk_fullsock(sk) &&
3444		    rcu_access_pointer(sk->sk_dst_cache))
3445			sk_tx_queue_set(sk, new_index);
3446
3447		queue_index = new_index;
3448	}
3449
3450	return queue_index;
3451}
3452
3453struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3454				    struct sk_buff *skb,
3455				    void *accel_priv)
3456{
3457	int queue_index = 0;
3458
3459#ifdef CONFIG_XPS
3460	u32 sender_cpu = skb->sender_cpu - 1;
3461
3462	if (sender_cpu >= (u32)NR_CPUS)
3463		skb->sender_cpu = raw_smp_processor_id() + 1;
3464#endif
3465
3466	if (dev->real_num_tx_queues != 1) {
3467		const struct net_device_ops *ops = dev->netdev_ops;
3468
3469		if (ops->ndo_select_queue)
3470			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3471							    __netdev_pick_tx);
3472		else
3473			queue_index = __netdev_pick_tx(dev, skb);
3474
3475		queue_index = netdev_cap_txqueue(dev, queue_index);
3476	}
3477
3478	skb_set_queue_mapping(skb, queue_index);
3479	return netdev_get_tx_queue(dev, queue_index);
3480}
3481
3482/**
3483 *	__dev_queue_xmit - transmit a buffer
3484 *	@skb: buffer to transmit
3485 *	@accel_priv: private data used for L2 forwarding offload
3486 *
3487 *	Queue a buffer for transmission to a network device. The caller must
3488 *	have set the device and priority and built the buffer before calling
3489 *	this function. The function can be called from an interrupt.
3490 *
3491 *	A negative errno code is returned on a failure. A success does not
3492 *	guarantee the frame will be transmitted as it may be dropped due
3493 *	to congestion or traffic shaping.
3494 *
3495 * -----------------------------------------------------------------------------------
3496 *      I notice this method can also return errors from the queue disciplines,
3497 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3498 *      be positive.
3499 *
3500 *      Regardless of the return value, the skb is consumed, so it is currently
3501 *      difficult to retry a send to this method.  (You can bump the ref count
3502 *      before sending to hold a reference for retry if you are careful.)
3503 *
3504 *      When calling this method, interrupts MUST be enabled.  This is because
3505 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3506 *          --BLG
3507 */
3508static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3509{
3510	struct net_device *dev = skb->dev;
3511	struct netdev_queue *txq;
3512	struct Qdisc *q;
3513	int rc = -ENOMEM;
3514	bool again = false;
3515
3516	skb_reset_mac_header(skb);
3517
3518	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3519		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3520
3521	/* Disable soft irqs for various locks below. Also
3522	 * stops preemption for RCU.
3523	 */
3524	rcu_read_lock_bh();
3525
3526	skb_update_prio(skb);
3527
3528	qdisc_pkt_len_init(skb);
3529#ifdef CONFIG_NET_CLS_ACT
3530	skb->tc_at_ingress = 0;
3531# ifdef CONFIG_NET_EGRESS
3532	if (static_key_false(&egress_needed)) {
3533		skb = sch_handle_egress(skb, &rc, dev);
3534		if (!skb)
3535			goto out;
3536	}
3537# endif
3538#endif
3539	/* If device/qdisc don't need skb->dst, release it right now while
3540	 * it's hot in this cpu's cache.
3541	 */
3542	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3543		skb_dst_drop(skb);
3544	else
3545		skb_dst_force(skb);
3546
3547	txq = netdev_pick_tx(dev, skb, accel_priv);
3548	q = rcu_dereference_bh(txq->qdisc);
3549
3550	trace_net_dev_queue(skb);
3551	if (q->enqueue) {
3552		rc = __dev_xmit_skb(skb, q, dev, txq);
3553		goto out;
3554	}
3555
3556	/* The device has no queue. Common case for software devices:
3557	 * loopback, all sorts of tunnels...
3558	 *
3559	 * Really, it is unlikely that netif_tx_lock protection is necessary
3560	 * here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3561	 * counters.)
3562	 * However, it is possible that they rely on the protection
3563	 * made by us here.
3564	 *
3565	 * Check this and take the lock; it is not prone to deadlocks.
3566	 * Or take the noqueue qdisc path, which is even simpler 8)
3567	 */
3568	if (dev->flags & IFF_UP) {
3569		int cpu = smp_processor_id(); /* ok because BHs are off */
3570
3571		if (txq->xmit_lock_owner != cpu) {
3572			if (unlikely(__this_cpu_read(xmit_recursion) >
3573				     XMIT_RECURSION_LIMIT))
3574				goto recursion_alert;
3575
3576			skb = validate_xmit_skb(skb, dev, &again);
3577			if (!skb)
3578				goto out;
3579
3580			HARD_TX_LOCK(dev, txq, cpu);
3581
3582			if (!netif_xmit_stopped(txq)) {
3583				__this_cpu_inc(xmit_recursion);
3584				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3585				__this_cpu_dec(xmit_recursion);
3586				if (dev_xmit_complete(rc)) {
3587					HARD_TX_UNLOCK(dev, txq);
3588					goto out;
3589				}
3590			}
3591			HARD_TX_UNLOCK(dev, txq);
3592			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3593					     dev->name);
3594		} else {
3595			/* Recursion is detected! It is possible,
3596			 * unfortunately
3597			 */
3598recursion_alert:
3599			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3600					     dev->name);
3601		}
3602	}
3603
3604	rc = -ENETDOWN;
3605	rcu_read_unlock_bh();
3606
3607	atomic_long_inc(&dev->tx_dropped);
3608	kfree_skb_list(skb);
3609	return rc;
3610out:
3611	rcu_read_unlock_bh();
3612	return rc;
3613}
3614
3615int dev_queue_xmit(struct sk_buff *skb)
3616{
3617	return __dev_queue_xmit(skb, NULL);
3618}
3619EXPORT_SYMBOL(dev_queue_xmit);
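
/*
 * Illustrative sketch, not part of this file: the usual calling
 * convention for dev_queue_xmit() from a hypothetical sender.  Link
 * layer header construction is omitted; the function name and protocol
 * choice are assumptions for illustration.
 */
static int __maybe_unused example_send_frame(struct net_device *dev,
					     const void *data,
					     unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_put_data(skb, data, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/* On return the skb is owned by the stack; the result may be a
	 * negative errno or a positive NET_XMIT_* code (see above).
	 */
	return dev_queue_xmit(skb);
}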
3620
3621int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3622{
3623	return __dev_queue_xmit(skb, accel_priv);
3624}
3625EXPORT_SYMBOL(dev_queue_xmit_accel);
3626
3627
3628/*************************************************************************
3629 *			Receiver routines
3630 *************************************************************************/
3631
3632int netdev_max_backlog __read_mostly = 1000;
3633EXPORT_SYMBOL(netdev_max_backlog);
3634
3635int netdev_tstamp_prequeue __read_mostly = 1;
3636int netdev_budget __read_mostly = 300;
3637unsigned int __read_mostly netdev_budget_usecs = 2000;
3638int weight_p __read_mostly = 64;           /* old backlog weight */
3639int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
3640int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
3641int dev_rx_weight __read_mostly = 64;
3642int dev_tx_weight __read_mostly = 64;
3643
3644/* Called with irq disabled */
3645static inline void ____napi_schedule(struct softnet_data *sd,
3646				     struct napi_struct *napi)
3647{
3648	list_add_tail(&napi->poll_list, &sd->poll_list);
3649	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3650}
3651
3652#ifdef CONFIG_RPS
3653
3654/* One global table that all flow-based protocols share. */
3655struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3656EXPORT_SYMBOL(rps_sock_flow_table);
3657u32 rps_cpu_mask __read_mostly;
3658EXPORT_SYMBOL(rps_cpu_mask);
3659
3660struct static_key rps_needed __read_mostly;
3661EXPORT_SYMBOL(rps_needed);
3662struct static_key rfs_needed __read_mostly;
3663EXPORT_SYMBOL(rfs_needed);
3664
3665static struct rps_dev_flow *
3666set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3667	    struct rps_dev_flow *rflow, u16 next_cpu)
3668{
3669	if (next_cpu < nr_cpu_ids) {
3670#ifdef CONFIG_RFS_ACCEL
3671		struct netdev_rx_queue *rxqueue;
3672		struct rps_dev_flow_table *flow_table;
3673		struct rps_dev_flow *old_rflow;
3674		u32 flow_id;
3675		u16 rxq_index;
3676		int rc;
3677
3678		/* Should we steer this flow to a different hardware queue? */
3679		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3680		    !(dev->features & NETIF_F_NTUPLE))
3681			goto out;
3682		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3683		if (rxq_index == skb_get_rx_queue(skb))
3684			goto out;
3685
3686		rxqueue = dev->_rx + rxq_index;
3687		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3688		if (!flow_table)
3689			goto out;
3690		flow_id = skb_get_hash(skb) & flow_table->mask;
3691		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3692							rxq_index, flow_id);
3693		if (rc < 0)
3694			goto out;
3695		old_rflow = rflow;
3696		rflow = &flow_table->flows[flow_id];
3697		rflow->filter = rc;
3698		if (old_rflow->filter == rflow->filter)
3699			old_rflow->filter = RPS_NO_FILTER;
3700	out:
3701#endif
3702		rflow->last_qtail =
3703			per_cpu(softnet_data, next_cpu).input_queue_head;
3704	}
3705
3706	rflow->cpu = next_cpu;
3707	return rflow;
3708}
3709
3710/*
3711 * get_rps_cpu is called from netif_receive_skb and returns the target
3712 * CPU from the RPS map of the receiving queue for a given skb.
3713 * rcu_read_lock must be held on entry.
3714 */
3715static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3716		       struct rps_dev_flow **rflowp)
3717{
3718	const struct rps_sock_flow_table *sock_flow_table;
3719	struct netdev_rx_queue *rxqueue = dev->_rx;
3720	struct rps_dev_flow_table *flow_table;
3721	struct rps_map *map;
3722	int cpu = -1;
3723	u32 tcpu;
3724	u32 hash;
3725
3726	if (skb_rx_queue_recorded(skb)) {
3727		u16 index = skb_get_rx_queue(skb);
3728
3729		if (unlikely(index >= dev->real_num_rx_queues)) {
3730			WARN_ONCE(dev->real_num_rx_queues > 1,
3731				  "%s received packet on queue %u, but number "
3732				  "of RX queues is %u\n",
3733				  dev->name, index, dev->real_num_rx_queues);
3734			goto done;
3735		}
3736		rxqueue += index;
3737	}
3738
3739	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3740
3741	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3742	map = rcu_dereference(rxqueue->rps_map);
3743	if (!flow_table && !map)
3744		goto done;
3745
3746	skb_reset_network_header(skb);
3747	hash = skb_get_hash(skb);
3748	if (!hash)
3749		goto done;
3750
3751	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3752	if (flow_table && sock_flow_table) {
3753		struct rps_dev_flow *rflow;
3754		u32 next_cpu;
3755		u32 ident;
3756
3757		/* First check the global flow table for a match */
3758		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3759		if ((ident ^ hash) & ~rps_cpu_mask)
3760			goto try_rps;
3761
3762		next_cpu = ident & rps_cpu_mask;
3763
3764		/* OK, now we know there is a match,
3765		 * we can look at the local (per receive queue) flow table
3766		 */
3767		rflow = &flow_table->flows[hash & flow_table->mask];
3768		tcpu = rflow->cpu;
3769
3770		/*
3771		 * If the desired CPU (where last recvmsg was done) is
3772		 * different from current CPU (one in the rx-queue flow
3773		 * table entry), switch if one of the following holds:
3774		 *   - Current CPU is unset (>= nr_cpu_ids).
3775		 *   - Current CPU is offline.
3776		 *   - The current CPU's queue tail has advanced beyond the
3777		 *     last packet that was enqueued using this table entry.
3778		 *     This guarantees that all previous packets for the flow
3779		 *     have been dequeued, thus preserving in-order delivery.
3780		 */
3781		if (unlikely(tcpu != next_cpu) &&
3782		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3783		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3784		      rflow->last_qtail)) >= 0)) {
3785			tcpu = next_cpu;
3786			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3787		}
3788
3789		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3790			*rflowp = rflow;
3791			cpu = tcpu;
3792			goto done;
3793		}
3794	}
3795
3796try_rps:
3797
3798	if (map) {
3799		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3800		if (cpu_online(tcpu)) {
3801			cpu = tcpu;
3802			goto done;
3803		}
3804	}
3805
3806done:
3807	return cpu;
3808}
3809
3810#ifdef CONFIG_RFS_ACCEL
3811
3812/**
3813 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3814 * @dev: Device on which the filter was set
3815 * @rxq_index: RX queue index
3816 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3817 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3818 *
3819 * Drivers that implement ndo_rx_flow_steer() should periodically call
3820 * this function for each installed filter and remove the filters for
3821 * which it returns %true.
3822 */
3823bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3824			 u32 flow_id, u16 filter_id)
3825{
3826	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3827	struct rps_dev_flow_table *flow_table;
3828	struct rps_dev_flow *rflow;
3829	bool expire = true;
3830	unsigned int cpu;
3831
3832	rcu_read_lock();
3833	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3834	if (flow_table && flow_id <= flow_table->mask) {
3835		rflow = &flow_table->flows[flow_id];
3836		cpu = READ_ONCE(rflow->cpu);
3837		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3838		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3839			   rflow->last_qtail) <
3840		     (int)(10 * flow_table->mask)))
3841			expire = false;
3842	}
3843	rcu_read_unlock();
3844	return expire;
3845}
3846EXPORT_SYMBOL(rps_may_expire_flow);
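
/*
 * Illustrative sketch, not part of this file: a hypothetical driver
 * implementing ndo_rx_flow_steer() would age out an accelerated RFS
 * filter from a periodic worker roughly like this.  The filter
 * bookkeeping structure and names are assumptions for illustration.
 */
struct example_arfs_filter {
	u16 rxq_index;		/* queue the filter steers to */
	u32 flow_id;		/* flow_id passed to ndo_rx_flow_steer() */
	u16 filter_id;		/* value returned by ndo_rx_flow_steer() */
};

static void __maybe_unused example_arfs_expire(struct net_device *dev,
					       struct example_arfs_filter *f)
{
	if (rps_may_expire_flow(dev, f->rxq_index, f->flow_id, f->filter_id)) {
		/* A real driver would remove the hardware filter and
		 * free its bookkeeping for @f here.
		 */
	}
}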
3847
3848#endif /* CONFIG_RFS_ACCEL */
3849
3850/* Called from hardirq (IPI) context */
3851static void rps_trigger_softirq(void *data)
3852{
3853	struct softnet_data *sd = data;
3854
3855	____napi_schedule(sd, &sd->backlog);
3856	sd->received_rps++;
3857}
3858
3859#endif /* CONFIG_RPS */
3860
3861/*
3862 * Check if this softnet_data structure belongs to another cpu.
3863 * If yes, queue it to our IPI list and return 1;
3864 * if no, return 0.
3865 */
3866static int rps_ipi_queued(struct softnet_data *sd)
3867{
3868#ifdef CONFIG_RPS
3869	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3870
3871	if (sd != mysd) {
3872		sd->rps_ipi_next = mysd->rps_ipi_list;
3873		mysd->rps_ipi_list = sd;
3874
3875		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3876		return 1;
3877	}
3878#endif /* CONFIG_RPS */
3879	return 0;
3880}
3881
3882#ifdef CONFIG_NET_FLOW_LIMIT
3883int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3884#endif
3885
3886static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3887{
3888#ifdef CONFIG_NET_FLOW_LIMIT
3889	struct sd_flow_limit *fl;
3890	struct softnet_data *sd;
3891	unsigned int old_flow, new_flow;
3892
3893	if (qlen < (netdev_max_backlog >> 1))
3894		return false;
3895
3896	sd = this_cpu_ptr(&softnet_data);
3897
3898	rcu_read_lock();
3899	fl = rcu_dereference(sd->flow_limit);
3900	if (fl) {
3901		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3902		old_flow = fl->history[fl->history_head];
3903		fl->history[fl->history_head] = new_flow;
3904
3905		fl->history_head++;
3906		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3907
3908		if (likely(fl->buckets[old_flow]))
3909			fl->buckets[old_flow]--;
3910
3911		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3912			fl->count++;
3913			rcu_read_unlock();
3914			return true;
3915		}
3916	}
3917	rcu_read_unlock();
3918#endif
3919	return false;
3920}
3921
3922/*
3923 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3924 * queue (may be a remote CPU queue).
3925 */
3926static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3927			      unsigned int *qtail)
3928{
3929	struct softnet_data *sd;
3930	unsigned long flags;
3931	unsigned int qlen;
3932
3933	sd = &per_cpu(softnet_data, cpu);
3934
3935	local_irq_save(flags);
3936
3937	rps_lock(sd);
3938	if (!netif_running(skb->dev))
3939		goto drop;
3940	qlen = skb_queue_len(&sd->input_pkt_queue);
3941	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3942		if (qlen) {
3943enqueue:
3944			__skb_queue_tail(&sd->input_pkt_queue, skb);
3945			input_queue_tail_incr_save(sd, qtail);
3946			rps_unlock(sd);
3947			local_irq_restore(flags);
3948			return NET_RX_SUCCESS;
3949		}
3950
3951		/* Schedule NAPI for the backlog device.
3952		 * We can use a non-atomic operation since we own the queue lock.
3953		 */
3954		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3955			if (!rps_ipi_queued(sd))
3956				____napi_schedule(sd, &sd->backlog);
3957		}
3958		goto enqueue;
3959	}
3960
3961drop:
3962	sd->dropped++;
3963	rps_unlock(sd);
3964
3965	local_irq_restore(flags);
3966
3967	atomic_long_inc(&skb->dev->rx_dropped);
3968	kfree_skb(skb);
3969	return NET_RX_DROP;
3970}
3971
3972static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
3973{
3974	struct net_device *dev = skb->dev;
3975	struct netdev_rx_queue *rxqueue;
3976
3977	rxqueue = dev->_rx;
3978
3979	if (skb_rx_queue_recorded(skb)) {
3980		u16 index = skb_get_rx_queue(skb);
3981
3982		if (unlikely(index >= dev->real_num_rx_queues)) {
3983			WARN_ONCE(dev->real_num_rx_queues > 1,
3984				  "%s received packet on queue %u, but number "
3985				  "of RX queues is %u\n",
3986				  dev->name, index, dev->real_num_rx_queues);
3987
3988			return rxqueue; /* Return first rxqueue */
3989		}
3990		rxqueue += index;
3991	}
3992	return rxqueue;
3993}
3994
3995static u32 netif_receive_generic_xdp(struct sk_buff *skb,
3996				     struct bpf_prog *xdp_prog)
3997{
3998	struct netdev_rx_queue *rxqueue;
3999	u32 metalen, act = XDP_DROP;
4000	struct xdp_buff xdp;
4001	void *orig_data;
4002	int hlen, off;
4003	u32 mac_len;
4004
4005	/* Reinjected packets coming from act_mirred or similar should
4006	 * not get XDP generic processing.
4007	 */
4008	if (skb_cloned(skb))
4009		return XDP_PASS;
4010
4011	/* XDP packets must be linear and must have sufficient headroom
4012	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that native
4013	 * XDP also provides, so we need to do the same here as well.
4014	 */
4015	if (skb_is_nonlinear(skb) ||
4016	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4017		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4018		int troom = skb->tail + skb->data_len - skb->end;
4019
4020		/* In case we have to go down this path and also linearize,
4021		 * then let's do the pskb_expand_head() work just once here.
4022		 */
4023		if (pskb_expand_head(skb,
4024				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4025				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4026			goto do_drop;
4027		if (skb_linearize(skb))
4028			goto do_drop;
4029	}
4030
4031	/* The XDP program wants to see the packet starting at the MAC
4032	 * header.
4033	 */
4034	mac_len = skb->data - skb_mac_header(skb);
4035	hlen = skb_headlen(skb) + mac_len;
4036	xdp.data = skb->data - mac_len;
4037	xdp.data_meta = xdp.data;
4038	xdp.data_end = xdp.data + hlen;
4039	xdp.data_hard_start = skb->data - skb_headroom(skb);
4040	orig_data = xdp.data;
4041
4042	rxqueue = netif_get_rxqueue(skb);
4043	xdp.rxq = &rxqueue->xdp_rxq;
4044
4045	act = bpf_prog_run_xdp(xdp_prog, &xdp);
4046
4047	off = xdp.data - orig_data;
4048	if (off > 0)
4049		__skb_pull(skb, off);
4050	else if (off < 0)
4051		__skb_push(skb, -off);
4052	skb->mac_header += off;
4053
4054	switch (act) {
4055	case XDP_REDIRECT:
4056	case XDP_TX:
4057		__skb_push(skb, mac_len);
4058		break;
4059	case XDP_PASS:
4060		metalen = xdp.data - xdp.data_meta;
4061		if (metalen)
4062			skb_metadata_set(skb, metalen);
4063		break;
4064	default:
4065		bpf_warn_invalid_xdp_action(act);
4066		/* fall through */
4067	case XDP_ABORTED:
4068		trace_xdp_exception(skb->dev, xdp_prog, act);
4069		/* fall through */
4070	case XDP_DROP:
4071	do_drop:
4072		kfree_skb(skb);
4073		break;
4074	}
4075
4076	return act;
4077}
4078
4079/* When doing generic XDP we have to bypass the qdisc layer and the
4080 * network taps in order to match in-driver-XDP behavior.
4081 */
4082void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4083{
4084	struct net_device *dev = skb->dev;
4085	struct netdev_queue *txq;
4086	bool free_skb = true;
4087	int cpu, rc;
4088
4089	txq = netdev_pick_tx(dev, skb, NULL);
4090	cpu = smp_processor_id();
4091	HARD_TX_LOCK(dev, txq, cpu);
4092	if (!netif_xmit_stopped(txq)) {
4093		rc = netdev_start_xmit(skb, dev, txq, 0);
4094		if (dev_xmit_complete(rc))
4095			free_skb = false;
4096	}
4097	HARD_TX_UNLOCK(dev, txq);
4098	if (free_skb) {
4099		trace_xdp_exception(dev, xdp_prog, XDP_TX);
4100		kfree_skb(skb);
4101	}
4102}
4103EXPORT_SYMBOL_GPL(generic_xdp_tx);
4104
4105static struct static_key generic_xdp_needed __read_mostly;
4106
4107int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4108{
4109	if (xdp_prog) {
4110		u32 act = netif_receive_generic_xdp(skb, xdp_prog);
4111		int err;
4112
4113		if (act != XDP_PASS) {
4114			switch (act) {
4115			case XDP_REDIRECT:
4116				err = xdp_do_generic_redirect(skb->dev, skb,
4117							      xdp_prog);
4118				if (err)
4119					goto out_redir;
4120			/* fallthru to submit skb */
4121			case XDP_TX:
4122				generic_xdp_tx(skb, xdp_prog);
4123				break;
4124			}
4125			return XDP_DROP;
4126		}
4127	}
4128	return XDP_PASS;
4129out_redir:
4130	kfree_skb(skb);
4131	return XDP_DROP;
4132}
4133EXPORT_SYMBOL_GPL(do_xdp_generic);
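
/*
 * Illustrative sketch, not part of this file: a hypothetical driver
 * without native XDP support can run the generic hook on its receive
 * path in the same way netif_rx_internal() does below.  The function
 * name is an assumption for illustration.
 */
static int __maybe_unused example_rx_with_generic_xdp(struct sk_buff *skb)
{
	int act;

	preempt_disable();
	rcu_read_lock();
	act = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
	rcu_read_unlock();
	preempt_enable();

	/* XDP consumed (dropped, transmitted or redirected) the packet. */
	if (act != XDP_PASS)
		return NET_RX_SUCCESS;

	return netif_receive_skb(skb);
}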
4134
4135static int netif_rx_internal(struct sk_buff *skb)
4136{
4137	int ret;
4138
4139	net_timestamp_check(netdev_tstamp_prequeue, skb);
4140
4141	trace_netif_rx(skb);
4142
4143	if (static_key_false(&generic_xdp_needed)) {
4144		int ret;
4145
4146		preempt_disable();
4147		rcu_read_lock();
4148		ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4149		rcu_read_unlock();
4150		preempt_enable();
4151
4152		/* Consider XDP consuming the packet a success from
4153		 * the netdev point of view; we do not want to count
4154		 * this as an error.
4155		 */
4156		if (ret != XDP_PASS)
4157			return NET_RX_SUCCESS;
4158	}
4159
4160#ifdef CONFIG_RPS
4161	if (static_key_false(&rps_needed)) {
4162		struct rps_dev_flow voidflow, *rflow = &voidflow;
4163		int cpu;
4164
4165		preempt_disable();
4166		rcu_read_lock();
4167
4168		cpu = get_rps_cpu(skb->dev, skb, &rflow);
4169		if (cpu < 0)
4170			cpu = smp_processor_id();
4171
4172		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4173
4174		rcu_read_unlock();
4175		preempt_enable();
4176	} else
4177#endif
4178	{
4179		unsigned int qtail;
4180
4181		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4182		put_cpu();
4183	}
4184	return ret;
4185}
4186
4187/**
4188 *	netif_rx	-	post buffer to the network code
4189 *	@skb: buffer to post
4190 *
4191 *	This function receives a packet from a device driver and queues it for
4192 *	the upper (protocol) levels to process.  It always succeeds. The buffer
4193 *	may be dropped during processing for congestion control or by the
4194 *	protocol layers.
4195 *
4196 *	return values:
4197 *	NET_RX_SUCCESS	(no congestion)
4198 *	NET_RX_DROP     (packet was dropped)
4199 *
4200 */
4201
4202int netif_rx(struct sk_buff *skb)
4203{
4204	trace_netif_rx_entry(skb);
4205
4206	return netif_rx_internal(skb);
4207}
4208EXPORT_SYMBOL(netif_rx);
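
/*
 * Illustrative sketch, not part of this file: the classic interrupt-time
 * receive path of a hypothetical Ethernet driver handing a frame to the
 * stack with netif_rx().  The function name is an assumption.
 */
static int __maybe_unused example_isr_rx(struct net_device *dev,
					 const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return NET_RX_DROP;
	}

	skb_put_data(skb, frame, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */

	return netif_rx(skb);
}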
4209
4210int netif_rx_ni(struct sk_buff *skb)
4211{
4212	int err;
4213
4214	trace_netif_rx_ni_entry(skb);
4215
4216	preempt_disable();
4217	err = netif_rx_internal(skb);
4218	if (local_softirq_pending())
4219		do_softirq();
4220	preempt_enable();
4221
4222	return err;
4223}
4224EXPORT_SYMBOL(netif_rx_ni);
4225
4226static __latent_entropy void net_tx_action(struct softirq_action *h)
4227{
4228	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4229
4230	if (sd->completion_queue) {
4231		struct sk_buff *clist;
4232
4233		local_irq_disable();
4234		clist = sd->completion_queue;
4235		sd->completion_queue = NULL;
4236		local_irq_enable();
4237
4238		while (clist) {
4239			struct sk_buff *skb = clist;
4240
4241			clist = clist->next;
4242
4243			WARN_ON(refcount_read(&skb->users));
4244			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4245				trace_consume_skb(skb);
4246			else
4247				trace_kfree_skb(skb, net_tx_action);
4248
4249			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4250				__kfree_skb(skb);
4251			else
4252				__kfree_skb_defer(skb);
4253		}
4254
4255		__kfree_skb_flush();
4256	}
4257
4258	if (sd->output_queue) {
4259		struct Qdisc *head;
4260
4261		local_irq_disable();
4262		head = sd->output_queue;
4263		sd->output_queue = NULL;
4264		sd->output_queue_tailp = &sd->output_queue;
4265		local_irq_enable();
4266
4267		while (head) {
4268			struct Qdisc *q = head;
4269			spinlock_t *root_lock = NULL;
4270
4271			head = head->next_sched;
4272
4273			if (!(q->flags & TCQ_F_NOLOCK)) {
4274				root_lock = qdisc_lock(q);
4275				spin_lock(root_lock);
4276			}
4277			/* We need to make sure head->next_sched is read
4278			 * before clearing __QDISC_STATE_SCHED
4279			 */
4280			smp_mb__before_atomic();
4281			clear_bit(__QDISC_STATE_SCHED, &q->state);
4282			qdisc_run(q);
4283			if (root_lock)
4284				spin_unlock(root_lock);
4285		}
4286	}
4287
4288	xfrm_dev_backlog(sd);
4289}
4290
4291#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4292/* This hook is defined here for ATM LANE */
4293int (*br_fdb_test_addr_hook)(struct net_device *dev,
4294			     unsigned char *addr) __read_mostly;
4295EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4296#endif
4297
4298static inline struct sk_buff *
4299sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4300		   struct net_device *orig_dev)
4301{
4302#ifdef CONFIG_NET_CLS_ACT
4303	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4304	struct tcf_result cl_res;
4305
4306	/* If there's at least one ingress present somewhere (so
4307	 * we get here via enabled static key), remaining devices
4308	 * that are not configured with an ingress qdisc will bail
4309	 * out here.
4310	 */
4311	if (!miniq)
4312		return skb;
4313
4314	if (*pt_prev) {
4315		*ret = deliver_skb(skb, *pt_prev, orig_dev);
4316		*pt_prev = NULL;
4317	}
4318
4319	qdisc_skb_cb(skb)->pkt_len = skb->len;
4320	skb->tc_at_ingress = 1;
4321	mini_qdisc_bstats_cpu_update(miniq, skb);
4322
4323	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
4324	case TC_ACT_OK:
4325	case TC_ACT_RECLASSIFY:
4326		skb->tc_index = TC_H_MIN(cl_res.classid);
4327		break;
4328	case TC_ACT_SHOT:
4329		mini_qdisc_qstats_cpu_drop(miniq);
4330		kfree_skb(skb);
4331		return NULL;
4332	case TC_ACT_STOLEN:
4333	case TC_ACT_QUEUED:
4334	case TC_ACT_TRAP:
4335		consume_skb(skb);
4336		return NULL;
4337	case TC_ACT_REDIRECT:
4338		/* skb_mac_header check was done by cls/act_bpf, so
4339		 * we can safely push the L2 header back before
4340		 * redirecting to another netdev
4341		 */
4342		__skb_push(skb, skb->mac_len);
4343		skb_do_redirect(skb);
4344		return NULL;
4345	default:
4346		break;
4347	}
4348#endif /* CONFIG_NET_CLS_ACT */
4349	return skb;
4350}
4351
4352/**
4353 *	netdev_is_rx_handler_busy - check if receive handler is registered
4354 *	@dev: device to check
4355 *
4356 *	Check if a receive handler is already registered for a given device.
4357 *	Return true if there is one.
4358 *
4359 *	The caller must hold the rtnl_mutex.
4360 */
4361bool netdev_is_rx_handler_busy(struct net_device *dev)
4362{
4363	ASSERT_RTNL();
4364	return dev && rtnl_dereference(dev->rx_handler);
4365}
4366EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4367
4368/**
4369 *	netdev_rx_handler_register - register receive handler
4370 *	@dev: device to register a handler for
4371 *	@rx_handler: receive handler to register
4372 *	@rx_handler_data: data pointer that is used by rx handler
4373 *
4374 *	Register a receive handler for a device. This handler will then be
4375 *	called from __netif_receive_skb. A negative errno code is returned
4376 *	on a failure.
4377 *
4378 *	The caller must hold the rtnl_mutex.
4379 *
4380 *	For a general description of rx_handler, see enum rx_handler_result.
4381 */
4382int netdev_rx_handler_register(struct net_device *dev,
4383			       rx_handler_func_t *rx_handler,
4384			       void *rx_handler_data)
4385{
4386	if (netdev_is_rx_handler_busy(dev))
4387		return -EBUSY;
4388
4389	if (dev->priv_flags & IFF_NO_RX_HANDLER)
4390		return -EINVAL;
4391
4392	/* Note: rx_handler_data must be set before rx_handler */
4393	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4394	rcu_assign_pointer(dev->rx_handler, rx_handler);
4395
4396	return 0;
4397}
4398EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
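
/*
 * Illustrative sketch, not part of this file: a minimal rx_handler the
 * way a hypothetical upper device (bridge/bond/team style) might use
 * it, steering frames from a port to the upper device.  The handler and
 * helper names are assumptions for illustration.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *upper = rcu_dereference(skb->dev->rx_handler_data);

	if (!upper)
		return RX_HANDLER_PASS;

	skb->dev = upper;
	return RX_HANDLER_ANOTHER;	/* re-run receive on the upper device */
}

static int __maybe_unused example_enslave(struct net_device *port,
					  struct net_device *upper)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port, example_handle_frame, upper);
	rtnl_unlock();
	return err;
}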
4399
4400/**
4401 *	netdev_rx_handler_unregister - unregister receive handler
4402 *	@dev: device to unregister a handler from
4403 *
4404 *	Unregister a receive handler from a device.
4405 *
4406 *	The caller must hold the rtnl_mutex.
4407 */
4408void netdev_rx_handler_unregister(struct net_device *dev)
4409{
4410
4411	ASSERT_RTNL();
4412	RCU_INIT_POINTER(dev->rx_handler, NULL);
4413	/* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
4414	 * section is guaranteed to see a non-NULL rx_handler_data
4415	 * as well.
4416	 */
4417	synchronize_net();
4418	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4419}
4420EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4421
4422/*
4423 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4424 * the special handling of PFMEMALLOC skbs.
4425 */
4426static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4427{
4428	switch (skb->protocol) {
4429	case htons(ETH_P_ARP):
4430	case htons(ETH_P_IP):
4431	case htons(ETH_P_IPV6):
4432	case htons(ETH_P_8021Q):
4433	case htons(ETH_P_8021AD):
4434		return true;
4435	default:
4436		return false;
4437	}
4438}
4439
4440static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4441			     int *ret, struct net_device *orig_dev)
4442{
4443#ifdef CONFIG_NETFILTER_INGRESS
4444	if (nf_hook_ingress_active(skb)) {
4445		int ingress_retval;
4446
4447		if (*pt_prev) {
4448			*ret = deliver_skb(skb, *pt_prev, orig_dev);
4449			*pt_prev = NULL;
4450		}
4451
4452		rcu_read_lock();
4453		ingress_retval = nf_hook_ingress(skb);
4454		rcu_read_unlock();
4455		return ingress_retval;
4456	}
4457#endif /* CONFIG_NETFILTER_INGRESS */
4458	return 0;
4459}
4460
4461static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4462{
4463	struct packet_type *ptype, *pt_prev;
4464	rx_handler_func_t *rx_handler;
4465	struct net_device *orig_dev;
4466	bool deliver_exact = false;
4467	int ret = NET_RX_DROP;
4468	__be16 type;
4469
4470	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4471
4472	trace_netif_receive_skb(skb);
4473
4474	orig_dev = skb->dev;
4475
4476	skb_reset_network_header(skb);
4477	if (!skb_transport_header_was_set(skb))
4478		skb_reset_transport_header(skb);
4479	skb_reset_mac_len(skb);
4480
4481	pt_prev = NULL;
4482
4483another_round:
4484	skb->skb_iif = skb->dev->ifindex;
4485
4486	__this_cpu_inc(softnet_data.processed);
4487
4488	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4489	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4490		skb = skb_vlan_untag(skb);
4491		if (unlikely(!skb))
4492			goto out;
4493	}
4494
4495	if (skb_skip_tc_classify(skb))
4496		goto skip_classify;
4497
4498	if (pfmemalloc)
4499		goto skip_taps;
4500
4501	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4502		if (pt_prev)
4503			ret = deliver_skb(skb, pt_prev, orig_dev);
4504		pt_prev = ptype;
4505	}
4506
4507	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4508		if (pt_prev)
4509			ret = deliver_skb(skb, pt_prev, orig_dev);
4510		pt_prev = ptype;
4511	}
4512
4513skip_taps:
4514#ifdef CONFIG_NET_INGRESS
4515	if (static_key_false(&ingress_needed)) {
4516		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4517		if (!skb)
4518			goto out;
4519
4520		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4521			goto out;
4522	}
4523#endif
4524	skb_reset_tc(skb);
4525skip_classify:
4526	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4527		goto drop;
4528
4529	if (skb_vlan_tag_present(skb)) {
4530		if (pt_prev) {
4531			ret = deliver_skb(skb, pt_prev, orig_dev);
4532			pt_prev = NULL;
4533		}
4534		if (vlan_do_receive(&skb))
4535			goto another_round;
4536		else if (unlikely(!skb))
4537			goto out;
4538	}
4539
4540	rx_handler = rcu_dereference(skb->dev->rx_handler);
4541	if (rx_handler) {
4542		if (pt_prev) {
4543			ret = deliver_skb(skb, pt_prev, orig_dev);
4544			pt_prev = NULL;
4545		}
4546		switch (rx_handler(&skb)) {
4547		case RX_HANDLER_CONSUMED:
4548			ret = NET_RX_SUCCESS;
4549			goto out;
4550		case RX_HANDLER_ANOTHER:
4551			goto another_round;
4552		case RX_HANDLER_EXACT:
4553			deliver_exact = true;
4554		case RX_HANDLER_PASS:
4555			break;
4556		default:
4557			BUG();
4558		}
4559	}
4560
4561	if (unlikely(skb_vlan_tag_present(skb))) {
4562		if (skb_vlan_tag_get_id(skb))
4563			skb->pkt_type = PACKET_OTHERHOST;
4564		/* Note: we might in the future use prio bits
4565		 * and set skb->priority like in vlan_do_receive()
4566		 * For the time being, just ignore Priority Code Point
4567		 */
4568		skb->vlan_tci = 0;
4569	}
4570
4571	type = skb->protocol;
4572
4573	/* deliver only exact match when indicated */
4574	if (likely(!deliver_exact)) {
4575		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4576				       &ptype_base[ntohs(type) &
4577						   PTYPE_HASH_MASK]);
4578	}
4579
4580	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4581			       &orig_dev->ptype_specific);
4582
4583	if (unlikely(skb->dev != orig_dev)) {
4584		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4585				       &skb->dev->ptype_specific);
4586	}
4587
4588	if (pt_prev) {
4589		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
4590			goto drop;
4591		else
4592			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4593	} else {
4594drop:
4595		if (!deliver_exact)
4596			atomic_long_inc(&skb->dev->rx_dropped);
4597		else
4598			atomic_long_inc(&skb->dev->rx_nohandler);
4599		kfree_skb(skb);
4600		/* Jamal, now you will not be able to escape explaining
4601		 * to me how you were going to use this. :-)
4602		 */
4603		ret = NET_RX_DROP;
4604	}
4605
4606out:
4607	return ret;
4608}
4609
4610/**
4611 *	netif_receive_skb_core - special purpose version of netif_receive_skb
4612 *	@skb: buffer to process
4613 *
4614 *	More direct receive version of netif_receive_skb().  It should
4615 *	only be used by callers that have a need to skip RPS and Generic XDP.
4616 *	Caller must also take care of handling if (page_is_)pfmemalloc.
4617 *
4618 *	This function may only be called from softirq context and interrupts
4619 *	should be enabled.
4620 *
4621 *	Return values (usually ignored):
4622 *	NET_RX_SUCCESS: no congestion
4623 *	NET_RX_DROP: packet was dropped
4624 */
4625int netif_receive_skb_core(struct sk_buff *skb)
4626{
4627	int ret;
4628
4629	rcu_read_lock();
4630	ret = __netif_receive_skb_core(skb, false);
4631	rcu_read_unlock();
4632
4633	return ret;
4634}
4635EXPORT_SYMBOL(netif_receive_skb_core);
4636
4637static int __netif_receive_skb(struct sk_buff *skb)
4638{
4639	int ret;
4640
4641	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4642		unsigned int noreclaim_flag;
4643
4644		/*
4645		 * PFMEMALLOC skbs are special, they should
4646		 * - be delivered to SOCK_MEMALLOC sockets only
4647		 * - stay away from userspace
4648		 * - have bounded memory usage
4649		 *
4650		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4651		 * context down to all allocation sites.
4652		 */
4653		noreclaim_flag = memalloc_noreclaim_save();
4654		ret = __netif_receive_skb_core(skb, true);
4655		memalloc_noreclaim_restore(noreclaim_flag);
4656	} else
4657		ret = __netif_receive_skb_core(skb, false);
4658
4659	return ret;
4660}
4661
4662static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
4663{
4664	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
4665	struct bpf_prog *new = xdp->prog;
4666	int ret = 0;
4667
4668	switch (xdp->command) {
4669	case XDP_SETUP_PROG:
4670		rcu_assign_pointer(dev->xdp_prog, new);
4671		if (old)
4672			bpf_prog_put(old);
4673
4674		if (old && !new) {
4675			static_key_slow_dec(&generic_xdp_needed);
4676		} else if (new && !old) {
4677			static_key_slow_inc(&generic_xdp_needed);
4678			dev_disable_lro(dev);
4679			dev_disable_gro_hw(dev);
4680		}
4681		break;
4682
4683	case XDP_QUERY_PROG:
4684		xdp->prog_attached = !!old;
4685		xdp->prog_id = old ? old->aux->id : 0;
4686		break;
4687
4688	default:
4689		ret = -EINVAL;
4690		break;
4691	}
4692
4693	return ret;
4694}
4695
4696static int netif_receive_skb_internal(struct sk_buff *skb)
4697{
4698	int ret;
4699
4700	net_timestamp_check(netdev_tstamp_prequeue, skb);
4701
4702	if (skb_defer_rx_timestamp(skb))
4703		return NET_RX_SUCCESS;
4704
4705	if (static_key_false(&generic_xdp_needed)) {
4706		int ret;
4707
4708		preempt_disable();
4709		rcu_read_lock();
4710		ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4711		rcu_read_unlock();
4712		preempt_enable();
4713
4714		if (ret != XDP_PASS)
4715			return NET_RX_DROP;
4716	}
4717
4718	rcu_read_lock();
4719#ifdef CONFIG_RPS
4720	if (static_key_false(&rps_needed)) {
4721		struct rps_dev_flow voidflow, *rflow = &voidflow;
4722		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4723
4724		if (cpu >= 0) {
4725			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4726			rcu_read_unlock();
4727			return ret;
4728		}
4729	}
4730#endif
4731	ret = __netif_receive_skb(skb);
4732	rcu_read_unlock();
4733	return ret;
4734}
4735
4736/**
4737 *	netif_receive_skb - process receive buffer from network
4738 *	@skb: buffer to process
4739 *
4740 *	netif_receive_skb() is the main receive data processing function.
4741 *	It always succeeds. The buffer may be dropped during processing
4742 *	for congestion control or by the protocol layers.
4743 *
4744 *	This function may only be called from softirq context and interrupts
4745 *	should be enabled.
4746 *
4747 *	Return values (usually ignored):
4748 *	NET_RX_SUCCESS: no congestion
4749 *	NET_RX_DROP: packet was dropped
4750 */
4751int netif_receive_skb(struct sk_buff *skb)
4752{
4753	trace_netif_receive_skb_entry(skb);
4754
4755	return netif_receive_skb_internal(skb);
4756}
4757EXPORT_SYMBOL(netif_receive_skb);
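
/*
 * Illustrative sketch, not part of this file: a helper a hypothetical
 * NAPI ->poll() callback might use to deliver queued frames with
 * netif_receive_skb() from softirq context, as required above.  The
 * function name and the per-device rx queue are assumptions.
 */
static int __maybe_unused example_napi_rx(struct napi_struct *napi,
					  struct net_device *dev,
					  struct sk_buff_head *rxq,
					  int budget)
{
	struct sk_buff *skb;
	int work_done = 0;

	while (work_done < budget && (skb = skb_dequeue(rxq)) != NULL) {
		skb->protocol = eth_type_trans(skb, dev);
		netif_receive_skb(skb);
		work_done++;
	}

	if (work_done < budget)
		napi_complete_done(napi, work_done);

	return work_done;
}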
4758
4759DEFINE_PER_CPU(struct work_struct, flush_works);
4760
4761/* Network device is going away, flush any packets still pending */
4762static void flush_backlog(struct work_struct *work)
4763{
4764	struct sk_buff *skb, *tmp;
4765	struct softnet_data *sd;
4766
4767	local_bh_disable();
4768	sd = this_cpu_ptr(&softnet_data);
4769
4770	local_irq_disable();
4771	rps_lock(sd);
4772	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4773		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4774			__skb_unlink(skb, &sd->input_pkt_queue);
4775			kfree_skb(skb);
4776			input_queue_head_incr(sd);
4777		}
4778	}
4779	rps_unlock(sd);
4780	local_irq_enable();
4781
4782	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4783		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4784			__skb_unlink(skb, &sd->process_queue);
4785			kfree_skb(skb);
4786			input_queue_head_incr(sd);
4787		}
4788	}
4789	local_bh_enable();
4790}
4791
4792static void flush_all_backlogs(void)
4793{
4794	unsigned int cpu;
4795
4796	get_online_cpus();
4797
4798	for_each_online_cpu(cpu)
4799		queue_work_on(cpu, system_highpri_wq,
4800			      per_cpu_ptr(&flush_works, cpu));
4801
4802	for_each_online_cpu(cpu)
4803		flush_work(per_cpu_ptr(&flush_works, cpu));
4804
4805	put_online_cpus();
4806}
4807
4808static int napi_gro_complete(struct sk_buff *skb)
4809{
4810	struct packet_offload *ptype;
4811	__be16 type = skb->protocol;
4812	struct list_head *head = &offload_base;
4813	int err = -ENOENT;
4814
4815	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4816
4817	if (NAPI_GRO_CB(skb)->count == 1) {
4818		skb_shinfo(skb)->gso_size = 0;
4819		goto out;
4820	}
4821
4822	rcu_read_lock();
4823	list_for_each_entry_rcu(ptype, head, list) {
4824		if (ptype->type != type || !ptype->callbacks.gro_complete)
4825			continue;
4826
4827		err = ptype->callbacks.gro_complete(skb, 0);
4828		break;
4829	}
4830	rcu_read_unlock();
4831
4832	if (err) {
4833		WARN_ON(&ptype->list == head);
4834		kfree_skb(skb);
4835		return NET_RX_SUCCESS;
4836	}
4837
4838out:
4839	return netif_receive_skb_internal(skb);
4840}
4841
4842/* napi->gro_list contains packets ordered by age.
4843 * youngest packets at the head of it.
4844 * Complete skbs in reverse order to reduce latencies.
4845 */
4846void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4847{
4848	struct sk_buff *skb, *prev = NULL;
4849
4850	/* scan list and build reverse chain */
4851	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4852		skb->prev = prev;
4853		prev = skb;
4854	}
4855
4856	for (skb = prev; skb; skb = prev) {
4857		skb->next = NULL;
4858
4859		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4860			return;
4861
4862		prev = skb->prev;
4863		napi_gro_complete(skb);
4864		napi->gro_count--;
4865	}
4866
4867	napi->gro_list = NULL;
4868}
4869EXPORT_SYMBOL(napi_gro_flush);
4870
4871static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4872{
4873	struct sk_buff *p;
4874	unsigned int maclen = skb->dev->hard_header_len;
4875	u32 hash = skb_get_hash_raw(skb);
4876
4877	for (p = napi->gro_list; p; p = p->next) {
4878		unsigned long diffs;
4879
4880		NAPI_GRO_CB(p)->flush = 0;
4881
4882		if (hash != skb_get_hash_raw(p)) {
4883			NAPI_GRO_CB(p)->same_flow = 0;
4884			continue;
4885		}
4886
4887		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4888		diffs |= p->vlan_tci ^ skb->vlan_tci;
4889		diffs |= skb_metadata_dst_cmp(p, skb);
4890		diffs |= skb_metadata_differs(p, skb);
4891		if (maclen == ETH_HLEN)
4892			diffs |= compare_ether_header(skb_mac_header(p),
4893						      skb_mac_header(skb));
4894		else if (!diffs)
4895			diffs = memcmp(skb_mac_header(p),
4896				       skb_mac_header(skb),
4897				       maclen);
4898		NAPI_GRO_CB(p)->same_flow = !diffs;
4899	}
4900}
4901
4902static void skb_gro_reset_offset(struct sk_buff *skb)
4903{
4904	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4905	const skb_frag_t *frag0 = &pinfo->frags[0];
4906
4907	NAPI_GRO_CB(skb)->data_offset = 0;
4908	NAPI_GRO_CB(skb)->frag0 = NULL;
4909	NAPI_GRO_CB(skb)->frag0_len = 0;
4910
4911	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4912	    pinfo->nr_frags &&
4913	    !PageHighMem(skb_frag_page(frag0))) {
4914		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4915		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4916						    skb_frag_size(frag0),
4917						    skb->end - skb->tail);
4918	}
4919}
4920
4921static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4922{
4923	struct skb_shared_info *pinfo = skb_shinfo(skb);
4924
4925	BUG_ON(skb->end - skb->tail < grow);
4926
4927	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4928
4929	skb->data_len -= grow;
4930	skb->tail += grow;
4931
4932	pinfo->frags[0].page_offset += grow;
4933	skb_frag_size_sub(&pinfo->frags[0], grow);
4934
4935	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4936		skb_frag_unref(skb, 0);
4937		memmove(pinfo->frags, pinfo->frags + 1,
4938			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4939	}
4940}
4941
4942static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4943{
4944	struct sk_buff **pp = NULL;
4945	struct packet_offload *ptype;
4946	__be16 type = skb->protocol;
4947	struct list_head *head = &offload_base;
4948	int same_flow;
4949	enum gro_result ret;
4950	int grow;
4951
4952	if (netif_elide_gro(skb->dev))
4953		goto normal;
4954
4955	gro_list_prepare(napi, skb);
4956
4957	rcu_read_lock();
4958	list_for_each_entry_rcu(ptype, head, list) {
4959		if (ptype->type != type || !ptype->callbacks.gro_receive)
4960			continue;
4961
4962		skb_set_network_header(skb, skb_gro_offset(skb));
4963		skb_reset_mac_len(skb);
4964		NAPI_GRO_CB(skb)->same_flow = 0;
4965		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4966		NAPI_GRO_CB(skb)->free = 0;
4967		NAPI_GRO_CB(skb)->encap_mark = 0;
4968		NAPI_GRO_CB(skb)->recursion_counter = 0;
4969		NAPI_GRO_CB(skb)->is_fou = 0;
4970		NAPI_GRO_CB(skb)->is_atomic = 1;
4971		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4972
4973		/* Setup for GRO checksum validation */
4974		switch (skb->ip_summed) {
4975		case CHECKSUM_COMPLETE:
4976			NAPI_GRO_CB(skb)->csum = skb->csum;
4977			NAPI_GRO_CB(skb)->csum_valid = 1;
4978			NAPI_GRO_CB(skb)->csum_cnt = 0;
4979			break;
4980		case CHECKSUM_UNNECESSARY:
4981			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4982			NAPI_GRO_CB(skb)->csum_valid = 0;
4983			break;
4984		default:
4985			NAPI_GRO_CB(skb)->csum_cnt = 0;
4986			NAPI_GRO_CB(skb)->csum_valid = 0;
4987		}
4988
4989		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4990		break;
4991	}
4992	rcu_read_unlock();
4993
4994	if (&ptype->list == head)
4995		goto normal;
4996
4997	if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
4998		ret = GRO_CONSUMED;
4999		goto ok;
5000	}
5001
5002	same_flow = NAPI_GRO_CB(skb)->same_flow;
5003	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5004
5005	if (pp) {
5006		struct sk_buff *nskb = *pp;
5007
5008		*pp = nskb->next;
5009		nskb->next = NULL;
5010		napi_gro_complete(nskb);
5011		napi->gro_count--;
5012	}
5013
5014	if (same_flow)
5015		goto ok;
5016
5017	if (NAPI_GRO_CB(skb)->flush)
5018		goto normal;
5019
5020	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
5021		struct sk_buff *nskb = napi->gro_list;
5022
5023		/* locate the end of the list to select the 'oldest' flow */
5024		while (nskb->next) {
5025			pp = &nskb->next;
5026			nskb = *pp;
5027		}
5028		*pp = NULL;
5029		nskb->next = NULL;
5030		napi_gro_complete(nskb);
5031	} else {
5032		napi->gro_count++;
5033	}
5034	NAPI_GRO_CB(skb)->count = 1;
5035	NAPI_GRO_CB(skb)->age = jiffies;
5036	NAPI_GRO_CB(skb)->last = skb;
5037	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5038	skb->next = napi->gro_list;
5039	napi->gro_list = skb;
5040	ret = GRO_HELD;
5041
5042pull:
5043	grow = skb_gro_offset(skb) - skb_headlen(skb);
5044	if (grow > 0)
5045		gro_pull_from_frag0(skb, grow);
5046ok:
5047	return ret;
5048
5049normal:
5050	ret = GRO_NORMAL;
5051	goto pull;
5052}
5053
5054struct packet_offload *gro_find_receive_by_type(__be16 type)
5055{
5056	struct list_head *offload_head = &offload_base;
5057	struct packet_offload *ptype;
5058
5059	list_for_each_entry_rcu(ptype, offload_head, list) {
5060		if (ptype->type != type || !ptype->callbacks.gro_receive)
5061			continue;
5062		return ptype;
5063	}
5064	return NULL;
5065}
5066EXPORT_SYMBOL(gro_find_receive_by_type);
5067
5068struct packet_offload *gro_find_complete_by_type(__be16 type)
5069{
5070	struct list_head *offload_head = &offload_base;
5071	struct packet_offload *ptype;
5072
5073	list_for_each_entry_rcu(ptype, offload_head, list) {
5074		if (ptype->type != type || !ptype->callbacks.gro_complete)
5075			continue;
5076		return ptype;
5077	}
5078	return NULL;
5079}
5080EXPORT_SYMBOL(gro_find_complete_by_type);
5081
5082static void napi_skb_free_stolen_head(struct sk_buff *skb)
5083{
5084	skb_dst_drop(skb);
5085	secpath_reset(skb);
5086	kmem_cache_free(skbuff_head_cache, skb);
5087}
5088
5089static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5090{
5091	switch (ret) {
5092	case GRO_NORMAL:
5093		if (netif_receive_skb_internal(skb))
5094			ret = GRO_DROP;
5095		break;
5096
5097	case GRO_DROP:
5098		kfree_skb(skb);
5099		break;
5100
5101	case GRO_MERGED_FREE:
5102		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5103			napi_skb_free_stolen_head(skb);
5104		else
5105			__kfree_skb(skb);
5106		break;
5107
5108	case GRO_HELD:
5109	case GRO_MERGED:
5110	case GRO_CONSUMED:
5111		break;
5112	}
5113
5114	return ret;
5115}
5116
5117gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5118{
5119	skb_mark_napi_id(skb, napi);
5120	trace_napi_gro_receive_entry(skb);
5121
5122	skb_gro_reset_offset(skb);
5123
5124	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
5125}
5126EXPORT_SYMBOL(napi_gro_receive);
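/* Poll-loop sketch showing the common napi_gro_receive() call site
 * ("priv", "budget_left" and my_rx_ring_next_skb() are assumptions of a
 * hypothetical driver, shown only to illustrate the calling pattern):
 *
 *	while (budget_left && (skb = my_rx_ring_next_skb(priv))) {
 *		skb->protocol = eth_type_trans(skb, priv->netdev);
 *		napi_gro_receive(&priv->napi, skb);
 *		budget_left--;
 *	}
 *
 * The skb is always consumed by the call (merged, held on gro_list,
 * passed up or freed), so the driver must not touch it afterwards; the
 * gro_result_t return value is usually ignored.
 */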
5127
5128static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5129{
5130	if (unlikely(skb->pfmemalloc)) {
5131		consume_skb(skb);
5132		return;
5133	}
5134	__skb_pull(skb, skb_headlen(skb));
5135	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
5136	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5137	skb->vlan_tci = 0;
5138	skb->dev = napi->dev;
5139	skb->skb_iif = 0;
5140	skb->encapsulation = 0;
5141	skb_shinfo(skb)->gso_type = 0;
5142	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5143	secpath_reset(skb);
5144
5145	napi->skb = skb;
5146}
5147
5148struct sk_buff *napi_get_frags(struct napi_struct *napi)
5149{
5150	struct sk_buff *skb = napi->skb;
5151
5152	if (!skb) {
5153		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5154		if (skb) {
5155			napi->skb = skb;
5156			skb_mark_napi_id(skb, napi);
5157		}
5158	}
5159	return skb;
5160}
5161EXPORT_SYMBOL(napi_get_frags);
5162
5163static gro_result_t napi_frags_finish(struct napi_struct *napi,
5164				      struct sk_buff *skb,
5165				      gro_result_t ret)
5166{
5167	switch (ret) {
5168	case GRO_NORMAL:
5169	case GRO_HELD:
5170		__skb_push(skb, ETH_HLEN);
5171		skb->protocol = eth_type_trans(skb, skb->dev);
5172		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
5173			ret = GRO_DROP;
5174		break;
5175
5176	case GRO_DROP:
5177		napi_reuse_skb(napi, skb);
5178		break;
5179
5180	case GRO_MERGED_FREE:
5181		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5182			napi_skb_free_stolen_head(skb);
5183		else
5184			napi_reuse_skb(napi, skb);
5185		break;
5186
5187	case GRO_MERGED:
5188	case GRO_CONSUMED:
5189		break;
5190	}
5191
5192	return ret;
5193}
5194
5195/* The upper GRO stack assumes the network header starts at gro_offset=0.
5196 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
5197 * we copy the ethernet header into skb->data to have a common layout.
5198 */
5199static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5200{
5201	struct sk_buff *skb = napi->skb;
5202	const struct ethhdr *eth;
5203	unsigned int hlen = sizeof(*eth);
5204
5205	napi->skb = NULL;
5206
5207	skb_reset_mac_header(skb);
5208	skb_gro_reset_offset(skb);
5209
5210	eth = skb_gro_header_fast(skb, 0);
5211	if (unlikely(skb_gro_header_hard(skb, hlen))) {
5212		eth = skb_gro_header_slow(skb, hlen, 0);
5213		if (unlikely(!eth)) {
5214			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5215					     __func__, napi->dev->name);
5216			napi_reuse_skb(napi, skb);
5217			return NULL;
5218		}
5219	} else {
5220		gro_pull_from_frag0(skb, hlen);
5221		NAPI_GRO_CB(skb)->frag0 += hlen;
5222		NAPI_GRO_CB(skb)->frag0_len -= hlen;
5223	}
5224	__skb_pull(skb, hlen);
5225
5226	/*
5227	 * This works because the only protocols we care about don't require
5228	 * special handling.
5229	 * We'll fix it up properly in napi_frags_finish()
5230	 */
5231	skb->protocol = eth->h_proto;
5232
5233	return skb;
5234}
5235
5236gro_result_t napi_gro_frags(struct napi_struct *napi)
5237{
5238	struct sk_buff *skb = napi_frags_skb(napi);
5239
5240	if (!skb)
5241		return GRO_DROP;
5242
5243	trace_napi_gro_frags_entry(skb);
5244
5245	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5246}
5247EXPORT_SYMBOL(napi_gro_frags);
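/* Sketch of the page-based RX pattern served by napi_get_frags() and
 * napi_gro_frags() (page/offset/len/truesize come from a hypothetical
 * RX ring and are not defined here):
 *
 *	struct sk_buff *skb = napi_get_frags(&priv->napi);
 *
 *	if (!skb)
 *		return;			// out of memory, drop this frame
 *	skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 *	napi_gro_frags(&priv->napi);
 *
 * The skb handed to GRO is purely paged; napi_frags_skb() above copies
 * the Ethernet header into the linear area so the upper GRO layers can
 * start at gro_offset 0.
 */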
5248
5249/* Compute the checksum from gro_offset and return the folded value
5250 * after adding in any pseudo checksum.
5251 */
5252__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
5253{
5254	__wsum wsum;
5255	__sum16 sum;
5256
5257	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
5258
5259	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
5260	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
5261	if (likely(!sum)) {
5262		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
5263		    !skb->csum_complete_sw)
5264			netdev_rx_csum_fault(skb->dev);
5265	}
5266
5267	NAPI_GRO_CB(skb)->csum = wsum;
5268	NAPI_GRO_CB(skb)->csum_valid = 1;
5269
5270	return sum;
5271}
5272EXPORT_SYMBOL(__skb_gro_checksum_complete);
5273
5274static void net_rps_send_ipi(struct softnet_data *remsd)
5275{
5276#ifdef CONFIG_RPS
5277	while (remsd) {
5278		struct softnet_data *next = remsd->rps_ipi_next;
5279
5280		if (cpu_online(remsd->cpu))
5281			smp_call_function_single_async(remsd->cpu, &remsd->csd);
5282		remsd = next;
5283	}
5284#endif
5285}
5286
5287/*
5288 * net_rps_action_and_irq_enable sends any pending IPIs for rps.
5289 * Note: called with local irq disabled, but exits with local irq enabled.
5290 */
5291static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5292{
5293#ifdef CONFIG_RPS
5294	struct softnet_data *remsd = sd->rps_ipi_list;
5295
5296	if (remsd) {
5297		sd->rps_ipi_list = NULL;
5298
5299		local_irq_enable();
5300
5301		/* Send pending IPIs to kick RPS processing on remote cpus. */
5302		net_rps_send_ipi(remsd);
5303	} else
5304#endif
5305		local_irq_enable();
5306}
5307
5308static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5309{
5310#ifdef CONFIG_RPS
5311	return sd->rps_ipi_list != NULL;
5312#else
5313	return false;
5314#endif
5315}
5316
5317static int process_backlog(struct napi_struct *napi, int quota)
5318{
5319	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5320	bool again = true;
5321	int work = 0;
5322
5323	/* Check if we have pending IPIs; it's better to send them now
5324	 * rather than waiting for net_rx_action() to end.
5325	 */
5326	if (sd_has_rps_ipi_waiting(sd)) {
5327		local_irq_disable();
5328		net_rps_action_and_irq_enable(sd);
5329	}
5330
5331	napi->weight = dev_rx_weight;
5332	while (again) {
5333		struct sk_buff *skb;
5334
5335		while ((skb = __skb_dequeue(&sd->process_queue))) {
5336			rcu_read_lock();
5337			__netif_receive_skb(skb);
5338			rcu_read_unlock();
5339			input_queue_head_incr(sd);
5340			if (++work >= quota)
5341				return work;
5342
5343		}
5344
5345		local_irq_disable();
5346		rps_lock(sd);
5347		if (skb_queue_empty(&sd->input_pkt_queue)) {
5348			/*
5349			 * Inline a custom version of __napi_complete().
5350			 * Only the current cpu owns and manipulates this napi,
5351			 * and NAPI_STATE_SCHED is the only possible flag set
5352			 * on backlog.
5353			 * We can use a plain write instead of clear_bit(),
5354			 * and we don't need an smp_mb() memory barrier.
5355			 */
5356			napi->state = 0;
5357			again = false;
5358		} else {
5359			skb_queue_splice_tail_init(&sd->input_pkt_queue,
5360						   &sd->process_queue);
5361		}
5362		rps_unlock(sd);
5363		local_irq_enable();
5364	}
5365
5366	return work;
5367}
5368
5369/**
5370 * __napi_schedule - schedule for receive
5371 * @n: entry to schedule
5372 *
5373 * The entry's receive function will be scheduled to run.
5374 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
5375 */
5376void __napi_schedule(struct napi_struct *n)
5377{
5378	unsigned long flags;
5379
5380	local_irq_save(flags);
5381	____napi_schedule(this_cpu_ptr(&softnet_data), n);
5382	local_irq_restore(flags);
5383}
5384EXPORT_SYMBOL(__napi_schedule);
5385
5386/**
5387 *	napi_schedule_prep - check if napi can be scheduled
5388 *	@n: napi context
5389 *
5390 * Test if NAPI routine is already running, and if not mark
5391 * it as running.  This is used as a condition variable to
5392 * ensure only one NAPI poll instance runs.  We also make
5393 * sure there is no pending NAPI disable.
5394 */
5395bool napi_schedule_prep(struct napi_struct *n)
5396{
5397	unsigned long val, new;
5398
5399	do {
5400		val = READ_ONCE(n->state);
5401		if (unlikely(val & NAPIF_STATE_DISABLE))
5402			return false;
5403		new = val | NAPIF_STATE_SCHED;
5404
5405		/* Sets STATE_MISSED bit if STATE_SCHED was already set
5406		 * This was suggested by Alexander Duyck, as compiler
5407		 * emits better code than :
5408		 * if (val & NAPIF_STATE_SCHED)
5409		 *     new |= NAPIF_STATE_MISSED;
5410		 */
5411		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
5412						   NAPIF_STATE_MISSED;
5413	} while (cmpxchg(&n->state, val, new) != val);
5414
5415	return !(val & NAPIF_STATE_SCHED);
5416}
5417EXPORT_SYMBOL(napi_schedule_prep);
5418
5419/**
5420 * __napi_schedule_irqoff - schedule for receive
5421 * @n: entry to schedule
5422 *
5423 * Variant of __napi_schedule() assuming hard irqs are masked
5424 */
5425void __napi_schedule_irqoff(struct napi_struct *n)
5426{
5427	____napi_schedule(this_cpu_ptr(&softnet_data), n);
5428}
5429EXPORT_SYMBOL(__napi_schedule_irqoff);
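/* Typical interrupt-handler pattern combining napi_schedule_prep() with
 * __napi_schedule_irqoff() (sketch only; struct my_priv and the IRQ-mask
 * helper are hypothetical driver details):
 *
 *	static irqreturn_t my_isr(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_hw_mask_rx_irq(priv);	// stop further RX irqs
 *			__napi_schedule_irqoff(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 *
 * Hard irqs are already masked inside an ISR, so the _irqoff variant
 * avoids a redundant local_irq_save()/local_irq_restore() pair.
 */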
5430
5431bool napi_complete_done(struct napi_struct *n, int work_done)
5432{
5433	unsigned long flags, val, new;
5434
5435	/*
5436	 * 1) Don't let napi dequeue from the cpu poll list
5437	 *    just in case it's running on a different cpu.
5438	 * 2) If we are busy polling, do nothing here, we have
5439	 *    the guarantee we will be called later.
5440	 */
5441	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
5442				 NAPIF_STATE_IN_BUSY_POLL)))
5443		return false;
5444
5445	if (n->gro_list) {
5446		unsigned long timeout = 0;
5447
5448		if (work_done)
5449			timeout = n->dev->gro_flush_timeout;
5450
5451		if (timeout)
5452			hrtimer_start(&n->timer, ns_to_ktime(timeout),
5453				      HRTIMER_MODE_REL_PINNED);
5454		else
5455			napi_gro_flush(n, false);
5456	}
5457	if (unlikely(!list_empty(&n->poll_list))) {
5458		/* If n->poll_list is not empty, we need to mask irqs */
5459		local_irq_save(flags);
5460		list_del_init(&n->poll_list);
5461		local_irq_restore(flags);
5462	}
5463
5464	do {
5465		val = READ_ONCE(n->state);
5466
5467		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5468
5469		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5470
5471		/* If STATE_MISSED was set, leave STATE_SCHED set,
5472		 * because we will call napi->poll() one more time.
5473		 * This C code was suggested by Alexander Duyck to help gcc.
5474		 */
5475		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5476						    NAPIF_STATE_SCHED;
5477	} while (cmpxchg(&n->state, val, new) != val);
5478
5479	if (unlikely(val & NAPIF_STATE_MISSED)) {
5480		__napi_schedule(n);
5481		return false;
5482	}
5483
5484	return true;
5485}
5486EXPORT_SYMBOL(napi_complete_done);
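/* Poll-routine sketch showing the expected napi_complete_done() usage
 * (my_clean_rx() and the IRQ-unmask helper are assumed, not real APIs):
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_priv *priv = container_of(napi, struct my_priv, napi);
 *		int work = my_clean_rx(priv, budget);
 *
 *		if (work < budget && napi_complete_done(napi, work))
 *			my_hw_unmask_rx_irq(priv);	// re-arm interrupts
 *		return work;
 *	}
 *
 * Checking the return value matters: when NAPI_STATE_MISSED was set,
 * napi_complete_done() reschedules the instance and returns false, and
 * the driver must keep its interrupts masked.
 */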
5487
5488/* must be called under rcu_read_lock(), as we dont take a reference */
5489static struct napi_struct *napi_by_id(unsigned int napi_id)
5490{
5491	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5492	struct napi_struct *napi;
5493
5494	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5495		if (napi->napi_id == napi_id)
5496			return napi;
5497
5498	return NULL;
5499}
5500
5501#if defined(CONFIG_NET_RX_BUSY_POLL)
5502
5503#define BUSY_POLL_BUDGET 8
5504
5505static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5506{
5507	int rc;
5508
5509	/* Busy polling means there is a high chance device driver hard irq
5510	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5511	 * set in napi_schedule_prep().
5512	 * Since we are about to call napi->poll() once more, we can safely
5513	 * clear NAPI_STATE_MISSED.
5514	 *
5515	 * Note: x86 could use a single "lock and ..." instruction
5516	 * to perform these two clear_bit() operations.
5517	 */
5518	clear_bit(NAPI_STATE_MISSED, &napi->state);
5519	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5520
5521	local_bh_disable();
5522
5523	/* All we really want here is to re-enable device interrupts.
5524	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5525	 */
5526	rc = napi->poll(napi, BUSY_POLL_BUDGET);
5527	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5528	netpoll_poll_unlock(have_poll_lock);
5529	if (rc == BUSY_POLL_BUDGET)
5530		__napi_schedule(napi);
5531	local_bh_enable();
5532}
5533
5534void napi_busy_loop(unsigned int napi_id,
5535		    bool (*loop_end)(void *, unsigned long),
5536		    void *loop_end_arg)
5537{
5538	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
5539	int (*napi_poll)(struct napi_struct *napi, int budget);
5540	void *have_poll_lock = NULL;
5541	struct napi_struct *napi;
5542
5543restart:
5544	napi_poll = NULL;
5545
5546	rcu_read_lock();
5547
5548	napi = napi_by_id(napi_id);
5549	if (!napi)
5550		goto out;
5551
5552	preempt_disable();
5553	for (;;) {
5554		int work = 0;
5555
5556		local_bh_disable();
5557		if (!napi_poll) {
5558			unsigned long val = READ_ONCE(napi->state);
5559
5560			/* If multiple threads are competing for this napi,
5561			 * we avoid dirtying napi->state as much as we can.
5562			 */
5563			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5564				   NAPIF_STATE_IN_BUSY_POLL))
5565				goto count;
5566			if (cmpxchg(&napi->state, val,
5567				    val | NAPIF_STATE_IN_BUSY_POLL |
5568					  NAPIF_STATE_SCHED) != val)
5569				goto count;
5570			have_poll_lock = netpoll_poll_lock(napi);
5571			napi_poll = napi->poll;
5572		}
5573		work = napi_poll(napi, BUSY_POLL_BUDGET);
5574		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
5575count:
5576		if (work > 0)
5577			__NET_ADD_STATS(dev_net(napi->dev),
5578					LINUX_MIB_BUSYPOLLRXPACKETS, work);
5579		local_bh_enable();
5580
5581		if (!loop_end || loop_end(loop_end_arg, start_time))
5582			break;
5583
5584		if (unlikely(need_resched())) {
5585			if (napi_poll)
5586				busy_poll_stop(napi, have_poll_lock);
5587			preempt_enable();
5588			rcu_read_unlock();
5589			cond_resched();
5590			if (loop_end(loop_end_arg, start_time))
5591				return;
5592			goto restart;
5593		}
5594		cpu_relax();
5595	}
5596	if (napi_poll)
5597		busy_poll_stop(napi, have_poll_lock);
5598	preempt_enable();
5599out:
5600	rcu_read_unlock();
5601}
5602EXPORT_SYMBOL(napi_busy_loop);
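/* Caller sketch for napi_busy_loop(); the in-tree user is the socket
 * busy-poll code, this only illustrates the loop_end contract
 * (my_loop_end is a hypothetical callback):
 *
 *	static bool my_loop_end(void *arg, unsigned long start_time)
 *	{
 *		return busy_loop_timeout(start_time);	// stop once the budget is spent
 *	}
 *
 *	napi_busy_loop(napi_id, my_loop_end, NULL);
 *
 * loop_end() is consulted after every poll round and returning true ends
 * the loop; passing a NULL loop_end makes napi_busy_loop() poll once.
 */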
5603
5604#endif /* CONFIG_NET_RX_BUSY_POLL */
5605
5606static void napi_hash_add(struct napi_struct *napi)
5607{
5608	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5609	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5610		return;
5611
5612	spin_lock(&napi_hash_lock);
5613
5614	/* 0..NR_CPUS range is reserved for sender_cpu use */
5615	do {
5616		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
5617			napi_gen_id = MIN_NAPI_ID;
5618	} while (napi_by_id(napi_gen_id));
5619	napi->napi_id = napi_gen_id;
5620
5621	hlist_add_head_rcu(&napi->napi_hash_node,
5622			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5623
5624	spin_unlock(&napi_hash_lock);
5625}
5626
5627/* Warning: the caller is responsible for making sure an RCU grace period
5628 * is respected before freeing the memory containing @napi
5629 */
5630bool napi_hash_del(struct napi_struct *napi)
5631{
5632	bool rcu_sync_needed = false;
5633
5634	spin_lock(&napi_hash_lock);
5635
5636	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5637		rcu_sync_needed = true;
5638		hlist_del_rcu(&napi->napi_hash_node);
5639	}
5640	spin_unlock(&napi_hash_lock);
5641	return rcu_sync_needed;
5642}
5643EXPORT_SYMBOL_GPL(napi_hash_del);
5644
5645static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5646{
5647	struct napi_struct *napi;
5648
5649	napi = container_of(timer, struct napi_struct, timer);
5650
5651	/* Note: we use a relaxed variant of napi_schedule_prep(), not setting
5652	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5653	 */
5654	if (napi->gro_list && !napi_disable_pending(napi) &&
5655	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5656		__napi_schedule_irqoff(napi);
5657
5658	return HRTIMER_NORESTART;
5659}
5660
5661void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5662		    int (*poll)(struct napi_struct *, int), int weight)
5663{
5664	INIT_LIST_HEAD(&napi->poll_list);
5665	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5666	napi->timer.function = napi_watchdog;
5667	napi->gro_count = 0;
5668	napi->gro_list = NULL;
5669	napi->skb = NULL;
5670	napi->poll = poll;
5671	if (weight > NAPI_POLL_WEIGHT)
5672		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5673			    weight, dev->name);
5674	napi->weight = weight;
5675	list_add(&napi->dev_list, &dev->napi_list);
5676	napi->dev = dev;
5677#ifdef CONFIG_NETPOLL
5678	napi->poll_owner = -1;
5679#endif
5680	set_bit(NAPI_STATE_SCHED, &napi->state);
5681	napi_hash_add(napi);
5682}
5683EXPORT_SYMBOL(netif_napi_add);
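/* Registration sketch, typically done at probe time ("priv", "my_poll"
 * and the netdev field are hypothetical; the weight should normally be
 * NAPI_POLL_WEIGHT):
 *
 *	netif_napi_add(priv->netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);		// usually from ndo_open()
 *
 * The matching teardown is napi_disable() followed by netif_napi_del(),
 * the latter in process context as noted below.
 */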
5684
5685void napi_disable(struct napi_struct *n)
5686{
5687	might_sleep();
5688	set_bit(NAPI_STATE_DISABLE, &n->state);
5689
5690	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5691		msleep(1);
5692	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5693		msleep(1);
5694
5695	hrtimer_cancel(&n->timer);
5696
5697	clear_bit(NAPI_STATE_DISABLE, &n->state);
5698}
5699EXPORT_SYMBOL(napi_disable);
5700
5701/* Must be called in process context */
5702void netif_napi_del(struct napi_struct *napi)
5703{
5704	might_sleep();
5705	if (napi_hash_del(napi))
5706		synchronize_net();
5707	list_del_init(&napi->dev_list);
5708	napi_free_frags(napi);
5709
5710	kfree_skb_list(napi->gro_list);
5711	napi->gro_list = NULL;
5712	napi->gro_count = 0;
5713}
5714EXPORT_SYMBOL(netif_napi_del);
5715
5716static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5717{
5718	void *have;
5719	int work, weight;
5720
5721	list_del_init(&n->poll_list);
5722
5723	have = netpoll_poll_lock(n);
5724
5725	weight = n->weight;
5726
5727	/* This NAPI_STATE_SCHED test is for avoiding a race
5728	 * with netpoll's poll_napi().  Only the entity which
5729	 * obtains the lock and sees NAPI_STATE_SCHED set will
5730	 * actually make the ->poll() call.  Therefore we avoid
5731	 * accidentally calling ->poll() when NAPI is not scheduled.
5732	 */
5733	work = 0;
5734	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5735		work = n->poll(n, weight);
5736		trace_napi_poll(n, work, weight);
5737	}
5738
5739	WARN_ON_ONCE(work > weight);
5740
5741	if (likely(work < weight))
5742		goto out_unlock;
5743
5744	/* Drivers must not modify the NAPI state if they
5745	 * consume the entire weight.  In such cases this code
5746	 * still "owns" the NAPI instance and therefore can
5747	 * move the instance around on the list at-will.
5748	 */
5749	if (unlikely(napi_disable_pending(n))) {
5750		napi_complete(n);
5751		goto out_unlock;
5752	}
5753
5754	if (n->gro_list) {
5755		/* Flush packets that are too old.
5756		 * If HZ < 1000, flush all packets.
5757		 */
5758		napi_gro_flush(n, HZ >= 1000);
5759	}
5760
5761	/* Some drivers may have called napi_schedule
5762	 * prior to exhausting their budget.
5763	 */
5764	if (unlikely(!list_empty(&n->poll_list))) {
5765		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5766			     n->dev ? n->dev->name : "backlog");
5767		goto out_unlock;
5768	}
5769
5770	list_add_tail(&n->poll_list, repoll);
5771
5772out_unlock:
5773	netpoll_poll_unlock(have);
5774
5775	return work;
5776}
5777
5778static __latent_entropy void net_rx_action(struct softirq_action *h)
5779{
5780	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5781	unsigned long time_limit = jiffies +
5782		usecs_to_jiffies(netdev_budget_usecs);
5783	int budget = netdev_budget;
5784	LIST_HEAD(list);
5785	LIST_HEAD(repoll);
5786
5787	local_irq_disable();
5788	list_splice_init(&sd->poll_list, &list);
5789	local_irq_enable();
5790
5791	for (;;) {
5792		struct napi_struct *n;
5793
5794		if (list_empty(&list)) {
5795			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5796				goto out;
5797			break;
5798		}
5799
5800		n = list_first_entry(&list, struct napi_struct, poll_list);
5801		budget -= napi_poll(n, &repoll);
5802
5803		/* If the softirq window is exhausted then punt.
5804		 * Allow this to run for 2 jiffies, which allows
5805		 * an average latency of 1.5/HZ.
5806		 */
5807		if (unlikely(budget <= 0 ||
5808			     time_after_eq(jiffies, time_limit))) {
5809			sd->time_squeeze++;
5810			break;
5811		}
5812	}
5813
5814	local_irq_disable();
5815
5816	list_splice_tail_init(&sd->poll_list, &list);
5817	list_splice_tail(&repoll, &list);
5818	list_splice(&list, &sd->poll_list);
5819	if (!list_empty(&sd->poll_list))
5820		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5821
5822	net_rps_action_and_irq_enable(sd);
5823out:
5824	__kfree_skb_flush();
5825}
5826
5827struct netdev_adjacent {
5828	struct net_device *dev;
5829
5830	/* upper master flag, there can only be one master device per list */
5831	bool master;
5832
5833	/* counter for the number of times this device was added to us */
5834	u16 ref_nr;
5835
5836	/* private field for the users */
5837	void *private;
5838
5839	struct list_head list;
5840	struct rcu_head rcu;
5841};
5842
5843static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5844						 struct list_head *adj_list)
5845{
5846	struct netdev_adjacent *adj;
5847
5848	list_for_each_entry(adj, adj_list, list) {
5849		if (adj->dev == adj_dev)
5850			return adj;
5851	}
5852	return NULL;
5853}
5854
5855static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5856{
5857	struct net_device *dev = data;
5858
5859	return upper_dev == dev;
5860}
5861
5862/**
5863 * netdev_has_upper_dev - Check if device is linked to an upper device
5864 * @dev: device
5865 * @upper_dev: upper device to check
5866 *
5867 * Find out if a device is linked to the specified upper device and return
5868 * true in case it is. Note that this checks the entire chain of upper
5869 * devices, not only the immediate one. The caller must hold the RTNL lock.
5870 */
5871bool netdev_has_upper_dev(struct net_device *dev,
5872			  struct net_device *upper_dev)
5873{
5874	ASSERT_RTNL();
5875
5876	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5877					     upper_dev);
5878}
5879EXPORT_SYMBOL(netdev_has_upper_dev);
5880
5881/**
5882 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5883 * @dev: device
5884 * @upper_dev: upper device to check
5885 *
5886 * Find out if a device is linked to specified upper device and return true
5887 * in case it is. Note that this checks the entire upper device chain.
5888 * The caller must hold the RCU read lock.
5889 */
5890
5891bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5892				  struct net_device *upper_dev)
5893{
5894	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5895					       upper_dev);
5896}
5897EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5898
5899/**
5900 * netdev_has_any_upper_dev - Check if device is linked to some device
5901 * @dev: device
5902 *
5903 * Find out if a device is linked to an upper device and return true in case
5904 * it is. The caller must hold the RTNL lock.
5905 */
5906bool netdev_has_any_upper_dev(struct net_device *dev)
5907{
5908	ASSERT_RTNL();
5909
5910	return !list_empty(&dev->adj_list.upper);
5911}
5912EXPORT_SYMBOL(netdev_has_any_upper_dev);
5913
5914/**
5915 * netdev_master_upper_dev_get - Get master upper device
5916 * @dev: device
5917 *
5918 * Find a master upper device and return pointer to it or NULL in case
5919 * it's not there. The caller must hold the RTNL lock.
5920 */
5921struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5922{
5923	struct netdev_adjacent *upper;
5924
5925	ASSERT_RTNL();
5926
5927	if (list_empty(&dev->adj_list.upper))
5928		return NULL;
5929
5930	upper = list_first_entry(&dev->adj_list.upper,
5931				 struct netdev_adjacent, list);
5932	if (likely(upper->master))
5933		return upper->dev;
5934	return NULL;
5935}
5936EXPORT_SYMBOL(netdev_master_upper_dev_get);
5937
5938/**
5939 * netdev_has_any_lower_dev - Check if device is linked to some device
5940 * @dev: device
5941 *
5942 * Find out if a device is linked to a lower device and return true in case
5943 * it is. The caller must hold the RTNL lock.
5944 */
5945static bool netdev_has_any_lower_dev(struct net_device *dev)
5946{
5947	ASSERT_RTNL();
5948
5949	return !list_empty(&dev->adj_list.lower);
5950}
5951
5952void *netdev_adjacent_get_private(struct list_head *adj_list)
5953{
5954	struct netdev_adjacent *adj;
5955
5956	adj = list_entry(adj_list, struct netdev_adjacent, list);
5957
5958	return adj->private;
5959}
5960EXPORT_SYMBOL(netdev_adjacent_get_private);
5961
5962/**
5963 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5964 * @dev: device
5965 * @iter: list_head ** of the current position
5966 *
5967 * Gets the next device from the dev's upper list, starting from iter
5968 * position. The caller must hold the RCU read lock.
5969 */
5970struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5971						 struct list_head **iter)
5972{
5973	struct netdev_adjacent *upper;
5974
5975	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5976
5977	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5978
5979	if (&upper->list == &dev->adj_list.upper)
5980		return NULL;
5981
5982	*iter = &upper->list;
5983
5984	return upper->dev;
5985}
5986EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5987
5988static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5989						    struct list_head **iter)
5990{
5991	struct netdev_adjacent *upper;
5992
5993	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5994
5995	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5996
5997	if (&upper->list == &dev->adj_list.upper)
5998		return NULL;
5999
6000	*iter = &upper->list;
6001
6002	return upper->dev;
6003}
6004
6005int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6006				  int (*fn)(struct net_device *dev,
6007					    void *data),
6008				  void *data)
6009{
6010	struct net_device *udev;
6011	struct list_head *iter;
6012	int ret;
6013
6014	for (iter = &dev->adj_list.upper,
6015	     udev = netdev_next_upper_dev_rcu(dev, &iter);
6016	     udev;
6017	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
6018		/* first is the upper device itself */
6019		ret = fn(udev, data);
6020		if (ret)
6021			return ret;
6022
6023		/* then look at all of its upper devices */
6024		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
6025		if (ret)
6026			return ret;
6027	}
6028
6029	return 0;
6030}
6031EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
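/* Walker-callback sketch: count every upper device above @dev under
 * rcu_read_lock() or RTNL (my_count_upper is a hypothetical helper,
 * shown only to illustrate the fn/data contract):
 *
 *	static int my_count_upper(struct net_device *upper, void *data)
 *	{
 *		int *count = data;
 *
 *		(*count)++;
 *		return 0;		// a non-zero return stops the walk
 *	}
 *
 *	int n = 0;
 *	netdev_walk_all_upper_dev_rcu(dev, my_count_upper, &n);
 *
 * The walk is depth-first and may visit a device more than once when the
 * topology reaches it through several paths.
 */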
6032
6033/**
6034 * netdev_lower_get_next_private - Get the next ->private from the
6035 *				   lower neighbour list
6036 * @dev: device
6037 * @iter: list_head ** of the current position
6038 *
6039 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6040 * list, starting from iter position. The caller must hold either hold the
6041 * RTNL lock or its own locking that guarantees that the neighbour lower
6042 * list will remain unchanged.
6043 */
6044void *netdev_lower_get_next_private(struct net_device *dev,
6045				    struct list_head **iter)
6046{
6047	struct netdev_adjacent *lower;
6048
6049	lower = list_entry(*iter, struct netdev_adjacent, list);
6050
6051	if (&lower->list == &dev->adj_list.lower)
6052		return NULL;
6053
6054	*iter = lower->list.next;
6055
6056	return lower->private;
6057}
6058EXPORT_SYMBOL(netdev_lower_get_next_private);
6059
6060/**
6061 * netdev_lower_get_next_private_rcu - Get the next ->private from the
6062 *				       lower neighbour list, RCU
6063 *				       variant
6064 * @dev: device
6065 * @iter: list_head ** of the current position
6066 *
6067 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6068 * list, starting from iter position. The caller must hold RCU read lock.
6069 */
6070void *netdev_lower_get_next_private_rcu(struct net_device *dev,
6071					struct list_head **iter)
6072{
6073	struct netdev_adjacent *lower;
6074
6075	WARN_ON_ONCE(!rcu_read_lock_held());
6076
6077	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6078
6079	if (&lower->list == &dev->adj_list.lower)
6080		return NULL;
6081
6082	*iter = &lower->list;
6083
6084	return lower->private;
6085}
6086EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
6087
6088/**
6089 * netdev_lower_get_next - Get the next device from the lower neighbour
6090 *                         list
6091 * @dev: device
6092 * @iter: list_head ** of the current position
6093 *
6094 * Gets the next netdev_adjacent from the dev's lower neighbour
6095 * list, starting from iter position. The caller must hold RTNL lock or
6096 * its own locking that guarantees that the neighbour lower
6097 * list will remain unchanged.
6098 */
6099void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
6100{
6101	struct netdev_adjacent *lower;
6102
6103	lower = list_entry(*iter, struct netdev_adjacent, list);
6104
6105	if (&lower->list == &dev->adj_list.lower)
6106		return NULL;
6107
6108	*iter = lower->list.next;
6109
6110	return lower->dev;
6111}
6112EXPORT_SYMBOL(netdev_lower_get_next);
6113
6114static struct net_device *netdev_next_lower_dev(struct net_device *dev,
6115						struct list_head **iter)
6116{
6117	struct netdev_adjacent *lower;
6118
6119	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
6120
6121	if (&lower->list == &dev->adj_list.lower)
6122		return NULL;
6123
6124	*iter = &lower->list;
6125
6126	return lower->dev;
6127}
6128
6129int netdev_walk_all_lower_dev(struct net_device *dev,
6130			      int (*fn)(struct net_device *dev,
6131					void *data),
6132			      void *data)
6133{
6134	struct net_device *ldev;
6135	struct list_head *iter;
6136	int ret;
6137
6138	for (iter = &dev->adj_list.lower,
6139	     ldev = netdev_next_lower_dev(dev, &iter);
6140	     ldev;
6141	     ldev = netdev_next_lower_dev(dev, &iter)) {
6142		/* first is the lower device itself */
6143		ret = fn(ldev, data);
6144		if (ret)
6145			return ret;
6146
6147		/* then look at all of its lower devices */
6148		ret = netdev_walk_all_lower_dev(ldev, fn, data);
6149		if (ret)
6150			return ret;
6151	}
6152
6153	return 0;
6154}
6155EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
6156
6157static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6158						    struct list_head **iter)
6159{
6160	struct netdev_adjacent *lower;
6161
6162	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6163	if (&lower->list == &dev->adj_list.lower)
6164		return NULL;
6165
6166	*iter = &lower->list;
6167
6168	return lower->dev;
6169}
6170
6171int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
6172				  int (*fn)(struct net_device *dev,
6173					    void *data),
6174				  void *data)
6175{
6176	struct net_device *ldev;
6177	struct list_head *iter;
6178	int ret;
6179
6180	for (iter = &dev->adj_list.lower,
6181	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
6182	     ldev;
6183	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
6184		/* first is the lower device itself */
6185		ret = fn(ldev, data);
6186		if (ret)
6187			return ret;
6188
6189		/* then look at all of its lower devices */
6190		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
6191		if (ret)
6192			return ret;
6193	}
6194
6195	return 0;
6196}
6197EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
6198
6199/**
6200 * netdev_lower_get_first_private_rcu - Get the first ->private from the
6201 *				       lower neighbour list, RCU
6202 *				       variant
6203 * @dev: device
6204 *
6205 * Gets the first netdev_adjacent->private from the dev's lower neighbour
6206 * list. The caller must hold RCU read lock.
6207 */
6208void *netdev_lower_get_first_private_rcu(struct net_device *dev)
6209{
6210	struct netdev_adjacent *lower;
6211
6212	lower = list_first_or_null_rcu(&dev->adj_list.lower,
6213			struct netdev_adjacent, list);
6214	if (lower)
6215		return lower->private;
6216	return NULL;
6217}
6218EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
6219
6220/**
6221 * netdev_master_upper_dev_get_rcu - Get master upper device
6222 * @dev: device
6223 *
6224 * Find a master upper device and return pointer to it or NULL in case
6225 * it's not there. The caller must hold the RCU read lock.
6226 */
6227struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
6228{
6229	struct netdev_adjacent *upper;
6230
6231	upper = list_first_or_null_rcu(&dev->adj_list.upper,
6232				       struct netdev_adjacent, list);
6233	if (upper && likely(upper->master))
6234		return upper->dev;
6235	return NULL;
6236}
6237EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
6238
6239static int netdev_adjacent_sysfs_add(struct net_device *dev,
6240			      struct net_device *adj_dev,
6241			      struct list_head *dev_list)
6242{
6243	char linkname[IFNAMSIZ+7];
6244
6245	sprintf(linkname, dev_list == &dev->adj_list.upper ?
6246		"upper_%s" : "lower_%s", adj_dev->name);
6247	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
6248				 linkname);
6249}
6250static void netdev_adjacent_sysfs_del(struct net_device *dev,
6251			       char *name,
6252			       struct list_head *dev_list)
6253{
6254	char linkname[IFNAMSIZ+7];
6255
6256	sprintf(linkname, dev_list == &dev->adj_list.upper ?
6257		"upper_%s" : "lower_%s", name);
6258	sysfs_remove_link(&(dev->dev.kobj), linkname);
6259}
6260
6261static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
6262						 struct net_device *adj_dev,
6263						 struct list_head *dev_list)
6264{
6265	return (dev_list == &dev->adj_list.upper ||
6266		dev_list == &dev->adj_list.lower) &&
6267		net_eq(dev_net(dev), dev_net(adj_dev));
6268}
6269
6270static int __netdev_adjacent_dev_insert(struct net_device *dev,
6271					struct net_device *adj_dev,
6272					struct list_head *dev_list,
6273					void *private, bool master)
6274{
6275	struct netdev_adjacent *adj;
6276	int ret;
6277
6278	adj = __netdev_find_adj(adj_dev, dev_list);
6279
6280	if (adj) {
6281		adj->ref_nr += 1;
6282		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
6283			 dev->name, adj_dev->name, adj->ref_nr);
6284
6285		return 0;
6286	}
6287
6288	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
6289	if (!adj)
6290		return -ENOMEM;
6291
6292	adj->dev = adj_dev;
6293	adj->master = master;
6294	adj->ref_nr = 1;
6295	adj->private = private;
6296	dev_hold(adj_dev);
6297
6298	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
6299		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
6300
6301	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
6302		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
6303		if (ret)
6304			goto free_adj;
6305	}
6306
6307	/* Ensure that master link is always the first item in list. */
6308	if (master) {
6309		ret = sysfs_create_link(&(dev->dev.kobj),
6310					&(adj_dev->dev.kobj), "master");
6311		if (ret)
6312			goto remove_symlinks;
6313
6314		list_add_rcu(&adj->list, dev_list);
6315	} else {
6316		list_add_tail_rcu(&adj->list, dev_list);
6317	}
6318
6319	return 0;
6320
6321remove_symlinks:
6322	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6323		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6324free_adj:
6325	kfree(adj);
6326	dev_put(adj_dev);
6327
6328	return ret;
6329}
6330
6331static void __netdev_adjacent_dev_remove(struct net_device *dev,
6332					 struct net_device *adj_dev,
6333					 u16 ref_nr,
6334					 struct list_head *dev_list)
6335{
6336	struct netdev_adjacent *adj;
6337
6338	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
6339		 dev->name, adj_dev->name, ref_nr);
6340
6341	adj = __netdev_find_adj(adj_dev, dev_list);
6342
6343	if (!adj) {
6344		pr_err("Adjacency does not exist for device %s from %s\n",
6345		       dev->name, adj_dev->name);
6346		WARN_ON(1);
6347		return;
6348	}
6349
6350	if (adj->ref_nr > ref_nr) {
6351		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
6352			 dev->name, adj_dev->name, ref_nr,
6353			 adj->ref_nr - ref_nr);
6354		adj->ref_nr -= ref_nr;
6355		return;
6356	}
6357
6358	if (adj->master)
6359		sysfs_remove_link(&(dev->dev.kobj), "master");
6360
6361	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6362		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6363
6364	list_del_rcu(&adj->list);
6365	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
6366		 adj_dev->name, dev->name, adj_dev->name);
6367	dev_put(adj_dev);
6368	kfree_rcu(adj, rcu);
6369}
6370
6371static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
6372					    struct net_device *upper_dev,
6373					    struct list_head *up_list,
6374					    struct list_head *down_list,
6375					    void *private, bool master)
6376{
6377	int ret;
6378
6379	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
6380					   private, master);
6381	if (ret)
6382		return ret;
6383
6384	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
6385					   private, false);
6386	if (ret) {
6387		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
6388		return ret;
6389	}
6390
6391	return 0;
6392}
6393
6394static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
6395					       struct net_device *upper_dev,
6396					       u16 ref_nr,
6397					       struct list_head *up_list,
6398					       struct list_head *down_list)
6399{
6400	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
6401	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
6402}
6403
6404static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
6405						struct net_device *upper_dev,
6406						void *private, bool master)
6407{
6408	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
6409						&dev->adj_list.upper,
6410						&upper_dev->adj_list.lower,
6411						private, master);
6412}
6413
6414static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
6415						   struct net_device *upper_dev)
6416{
6417	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
6418					   &dev->adj_list.upper,
6419					   &upper_dev->adj_list.lower);
6420}
6421
6422static int __netdev_upper_dev_link(struct net_device *dev,
6423				   struct net_device *upper_dev, bool master,
6424				   void *upper_priv, void *upper_info,
6425				   struct netlink_ext_ack *extack)
6426{
6427	struct netdev_notifier_changeupper_info changeupper_info = {
6428		.info = {
6429			.dev = dev,
6430			.extack = extack,
6431		},
6432		.upper_dev = upper_dev,
6433		.master = master,
6434		.linking = true,
6435		.upper_info = upper_info,
6436	};
6437	struct net_device *master_dev;
6438	int ret = 0;
6439
6440	ASSERT_RTNL();
6441
6442	if (dev == upper_dev)
6443		return -EBUSY;
6444
6445	/* To prevent loops, check that dev is not an upper device of upper_dev. */
6446	if (netdev_has_upper_dev(upper_dev, dev))
6447		return -EBUSY;
6448
6449	if (!master) {
6450		if (netdev_has_upper_dev(dev, upper_dev))
6451			return -EEXIST;
6452	} else {
6453		master_dev = netdev_master_upper_dev_get(dev);
6454		if (master_dev)
6455			return master_dev == upper_dev ? -EEXIST : -EBUSY;
6456	}
6457
6458	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
6459					    &changeupper_info.info);
6460	ret = notifier_to_errno(ret);
6461	if (ret)
6462		return ret;
6463
6464	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6465						   master);
6466	if (ret)
6467		return ret;
6468
6469	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
6470					    &changeupper_info.info);
6471	ret = notifier_to_errno(ret);
6472	if (ret)
6473		goto rollback;
6474
6475	return 0;
6476
6477rollback:
6478	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6479
6480	return ret;
6481}
6482
6483/**
6484 * netdev_upper_dev_link - Add a link to the upper device
6485 * @dev: device
6486 * @upper_dev: new upper device
6487 * @extack: netlink extended ack
6488 *
6489 * Adds a link to a device which is upper to this one. The caller must hold
6490 * the RTNL lock. On a failure a negative errno code is returned.
6491 * On success the reference counts are adjusted and the function
6492 * returns zero.
6493 */
6494int netdev_upper_dev_link(struct net_device *dev,
6495			  struct net_device *upper_dev,
6496			  struct netlink_ext_ack *extack)
6497{
6498	return __netdev_upper_dev_link(dev, upper_dev, false,
6499				       NULL, NULL, extack);
6500}
6501EXPORT_SYMBOL(netdev_upper_dev_link);
6502
6503/**
6504 * netdev_master_upper_dev_link - Add a master link to the upper device
6505 * @dev: device
6506 * @upper_dev: new upper device
6507 * @upper_priv: upper device private
6508 * @upper_info: upper info to be passed down via notifier
6509 * @extack: netlink extended ack
6510 *
6511 * Adds a link to a device which is upper to this one. In this case, only
6512 * one master upper device can be linked, although other non-master devices
6513 * might be linked as well. The caller must hold the RTNL lock.
6514 * On a failure a negative errno code is returned. On success the reference
6515 * counts are adjusted and the function returns zero.
6516 */
6517int netdev_master_upper_dev_link(struct net_device *dev,
6518				 struct net_device *upper_dev,
6519				 void *upper_priv, void *upper_info,
6520				 struct netlink_ext_ack *extack)
6521{
6522	return __netdev_upper_dev_link(dev, upper_dev, true,
6523				       upper_priv, upper_info, extack);
6524}
6525EXPORT_SYMBOL(netdev_master_upper_dev_link);
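/* Enslave-style sketch (bonding/team-like usage; slave_dev, master_dev
 * and extack come from the hypothetical caller's context):
 *
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev,
 *					   NULL, NULL, extack);
 *	if (err)
 *		goto err_upper_link;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, master_dev);	// on teardown
 *
 * Only one master may be linked per device: a second attempt fails with
 * -EBUSY, or -EEXIST if the same master is linked twice.
 */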
6526
6527/**
6528 * netdev_upper_dev_unlink - Removes a link to upper device
6529 * @dev: device
6530 * @upper_dev: upper device to unlink
6531 *
6532 * Removes a link to a device which is upper to this one. The caller must hold
6533 * the RTNL lock.
6534 */
6535void netdev_upper_dev_unlink(struct net_device *dev,
6536			     struct net_device *upper_dev)
6537{
6538	struct netdev_notifier_changeupper_info changeupper_info = {
6539		.info = {
6540			.dev = dev,
6541		},
6542		.upper_dev = upper_dev,
6543		.linking = false,
6544	};
6545
6546	ASSERT_RTNL();
6547
6548	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6549
6550	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
6551				      &changeupper_info.info);
6552
6553	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6554
6555	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
6556				      &changeupper_info.info);
6557}
6558EXPORT_SYMBOL(netdev_upper_dev_unlink);
6559
6560/**
6561 * netdev_bonding_info_change - Dispatch event about slave change
6562 * @dev: device
6563 * @bonding_info: info to dispatch
6564 *
6565 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6566 * The caller must hold the RTNL lock.
6567 */
6568void netdev_bonding_info_change(struct net_device *dev,
6569				struct netdev_bonding_info *bonding_info)
6570{
6571	struct netdev_notifier_bonding_info info = {
6572		.info.dev = dev,
6573	};
6574
6575	memcpy(&info.bonding_info, bonding_info,
6576	       sizeof(struct netdev_bonding_info));
6577	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
6578				      &info.info);
6579}
6580EXPORT_SYMBOL(netdev_bonding_info_change);
6581
6582static void netdev_adjacent_add_links(struct net_device *dev)
6583{
6584	struct netdev_adjacent *iter;
6585
6586	struct net *net = dev_net(dev);
6587
6588	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6589		if (!net_eq(net, dev_net(iter->dev)))
6590			continue;
6591		netdev_adjacent_sysfs_add(iter->dev, dev,
6592					  &iter->dev->adj_list.lower);
6593		netdev_adjacent_sysfs_add(dev, iter->dev,
6594					  &dev->adj_list.upper);
6595	}
6596
6597	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6598		if (!net_eq(net, dev_net(iter->dev)))
6599			continue;
6600		netdev_adjacent_sysfs_add(iter->dev, dev,
6601					  &iter->dev->adj_list.upper);
6602		netdev_adjacent_sysfs_add(dev, iter->dev,
6603					  &dev->adj_list.lower);
6604	}
6605}
6606
6607static void netdev_adjacent_del_links(struct net_device *dev)
6608{
6609	struct netdev_adjacent *iter;
6610
6611	struct net *net = dev_net(dev);
6612
6613	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6614		if (!net_eq(net, dev_net(iter->dev)))
6615			continue;
6616		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6617					  &iter->dev->adj_list.lower);
6618		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6619					  &dev->adj_list.upper);
6620	}
6621
6622	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6623		if (!net_eq(net, dev_net(iter->dev)))
6624			continue;
6625		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6626					  &iter->dev->adj_list.upper);
6627		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6628					  &dev->adj_list.lower);
6629	}
6630}
6631
6632void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6633{
6634	struct netdev_adjacent *iter;
6635
6636	struct net *net = dev_net(dev);
6637
6638	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6639		if (!net_eq(net, dev_net(iter->dev)))
6640			continue;
6641		netdev_adjacent_sysfs_del(iter->dev, oldname,
6642					  &iter->dev->adj_list.lower);
6643		netdev_adjacent_sysfs_add(iter->dev, dev,
6644					  &iter->dev->adj_list.lower);
6645	}
6646
6647	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6648		if (!net_eq(net, dev_net(iter->dev)))
6649			continue;
6650		netdev_adjacent_sysfs_del(iter->dev, oldname,
6651					  &iter->dev->adj_list.upper);
6652		netdev_adjacent_sysfs_add(iter->dev, dev,
6653					  &iter->dev->adj_list.upper);
6654	}
6655}
6656
6657void *netdev_lower_dev_get_private(struct net_device *dev,
6658				   struct net_device *lower_dev)
6659{
6660	struct netdev_adjacent *lower;
6661
6662	if (!lower_dev)
6663		return NULL;
6664	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6665	if (!lower)
6666		return NULL;
6667
6668	return lower->private;
6669}
6670EXPORT_SYMBOL(netdev_lower_dev_get_private);
6671
6672
6673int dev_get_nest_level(struct net_device *dev)
6674{
6675	struct net_device *lower = NULL;
6676	struct list_head *iter;
6677	int max_nest = -1;
6678	int nest;
6679
6680	ASSERT_RTNL();
6681
6682	netdev_for_each_lower_dev(dev, lower, iter) {
6683		nest = dev_get_nest_level(lower);
6684		if (max_nest < nest)
6685			max_nest = nest;
6686	}
6687
6688	return max_nest + 1;
6689}
6690EXPORT_SYMBOL(dev_get_nest_level);
6691
6692/**
6693 * netdev_lower_state_changed - Dispatch event about lower device state change
6694 * @lower_dev: device
6695 * @lower_state_info: state to dispatch
6696 *
6697 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6698 * The caller must hold the RTNL lock.
6699 */
6700void netdev_lower_state_changed(struct net_device *lower_dev,
6701				void *lower_state_info)
6702{
6703	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
6704		.info.dev = lower_dev,
6705	};
6706
6707	ASSERT_RTNL();
6708	changelowerstate_info.lower_state_info = lower_state_info;
6709	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
6710				      &changelowerstate_info.info);
6711}
6712EXPORT_SYMBOL(netdev_lower_state_changed);
6713
6714static void dev_change_rx_flags(struct net_device *dev, int flags)
6715{
6716	const struct net_device_ops *ops = dev->netdev_ops;
6717
6718	if (ops->ndo_change_rx_flags)
6719		ops->ndo_change_rx_flags(dev, flags);
6720}
6721
6722static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6723{
6724	unsigned int old_flags = dev->flags;
6725	kuid_t uid;
6726	kgid_t gid;
6727
6728	ASSERT_RTNL();
6729
6730	dev->flags |= IFF_PROMISC;
6731	dev->promiscuity += inc;
6732	if (dev->promiscuity == 0) {
6733		/*
6734		 * Avoid overflow.
6735		 * If inc causes overflow, untouch promisc and return error.
6736		 */
6737		if (inc < 0)
6738			dev->flags &= ~IFF_PROMISC;
6739		else {
6740			dev->promiscuity -= inc;
6741			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6742				dev->name);
6743			return -EOVERFLOW;
6744		}
6745	}
6746	if (dev->flags != old_flags) {
6747		pr_info("device %s %s promiscuous mode\n",
6748			dev->name,
6749			dev->flags & IFF_PROMISC ? "entered" : "left");
6750		if (audit_enabled) {
6751			current_uid_gid(&uid, &gid);
6752			audit_log(current->audit_context, GFP_ATOMIC,
6753				AUDIT_ANOM_PROMISCUOUS,
6754				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6755				dev->name, (dev->flags & IFF_PROMISC),
6756				(old_flags & IFF_PROMISC),
6757				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6758				from_kuid(&init_user_ns, uid),
6759				from_kgid(&init_user_ns, gid),
6760				audit_get_sessionid(current));
6761		}
6762
6763		dev_change_rx_flags(dev, IFF_PROMISC);
6764	}
6765	if (notify)
6766		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6767	return 0;
6768}
6769
6770/**
6771 *	dev_set_promiscuity	- update promiscuity count on a device
6772 *	@dev: device
6773 *	@inc: modifier
6774 *
6775 *	Add or remove promiscuity from a device. While the count in the device
6776 *	remains above zero the interface remains promiscuous. Once it hits zero
6777 *	the device reverts back to normal filtering operation. A negative inc
6778 *	value is used to drop promiscuity on the device.
6779 *	Return 0 if successful or a negative errno code on error.
6780 */
6781int dev_set_promiscuity(struct net_device *dev, int inc)
6782{
6783	unsigned int old_flags = dev->flags;
6784	int err;
6785
6786	err = __dev_set_promiscuity(dev, inc, true);
6787	if (err < 0)
6788		return err;
6789	if (dev->flags != old_flags)
6790		dev_set_rx_mode(dev);
6791	return err;
6792}
6793EXPORT_SYMBOL(dev_set_promiscuity);
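/*
 * A minimal usage sketch (not part of this file): a hypothetical capture or
 * bridging component could take one promiscuity reference while it needs to
 * see all frames and drop it symmetrically afterwards.  "sniff_dev" is an
 * assumed pointer held by the caller; RTNL must be held around both calls.
 *
 *	int foo_start_capture(struct net_device *sniff_dev)
 *	{
 *		ASSERT_RTNL();
 *		return dev_set_promiscuity(sniff_dev, 1);
 *	}
 *
 *	void foo_stop_capture(struct net_device *sniff_dev)
 *	{
 *		ASSERT_RTNL();
 *		dev_set_promiscuity(sniff_dev, -1);
 *	}
 */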
6794
6795static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6796{
6797	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6798
6799	ASSERT_RTNL();
6800
6801	dev->flags |= IFF_ALLMULTI;
6802	dev->allmulti += inc;
6803	if (dev->allmulti == 0) {
6804		/*
6805		 * Avoid overflow.
6806		 * If inc causes overflow, leave allmulti untouched and return an error.
6807		 */
6808		if (inc < 0)
6809			dev->flags &= ~IFF_ALLMULTI;
6810		else {
6811			dev->allmulti -= inc;
6812			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6813				dev->name);
6814			return -EOVERFLOW;
6815		}
6816	}
6817	if (dev->flags ^ old_flags) {
6818		dev_change_rx_flags(dev, IFF_ALLMULTI);
6819		dev_set_rx_mode(dev);
6820		if (notify)
6821			__dev_notify_flags(dev, old_flags,
6822					   dev->gflags ^ old_gflags);
6823	}
6824	return 0;
6825}
6826
6827/**
6828 *	dev_set_allmulti	- update allmulti count on a device
6829 *	@dev: device
6830 *	@inc: modifier
6831 *
6832 *	Add or remove reception of all multicast frames on a device. While the
6833 *	count in the device remains above zero the interface remains listening
6834 *	to all multicast frames. Once it hits zero the device reverts back to normal
6835 *	filtering operation. A negative @inc value is used to drop the counter
6836 *	when releasing a resource needing all multicasts.
6837 *	Return 0 if successful or a negative errno code on error.
6838 */
6839
6840int dev_set_allmulti(struct net_device *dev, int inc)
6841{
6842	return __dev_set_allmulti(dev, inc, true);
6843}
6844EXPORT_SYMBOL(dev_set_allmulti);
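/*
 * A minimal usage sketch (not part of this file): an upper device that must
 * receive every multicast frame on a lower device could hold one allmulti
 * reference for the lifetime of that requirement and release it on teardown.
 * "lower_dev" is an assumed pointer; RTNL must be held.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_allmulti(lower_dev, 1);
 *	if (err)
 *		goto fail;
 *	...
 *	dev_set_allmulti(lower_dev, -1);
 */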
6845
6846/*
6847 *	Upload unicast and multicast address lists to device and
6848 *	configure RX filtering. When the device doesn't support unicast
6849 *	filtering it is put in promiscuous mode while unicast addresses
6850 *	are present.
6851 */
6852void __dev_set_rx_mode(struct net_device *dev)
6853{
6854	const struct net_device_ops *ops = dev->netdev_ops;
6855
6856	/* dev_open will call this function so the list will stay sane. */
6857	if (!(dev->flags&IFF_UP))
6858		return;
6859
6860	if (!netif_device_present(dev))
6861		return;
6862
6863	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6864		/* Unicast address changes may only happen under the rtnl,
6865		 * therefore calling __dev_set_promiscuity here is safe.
6866		 */
6867		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6868			__dev_set_promiscuity(dev, 1, false);
6869			dev->uc_promisc = true;
6870		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6871			__dev_set_promiscuity(dev, -1, false);
6872			dev->uc_promisc = false;
6873		}
6874	}
6875
6876	if (ops->ndo_set_rx_mode)
6877		ops->ndo_set_rx_mode(dev);
6878}
6879
6880void dev_set_rx_mode(struct net_device *dev)
6881{
6882	netif_addr_lock_bh(dev);
6883	__dev_set_rx_mode(dev);
6884	netif_addr_unlock_bh(dev);
6885}
6886
6887/**
6888 *	dev_get_flags - get flags reported to userspace
6889 *	@dev: device
6890 *
6891 *	Get the combination of flag bits exported through APIs to userspace.
6892 */
6893unsigned int dev_get_flags(const struct net_device *dev)
6894{
6895	unsigned int flags;
6896
6897	flags = (dev->flags & ~(IFF_PROMISC |
6898				IFF_ALLMULTI |
6899				IFF_RUNNING |
6900				IFF_LOWER_UP |
6901				IFF_DORMANT)) |
6902		(dev->gflags & (IFF_PROMISC |
6903				IFF_ALLMULTI));
6904
6905	if (netif_running(dev)) {
6906		if (netif_oper_up(dev))
6907			flags |= IFF_RUNNING;
6908		if (netif_carrier_ok(dev))
6909			flags |= IFF_LOWER_UP;
6910		if (netif_dormant(dev))
6911			flags |= IFF_DORMANT;
6912	}
6913
6914	return flags;
6915}
6916EXPORT_SYMBOL(dev_get_flags);
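/*
 * A minimal usage sketch (not part of this file): reading the flags exactly
 * as userspace would see them, for example to test whether the interface is
 * both administratively up and operationally running.
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *		netdev_info(dev, "link is up and running\n");
 */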
6917
6918int __dev_change_flags(struct net_device *dev, unsigned int flags)
6919{
6920	unsigned int old_flags = dev->flags;
6921	int ret;
6922
6923	ASSERT_RTNL();
6924
6925	/*
6926	 *	Set the flags on our device.
6927	 */
6928
6929	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6930			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6931			       IFF_AUTOMEDIA)) |
6932		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6933				    IFF_ALLMULTI));
6934
6935	/*
6936	 *	Load in the correct multicast list now the flags have changed.
6937	 */
6938
6939	if ((old_flags ^ flags) & IFF_MULTICAST)
6940		dev_change_rx_flags(dev, IFF_MULTICAST);
6941
6942	dev_set_rx_mode(dev);
6943
6944	/*
6945	 *	Have we downed the interface? We handle IFF_UP ourselves
6946	 *	according to user attempts to set it, rather than blindly
6947	 *	setting it.
6948	 */
6949
6950	ret = 0;
6951	if ((old_flags ^ flags) & IFF_UP) {
6952		if (old_flags & IFF_UP)
6953			__dev_close(dev);
6954		else
6955			ret = __dev_open(dev);
6956	}
6957
6958	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6959		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6960		unsigned int old_flags = dev->flags;
6961
6962		dev->gflags ^= IFF_PROMISC;
6963
6964		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6965			if (dev->flags != old_flags)
6966				dev_set_rx_mode(dev);
6967	}
6968
6969	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6970	 * is important. Some (broken) drivers set IFF_PROMISC when
6971	 * IFF_ALLMULTI is requested, without asking us and without reporting it.
6972	 */
6973	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6974		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6975
6976		dev->gflags ^= IFF_ALLMULTI;
6977		__dev_set_allmulti(dev, inc, false);
6978	}
6979
6980	return ret;
6981}
6982
6983void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6984			unsigned int gchanges)
6985{
6986	unsigned int changes = dev->flags ^ old_flags;
6987
6988	if (gchanges)
6989		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6990
6991	if (changes & IFF_UP) {
6992		if (dev->flags & IFF_UP)
6993			call_netdevice_notifiers(NETDEV_UP, dev);
6994		else
6995			call_netdevice_notifiers(NETDEV_DOWN, dev);
6996	}
6997
6998	if (dev->flags & IFF_UP &&
6999	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
7000		struct netdev_notifier_change_info change_info = {
7001			.info = {
7002				.dev = dev,
7003			},
7004			.flags_changed = changes,
7005		};
7006
7007		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
7008	}
7009}
7010
7011/**
7012 *	dev_change_flags - change device settings
7013 *	@dev: device
7014 *	@flags: device state flags
7015 *
7016 *	Change settings on device based on state flags. The flags are
7017 *	in the userspace exported format.
7018 */
7019int dev_change_flags(struct net_device *dev, unsigned int flags)
7020{
7021	int ret;
7022	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
7023
7024	ret = __dev_change_flags(dev, flags);
7025	if (ret < 0)
7026		return ret;
7027
7028	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
7029	__dev_notify_flags(dev, old_flags, changes);
7030	return ret;
7031}
7032EXPORT_SYMBOL(dev_change_flags);
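/*
 * A minimal usage sketch (not part of this file): bringing an interface
 * administratively up from kernel code, much as an SIOCSIFFLAGS ioctl would,
 * by setting IFF_UP in the userspace-format flags under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *	rtnl_unlock();
 */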
7033
7034int __dev_set_mtu(struct net_device *dev, int new_mtu)
7035{
7036	const struct net_device_ops *ops = dev->netdev_ops;
7037
7038	if (ops->ndo_change_mtu)
7039		return ops->ndo_change_mtu(dev, new_mtu);
7040
7041	dev->mtu = new_mtu;
7042	return 0;
7043}
7044EXPORT_SYMBOL(__dev_set_mtu);
7045
7046/**
7047 *	dev_set_mtu - Change maximum transfer unit
7048 *	@dev: device
7049 *	@new_mtu: new transfer unit
7050 *
7051 *	Change the maximum transfer size of the network device.
7052 */
7053int dev_set_mtu(struct net_device *dev, int new_mtu)
7054{
7055	int err, orig_mtu;
7056
7057	if (new_mtu == dev->mtu)
7058		return 0;
7059
7060	/* MTU must be positive, and in range */
7061	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
7062		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
7063				    dev->name, new_mtu, dev->min_mtu);
7064		return -EINVAL;
7065	}
7066
7067	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
7068		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
7069				    dev->name, new_mtu, dev->max_mtu);
7070		return -EINVAL;
7071	}
7072
7073	if (!netif_device_present(dev))
7074		return -ENODEV;
7075
7076	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
7077	err = notifier_to_errno(err);
7078	if (err)
7079		return err;
7080
7081	orig_mtu = dev->mtu;
7082	err = __dev_set_mtu(dev, new_mtu);
7083
7084	if (!err) {
7085		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
7086		err = notifier_to_errno(err);
7087		if (err) {
7088			/* setting mtu back and notifying everyone again,
7089			 * so that they have a chance to revert changes.
7090			 */
7091			__dev_set_mtu(dev, orig_mtu);
7092			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
7093		}
7094	}
7095	return err;
7096}
7097EXPORT_SYMBOL(dev_set_mtu);
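/*
 * A minimal usage sketch (not part of this file): switching a device to a
 * jumbo MTU under RTNL.  The value is only accepted if it falls within the
 * device's advertised min_mtu/max_mtu range and no notifier vetoes it.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	if (err)
 *		netdev_warn(dev, "failed to set jumbo MTU: %d\n", err);
 *	rtnl_unlock();
 */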
7098
7099/**
7100 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
7101 *	@dev: device
7102 *	@new_len: new tx queue length
7103 */
7104int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
7105{
7106	unsigned int orig_len = dev->tx_queue_len;
7107	int res;
7108
7109	if (new_len != (unsigned int)new_len)
7110		return -ERANGE;
7111
7112	if (new_len != orig_len) {
7113		dev->tx_queue_len = new_len;
7114		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
7115		res = notifier_to_errno(res);
7116		if (res) {
7117			netdev_err(dev,
7118				   "refused to change device tx_queue_len\n");
7119			dev->tx_queue_len = orig_len;
7120			return res;
7121		}
7122		return dev_qdisc_change_tx_queue_len(dev);
7123	}
7124
7125	return 0;
7126}
7127
7128/**
7129 *	dev_set_group - Change group this device belongs to
7130 *	@dev: device
7131 *	@new_group: group this device should belong to
7132 */
7133void dev_set_group(struct net_device *dev, int new_group)
7134{
7135	dev->group = new_group;
7136}
7137EXPORT_SYMBOL(dev_set_group);
7138
7139/**
7140 *	dev_set_mac_address - Change Media Access Control Address
7141 *	@dev: device
7142 *	@sa: new address
7143 *
7144 *	Change the hardware (MAC) address of the device
7145 */
7146int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
7147{
7148	const struct net_device_ops *ops = dev->netdev_ops;
7149	int err;
7150
7151	if (!ops->ndo_set_mac_address)
7152		return -EOPNOTSUPP;
7153	if (sa->sa_family != dev->type)
7154		return -EINVAL;
7155	if (!netif_device_present(dev))
7156		return -ENODEV;
7157	err = ops->ndo_set_mac_address(dev, sa);
7158	if (err)
7159		return err;
7160	dev->addr_assign_type = NET_ADDR_SET;
7161	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7162	add_device_randomness(dev->dev_addr, dev->addr_len);
7163	return 0;
7164}
7165EXPORT_SYMBOL(dev_set_mac_address);
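/*
 * A minimal usage sketch (not part of this file): programming a new hardware
 * address on an Ethernet device.  "new_mac" is an assumed ETH_ALEN byte
 * array; sa_family must match dev->type or -EINVAL is returned.
 *
 *	struct sockaddr sa;
 *	int err;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */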
7166
7167/**
7168 *	dev_change_carrier - Change device carrier
7169 *	@dev: device
7170 *	@new_carrier: new value
7171 *
7172 *	Change device carrier
7173 */
7174int dev_change_carrier(struct net_device *dev, bool new_carrier)
7175{
7176	const struct net_device_ops *ops = dev->netdev_ops;
7177
7178	if (!ops->ndo_change_carrier)
7179		return -EOPNOTSUPP;
7180	if (!netif_device_present(dev))
7181		return -ENODEV;
7182	return ops->ndo_change_carrier(dev, new_carrier);
7183}
7184EXPORT_SYMBOL(dev_change_carrier);
7185
7186/**
7187 *	dev_get_phys_port_id - Get device physical port ID
7188 *	@dev: device
7189 *	@ppid: port ID
7190 *
7191 *	Get device physical port ID
7192 */
7193int dev_get_phys_port_id(struct net_device *dev,
7194			 struct netdev_phys_item_id *ppid)
7195{
7196	const struct net_device_ops *ops = dev->netdev_ops;
7197
7198	if (!ops->ndo_get_phys_port_id)
7199		return -EOPNOTSUPP;
7200	return ops->ndo_get_phys_port_id(dev, ppid);
7201}
7202EXPORT_SYMBOL(dev_get_phys_port_id);
7203
7204/**
7205 *	dev_get_phys_port_name - Get device physical port name
7206 *	@dev: device
7207 *	@name: port name
7208 *	@len: limit of bytes to copy to name
7209 *
7210 *	Get device physical port name
7211 */
7212int dev_get_phys_port_name(struct net_device *dev,
7213			   char *name, size_t len)
7214{
7215	const struct net_device_ops *ops = dev->netdev_ops;
7216
7217	if (!ops->ndo_get_phys_port_name)
7218		return -EOPNOTSUPP;
7219	return ops->ndo_get_phys_port_name(dev, name, len);
7220}
7221EXPORT_SYMBOL(dev_get_phys_port_name);
7222
7223/**
7224 *	dev_change_proto_down - update protocol port state information
7225 *	@dev: device
7226 *	@proto_down: new value
7227 *
7228 *	This info can be used by switch drivers to set the phys state of the
7229 *	port.
7230 */
7231int dev_change_proto_down(struct net_device *dev, bool proto_down)
7232{
7233	const struct net_device_ops *ops = dev->netdev_ops;
7234
7235	if (!ops->ndo_change_proto_down)
7236		return -EOPNOTSUPP;
7237	if (!netif_device_present(dev))
7238		return -ENODEV;
7239	return ops->ndo_change_proto_down(dev, proto_down);
7240}
7241EXPORT_SYMBOL(dev_change_proto_down);
7242
7243void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
7244		     struct netdev_bpf *xdp)
7245{
7246	memset(xdp, 0, sizeof(*xdp));
7247	xdp->command = XDP_QUERY_PROG;
7248
7249	/* Query must always succeed. */
7250	WARN_ON(bpf_op(dev, xdp) < 0);
7251}
7252
7253static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
7254{
7255	struct netdev_bpf xdp;
7256
7257	__dev_xdp_query(dev, bpf_op, &xdp);
7258
7259	return xdp.prog_attached;
7260}
7261
7262static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
7263			   struct netlink_ext_ack *extack, u32 flags,
7264			   struct bpf_prog *prog)
7265{
7266	struct netdev_bpf xdp;
7267
7268	memset(&xdp, 0, sizeof(xdp));
7269	if (flags & XDP_FLAGS_HW_MODE)
7270		xdp.command = XDP_SETUP_PROG_HW;
7271	else
7272		xdp.command = XDP_SETUP_PROG;
7273	xdp.extack = extack;
7274	xdp.flags = flags;
7275	xdp.prog = prog;
7276
7277	return bpf_op(dev, &xdp);
7278}
7279
7280static void dev_xdp_uninstall(struct net_device *dev)
7281{
7282	struct netdev_bpf xdp;
7283	bpf_op_t ndo_bpf;
7284
7285	/* Remove generic XDP */
7286	WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
7287
7288	/* Remove from the driver */
7289	ndo_bpf = dev->netdev_ops->ndo_bpf;
7290	if (!ndo_bpf)
7291		return;
7292
7293	__dev_xdp_query(dev, ndo_bpf, &xdp);
7294	if (xdp.prog_attached == XDP_ATTACHED_NONE)
7295		return;
7296
7297	/* Program removal should always succeed */
7298	WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
7299}
7300
7301/**
7302 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
7303 *	@dev: device
7304 *	@extack: netlink extended ack
7305 *	@fd: new program fd or negative value to clear
7306 *	@flags: xdp-related flags
7307 *
7308 *	Set or clear a bpf program for a device
7309 */
7310int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
7311		      int fd, u32 flags)
7312{
7313	const struct net_device_ops *ops = dev->netdev_ops;
7314	struct bpf_prog *prog = NULL;
7315	bpf_op_t bpf_op, bpf_chk;
7316	int err;
7317
7318	ASSERT_RTNL();
7319
7320	bpf_op = bpf_chk = ops->ndo_bpf;
7321	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
7322		return -EOPNOTSUPP;
7323	if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
7324		bpf_op = generic_xdp_install;
7325	if (bpf_op == bpf_chk)
7326		bpf_chk = generic_xdp_install;
7327
7328	if (fd >= 0) {
7329		if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
7330			return -EEXIST;
7331		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
7332		    __dev_xdp_attached(dev, bpf_op))
7333			return -EBUSY;
7334
7335		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
7336					     bpf_op == ops->ndo_bpf);
7337		if (IS_ERR(prog))
7338			return PTR_ERR(prog);
7339
7340		if (!(flags & XDP_FLAGS_HW_MODE) &&
7341		    bpf_prog_is_dev_bound(prog->aux)) {
7342			NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
7343			bpf_prog_put(prog);
7344			return -EINVAL;
7345		}
7346	}
7347
7348	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
7349	if (err < 0 && prog)
7350		bpf_prog_put(prog);
7351
7352	return err;
7353}
7354
7355/**
7356 *	dev_new_index	-	allocate an ifindex
7357 *	@net: the applicable net namespace
7358 *
7359 *	Returns a suitable unique value for a new device interface
7360 *	number.  The caller must hold the rtnl semaphore or the
7361 *	dev_base_lock to be sure it remains unique.
7362 */
7363static int dev_new_index(struct net *net)
7364{
7365	int ifindex = net->ifindex;
7366
7367	for (;;) {
7368		if (++ifindex <= 0)
7369			ifindex = 1;
7370		if (!__dev_get_by_index(net, ifindex))
7371			return net->ifindex = ifindex;
7372	}
7373}
7374
7375/* Delayed registration/unregisteration */
7376static LIST_HEAD(net_todo_list);
7377DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
7378
7379static void net_set_todo(struct net_device *dev)
7380{
7381	list_add_tail(&dev->todo_list, &net_todo_list);
7382	dev_net(dev)->dev_unreg_count++;
7383}
7384
7385static void rollback_registered_many(struct list_head *head)
7386{
7387	struct net_device *dev, *tmp;
7388	LIST_HEAD(close_head);
7389
7390	BUG_ON(dev_boot_phase);
7391	ASSERT_RTNL();
7392
7393	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
7394		/* Some devices call this without ever having registered, as
7395		 * part of their initialization unwind. Remove those devices
7396		 * and proceed with the remaining ones.
7397		 */
7398		if (dev->reg_state == NETREG_UNINITIALIZED) {
7399			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
7400				 dev->name, dev);
7401
7402			WARN_ON(1);
7403			list_del(&dev->unreg_list);
7404			continue;
7405		}
7406		dev->dismantle = true;
7407		BUG_ON(dev->reg_state != NETREG_REGISTERED);
7408	}
7409
7410	/* If device is running, close it first. */
7411	list_for_each_entry(dev, head, unreg_list)
7412		list_add_tail(&dev->close_list, &close_head);
7413	dev_close_many(&close_head, true);
7414
7415	list_for_each_entry(dev, head, unreg_list) {
7416		/* And unlink it from device chain. */
7417		unlist_netdevice(dev);
7418
7419		dev->reg_state = NETREG_UNREGISTERING;
7420	}
7421	flush_all_backlogs();
7422
7423	synchronize_net();
7424
7425	list_for_each_entry(dev, head, unreg_list) {
7426		struct sk_buff *skb = NULL;
7427
7428		/* Shutdown queueing discipline. */
7429		dev_shutdown(dev);
7430
7431		dev_xdp_uninstall(dev);
7432
7433		/* Notify protocols, that we are about to destroy
7434		 * this device. They should clean all the things.
7435		 */
7436		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7437
7438		if (!dev->rtnl_link_ops ||
7439		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7440			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
7441						     GFP_KERNEL, NULL, 0);
7442
7443		/*
7444		 *	Flush the unicast and multicast chains
7445		 */
7446		dev_uc_flush(dev);
7447		dev_mc_flush(dev);
7448
7449		if (dev->netdev_ops->ndo_uninit)
7450			dev->netdev_ops->ndo_uninit(dev);
7451
7452		if (skb)
7453			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
7454
7455		/* Notifier chain MUST detach all upper devices from us. */
7456		WARN_ON(netdev_has_any_upper_dev(dev));
7457		WARN_ON(netdev_has_any_lower_dev(dev));
7458
7459		/* Remove entries from kobject tree */
7460		netdev_unregister_kobject(dev);
7461#ifdef CONFIG_XPS
7462		/* Remove XPS queueing entries */
7463		netif_reset_xps_queues_gt(dev, 0);
7464#endif
7465	}
7466
7467	synchronize_net();
7468
7469	list_for_each_entry(dev, head, unreg_list)
7470		dev_put(dev);
7471}
7472
7473static void rollback_registered(struct net_device *dev)
7474{
7475	LIST_HEAD(single);
7476
7477	list_add(&dev->unreg_list, &single);
7478	rollback_registered_many(&single);
7479	list_del(&single);
7480}
7481
7482static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
7483	struct net_device *upper, netdev_features_t features)
7484{
7485	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
7486	netdev_features_t feature;
7487	int feature_bit;
7488
7489	for_each_netdev_feature(&upper_disables, feature_bit) {
7490		feature = __NETIF_F_BIT(feature_bit);
7491		if (!(upper->wanted_features & feature)
7492		    && (features & feature)) {
7493			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
7494				   &feature, upper->name);
7495			features &= ~feature;
7496		}
7497	}
7498
7499	return features;
7500}
7501
7502static void netdev_sync_lower_features(struct net_device *upper,
7503	struct net_device *lower, netdev_features_t features)
7504{
7505	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
7506	netdev_features_t feature;
7507	int feature_bit;
7508
7509	for_each_netdev_feature(&upper_disables, feature_bit) {
7510		feature = __NETIF_F_BIT(feature_bit);
7511		if (!(features & feature) && (lower->features & feature)) {
7512			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
7513				   &feature, lower->name);
7514			lower->wanted_features &= ~feature;
7515			netdev_update_features(lower);
7516
7517			if (unlikely(lower->features & feature))
7518				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
7519					    &feature, lower->name);
7520		}
7521	}
7522}
7523
7524static netdev_features_t netdev_fix_features(struct net_device *dev,
7525	netdev_features_t features)
7526{
7527	/* Fix illegal checksum combinations */
7528	if ((features & NETIF_F_HW_CSUM) &&
7529	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7530		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7531		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7532	}
7533
7534	/* TSO requires that SG is present as well. */
7535	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7536		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7537		features &= ~NETIF_F_ALL_TSO;
7538	}
7539
7540	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7541					!(features & NETIF_F_IP_CSUM)) {
7542		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7543		features &= ~NETIF_F_TSO;
7544		features &= ~NETIF_F_TSO_ECN;
7545	}
7546
7547	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7548					 !(features & NETIF_F_IPV6_CSUM)) {
7549		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7550		features &= ~NETIF_F_TSO6;
7551	}
7552
7553	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7554	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7555		features &= ~NETIF_F_TSO_MANGLEID;
7556
7557	/* TSO ECN requires that TSO is present as well. */
7558	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7559		features &= ~NETIF_F_TSO_ECN;
7560
7561	/* Software GSO depends on SG. */
7562	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7563		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7564		features &= ~NETIF_F_GSO;
7565	}
7566
7567	/* GSO partial features require GSO partial be set */
7568	if ((features & dev->gso_partial_features) &&
7569	    !(features & NETIF_F_GSO_PARTIAL)) {
7570		netdev_dbg(dev,
7571			   "Dropping partially supported GSO features since no GSO partial.\n");
7572		features &= ~dev->gso_partial_features;
7573	}
7574
7575	if (!(features & NETIF_F_RXCSUM)) {
7576		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
7577		 * successfully merged by hardware must also have the
7578		 * checksum verified by hardware.  If the user does not
7579		 * want to enable RXCSUM, logically, we should disable GRO_HW.
7580		 */
7581		if (features & NETIF_F_GRO_HW) {
7582			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
7583			features &= ~NETIF_F_GRO_HW;
7584		}
7585	}
7586
7587	/* LRO/HW-GRO features cannot be combined with RX-FCS */
7588	if (features & NETIF_F_RXFCS) {
7589		if (features & NETIF_F_LRO) {
7590			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
7591			features &= ~NETIF_F_LRO;
7592		}
7593
7594		if (features & NETIF_F_GRO_HW) {
7595			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
7596			features &= ~NETIF_F_GRO_HW;
7597		}
7598	}
7599
7600	return features;
7601}
7602
7603int __netdev_update_features(struct net_device *dev)
7604{
7605	struct net_device *upper, *lower;
7606	netdev_features_t features;
7607	struct list_head *iter;
7608	int err = -1;
7609
7610	ASSERT_RTNL();
7611
7612	features = netdev_get_wanted_features(dev);
7613
7614	if (dev->netdev_ops->ndo_fix_features)
7615		features = dev->netdev_ops->ndo_fix_features(dev, features);
7616
7617	/* driver might be less strict about feature dependencies */
7618	features = netdev_fix_features(dev, features);
7619
7620	/* some features can't be enabled if they're off on an upper device */
7621	netdev_for_each_upper_dev_rcu(dev, upper, iter)
7622		features = netdev_sync_upper_features(dev, upper, features);
7623
7624	if (dev->features == features)
7625		goto sync_lower;
7626
7627	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7628		&dev->features, &features);
7629
7630	if (dev->netdev_ops->ndo_set_features)
7631		err = dev->netdev_ops->ndo_set_features(dev, features);
7632	else
7633		err = 0;
7634
7635	if (unlikely(err < 0)) {
7636		netdev_err(dev,
7637			"set_features() failed (%d); wanted %pNF, left %pNF\n",
7638			err, &features, &dev->features);
7639		/* return non-0 since some features might have changed and
7640		 * it's better to fire a spurious notification than miss it
7641		 */
7642		return -1;
7643	}
7644
7645sync_lower:
7646	/* some features must be disabled on lower devices when disabled
7647	 * on an upper device (think: bonding master or bridge)
7648	 */
7649	netdev_for_each_lower_dev(dev, lower, iter)
7650		netdev_sync_lower_features(dev, lower, features);
7651
7652	if (!err) {
7653		netdev_features_t diff = features ^ dev->features;
7654
7655		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
7656			/* udp_tunnel_{get,drop}_rx_info both need
7657			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
7658			 * device, or they won't do anything.
7659			 * Thus we need to update dev->features
7660			 * *before* calling udp_tunnel_get_rx_info,
7661			 * but *after* calling udp_tunnel_drop_rx_info.
7662			 */
7663			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
7664				dev->features = features;
7665				udp_tunnel_get_rx_info(dev);
7666			} else {
7667				udp_tunnel_drop_rx_info(dev);
7668			}
7669		}
7670
7671		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
7672			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
7673				dev->features = features;
7674				err |= vlan_get_rx_ctag_filter_info(dev);
7675			} else {
7676				vlan_drop_rx_ctag_filter_info(dev);
7677			}
7678		}
7679
7680		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
7681			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
7682				dev->features = features;
7683				err |= vlan_get_rx_stag_filter_info(dev);
7684			} else {
7685				vlan_drop_rx_stag_filter_info(dev);
7686			}
7687		}
7688
7689		dev->features = features;
7690	}
7691
7692	return err < 0 ? 0 : 1;
7693}
7694
7695/**
7696 *	netdev_update_features - recalculate device features
7697 *	@dev: the device to check
7698 *
7699 *	Recalculate dev->features set and send notifications if it
7700 *	has changed. Should be called after driver or hardware dependent
7701 *	conditions might have changed that influence the features.
7702 */
7703void netdev_update_features(struct net_device *dev)
7704{
7705	if (__netdev_update_features(dev))
7706		netdev_features_change(dev);
7707}
7708EXPORT_SYMBOL(netdev_update_features);
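/*
 * A minimal usage sketch (not part of this file): a driver that discovers at
 * runtime that a hardware offload has become unusable (say, after a firmware
 * event) could clear the corresponding capability bits under RTNL and let the
 * core recompute and advertise the resulting feature set.  The exact bits a
 * driver must touch depend on how it manages its features.
 *
 *	rtnl_lock();
 *	dev->hw_features &= ~NETIF_F_RXCSUM;
 *	dev->features &= ~NETIF_F_RXCSUM;
 *	dev->wanted_features &= ~NETIF_F_RXCSUM;
 *	netdev_update_features(dev);
 *	rtnl_unlock();
 */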
7709
7710/**
7711 *	netdev_change_features - recalculate device features
7712 *	@dev: the device to check
7713 *
7714 *	Recalculate dev->features set and send notifications even
7715 *	if they have not changed. Should be called instead of
7716 *	netdev_update_features() if also dev->vlan_features might
7717 *	have changed to allow the changes to be propagated to stacked
7718 *	VLAN devices.
7719 */
7720void netdev_change_features(struct net_device *dev)
7721{
7722	__netdev_update_features(dev);
7723	netdev_features_change(dev);
7724}
7725EXPORT_SYMBOL(netdev_change_features);
7726
7727/**
7728 *	netif_stacked_transfer_operstate -	transfer operstate
7729 *	@rootdev: the root or lower level device to transfer state from
7730 *	@dev: the device to transfer operstate to
7731 *
7732 *	Transfer operational state from root to device. This is normally
7733 *	called when a stacking relationship exists between the root
7734 *	device and the device (a leaf device).
7735 */
7736void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7737					struct net_device *dev)
7738{
7739	if (rootdev->operstate == IF_OPER_DORMANT)
7740		netif_dormant_on(dev);
7741	else
7742		netif_dormant_off(dev);
7743
7744	if (netif_carrier_ok(rootdev))
7745		netif_carrier_on(dev);
7746	else
7747		netif_carrier_off(dev);
7748}
7749EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7750
7751static int netif_alloc_rx_queues(struct net_device *dev)
7752{
7753	unsigned int i, count = dev->num_rx_queues;
7754	struct netdev_rx_queue *rx;
7755	size_t sz = count * sizeof(*rx);
7756	int err = 0;
7757
7758	BUG_ON(count < 1);
7759
7760	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
7761	if (!rx)
7762		return -ENOMEM;
7763
7764	dev->_rx = rx;
7765
7766	for (i = 0; i < count; i++) {
7767		rx[i].dev = dev;
7768
7769		/* XDP RX-queue setup */
7770		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
7771		if (err < 0)
7772			goto err_rxq_info;
7773	}
7774	return 0;
7775
7776err_rxq_info:
7777	/* Rollback successful reg's and free other resources */
7778	while (i--)
7779		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
7780	kvfree(dev->_rx);
7781	dev->_rx = NULL;
7782	return err;
7783}
7784
7785static void netif_free_rx_queues(struct net_device *dev)
7786{
7787	unsigned int i, count = dev->num_rx_queues;
7788
7789	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
7790	if (!dev->_rx)
7791		return;
7792
7793	for (i = 0; i < count; i++)
7794		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
7795
7796	kvfree(dev->_rx);
7797}
7798
7799static void netdev_init_one_queue(struct net_device *dev,
7800				  struct netdev_queue *queue, void *_unused)
7801{
7802	/* Initialize queue lock */
7803	spin_lock_init(&queue->_xmit_lock);
7804	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7805	queue->xmit_lock_owner = -1;
7806	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7807	queue->dev = dev;
7808#ifdef CONFIG_BQL
7809	dql_init(&queue->dql, HZ);
7810#endif
7811}
7812
7813static void netif_free_tx_queues(struct net_device *dev)
7814{
7815	kvfree(dev->_tx);
7816}
7817
7818static int netif_alloc_netdev_queues(struct net_device *dev)
7819{
7820	unsigned int count = dev->num_tx_queues;
7821	struct netdev_queue *tx;
7822	size_t sz = count * sizeof(*tx);
7823
7824	if (count < 1 || count > 0xffff)
7825		return -EINVAL;
7826
7827	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
7828	if (!tx)
7829		return -ENOMEM;
7830
7831	dev->_tx = tx;
7832
7833	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7834	spin_lock_init(&dev->tx_global_lock);
7835
7836	return 0;
7837}
7838
7839void netif_tx_stop_all_queues(struct net_device *dev)
7840{
7841	unsigned int i;
7842
7843	for (i = 0; i < dev->num_tx_queues; i++) {
7844		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7845
7846		netif_tx_stop_queue(txq);
7847	}
7848}
7849EXPORT_SYMBOL(netif_tx_stop_all_queues);
7850
7851/**
7852 *	register_netdevice	- register a network device
7853 *	@dev: device to register
7854 *
7855 *	Take a completed network device structure and add it to the kernel
7856 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7857 *	chain. 0 is returned on success. A negative errno code is returned
7858 *	on a failure to set up the device, or if the name is a duplicate.
7859 *
7860 *	Callers must hold the rtnl semaphore. You may want
7861 *	register_netdev() instead of this.
7862 *
7863 *	BUGS:
7864 *	The locking appears insufficient to guarantee two parallel registers
7865 *	will not get the same name.
7866 */
7867
7868int register_netdevice(struct net_device *dev)
7869{
7870	int ret;
7871	struct net *net = dev_net(dev);
7872
7873	BUG_ON(dev_boot_phase);
7874	ASSERT_RTNL();
7875
7876	might_sleep();
7877
7878	/* When net_device's are persistent, this will be fatal. */
7879	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7880	BUG_ON(!net);
7881
7882	spin_lock_init(&dev->addr_list_lock);
7883	netdev_set_addr_lockdep_class(dev);
7884
7885	ret = dev_get_valid_name(net, dev, dev->name);
7886	if (ret < 0)
7887		goto out;
7888
7889	/* Init, if this function is available */
7890	if (dev->netdev_ops->ndo_init) {
7891		ret = dev->netdev_ops->ndo_init(dev);
7892		if (ret) {
7893			if (ret > 0)
7894				ret = -EIO;
7895			goto out;
7896		}
7897	}
7898
7899	if (((dev->hw_features | dev->features) &
7900	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7901	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7902	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7903		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7904		ret = -EINVAL;
7905		goto err_uninit;
7906	}
7907
7908	ret = -EBUSY;
7909	if (!dev->ifindex)
7910		dev->ifindex = dev_new_index(net);
7911	else if (__dev_get_by_index(net, dev->ifindex))
7912		goto err_uninit;
7913
7914	/* Transfer changeable features to wanted_features and enable
7915	 * software offloads (GSO and GRO).
7916	 */
7917	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7918	dev->features |= NETIF_F_SOFT_FEATURES;
7919
7920	if (dev->netdev_ops->ndo_udp_tunnel_add) {
7921		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
7922		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
7923	}
7924
7925	dev->wanted_features = dev->features & dev->hw_features;
7926
7927	if (!(dev->flags & IFF_LOOPBACK))
7928		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7929
7930	/* If IPv4 TCP segmentation offload is supported we should also
7931	 * allow the device to enable segmenting the frame with the option
7932	 * of ignoring a static IP ID value.  This doesn't enable the
7933	 * feature itself but allows the user to enable it later.
7934	 */
7935	if (dev->hw_features & NETIF_F_TSO)
7936		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7937	if (dev->vlan_features & NETIF_F_TSO)
7938		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7939	if (dev->mpls_features & NETIF_F_TSO)
7940		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7941	if (dev->hw_enc_features & NETIF_F_TSO)
7942		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7943
7944	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7945	 */
7946	dev->vlan_features |= NETIF_F_HIGHDMA;
7947
7948	/* Make NETIF_F_SG inheritable to tunnel devices.
7949	 */
7950	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7951
7952	/* Make NETIF_F_SG inheritable to MPLS.
7953	 */
7954	dev->mpls_features |= NETIF_F_SG;
7955
7956	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7957	ret = notifier_to_errno(ret);
7958	if (ret)
7959		goto err_uninit;
7960
7961	ret = netdev_register_kobject(dev);
7962	if (ret)
7963		goto err_uninit;
7964	dev->reg_state = NETREG_REGISTERED;
7965
7966	__netdev_update_features(dev);
7967
7968	/*
7969	 *	Default initial state at registration is that the
7970	 *	device is present.
7971	 */
7972
7973	set_bit(__LINK_STATE_PRESENT, &dev->state);
7974
7975	linkwatch_init_dev(dev);
7976
7977	dev_init_scheduler(dev);
7978	dev_hold(dev);
7979	list_netdevice(dev);
7980	add_device_randomness(dev->dev_addr, dev->addr_len);
7981
7982	/* If the device has permanent device address, driver should
7983	 * set dev_addr and also addr_assign_type should be set to
7984	 * NET_ADDR_PERM (default value).
7985	 */
7986	if (dev->addr_assign_type == NET_ADDR_PERM)
7987		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7988
7989	/* Notify protocols, that a new device appeared. */
7990	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7991	ret = notifier_to_errno(ret);
7992	if (ret) {
7993		rollback_registered(dev);
7994		dev->reg_state = NETREG_UNREGISTERED;
7995	}
7996	/*
7997	 *	Prevent userspace races by waiting until the network
7998	 *	device is fully setup before sending notifications.
7999	 */
8000	if (!dev->rtnl_link_ops ||
8001	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8002		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8003
8004out:
8005	return ret;
8006
8007err_uninit:
8008	if (dev->netdev_ops->ndo_uninit)
8009		dev->netdev_ops->ndo_uninit(dev);
8010	if (dev->priv_destructor)
8011		dev->priv_destructor(dev);
8012	goto out;
8013}
8014EXPORT_SYMBOL(register_netdevice);
8015
8016/**
8017 *	init_dummy_netdev	- init a dummy network device for NAPI
8018 *	@dev: device to init
8019 *
8020 *	This takes a network device structure and initializes the minimum
8021 *	number of fields so it can be used to schedule NAPI polls without
8022 *	registering a full blown interface. This is to be used by drivers
8023 *	that need to tie several hardware interfaces to a single NAPI
8024 *	poll scheduler due to HW limitations.
8025 */
8026int init_dummy_netdev(struct net_device *dev)
8027{
8028	/* Clear everything. Note we don't initialize spinlocks
8029	 * as they aren't supposed to be taken by any of the
8030	 * NAPI code and this dummy netdev is supposed to be
8031	 * only ever used for NAPI polls
8032	 */
8033	memset(dev, 0, sizeof(struct net_device));
8034
8035	/* make sure we BUG if trying to hit standard
8036	 * register/unregister code path
8037	 */
8038	dev->reg_state = NETREG_DUMMY;
8039
8040	/* NAPI wants this */
8041	INIT_LIST_HEAD(&dev->napi_list);
8042
8043	/* a dummy interface is started by default */
8044	set_bit(__LINK_STATE_PRESENT, &dev->state);
8045	set_bit(__LINK_STATE_START, &dev->state);
8046
8047	/* Note : We dont allocate pcpu_refcnt for dummy devices,
8048	 * because users of this 'device' don't need to change
8049	 * its refcount.
8050	 */
8051
8052	return 0;
8053}
8054EXPORT_SYMBOL_GPL(init_dummy_netdev);
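/*
 * A minimal usage sketch (not part of this file): a driver that multiplexes
 * several hardware queues onto one NAPI context can embed a dummy netdev
 * purely as an anchor for its napi_struct.  "foo_adapter", "foo_setup_napi"
 * and "foo_poll" are assumed, illustrative names.
 *
 *	struct foo_adapter {
 *		struct net_device napi_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	static int foo_setup_napi(struct foo_adapter *adapter)
 *	{
 *		init_dummy_netdev(&adapter->napi_dev);
 *		netif_napi_add(&adapter->napi_dev, &adapter->napi, foo_poll,
 *			       NAPI_POLL_WEIGHT);
 *		return 0;
 *	}
 */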
8055
8056
8057/**
8058 *	register_netdev	- register a network device
8059 *	@dev: device to register
8060 *
8061 *	Take a completed network device structure and add it to the kernel
8062 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8063 *	chain. 0 is returned on success. A negative errno code is returned
8064 *	on a failure to set up the device, or if the name is a duplicate.
8065 *
8066 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
8067 *	and expands the device name if you passed a format string to
8068 *	alloc_netdev.
8069 */
8070int register_netdev(struct net_device *dev)
8071{
8072	int err;
8073
8074	if (rtnl_lock_killable())
8075		return -EINTR;
8076	err = register_netdevice(dev);
8077	rtnl_unlock();
8078	return err;
8079}
8080EXPORT_SYMBOL(register_netdev);
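/*
 * A minimal usage sketch (not part of this file): the common probe-time
 * pattern of allocating an Ethernet device, wiring up its ops and then
 * publishing it.  "foo_priv" and "foo_netdev_ops" are assumed names; on
 * registration failure the half-initialized device must still be freed.
 *
 *	struct net_device *dev;
 *	int err;
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *
 *	dev->netdev_ops = &foo_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */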
8081
8082int netdev_refcnt_read(const struct net_device *dev)
8083{
8084	int i, refcnt = 0;
8085
8086	for_each_possible_cpu(i)
8087		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
8088	return refcnt;
8089}
8090EXPORT_SYMBOL(netdev_refcnt_read);
8091
8092/**
8093 * netdev_wait_allrefs - wait until all references are gone.
8094 * @dev: target net_device
8095 *
8096 * This is called when unregistering network devices.
8097 *
8098 * Any protocol or device that holds a reference should register
8099 * for netdevice notification, and clean up and put back the
8100 * reference if they receive an UNREGISTER event.
8101 * We can get stuck here if buggy protocols don't correctly
8102 * call dev_put.
8103 */
8104static void netdev_wait_allrefs(struct net_device *dev)
8105{
8106	unsigned long rebroadcast_time, warning_time;
8107	int refcnt;
8108
8109	linkwatch_forget_dev(dev);
8110
8111	rebroadcast_time = warning_time = jiffies;
8112	refcnt = netdev_refcnt_read(dev);
8113
8114	while (refcnt != 0) {
8115		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
8116			rtnl_lock();
8117
8118			/* Rebroadcast unregister notification */
8119			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8120
8121			__rtnl_unlock();
8122			rcu_barrier();
8123			rtnl_lock();
8124
8125			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
8126				     &dev->state)) {
8127				/* We must not have linkwatch events
8128				 * pending on unregister. If this
8129				 * happens, we simply run the queue
8130				 * unscheduled, resulting in a noop
8131				 * for this device.
8132				 */
8133				linkwatch_run_queue();
8134			}
8135
8136			__rtnl_unlock();
8137
8138			rebroadcast_time = jiffies;
8139		}
8140
8141		msleep(250);
8142
8143		refcnt = netdev_refcnt_read(dev);
8144
8145		if (time_after(jiffies, warning_time + 10 * HZ)) {
8146			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
8147				 dev->name, refcnt);
8148			warning_time = jiffies;
8149		}
8150	}
8151}
8152
8153/* The sequence is:
8154 *
8155 *	rtnl_lock();
8156 *	...
8157 *	register_netdevice(x1);
8158 *	register_netdevice(x2);
8159 *	...
8160 *	unregister_netdevice(y1);
8161 *	unregister_netdevice(y2);
8162 *      ...
8163 *	rtnl_unlock();
8164 *	free_netdev(y1);
8165 *	free_netdev(y2);
8166 *
8167 * We are invoked by rtnl_unlock().
8168 * This allows us to deal with problems:
8169 * 1) We can delete sysfs objects which invoke hotplug
8170 *    without deadlocking with linkwatch via keventd.
8171 * 2) Since we run with the RTNL semaphore not held, we can sleep
8172 *    safely in order to wait for the netdev refcnt to drop to zero.
8173 *
8174 * We must not return until all unregister events added during
8175 * the interval the lock was held have been completed.
8176 */
8177void netdev_run_todo(void)
8178{
8179	struct list_head list;
8180
8181	/* Snapshot list, allow later requests */
8182	list_replace_init(&net_todo_list, &list);
8183
8184	__rtnl_unlock();
8185
8186
8187	/* Wait for rcu callbacks to finish before next phase */
8188	if (!list_empty(&list))
8189		rcu_barrier();
8190
8191	while (!list_empty(&list)) {
8192		struct net_device *dev
8193			= list_first_entry(&list, struct net_device, todo_list);
8194		list_del(&dev->todo_list);
8195
8196		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
8197			pr_err("network todo '%s' but state %d\n",
8198			       dev->name, dev->reg_state);
8199			dump_stack();
8200			continue;
8201		}
8202
8203		dev->reg_state = NETREG_UNREGISTERED;
8204
8205		netdev_wait_allrefs(dev);
8206
8207		/* paranoia */
8208		BUG_ON(netdev_refcnt_read(dev));
8209		BUG_ON(!list_empty(&dev->ptype_all));
8210		BUG_ON(!list_empty(&dev->ptype_specific));
8211		WARN_ON(rcu_access_pointer(dev->ip_ptr));
8212		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
8213#if IS_ENABLED(CONFIG_DECNET)
8214		WARN_ON(dev->dn_ptr);
8215#endif
8216		if (dev->priv_destructor)
8217			dev->priv_destructor(dev);
8218		if (dev->needs_free_netdev)
8219			free_netdev(dev);
8220
8221		/* Report a network device has been unregistered */
8222		rtnl_lock();
8223		dev_net(dev)->dev_unreg_count--;
8224		__rtnl_unlock();
8225		wake_up(&netdev_unregistering_wq);
8226
8227		/* Free network device */
8228		kobject_put(&dev->dev.kobj);
8229	}
8230}
8231
8232/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
8233 * all the same fields in the same order as net_device_stats, with only
8234 * the type differing, but rtnl_link_stats64 may have additional fields
8235 * at the end for newer counters.
8236 */
8237void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
8238			     const struct net_device_stats *netdev_stats)
8239{
8240#if BITS_PER_LONG == 64
8241	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
8242	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
8243	/* zero out counters that only exist in rtnl_link_stats64 */
8244	memset((char *)stats64 + sizeof(*netdev_stats), 0,
8245	       sizeof(*stats64) - sizeof(*netdev_stats));
8246#else
8247	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
8248	const unsigned long *src = (const unsigned long *)netdev_stats;
8249	u64 *dst = (u64 *)stats64;
8250
8251	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
8252	for (i = 0; i < n; i++)
8253		dst[i] = src[i];
8254	/* zero out counters that only exist in rtnl_link_stats64 */
8255	memset((char *)stats64 + n * sizeof(u64), 0,
8256	       sizeof(*stats64) - n * sizeof(u64));
8257#endif
8258}
8259EXPORT_SYMBOL(netdev_stats_to_stats64);
8260
8261/**
8262 *	dev_get_stats	- get network device statistics
8263 *	@dev: device to get statistics from
8264 *	@storage: place to store stats
8265 *
8266 *	Get network statistics from device. Return @storage.
8267 *	The device driver may provide its own method by setting
8268 *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
8269 *	otherwise the internal statistics structure is used.
8270 */
8271struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
8272					struct rtnl_link_stats64 *storage)
8273{
8274	const struct net_device_ops *ops = dev->netdev_ops;
8275
8276	if (ops->ndo_get_stats64) {
8277		memset(storage, 0, sizeof(*storage));
8278		ops->ndo_get_stats64(dev, storage);
8279	} else if (ops->ndo_get_stats) {
8280		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
8281	} else {
8282		netdev_stats_to_stats64(storage, &dev->stats);
8283	}
8284	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
8285	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
8286	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
8287	return storage;
8288}
8289EXPORT_SYMBOL(dev_get_stats);
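/*
 * A minimal usage sketch (not part of this file): snapshotting a device's
 * 64-bit counters into a caller-provided structure.  Callers typically hold
 * RTNL or an RCU read-side lock so the device cannot go away underneath them.
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	netdev_info(dev, "rx %llu tx %llu packets\n",
 *		    stats.rx_packets, stats.tx_packets);
 */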
8290
8291struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
8292{
8293	struct netdev_queue *queue = dev_ingress_queue(dev);
8294
8295#ifdef CONFIG_NET_CLS_ACT
8296	if (queue)
8297		return queue;
8298	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
8299	if (!queue)
8300		return NULL;
8301	netdev_init_one_queue(dev, queue, NULL);
8302	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
8303	queue->qdisc_sleeping = &noop_qdisc;
8304	rcu_assign_pointer(dev->ingress_queue, queue);
8305#endif
8306	return queue;
8307}
8308
8309static const struct ethtool_ops default_ethtool_ops;
8310
8311void netdev_set_default_ethtool_ops(struct net_device *dev,
8312				    const struct ethtool_ops *ops)
8313{
8314	if (dev->ethtool_ops == &default_ethtool_ops)
8315		dev->ethtool_ops = ops;
8316}
8317EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
8318
8319void netdev_freemem(struct net_device *dev)
8320{
8321	char *addr = (char *)dev - dev->padded;
8322
8323	kvfree(addr);
8324}
8325
8326/**
8327 * alloc_netdev_mqs - allocate network device
8328 * @sizeof_priv: size of private data to allocate space for
8329 * @name: device name format string
8330 * @name_assign_type: origin of device name
8331 * @setup: callback to initialize device
8332 * @txqs: the number of TX subqueues to allocate
8333 * @rxqs: the number of RX subqueues to allocate
8334 *
8335 * Allocates a struct net_device with private data area for driver use
8336 * and performs basic initialization.  Also allocates subqueue structs
8337 * for each queue on the device.
8338 */
8339struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
8340		unsigned char name_assign_type,
8341		void (*setup)(struct net_device *),
8342		unsigned int txqs, unsigned int rxqs)
8343{
8344	struct net_device *dev;
8345	unsigned int alloc_size;
8346	struct net_device *p;
8347
8348	BUG_ON(strlen(name) >= sizeof(dev->name));
8349
8350	if (txqs < 1) {
8351		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
8352		return NULL;
8353	}
8354
8355	if (rxqs < 1) {
8356		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
8357		return NULL;
8358	}
8359
8360	alloc_size = sizeof(struct net_device);
8361	if (sizeof_priv) {
8362		/* ensure 32-byte alignment of private area */
8363		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
8364		alloc_size += sizeof_priv;
8365	}
8366	/* ensure 32-byte alignment of whole construct */
8367	alloc_size += NETDEV_ALIGN - 1;
8368
8369	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8370	if (!p)
8371		return NULL;
8372
8373	dev = PTR_ALIGN(p, NETDEV_ALIGN);
8374	dev->padded = (char *)dev - (char *)p;
8375
8376	dev->pcpu_refcnt = alloc_percpu(int);
8377	if (!dev->pcpu_refcnt)
8378		goto free_dev;
8379
8380	if (dev_addr_init(dev))
8381		goto free_pcpu;
8382
8383	dev_mc_init(dev);
8384	dev_uc_init(dev);
8385
8386	dev_net_set(dev, &init_net);
8387
8388	dev->gso_max_size = GSO_MAX_SIZE;
8389	dev->gso_max_segs = GSO_MAX_SEGS;
8390
8391	INIT_LIST_HEAD(&dev->napi_list);
8392	INIT_LIST_HEAD(&dev->unreg_list);
8393	INIT_LIST_HEAD(&dev->close_list);
8394	INIT_LIST_HEAD(&dev->link_watch_list);
8395	INIT_LIST_HEAD(&dev->adj_list.upper);
8396	INIT_LIST_HEAD(&dev->adj_list.lower);
8397	INIT_LIST_HEAD(&dev->ptype_all);
8398	INIT_LIST_HEAD(&dev->ptype_specific);
8399#ifdef CONFIG_NET_SCHED
8400	hash_init(dev->qdisc_hash);
8401#endif
8402	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8403	setup(dev);
8404
8405	if (!dev->tx_queue_len) {
8406		dev->priv_flags |= IFF_NO_QUEUE;
8407		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
8408	}
8409
8410	dev->num_tx_queues = txqs;
8411	dev->real_num_tx_queues = txqs;
8412	if (netif_alloc_netdev_queues(dev))
8413		goto free_all;
8414
8415	dev->num_rx_queues = rxqs;
8416	dev->real_num_rx_queues = rxqs;
8417	if (netif_alloc_rx_queues(dev))
8418		goto free_all;
8419
8420	strcpy(dev->name, name);
8421	dev->name_assign_type = name_assign_type;
8422	dev->group = INIT_NETDEV_GROUP;
8423	if (!dev->ethtool_ops)
8424		dev->ethtool_ops = &default_ethtool_ops;
8425
8426	nf_hook_ingress_init(dev);
8427
8428	return dev;
8429
8430free_all:
8431	free_netdev(dev);
8432	return NULL;
8433
8434free_pcpu:
8435	free_percpu(dev->pcpu_refcnt);
8436free_dev:
8437	netdev_freemem(dev);
8438	return NULL;
8439}
8440EXPORT_SYMBOL(alloc_netdev_mqs);
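/*
 * A minimal usage sketch (not part of this file): allocating a multiqueue
 * device with eight TX and eight RX queues and a driver-chosen setup
 * callback.  "foo_priv" and "foo_setup" are assumed names; most Ethernet
 * drivers reach this through the alloc_etherdev_mqs() wrapper instead.
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, foo_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */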
8441
8442/**
8443 * free_netdev - free network device
8444 * @dev: device
8445 *
8446 * This function does the last stage of destroying an allocated device
8447 * interface. The reference to the device object is released. If this
8448 * is the last reference then it will be freed. Must be called in process
8449 * context.
8450 */
8451void free_netdev(struct net_device *dev)
8452{
8453	struct napi_struct *p, *n;
8454
8455	might_sleep();
8456	netif_free_tx_queues(dev);
8457	netif_free_rx_queues(dev);
8458
8459	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
8460
8461	/* Flush device addresses */
8462	dev_addr_flush(dev);
8463
8464	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
8465		netif_napi_del(p);
8466
8467	free_percpu(dev->pcpu_refcnt);
8468	dev->pcpu_refcnt = NULL;
8469
8470	/*  Compatibility with error handling in drivers */
8471	if (dev->reg_state == NETREG_UNINITIALIZED) {
8472		netdev_freemem(dev);
8473		return;
8474	}
8475
8476	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
8477	dev->reg_state = NETREG_RELEASED;
8478
8479	/* will free via device release */
8480	put_device(&dev->dev);
8481}
8482EXPORT_SYMBOL(free_netdev);
8483
8484/**
8485 *	synchronize_net -  Synchronize with packet receive processing
8486 *
8487 *	Wait for packets currently being received to be done.
8488 *	Does not block later packets from starting.
8489 */
8490void synchronize_net(void)
8491{
8492	might_sleep();
8493	if (rtnl_is_locked())
8494		synchronize_rcu_expedited();
8495	else
8496		synchronize_rcu();
8497}
8498EXPORT_SYMBOL(synchronize_net);
8499
8500/**
8501 *	unregister_netdevice_queue - remove device from the kernel
8502 *	@dev: device
8503 *	@head: list
8504 *
8505 *	This function shuts down a device interface and removes it
8506 *	from the kernel tables.
8507 *	If head not NULL, device is queued to be unregistered later.
8508 *
8509 *	Callers must hold the rtnl semaphore.  You may want
8510 *	unregister_netdev() instead of this.
8511 */
8512
8513void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
8514{
8515	ASSERT_RTNL();
8516
8517	if (head) {
8518		list_move_tail(&dev->unreg_list, head);
8519	} else {
8520		rollback_registered(dev);
8521		/* Finish processing unregister after unlock */
8522		net_set_todo(dev);
8523	}
8524}
8525EXPORT_SYMBOL(unregister_netdevice_queue);
8526
8527/**
8528 *	unregister_netdevice_many - unregister many devices
8529 *	@head: list of devices
8530 *
8531 *  Note: As most callers use a stack-allocated list_head,
8532 *  we force a list_del() to make sure the stack won't be corrupted later.
8533 */
8534void unregister_netdevice_many(struct list_head *head)
8535{
8536	struct net_device *dev;
8537
8538	if (!list_empty(head)) {
8539		rollback_registered_many(head);
8540		list_for_each_entry(dev, head, unreg_list)
8541			net_set_todo(dev);
8542		list_del(head);
8543	}
8544}
8545EXPORT_SYMBOL(unregister_netdevice_many);
8546
8547/**
8548 *	unregister_netdev - remove device from the kernel
8549 *	@dev: device
8550 *
8551 *	This function shuts down a device interface and removes it
8552 *	from the kernel tables.
8553 *
8554 *	This is just a wrapper for unregister_netdevice that takes
8555 *	the rtnl semaphore.  In general you want to use this and not
8556 *	unregister_netdevice.
8557 */
8558void unregister_netdev(struct net_device *dev)
8559{
8560	rtnl_lock();
8561	unregister_netdevice(dev);
8562	rtnl_unlock();
8563}
8564EXPORT_SYMBOL(unregister_netdev);
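/*
 * A minimal usage sketch (not part of this file): the usual remove-time
 * pairing with register_netdev(); the structure itself is only released by a
 * separate free_netdev() once the unregister has completed.  "foo_adapter"
 * and its "netdev" member are assumed names.
 *
 *	static void foo_remove(struct foo_adapter *adapter)
 *	{
 *		unregister_netdev(adapter->netdev);
 *		free_netdev(adapter->netdev);
 *	}
 */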
8565
8566/**
8567 *	dev_change_net_namespace - move device to a different network namespace
8568 *	@dev: device
8569 *	@net: network namespace
8570 *	@pat: If not NULL name pattern to try if the current device name
8571 *	      is already taken in the destination network namespace.
8572 *
8573 *	This function shuts down a device interface and moves it
8574 *	to a new network namespace. On success 0 is returned, on
8575 *	a failure a negative errno code is returned.
8576 *
8577 *	Callers must hold the rtnl semaphore.
8578 */
8579
8580int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
8581{
8582	int err, new_nsid, new_ifindex;
8583
8584	ASSERT_RTNL();
8585
8586	/* Don't allow namespace local devices to be moved. */
8587	err = -EINVAL;
8588	if (dev->features & NETIF_F_NETNS_LOCAL)
8589		goto out;
8590
8591	/* Ensure the device has been registered */
8592	if (dev->reg_state != NETREG_REGISTERED)
8593		goto out;
8594
8595	/* Get out if there is nothing to do */
8596	err = 0;
8597	if (net_eq(dev_net(dev), net))
8598		goto out;
8599
8600	/* Pick the destination device name, and ensure
8601	 * we can use it in the destination network namespace.
8602	 */
8603	err = -EEXIST;
8604	if (__dev_get_by_name(net, dev->name)) {
8605		/* We get here if we can't use the current device name */
8606		if (!pat)
8607			goto out;
8608		if (dev_get_valid_name(net, dev, pat) < 0)
8609			goto out;
8610	}
8611
8612	/*
8613	 * And now a mini version of register_netdevice unregister_netdevice.
8614	 */
8615
8616	/* If device is running close it first. */
8617	dev_close(dev);
8618
8619	/* And unlink it from device chain */
8620	err = -ENODEV;
8621	unlist_netdevice(dev);
8622
8623	synchronize_net();
8624
8625	/* Shutdown queueing discipline. */
8626	dev_shutdown(dev);
8627
8628	/* Notify protocols, that we are about to destroy
8629	 * this device. They should clean all the things.
8630	 *
8631	 * Note that dev->reg_state stays at NETREG_REGISTERED.
8632	 * This is wanted because this way 8021q and macvlan know
8633	 * the device is just moving and can keep their slaves up.
8634	 */
8635	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8636	rcu_barrier();
8637
8638	new_nsid = peernet2id_alloc(dev_net(dev), net);
8639	/* If there is an ifindex conflict assign a new one */
8640	if (__dev_get_by_index(net, dev->ifindex))
8641		new_ifindex = dev_new_index(net);
8642	else
8643		new_ifindex = dev->ifindex;
8644
8645	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
8646			    new_ifindex);
8647
8648	/*
8649	 *	Flush the unicast and multicast chains
8650	 */
8651	dev_uc_flush(dev);
8652	dev_mc_flush(dev);
8653
8654	/* Send a netdev-removed uevent to the old namespace */
8655	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8656	netdev_adjacent_del_links(dev);
8657
8658	/* Actually switch the network namespace */
8659	dev_net_set(dev, net);
8660	dev->ifindex = new_ifindex;
8661
8662	/* Send a netdev-add uevent to the new namespace */
8663	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8664	netdev_adjacent_add_links(dev);
8665
8666	/* Fixup kobjects */
8667	err = device_rename(&dev->dev, dev->name);
8668	WARN_ON(err);
8669
8670	/* Add the device back in the hashes */
8671	list_netdevice(dev);
8672
8673	/* Notify protocols that a new device appeared. */
8674	call_netdevice_notifiers(NETDEV_REGISTER, dev);
8675
8676	/*
8677	 *	Prevent userspace races by waiting until the network
8678	 *	device is fully set up before sending notifications.
8679	 */
8680	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8681
8682	synchronize_net();
8683	err = 0;
8684out:
8685	return err;
8686}
8687EXPORT_SYMBOL_GPL(dev_change_net_namespace);
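
/*
 * Illustrative sketch, not part of the original file: a caller that has
 * already looked up and taken a reference on the target namespace would
 * move a device roughly like this; the "eth%d" pattern is only used when
 * the current name is already taken there.  "target_net" is hypothetical.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 *	put_net(target_net);
 */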
8688
8689static int dev_cpu_dead(unsigned int oldcpu)
8690{
8691	struct sk_buff **list_skb;
8692	struct sk_buff *skb;
8693	unsigned int cpu;
8694	struct softnet_data *sd, *oldsd, *remsd = NULL;
8695
8696	local_irq_disable();
8697	cpu = smp_processor_id();
8698	sd = &per_cpu(softnet_data, cpu);
8699	oldsd = &per_cpu(softnet_data, oldcpu);
8700
8701	/* Find end of our completion_queue. */
8702	list_skb = &sd->completion_queue;
8703	while (*list_skb)
8704		list_skb = &(*list_skb)->next;
8705	/* Append completion queue from offline CPU. */
8706	*list_skb = oldsd->completion_queue;
8707	oldsd->completion_queue = NULL;
8708
8709	/* Append output queue from offline CPU. */
8710	if (oldsd->output_queue) {
8711		*sd->output_queue_tailp = oldsd->output_queue;
8712		sd->output_queue_tailp = oldsd->output_queue_tailp;
8713		oldsd->output_queue = NULL;
8714		oldsd->output_queue_tailp = &oldsd->output_queue;
8715	}
8716	/* Append NAPI poll list from offline CPU, with one exception:
8717	 * process_backlog() must be called by the CPU owning the percpu backlog.
8718	 * We properly handle process_queue & input_pkt_queue later.
8719	 */
8720	while (!list_empty(&oldsd->poll_list)) {
8721		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8722							    struct napi_struct,
8723							    poll_list);
8724
8725		list_del_init(&napi->poll_list);
8726		if (napi->poll == process_backlog)
8727			napi->state = 0;
8728		else
8729			____napi_schedule(sd, napi);
8730	}
8731
8732	raise_softirq_irqoff(NET_TX_SOFTIRQ);
8733	local_irq_enable();
8734
8735#ifdef CONFIG_RPS
8736	remsd = oldsd->rps_ipi_list;
8737	oldsd->rps_ipi_list = NULL;
8738#endif
8739	/* Send out the offline CPU's pending IPIs */
8740	net_rps_send_ipi(remsd);
8741
8742	/* Process offline CPU's input_pkt_queue */
8743	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8744		netif_rx_ni(skb);
8745		input_queue_head_incr(oldsd);
8746	}
8747	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8748		netif_rx_ni(skb);
8749		input_queue_head_incr(oldsd);
8750	}
8751
8752	return 0;
8753}
8754
8755/**
8756 *	netdev_increment_features - increment feature set by one
8757 *	@all: current feature set
8758 *	@one: new feature set
8759 *	@mask: mask feature set
8760 *
8761 *	Computes a new feature set after adding a device with feature set
8762 *	@one to the master device with current feature set @all.  Will not
8763 *	enable anything that is off in @mask. Returns the new feature set.
8764 */
8765netdev_features_t netdev_increment_features(netdev_features_t all,
8766	netdev_features_t one, netdev_features_t mask)
8767{
8768	if (mask & NETIF_F_HW_CSUM)
8769		mask |= NETIF_F_CSUM_MASK;
8770	mask |= NETIF_F_VLAN_CHALLENGED;
8771
8772	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8773	all &= one | ~NETIF_F_ALL_FOR_ALL;
8774
8775	/* If one device supports hw checksumming, set for all. */
8776	if (all & NETIF_F_HW_CSUM)
8777		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8778
8779	return all;
8780}
8781EXPORT_SYMBOL(netdev_increment_features);
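
/*
 * Illustrative sketch, not part of the original file: stacked drivers such
 * as bonding fold the features of every lower device into the master with
 * this helper, roughly as below.  "slave", "slave_list" and "mask_features"
 * are hypothetical; see e.g. bond_compute_features() for a real user.
 *
 *	netdev_features_t features = mask_features;
 *
 *	list_for_each_entry(slave, &master->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask_features);
 *	master->features = features;
 */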
8782
8783static struct hlist_head * __net_init netdev_create_hash(void)
8784{
8785	int i;
8786	struct hlist_head *hash;
8787
8788	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
8789	if (hash != NULL)
8790		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8791			INIT_HLIST_HEAD(&hash[i]);
8792
8793	return hash;
8794}
8795
8796/* Initialize per network namespace state */
8797static int __net_init netdev_init(struct net *net)
8798{
8799	if (net != &init_net)
8800		INIT_LIST_HEAD(&net->dev_base_head);
8801
8802	net->dev_name_head = netdev_create_hash();
8803	if (net->dev_name_head == NULL)
8804		goto err_name;
8805
8806	net->dev_index_head = netdev_create_hash();
8807	if (net->dev_index_head == NULL)
8808		goto err_idx;
8809
8810	return 0;
8811
8812err_idx:
8813	kfree(net->dev_name_head);
8814err_name:
8815	return -ENOMEM;
8816}
8817
8818/**
8819 *	netdev_drivername - network driver for the device
8820 *	@dev: network device
8821 *
8822 *	Determine network driver for device.
8823 */
8824const char *netdev_drivername(const struct net_device *dev)
8825{
8826	const struct device_driver *driver;
8827	const struct device *parent;
8828	const char *empty = "";
8829
8830	parent = dev->dev.parent;
8831	if (!parent)
8832		return empty;
8833
8834	driver = parent->driver;
8835	if (driver && driver->name)
8836		return driver->name;
8837	return empty;
8838}
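
/*
 * Illustrative sketch, not part of the original file: the returned string
 * is meant for diagnostics only, e.g. a timeout handler might log it as
 * below (the message text is made up for illustration).
 *
 *	netdev_err(dev, "transmit timed out (driver %s)\n",
 *		   netdev_drivername(dev));
 */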
8839
8840static void __netdev_printk(const char *level, const struct net_device *dev,
8841			    struct va_format *vaf)
8842{
8843	if (dev && dev->dev.parent) {
8844		dev_printk_emit(level[1] - '0',
8845				dev->dev.parent,
8846				"%s %s %s%s: %pV",
8847				dev_driver_string(dev->dev.parent),
8848				dev_name(dev->dev.parent),
8849				netdev_name(dev), netdev_reg_state(dev),
8850				vaf);
8851	} else if (dev) {
8852		printk("%s%s%s: %pV",
8853		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8854	} else {
8855		printk("%s(NULL net_device): %pV", level, vaf);
8856	}
8857}
8858
8859void netdev_printk(const char *level, const struct net_device *dev,
8860		   const char *format, ...)
8861{
8862	struct va_format vaf;
8863	va_list args;
8864
8865	va_start(args, format);
8866
8867	vaf.fmt = format;
8868	vaf.va = &args;
8869
8870	__netdev_printk(level, dev, &vaf);
8871
8872	va_end(args);
8873}
8874EXPORT_SYMBOL(netdev_printk);
8875
8876#define define_netdev_printk_level(func, level)			\
8877void func(const struct net_device *dev, const char *fmt, ...)	\
8878{								\
8879	struct va_format vaf;					\
8880	va_list args;						\
8881								\
8882	va_start(args, fmt);					\
8883								\
8884	vaf.fmt = fmt;						\
8885	vaf.va = &args;						\
8886								\
8887	__netdev_printk(level, dev, &vaf);			\
8888								\
8889	va_end(args);						\
8890}								\
8891EXPORT_SYMBOL(func);
8892
8893define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8894define_netdev_printk_level(netdev_alert, KERN_ALERT);
8895define_netdev_printk_level(netdev_crit, KERN_CRIT);
8896define_netdev_printk_level(netdev_err, KERN_ERR);
8897define_netdev_printk_level(netdev_warn, KERN_WARNING);
8898define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8899define_netdev_printk_level(netdev_info, KERN_INFO);
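
/*
 * Illustrative sketch, not part of the original file: drivers use the
 * level-specific helpers generated above instead of raw printk() so that
 * messages carry the bus, driver and interface name prefix, e.g.:
 *
 *	netdev_info(dev, "link up, %u Mbps, %s duplex\n",
 *		    speed, duplex ? "full" : "half");
 *
 * "speed" and "duplex" stand in for hypothetical local variables.
 */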
8900
8901static void __net_exit netdev_exit(struct net *net)
8902{
8903	kfree(net->dev_name_head);
8904	kfree(net->dev_index_head);
8905	if (net != &init_net)
8906		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
8907}
8908
8909static struct pernet_operations __net_initdata netdev_net_ops = {
8910	.init = netdev_init,
8911	.exit = netdev_exit,
8912};
8913
8914static void __net_exit default_device_exit(struct net *net)
8915{
8916	struct net_device *dev, *aux;
8917	/*
8918	 * Push all migratable network devices back to the
8919	 * initial network namespace
8920	 */
8921	rtnl_lock();
8922	for_each_netdev_safe(net, dev, aux) {
8923		int err;
8924		char fb_name[IFNAMSIZ];
8925
8926		/* Ignore unmovable devices (e.g. loopback) */
8927		if (dev->features & NETIF_F_NETNS_LOCAL)
8928			continue;
8929
8930		/* Leave virtual devices for the generic cleanup */
8931		if (dev->rtnl_link_ops)
8932			continue;
8933
8934		/* Push remaining network devices to init_net */
8935		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8936		err = dev_change_net_namespace(dev, &init_net, fb_name);
8937		if (err) {
8938			pr_emerg("%s: failed to move %s to init_net: %d\n",
8939				 __func__, dev->name, err);
8940			BUG();
8941		}
8942	}
8943	rtnl_unlock();
8944}
8945
8946static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8947{
8948	/* Return with the rtnl_lock held when there are no network
8949	 * devices unregistering in any network namespace in net_list.
8950	 */
8951	struct net *net;
8952	bool unregistering;
8953	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8954
8955	add_wait_queue(&netdev_unregistering_wq, &wait);
8956	for (;;) {
8957		unregistering = false;
8958		rtnl_lock();
8959		list_for_each_entry(net, net_list, exit_list) {
8960			if (net->dev_unreg_count > 0) {
8961				unregistering = true;
8962				break;
8963			}
8964		}
8965		if (!unregistering)
8966			break;
8967		__rtnl_unlock();
8968
8969		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8970	}
8971	remove_wait_queue(&netdev_unregistering_wq, &wait);
8972}
8973
8974static void __net_exit default_device_exit_batch(struct list_head *net_list)
8975{
8976	/* At exit all network devices must be removed from a network
8977	 * namespace.  Do this in the reverse order of registration.
8978	 * Do this across as many network namespaces as possible to
8979	 * improve batching efficiency.
8980	 */
8981	struct net_device *dev;
8982	struct net *net;
8983	LIST_HEAD(dev_kill_list);
8984
8985	/* To prevent network device cleanup code from dereferencing
8986	 * loopback devices or network devices that have been freed,
8987	 * wait here for all pending unregistrations to complete
8988	 * before unregistering the loopback device and allowing the
8989	 * network namespace to be freed.
8990	 *
8991	 * The netdev todo list containing all network device
8992	 * unregistrations that happen in default_device_exit_batch
8993	 * will run in the rtnl_unlock() at the end of
8994	 * default_device_exit_batch.
8995	 */
8996	rtnl_lock_unregistering(net_list);
8997	list_for_each_entry(net, net_list, exit_list) {
8998		for_each_netdev_reverse(net, dev) {
8999			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
9000				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
9001			else
9002				unregister_netdevice_queue(dev, &dev_kill_list);
9003		}
9004	}
9005	unregister_netdevice_many(&dev_kill_list);
9006	rtnl_unlock();
9007}
9008
9009static struct pernet_operations __net_initdata default_device_ops = {
9010	.exit = default_device_exit,
9011	.exit_batch = default_device_exit_batch,
9012};
9013
9014/*
9015 *	Initialize the DEV module. At boot time this walks the device list and
9016 *	unhooks any devices that fail to initialise (normally hardware not
9017 *	present) and leaves us with a valid list of present and active devices.
9018 *
9019 */
9020
9021/*
9022 *       This is called single threaded during boot, so no need
9023 *       to take the rtnl semaphore.
9024 */
9025static int __init net_dev_init(void)
9026{
9027	int i, rc = -ENOMEM;
9028
9029	BUG_ON(!dev_boot_phase);
9030
9031	if (dev_proc_init())
9032		goto out;
9033
9034	if (netdev_kobject_init())
9035		goto out;
9036
9037	INIT_LIST_HEAD(&ptype_all);
9038	for (i = 0; i < PTYPE_HASH_SIZE; i++)
9039		INIT_LIST_HEAD(&ptype_base[i]);
9040
9041	INIT_LIST_HEAD(&offload_base);
9042
9043	if (register_pernet_subsys(&netdev_net_ops))
9044		goto out;
9045
9046	/*
9047	 *	Initialise the packet receive queues.
9048	 */
9049
9050	for_each_possible_cpu(i) {
9051		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
9052		struct softnet_data *sd = &per_cpu(softnet_data, i);
9053
9054		INIT_WORK(flush, flush_backlog);
9055
9056		skb_queue_head_init(&sd->input_pkt_queue);
9057		skb_queue_head_init(&sd->process_queue);
9058#ifdef CONFIG_XFRM_OFFLOAD
9059		skb_queue_head_init(&sd->xfrm_backlog);
9060#endif
9061		INIT_LIST_HEAD(&sd->poll_list);
9062		sd->output_queue_tailp = &sd->output_queue;
9063#ifdef CONFIG_RPS
9064		sd->csd.func = rps_trigger_softirq;
9065		sd->csd.info = sd;
9066		sd->cpu = i;
9067#endif
9068
9069		sd->backlog.poll = process_backlog;
9070		sd->backlog.weight = weight_p;
9071	}
9072
9073	dev_boot_phase = 0;
9074
9075	/* The loopback device is special: if any other network device
9076	 * is present in a network namespace, the loopback device must
9077	 * be present too. Since we now dynamically allocate and free the
9078	 * loopback device, ensure this invariant is maintained by
9079	 * keeping the loopback device as the first device on the
9080	 * list of network devices, ensuring the loopback device
9081	 * is the first device that appears and the last network device
9082	 * that disappears.
9083	 */
9084	if (register_pernet_device(&loopback_net_ops))
9085		goto out;
9086
9087	if (register_pernet_device(&default_device_ops))
9088		goto out;
9089
9090	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
9091	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
9092
9093	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
9094				       NULL, dev_cpu_dead);
9095	WARN_ON(rc < 0);
9096	rc = 0;
9097out:
9098	return rc;
9099}
9100
9101subsys_initcall(net_dev_init);