Linux Audio

Check our new training course

Loading...
v6.8
    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
 
 
 
 
 
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitmap.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
 
   93#include <linux/skbuff.h>
   94#include <linux/kthread.h>
   95#include <linux/bpf.h>
   96#include <linux/bpf_trace.h>
   97#include <net/net_namespace.h>
   98#include <net/sock.h>
   99#include <net/busy_poll.h>
  100#include <linux/rtnetlink.h>
  101#include <linux/stat.h>
  102#include <net/dsa.h>
  103#include <net/dst.h>
  104#include <net/dst_metadata.h>
  105#include <net/gro.h>
  106#include <net/pkt_sched.h>
  107#include <net/pkt_cls.h>
  108#include <net/checksum.h>
  109#include <net/xfrm.h>
  110#include <net/tcx.h>
  111#include <linux/highmem.h>
  112#include <linux/init.h>
  113#include <linux/module.h>
  114#include <linux/netpoll.h>
  115#include <linux/rcupdate.h>
  116#include <linux/delay.h>
  117#include <net/iw_handler.h>
  118#include <asm/current.h>
  119#include <linux/audit.h>
  120#include <linux/dmaengine.h>
  121#include <linux/err.h>
  122#include <linux/ctype.h>
  123#include <linux/if_arp.h>
  124#include <linux/if_vlan.h>
  125#include <linux/ip.h>
  126#include <net/ip.h>
  127#include <net/mpls.h>
  128#include <linux/ipv6.h>
  129#include <linux/in.h>
  130#include <linux/jhash.h>
  131#include <linux/random.h>
  132#include <trace/events/napi.h>
  133#include <trace/events/net.h>
  134#include <trace/events/skb.h>
  135#include <trace/events/qdisc.h>
  136#include <trace/events/xdp.h>
  137#include <linux/inetdevice.h>
  138#include <linux/cpu_rmap.h>
  139#include <linux/static_key.h>
  140#include <linux/hashtable.h>
  141#include <linux/vmalloc.h>
  142#include <linux/if_macvlan.h>
  143#include <linux/errqueue.h>
  144#include <linux/hrtimer.h>
  145#include <linux/netfilter_netdev.h>
  146#include <linux/crash_dump.h>
  147#include <linux/sctp.h>
  148#include <net/udp_tunnel.h>
  149#include <linux/net_namespace.h>
  150#include <linux/indirect_call_wrapper.h>
  151#include <net/devlink.h>
  152#include <linux/pm_runtime.h>
  153#include <linux/prandom.h>
  154#include <linux/once_lite.h>
  155#include <net/netdev_rx_queue.h>
  156
  157#include "dev.h"
  158#include "net-sysfs.h"
  159
 
 
 
 
 
 
/*
 * Taken by dev_add_pack() and __dev_remove_pack() around their
 * list_add_rcu()/list_del_rcu() calls on the packet-type lists.
 */
  160static DEFINE_SPINLOCK(ptype_lock);
 
/*
 * Lists of handlers that are not bound to a device (see ptype_head()):
 * ptype_base is indexed by ntohs(type) & PTYPE_HASH_MASK, ptype_all holds
 * the ETH_P_ALL handlers.
 */
  161struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  162struct list_head ptype_all __read_mostly;	/* Taps */
 
  163
  164static int netif_rx_internal(struct sk_buff *skb);
  165static int call_netdevice_notifiers_extack(unsigned long val,
  166					   struct net_device *dev,
  167					   struct netlink_ext_ack *extack);
  168
  169/*
  170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  171 * semaphore.
  172 *
  173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  174 *
  175 * Writers must hold the rtnl semaphore while they loop through the
  176 * dev_base_head list, and hold dev_base_lock for writing when they do the
  177 * actual updates.  This allows pure readers to access the list even
  178 * while a writer is preparing to update it.
  179 *
  180 * To put it another way, dev_base_lock is held for writing only to
  181 * protect against pure readers; the rtnl semaphore provides the
  182 * protection against other writers.
  183 *
  184 * See, for example usages, register_netdevice() and
  185 * unregister_netdevice(), which must be called with the rtnl
  186 * semaphore held.
  187 */
  188DEFINE_RWLOCK(dev_base_lock);
  189EXPORT_SYMBOL(dev_base_lock);
  190
  191static DEFINE_MUTEX(ifalias_mutex);
  192
  193/* protects napi_hash addition/deletion and napi_gen_id */
  194static DEFINE_SPINLOCK(napi_hash_lock);
  195
  196static unsigned int napi_gen_id = NR_CPUS;
  197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  198
  199static DECLARE_RWSEM(devnet_rename_sem);
  200
  201static inline void dev_base_seq_inc(struct net *net)
  202{
  203	while (++net->dev_base_seq == 0)
  204		;
  205}
  206
  207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  208{
  209	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  210
  211	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  212}
  213
  214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  215{
  216	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  217}
  218
  219static inline void rps_lock_irqsave(struct softnet_data *sd,
  220				    unsigned long *flags)
  221{
  222	if (IS_ENABLED(CONFIG_RPS))
  223		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
  224	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  225		local_irq_save(*flags);
  226}
  227
  228static inline void rps_lock_irq_disable(struct softnet_data *sd)
  229{
  230	if (IS_ENABLED(CONFIG_RPS))
  231		spin_lock_irq(&sd->input_pkt_queue.lock);
  232	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  233		local_irq_disable();
  234}
  235
  236static inline void rps_unlock_irq_restore(struct softnet_data *sd,
  237					  unsigned long *flags)
  238{
  239	if (IS_ENABLED(CONFIG_RPS))
  240		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
  241	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  242		local_irq_restore(*flags);
  243}
  244
  245static inline void rps_unlock_irq_enable(struct softnet_data *sd)
  246{
  247	if (IS_ENABLED(CONFIG_RPS))
  248		spin_unlock_irq(&sd->input_pkt_queue.lock);
  249	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  250		local_irq_enable();
  251}
  252
  253static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  254						       const char *name)
  255{
  256	struct netdev_name_node *name_node;
  257
  258	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  259	if (!name_node)
  260		return NULL;
  261	INIT_HLIST_NODE(&name_node->hlist);
  262	name_node->dev = dev;
  263	name_node->name = name;
  264	return name_node;
  265}
  266
  267static struct netdev_name_node *
  268netdev_name_node_head_alloc(struct net_device *dev)
  269{
  270	struct netdev_name_node *name_node;
  271
  272	name_node = netdev_name_node_alloc(dev, dev->name);
  273	if (!name_node)
  274		return NULL;
  275	INIT_LIST_HEAD(&name_node->list);
  276	return name_node;
  277}
  278
  279static void netdev_name_node_free(struct netdev_name_node *name_node)
  280{
  281	kfree(name_node);
  282}
  283
  284static void netdev_name_node_add(struct net *net,
  285				 struct netdev_name_node *name_node)
  286{
  287	hlist_add_head_rcu(&name_node->hlist,
  288			   dev_name_hash(net, name_node->name));
  289}
  290
  291static void netdev_name_node_del(struct netdev_name_node *name_node)
  292{
  293	hlist_del_rcu(&name_node->hlist);
  294}
  295
  296static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  297							const char *name)
  298{
  299	struct hlist_head *head = dev_name_hash(net, name);
  300	struct netdev_name_node *name_node;
  301
  302	hlist_for_each_entry(name_node, head, hlist)
  303		if (!strcmp(name_node->name, name))
  304			return name_node;
  305	return NULL;
  306}
  307
  308static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  309							    const char *name)
  310{
  311	struct hlist_head *head = dev_name_hash(net, name);
  312	struct netdev_name_node *name_node;
  313
  314	hlist_for_each_entry_rcu(name_node, head, hlist)
  315		if (!strcmp(name_node->name, name))
  316			return name_node;
  317	return NULL;
  318}
  319
  320bool netdev_name_in_use(struct net *net, const char *name)
  321{
  322	return netdev_name_node_lookup(net, name);
  323}
  324EXPORT_SYMBOL(netdev_name_in_use);
  325
  326int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  327{
  328	struct netdev_name_node *name_node;
  329	struct net *net = dev_net(dev);
  330
  331	name_node = netdev_name_node_lookup(net, name);
  332	if (name_node)
  333		return -EEXIST;
  334	name_node = netdev_name_node_alloc(dev, name);
  335	if (!name_node)
  336		return -ENOMEM;
  337	netdev_name_node_add(net, name_node);
  338	/* The node that holds dev->name acts as a head of per-device list. */
  339	list_add_tail_rcu(&name_node->list, &dev->name_node->list);
  340
  341	return 0;
  342}
  343
  344static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  345{
  346	list_del(&name_node->list);
  347	kfree(name_node->name);
  348	netdev_name_node_free(name_node);
  349}
  350
  351int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  352{
  353	struct netdev_name_node *name_node;
  354	struct net *net = dev_net(dev);
  355
  356	name_node = netdev_name_node_lookup(net, name);
  357	if (!name_node)
  358		return -ENOENT;
  359	/* lookup might have found our primary name or a name belonging
  360	 * to another device.
  361	 */
  362	if (name_node == dev->name_node || name_node->dev != dev)
  363		return -EINVAL;
  364
  365	netdev_name_node_del(name_node);
  366	synchronize_rcu();
  367	__netdev_name_node_alt_destroy(name_node);
  368
  369	return 0;
  370}
  371
  372static void netdev_name_node_alt_flush(struct net_device *dev)
  373{
  374	struct netdev_name_node *name_node, *tmp;
  375
  376	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
  377		__netdev_name_node_alt_destroy(name_node);
  378}
  379
  380/* Device list insertion */
  381static void list_netdevice(struct net_device *dev)
  382{
  383	struct netdev_name_node *name_node;
  384	struct net *net = dev_net(dev);
  385
  386	ASSERT_RTNL();
  387
  388	write_lock(&dev_base_lock);
  389	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  390	netdev_name_node_add(net, dev->name_node);
  391	hlist_add_head_rcu(&dev->index_hlist,
  392			   dev_index_hash(net, dev->ifindex));
  393	write_unlock(&dev_base_lock);
  394
  395	netdev_for_each_altname(dev, name_node)
  396		netdev_name_node_add(net, name_node);
  397
  398	/* We reserved the ifindex, this can't fail */
  399	WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
  400
  401	dev_base_seq_inc(net);
  402}
  403
  404/* Device list removal
  405 * caller must respect a RCU grace period before freeing/reusing dev
  406 */
  407static void unlist_netdevice(struct net_device *dev, bool lock)
  408{
  409	struct netdev_name_node *name_node;
  410	struct net *net = dev_net(dev);
  411
  412	ASSERT_RTNL();
  413
  414	xa_erase(&net->dev_by_index, dev->ifindex);
  415
  416	netdev_for_each_altname(dev, name_node)
  417		netdev_name_node_del(name_node);
  418
  419	/* Unlink dev from the device chain */
  420	if (lock)
  421		write_lock(&dev_base_lock);
  422	list_del_rcu(&dev->dev_list);
  423	netdev_name_node_del(dev->name_node);
  424	hlist_del_rcu(&dev->index_hlist);
  425	if (lock)
  426		write_unlock(&dev_base_lock);
  427
  428	dev_base_seq_inc(dev_net(dev));
  429}
  430
  431/*
  432 *	Our notifier list
  433 */
  434
  435static RAW_NOTIFIER_HEAD(netdev_chain);
  436
  437/*
  438 *	Device drivers call our routines to queue packets here. We empty the
  439 *	queue in the local softnet handler.
  440 */
  441
  442DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  443EXPORT_PER_CPU_SYMBOL(softnet_data);
  444
  445#ifdef CONFIG_LOCKDEP
  446/*
  447 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  448 * according to dev->type
  449 */
/*
 * Device types that get their own lockdep class.  netdev_lock_pos() searches
 * this table linearly and falls back to the last entry for any type that is
 * not listed.  The index found here is also used for netdev_lock_name[] and
 * the lock_class_key arrays, so the order must stay in step with them.
 */
  450static const unsigned short netdev_lock_type[] = {
  451	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  452	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  453	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  454	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  455	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  456	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  457	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  458	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  459	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  460	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  461	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  462	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  463	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  464	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  465	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  466
/*
 * Lockdep class names, indexed by the position netdev_lock_pos() returns:
 * entry i names the class for netdev_lock_type[i].  Keep both tables the
 * same length and in the same order.
 */
  467static const char *const netdev_lock_name[] = {
  468	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  469	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  470	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  471	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  472	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  473	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  474	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  475	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  476	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  477	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  478	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  479	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  480	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  481	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  482	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  483
  484static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  485static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  486
  487static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  488{
  489	int i;
  490
  491	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  492		if (netdev_lock_type[i] == dev_type)
  493			return i;
  494	/* the last key is used by default */
  495	return ARRAY_SIZE(netdev_lock_type) - 1;
  496}
  497
  498static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  499						 unsigned short dev_type)
  500{
  501	int i;
  502
  503	i = netdev_lock_pos(dev_type);
  504	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  505				   netdev_lock_name[i]);
  506}
  507
  508static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  509{
  510	int i;
  511
  512	i = netdev_lock_pos(dev->type);
  513	lockdep_set_class_and_name(&dev->addr_list_lock,
  514				   &netdev_addr_lock_key[i],
  515				   netdev_lock_name[i]);
  516}
  517#else
  518static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  519						 unsigned short dev_type)
  520{
  521}
  522
  523static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  524{
  525}
  526#endif
  527
  528/*******************************************************************************
  529 *
  530 *		Protocol management and registration routines
  531 *
  532 *******************************************************************************/
  533
 
 
 
  534
  535/*
  536 *	Add a protocol ID to the list. Now that the input handler is
  537 *	smarter we can dispense with all the messy stuff that used to be
  538 *	here.
  539 *
  540 *	BEWARE!!! Protocol handlers, mangling input packets,
  541 *	MUST BE last in hash buckets and checking protocol handlers
  542 *	MUST start from promiscuous ptype_all chain in net_bh.
  543 *	It is true now, do not change it.
  544 *	Explanation follows: if protocol handler, mangling packet, will
  545 *	be the first on list, it is not able to sense, that packet
  546 *	is cloned and should be copied-on-write, so that it will
  547 *	change it and subsequent readers will get broken packet.
  548 *							--ANK (980803)
  549 */
  550
  551static inline struct list_head *ptype_head(const struct packet_type *pt)
  552{
  553	if (pt->type == htons(ETH_P_ALL))
  554		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  555	else
  556		return pt->dev ? &pt->dev->ptype_specific :
  557				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  558}
  559
  560/**
  561 *	dev_add_pack - add packet handler
  562 *	@pt: packet type declaration
  563 *
  564 *	Add a protocol handler to the networking stack. The passed &packet_type
  565 *	is linked into kernel lists and may not be freed until it has been
  566 *	removed from the kernel lists.
  567 *
  568 *	This call does not sleep therefore it can not
  569 *	guarantee all CPU's that are in middle of receiving packets
  570 *	will see the new packet type (until the next received packet).
  571 */
  572
  573void dev_add_pack(struct packet_type *pt)
  574{
  575	struct list_head *head = ptype_head(pt);
  576
  577	spin_lock(&ptype_lock);
  578	list_add_rcu(&pt->list, head);
  579	spin_unlock(&ptype_lock);
  580}
  581EXPORT_SYMBOL(dev_add_pack);
  582
  583/**
  584 *	__dev_remove_pack	 - remove packet handler
  585 *	@pt: packet type declaration
  586 *
  587 *	Remove a protocol handler that was previously added to the kernel
  588 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  589 *	from the kernel lists and can be freed or reused once this function
  590 *	returns.
  591 *
  592 *      The packet type might still be in use by receivers
  593 *	and must not be freed until after all the CPU's have gone
  594 *	through a quiescent state.
  595 */
  596void __dev_remove_pack(struct packet_type *pt)
  597{
  598	struct list_head *head = ptype_head(pt);
  599	struct packet_type *pt1;
  600
  601	spin_lock(&ptype_lock);
  602
  603	list_for_each_entry(pt1, head, list) {
  604		if (pt == pt1) {
  605			list_del_rcu(&pt->list);
  606			goto out;
  607		}
  608	}
  609
  610	pr_warn("dev_remove_pack: %p not found\n", pt);
  611out:
  612	spin_unlock(&ptype_lock);
  613}
  614EXPORT_SYMBOL(__dev_remove_pack);
  615
  616/**
  617 *	dev_remove_pack	 - remove packet handler
  618 *	@pt: packet type declaration
  619 *
  620 *	Remove a protocol handler that was previously added to the kernel
  621 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  622 *	from the kernel lists and can be freed or reused once this function
  623 *	returns.
  624 *
  625 *	This call sleeps to guarantee that no CPU is looking at the packet
  626 *	type after return.
  627 */
  628void dev_remove_pack(struct packet_type *pt)
  629{
  630	__dev_remove_pack(pt);
  631
  632	synchronize_net();
  633}
  634EXPORT_SYMBOL(dev_remove_pack);
  635
  636
  637/*******************************************************************************
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  638 *
  639 *			    Device Interface Subroutines
 
 
 
  640 *
  641 *******************************************************************************/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  642
  643/**
  644 *	dev_get_iflink	- get 'iflink' value of a interface
  645 *	@dev: targeted interface
  646 *
  647 *	Indicates the ifindex the interface is linked to.
  648 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  649 */
  650
  651int dev_get_iflink(const struct net_device *dev)
  652{
  653	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  654		return dev->netdev_ops->ndo_get_iflink(dev);
  655
  656	return dev->ifindex;
  657}
  658EXPORT_SYMBOL(dev_get_iflink);
  659
  660/**
  661 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  662 *	@dev: targeted interface
  663 *	@skb: The packet.
  664 *
  665 *	For better visibility of tunnel traffic OVS needs to retrieve
  666 *	egress tunnel information for a packet. Following API allows
  667 *	user to get this info.
  668 */
  669int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  670{
  671	struct ip_tunnel_info *info;
  672
  673	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  674		return -EINVAL;
  675
  676	info = skb_tunnel_info_unclone(skb);
  677	if (!info)
  678		return -ENOMEM;
  679	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  680		return -EINVAL;
  681
  682	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  683}
  684EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  685
  686static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
  687{
  688	int k = stack->num_paths++;
  689
  690	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
  691		return NULL;
  692
  693	return &stack->path[k];
  694}
  695
  696int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
  697			  struct net_device_path_stack *stack)
  698{
  699	const struct net_device *last_dev;
  700	struct net_device_path_ctx ctx = {
  701		.dev	= dev,
  702	};
  703	struct net_device_path *path;
  704	int ret = 0;
  705
  706	memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
  707	stack->num_paths = 0;
  708	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
  709		last_dev = ctx.dev;
  710		path = dev_fwd_path(stack);
  711		if (!path)
  712			return -1;
  713
  714		memset(path, 0, sizeof(struct net_device_path));
  715		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
  716		if (ret < 0)
  717			return -1;
  718
  719		if (WARN_ON_ONCE(last_dev == ctx.dev))
  720			return -1;
  721	}
  722
  723	if (!ctx.dev)
  724		return ret;
  725
  726	path = dev_fwd_path(stack);
  727	if (!path)
  728		return -1;
  729	path->type = DEV_PATH_ETHERNET;
  730	path->dev = ctx.dev;
  731
  732	return ret;
  733}
  734EXPORT_SYMBOL_GPL(dev_fill_forward_path);
  735
  736/**
  737 *	__dev_get_by_name	- find a device by its name
  738 *	@net: the applicable net namespace
  739 *	@name: name to find
  740 *
  741 *	Find an interface by name. Must be called under RTNL semaphore
  742 *	or @dev_base_lock. If the name is found a pointer to the device
  743 *	is returned. If the name is not found then %NULL is returned. The
  744 *	reference counters are not incremented so the caller must be
  745 *	careful with locks.
  746 */
  747
  748struct net_device *__dev_get_by_name(struct net *net, const char *name)
  749{
  750	struct netdev_name_node *node_name;
 
 
 
 
 
  751
  752	node_name = netdev_name_node_lookup(net, name);
  753	return node_name ? node_name->dev : NULL;
  754}
  755EXPORT_SYMBOL(__dev_get_by_name);
  756
  757/**
  758 * dev_get_by_name_rcu	- find a device by its name
  759 * @net: the applicable net namespace
  760 * @name: name to find
  761 *
  762 * Find an interface by name.
  763 * If the name is found a pointer to the device is returned.
  764 * If the name is not found then %NULL is returned.
  765 * The reference counters are not incremented so the caller must be
  766 * careful with locks. The caller must hold RCU lock.
  767 */
  768
  769struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  770{
  771	struct netdev_name_node *node_name;
  772
  773	node_name = netdev_name_node_lookup_rcu(net, name);
  774	return node_name ? node_name->dev : NULL;
  775}
  776EXPORT_SYMBOL(dev_get_by_name_rcu);
  777
  778/* Deprecated for new users, call netdev_get_by_name() instead */
  779struct net_device *dev_get_by_name(struct net *net, const char *name)
  780{
  781	struct net_device *dev;
 
  782
  783	rcu_read_lock();
  784	dev = dev_get_by_name_rcu(net, name);
  785	dev_hold(dev);
  786	rcu_read_unlock();
  787	return dev;
  788}
  789EXPORT_SYMBOL(dev_get_by_name);
  790
  791/**
  792 *	netdev_get_by_name() - find a device by its name
  793 *	@net: the applicable net namespace
  794 *	@name: name to find
  795 *	@tracker: tracking object for the acquired reference
  796 *	@gfp: allocation flags for the tracker
  797 *
  798 *	Find an interface by name. This can be called from any
  799 *	context and does its own locking. The returned handle has
  800 *	the usage count incremented and the caller must use netdev_put() to
  801 *	release it when it is no longer needed. %NULL is returned if no
  802 *	matching device is found.
  803 */
  804struct net_device *netdev_get_by_name(struct net *net, const char *name,
  805				      netdevice_tracker *tracker, gfp_t gfp)
  806{
  807	struct net_device *dev;
  808
  809	dev = dev_get_by_name(net, name);
 
  810	if (dev)
  811		netdev_tracker_alloc(dev, tracker, gfp);
 
  812	return dev;
  813}
  814EXPORT_SYMBOL(netdev_get_by_name);
  815
  816/**
  817 *	__dev_get_by_index - find a device by its ifindex
  818 *	@net: the applicable net namespace
  819 *	@ifindex: index of device
  820 *
  821 *	Search for an interface by index. Returns %NULL if the device
  822 *	is not found or a pointer to the device. The device has not
  823 *	had its reference counter increased so the caller must be careful
  824 *	about locking. The caller must hold either the RTNL semaphore
  825 *	or @dev_base_lock.
  826 */
  827
  828struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  829{
  830	struct net_device *dev;
  831	struct hlist_head *head = dev_index_hash(net, ifindex);
  832
  833	hlist_for_each_entry(dev, head, index_hlist)
  834		if (dev->ifindex == ifindex)
  835			return dev;
  836
  837	return NULL;
  838}
  839EXPORT_SYMBOL(__dev_get_by_index);
  840
  841/**
  842 *	dev_get_by_index_rcu - find a device by its ifindex
  843 *	@net: the applicable net namespace
  844 *	@ifindex: index of device
  845 *
  846 *	Search for an interface by index. Returns %NULL if the device
  847 *	is not found or a pointer to the device. The device has not
  848 *	had its reference counter increased so the caller must be careful
  849 *	about locking. The caller must hold RCU lock.
  850 */
  851
  852struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  853{
  854	struct net_device *dev;
  855	struct hlist_head *head = dev_index_hash(net, ifindex);
  856
  857	hlist_for_each_entry_rcu(dev, head, index_hlist)
  858		if (dev->ifindex == ifindex)
  859			return dev;
  860
  861	return NULL;
  862}
  863EXPORT_SYMBOL(dev_get_by_index_rcu);
  864
  865/* Deprecated for new users, call netdev_get_by_index() instead */
  866struct net_device *dev_get_by_index(struct net *net, int ifindex)
  867{
  868	struct net_device *dev;
  869
  870	rcu_read_lock();
  871	dev = dev_get_by_index_rcu(net, ifindex);
  872	dev_hold(dev);
  873	rcu_read_unlock();
  874	return dev;
  875}
  876EXPORT_SYMBOL(dev_get_by_index);
  877
  878/**
  879 *	netdev_get_by_index() - find a device by its ifindex
  880 *	@net: the applicable net namespace
  881 *	@ifindex: index of device
  882 *	@tracker: tracking object for the acquired reference
  883 *	@gfp: allocation flags for the tracker
  884 *
  885 *	Search for an interface by index. Returns NULL if the device
  886 *	is not found or a pointer to the device. The device returned has
  887 *	had a reference added and the pointer is safe until the user calls
  888 *	netdev_put() to indicate they have finished with it.
  889 */
  890struct net_device *netdev_get_by_index(struct net *net, int ifindex,
  891				       netdevice_tracker *tracker, gfp_t gfp)
  892{
  893	struct net_device *dev;
  894
  895	dev = dev_get_by_index(net, ifindex);
 
  896	if (dev)
  897		netdev_tracker_alloc(dev, tracker, gfp);
 
  898	return dev;
  899}
  900EXPORT_SYMBOL(netdev_get_by_index);
  901
  902/**
  903 *	dev_get_by_napi_id - find a device by napi_id
  904 *	@napi_id: ID of the NAPI struct
  905 *
  906 *	Search for an interface by NAPI ID. Returns %NULL if the device
  907 *	is not found or a pointer to the device. The device has not had
  908 *	its reference counter increased so the caller must be careful
  909 *	about locking. The caller must hold RCU lock.
  910 */
  911
  912struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  913{
  914	struct napi_struct *napi;
  915
  916	WARN_ON_ONCE(!rcu_read_lock_held());
  917
  918	if (napi_id < MIN_NAPI_ID)
  919		return NULL;
  920
  921	napi = napi_by_id(napi_id);
  922
  923	return napi ? napi->dev : NULL;
  924}
  925EXPORT_SYMBOL(dev_get_by_napi_id);
  926
  927/**
  928 *	netdev_get_name - get a netdevice name, knowing its ifindex.
  929 *	@net: network namespace
  930 *	@name: a pointer to the buffer where the name will be stored.
  931 *	@ifindex: the ifindex of the interface to get the name from.
 
 
 
 
  932 */
  933int netdev_get_name(struct net *net, char *name, int ifindex)
  934{
  935	struct net_device *dev;
  936	int ret;
  937
  938	down_read(&devnet_rename_sem);
 
  939	rcu_read_lock();
  940
  941	dev = dev_get_by_index_rcu(net, ifindex);
  942	if (!dev) {
  943		ret = -ENODEV;
  944		goto out;
  945	}
  946
  947	strcpy(name, dev->name);
  948
  949	ret = 0;
  950out:
  951	rcu_read_unlock();
  952	up_read(&devnet_rename_sem);
  953	return ret;
 
 
 
 
  954}
  955
  956/**
  957 *	dev_getbyhwaddr_rcu - find a device by its hardware address
  958 *	@net: the applicable net namespace
  959 *	@type: media type of device
  960 *	@ha: hardware address
  961 *
  962 *	Search for an interface by MAC address. Returns NULL if the device
  963 *	is not found or a pointer to the device.
  964 *	The caller must hold RCU or RTNL.
  965 *	The returned device has not had its ref count increased
  966 *	and the caller must therefore be careful about locking
  967 *
  968 */
  969
  970struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
  971				       const char *ha)
  972{
  973	struct net_device *dev;
  974
  975	for_each_netdev_rcu(net, dev)
  976		if (dev->type == type &&
  977		    !memcmp(dev->dev_addr, ha, dev->addr_len))
  978			return dev;
  979
  980	return NULL;
  981}
  982EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
  983
 
 
 
 
 
 
 
 
 
 
 
 
 
  984struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
  985{
  986	struct net_device *dev, *ret = NULL;
  987
  988	rcu_read_lock();
  989	for_each_netdev_rcu(net, dev)
  990		if (dev->type == type) {
  991			dev_hold(dev);
  992			ret = dev;
  993			break;
  994		}
  995	rcu_read_unlock();
  996	return ret;
  997}
  998EXPORT_SYMBOL(dev_getfirstbyhwtype);
  999
 1000/**
 1001 *	__dev_get_by_flags - find any device with given flags
 1002 *	@net: the applicable net namespace
 1003 *	@if_flags: IFF_* values
 1004 *	@mask: bitmask of bits in if_flags to check
 1005 *
 1006 *	Search for any interface with the given flags. Returns NULL if a device
 1007 *	is not found or a pointer to the device. Must be called inside
 1008 *	rtnl_lock(), and result refcount is unchanged.
 1009 */
 1010
 1011struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1012				      unsigned short mask)
 1013{
 1014	struct net_device *dev, *ret;
 1015
 1016	ASSERT_RTNL();
 1017
 1018	ret = NULL;
 1019	for_each_netdev(net, dev) {
 1020		if (((dev->flags ^ if_flags) & mask) == 0) {
 1021			ret = dev;
 1022			break;
 1023		}
 1024	}
 1025	return ret;
 1026}
 1027EXPORT_SYMBOL(__dev_get_by_flags);
 1028
 1029/**
 1030 *	dev_valid_name - check if name is okay for network device
 1031 *	@name: name string
 1032 *
 1033 *	Network device names need to be valid file names to
 1034 *	allow sysfs to work.  We also disallow any kind of
 1035 *	whitespace.
 1036 */
 1037bool dev_valid_name(const char *name)
 1038{
 1039	if (*name == '\0')
 1040		return false;
 1041	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1042		return false;
 1043	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1044		return false;
 1045
 1046	while (*name) {
 1047		if (*name == '/' || *name == ':' || isspace(*name))
 1048			return false;
 1049		name++;
 1050	}
 1051	return true;
 1052}
 1053EXPORT_SYMBOL(dev_valid_name);
 1054
 1055/**
 1056 *	__dev_alloc_name - allocate a name for a device
 1057 *	@net: network namespace to allocate the device name in
 1058 *	@name: name format string
 1059 *	@res: result name string
 1060 *
 1061 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1062 *	id. It scans list of devices to build up a free map, then chooses
 1063 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1064 *	while allocating the name and adding the device in order to avoid
 1065 *	duplicates.
 1066 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1067 *	Returns the number of the unit assigned or a negative errno code.
 1068 */
 1069
 1070static int __dev_alloc_name(struct net *net, const char *name, char *res)
 1071{
 1072	int i = 0;
 1073	const char *p;
 1074	const int max_netdevices = 8*PAGE_SIZE;
 1075	unsigned long *inuse;
 1076	struct net_device *d;
 1077	char buf[IFNAMSIZ];
 1078
 1079	/* Verify the string as this thing may have come from the user.
 1080	 * There must be one "%d" and no other "%" characters.
 1081	 */
 1082	p = strchr(name, '%');
 1083	if (!p || p[1] != 'd' || strchr(p + 2, '%'))
 1084		return -EINVAL;
 1085
 1086	/* Use one page as a bit array of possible slots */
 1087	inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
 1088	if (!inuse)
 1089		return -ENOMEM;
 
 
 
 
 
 1090
 1091	for_each_netdev(net, d) {
 1092		struct netdev_name_node *name_node;
 
 
 1093
 1094		netdev_for_each_altname(d, name_node) {
 1095			if (!sscanf(name_node->name, name, &i))
 1096				continue;
 1097			if (i < 0 || i >= max_netdevices)
 1098				continue;
 1099
 1100			/* avoid cases where sscanf is not exact inverse of printf */
 1101			snprintf(buf, IFNAMSIZ, name, i);
 1102			if (!strncmp(buf, name_node->name, IFNAMSIZ))
 1103				__set_bit(i, inuse);
 1104		}
 1105		if (!sscanf(d->name, name, &i))
 1106			continue;
 1107		if (i < 0 || i >= max_netdevices)
 1108			continue;
 1109
 1110		/* avoid cases where sscanf is not exact inverse of printf */
 1111		snprintf(buf, IFNAMSIZ, name, i);
 1112		if (!strncmp(buf, d->name, IFNAMSIZ))
 1113			__set_bit(i, inuse);
 1114	}
 1115
 1116	i = find_first_zero_bit(inuse, max_netdevices);
 1117	bitmap_free(inuse);
 1118	if (i == max_netdevices)
 1119		return -ENFILE;
 1120
 1121	/* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
 1122	strscpy(buf, name, IFNAMSIZ);
 1123	snprintf(res, IFNAMSIZ, buf, i);
 1124	return i;
 1125}
 1126
 1127/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
 1128static int dev_prep_valid_name(struct net *net, struct net_device *dev,
 1129			       const char *want_name, char *out_name,
 1130			       int dup_errno)
 1131{
 1132	if (!dev_valid_name(want_name))
 1133		return -EINVAL;
 1134
 1135	if (strchr(want_name, '%'))
 1136		return __dev_alloc_name(net, want_name, out_name);
 1137
 1138	if (netdev_name_in_use(net, want_name))
 1139		return -dup_errno;
 1140	if (out_name != want_name)
 1141		strscpy(out_name, want_name, IFNAMSIZ);
 1142	return 0;
 1143}
 1144
 1145/**
 1146 *	dev_alloc_name - allocate a name for a device
 1147 *	@dev: device
 1148 *	@name: name format string
 1149 *
 1150 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1151 *	id. It scans list of devices to build up a free map, then chooses
 1152 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1153 *	while allocating the name and adding the device in order to avoid
 1154 *	duplicates.
 1155 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1156 *	Returns the number of the unit assigned or a negative errno code.
 1157 */
 1158
 1159int dev_alloc_name(struct net_device *dev, const char *name)
 1160{
 1161	return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
 
 
 
 
 
 
 
 
 
 1162}
 1163EXPORT_SYMBOL(dev_alloc_name);
 1164
 1165static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1166			      const char *name)
 
 1167{
 
 1168	int ret;
 1169
 1170	ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
 1171	return ret < 0 ? ret : 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 1172}
 1173
 1174/**
 1175 *	dev_change_name - change name of a device
 1176 *	@dev: device
 1177 *	@newname: name (or format string) must be at least IFNAMSIZ
 1178 *
 1179 *	Change name of a device, can pass format strings "eth%d".
 1180 *	for wildcarding.
 1181 */
 1182int dev_change_name(struct net_device *dev, const char *newname)
 1183{
 1184	unsigned char old_assign_type;
 1185	char oldname[IFNAMSIZ];
 1186	int err = 0;
 1187	int ret;
 1188	struct net *net;
 1189
 1190	ASSERT_RTNL();
 1191	BUG_ON(!dev_net(dev));
 1192
 1193	net = dev_net(dev);
 
 
 1194
 1195	down_write(&devnet_rename_sem);
 1196
 1197	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1198		up_write(&devnet_rename_sem);
 1199		return 0;
 1200	}
 1201
 1202	memcpy(oldname, dev->name, IFNAMSIZ);
 1203
 1204	err = dev_get_valid_name(net, dev, newname);
 1205	if (err < 0) {
 1206		up_write(&devnet_rename_sem);
 1207		return err;
 1208	}
 1209
 1210	if (oldname[0] && !strchr(oldname, '%'))
 1211		netdev_info(dev, "renamed from %s%s\n", oldname,
 1212			    dev->flags & IFF_UP ? " (while UP)" : "");
 1213
 1214	old_assign_type = dev->name_assign_type;
 1215	dev->name_assign_type = NET_NAME_RENAMED;
 1216
 1217rollback:
 1218	ret = device_rename(&dev->dev, dev->name);
 1219	if (ret) {
 1220		memcpy(dev->name, oldname, IFNAMSIZ);
 1221		dev->name_assign_type = old_assign_type;
 1222		up_write(&devnet_rename_sem);
 1223		return ret;
 1224	}
 1225
 1226	up_write(&devnet_rename_sem);
 1227
 1228	netdev_adjacent_rename_links(dev, oldname);
 1229
 1230	write_lock(&dev_base_lock);
 1231	netdev_name_node_del(dev->name_node);
 1232	write_unlock(&dev_base_lock);
 1233
 1234	synchronize_rcu();
 1235
 1236	write_lock(&dev_base_lock);
 1237	netdev_name_node_add(net, dev->name_node);
 1238	write_unlock(&dev_base_lock);
 1239
 1240	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1241	ret = notifier_to_errno(ret);
 1242
 1243	if (ret) {
 1244		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1245		if (err >= 0) {
 1246			err = ret;
 1247			down_write(&devnet_rename_sem);
 1248			memcpy(dev->name, oldname, IFNAMSIZ);
 1249			memcpy(oldname, newname, IFNAMSIZ);
 1250			dev->name_assign_type = old_assign_type;
 1251			old_assign_type = NET_NAME_RENAMED;
 1252			goto rollback;
 1253		} else {
 1254			netdev_err(dev, "name change rollback failed: %d\n",
 1255				   ret);
 1256		}
 1257	}
 1258
 1259	return err;
 1260}
 1261
 1262/**
 1263 *	dev_set_alias - change ifalias of a device
 1264 *	@dev: device
 1265 *	@alias: name up to IFALIASZ
 1266 *	@len: limit of bytes to copy from info
 1267 *
 1268 *	Set ifalias for a device,
 1269 */
 1270int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1271{
 1272	struct dev_ifalias *new_alias = NULL;
 
 
 1273
 1274	if (len >= IFALIASZ)
 1275		return -EINVAL;
 1276
 1277	if (len) {
 1278		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1279		if (!new_alias)
 1280			return -ENOMEM;
 1281
 1282		memcpy(new_alias->ifalias, alias, len);
 1283		new_alias->ifalias[len] = 0;
 1284	}
 1285
 1286	mutex_lock(&ifalias_mutex);
 1287	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1288					mutex_is_locked(&ifalias_mutex));
 1289	mutex_unlock(&ifalias_mutex);
 1290
 1291	if (new_alias)
 1292		kfree_rcu(new_alias, rcuhead);
 1293
 
 1294	return len;
 1295}
 1296EXPORT_SYMBOL(dev_set_alias);
 1297
 1298/**
 1299 *	dev_get_alias - get ifalias of a device
 1300 *	@dev: device
 1301 *	@name: buffer to store name of ifalias
 1302 *	@len: size of buffer
 1303 *
 1304 *	get ifalias for a device.  Caller must make sure dev cannot go
 1305 *	away,  e.g. rcu read lock or own a reference count to device.
 1306 */
 1307int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1308{
 1309	const struct dev_ifalias *alias;
 1310	int ret = 0;
 1311
 1312	rcu_read_lock();
 1313	alias = rcu_dereference(dev->ifalias);
 1314	if (alias)
 1315		ret = snprintf(name, len, "%s", alias->ifalias);
 1316	rcu_read_unlock();
 1317
 1318	return ret;
 1319}
 1320
 1321/**
 1322 *	netdev_features_change - device changes features
 1323 *	@dev: device to cause notification
 1324 *
 1325 *	Called to indicate a device has changed features.
 1326 */
 1327void netdev_features_change(struct net_device *dev)
 1328{
 1329	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1330}
 1331EXPORT_SYMBOL(netdev_features_change);
 1332
 1333/**
 1334 *	netdev_state_change - device changes state
 1335 *	@dev: device to cause notification
 1336 *
 1337 *	Called to indicate a device has changed state. This function calls
 1338 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1339 *	to the routing socket.
 1340 */
 1341void netdev_state_change(struct net_device *dev)
 1342{
 1343	if (dev->flags & IFF_UP) {
 1344		struct netdev_notifier_change_info change_info = {
 1345			.info.dev = dev,
 1346		};
 1347
 1348		call_netdevice_notifiers_info(NETDEV_CHANGE,
 
 1349					      &change_info.info);
 1350		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
 1351	}
 1352}
 1353EXPORT_SYMBOL(netdev_state_change);
 1354
 1355/**
 1356 * __netdev_notify_peers - notify network peers about existence of @dev,
 1357 * to be called when rtnl lock is already held.
 1358 * @dev: network device
 1359 *
 1360 * Generate traffic such that interested network peers are aware of
 1361 * @dev, such as by generating a gratuitous ARP. This may be used when
 1362 * a device wants to inform the rest of the network about some sort of
 1363 * reconfiguration such as a failover event or virtual machine
 1364 * migration.
 1365 */
 1366void __netdev_notify_peers(struct net_device *dev)
 1367{
 1368	ASSERT_RTNL();
 1369	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1370	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1371}
 1372EXPORT_SYMBOL(__netdev_notify_peers);
 1373
 1374/**
 1375 * netdev_notify_peers - notify network peers about existence of @dev
 1376 * @dev: network device
 1377 *
 1378 * Generate traffic such that interested network peers are aware of
 1379 * @dev, such as by generating a gratuitous ARP. This may be used when
 1380 * a device wants to inform the rest of the network about some sort of
 1381 * reconfiguration such as a failover event or virtual machine
 1382 * migration.
 1383 */
 1384void netdev_notify_peers(struct net_device *dev)
 1385{
 1386	rtnl_lock();
 1387	__netdev_notify_peers(dev);
 1388	rtnl_unlock();
 1389}
 1390EXPORT_SYMBOL(netdev_notify_peers);
 1391
 1392static int napi_threaded_poll(void *data);
 1393
 1394static int napi_kthread_create(struct napi_struct *n)
 1395{
 1396	int err = 0;
 1397
 1398	/* Create and wake up the kthread once to put it in
 1399	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
 1400	 * warning and work with loadavg.
 1401	 */
 1402	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
 1403				n->dev->name, n->napi_id);
 1404	if (IS_ERR(n->thread)) {
 1405		err = PTR_ERR(n->thread);
 1406		pr_err("kthread_run failed with err %d\n", err);
 1407		n->thread = NULL;
 1408	}
 1409
 1410	return err;
 1411}
 1412
 1413static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1414{
 1415	const struct net_device_ops *ops = dev->netdev_ops;
 1416	int ret;
 1417
 1418	ASSERT_RTNL();
 1419	dev_addr_check(dev);
 1420
 1421	if (!netif_device_present(dev)) {
 1422		/* may be detached because parent is runtime-suspended */
 1423		if (dev->dev.parent)
 1424			pm_runtime_resume(dev->dev.parent);
 1425		if (!netif_device_present(dev))
 1426			return -ENODEV;
 1427	}
 1428
 1429	/* Block netpoll from trying to do any rx path servicing.
 1430	 * If we don't do this there is a chance ndo_poll_controller
 1431	 * or ndo_poll may be running while we open the device
 1432	 */
 1433	netpoll_poll_disable(dev);
 1434
 1435	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1436	ret = notifier_to_errno(ret);
 1437	if (ret)
 1438		return ret;
 1439
 1440	set_bit(__LINK_STATE_START, &dev->state);
 1441
 1442	if (ops->ndo_validate_addr)
 1443		ret = ops->ndo_validate_addr(dev);
 1444
 1445	if (!ret && ops->ndo_open)
 1446		ret = ops->ndo_open(dev);
 1447
 1448	netpoll_poll_enable(dev);
 1449
 1450	if (ret)
 1451		clear_bit(__LINK_STATE_START, &dev->state);
 1452	else {
 1453		dev->flags |= IFF_UP;
 1454		dev_set_rx_mode(dev);
 1455		dev_activate(dev);
 1456		add_device_randomness(dev->dev_addr, dev->addr_len);
 1457	}
 1458
 1459	return ret;
 1460}
 1461
 1462/**
 1463 *	dev_open	- prepare an interface for use.
 1464 *	@dev: device to open
 1465 *	@extack: netlink extended ack
 1466 *
 1467 *	Takes a device from down to up state. The device's private open
 1468 *	function is invoked and then the multicast lists are loaded. Finally
 1469 *	the device is moved into the up state and a %NETDEV_UP message is
 1470 *	sent to the netdev notifier chain.
 1471 *
 1472 *	Calling this function on an active interface is a nop. On a failure
 1473 *	a negative errno code is returned.
 1474 */
 1475int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1476{
 1477	int ret;
 1478
 1479	if (dev->flags & IFF_UP)
 1480		return 0;
 1481
 1482	ret = __dev_open(dev, extack);
 1483	if (ret < 0)
 1484		return ret;
 1485
 1486	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 1487	call_netdevice_notifiers(NETDEV_UP, dev);
 1488
 1489	return ret;
 1490}
 1491EXPORT_SYMBOL(dev_open);
 1492
 1493static void __dev_close_many(struct list_head *head)
 1494{
 1495	struct net_device *dev;
 1496
 1497	ASSERT_RTNL();
 1498	might_sleep();
 1499
 1500	list_for_each_entry(dev, head, close_list) {
 1501		/* Temporarily disable netpoll until the interface is down */
 1502		netpoll_poll_disable(dev);
 1503
 1504		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1505
 1506		clear_bit(__LINK_STATE_START, &dev->state);
 1507
 1508		/* Synchronize to scheduled poll. We cannot touch poll list, it
 1509		 * can be even on different cpu. So just clear netif_running().
 1510		 *
 1511		 * dev->stop() will invoke napi_disable() on all of it's
 1512		 * napi_struct instances on this device.
 1513		 */
 1514		smp_mb__after_atomic(); /* Commit netif_running(). */
 1515	}
 1516
 1517	dev_deactivate_many(head);
 1518
 1519	list_for_each_entry(dev, head, close_list) {
 1520		const struct net_device_ops *ops = dev->netdev_ops;
 1521
 1522		/*
 1523		 *	Call the device specific close. This cannot fail.
 1524		 *	Only if device is UP
 1525		 *
 1526		 *	We allow it to be called even after a DETACH hot-plug
 1527		 *	event.
 1528		 */
 1529		if (ops->ndo_stop)
 1530			ops->ndo_stop(dev);
 1531
 1532		dev->flags &= ~IFF_UP;
 1533		netpoll_poll_enable(dev);
 1534	}
 
 
 1535}
 1536
 1537static void __dev_close(struct net_device *dev)
 1538{
 
 1539	LIST_HEAD(single);
 1540
 1541	list_add(&dev->close_list, &single);
 1542	__dev_close_many(&single);
 1543	list_del(&single);
 
 
 1544}
 1545
 1546void dev_close_many(struct list_head *head, bool unlink)
 1547{
 1548	struct net_device *dev, *tmp;
 1549
 1550	/* Remove the devices that don't need to be closed */
 1551	list_for_each_entry_safe(dev, tmp, head, close_list)
 1552		if (!(dev->flags & IFF_UP))
 1553			list_del_init(&dev->close_list);
 1554
 1555	__dev_close_many(head);
 1556
 1557	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1558		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 1559		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1560		if (unlink)
 1561			list_del_init(&dev->close_list);
 1562	}
 
 
 1563}
 1564EXPORT_SYMBOL(dev_close_many);
 1565
 1566/**
 1567 *	dev_close - shutdown an interface.
 1568 *	@dev: device to shutdown
 1569 *
 1570 *	This function moves an active device into down state. A
 1571 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1572 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1573 *	chain.
 1574 */
 1575void dev_close(struct net_device *dev)
 1576{
 1577	if (dev->flags & IFF_UP) {
 1578		LIST_HEAD(single);
 1579
 1580		list_add(&dev->close_list, &single);
 1581		dev_close_many(&single, true);
 1582		list_del(&single);
 1583	}
 
 1584}
 1585EXPORT_SYMBOL(dev_close);
 1586
 1587
 1588/**
 1589 *	dev_disable_lro - disable Large Receive Offload on a device
 1590 *	@dev: device
 1591 *
 1592 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1593 *	called under RTNL.  This is needed if received packets may be
 1594 *	forwarded to another interface.
 1595 */
 1596void dev_disable_lro(struct net_device *dev)
 1597{
 1598	struct net_device *lower_dev;
 1599	struct list_head *iter;
 1600
 1601	dev->wanted_features &= ~NETIF_F_LRO;
 1602	netdev_update_features(dev);
 1603
 1604	if (unlikely(dev->features & NETIF_F_LRO))
 1605		netdev_WARN(dev, "failed to disable LRO!\n");
 1606
 1607	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1608		dev_disable_lro(lower_dev);
 1609}
 1610EXPORT_SYMBOL(dev_disable_lro);
 1611
 1612/**
 1613 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1614 *	@dev: device
 1615 *
 1616 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1617 *	called under RTNL.  This is needed if Generic XDP is installed on
 1618 *	the device.
 1619 */
 1620static void dev_disable_gro_hw(struct net_device *dev)
 1621{
 1622	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1623	netdev_update_features(dev);
 1624
 1625	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1626		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1627}
 1628
 1629const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1630{
 1631#define N(val) 						\
 1632	case NETDEV_##val:				\
 1633		return "NETDEV_" __stringify(val);
 1634	switch (cmd) {
 1635	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1636	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1637	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1638	N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
 1639	N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
 1640	N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
 1641	N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1642	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1643	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1644	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 1645	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
 1646	N(XDP_FEAT_CHANGE)
 1647	}
 1648#undef N
 1649	return "UNKNOWN_NETDEV_EVENT";
 1650}
 1651EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1652
 1653static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1654				   struct net_device *dev)
 1655{
 1656	struct netdev_notifier_info info = {
 1657		.dev = dev,
 1658	};
 1659
 
 1660	return nb->notifier_call(nb, val, &info);
 1661}
 1662
 1663static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1664					     struct net_device *dev)
 1665{
 1666	int err;
 1667
 1668	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1669	err = notifier_to_errno(err);
 1670	if (err)
 1671		return err;
 1672
 1673	if (!(dev->flags & IFF_UP))
 1674		return 0;
 1675
 1676	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1677	return 0;
 1678}
 1679
 1680static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1681						struct net_device *dev)
 1682{
 1683	if (dev->flags & IFF_UP) {
 1684		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1685					dev);
 1686		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1687	}
 1688	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1689}
 1690
 1691static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1692						 struct net *net)
 1693{
 1694	struct net_device *dev;
 1695	int err;
 1696
 1697	for_each_netdev(net, dev) {
 1698		err = call_netdevice_register_notifiers(nb, dev);
 1699		if (err)
 1700			goto rollback;
 1701	}
 1702	return 0;
 1703
 1704rollback:
 1705	for_each_netdev_continue_reverse(net, dev)
 1706		call_netdevice_unregister_notifiers(nb, dev);
 1707	return err;
 1708}
 1709
 1710static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1711						    struct net *net)
 1712{
 1713	struct net_device *dev;
 1714
 1715	for_each_netdev(net, dev)
 1716		call_netdevice_unregister_notifiers(nb, dev);
 1717}
 1718
 1719static int dev_boot_phase = 1;
 1720
 1721/**
 1722 * register_netdevice_notifier - register a network notifier block
 1723 * @nb: notifier
 1724 *
 1725 * Register a notifier to be called when network device events occur.
 1726 * The notifier passed is linked into the kernel structures and must
 1727 * not be reused until it has been unregistered. A negative errno code
 1728 * is returned on a failure.
 1729 *
 1730 * When registered all registration and up events are replayed
 1731 * to the new notifier to allow device to have a race free
 1732 * view of the network device list.
 1733 */
 1734
 1735int register_netdevice_notifier(struct notifier_block *nb)
 1736{
 
 
 1737	struct net *net;
 1738	int err;
 1739
 1740	/* Close race with setup_net() and cleanup_net() */
 1741	down_write(&pernet_ops_rwsem);
 1742	rtnl_lock();
 1743	err = raw_notifier_chain_register(&netdev_chain, nb);
 1744	if (err)
 1745		goto unlock;
 1746	if (dev_boot_phase)
 1747		goto unlock;
 1748	for_each_net(net) {
 1749		err = call_netdevice_register_net_notifiers(nb, net);
 1750		if (err)
 1751			goto rollback;
 
 
 
 
 
 
 
 
 1752	}
 1753
 1754unlock:
 1755	rtnl_unlock();
 1756	up_write(&pernet_ops_rwsem);
 1757	return err;
 1758
 1759rollback:
 1760	for_each_net_continue_reverse(net)
 1761		call_netdevice_unregister_net_notifiers(nb, net);
 
 
 
 
 
 
 
 
 
 
 
 
 1762
 
 1763	raw_notifier_chain_unregister(&netdev_chain, nb);
 1764	goto unlock;
 1765}
 1766EXPORT_SYMBOL(register_netdevice_notifier);
 1767
 1768/**
 1769 * unregister_netdevice_notifier - unregister a network notifier block
 1770 * @nb: notifier
 1771 *
 1772 * Unregister a notifier previously registered by
 1773 * register_netdevice_notifier(). The notifier is unlinked into the
 1774 * kernel structures and may then be reused. A negative errno code
 1775 * is returned on a failure.
 1776 *
 1777 * After unregistering unregister and down device events are synthesized
 1778 * for all devices on the device list to the removed notifier to remove
 1779 * the need for special case cleanup code.
 1780 */
 1781
 1782int unregister_netdevice_notifier(struct notifier_block *nb)
 1783{
 
 1784	struct net *net;
 1785	int err;
 1786
 1787	/* Close race with setup_net() and cleanup_net() */
 1788	down_write(&pernet_ops_rwsem);
 1789	rtnl_lock();
 1790	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1791	if (err)
 1792		goto unlock;
 1793
 1794	for_each_net(net)
 1795		call_netdevice_unregister_net_notifiers(nb, net);
 1796
 
 
 
 
 
 
 
 1797unlock:
 1798	rtnl_unlock();
 1799	up_write(&pernet_ops_rwsem);
 1800	return err;
 1801}
 1802EXPORT_SYMBOL(unregister_netdevice_notifier);
 1803
 1804static int __register_netdevice_notifier_net(struct net *net,
 1805					     struct notifier_block *nb,
 1806					     bool ignore_call_fail)
 1807{
 1808	int err;
 1809
 1810	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1811	if (err)
 1812		return err;
 1813	if (dev_boot_phase)
 1814		return 0;
 1815
 1816	err = call_netdevice_register_net_notifiers(nb, net);
 1817	if (err && !ignore_call_fail)
 1818		goto chain_unregister;
 1819
 1820	return 0;
 1821
 1822chain_unregister:
 1823	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1824	return err;
 1825}
 1826
 1827static int __unregister_netdevice_notifier_net(struct net *net,
 1828					       struct notifier_block *nb)
 1829{
 1830	int err;
 1831
 1832	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1833	if (err)
 1834		return err;
 1835
 1836	call_netdevice_unregister_net_notifiers(nb, net);
 1837	return 0;
 1838}
 1839
 1840/**
 1841 * register_netdevice_notifier_net - register a per-netns network notifier block
 1842 * @net: network namespace
 1843 * @nb: notifier
 1844 *
 1845 * Register a notifier to be called when network device events occur.
 1846 * The notifier passed is linked into the kernel structures and must
 1847 * not be reused until it has been unregistered. A negative errno code
 1848 * is returned on a failure.
 1849 *
 1850 * When registered all registration and up events are replayed
 1851 * to the new notifier to allow device to have a race free
 1852 * view of the network device list.
 1853 */
 1854
 1855int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1856{
 1857	int err;
 1858
 1859	rtnl_lock();
 1860	err = __register_netdevice_notifier_net(net, nb, false);
 1861	rtnl_unlock();
 1862	return err;
 1863}
 1864EXPORT_SYMBOL(register_netdevice_notifier_net);
 1865
 1866/**
 1867 * unregister_netdevice_notifier_net - unregister a per-netns
 1868 *                                     network notifier block
 1869 * @net: network namespace
 1870 * @nb: notifier
 1871 *
 1872 * Unregister a notifier previously registered by
 1873 * register_netdevice_notifier_net(). The notifier is unlinked from the
 1874 * kernel structures and may then be reused. A negative errno code
 1875 * is returned on a failure.
 1876 *
 1877 * After unregistering unregister and down device events are synthesized
 1878 * for all devices on the device list to the removed notifier to remove
 1879 * the need for special case cleanup code.
 1880 */
 1881
 1882int unregister_netdevice_notifier_net(struct net *net,
 1883				      struct notifier_block *nb)
 1884{
 1885	int err;
 1886
 1887	rtnl_lock();
 1888	err = __unregister_netdevice_notifier_net(net, nb);
 1889	rtnl_unlock();
 1890	return err;
 1891}
 1892EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1893
 1894static void __move_netdevice_notifier_net(struct net *src_net,
 1895					  struct net *dst_net,
 1896					  struct notifier_block *nb)
 1897{
 1898	__unregister_netdevice_notifier_net(src_net, nb);
 1899	__register_netdevice_notifier_net(dst_net, nb, true);
 1900}
 1901
 1902int register_netdevice_notifier_dev_net(struct net_device *dev,
 1903					struct notifier_block *nb,
 1904					struct netdev_net_notifier *nn)
 1905{
 1906	int err;
 1907
 1908	rtnl_lock();
 1909	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1910	if (!err) {
 1911		nn->nb = nb;
 1912		list_add(&nn->list, &dev->net_notifier_list);
 1913	}
 1914	rtnl_unlock();
 1915	return err;
 1916}
 1917EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1918
 1919int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 1920					  struct notifier_block *nb,
 1921					  struct netdev_net_notifier *nn)
 1922{
 1923	int err;
 1924
 1925	rtnl_lock();
 1926	list_del(&nn->list);
 1927	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 1928	rtnl_unlock();
 1929	return err;
 1930}
 1931EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 1932
 1933static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 1934					     struct net *net)
 1935{
 1936	struct netdev_net_notifier *nn;
 1937
 1938	list_for_each_entry(nn, &dev->net_notifier_list, list)
 1939		__move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
 1940}
 1941
 1942/**
 1943 *	call_netdevice_notifiers_info - call all network notifier blocks
 1944 *	@val: value passed unmodified to notifier function
 
 1945 *	@info: notifier information data
 1946 *
 1947 *	Call all network notifier blocks.  Parameters and return value
 1948 *	are as for raw_notifier_call_chain().
 1949 */
 1950
 1951int call_netdevice_notifiers_info(unsigned long val,
 1952				  struct netdev_notifier_info *info)
 
 1953{
 1954	struct net *net = dev_net(info->dev);
 1955	int ret;
 1956
 1957	ASSERT_RTNL();
 1958
 1959	/* Run per-netns notifier block chain first, then run the global one.
 1960	 * Hopefully, one day, the global one is going to be removed after
 1961	 * all notifier block registrators get converted to be per-netns.
 1962	 */
 1963	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 1964	if (ret & NOTIFY_STOP_MASK)
 1965		return ret;
 1966	return raw_notifier_call_chain(&netdev_chain, val, info);
 1967}
 1968
 1969/**
 1970 *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 1971 *	                                       for and rollback on error
 1972 *	@val_up: value passed unmodified to notifier function
 1973 *	@val_down: value passed unmodified to the notifier function when
 1974 *	           recovering from an error on @val_up
 1975 *	@info: notifier information data
 1976 *
 1977 *	Call all per-netns network notifier blocks, but not notifier blocks on
 1978 *	the global notifier chain. Parameters and return value are as for
 1979 *	raw_notifier_call_chain_robust().
 1980 */
 1981
 1982static int
 1983call_netdevice_notifiers_info_robust(unsigned long val_up,
 1984				     unsigned long val_down,
 1985				     struct netdev_notifier_info *info)
 1986{
 1987	struct net *net = dev_net(info->dev);
 1988
 1989	ASSERT_RTNL();
 1990
 1991	return raw_notifier_call_chain_robust(&net->netdev_chain,
 1992					      val_up, val_down, info);
 1993}
 1994
 1995static int call_netdevice_notifiers_extack(unsigned long val,
 1996					   struct net_device *dev,
 1997					   struct netlink_ext_ack *extack)
 1998{
 1999	struct netdev_notifier_info info = {
 2000		.dev = dev,
 2001		.extack = extack,
 2002	};
 2003
 2004	return call_netdevice_notifiers_info(val, &info);
 2005}
 2006
 2007/**
 2008 *	call_netdevice_notifiers - call all network notifier blocks
 2009 *      @val: value passed unmodified to notifier function
 2010 *      @dev: net_device pointer passed unmodified to notifier function
 2011 *
 2012 *	Call all network notifier blocks.  Parameters and return value
 2013 *	are as for raw_notifier_call_chain().
 2014 */
 2015
 2016int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 2017{
 2018	return call_netdevice_notifiers_extack(val, dev, NULL);
 2019}
 2020EXPORT_SYMBOL(call_netdevice_notifiers);
 2021
 2022/**
 2023 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2024 *	@val: value passed unmodified to notifier function
 2025 *	@dev: net_device pointer passed unmodified to notifier function
 2026 *	@arg: additional u32 argument passed to the notifier function
 2027 *
 2028 *	Call all network notifier blocks.  Parameters and return value
 2029 *	are as for raw_notifier_call_chain().
 2030 */
 2031static int call_netdevice_notifiers_mtu(unsigned long val,
 2032					struct net_device *dev, u32 arg)
 2033{
 2034	struct netdev_notifier_info_ext info = {
 2035		.info.dev = dev,
 2036		.ext.mtu = arg,
 2037	};
 2038
 2039	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2040
 2041	return call_netdevice_notifiers_info(val, &info.info);
 2042}
 
 2043
#ifdef CONFIG_NET_INGRESS
/* Static key, initially false, counted up/down by the two helpers below. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Take one reference on ingress_needed_key. */
void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on ingress_needed_key. */
void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
 2059
#ifdef CONFIG_NET_EGRESS
/* Static key, initially false, counted up/down by the two helpers below. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Take one reference on egress_needed_key. */
void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on egress_needed_key. */
void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
 2075
 2076DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2077EXPORT_SYMBOL(netstamp_needed_key);
 2078#ifdef CONFIG_JUMP_LABEL
 2079static atomic_t netstamp_needed_deferred;
 2080static atomic_t netstamp_wanted;
 2081static void netstamp_clear(struct work_struct *work)
 2082{
 2083	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2084	int wanted;
 2085
 2086	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2087	if (wanted > 0)
 2088		static_branch_enable(&netstamp_needed_key);
 2089	else
 2090		static_branch_disable(&netstamp_needed_key);
 2091}
 2092static DECLARE_WORK(netstamp_work, netstamp_clear);
 2093#endif
 2094
 2095void net_enable_timestamp(void)
 2096{
 2097#ifdef CONFIG_JUMP_LABEL
 2098	int wanted = atomic_read(&netstamp_wanted);
 2099
 2100	while (wanted > 0) {
 2101		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
 
 
 
 2102			return;
 2103	}
 2104	atomic_inc(&netstamp_needed_deferred);
 2105	schedule_work(&netstamp_work);
 2106#else
 2107	static_branch_inc(&netstamp_needed_key);
 2108#endif
 2109}
 2110EXPORT_SYMBOL(net_enable_timestamp);
 2111
 2112void net_disable_timestamp(void)
 2113{
 2114#ifdef CONFIG_JUMP_LABEL
 2115	int wanted = atomic_read(&netstamp_wanted);
 2116
 2117	while (wanted > 1) {
 2118		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
 
 
 
 2119			return;
 2120	}
 2121	atomic_dec(&netstamp_needed_deferred);
 2122	schedule_work(&netstamp_work);
 2123#else
 2124	static_branch_dec(&netstamp_needed_key);
 2125#endif
 2126}
 2127EXPORT_SYMBOL(net_disable_timestamp);
 2128
 2129static inline void net_timestamp_set(struct sk_buff *skb)
 2130{
 2131	skb->tstamp = 0;
 2132	skb->mono_delivery_time = 0;
 2133	if (static_branch_unlikely(&netstamp_needed_key))
 2134		skb->tstamp = ktime_get_real();
 2135}
 2136
 2137#define net_timestamp_check(COND, SKB)				\
 2138	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2139		if ((COND) && !(SKB)->tstamp)			\
 2140			(SKB)->tstamp = ktime_get_real();	\
 2141	}							\
 2142
 2143bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 2144{
 2145	return __is_skb_forwardable(dev, skb, true);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2146}
 2147EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2148
 2149static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
 2150			      bool check_mtu)
 2151{
 2152	int ret = ____dev_forward_skb(dev, skb, check_mtu);
 2153
 2154	if (likely(!ret)) {
 2155		skb->protocol = eth_type_trans(skb, dev);
 2156		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2157	}
 2158
 2159	return ret;
 2160}
 2161
 2162int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2163{
 2164	return __dev_forward_skb2(dev, skb, true);
 2165}
 2166EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2167
 2168/**
 2169 * dev_forward_skb - loopback an skb to another netif
 2170 *
 2171 * @dev: destination network device
 2172 * @skb: buffer to forward
 2173 *
 2174 * return values:
 2175 *	NET_RX_SUCCESS	(no congestion)
 2176 *	NET_RX_DROP     (packet was dropped, but freed)
 2177 *
 2178 * dev_forward_skb can be used for injecting an skb from the
 2179 * start_xmit function of one device into the receive queue
 2180 * of another device.
 2181 *
 2182 * The receiving device may be in another namespace, so
 2183 * we have to clear all information in the skb that could
 2184 * impact namespace isolation.
 2185 */
 2186int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2187{
 2188	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 2189}
 2190EXPORT_SYMBOL_GPL(dev_forward_skb);
 2191
 2192int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
 2193{
 2194	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
 2195}
 2196
 2197static inline int deliver_skb(struct sk_buff *skb,
 2198			      struct packet_type *pt_prev,
 2199			      struct net_device *orig_dev)
 2200{
 2201	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2202		return -ENOMEM;
 2203	refcount_inc(&skb->users);
 2204	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2205}
 2206
 2207static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2208					  struct packet_type **pt,
 2209					  struct net_device *orig_dev,
 2210					  __be16 type,
 2211					  struct list_head *ptype_list)
 2212{
 2213	struct packet_type *ptype, *pt_prev = *pt;
 2214
 2215	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2216		if (ptype->type != type)
 2217			continue;
 2218		if (pt_prev)
 2219			deliver_skb(skb, pt_prev, orig_dev);
 2220		pt_prev = ptype;
 2221	}
 2222	*pt = pt_prev;
 2223}
 2224
 2225static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2226{
 2227	if (!ptype->af_packet_priv || !skb->sk)
 2228		return false;
 2229
 2230	if (ptype->id_match)
 2231		return ptype->id_match(ptype, skb->sk);
 2232	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2233		return true;
 2234
 2235	return false;
 2236}
 2237
 2238/**
 2239 * dev_nit_active - return true if any network interface taps are in use
 2240 *
 2241 * @dev: network device to check for the presence of taps
 2242 */
 2243bool dev_nit_active(struct net_device *dev)
 2244{
 2245	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 2246}
 2247EXPORT_SYMBOL_GPL(dev_nit_active);
 2248
 2249/*
 2250 *	Support routine. Sends outgoing frames to any network
 2251 *	taps currently in use.
 2252 */
 2253
 2254void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 2255{
 2256	struct packet_type *ptype;
 2257	struct sk_buff *skb2 = NULL;
 2258	struct packet_type *pt_prev = NULL;
 2259	struct list_head *ptype_list = &ptype_all;
 2260
 2261	rcu_read_lock();
 2262again:
 2263	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2264		if (ptype->ignore_outgoing)
 2265			continue;
 2266
 2267		/* Never send packets back to the socket
 2268		 * they originated from - MvS (miquels@drinkel.ow.org)
 2269		 */
 2270		if (skb_loop_sk(ptype, skb))
 2271			continue;
 2272
 2273		if (pt_prev) {
 2274			deliver_skb(skb2, pt_prev, skb->dev);
 2275			pt_prev = ptype;
 2276			continue;
 2277		}
 2278
 2279		/* need to clone skb, done only once */
 2280		skb2 = skb_clone(skb, GFP_ATOMIC);
 2281		if (!skb2)
 2282			goto out_unlock;
 2283
 2284		net_timestamp_set(skb2);
 2285
 2286		/* skb->nh should be correctly
 2287		 * set by sender, so that the second statement is
 2288		 * just protection against buggy protocols.
 2289		 */
 2290		skb_reset_mac_header(skb2);
 2291
 2292		if (skb_network_header(skb2) < skb2->data ||
 2293		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 2294			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 2295					     ntohs(skb2->protocol),
 2296					     dev->name);
 2297			skb_reset_network_header(skb2);
 2298		}
 2299
 2300		skb2->transport_header = skb2->network_header;
 2301		skb2->pkt_type = PACKET_OUTGOING;
 2302		pt_prev = ptype;
 2303	}
 2304
 2305	if (ptype_list == &ptype_all) {
 2306		ptype_list = &dev->ptype_all;
 2307		goto again;
 2308	}
 2309out_unlock:
 2310	if (pt_prev) {
 2311		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 2312			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 2313		else
 2314			kfree_skb(skb2);
 2315	}
 2316	rcu_read_unlock();
 2317}
 2318EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2319
 2320/**
 2321 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2322 * @dev: Network device
 2323 * @txq: number of queues available
 2324 *
 2325 * If real_num_tx_queues is changed the tc mappings may no longer be
 2326 * valid. To resolve this verify the tc mapping remains valid and if
 2327 * not NULL the mapping. With no priorities mapping to this
 2328 * offset/count pair it will no longer be used. In the worst case TC0
 2329 * is invalid nothing can be done so disable priority mappings. If is
 2330 * expected that drivers will fix this mapping if they can before
 2331 * calling netif_set_real_num_tx_queues.
 2332 */
 2333static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2334{
 2335	int i;
 2336	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2337
 2338	/* If TC0 is invalidated disable TC mapping */
 2339	if (tc->offset + tc->count > txq) {
 2340		netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2341		dev->num_tc = 0;
 2342		return;
 2343	}
 2344
 2345	/* Invalidated prio to tc mappings set to TC0 */
 2346	for (i = 1; i < TC_BITMASK + 1; i++) {
 2347		int q = netdev_get_prio_tc_map(dev, i);
 2348
 2349		tc = &dev->tc_to_txq[q];
 2350		if (tc->offset + tc->count > txq) {
 2351			netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2352				    i, q);
 2353			netdev_set_prio_tc_map(dev, i, 0);
 2354		}
 2355	}
 2356}
 2357
 2358int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2359{
 2360	if (dev->num_tc) {
 2361		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2362		int i;
 2363
 2364		/* walk through the TCs and see if it falls into any of them */
 2365		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2366			if ((txq - tc->offset) < tc->count)
 2367				return i;
 2368		}
 2369
 2370		/* didn't find it, just return -1 to indicate no match */
 2371		return -1;
 2372	}
 2373
 2374	return 0;
 2375}
 2376EXPORT_SYMBOL(netdev_txq_to_tc);
 2377
 2378#ifdef CONFIG_XPS
/* Counted up when a device gets XPS maps and down when they are reset. */
static struct static_key xps_needed __read_mostly;
/* Same, counted only for maps of type XPS_RXQS. */
static struct static_key xps_rxqs_needed __read_mostly;
/* Held by the code in this file that reads or updates the XPS maps. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an XPS map pointer; lockdep checks xps_map_mutex is held. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2384
 2385static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2386			     struct xps_dev_maps *old_maps, int tci, u16 index)
 2387{
 2388	struct xps_map *map = NULL;
 2389	int pos;
 2390
 2391	map = xmap_dereference(dev_maps->attr_map[tci]);
 
 2392	if (!map)
 2393		return false;
 2394
 2395	for (pos = map->len; pos--;) {
 2396		if (map->queues[pos] != index)
 2397			continue;
 2398
 2399		if (map->len > 1) {
 2400			map->queues[pos] = map->queues[--map->len];
 2401			break;
 2402		}
 2403
 2404		if (old_maps)
 2405			RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
 2406		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2407		kfree_rcu(map, rcu);
 2408		return false;
 2409	}
 2410
 2411	return true;
 2412}
 2413
 2414static bool remove_xps_queue_cpu(struct net_device *dev,
 2415				 struct xps_dev_maps *dev_maps,
 2416				 int cpu, u16 offset, u16 count)
 2417{
 2418	int num_tc = dev_maps->num_tc;
 2419	bool active = false;
 2420	int tci;
 2421
 2422	for (tci = cpu * num_tc; num_tc--; tci++) {
 2423		int i, j;
 2424
 2425		for (i = count, j = offset; i--; j++) {
 2426			if (!remove_xps_queue(dev_maps, NULL, tci, j))
 2427				break;
 2428		}
 2429
 2430		active |= i < 0;
 2431	}
 2432
 2433	return active;
 2434}
 2435
 2436static void reset_xps_maps(struct net_device *dev,
 2437			   struct xps_dev_maps *dev_maps,
 2438			   enum xps_map_type type)
 2439{
 2440	static_key_slow_dec_cpuslocked(&xps_needed);
 2441	if (type == XPS_RXQS)
 2442		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2443
 2444	RCU_INIT_POINTER(dev->xps_maps[type], NULL);
 2445
 2446	kfree_rcu(dev_maps, rcu);
 2447}
 2448
 2449static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
 2450			   u16 offset, u16 count)
 2451{
 2452	struct xps_dev_maps *dev_maps;
 
 2453	bool active = false;
 2454	int i, j;
 2455
 2456	dev_maps = xmap_dereference(dev->xps_maps[type]);
 
 
 2457	if (!dev_maps)
 2458		return;
 2459
 2460	for (j = 0; j < dev_maps->nr_ids; j++)
 2461		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
 2462	if (!active)
 2463		reset_xps_maps(dev, dev_maps, type);
 2464
 2465	if (type == XPS_CPUS) {
 2466		for (i = offset + (count - 1); count--; i--)
 2467			netdev_queue_numa_node_write(
 2468				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
 2469	}
 2470}
 2471
 2472static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2473				   u16 count)
 2474{
 2475	if (!static_key_false(&xps_needed))
 2476		return;
 2477
 2478	cpus_read_lock();
 2479	mutex_lock(&xps_map_mutex);
 2480
 2481	if (static_key_false(&xps_rxqs_needed))
 2482		clean_xps_maps(dev, XPS_RXQS, offset, count);
 2483
 2484	clean_xps_maps(dev, XPS_CPUS, offset, count);
 2485
 
 2486	mutex_unlock(&xps_map_mutex);
 2487	cpus_read_unlock();
 2488}
 2489
 2490static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2491{
 2492	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2493}
 2494
 2495static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2496				      u16 index, bool is_rxqs_map)
 2497{
 2498	struct xps_map *new_map;
 2499	int alloc_len = XPS_MIN_MAP_ALLOC;
 2500	int i, pos;
 2501
 2502	for (pos = 0; map && pos < map->len; pos++) {
 2503		if (map->queues[pos] != index)
 2504			continue;
 2505		return map;
 2506	}
 2507
 2508	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2509	if (map) {
 2510		if (pos < map->alloc_len)
 2511			return map;
 2512
 2513		alloc_len = map->alloc_len * 2;
 2514	}
 2515
 2516	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2517	 *  map
 2518	 */
 2519	if (is_rxqs_map)
 2520		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2521	else
 2522		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2523				       cpu_to_node(attr_index));
 2524	if (!new_map)
 2525		return NULL;
 2526
 2527	for (i = 0; i < pos; i++)
 2528		new_map->queues[i] = map->queues[i];
 2529	new_map->alloc_len = alloc_len;
 2530	new_map->len = pos;
 2531
 2532	return new_map;
 2533}
 2534
 2535/* Copy xps maps at a given index */
 2536static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
 2537			      struct xps_dev_maps *new_dev_maps, int index,
 2538			      int tc, bool skip_tc)
 2539{
 2540	int i, tci = index * dev_maps->num_tc;
 2541	struct xps_map *map;
 2542
 2543	/* copy maps belonging to foreign traffic classes */
 2544	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 2545		if (i == tc && skip_tc)
 2546			continue;
 2547
 2548		/* fill in the new device map from the old device map */
 2549		map = xmap_dereference(dev_maps->attr_map[tci]);
 2550		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2551	}
 2552}
 2553
 2554/* Must be called under cpus_read_lock */
 2555int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 2556			  u16 index, enum xps_map_type type)
 2557{
 2558	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
 2559	const unsigned long *online_mask = NULL;
 2560	bool active = false, copy = false;
 2561	int i, j, tci, numa_node_id = -2;
 2562	int maps_sz, num_tc = 1, tc = 0;
 2563	struct xps_map *map, *new_map;
 2564	unsigned int nr_ids;
 2565
 2566	WARN_ON_ONCE(index >= dev->num_tx_queues);
 2567
 2568	if (dev->num_tc) {
 2569		/* Do not allow XPS on subordinate device directly */
 2570		num_tc = dev->num_tc;
 2571		if (num_tc < 0)
 2572			return -EINVAL;
 2573
 2574		/* If queue belongs to subordinate dev use its map */
 2575		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 2576
 2577		tc = netdev_txq_to_tc(dev, index);
 2578		if (tc < 0)
 2579			return -EINVAL;
 2580	}
 2581
 2582	mutex_lock(&xps_map_mutex);
 2583
 2584	dev_maps = xmap_dereference(dev->xps_maps[type]);
 2585	if (type == XPS_RXQS) {
 2586		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 2587		nr_ids = dev->num_rx_queues;
 2588	} else {
 2589		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 2590		if (num_possible_cpus() > 1)
 2591			online_mask = cpumask_bits(cpu_online_mask);
 2592		nr_ids = nr_cpu_ids;
 2593	}
 2594
 2595	if (maps_sz < L1_CACHE_BYTES)
 2596		maps_sz = L1_CACHE_BYTES;
 2597
 2598	/* The old dev_maps could be larger or smaller than the one we're
 2599	 * setting up now, as dev->num_tc or nr_ids could have been updated in
 2600	 * between. We could try to be smart, but let's be safe instead and only
 2601	 * copy foreign traffic classes if the two map sizes match.
 2602	 */
 2603	if (dev_maps &&
 2604	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
 2605		copy = true;
 2606
 2607	/* allocate memory for queue storage */
 2608	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
 2609	     j < nr_ids;) {
 2610		if (!new_dev_maps) {
 2611			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 2612			if (!new_dev_maps) {
 2613				mutex_unlock(&xps_map_mutex);
 2614				return -ENOMEM;
 2615			}
 2616
 2617			new_dev_maps->nr_ids = nr_ids;
 2618			new_dev_maps->num_tc = num_tc;
 2619		}
 2620
 2621		tci = j * num_tc + tc;
 2622		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
 
 2623
 2624		map = expand_xps_map(map, j, index, type == XPS_RXQS);
 2625		if (!map)
 2626			goto error;
 2627
 2628		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2629	}
 2630
 2631	if (!new_dev_maps)
 2632		goto out_no_new_maps;
 2633
 2634	if (!dev_maps) {
 2635		/* Increment static keys at most once per type */
 2636		static_key_slow_inc_cpuslocked(&xps_needed);
 2637		if (type == XPS_RXQS)
 2638			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 2639	}
 
 2640
 2641	for (j = 0; j < nr_ids; j++) {
 2642		bool skip_tc = false;
 2643
 2644		tci = j * num_tc + tc;
 2645		if (netif_attr_test_mask(j, mask, nr_ids) &&
 2646		    netif_attr_test_online(j, online_mask, nr_ids)) {
 2647			/* add tx-queue to CPU/rx-queue maps */
 2648			int pos = 0;
 2649
 2650			skip_tc = true;
 
 
 2651
 2652			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2653			while ((pos < map->len) && (map->queues[pos] != index))
 2654				pos++;
 2655
 2656			if (pos == map->len)
 2657				map->queues[map->len++] = index;
 2658#ifdef CONFIG_NUMA
 2659			if (type == XPS_CPUS) {
 2660				if (numa_node_id == -2)
 2661					numa_node_id = cpu_to_node(j);
 2662				else if (numa_node_id != cpu_to_node(j))
 2663					numa_node_id = -1;
 2664			}
 2665#endif
 
 
 
 
 2666		}
 2667
 2668		if (copy)
 2669			xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
 2670					  skip_tc);
 
 
 
 2671	}
 2672
 2673	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
 2674
 2675	/* Cleanup old maps */
 2676	if (!dev_maps)
 2677		goto out_no_old_maps;
 2678
 2679	for (j = 0; j < dev_maps->nr_ids; j++) {
 2680		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
 2681			map = xmap_dereference(dev_maps->attr_map[tci]);
 2682			if (!map)
 2683				continue;
 2684
 2685			if (copy) {
 2686				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2687				if (map == new_map)
 2688					continue;
 2689			}
 2690
 2691			RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2692			kfree_rcu(map, rcu);
 2693		}
 2694	}
 2695
 2696	old_dev_maps = dev_maps;
 2697
 2698out_no_old_maps:
 2699	dev_maps = new_dev_maps;
 2700	active = true;
 2701
 2702out_no_new_maps:
 2703	if (type == XPS_CPUS)
 2704		/* update Tx queue numa node */
 2705		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
 2706					     (numa_node_id >= 0) ?
 2707					     numa_node_id : NUMA_NO_NODE);
 2708
 2709	if (!dev_maps)
 2710		goto out_no_maps;
 2711
 2712	/* removes tx-queue from unused CPUs/rx-queues */
 2713	for (j = 0; j < dev_maps->nr_ids; j++) {
 2714		tci = j * dev_maps->num_tc;
 2715
 2716		for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 2717			if (i == tc &&
 2718			    netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
 2719			    netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
 2720				continue;
 2721
 2722			active |= remove_xps_queue(dev_maps,
 2723						   copy ? old_dev_maps : NULL,
 2724						   tci, index);
 2725		}
 2726	}
 2727
 2728	if (old_dev_maps)
 2729		kfree_rcu(old_dev_maps, rcu);
 2730
 2731	/* free map if not active */
 2732	if (!active)
 2733		reset_xps_maps(dev, dev_maps, type);
 
 
 2734
 2735out_no_maps:
 2736	mutex_unlock(&xps_map_mutex);
 2737
 2738	return 0;
 2739error:
 2740	/* remove any maps that we added */
 2741	for (j = 0; j < nr_ids; j++) {
 2742		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2743			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2744			map = copy ?
 2745			      xmap_dereference(dev_maps->attr_map[tci]) :
 2746			      NULL;
 2747			if (new_map && new_map != map)
 2748				kfree(new_map);
 2749		}
 2750	}
 2751
 2752	mutex_unlock(&xps_map_mutex);
 2753
 2754	kfree(new_dev_maps);
 2755	return -ENOMEM;
 2756}
 2757EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2758
 2759int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2760			u16 index)
 2761{
 2762	int ret;
 2763
 2764	cpus_read_lock();
 2765	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
 2766	cpus_read_unlock();
 2767
 2768	return ret;
 2769}
 2770EXPORT_SYMBOL(netif_set_xps_queue);
 2771
 2772#endif
 2773static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2774{
 2775	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2776
 2777	/* Unbind any subordinate channels */
 2778	while (txq-- != &dev->_tx[0]) {
 2779		if (txq->sb_dev)
 2780			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2781	}
 2782}
 2783
 2784void netdev_reset_tc(struct net_device *dev)
 2785{
 2786#ifdef CONFIG_XPS
 2787	netif_reset_xps_queues_gt(dev, 0);
 2788#endif
 2789	netdev_unbind_all_sb_channels(dev);
 2790
 2791	/* Reset TC configuration of device */
 2792	dev->num_tc = 0;
 2793	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2794	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2795}
 2796EXPORT_SYMBOL(netdev_reset_tc);
 2797
 2798int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2799{
 2800	if (tc >= dev->num_tc)
 2801		return -EINVAL;
 2802
 2803#ifdef CONFIG_XPS
 2804	netif_reset_xps_queues(dev, offset, count);
 2805#endif
 2806	dev->tc_to_txq[tc].count = count;
 2807	dev->tc_to_txq[tc].offset = offset;
 2808	return 0;
 2809}
 2810EXPORT_SYMBOL(netdev_set_tc_queue);
 2811
 2812int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2813{
 2814	if (num_tc > TC_MAX_QUEUE)
 2815		return -EINVAL;
 2816
 2817#ifdef CONFIG_XPS
 2818	netif_reset_xps_queues_gt(dev, 0);
 2819#endif
 2820	netdev_unbind_all_sb_channels(dev);
 2821
 2822	dev->num_tc = num_tc;
 2823	return 0;
 2824}
 2825EXPORT_SYMBOL(netdev_set_num_tc);
 2826
 2827void netdev_unbind_sb_channel(struct net_device *dev,
 2828			      struct net_device *sb_dev)
 2829{
 2830	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2831
 2832#ifdef CONFIG_XPS
 2833	netif_reset_xps_queues_gt(sb_dev, 0);
 2834#endif
 2835	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2836	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2837
 2838	while (txq-- != &dev->_tx[0]) {
 2839		if (txq->sb_dev == sb_dev)
 2840			txq->sb_dev = NULL;
 2841	}
 2842}
 2843EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2844
 2845int netdev_bind_sb_channel_queue(struct net_device *dev,
 2846				 struct net_device *sb_dev,
 2847				 u8 tc, u16 count, u16 offset)
 2848{
 2849	/* Make certain the sb_dev and dev are already configured */
 2850	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2851		return -EINVAL;
 2852
 2853	/* We cannot hand out queues we don't have */
 2854	if ((offset + count) > dev->real_num_tx_queues)
 2855		return -EINVAL;
 2856
 2857	/* Record the mapping */
 2858	sb_dev->tc_to_txq[tc].count = count;
 2859	sb_dev->tc_to_txq[tc].offset = offset;
 2860
 2861	/* Provide a way for Tx queue to find the tc_to_txq map or
 2862	 * XPS map for itself.
 2863	 */
 2864	while (count--)
 2865		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2866
 2867	return 0;
 2868}
 2869EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2870
 2871int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2872{
 2873	/* Do not use a multiqueue device to represent a subordinate channel */
 2874	if (netif_is_multiqueue(dev))
 2875		return -ENODEV;
 2876
 2877	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2878	 * Channel 0 is meant to be "native" mode and used only to represent
 2879	 * the main root device. We allow writing 0 to reset the device back
 2880	 * to normal mode after being used as a subordinate channel.
 2881	 */
 2882	if (channel > S16_MAX)
 2883		return -EINVAL;
 2884
 2885	dev->num_tc = -channel;
 2886
 2887	return 0;
 2888}
 2889EXPORT_SYMBOL(netdev_set_sb_channel);
 2890
 2891/*
 2892 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2893 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2894 */
 2895int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2896{
 2897	bool disabling;
 2898	int rc;
 2899
 2900	disabling = txq < dev->real_num_tx_queues;
 2901
 2902	if (txq < 1 || txq > dev->num_tx_queues)
 2903		return -EINVAL;
 2904
 2905	if (dev->reg_state == NETREG_REGISTERED ||
 2906	    dev->reg_state == NETREG_UNREGISTERING) {
 2907		ASSERT_RTNL();
 2908
 2909		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2910						  txq);
 2911		if (rc)
 2912			return rc;
 2913
 2914		if (dev->num_tc)
 2915			netif_setup_tc(dev, txq);
 2916
 2917		dev_qdisc_change_real_num_tx(dev, txq);
 2918
 2919		dev->real_num_tx_queues = txq;
 2920
 2921		if (disabling) {
 2922			synchronize_net();
 2923			qdisc_reset_all_tx_gt(dev, txq);
 2924#ifdef CONFIG_XPS
 2925			netif_reset_xps_queues_gt(dev, txq);
 2926#endif
 2927		}
 2928	} else {
 2929		dev->real_num_tx_queues = txq;
 2930	}
 2931
 
 2932	return 0;
 2933}
 2934EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2935
 2936#ifdef CONFIG_SYSFS
 2937/**
 2938 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2939 *	@dev: Network device
 2940 *	@rxq: Actual number of RX queues
 2941 *
 2942 *	This must be called either with the rtnl_lock held or before
 2943 *	registration of the net device.  Returns 0 on success, or a
 2944 *	negative error code.  If called before registration, it always
 2945 *	succeeds.
 2946 */
 2947int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2948{
 2949	int rc;
 2950
 2951	if (rxq < 1 || rxq > dev->num_rx_queues)
 2952		return -EINVAL;
 2953
 2954	if (dev->reg_state == NETREG_REGISTERED) {
 2955		ASSERT_RTNL();
 2956
 2957		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 2958						  rxq);
 2959		if (rc)
 2960			return rc;
 2961	}
 2962
 2963	dev->real_num_rx_queues = rxq;
 2964	return 0;
 2965}
 2966EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 2967#endif
 2968
 2969/**
 2970 *	netif_set_real_num_queues - set actual number of RX and TX queues used
 2971 *	@dev: Network device
 2972 *	@txq: Actual number of TX queues
 2973 *	@rxq: Actual number of RX queues
 2974 *
 2975 *	Set the real number of both TX and RX queues.
 2976 *	Does nothing if the number of queues is already correct.
 2977 */
 2978int netif_set_real_num_queues(struct net_device *dev,
 2979			      unsigned int txq, unsigned int rxq)
 2980{
 2981	unsigned int old_rxq = dev->real_num_rx_queues;
 2982	int err;
 2983
 2984	if (txq < 1 || txq > dev->num_tx_queues ||
 2985	    rxq < 1 || rxq > dev->num_rx_queues)
 2986		return -EINVAL;
 2987
 2988	/* Start from increases, so the error path only does decreases -
 2989	 * decreases can't fail.
 2990	 */
 2991	if (rxq > dev->real_num_rx_queues) {
 2992		err = netif_set_real_num_rx_queues(dev, rxq);
 2993		if (err)
 2994			return err;
 2995	}
 2996	if (txq > dev->real_num_tx_queues) {
 2997		err = netif_set_real_num_tx_queues(dev, txq);
 2998		if (err)
 2999			goto undo_rx;
 3000	}
 3001	if (rxq < dev->real_num_rx_queues)
 3002		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
 3003	if (txq < dev->real_num_tx_queues)
 3004		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
 3005
 3006	return 0;
 3007undo_rx:
 3008	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
 3009	return err;
 3010}
 3011EXPORT_SYMBOL(netif_set_real_num_queues);
 3012
 3013/**
 3014 * netif_set_tso_max_size() - set the max size of TSO frames supported
 3015 * @dev:	netdev to update
 3016 * @size:	max skb->len of a TSO frame
 3017 *
 3018 * Set the limit on the size of TSO super-frames the device can handle.
 3019 * Unless explicitly set the stack will assume the value of
 3020 * %GSO_LEGACY_MAX_SIZE.
 3021 */
 3022void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
 3023{
 3024	dev->tso_max_size = min(GSO_MAX_SIZE, size);
 3025	if (size < READ_ONCE(dev->gso_max_size))
 3026		netif_set_gso_max_size(dev, size);
 3027	if (size < READ_ONCE(dev->gso_ipv4_max_size))
 3028		netif_set_gso_ipv4_max_size(dev, size);
 3029}
 3030EXPORT_SYMBOL(netif_set_tso_max_size);
 3031
 3032/**
 3033 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
 3034 * @dev:	netdev to update
 3035 * @segs:	max number of TCP segments
 3036 *
 3037 * Set the limit on the number of TCP segments the device can generate from
 3038 * a single TSO super-frame.
 3039 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
 3040 */
 3041void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
 3042{
 3043	dev->tso_max_segs = segs;
 3044	if (segs < READ_ONCE(dev->gso_max_segs))
 3045		netif_set_gso_max_segs(dev, segs);
 3046}
 3047EXPORT_SYMBOL(netif_set_tso_max_segs);
 3048
 3049/**
 3050 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
 3051 * @to:		netdev to update
 3052 * @from:	netdev from which to copy the limits
 3053 */
 3054void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
 3055{
 3056	netif_set_tso_max_size(to, from->tso_max_size);
 3057	netif_set_tso_max_segs(to, from->tso_max_segs);
 3058}
 3059EXPORT_SYMBOL(netif_inherit_tso_max);
 3060
 3061/**
 3062 * netif_get_num_default_rss_queues - default number of RSS queues
 3063 *
 3064 * Default value is the number of physical cores if there are only 1 or 2, or
 3065 * divided by 2 if there are more.
 3066 */
 3067int netif_get_num_default_rss_queues(void)
 3068{
 3069	cpumask_var_t cpus;
 3070	int cpu, count = 0;
 3071
 3072	if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
 3073		return 1;
 3074
 3075	cpumask_copy(cpus, cpu_online_mask);
 3076	for_each_cpu(cpu, cpus) {
 3077		++count;
 3078		cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
 3079	}
 3080	free_cpumask_var(cpus);
 3081
 3082	return count > 2 ? DIV_ROUND_UP(count, 2) : count;
 3083}
 3084EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3085
 3086static void __netif_reschedule(struct Qdisc *q)
 3087{
 3088	struct softnet_data *sd;
 3089	unsigned long flags;
 3090
 3091	local_irq_save(flags);
 3092	sd = this_cpu_ptr(&softnet_data);
 3093	q->next_sched = NULL;
 3094	*sd->output_queue_tailp = q;
 3095	sd->output_queue_tailp = &q->next_sched;
 3096	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3097	local_irq_restore(flags);
 3098}
 3099
 3100void __netif_schedule(struct Qdisc *q)
 3101{
 3102	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3103		__netif_reschedule(q);
 3104}
 3105EXPORT_SYMBOL(__netif_schedule);
 3106
 3107struct dev_kfree_skb_cb {
 3108	enum skb_drop_reason reason;
 3109};
 3110
 3111static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3112{
 3113	return (struct dev_kfree_skb_cb *)skb->cb;
 3114}
 3115
 3116void netif_schedule_queue(struct netdev_queue *txq)
 3117{
 3118	rcu_read_lock();
 3119	if (!netif_xmit_stopped(txq)) {
 3120		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3121
 3122		__netif_schedule(q);
 3123	}
 3124	rcu_read_unlock();
 3125}
 3126EXPORT_SYMBOL(netif_schedule_queue);
 3127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3128void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3129{
 3130	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3131		struct Qdisc *q;
 3132
 3133		rcu_read_lock();
 3134		q = rcu_dereference(dev_queue->qdisc);
 3135		__netif_schedule(q);
 3136		rcu_read_unlock();
 3137	}
 3138}
 3139EXPORT_SYMBOL(netif_tx_wake_queue);
 3140
 3141void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 3142{
 3143	unsigned long flags;
 3144
 3145	if (unlikely(!skb))
 3146		return;
 3147
 3148	if (likely(refcount_read(&skb->users) == 1)) {
 3149		smp_rmb();
 3150		refcount_set(&skb->users, 0);
 3151	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 3152		return;
 3153	}
 3154	get_kfree_skb_cb(skb)->reason = reason;
 3155	local_irq_save(flags);
 3156	skb->next = __this_cpu_read(softnet_data.completion_queue);
 3157	__this_cpu_write(softnet_data.completion_queue, skb);
 3158	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3159	local_irq_restore(flags);
 3160}
 3161EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
 3162
 3163void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 3164{
 3165	if (in_hardirq() || irqs_disabled())
 3166		dev_kfree_skb_irq_reason(skb, reason);
 3167	else
 3168		kfree_skb_reason(skb, reason);
 3169}
 3170EXPORT_SYMBOL(dev_kfree_skb_any_reason);
 3171
 3172
 3173/**
 3174 * netif_device_detach - mark device as removed
 3175 * @dev: network device
 3176 *
 3177 * Mark device as removed from system and therefore no longer available.
 3178 */
 3179void netif_device_detach(struct net_device *dev)
 3180{
 3181	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3182	    netif_running(dev)) {
 3183		netif_tx_stop_all_queues(dev);
 3184	}
 3185}
 3186EXPORT_SYMBOL(netif_device_detach);
 3187
 3188/**
 3189 * netif_device_attach - mark device as attached
 3190 * @dev: network device
 3191 *
 3192 * Mark device as attached from system and restart if needed.
 3193 */
 3194void netif_device_attach(struct net_device *dev)
 3195{
 3196	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3197	    netif_running(dev)) {
 3198		netif_tx_wake_all_queues(dev);
 3199		__netdev_watchdog_up(dev);
 3200	}
 3201}
 3202EXPORT_SYMBOL(netif_device_attach);
 3203
 3204/*
 3205 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3206 * to be used as a distribution range.
 3207 */
 3208static u16 skb_tx_hash(const struct net_device *dev,
 3209		       const struct net_device *sb_dev,
 3210		       struct sk_buff *skb)
 3211{
 3212	u32 hash;
 3213	u16 qoffset = 0;
 3214	u16 qcount = dev->real_num_tx_queues;
 3215
 3216	if (dev->num_tc) {
 3217		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3218
 3219		qoffset = sb_dev->tc_to_txq[tc].offset;
 3220		qcount = sb_dev->tc_to_txq[tc].count;
 3221		if (unlikely(!qcount)) {
 3222			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
 3223					     sb_dev->name, qoffset, tc);
 3224			qoffset = 0;
 3225			qcount = dev->real_num_tx_queues;
 3226		}
 3227	}
 3228
 3229	if (skb_rx_queue_recorded(skb)) {
 3230		DEBUG_NET_WARN_ON_ONCE(qcount == 0);
 3231		hash = skb_get_rx_queue(skb);
 3232		if (hash >= qoffset)
 3233			hash -= qoffset;
 3234		while (unlikely(hash >= qcount))
 3235			hash -= qcount;
 3236		return hash + qoffset;
 
 
 
 
 3237	}
 3238
 3239	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3240}
 
 3241
 3242void skb_warn_bad_offload(const struct sk_buff *skb)
 3243{
 3244	static const netdev_features_t null_features;
 3245	struct net_device *dev = skb->dev;
 3246	const char *name = "";
 3247
 3248	if (!net_ratelimit())
 3249		return;
 3250
 3251	if (dev) {
 3252		if (dev->dev.parent)
 3253			name = dev_driver_string(dev->dev.parent);
 3254		else
 3255			name = netdev_name(dev);
 3256	}
 3257	skb_dump(KERN_WARNING, skb, false);
 3258	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3259	     name, dev ? &dev->features : &null_features,
 3260	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 
 
 3261}
 3262
 3263/*
 3264 * Invalidate hardware checksum when packet is to be mangled, and
 3265 * complete checksum manually on outgoing path.
 3266 */
 3267int skb_checksum_help(struct sk_buff *skb)
 3268{
 3269	__wsum csum;
 3270	int ret = 0, offset;
 3271
 3272	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3273		goto out_set_summed;
 3274
 3275	if (unlikely(skb_is_gso(skb))) {
 3276		skb_warn_bad_offload(skb);
 3277		return -EINVAL;
 3278	}
 3279
 3280	/* Before computing a checksum, we should make sure no frag could
 3281	 * be modified by an external entity : checksum could be wrong.
 3282	 */
 3283	if (skb_has_shared_frag(skb)) {
 3284		ret = __skb_linearize(skb);
 3285		if (ret)
 3286			goto out;
 3287	}
 3288
 3289	offset = skb_checksum_start_offset(skb);
 3290	ret = -EINVAL;
 3291	if (unlikely(offset >= skb_headlen(skb))) {
 3292		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
 3293		WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
 3294			  offset, skb_headlen(skb));
 3295		goto out;
 3296	}
 3297	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3298
 3299	offset += skb->csum_offset;
 3300	if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
 3301		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
 3302		WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
 3303			  offset + sizeof(__sum16), skb_headlen(skb));
 3304		goto out;
 
 
 3305	}
 3306	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3307	if (ret)
 3308		goto out;
 3309
 3310	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3311out_set_summed:
 3312	skb->ip_summed = CHECKSUM_NONE;
 3313out:
 3314	return ret;
 3315}
 3316EXPORT_SYMBOL(skb_checksum_help);
 3317
 3318int skb_crc32c_csum_help(struct sk_buff *skb)
 3319{
 3320	__le32 crc32c_csum;
 3321	int ret = 0, offset, start;
 3322
 3323	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3324		goto out;
 3325
 3326	if (unlikely(skb_is_gso(skb)))
 3327		goto out;
 3328
 3329	/* Before computing a checksum, we should make sure no frag could
 3330	 * be modified by an external entity : checksum could be wrong.
 3331	 */
 3332	if (unlikely(skb_has_shared_frag(skb))) {
 3333		ret = __skb_linearize(skb);
 3334		if (ret)
 3335			goto out;
 3336	}
 3337	start = skb_checksum_start_offset(skb);
 3338	offset = start + offsetof(struct sctphdr, checksum);
 3339	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3340		ret = -EINVAL;
 3341		goto out;
 3342	}
 3343
 3344	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3345	if (ret)
 3346		goto out;
 3347
 3348	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3349						  skb->len - start, ~(__u32)0,
 3350						  crc32c_csum_stub));
 3351	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3352	skb_reset_csum_not_inet(skb);
 3353out:
 3354	return ret;
 3355}
 3356
 3357__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3358{
 3359	__be16 type = skb->protocol;
 3360
 3361	/* Tunnel gso handlers can set protocol to ethernet. */
 3362	if (type == htons(ETH_P_TEB)) {
 3363		struct ethhdr *eth;
 3364
 3365		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3366			return 0;
 3367
 3368		eth = (struct ethhdr *)skb->data;
 3369		type = eth->h_proto;
 3370	}
 3371
 3372	return vlan_get_protocol_and_depth(skb, type, depth);
 3373}
 3374
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3375
 3376/* Take action when hardware reception checksum errors are detected. */
 3377#ifdef CONFIG_BUG
 3378static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3379{
 3380	netdev_err(dev, "hw csum failure\n");
 3381	skb_dump(KERN_ERR, skb, true);
 3382	dump_stack();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3383}
 
 3384
 3385void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 
 
 3386{
 3387	DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
 
 
 
 3388}
 3389EXPORT_SYMBOL(netdev_rx_csum_fault);
 3390#endif
 3391
 3392/* XXX: check that highmem exists at all on the given machine. */
 
 
 
 
 3393static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3394{
 3395#ifdef CONFIG_HIGHMEM
 3396	int i;
 3397
 3398	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3399		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3400			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3401
 3402			if (PageHighMem(skb_frag_page(frag)))
 3403				return 1;
 3404		}
 3405	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 3406#endif
 3407	return 0;
 3408}
 3409
 3410/* If MPLS offload request, verify we are testing hardware MPLS features
 3411 * instead of standard features for the netdev.
 3412 */
 3413#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3414static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3415					   netdev_features_t features,
 3416					   __be16 type)
 3417{
 3418	if (eth_p_mpls(type))
 3419		features &= skb->dev->mpls_features;
 3420
 3421	return features;
 3422}
 3423#else
 3424static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3425					   netdev_features_t features,
 3426					   __be16 type)
 3427{
 3428	return features;
 3429}
 3430#endif
 3431
 3432static netdev_features_t harmonize_features(struct sk_buff *skb,
 3433	netdev_features_t features)
 3434{
 
 3435	__be16 type;
 3436
 3437	type = skb_network_protocol(skb, NULL);
 3438	features = net_mpls_features(skb, features, type);
 3439
 3440	if (skb->ip_summed != CHECKSUM_NONE &&
 3441	    !can_checksum_protocol(features, type)) {
 3442		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3443	}
 3444	if (illegal_highdma(skb->dev, skb))
 3445		features &= ~NETIF_F_SG;
 3446
 3447	return features;
 3448}
 3449
 3450netdev_features_t passthru_features_check(struct sk_buff *skb,
 3451					  struct net_device *dev,
 3452					  netdev_features_t features)
 3453{
 3454	return features;
 3455}
 3456EXPORT_SYMBOL(passthru_features_check);
 3457
 3458static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3459					     struct net_device *dev,
 3460					     netdev_features_t features)
 3461{
 3462	return vlan_features_check(skb, features);
 3463}
 3464
 3465static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3466					    struct net_device *dev,
 3467					    netdev_features_t features)
 3468{
 3469	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3470
 3471	if (gso_segs > READ_ONCE(dev->gso_max_segs))
 3472		return features & ~NETIF_F_GSO_MASK;
 3473
 3474	if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
 3475		return features & ~NETIF_F_GSO_MASK;
 3476
 3477	if (!skb_shinfo(skb)->gso_type) {
 3478		skb_warn_bad_offload(skb);
 3479		return features & ~NETIF_F_GSO_MASK;
 3480	}
 3481
 3482	/* Support for GSO partial features requires software
 3483	 * intervention before we can actually process the packets
 3484	 * so we need to strip support for any partial features now
 3485	 * and we can pull them back in after we have partially
 3486	 * segmented the frame.
 3487	 */
 3488	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3489		features &= ~dev->gso_partial_features;
 3490
 3491	/* Make sure to clear the IPv4 ID mangling feature if the
 3492	 * IPv4 header has the potential to be fragmented.
 3493	 */
 3494	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3495		struct iphdr *iph = skb->encapsulation ?
 3496				    inner_ip_hdr(skb) : ip_hdr(skb);
 3497
 3498		if (!(iph->frag_off & htons(IP_DF)))
 3499			features &= ~NETIF_F_TSO_MANGLEID;
 3500	}
 3501
 3502	return features;
 3503}
 3504
 3505netdev_features_t netif_skb_features(struct sk_buff *skb)
 3506{
 3507	struct net_device *dev = skb->dev;
 3508	netdev_features_t features = dev->features;
 3509
 3510	if (skb_is_gso(skb))
 3511		features = gso_features_check(skb, dev, features);
 3512
 3513	/* If encapsulation offload request, verify we are testing
 3514	 * hardware encapsulation features instead of standard
 3515	 * features for the netdev
 3516	 */
 3517	if (skb->encapsulation)
 3518		features &= dev->hw_enc_features;
 3519
 3520	if (skb_vlan_tagged(skb))
 3521		features = netdev_intersect_features(features,
 3522						     dev->vlan_features |
 3523						     NETIF_F_HW_VLAN_CTAG_TX |
 3524						     NETIF_F_HW_VLAN_STAG_TX);
 3525
 3526	if (dev->netdev_ops->ndo_features_check)
 3527		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3528								features);
 3529	else
 3530		features &= dflt_features_check(skb, dev, features);
 3531
 3532	return harmonize_features(skb, features);
 3533}
 3534EXPORT_SYMBOL(netif_skb_features);
 3535
 3536static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3537		    struct netdev_queue *txq, bool more)
 3538{
 3539	unsigned int len;
 3540	int rc;
 3541
 3542	if (dev_nit_active(dev))
 3543		dev_queue_xmit_nit(skb, dev);
 3544
 3545	len = skb->len;
 3546	trace_net_dev_start_xmit(skb, dev);
 3547	rc = netdev_start_xmit(skb, dev, txq, more);
 3548	trace_net_dev_xmit(skb, rc, dev, len);
 3549
 3550	return rc;
 3551}
 3552
 3553struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3554				    struct netdev_queue *txq, int *ret)
 3555{
 3556	struct sk_buff *skb = first;
 3557	int rc = NETDEV_TX_OK;
 3558
 3559	while (skb) {
 3560		struct sk_buff *next = skb->next;
 3561
 3562		skb_mark_not_on_list(skb);
 3563		rc = xmit_one(skb, dev, txq, next != NULL);
 3564		if (unlikely(!dev_xmit_complete(rc))) {
 3565			skb->next = next;
 3566			goto out;
 3567		}
 3568
 3569		skb = next;
 3570		if (netif_tx_queue_stopped(txq) && skb) {
 3571			rc = NETDEV_TX_BUSY;
 3572			break;
 3573		}
 3574	}
 3575
 3576out:
 3577	*ret = rc;
 3578	return skb;
 3579}
 3580
 3581static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3582					  netdev_features_t features)
 3583{
 3584	if (skb_vlan_tag_present(skb) &&
 3585	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3586		skb = __vlan_hwaccel_push_inside(skb);
 3587	return skb;
 3588}
 3589
 3590int skb_csum_hwoffload_help(struct sk_buff *skb,
 3591			    const netdev_features_t features)
 3592{
 3593	if (unlikely(skb_csum_is_sctp(skb)))
 3594		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3595			skb_crc32c_csum_help(skb);
 3596
 3597	if (features & NETIF_F_HW_CSUM)
 3598		return 0;
 3599
 3600	if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
 3601		switch (skb->csum_offset) {
 3602		case offsetof(struct tcphdr, check):
 3603		case offsetof(struct udphdr, check):
 3604			return 0;
 3605		}
 3606	}
 3607
 3608	return skb_checksum_help(skb);
 3609}
 3610EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3611
 3612static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3613{
 3614	netdev_features_t features;
 3615
 3616	features = netif_skb_features(skb);
 3617	skb = validate_xmit_vlan(skb, features);
 3618	if (unlikely(!skb))
 3619		goto out_null;
 3620
 3621	skb = sk_validate_xmit_skb(skb, dev);
 3622	if (unlikely(!skb))
 3623		goto out_null;
 3624
 3625	if (netif_needs_gso(skb, features)) {
 3626		struct sk_buff *segs;
 3627
 3628		segs = skb_gso_segment(skb, features);
 3629		if (IS_ERR(segs)) {
 3630			goto out_kfree_skb;
 3631		} else if (segs) {
 3632			consume_skb(skb);
 3633			skb = segs;
 3634		}
 3635	} else {
 3636		if (skb_needs_linearize(skb, features) &&
 3637		    __skb_linearize(skb))
 3638			goto out_kfree_skb;
 3639
 3640		/* If packet is not checksummed and device does not
 3641		 * support checksumming for this protocol, complete
 3642		 * checksumming here.
 3643		 */
 3644		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3645			if (skb->encapsulation)
 3646				skb_set_inner_transport_header(skb,
 3647							       skb_checksum_start_offset(skb));
 3648			else
 3649				skb_set_transport_header(skb,
 3650							 skb_checksum_start_offset(skb));
 3651			if (skb_csum_hwoffload_help(skb, features))
 
 3652				goto out_kfree_skb;
 3653		}
 3654	}
 3655
 3656	skb = validate_xmit_xfrm(skb, features, again);
 3657
 3658	return skb;
 3659
 3660out_kfree_skb:
 3661	kfree_skb(skb);
 3662out_null:
 3663	dev_core_stats_tx_dropped_inc(dev);
 3664	return NULL;
 3665}
 3666
 3667struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3668{
 3669	struct sk_buff *next, *head = NULL, *tail;
 3670
 3671	for (; skb != NULL; skb = next) {
 3672		next = skb->next;
 3673		skb_mark_not_on_list(skb);
 3674
 3675		/* in case skb wont be segmented, point to itself */
 3676		skb->prev = skb;
 3677
 3678		skb = validate_xmit_skb(skb, dev, again);
 3679		if (!skb)
 3680			continue;
 3681
 3682		if (!head)
 3683			head = skb;
 3684		else
 3685			tail->next = skb;
 3686		/* If skb was segmented, skb->prev points to
 3687		 * the last segment. If not, it still contains skb.
 3688		 */
 3689		tail = skb->prev;
 3690	}
 3691	return head;
 3692}
 3693EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3694
 3695static void qdisc_pkt_len_init(struct sk_buff *skb)
 3696{
 3697	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3698
 3699	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3700
 3701	/* To get more precise estimation of bytes sent on wire,
 3702	 * we add to pkt_len the headers size of all segments
 3703	 */
 3704	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3705		u16 gso_segs = shinfo->gso_segs;
 3706		unsigned int hdr_len;
 
 3707
 3708		/* mac layer + network layer */
 3709		hdr_len = skb_transport_offset(skb);
 3710
 3711		/* + transport layer */
 3712		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3713			const struct tcphdr *th;
 3714			struct tcphdr _tcphdr;
 3715
 3716			th = skb_header_pointer(skb, hdr_len,
 3717						sizeof(_tcphdr), &_tcphdr);
 3718			if (likely(th))
 3719				hdr_len += __tcp_hdrlen(th);
 3720		} else {
 3721			struct udphdr _udphdr;
 3722
 3723			if (skb_header_pointer(skb, hdr_len,
 3724					       sizeof(_udphdr), &_udphdr))
 3725				hdr_len += sizeof(struct udphdr);
 3726		}
 3727
 3728		if (shinfo->gso_type & SKB_GSO_DODGY)
 3729			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3730						shinfo->gso_size);
 3731
 3732		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3733	}
 3734}
 3735
 3736static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
 3737			     struct sk_buff **to_free,
 3738			     struct netdev_queue *txq)
 3739{
 3740	int rc;
 3741
 3742	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
 3743	if (rc == NET_XMIT_SUCCESS)
 3744		trace_qdisc_enqueue(q, txq, skb);
 3745	return rc;
 3746}
 3747
 3748static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 3749				 struct net_device *dev,
 3750				 struct netdev_queue *txq)
 3751{
 3752	spinlock_t *root_lock = qdisc_lock(q);
 3753	struct sk_buff *to_free = NULL;
 3754	bool contended;
 3755	int rc;
 3756
 3757	qdisc_calculate_pkt_len(skb, q);
 3758
 3759	tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
 3760
 3761	if (q->flags & TCQ_F_NOLOCK) {
 3762		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
 3763		    qdisc_run_begin(q)) {
 3764			/* Retest nolock_qdisc_is_empty() within the protection
 3765			 * of q->seqlock to protect from racing with requeuing.
 3766			 */
 3767			if (unlikely(!nolock_qdisc_is_empty(q))) {
 3768				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 3769				__qdisc_run(q);
 3770				qdisc_run_end(q);
 3771
 3772				goto no_lock_out;
 3773			}
 3774
 3775			qdisc_bstats_cpu_update(q, skb);
 3776			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
 3777			    !nolock_qdisc_is_empty(q))
 3778				__qdisc_run(q);
 3779
 3780			qdisc_run_end(q);
 3781			return NET_XMIT_SUCCESS;
 3782		}
 3783
 3784		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 3785		qdisc_run(q);
 3786
 3787no_lock_out:
 3788		if (unlikely(to_free))
 3789			kfree_skb_list_reason(to_free,
 3790					      tcf_get_drop_reason(to_free));
 3791		return rc;
 3792	}
 3793
 3794	/*
 3795	 * Heuristic to force contended enqueues to serialize on a
 3796	 * separate lock before trying to get qdisc main lock.
 3797	 * This permits qdisc->running owner to get the lock more
 3798	 * often and dequeue packets faster.
 3799	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
 3800	 * and then other tasks will only enqueue packets. The packets will be
 3801	 * sent after the qdisc owner is scheduled again. To prevent this
 3802	 * scenario the task always serialize on the lock.
 3803	 */
 3804	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
 3805	if (unlikely(contended))
 3806		spin_lock(&q->busylock);
 3807
 3808	spin_lock(root_lock);
 3809	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 3810		__qdisc_drop(skb, &to_free);
 3811		rc = NET_XMIT_DROP;
 3812	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 3813		   qdisc_run_begin(q)) {
 3814		/*
 3815		 * This is a work-conserving queue; there are no old skbs
 3816		 * waiting to be sent out; and the qdisc is not running -
 3817		 * xmit the skb directly.
 3818		 */
 3819
 3820		qdisc_bstats_update(q, skb);
 3821
 3822		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
 3823			if (unlikely(contended)) {
 3824				spin_unlock(&q->busylock);
 3825				contended = false;
 3826			}
 3827			__qdisc_run(q);
 3828		}
 
 3829
 3830		qdisc_run_end(q);
 3831		rc = NET_XMIT_SUCCESS;
 3832	} else {
 3833		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 3834		if (qdisc_run_begin(q)) {
 3835			if (unlikely(contended)) {
 3836				spin_unlock(&q->busylock);
 3837				contended = false;
 3838			}
 3839			__qdisc_run(q);
 3840			qdisc_run_end(q);
 3841		}
 3842	}
 3843	spin_unlock(root_lock);
 3844	if (unlikely(to_free))
 3845		kfree_skb_list_reason(to_free,
 3846				      tcf_get_drop_reason(to_free));
 3847	if (unlikely(contended))
 3848		spin_unlock(&q->busylock);
 3849	return rc;
 3850}
 3851
 3852#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3853static void skb_update_prio(struct sk_buff *skb)
 3854{
 3855	const struct netprio_map *map;
 3856	const struct sock *sk;
 3857	unsigned int prioidx;
 3858
 3859	if (skb->priority)
 3860		return;
 3861	map = rcu_dereference_bh(skb->dev->priomap);
 3862	if (!map)
 3863		return;
 3864	sk = skb_to_full_sk(skb);
 3865	if (!sk)
 3866		return;
 3867
 3868	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3869
 3870	if (prioidx < map->priomap_len)
 3871		skb->priority = map->priomap[prioidx];
 3872}
 3873#else
 3874#define skb_update_prio(skb)
 3875#endif
 3876
 
 
 
 3877/**
 3878 *	dev_loopback_xmit - loop back @skb
 3879 *	@net: network namespace this loopback is happening in
 3880 *	@sk:  sk needed to be a netfilter okfn
 3881 *	@skb: buffer to transmit
 3882 */
 3883int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 3884{
 3885	skb_reset_mac_header(skb);
 3886	__skb_pull(skb, skb_network_offset(skb));
 3887	skb->pkt_type = PACKET_LOOPBACK;
 3888	if (skb->ip_summed == CHECKSUM_NONE)
 3889		skb->ip_summed = CHECKSUM_UNNECESSARY;
 3890	DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
 3891	skb_dst_force(skb);
 3892	netif_rx(skb);
 3893	return 0;
 3894}
 3895EXPORT_SYMBOL(dev_loopback_xmit);
 3896
 3897#ifdef CONFIG_NET_EGRESS
 3898static struct netdev_queue *
 3899netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
 3900{
 3901	int qm = skb_get_queue_mapping(skb);
 3902
 3903	return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
 3904}
 3905
 3906static bool netdev_xmit_txqueue_skipped(void)
 3907{
 3908	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
 3909}
 3910
 3911void netdev_xmit_skip_txqueue(bool skip)
 3912{
 3913	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
 3914}
 3915EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
 3916#endif /* CONFIG_NET_EGRESS */
 3917
 3918#ifdef CONFIG_NET_XGRESS
 3919static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
 3920		  enum skb_drop_reason *drop_reason)
 3921{
 3922	int ret = TC_ACT_UNSPEC;
 3923#ifdef CONFIG_NET_CLS_ACT
 3924	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
 3925	struct tcf_result res;
 3926
 3927	if (!miniq)
 3928		return ret;
 
 
 3929
 3930	tc_skb_cb(skb)->mru = 0;
 3931	tc_skb_cb(skb)->post_ct = false;
 3932	tcf_set_drop_reason(skb, *drop_reason);
 3933
 3934	mini_qdisc_bstats_cpu_update(miniq, skb);
 3935	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
 3936	/* Only tcf related quirks below. */
 3937	switch (ret) {
 3938	case TC_ACT_SHOT:
 3939		*drop_reason = tcf_get_drop_reason(skb);
 3940		mini_qdisc_qstats_cpu_drop(miniq);
 3941		break;
 3942	case TC_ACT_OK:
 3943	case TC_ACT_RECLASSIFY:
 3944		skb->tc_index = TC_H_MIN(res.classid);
 3945		break;
 3946	}
 3947#endif /* CONFIG_NET_CLS_ACT */
 3948	return ret;
 3949}
 3950
 3951static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
 3952
 3953void tcx_inc(void)
 3954{
 3955	static_branch_inc(&tcx_needed_key);
 3956}
 3957
 3958void tcx_dec(void)
 3959{
 3960	static_branch_dec(&tcx_needed_key);
 3961}
 3962
 3963static __always_inline enum tcx_action_base
 3964tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
 3965	const bool needs_mac)
 3966{
 3967	const struct bpf_mprog_fp *fp;
 3968	const struct bpf_prog *prog;
 3969	int ret = TCX_NEXT;
 3970
 3971	if (needs_mac)
 3972		__skb_push(skb, skb->mac_len);
 3973	bpf_mprog_foreach_prog(entry, fp, prog) {
 3974		bpf_compute_data_pointers(skb);
 3975		ret = bpf_prog_run(prog, skb);
 3976		if (ret != TCX_NEXT)
 3977			break;
 3978	}
 3979	if (needs_mac)
 3980		__skb_pull(skb, skb->mac_len);
 3981	return tcx_action_code(skb, ret);
 3982}
 3983
 3984static __always_inline struct sk_buff *
 3985sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 3986		   struct net_device *orig_dev, bool *another)
 3987{
 3988	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
 3989	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
 3990	int sch_ret;
 3991
 3992	if (!entry)
 3993		return skb;
 3994	if (*pt_prev) {
 3995		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 3996		*pt_prev = NULL;
 3997	}
 3998
 3999	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4000	tcx_set_ingress(skb, true);
 4001
 4002	if (static_branch_unlikely(&tcx_needed_key)) {
 4003		sch_ret = tcx_run(entry, skb, true);
 4004		if (sch_ret != TC_ACT_UNSPEC)
 4005			goto ingress_verdict;
 4006	}
 4007	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
 4008ingress_verdict:
 4009	switch (sch_ret) {
 4010	case TC_ACT_REDIRECT:
 4011		/* skb_mac_header check was done by BPF, so we can safely
 4012		 * push the L2 header back before redirecting to another
 4013		 * netdev.
 4014		 */
 4015		__skb_push(skb, skb->mac_len);
 4016		if (skb_do_redirect(skb) == -EAGAIN) {
 4017			__skb_pull(skb, skb->mac_len);
 4018			*another = true;
 4019			break;
 4020		}
 4021		*ret = NET_RX_SUCCESS;
 4022		return NULL;
 4023	case TC_ACT_SHOT:
 4024		kfree_skb_reason(skb, drop_reason);
 4025		*ret = NET_RX_DROP;
 
 4026		return NULL;
 4027	/* used by tc_run */
 4028	case TC_ACT_STOLEN:
 4029	case TC_ACT_QUEUED:
 4030	case TC_ACT_TRAP:
 4031		consume_skb(skb);
 4032		fallthrough;
 4033	case TC_ACT_CONSUMED:
 4034		*ret = NET_RX_SUCCESS;
 4035		return NULL;
 4036	}
 4037
 4038	return skb;
 4039}
 4040
 4041static __always_inline struct sk_buff *
 4042sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 4043{
 4044	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
 4045	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
 4046	int sch_ret;
 4047
 4048	if (!entry)
 4049		return skb;
 4050
 4051	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
 4052	 * already set by the caller.
 4053	 */
 4054	if (static_branch_unlikely(&tcx_needed_key)) {
 4055		sch_ret = tcx_run(entry, skb, false);
 4056		if (sch_ret != TC_ACT_UNSPEC)
 4057			goto egress_verdict;
 4058	}
 4059	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
 4060egress_verdict:
 4061	switch (sch_ret) {
 4062	case TC_ACT_REDIRECT:
 4063		/* No need to push/pop skb's mac_header here on egress! */
 4064		skb_do_redirect(skb);
 4065		*ret = NET_XMIT_SUCCESS;
 4066		return NULL;
 4067	case TC_ACT_SHOT:
 4068		kfree_skb_reason(skb, drop_reason);
 4069		*ret = NET_XMIT_DROP;
 4070		return NULL;
 4071	/* used by tc_run */
 4072	case TC_ACT_STOLEN:
 4073	case TC_ACT_QUEUED:
 4074	case TC_ACT_TRAP:
 4075		consume_skb(skb);
 4076		fallthrough;
 4077	case TC_ACT_CONSUMED:
 4078		*ret = NET_XMIT_SUCCESS;
 4079		return NULL;
 4080	}
 4081
 4082	return skb;
 4083}
 4084#else
/*
 * Stub for builds without CONFIG_NET_XGRESS: no ingress hook is run, the
 * skb is handed straight back and @pt_prev, @ret, @orig_dev and @another
 * are left untouched.
 */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev, bool *another)
{
	return skb;
}
 4091
/*
 * Stub for builds without CONFIG_NET_XGRESS: no egress hook is run, the
 * skb is handed straight back and *@ret is left untouched.
 */
static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	return skb;
}
 4097#endif /* CONFIG_NET_XGRESS */
 4098
 4099#ifdef CONFIG_XPS
 4100static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 4101			       struct xps_dev_maps *dev_maps, unsigned int tci)
 4102{
 4103	int tc = netdev_get_prio_tc_map(dev, skb->priority);
 4104	struct xps_map *map;
 4105	int queue_index = -1;
 4106
 4107	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
 4108		return queue_index;
 4109
 4110	tci *= dev_maps->num_tc;
 4111	tci += tc;
 4112
 4113	map = rcu_dereference(dev_maps->attr_map[tci]);
 4114	if (map) {
 4115		if (map->len == 1)
 4116			queue_index = map->queues[0];
 4117		else
 4118			queue_index = map->queues[reciprocal_scale(
 4119						skb_get_hash(skb), map->len)];
 4120		if (unlikely(queue_index >= dev->real_num_tx_queues))
 4121			queue_index = -1;
 4122	}
 4123	return queue_index;
 4124}
 4125#endif
 4126
 4127static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 4128			 struct sk_buff *skb)
 4129{
 4130#ifdef CONFIG_XPS
 4131	struct xps_dev_maps *dev_maps;
 4132	struct sock *sk = skb->sk;
 4133	int queue_index = -1;
 4134
 4135	if (!static_key_false(&xps_needed))
 4136		return -1;
 4137
 4138	rcu_read_lock();
 4139	if (!static_key_false(&xps_rxqs_needed))
 4140		goto get_cpus_map;
 4141
 4142	dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
 4143	if (dev_maps) {
 4144		int tci = sk_rx_queue_get(sk);
 4145
 4146		if (tci >= 0)
 4147			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 4148							  tci);
 4149	}
 4150
 4151get_cpus_map:
 4152	if (queue_index < 0) {
 4153		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
 4154		if (dev_maps) {
 4155			unsigned int tci = skb->sender_cpu - 1;
 4156
 4157			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 4158							  tci);
 
 
 
 
 
 
 
 4159		}
 4160	}
 4161	rcu_read_unlock();
 4162
 4163	return queue_index;
 4164#else
 4165	return -1;
 4166#endif
 4167}
 4168
/*
 * Queue selector that always picks transmit queue 0; @dev, @skb and
 * @sb_dev are ignored.
 */
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev)
{
	return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);
 4175
 4176u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 4177		       struct net_device *sb_dev)
 4178{
 4179	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 4180}
 4181EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 4182
 4183u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 4184		     struct net_device *sb_dev)
 4185{
 4186	struct sock *sk = skb->sk;
 4187	int queue_index = sk_tx_queue_get(sk);
 4188
 4189	sb_dev = sb_dev ? : dev;
 4190
 4191	if (queue_index < 0 || skb->ooo_okay ||
 4192	    queue_index >= dev->real_num_tx_queues) {
 4193		int new_index = get_xps_queue(dev, sb_dev, skb);
 4194
 4195		if (new_index < 0)
 4196			new_index = skb_tx_hash(dev, sb_dev, skb);
 4197
 4198		if (queue_index != new_index && sk &&
 4199		    sk_fullsock(sk) &&
 4200		    rcu_access_pointer(sk->sk_dst_cache))
 4201			sk_tx_queue_set(sk, new_index);
 4202
 4203		queue_index = new_index;
 4204	}
 4205
 4206	return queue_index;
 4207}
 4208EXPORT_SYMBOL(netdev_pick_tx);
 4209
 4210struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4211					 struct sk_buff *skb,
 4212					 struct net_device *sb_dev)
 4213{
 4214	int queue_index = 0;
 4215
 4216#ifdef CONFIG_XPS
 4217	u32 sender_cpu = skb->sender_cpu - 1;
 4218
 4219	if (sender_cpu >= (u32)NR_CPUS)
 4220		skb->sender_cpu = raw_smp_processor_id() + 1;
 4221#endif
 4222
 4223	if (dev->real_num_tx_queues != 1) {
 4224		const struct net_device_ops *ops = dev->netdev_ops;
 4225
 4226		if (ops->ndo_select_queue)
 4227			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 
 4228		else
 4229			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4230
 4231		queue_index = netdev_cap_txqueue(dev, queue_index);
 
 4232	}
 4233
 4234	skb_set_queue_mapping(skb, queue_index);
 4235	return netdev_get_tx_queue(dev, queue_index);
 4236}
 4237
/**
 * __dev_queue_xmit() - transmit a buffer
 * @skb:	buffer to transmit
 * @sb_dev:	subordinate device used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * When calling this method, interrupts MUST be enabled. This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method. (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * Return:
 * * 0				- buffer successfully transmitted
 * * positive qdisc return code	- NET_XMIT_DROP etc.
 * * negative errno		- other errors
 */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq = NULL;
	struct Qdisc *q;
	int rc = -ENOMEM;
	bool again = false;

	skb_reset_mac_header(skb);
	skb_assert_len(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
	tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
	if (static_branch_unlikely(&egress_needed_key)) {
		/* Either egress hook may take the skb away; a NULL return
		 * means we are done and rc was filled in by the hook.
		 */
		if (nf_hook_egress_active()) {
			skb = nf_hook_egress(skb, &rc, dev);
			if (!skb)
				goto out;
		}

		netdev_xmit_skip_txqueue(false);

		nf_skip_egress(skb, true);
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
		nf_skip_egress(skb, false);

		/* The egress hook asked to skip queue selection: use the
		 * queue mapping already present in the skb.
		 */
		if (netdev_xmit_txqueue_skipped())
			txq = netdev_tx_queue_mapping(dev, skb);
	}
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	if (!txq)
		txq = netdev_core_pick_tx(dev, skb, sb_dev);

	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue operation takes over from here */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	 * counters.)
	 * However, it is possible, that they rely on protection
	 * made by us here.
	 *
	 * Check this and shot the lock. It is not prone from deadlocks.
	 * Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Other cpus might concurrently change txq->xmit_lock_owner
		 * to -1 or to their cpu id, but not to our id.
		 */
		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
			if (dev_xmit_recursion())
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev, &again);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				dev_xmit_recursion_inc();
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				dev_xmit_recursion_dec();
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Reached when the device is not up, the transmit did not complete,
	 * or recursion was detected: report -ENETDOWN, account the drop and
	 * free whatever is left of the skb list.
	 */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	dev_core_stats_tx_dropped_inc(dev);
	kfree_skb_list(skb);
	return rc;
out:
	/* The skb is no longer ours on this path; only leave the RCU-bh section */
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(__dev_queue_xmit);
 4382
/*
 * Hand @skb directly to the driver on transmit queue @queue_id; no qdisc
 * is consulted here.
 *
 * Returns NET_XMIT_DROP (after freeing the skb list and counting a tx
 * drop) when the device is not running, has no carrier, or
 * validate_xmit_skb_list() did not return the very same skb.  Otherwise
 * returns the result of netdev_start_xmit(), or NETDEV_TX_BUSY when the
 * queue is frozen or stopped by the driver; the skb is not freed here in
 * that case.
 */
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *orig_skb = skb;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	/* Anything other than the original skb coming back is treated as a drop */
	skb = validate_xmit_skb_list(skb, dev, &again);
	if (skb != orig_skb)
		goto drop;

	skb_set_queue_mapping(skb, queue_id);
	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	dev_xmit_recursion_inc();
	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);
	dev_xmit_recursion_dec();

	local_bh_enable();
	return ret;
drop:
	dev_core_stats_tx_dropped_inc(dev);
	kfree_skb_list(skb);
	return NET_XMIT_DROP;
}
EXPORT_SYMBOL(__dev_direct_xmit);
 
 4419
 4420/*************************************************************************
 4421 *			Receiver routines
 4422 *************************************************************************/
 4423
/* Limit on the per-CPU input_pkt_queue length checked by
 * enqueue_to_backlog(); skb_flow_limit() only engages once the queue is at
 * least half this long.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Passed to net_timestamp_check() by netif_rx_internal() */
int netdev_tstamp_prequeue __read_mostly = 1;
unsigned int sysctl_skb_defer_max __read_mostly = 64;
int netdev_budget __read_mostly = 300;
/* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
 4437
/*
 * Schedule @napi for polling.  Called with irq disabled.
 *
 * A NAPI instance in threaded mode with a thread attached is handed to
 * that thread; otherwise it is appended to @sd's poll list and
 * NET_RX_SOFTIRQ is raised unless net_rx_action() is already running.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	struct task_struct *thread;

	lockdep_assert_irqs_disabled();

	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
		/* Paired with smp_mb__before_atomic() in
		 * napi_enable()/dev_set_threaded().
		 * Use READ_ONCE() to guarantee a complete
		 * read on napi->thread. Only call
		 * wake_up_process() when it's not NULL.
		 */
		thread = READ_ONCE(napi->thread);
		if (thread) {
			/* Avoid doing set_bit() if the thread is in
			 * INTERRUPTIBLE state, cause napi_thread_wait()
			 * makes sure to proceed with napi polling
			 * if the thread is explicitly woken from here.
			 */
			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
			wake_up_process(thread);
			return;
		}
	}

	/* Softirq path: queue on this softnet_data and record the owning CPU */
	list_add_tail(&napi->poll_list, &sd->poll_list);
	WRITE_ONCE(napi->list_owner, smp_processor_id());
	/* If not called from net_rx_action()
	 * we have to raise NET_RX_SOFTIRQ.
	 */
	if (!sd->in_net_rx_action)
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
 4475
 4476#ifdef CONFIG_RPS
 4477
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Splits a sock flow table entry in get_rps_cpu(): the bits inside the
 * mask hold the CPU, the bits outside it are compared against the hash.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static key tested by netif_rx_internal() before doing any RPS steering */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
 4488
/*
 * Point a flow at @next_cpu.
 *
 * When @next_cpu is a valid CPU id, the flow's last_qtail is set to that
 * CPU's current input_queue_head.  With CONFIG_RFS_ACCEL the driver is
 * additionally asked, via ndo_rx_flow_steer(), to steer the flow to the rx
 * queue that cpu_rmap_lookup_index() associates with @next_cpu; on success
 * the flow entry of that rx queue's table replaces @rflow.
 *
 * Returns the flow entry that was updated, which may differ from the
 * @rflow passed in.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the wanted queue: nothing to steer */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* A non-negative rc is stored as the filter id of the new
		 * entry; the old entry gives the id up if it held the same one.
		 */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
 4533
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no steering applies.  *@rflowp is written only when the
 * CPU was chosen through the flow tables.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	/* A zero hash gives nothing to steer on */
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match.
		 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
		 */
		ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
		/* Hash bits outside rps_cpu_mask differ: not this flow's entry */
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	/* Plain RPS: spread by hash over the CPUs listed in the map */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
 4635
 4636#ifdef CONFIG_RFS_ACCEL
 4637
 4638/**
 4639 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4640 * @dev: Device on which the filter was set
 4641 * @rxq_index: RX queue index
 4642 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4643 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4644 *
 4645 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4646 * this function for each installed filter and remove the filters for
 4647 * which it returns %true.
 4648 */
 4649bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4650			 u32 flow_id, u16 filter_id)
 4651{
 4652	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4653	struct rps_dev_flow_table *flow_table;
 4654	struct rps_dev_flow *rflow;
 4655	bool expire = true;
 4656	unsigned int cpu;
 4657
 4658	rcu_read_lock();
 4659	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4660	if (flow_table && flow_id <= flow_table->mask) {
 4661		rflow = &flow_table->flows[flow_id];
 4662		cpu = READ_ONCE(rflow->cpu);
 4663		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4664		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4665			   rflow->last_qtail) <
 4666		     (int)(10 * flow_table->mask)))
 4667			expire = false;
 4668	}
 4669	rcu_read_unlock();
 4670	return expire;
 4671}
 4672EXPORT_SYMBOL(rps_may_expire_flow);
 4673
 4674#endif /* CONFIG_RFS_ACCEL */
 4675
/* Called from hardirq (IPI) context.
 *
 * @data is the struct softnet_data whose backlog NAPI should be scheduled;
 * its received_rps counter is bumped afterwards.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
 4684
 4685#endif /* CONFIG_RPS */
 4686
/* Called from hardirq (IPI) context.
 *
 * Raises NET_RX_SOFTIRQ, then clears sd->defer_ipi_scheduled with a
 * store-release.
 */
static void trigger_rx_softirq(void *data)
{
	struct softnet_data *sd = data;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	smp_store_release(&sd->defer_ipi_scheduled, 0);
}
 4695
/*
 * After we queued a packet into sd->input_pkt_queue,
 * we need to make sure this queue is serviced soon.
 *
 * - If this is another cpu queue, link it to our rps_ipi_list,
 *   and make sure we will process rps_ipi_list from net_rx_action().
 *
 * - If this is our own queue, NAPI schedule our backlog.
 *   Note that this also raises NET_RX_SOFTIRQ.
 *
 * Without CONFIG_RPS only the second case exists.
 */
static void napi_schedule_rps(struct softnet_data *sd)
{
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

#ifdef CONFIG_RPS
	if (sd != mysd) {
		/* Push the remote softnet_data onto our singly linked list */
		sd->rps_ipi_next = mysd->rps_ipi_list;
		mysd->rps_ipi_list = sd;

		/* If not called from net_rx_action() or napi_threaded_poll()
		 * we have to raise NET_RX_SOFTIRQ.
		 */
		if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		return;
	}
#endif /* CONFIG_RPS */
	__napi_schedule_irqoff(&mysd->backlog);
}
 4725
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12); only built with CONFIG_NET_FLOW_LIMIT */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
 4729
 4730static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4731{
 4732#ifdef CONFIG_NET_FLOW_LIMIT
 4733	struct sd_flow_limit *fl;
 4734	struct softnet_data *sd;
 4735	unsigned int old_flow, new_flow;
 4736
 4737	if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
 4738		return false;
 4739
 4740	sd = this_cpu_ptr(&softnet_data);
 4741
 4742	rcu_read_lock();
 4743	fl = rcu_dereference(sd->flow_limit);
 4744	if (fl) {
 4745		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4746		old_flow = fl->history[fl->history_head];
 4747		fl->history[fl->history_head] = new_flow;
 4748
 4749		fl->history_head++;
 4750		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4751
 4752		if (likely(fl->buckets[old_flow]))
 4753			fl->buckets[old_flow]--;
 4754
 4755		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4756			fl->count++;
 4757			rcu_read_unlock();
 4758			return true;
 4759		}
 4760	}
 4761	rcu_read_unlock();
 4762#endif
 4763	return false;
 4764}
 4765
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * Returns NET_RX_SUCCESS when the skb was queued.  Returns NET_RX_DROP,
 * after freeing the skb and bumping the drop counters, when the device is
 * not running, the queue is longer than netdev_max_backlog, or
 * skb_flow_limit() rejects the skb.  *@qtail is written by
 * input_queue_tail_incr_save() on success only.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	enum skb_drop_reason reason;
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	sd = &per_cpu(softnet_data, cpu);

	rps_lock_irqsave(sd, &flags);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
		/* Non-empty queue: just append, no scheduling is done here */
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock_irq_restore(sd, &flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
			napi_schedule_rps(sd);
		goto enqueue;
	}
	reason = SKB_DROP_REASON_CPU_BACKLOG;

drop:
	sd->dropped++;
	rps_unlock_irq_restore(sd, &flags);

	dev_core_stats_rx_dropped_inc(skb->dev);
	kfree_skb_reason(skb, reason);
	return NET_RX_DROP;
}
 4811
 4812static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4813{
 4814	struct net_device *dev = skb->dev;
 4815	struct netdev_rx_queue *rxqueue;
 4816
 4817	rxqueue = dev->_rx;
 4818
 4819	if (skb_rx_queue_recorded(skb)) {
 4820		u16 index = skb_get_rx_queue(skb);
 4821
 4822		if (unlikely(index >= dev->real_num_rx_queues)) {
 4823			WARN_ONCE(dev->real_num_rx_queues > 1,
 4824				  "%s received packet on queue %u, but number "
 4825				  "of RX queues is %u\n",
 4826				  dev->name, index, dev->real_num_rx_queues);
 4827
 4828			return rxqueue; /* Return first rxqueue */
 4829		}
 4830		rxqueue += index;
 4831	}
 4832	return rxqueue;
 4833}
 4834
/*
 * Run @xdp_prog on @skb through an xdp_buff view of the skb's linear data.
 *
 * @xdp is initialised here so that the program sees the packet from the
 * MAC header on.  After the run, head/tail adjustments made by the program
 * are mirrored back into the skb, and skb->protocol / pkt_type are
 * recomputed if the Ethernet header changed in a relevant way.
 *
 * Returns the program's verdict.  The skb is never freed here.
 */
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
			     struct bpf_prog *xdp_prog)
{
	void *orig_data, *orig_data_end, *hard_start;
	struct netdev_rx_queue *rxqueue;
	bool orig_bcast, orig_host;
	u32 mac_len, frame_sz;
	__be16 orig_eth_type;
	struct ethhdr *eth;
	u32 metalen, act;
	int off;

	/* The XDP program wants to see the packet starting at the MAC
	 * header.
	 */
	mac_len = skb->data - skb_mac_header(skb);
	hard_start = skb->data - skb_headroom(skb);

	/* SKB "head" area always have tailroom for skb_shared_info */
	frame_sz = (void *)skb_end_pointer(skb) - hard_start;
	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	rxqueue = netif_get_rxqueue(skb);
	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
			 skb_headlen(skb) + mac_len, true);

	/* Snapshot what is needed to detect changes made by the program */
	orig_data_end = xdp->data_end;
	orig_data = xdp->data;
	eth = (struct ethhdr *)xdp->data;
	orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
	orig_eth_type = eth->h_proto;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	/* check if bpf_xdp_adjust_head was used */
	off = xdp->data - orig_data;
	if (off) {
		if (off > 0)
			__skb_pull(skb, off);
		else if (off < 0)
			__skb_push(skb, -off);

		skb->mac_header += off;
		skb_reset_network_header(skb);
	}

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0) {
		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
		skb->len += off; /* positive on grow, negative on shrink */
	}

	/* check if XDP changed eth hdr such SKB needs update */
	eth = (struct ethhdr *)xdp->data;
	if ((orig_eth_type != eth->h_proto) ||
	    (orig_host != ether_addr_equal_64bits(eth->h_dest,
						  skb->dev->dev_addr)) ||
	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
		/* Expose the Ethernet header again for eth_type_trans() */
		__skb_push(skb, ETH_HLEN);
		skb->pkt_type = PACKET_HOST;
		skb->protocol = eth_type_trans(skb, skb->dev);
	}

	/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
	 * before calling us again on redirect path. We do not call do_redirect
	 * as we leave that up to the caller.
	 *
	 * Caller is responsible for managing lifetime of skb (i.e. calling
	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
	 */
	switch (act) {
	case XDP_REDIRECT:
	case XDP_TX:
		__skb_push(skb, mac_len);
		break;
	case XDP_PASS:
		metalen = xdp->data - xdp->data_meta;
		if (metalen)
			skb_metadata_set(skb, metalen);
		break;
	}

	return act;
}
 4922
/*
 * Prepare @skb for generic XDP and run @xdp_prog on it.
 *
 * Returns XDP_PASS untouched for skbs flagged as redirected.  Otherwise
 * returns the program's verdict; for XDP_REDIRECT, XDP_TX and XDP_PASS the
 * skb is left to the caller, for every other verdict it is freed here.
 * If the skb cannot be expanded or linearized it is freed and XDP_DROP is
 * returned.
 */
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
				     struct xdp_buff *xdp,
				     struct bpf_prog *xdp_prog)
{
	u32 act = XDP_DROP;

	/* Reinjected packets coming from act_mirred or similar should
	 * not get XDP generic processing.
	 */
	if (skb_is_redirected(skb))
		return XDP_PASS;

	/* XDP packets must be linear and must have sufficient headroom
	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
	 * native XDP provides, thus we need to do it here as well.
	 */
	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
		int troom = skb->tail + skb->data_len - skb->end;

		/* In case we have to go down the path and also linearize,
		 * then lets do the pskb_expand_head() work just once here.
		 */
		if (pskb_expand_head(skb,
				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
			goto do_drop;
		if (skb_linearize(skb))
			goto do_drop;
	}

	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
	switch (act) {
	case XDP_REDIRECT:
	case XDP_TX:
	case XDP_PASS:
		break;
	default:
		/* Unknown verdict: warn, then treat like XDP_ABORTED */
		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(skb->dev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
	do_drop:
		kfree_skb(skb);
		break;
	}

	return act;
}
 4975
 4976/* When doing generic XDP we have to bypass the qdisc layer and the
 4977 * network taps in order to match in-driver-XDP behavior. This also means
 4978 * that XDP packets are able to starve other packets going through a qdisc,
 4979 * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
 4980 * queues, so they do not have this starvation issue.
 4981 */
 4982void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4983{
 4984	struct net_device *dev = skb->dev;
 4985	struct netdev_queue *txq;
 4986	bool free_skb = true;
 4987	int cpu, rc;
 4988
 4989	txq = netdev_core_pick_tx(dev, skb, NULL);
 4990	cpu = smp_processor_id();
 4991	HARD_TX_LOCK(dev, txq, cpu);
 4992	if (!netif_xmit_frozen_or_drv_stopped(txq)) {
 4993		rc = netdev_start_xmit(skb, dev, txq, 0);
 4994		if (dev_xmit_complete(rc))
 4995			free_skb = false;
 4996	}
 4997	HARD_TX_UNLOCK(dev, txq);
 4998	if (free_skb) {
 4999		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 5000		dev_core_stats_tx_dropped_inc(dev);
 5001		kfree_skb(skb);
 5002	}
 5003}
 5004
/* Static key, initially false; its users are outside this part of the file */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 5006
 5007int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 5008{
 5009	if (xdp_prog) {
 5010		struct xdp_buff xdp;
 5011		u32 act;
 5012		int err;
 5013
 5014		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 5015		if (act != XDP_PASS) {
 5016			switch (act) {
 5017			case XDP_REDIRECT:
 5018				err = xdp_do_generic_redirect(skb->dev, skb,
 5019							      &xdp, xdp_prog);
 5020				if (err)
 5021					goto out_redir;
 5022				break;
 5023			case XDP_TX:
 5024				generic_xdp_tx(skb, xdp_prog);
 5025				break;
 5026			}
 5027			return XDP_DROP;
 5028		}
 5029	}
 5030	return XDP_PASS;
 5031out_redir:
 5032	kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
 5033	return XDP_DROP;
 5034}
 5035EXPORT_SYMBOL_GPL(do_xdp_generic);
 5036
/*
 * Common body of netif_rx() and __netif_rx(): queue @skb on a per-CPU
 * backlog and return the result of enqueue_to_backlog().
 *
 * With CONFIG_RPS and rps_needed enabled, the target CPU comes from
 * get_rps_cpu(), falling back to the current CPU; otherwise the current
 * CPU's backlog is always used.
 */
static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);

	trace_netif_rx(skb);

#ifdef CONFIG_RPS
	if (static_branch_unlikely(&rps_needed)) {
		/* voidflow absorbs the qtail write when get_rps_cpu() does
		 * not hand back a real flow entry.
		 */
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
	} else
#endif
	{
		/* qtail is required by the callee but not used afterwards */
		unsigned int qtail;

		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
	}
	return ret;
}
 5068
/**
 *	__netif_rx	-	Slightly optimized version of netif_rx
 *	@skb: buffer to post
 *
 *	This behaves as netif_rx except that it does not disable bottom halves.
 *	As a result this function may only be invoked from the interrupt context
 *	(either hard or soft interrupt).
 *
 *	Returns the value produced by netif_rx_internal().
 */
int __netif_rx(struct sk_buff *skb)
{
	int ret;

	/* Lockdep check that we really are in hard or soft interrupt context */
	lockdep_assert_once(hardirq_count() | softirq_count());

	trace_netif_rx_entry(skb);
	ret = netif_rx_internal(skb);
	trace_netif_rx_exit(ret);
	return ret;
}
EXPORT_SYMBOL(__netif_rx);
 5089
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process via the backlog NAPI device. It
 *	always succeeds. The buffer may be dropped during processing for
 *	congestion control or by the protocol layers.
 *	The network buffer is passed via the backlog NAPI device. Modern NIC
 *	drivers should use NAPI and GRO.
 *	This function can be used from interrupt and from process context. The
 *	caller from process context must not disable interrupts before invoking
 *	this function.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */
int netif_rx(struct sk_buff *skb)
{
	/* Bottom halves are disabled here only when not already in hard or
	 * soft interrupt context.
	 */
	bool need_bh_off = !(hardirq_count() | softirq_count());
	int ret;

	if (need_bh_off)
		local_bh_disable();
	trace_netif_rx_entry(skb);
	ret = netif_rx_internal(skb);
	trace_netif_rx_exit(ret);
	if (need_bh_off)
		local_bh_enable();
	return ret;
}
EXPORT_SYMBOL(netif_rx);
 5124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5125static __latent_entropy void net_tx_action(struct softirq_action *h)
 5126{
 5127	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 5128
 5129	if (sd->completion_queue) {
 5130		struct sk_buff *clist;
 5131
 5132		local_irq_disable();
 5133		clist = sd->completion_queue;
 5134		sd->completion_queue = NULL;
 5135		local_irq_enable();
 5136
 5137		while (clist) {
 5138			struct sk_buff *skb = clist;
 5139
 5140			clist = clist->next;
 5141
 5142			WARN_ON(refcount_read(&skb->users));
 5143			if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
 5144				trace_consume_skb(skb, net_tx_action);
 5145			else
 5146				trace_kfree_skb(skb, net_tx_action,
 5147						get_kfree_skb_cb(skb)->reason);
 5148
 5149			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 5150				__kfree_skb(skb);
 5151			else
 5152				__napi_kfree_skb(skb,
 5153						 get_kfree_skb_cb(skb)->reason);
 5154		}
 
 
 5155	}
 5156
 5157	if (sd->output_queue) {
 5158		struct Qdisc *head;
 5159
 5160		local_irq_disable();
 5161		head = sd->output_queue;
 5162		sd->output_queue = NULL;
 5163		sd->output_queue_tailp = &sd->output_queue;
 5164		local_irq_enable();
 5165
 5166		rcu_read_lock();
 5167
 5168		while (head) {
 5169			struct Qdisc *q = head;
 5170			spinlock_t *root_lock = NULL;
 5171
 5172			head = head->next_sched;
 5173
 
 
 5174			/* We need to make sure head->next_sched is read
 5175			 * before clearing __QDISC_STATE_SCHED
 5176			 */
 5177			smp_mb__before_atomic();
 5178
 5179			if (!(q->flags & TCQ_F_NOLOCK)) {
 5180				root_lock = qdisc_lock(q);
 5181				spin_lock(root_lock);
 5182			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
 5183						     &q->state))) {
 5184				/* There is a synchronize_net() between
 5185				 * STATE_DEACTIVATED flag being set and
 5186				 * qdisc_reset()/some_qdisc_is_busy() in
 5187				 * dev_deactivate(), so we can safely bail out
 5188				 * early here to avoid data race between
 5189				 * qdisc_deactivate() and some_qdisc_is_busy()
 5190				 * for lockless qdisc.
 5191				 */
 5192				clear_bit(__QDISC_STATE_SCHED, &q->state);
 5193				continue;
 5194			}
 5195
 5196			clear_bit(__QDISC_STATE_SCHED, &q->state);
 5197			qdisc_run(q);
 5198			if (root_lock)
 5199				spin_unlock(root_lock);
 5200		}
 5201
 5202		rcu_read_unlock();
 5203	}
 5204
 5205	xfrm_dev_backlog(sd);
 5206}
 5207
 5208#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
 5209/* This hook is defined here for ATM LANE */
/* Function pointer taking a net_device and an address and returning int.
 * It is only compiled in when both CONFIG_BRIDGE and CONFIG_ATM_LANE are
 * enabled, is marked __read_mostly and is exported GPL-only. Nothing in
 * this part of the file assigns or calls it.
 */
 5210int (*br_fdb_test_addr_hook)(struct net_device *dev,
 5211			     unsigned char *addr) __read_mostly;
 5212EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 5213#endif
 5214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5215/**
 5216 *	netdev_is_rx_handler_busy - check if receive handler is registered
 5217 *	@dev: device to check
 5218 *
 5219 *	Check if a receive handler is already registered for a given device.
 5220 *	Return true if there one.
 5221 *
 5222 *	The caller must hold the rtnl_mutex.
 5223 */
 5224bool netdev_is_rx_handler_busy(struct net_device *dev)
 5225{
 5226	ASSERT_RTNL();
 5227	return dev && rtnl_dereference(dev->rx_handler);
 5228}
 5229EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 5230
 5231/**
 5232 *	netdev_rx_handler_register - register receive handler
 5233 *	@dev: device to register a handler for
 5234 *	@rx_handler: receive handler to register
 5235 *	@rx_handler_data: data pointer that is used by rx handler
 5236 *
 5237 *	Register a receive handler for a device. This handler will then be
 5238 *	called from __netif_receive_skb. A negative errno code is returned
 5239 *	on a failure.
 5240 *
 5241 *	The caller must hold the rtnl_mutex.
 5242 *
 5243 *	For a general description of rx_handler, see enum rx_handler_result.
 5244 */
 5245int netdev_rx_handler_register(struct net_device *dev,
 5246			       rx_handler_func_t *rx_handler,
 5247			       void *rx_handler_data)
 5248{
 5249	if (netdev_is_rx_handler_busy(dev))
 5250		return -EBUSY;
 5251
 5252	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5253		return -EINVAL;
 5254
 5255	/* Note: rx_handler_data must be set before rx_handler */
 5256	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5257	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5258
 5259	return 0;
 5260}
 5261EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5262
 5263/**
 5264 *	netdev_rx_handler_unregister - unregister receive handler
 5265 *	@dev: device to unregister a handler from
 5266 *
 5267 *	Unregister a receive handler from a device.
 5268 *
 5269 *	The caller must hold the rtnl_mutex.
 5270 */
 5271void netdev_rx_handler_unregister(struct net_device *dev)
 5272{
 5273
 5274	ASSERT_RTNL();
 5275	RCU_INIT_POINTER(dev->rx_handler, NULL);
 5276	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
 5277	 * section has a guarantee to see a non NULL rx_handler_data
 5278	 * as well.
 5279	 */
 5280	synchronize_net();
 5281	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 5282}
 5283EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5284
 5285/*
 5286 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5287 * the special handling of PFMEMALLOC skbs.
 5288 */
 5289static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5290{
 5291	switch (skb->protocol) {
 5292	case htons(ETH_P_ARP):
 5293	case htons(ETH_P_IP):
 5294	case htons(ETH_P_IPV6):
 5295	case htons(ETH_P_8021Q):
 5296	case htons(ETH_P_8021AD):
 5297		return true;
 5298	default:
 5299		return false;
 5300	}
 5301}
 5302
 5303static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5304			     int *ret, struct net_device *orig_dev)
 5305{
 
 5306	if (nf_hook_ingress_active(skb)) {
 5307		int ingress_retval;
 5308
 5309		if (*pt_prev) {
 5310			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5311			*pt_prev = NULL;
 5312		}
 5313
 5314		rcu_read_lock();
 5315		ingress_retval = nf_hook_ingress(skb);
 5316		rcu_read_unlock();
 5317		return ingress_retval;
 5318	}
 
 5319	return 0;
 5320}
 5321
 /*
 * __netif_receive_skb_core - run one skb through the receive path, up to
 * but not including the final protocol handler.
 *
 * @pskb:	in/out. *pskb is read on entry and rewritten at the "out"
 *		label with the current skb. That skb can differ from the one
 *		passed in, or be NULL, because skb_vlan_untag(),
 *		vlan_do_receive() and the rx_handler may all replace it.
 * @pfmemalloc:	when true the ptype_all taps are skipped, and the packet is
 *		dropped unless skb_pfmemalloc_protocol() accepts it.
 * @ppt_prev:	output. Set to the last matching packet_type when there is
 *		one and skb_orphan_frags_rx() succeeded; left untouched
 *		otherwise. The caller is expected to invoke that handler.
 *
 * Every handler except the last is invoked here through deliver_skb() or
 * deliver_ptype_list_skb().
 *
 * Returns NET_RX_DROP by default, NET_RX_SUCCESS when the rx_handler
 * consumed the skb, and otherwise whatever the last deliver_skb() call (or
 * an ingress hook writing through &ret) stored in ret.
 */
 5322static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5323				    struct packet_type **ppt_prev)
 5324{
 5325	struct packet_type *ptype, *pt_prev;
 5326	rx_handler_func_t *rx_handler;
 5327	struct sk_buff *skb = *pskb;
 5328	struct net_device *orig_dev;
 5329	bool deliver_exact = false;
 5330	int ret = NET_RX_DROP;
 5331	__be16 type;
 5332
 5333	net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
 5334
 5335	trace_netif_receive_skb(skb);
 5336
	/* Remember the device the skb arrived on; skb->dev may change below. */
 5337	orig_dev = skb->dev;
 5338
 5339	skb_reset_network_header(skb);
 5340	if (!skb_transport_header_was_set(skb))
 5341		skb_reset_transport_header(skb);
 5342	skb_reset_mac_len(skb);
 5343
 5344	pt_prev = NULL;
 5345
	/* Restart point: taken again when sch_handle_ingress() sets "another",
	 * when vlan_do_receive() returns true, or when the rx_handler returns
	 * RX_HANDLER_ANOTHER.
	 */
 5346another_round:
 5347	skb->skb_iif = skb->dev->ifindex;
 5348
 5349	__this_cpu_inc(softnet_data.processed);
 5350
 5351	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5352		int ret2;
 5353
 5354		migrate_disable();
 5355		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 5356		migrate_enable();
 5357
		/* Any verdict other than XDP_PASS ends processing here. */
 5358		if (ret2 != XDP_PASS) {
 5359			ret = NET_RX_DROP;
 5360			goto out;
 5361		}
 5362	}
 5363
 5364	if (eth_type_vlan(skb->protocol)) {
 5365		skb = skb_vlan_untag(skb);
 5366		if (unlikely(!skb))
 5367			goto out;
 5368	}
 5369
 5370	if (skb_skip_tc_classify(skb))
 5371		goto skip_classify;




 5372
 5373	if (pfmemalloc)
 5374		goto skip_taps;
 5375
	/* Delivery is deferred by one step: a handler is only invoked once
	 * the next one has been found, so the last handler found always stays
	 * pending in pt_prev.
	 */
 5376	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 5377		if (pt_prev)
 5378			ret = deliver_skb(skb, pt_prev, orig_dev);
 5379		pt_prev = ptype;
 5380	}
 5381
 5382	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5383		if (pt_prev)
 5384			ret = deliver_skb(skb, pt_prev, orig_dev);
 5385		pt_prev = ptype;
 5386	}
 5387
 5388skip_taps:
 5389#ifdef CONFIG_NET_INGRESS
 5390	if (static_branch_unlikely(&ingress_needed_key)) {
 5391		bool another = false;
 5392
 5393		nf_skip_egress(skb, true);
 5394		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
 5395					 &another);
 5396		if (another)
 5397			goto another_round;
 5398		if (!skb)
 5399			goto out;
 5400
 5401		nf_skip_egress(skb, false);
 5402		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5403			goto out;
 5404	}
 5405#endif
 5406	skb_reset_redirect(skb);
 5407skip_classify:


 5408	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5409		goto drop;
 5410
 5411	if (skb_vlan_tag_present(skb)) {
		/* Flush the pending handler before the skb may be re-targeted. */
 5412		if (pt_prev) {
 5413			ret = deliver_skb(skb, pt_prev, orig_dev);
 5414			pt_prev = NULL;
 5415		}
 5416		if (vlan_do_receive(&skb))
 5417			goto another_round;
 5418		else if (unlikely(!skb))
 5419			goto out;
 5420	}
 5421
 5422	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5423	if (rx_handler) {
 5424		if (pt_prev) {
 5425			ret = deliver_skb(skb, pt_prev, orig_dev);
 5426			pt_prev = NULL;
 5427		}
 5428		switch (rx_handler(&skb)) {
 5429		case RX_HANDLER_CONSUMED:
 5430			ret = NET_RX_SUCCESS;
 5431			goto out;
 5432		case RX_HANDLER_ANOTHER:
 5433			goto another_round;
 5434		case RX_HANDLER_EXACT:
 5435			deliver_exact = true;
 5436			break;
 5437		case RX_HANDLER_PASS:
 5438			break;
 5439		default:
 5440			BUG();
 5441		}
 5442	}
 5443
 5444	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
 5445check_vlan_id:
 5446		if (skb_vlan_tag_get_id(skb)) {
 5447			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 5448			 * find vlan device.
 5449			 */
 5450			skb->pkt_type = PACKET_OTHERHOST;
 5451		} else if (eth_type_vlan(skb->protocol)) {
 5452			/* Outer header is 802.1P with vlan 0, inner header is
 5453			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5454			 * not find vlan dev for vlan id 0.
 5455			 */
 5456			__vlan_hwaccel_clear_tag(skb);
 5457			skb = skb_vlan_untag(skb);
 5458			if (unlikely(!skb))
 5459				goto out;
 5460			if (vlan_do_receive(&skb))
 5461				/* After stripping off 802.1P header with vlan 0
 5462				 * vlan dev is found for inner header.
 5463				 */
 5464				goto another_round;
 5465			else if (unlikely(!skb))
 5466				goto out;
 5467			else
 5468				/* We have stripped outer 802.1P vlan 0 header.
 5469				 * But could not find vlan dev.
 5470				 * check again for vlan id to set OTHERHOST.
 5471				 */
 5472				goto check_vlan_id;
 5473		}
 5474		/* Note: we might in the future use prio bits
 5475		 * and set skb->priority like in vlan_do_receive()
 5476		 * For the time being, just ignore Priority Code Point
 5477		 */
 5478		__vlan_hwaccel_clear_tag(skb);
 5479	}
 5480
 5481	type = skb->protocol;
 5482
 5483	/* deliver only exact match when indicated */
 5484	if (likely(!deliver_exact)) {
 5485		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5486				       &ptype_base[ntohs(type) &
 5487						   PTYPE_HASH_MASK]);
 5488	}
 5489
 5490	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5491			       &orig_dev->ptype_specific);
 5492
 5493	if (unlikely(skb->dev != orig_dev)) {
 5494		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5495				       &skb->dev->ptype_specific);
 5496	}
 5497
	/* A handler is still pending: hand it back to the caller instead of
	 * calling it here. With no handler at all, or if
	 * skb_orphan_frags_rx() fails, the skb is dropped.
	 */
 5498	if (pt_prev) {
 5499		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5500			goto drop;
 5501		*ppt_prev = pt_prev;

 5502	} else {
 5503drop:
 5504		if (!deliver_exact)
 5505			dev_core_stats_rx_dropped_inc(skb->dev);
 5506		else
 5507			dev_core_stats_rx_nohandler_inc(skb->dev);
 5508		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
 5509		/* Jamal, now you will not able to escape explaining
 5510		 * me how you were going to use this. :-)
 5511		 */
 5512		ret = NET_RX_DROP;
 5513	}
 5514
 5515out:
 5516	/* The invariant here is that if *ppt_prev is not NULL
 5517	 * then skb should also be non-NULL.
 5518	 *
 5519	 * Apparently *ppt_prev assignment above holds this invariant due to
 5520	 * skb dereferencing near it.
 5521	 */
 5522	*pskb = skb;
 5523	return ret;
 5524}
 5525
 5526static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5527{
 5528	struct net_device *orig_dev = skb->dev;
 5529	struct packet_type *pt_prev = NULL;
 5530	int ret;
 5531
 5532	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5533	if (pt_prev)
 5534		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5535					 skb->dev, pt_prev, orig_dev);
 5536	return ret;
 5537}
 5538
 5539/**
 5540 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5541 *	@skb: buffer to process
 5542 *
 5543 *	More direct receive version of netif_receive_skb().  It should
 5544 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5545 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5546 *
 5547 *	This function may only be called from softirq context and interrupts
 5548 *	should be enabled.
 5549 *
 5550 *	Return values (usually ignored):
 5551 *	NET_RX_SUCCESS: no congestion
 5552 *	NET_RX_DROP: packet was dropped
 5553 */
 5554int netif_receive_skb_core(struct sk_buff *skb)
 5555{
 5556	int ret;
 5557
 5558	rcu_read_lock();
 5559	ret = __netif_receive_skb_one_core(skb, false);
 5560	rcu_read_unlock();
 5561
 5562	return ret;
 5563}
 5564EXPORT_SYMBOL(netif_receive_skb_core);
 5565
 5566static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5567						  struct packet_type *pt_prev,
 5568						  struct net_device *orig_dev)
 5569{
 5570	struct sk_buff *skb, *next;
 5571
 5572	if (!pt_prev)
 5573		return;
 5574	if (list_empty(head))
 5575		return;
 5576	if (pt_prev->list_func != NULL)
 5577		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5578				   ip_list_rcv, head, pt_prev, orig_dev);
 5579	else
 5580		list_for_each_entry_safe(skb, next, head, list) {
 5581			skb_list_del_init(skb);
 5582			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5583		}
 5584}
 5585
 5586static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5587{
 5588	/* Fast-path assumptions:
 5589	 * - There is no RX handler.
 5590	 * - Only one packet_type matches.
 5591	 * If either of these fails, we will end up doing some per-packet
 5592	 * processing in-line, then handling the 'last ptype' for the whole
 5593	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5594	 * because the 'last ptype' must be constant across the sublist, and all
 5595	 * other ptypes are handled per-packet.
 5596	 */
 5597	/* Current (common) ptype of sublist */
 5598	struct packet_type *pt_curr = NULL;
 5599	/* Current (common) orig_dev of sublist */
 5600	struct net_device *od_curr = NULL;
 5601	struct list_head sublist;
 5602	struct sk_buff *skb, *next;
 5603
 5604	INIT_LIST_HEAD(&sublist);
 5605	list_for_each_entry_safe(skb, next, head, list) {
 5606		struct net_device *orig_dev = skb->dev;
 5607		struct packet_type *pt_prev = NULL;
 5608
 5609		skb_list_del_init(skb);
 5610		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5611		if (!pt_prev)
 5612			continue;
 5613		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5614			/* dispatch old sublist */
 5615			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5616			/* start new sublist */
 5617			INIT_LIST_HEAD(&sublist);
 5618			pt_curr = pt_prev;
 5619			od_curr = orig_dev;
 5620		}
 5621		list_add_tail(&skb->list, &sublist);
 5622	}
 5623
 5624	/* dispatch final sublist */
 5625	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5626}
 5627
 5628static int __netif_receive_skb(struct sk_buff *skb)
 5629{
 5630	int ret;
 5631
 5632	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5633		unsigned int noreclaim_flag;
 5634
 5635		/*
 5636		 * PFMEMALLOC skbs are special, they should
 5637		 * - be delivered to SOCK_MEMALLOC sockets only
 5638		 * - stay away from userspace
 5639		 * - have bounded memory usage
 5640		 *
 5641		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5642		 * context down to all allocation sites.
 5643		 */
 5644		noreclaim_flag = memalloc_noreclaim_save();
 5645		ret = __netif_receive_skb_one_core(skb, true);
 5646		memalloc_noreclaim_restore(noreclaim_flag);
 5647	} else
 5648		ret = __netif_receive_skb_one_core(skb, false);
 5649
 5650	return ret;
 5651}
 5652
 5653static void __netif_receive_skb_list(struct list_head *head)
 5654{
 5655	unsigned long noreclaim_flag = 0;
 5656	struct sk_buff *skb, *next;
 5657	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5658
 5659	list_for_each_entry_safe(skb, next, head, list) {
 5660		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5661			struct list_head sublist;
 5662
 5663			/* Handle the previous sublist */
 5664			list_cut_before(&sublist, head, &skb->list);
 5665			if (!list_empty(&sublist))
 5666				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5667			pfmemalloc = !pfmemalloc;
 5668			/* See comments in __netif_receive_skb */
 5669			if (pfmemalloc)
 5670				noreclaim_flag = memalloc_noreclaim_save();
 5671			else
 5672				memalloc_noreclaim_restore(noreclaim_flag);
 5673		}
 5674	}
 5675	/* Handle the remaining sublist */
 5676	if (!list_empty(head))
 5677		__netif_receive_skb_list_core(head, pfmemalloc);
 5678	/* Restore pflags */
 5679	if (pfmemalloc)
 5680		memalloc_noreclaim_restore(noreclaim_flag);
 5681}
 5682
 5683static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5684{
 5685	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5686	struct bpf_prog *new = xdp->prog;
 5687	int ret = 0;
 5688
 5689	switch (xdp->command) {
 5690	case XDP_SETUP_PROG:
 5691		rcu_assign_pointer(dev->xdp_prog, new);
 5692		if (old)
 5693			bpf_prog_put(old);
 5694
 5695		if (old && !new) {
 5696			static_branch_dec(&generic_xdp_needed_key);
 5697		} else if (new && !old) {
 5698			static_branch_inc(&generic_xdp_needed_key);
 5699			dev_disable_lro(dev);
 5700			dev_disable_gro_hw(dev);
 5701		}
 5702		break;
 5703
 5704	default:
 5705		ret = -EINVAL;
 5706		break;
 5707	}
 5708
 5709	return ret;
 5710}
 5711
 5712static int netif_receive_skb_internal(struct sk_buff *skb)
 5713{
 5714	int ret;
 5715
 5716	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
 5717
 5718	if (skb_defer_rx_timestamp(skb))
 5719		return NET_RX_SUCCESS;
 5720
 5721	rcu_read_lock();
 
 5722#ifdef CONFIG_RPS
 5723	if (static_branch_unlikely(&rps_needed)) {
 5724		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5725		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5726
 5727		if (cpu >= 0) {
 5728			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5729			rcu_read_unlock();
 5730			return ret;
 5731		}
 5732	}
 5733#endif
 5734	ret = __netif_receive_skb(skb);
 5735	rcu_read_unlock();
 5736	return ret;
 5737}
 5738
 5739void netif_receive_skb_list_internal(struct list_head *head)
 5740{
 5741	struct sk_buff *skb, *next;
 5742	struct list_head sublist;
 5743
 5744	INIT_LIST_HEAD(&sublist);
 5745	list_for_each_entry_safe(skb, next, head, list) {
 5746		net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
 5747		skb_list_del_init(skb);
 5748		if (!skb_defer_rx_timestamp(skb))
 5749			list_add_tail(&skb->list, &sublist);
 5750	}
 5751	list_splice_init(&sublist, head);
 5752
 5753	rcu_read_lock();
 5754#ifdef CONFIG_RPS
 5755	if (static_branch_unlikely(&rps_needed)) {
 5756		list_for_each_entry_safe(skb, next, head, list) {
 5757			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5758			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5759
 5760			if (cpu >= 0) {
 5761				/* Will be handled, remove from list */
 5762				skb_list_del_init(skb);
 5763				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5764			}
 5765		}
 5766	}
 5767#endif
 5768	__netif_receive_skb_list(head);
 5769	rcu_read_unlock();
 5770}
 5771
 /**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	The call itself always succeeds; the buffer may still be dropped
 *	during processing, for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int status;

	trace_netif_receive_skb_entry(skb);
	status = netif_receive_skb_internal(skb);
	trace_netif_receive_skb_exit(status);

	return status;
}
EXPORT_SYMBOL(netif_receive_skb);
 5799
 5800/**
 5801 *	netif_receive_skb_list - process many receive buffers from network
 5802 *	@head: list of skbs to process.
 5803 *
 5804 *	Since return value of netif_receive_skb() is normally ignored, and
 5805 *	wouldn't be meaningful for a list, this function returns void.
 5806 *
 5807 *	This function may only be called from softirq context and interrupts
 5808 *	should be enabled.
 5809 */
 5810void netif_receive_skb_list(struct list_head *head)
 5811{
 5812	struct sk_buff *skb;
 5813
 5814	if (list_empty(head))
 5815		return;
 5816	if (trace_netif_receive_skb_list_entry_enabled()) {
 5817		list_for_each_entry(skb, head, list)
 5818			trace_netif_receive_skb_list_entry(skb);
 5819	}
 5820	netif_receive_skb_list_internal(head);
 5821	trace_netif_receive_skb_list_exit(0);
 5822}
 5823EXPORT_SYMBOL(netif_receive_skb_list);
 5824
 /* One work item per CPU. flush_all_backlogs() queues it on each online CPU
 * that flush_required() selects and then waits for it with flush_work().
 * NOTE(review): presumably initialised elsewhere to run flush_backlog();
 * the initialisation is not visible here, confirm.
 */
 5825static DEFINE_PER_CPU(struct work_struct, flush_works);
 5826
 5827/* Network device is going away, flush any packets still pending */
 5828static void flush_backlog(struct work_struct *work)
 5829{
 5830	struct sk_buff *skb, *tmp;
 5831	struct softnet_data *sd;
 5832
 5833	local_bh_disable();
 5834	sd = this_cpu_ptr(&softnet_data);
 5835
 5836	rps_lock_irq_disable(sd);
 
 5837	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5838		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5839			__skb_unlink(skb, &sd->input_pkt_queue);
 5840			dev_kfree_skb_irq(skb);
 5841			input_queue_head_incr(sd);
 5842		}
 5843	}
 5844	rps_unlock_irq_enable(sd);
 
 5845
 5846	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5847		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5848			__skb_unlink(skb, &sd->process_queue);
 5849			kfree_skb(skb);
 5850			input_queue_head_incr(sd);
 5851		}
 5852	}
 5853	local_bh_enable();
 5854}
 5855
 5856static bool flush_required(int cpu)
 5857{
 5858#if IS_ENABLED(CONFIG_RPS)
 5859	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 5860	bool do_flush;
 5861
 5862	rps_lock_irq_disable(sd);
 5863
 5864	/* as insertion into process_queue happens with the rps lock held,
 5865	 * process_queue access may race only with dequeue
 5866	 */
 5867	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
 5868		   !skb_queue_empty_lockless(&sd->process_queue);
 5869	rps_unlock_irq_enable(sd);
 5870
 5871	return do_flush;
 5872#endif
 5873	/* without RPS we can't safely check input_pkt_queue: during a
 5874	 * concurrent remote skb_queue_splice() we can detect as empty both
 5875	 * input_pkt_queue and process_queue even if the latter could end-up
 5876	 * containing a lot of packets.
 5877	 */
 5878	return true;
 5879}
 5880
 5881static void flush_all_backlogs(void)
 5882{
 5883	static cpumask_t flush_cpus;
 5884	unsigned int cpu;
 
 
 5885
 5886	/* since we are under rtnl lock protection we can use static data
 5887	 * for the cpumask and avoid allocating on stack the possibly
 5888	 * large mask
 5889	 */
 5890	ASSERT_RTNL();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5891
 5892	cpus_read_lock();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5893
 5894	cpumask_clear(&flush_cpus);
 5895	for_each_online_cpu(cpu) {
 5896		if (flush_required(cpu)) {
 5897			queue_work_on(cpu, system_highpri_wq,
 5898				      per_cpu_ptr(&flush_works, cpu));
 5899			cpumask_set_cpu(cpu, &flush_cpus);
 
 
 
 
 5900		}
 
 
 
 
 
 5901	}
 5902
 5903	/* we can have in flight packet[s] on the cpus we are not flushing,
 5904	 * synchronize_net() in unregister_netdevice_many() will take care of
 5905	 * them
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5906	 */
 5907	for_each_cpu(cpu, &flush_cpus)
 5908		flush_work(per_cpu_ptr(&flush_works, cpu));
 5909
 5910	cpus_read_unlock();
 5911}
 5912
 /* Walk the rps_ipi_next chain starting at @remsd and send an async
 * single-CPU function call to each entry whose CPU is online.
 * Compiles to nothing without CONFIG_RPS.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
	struct softnet_data *next;

	for (; remsd; remsd = next) {
		/* Fetch the link before the IPI is sent. */
		next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu, &remsd->csd);
	}
#endif
}
 
 5925
 /*
 * net_rps_action_and_irq_enable - send any pending RPS IPIs.
 * Note: entered with local irqs disabled, returns with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Kick RPS processing on the remote CPUs. */
		net_rps_send_ipi(pending);
		return;
	}
#endif
	local_irq_enable();
}
 5946
 5947static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 5948{
 5949#ifdef CONFIG_RPS
 5950	return sd->rps_ipi_list != NULL;
 5951#else
 5952	return false;
 5953#endif
 5954}
 5955
 5956static int process_backlog(struct napi_struct *napi, int quota)
 5957{
 5958	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 5959	bool again = true;
 5960	int work = 0;
 5961
 5962	/* Check if we have pending ipi, its better to send them now,
 5963	 * not waiting net_rx_action() end.
 5964	 */
 5965	if (sd_has_rps_ipi_waiting(sd)) {
 5966		local_irq_disable();
 5967		net_rps_action_and_irq_enable(sd);
 5968	}
 5969
 5970	napi->weight = READ_ONCE(dev_rx_weight);
 5971	while (again) {
 5972		struct sk_buff *skb;
 5973
 5974		while ((skb = __skb_dequeue(&sd->process_queue))) {
 5975			rcu_read_lock();
 5976			__netif_receive_skb(skb);
 5977			rcu_read_unlock();
 5978			input_queue_head_incr(sd);
 5979			if (++work >= quota)
 5980				return work;
 5981
 5982		}
 5983
 5984		rps_lock_irq_disable(sd);
 
 5985		if (skb_queue_empty(&sd->input_pkt_queue)) {
 5986			/*
 5987			 * Inline a custom version of __napi_complete().
 5988			 * only current cpu owns and manipulates this napi,
 5989			 * and NAPI_STATE_SCHED is the only possible flag set
 5990			 * on backlog.
 5991			 * We can use a plain write instead of clear_bit(),
 5992			 * and we dont need an smp_mb() memory barrier.
 5993			 */
 5994			napi->state = 0;
 5995			again = false;
 5996		} else {
 5997			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 5998						   &sd->process_queue);
 5999		}
 6000		rps_unlock_irq_enable(sd);
 
 6001	}
 6002
 6003	return work;
 6004}
 6005
 6006/**
 6007 * __napi_schedule - schedule for receive
 6008 * @n: entry to schedule
 6009 *
 6010 * The entry's receive function will be scheduled to run.
 6011 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6012 */
 6013void __napi_schedule(struct napi_struct *n)
 6014{
 6015	unsigned long flags;
 6016
 6017	local_irq_save(flags);
 6018	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6019	local_irq_restore(flags);
 6020}
 6021EXPORT_SYMBOL(__napi_schedule);
 6022
 6023/**
 6024 *	napi_schedule_prep - check if napi can be scheduled
 6025 *	@n: napi context
 6026 *
 6027 * Test if NAPI routine is already running, and if not mark
 6028 * it as running.  This is used as a condition variable to
 6029 * insure only one NAPI poll instance runs.  We also make
 6030 * sure there is no pending NAPI disable.
 6031 */
 6032bool napi_schedule_prep(struct napi_struct *n)
 6033{
 6034	unsigned long new, val = READ_ONCE(n->state);
 6035
 6036	do {
 
 6037		if (unlikely(val & NAPIF_STATE_DISABLE))
 6038			return false;
 6039		new = val | NAPIF_STATE_SCHED;
 6040
 6041		/* Sets STATE_MISSED bit if STATE_SCHED was already set
 6042		 * This was suggested by Alexander Duyck, as compiler
 6043		 * emits better code than :
 6044		 * if (val & NAPIF_STATE_SCHED)
 6045		 *     new |= NAPIF_STATE_MISSED;
 6046		 */
 6047		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 6048						   NAPIF_STATE_MISSED;
 6049	} while (!try_cmpxchg(&n->state, &val, new));
 6050
 6051	return !(val & NAPIF_STATE_SCHED);
 6052}
 6053EXPORT_SYMBOL(napi_schedule_prep);
 6054
 6055/**
 6056 * __napi_schedule_irqoff - schedule for receive
 6057 * @n: entry to schedule
 6058 *
 6059 * Variant of __napi_schedule() assuming hard irqs are masked.
 6060 *
 6061 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
 6062 * because the interrupt disabled assumption might not be true
 6063 * due to force-threaded interrupts and spinlock substitution.
 6064 */
 6065void __napi_schedule_irqoff(struct napi_struct *n)
 6066{
 6067	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6068		____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6069	else
 6070		__napi_schedule(n);
 6071}
 6072EXPORT_SYMBOL(__napi_schedule_irqoff);
 6073
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6074bool napi_complete_done(struct napi_struct *n, int work_done)
 6075{
 6076	unsigned long flags, val, new, timeout = 0;
 6077	bool ret = true;
 6078
 6079	/*
 6080	 * 1) Don't let napi dequeue from the cpu poll list
 6081	 *    just in case its running on a different cpu.
 6082	 * 2) If we are busy polling, do nothing here, we have
 6083	 *    the guarantee we will be called later.
 6084	 */
 6085	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 6086				 NAPIF_STATE_IN_BUSY_POLL)))
 6087		return false;
 6088
 6089	if (work_done) {
 6090		if (n->gro_bitmask)
 6091			timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6092		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
 6093	}
 6094	if (n->defer_hard_irqs_count > 0) {
 6095		n->defer_hard_irqs_count--;
 6096		timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6097		if (timeout)
 6098			ret = false;
 6099	}
 6100	if (n->gro_bitmask) {
 6101		/* When the NAPI instance uses a timeout and keeps postponing
 6102		 * it, we need to bound somehow the time packets are kept in
 6103		 * the GRO layer
 6104		 */
 6105		napi_gro_flush(n, !!timeout);
 6106	}
 6107
 6108	gro_normal_list(n);
 
 6109
 
 
 
 
 
 
 6110	if (unlikely(!list_empty(&n->poll_list))) {
 6111		/* If n->poll_list is not empty, we need to mask irqs */
 6112		local_irq_save(flags);
 6113		list_del_init(&n->poll_list);
 6114		local_irq_restore(flags);
 6115	}
 6116	WRITE_ONCE(n->list_owner, -1);
 6117
 6118	val = READ_ONCE(n->state);
 6119	do {
 
 
 6120		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 6121
 6122		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
 6123			      NAPIF_STATE_SCHED_THREADED |
 6124			      NAPIF_STATE_PREFER_BUSY_POLL);
 6125
 6126		/* If STATE_MISSED was set, leave STATE_SCHED set,
 6127		 * because we will call napi->poll() one more time.
 6128		 * This C code was suggested by Alexander Duyck to help gcc.
 6129		 */
 6130		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
 6131						    NAPIF_STATE_SCHED;
 6132	} while (!try_cmpxchg(&n->state, &val, new));
 6133
 6134	if (unlikely(val & NAPIF_STATE_MISSED)) {
 6135		__napi_schedule(n);
 6136		return false;
 6137	}
 6138
 6139	if (timeout)
 6140		hrtimer_start(&n->timer, ns_to_ktime(timeout),
 6141			      HRTIMER_MODE_REL_PINNED);
 6142	return ret;
 6143}
 6144EXPORT_SYMBOL(napi_complete_done);
 6145
 6146/* must be called under rcu_read_lock(), as we dont take a reference */
 6147struct napi_struct *napi_by_id(unsigned int napi_id)
 6148{
 6149	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6150	struct napi_struct *napi;
 6151
 6152	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6153		if (napi->napi_id == napi_id)
 6154			return napi;
 6155
 6156	return NULL;
 6157}
 6158
 6159#if defined(CONFIG_NET_RX_BUSY_POLL)
 6160
 6161static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 6162{
 6163	if (!skip_schedule) {
 6164		gro_normal_list(napi);
 6165		__napi_schedule(napi);
 6166		return;
 6167	}
 6168
 6169	if (napi->gro_bitmask) {
 6170		/* flush too old packets
 6171		 * If HZ < 1000, flush all packets.
 6172		 */
 6173		napi_gro_flush(napi, HZ >= 1000);
 6174	}
 6175
 6176	gro_normal_list(napi);
 6177	clear_bit(NAPI_STATE_SCHED, &napi->state);
 6178}
 6179
 6180static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
 6181			   u16 budget)
 6182{
 6183	bool skip_schedule = false;
 6184	unsigned long timeout;
 6185	int rc;
 6186
 6187	/* Busy polling means there is a high chance device driver hard irq
 6188	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6189	 * set in napi_schedule_prep().
 6190	 * Since we are about to call napi->poll() once more, we can safely
 6191	 * clear NAPI_STATE_MISSED.
 6192	 *
 6193	 * Note: x86 could use a single "lock and ..." instruction
 6194	 * to perform these two clear_bit()
 6195	 */
 6196	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6197	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6198
 6199	local_bh_disable();
 6200
 6201	if (prefer_busy_poll) {
 6202		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
 6203		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
 6204		if (napi->defer_hard_irqs_count && timeout) {
 6205			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
 6206			skip_schedule = true;
 6207		}
 6208	}
 6209
 6210	/* All we really want here is to re-enable device interrupts.
 6211	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6212	 */
 6213	rc = napi->poll(napi, budget);
 6214	/* We can't gro_normal_list() here, because napi->poll() might have
 6215	 * rearmed the napi (napi_complete_done()) in which case it could
 6216	 * already be running on another CPU.
 6217	 */
 6218	trace_napi_poll(napi, rc, budget);
 6219	netpoll_poll_unlock(have_poll_lock);
 6220	if (rc == budget)
 6221		__busy_poll_stop(napi, skip_schedule);
 6222	local_bh_enable();
 
 
 6223}
 6224
 6225void napi_busy_loop(unsigned int napi_id,
 6226		    bool (*loop_end)(void *, unsigned long),
 6227		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 6228{
 6229	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6230	int (*napi_poll)(struct napi_struct *napi, int budget);
 
 6231	void *have_poll_lock = NULL;
 6232	struct napi_struct *napi;
 
 6233
 6234restart:
 
 6235	napi_poll = NULL;
 6236
 6237	rcu_read_lock();
 6238
 6239	napi = napi_by_id(napi_id);
 6240	if (!napi)
 6241		goto out;
 6242
 6243	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6244		preempt_disable();
 6245	for (;;) {
 6246		int work = 0;
 6247
 
 
 
 6248		local_bh_disable();
 
 
 
 
 6249		if (!napi_poll) {
 6250			unsigned long val = READ_ONCE(napi->state);
 6251
 6252			/* If multiple threads are competing for this napi,
 6253			 * we avoid dirtying napi->state as much as we can.
 6254			 */
 6255			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6256				   NAPIF_STATE_IN_BUSY_POLL)) {
 6257				if (prefer_busy_poll)
 6258					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6259				goto count;
 6260			}
 6261			if (cmpxchg(&napi->state, val,
 6262				    val | NAPIF_STATE_IN_BUSY_POLL |
 6263					  NAPIF_STATE_SCHED) != val) {
 6264				if (prefer_busy_poll)
 6265					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6266				goto count;
 6267			}
 6268			have_poll_lock = netpoll_poll_lock(napi);
 6269			napi_poll = napi->poll;
 6270		}
 6271		work = napi_poll(napi, budget);
 6272		trace_napi_poll(napi, work, budget);
 6273		gro_normal_list(napi);
 6274count:
 6275		if (work > 0)
 6276			__NET_ADD_STATS(dev_net(napi->dev),
 6277					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6278		local_bh_enable();
 6279
 6280		if (!loop_end || loop_end(loop_end_arg, start_time))
 
 
 
 
 6281			break;
 6282
 6283		if (unlikely(need_resched())) {
 6284			if (napi_poll)
 6285				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
 6286			if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6287				preempt_enable();
 6288			rcu_read_unlock();
 6289			cond_resched();
 6290			if (loop_end(loop_end_arg, start_time))
 6291				return;
 
 6292			goto restart;
 6293		}
 6294		cpu_relax();
 6295	}
 6296	if (napi_poll)
 6297		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
 6298	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6299		preempt_enable();
 6300out:
 6301	rcu_read_unlock();
 
 6302}
 6303EXPORT_SYMBOL(napi_busy_loop);
 6304
 6305#endif /* CONFIG_NET_RX_BUSY_POLL */
 6306
 6307static void napi_hash_add(struct napi_struct *napi)
 6308{
 6309	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
 
 6310		return;
 6311
 6312	spin_lock(&napi_hash_lock);
 6313
 6314	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6315	do {
 6316		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6317			napi_gen_id = MIN_NAPI_ID;
 6318	} while (napi_by_id(napi_gen_id));
 6319	napi->napi_id = napi_gen_id;
 6320
 6321	hlist_add_head_rcu(&napi->napi_hash_node,
 6322			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6323
 6324	spin_unlock(&napi_hash_lock);
 6325}
 6326
 6327/* Warning : caller is responsible to make sure rcu grace period
 6328 * is respected before freeing memory containing @napi
 6329 */
 6330static void napi_hash_del(struct napi_struct *napi)
 6331{
 6332	spin_lock(&napi_hash_lock);
 6333
 6334	hlist_del_init_rcu(&napi->napi_hash_node);
 6335
 
 
 
 
 6336	spin_unlock(&napi_hash_lock);
 
 6337}
 
 6338
 6339static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6340{
 6341	struct napi_struct *napi;
 6342
 6343	napi = container_of(timer, struct napi_struct, timer);
 6344
 6345	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6346	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6347	 */
 6348	if (!napi_disable_pending(napi) &&
 6349	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
 6350		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6351		__napi_schedule_irqoff(napi);
 6352	}
 6353
 6354	return HRTIMER_NORESTART;
 6355}
 6356
 6357static void init_gro_hash(struct napi_struct *napi)
 6358{
 6359	int i;
 6360
 6361	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6362		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6363		napi->gro_hash[i].count = 0;
 6364	}
 6365	napi->gro_bitmask = 0;
 6366}
 6367
 6368int dev_set_threaded(struct net_device *dev, bool threaded)
 6369{
 6370	struct napi_struct *napi;
 6371	int err = 0;
 6372
 6373	if (dev->threaded == threaded)
 6374		return 0;
 6375
 6376	if (threaded) {
 6377		list_for_each_entry(napi, &dev->napi_list, dev_list) {
 6378			if (!napi->thread) {
 6379				err = napi_kthread_create(napi);
 6380				if (err) {
 6381					threaded = false;
 6382					break;
 6383				}
 6384			}
 6385		}
 6386	}
 6387
 6388	dev->threaded = threaded;
 6389
 6390	/* Make sure kthread is created before THREADED bit
 6391	 * is set.
 6392	 */
 6393	smp_mb__before_atomic();
 6394
 6395	/* Setting/unsetting threaded mode on a napi might not immediately
 6396	 * take effect, if the current napi instance is actively being
 6397	 * polled. In this case, the switch between threaded mode and
 6398	 * softirq mode will happen in the next round of napi_schedule().
 6399	 * This should not cause hiccups/stalls to the live traffic.
 6400	 */
 6401	list_for_each_entry(napi, &dev->napi_list, dev_list)
 6402		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
 6403
 6404	return err;
 6405}
 6406EXPORT_SYMBOL(dev_set_threaded);
 6407
 6408/**
 6409 * netif_queue_set_napi - Associate queue with the napi
 6410 * @dev: device to which NAPI and queue belong
 6411 * @queue_index: Index of queue
 6412 * @type: queue type as RX or TX
 6413 * @napi: NAPI context, pass NULL to clear previously set NAPI
 6414 *
 6415 * Set queue with its corresponding napi context. This should be done after
 6416 * registering the NAPI handler for the queue-vector and the queues have been
 6417 * mapped to the corresponding interrupt vector.
 6418 */
 6419void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 6420			  enum netdev_queue_type type, struct napi_struct *napi)
 6421{
 6422	struct netdev_rx_queue *rxq;
 6423	struct netdev_queue *txq;
 6424
 6425	if (WARN_ON_ONCE(napi && !napi->dev))
 6426		return;
 6427	if (dev->reg_state >= NETREG_REGISTERED)
 6428		ASSERT_RTNL();
 6429
 6430	switch (type) {
 6431	case NETDEV_QUEUE_TYPE_RX:
 6432		rxq = __netif_get_rx_queue(dev, queue_index);
 6433		rxq->napi = napi;
 6434		return;
 6435	case NETDEV_QUEUE_TYPE_TX:
 6436		txq = netdev_get_tx_queue(dev, queue_index);
 6437		txq->napi = napi;
 6438		return;
 6439	default:
 6440		return;
 6441	}
 6442}
 6443EXPORT_SYMBOL(netif_queue_set_napi);
 6444
 6445void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 6446			   int (*poll)(struct napi_struct *, int), int weight)
 6447{
 6448	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
 6449		return;
 6450
 6451	INIT_LIST_HEAD(&napi->poll_list);
 6452	INIT_HLIST_NODE(&napi->napi_hash_node);
 6453	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6454	napi->timer.function = napi_watchdog;
 6455	init_gro_hash(napi);
 
 6456	napi->skb = NULL;
 6457	INIT_LIST_HEAD(&napi->rx_list);
 6458	napi->rx_count = 0;
 6459	napi->poll = poll;
 6460	if (weight > NAPI_POLL_WEIGHT)
 6461		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6462				weight);
 6463	napi->weight = weight;
 
 6464	napi->dev = dev;
 6465#ifdef CONFIG_NETPOLL
 6466	napi->poll_owner = -1;
 6467#endif
 6468	napi->list_owner = -1;
 6469	set_bit(NAPI_STATE_SCHED, &napi->state);
 6470	set_bit(NAPI_STATE_NPSVC, &napi->state);
 6471	list_add_rcu(&napi->dev_list, &dev->napi_list);
 6472	napi_hash_add(napi);
 6473	napi_get_frags_check(napi);
 6474	/* Create kthread for this napi if dev->threaded is set.
 6475	 * Clear dev->threaded if kthread creation failed so that
 6476	 * threaded mode will not be enabled in napi_enable().
 6477	 */
 6478	if (dev->threaded && napi_kthread_create(napi))
 6479		dev->threaded = 0;
 6480	netif_napi_set_irq(napi, -1);
 6481}
 6482EXPORT_SYMBOL(netif_napi_add_weight);
 6483
 6484void napi_disable(struct napi_struct *n)
 6485{
 6486	unsigned long val, new;
 6487
 6488	might_sleep();
 6489	set_bit(NAPI_STATE_DISABLE, &n->state);
 6490
 6491	val = READ_ONCE(n->state);
 6492	do {
 6493		while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
 6494			usleep_range(20, 200);
 6495			val = READ_ONCE(n->state);
 6496		}
 6497
 6498		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
 6499		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
 6500	} while (!try_cmpxchg(&n->state, &val, new));
 6501
 6502	hrtimer_cancel(&n->timer);
 6503
 6504	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6505}
 6506EXPORT_SYMBOL(napi_disable);
 6507
 6508/**
 6509 *	napi_enable - enable NAPI scheduling
 6510 *	@n: NAPI context
 6511 *
 6512 * Resume NAPI from being scheduled on this context.
 6513 * Must be paired with napi_disable.
 6514 */
 6515void napi_enable(struct napi_struct *n)
 6516{
 6517	unsigned long new, val = READ_ONCE(n->state);
 6518
 6519	do {
 6520		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
 6521
 6522		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
 6523		if (n->dev->threaded && n->thread)
 6524			new |= NAPIF_STATE_THREADED;
 6525	} while (!try_cmpxchg(&n->state, &val, new));
 6526}
 6527EXPORT_SYMBOL(napi_enable);
 6528
 6529static void flush_gro_hash(struct napi_struct *napi)
 6530{
 6531	int i;
 6532
 6533	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6534		struct sk_buff *skb, *n;
 6535
 6536		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6537			kfree_skb(skb);
 6538		napi->gro_hash[i].count = 0;
 6539	}
 6540}
 6541
 6542/* Must be called in process context */
 6543void __netif_napi_del(struct napi_struct *napi)
 6544{
 6545	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 6546		return;
 6547
 6548	napi_hash_del(napi);
 6549	list_del_rcu(&napi->dev_list);
 6550	napi_free_frags(napi);
 6551
 6552	flush_gro_hash(napi);
 6553	napi->gro_bitmask = 0;
 6554
 6555	if (napi->thread) {
 6556		kthread_stop(napi->thread);
 6557		napi->thread = NULL;
 6558	}
 6559}
 6560EXPORT_SYMBOL(__netif_napi_del);
 6561
 6562static int __napi_poll(struct napi_struct *n, bool *repoll)
 6563{
 
 6564	int work, weight;
 6565
 
 
 
 
 6566	weight = n->weight;
 6567
 6568	/* This NAPI_STATE_SCHED test is for avoiding a race
 6569	 * with netpoll's poll_napi().  Only the entity which
 6570	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6571	 * actually make the ->poll() call.  Therefore we avoid
 6572	 * accidentally calling ->poll() when NAPI is not scheduled.
 6573	 */
 6574	work = 0;
 6575	if (napi_is_scheduled(n)) {
 6576		work = n->poll(n, weight);
 6577		trace_napi_poll(n, work, weight);
 6578
 6579		xdp_do_check_flushed(n);
 6580	}
 6581
 6582	if (unlikely(work > weight))
 6583		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
 6584				n->poll, work, weight);
 6585
 6586	if (likely(work < weight))
 6587		return work;
 6588
 6589	/* Drivers must not modify the NAPI state if they
 6590	 * consume the entire weight.  In such cases this code
 6591	 * still "owns" the NAPI instance and therefore can
 6592	 * move the instance around on the list at-will.
 6593	 */
 6594	if (unlikely(napi_disable_pending(n))) {
 6595		napi_complete(n);
 6596		return work;
 6597	}
 6598
 6599	/* The NAPI context has more processing work, but busy-polling
 6600	 * is preferred. Exit early.
 6601	 */
 6602	if (napi_prefer_busy_poll(n)) {
 6603		if (napi_complete_done(n, work)) {
 6604			/* If timeout is not set, we need to make sure
 6605			 * that the NAPI is re-scheduled.
 6606			 */
 6607			napi_schedule(n);
 6608		}
 6609		return work;
 6610	}
 6611
 6612	if (n->gro_bitmask) {
 6613		/* flush too old packets
 6614		 * If HZ < 1000, flush all packets.
 6615		 */
 6616		napi_gro_flush(n, HZ >= 1000);
 6617	}
 6618
 6619	gro_normal_list(n);
 6620
 6621	/* Some drivers may have called napi_schedule
 6622	 * prior to exhausting their budget.
 6623	 */
 6624	if (unlikely(!list_empty(&n->poll_list))) {
 6625		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6626			     n->dev ? n->dev->name : "backlog");
 6627		return work;
 6628	}
 6629
 6630	*repoll = true;
 6631
 6632	return work;
 6633}
 6634
 6635static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6636{
 6637	bool do_repoll = false;
 6638	void *have;
 6639	int work;
 6640
 6641	list_del_init(&n->poll_list);
 6642
 6643	have = netpoll_poll_lock(n);
 6644
 6645	work = __napi_poll(n, &do_repoll);
 6646
 6647	if (do_repoll)
 6648		list_add_tail(&n->poll_list, repoll);
 6649
 
 6650	netpoll_poll_unlock(have);
 6651
 6652	return work;
 6653}
 6654
 6655static int napi_thread_wait(struct napi_struct *napi)
 6656{
 6657	bool woken = false;
 6658
 6659	set_current_state(TASK_INTERRUPTIBLE);
 6660
 6661	while (!kthread_should_stop()) {
 6662		/* Testing SCHED_THREADED bit here to make sure the current
 6663		 * kthread owns this napi and could poll on this napi.
 6664		 * Testing SCHED bit is not enough because SCHED bit might be
 6665		 * set by some other busy poll thread or by napi_disable().
 6666		 */
 6667		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
 6668			WARN_ON(!list_empty(&napi->poll_list));
 6669			__set_current_state(TASK_RUNNING);
 6670			return 0;
 6671		}
 6672
 6673		schedule();
 6674		/* woken being true indicates this thread owns this napi. */
 6675		woken = true;
 6676		set_current_state(TASK_INTERRUPTIBLE);
 6677	}
 6678	__set_current_state(TASK_RUNNING);
 6679
 6680	return -1;
 6681}
 6682
 6683static void skb_defer_free_flush(struct softnet_data *sd)
 6684{
 6685	struct sk_buff *skb, *next;
 6686
 6687	/* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
 6688	if (!READ_ONCE(sd->defer_list))
 6689		return;
 6690
 6691	spin_lock(&sd->defer_lock);
 6692	skb = sd->defer_list;
 6693	sd->defer_list = NULL;
 6694	sd->defer_count = 0;
 6695	spin_unlock(&sd->defer_lock);
 6696
 6697	while (skb != NULL) {
 6698		next = skb->next;
 6699		napi_consume_skb(skb, 1);
 6700		skb = next;
 6701	}
 6702}
 6703
 6704static int napi_threaded_poll(void *data)
 6705{
 6706	struct napi_struct *napi = data;
 6707	struct softnet_data *sd;
 6708	void *have;
 6709
 6710	while (!napi_thread_wait(napi)) {
 6711		for (;;) {
 6712			bool repoll = false;
 6713
 6714			local_bh_disable();
 6715			sd = this_cpu_ptr(&softnet_data);
 6716			sd->in_napi_threaded_poll = true;
 6717
 6718			have = netpoll_poll_lock(napi);
 6719			__napi_poll(napi, &repoll);
 6720			netpoll_poll_unlock(have);
 6721
 6722			sd->in_napi_threaded_poll = false;
 6723			barrier();
 6724
 6725			if (sd_has_rps_ipi_waiting(sd)) {
 6726				local_irq_disable();
 6727				net_rps_action_and_irq_enable(sd);
 6728			}
 6729			skb_defer_free_flush(sd);
 6730			local_bh_enable();
 6731
 6732			if (!repoll)
 6733				break;
 6734
 6735			cond_resched();
 6736		}
 6737	}
 6738	return 0;
 6739}
 6740
 6741static __latent_entropy void net_rx_action(struct softirq_action *h)
 6742{
 6743	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 6744	unsigned long time_limit = jiffies +
 6745		usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
 6746	int budget = READ_ONCE(netdev_budget);
 6747	LIST_HEAD(list);
 6748	LIST_HEAD(repoll);
 6749
 6750start:
 6751	sd->in_net_rx_action = true;
 6752	local_irq_disable();
 6753	list_splice_init(&sd->poll_list, &list);
 6754	local_irq_enable();
 6755
 6756	for (;;) {
 6757		struct napi_struct *n;
 6758
 6759		skb_defer_free_flush(sd);
 6760
 6761		if (list_empty(&list)) {
 6762			if (list_empty(&repoll)) {
 6763				sd->in_net_rx_action = false;
 6764				barrier();
 6765				/* We need to check if ____napi_schedule()
 6766				 * had refilled poll_list while
 6767				 * sd->in_net_rx_action was true.
 6768				 */
 6769				if (!list_empty(&sd->poll_list))
 6770					goto start;
 6771				if (!sd_has_rps_ipi_waiting(sd))
 6772					goto end;
 6773			}
 6774			break;
 6775		}
 6776
 6777		n = list_first_entry(&list, struct napi_struct, poll_list);
 6778		budget -= napi_poll(n, &repoll);
 6779
 6780		/* If softirq window is exhausted then punt.
 6781		 * Allow this to run for 2 jiffies since which will allow
 6782		 * an average latency of 1.5/HZ.
 6783		 */
 6784		if (unlikely(budget <= 0 ||
 6785			     time_after_eq(jiffies, time_limit))) {
 6786			sd->time_squeeze++;
 6787			break;
 6788		}
 6789	}
 6790
 6791	local_irq_disable();
 6792
 6793	list_splice_tail_init(&sd->poll_list, &list);
 6794	list_splice_tail(&repoll, &list);
 6795	list_splice(&list, &sd->poll_list);
 6796	if (!list_empty(&sd->poll_list))
 6797		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 6798	else
 6799		sd->in_net_rx_action = false;
 6800
 6801	net_rps_action_and_irq_enable(sd);
 6802end:;
 
 6803}
 6804
 6805struct netdev_adjacent {
 6806	struct net_device *dev;
 6807	netdevice_tracker dev_tracker;
 6808
 6809	/* upper master flag, there can only be one master device per list */
 6810	bool master;
 6811
 6812	/* lookup ignore flag */
 6813	bool ignore;
 6814
 6815	/* counter for the number of times this device was added to us */
 6816	u16 ref_nr;
 6817
 6818	/* private field for the users */
 6819	void *private;
 6820
 6821	struct list_head list;
 6822	struct rcu_head rcu;
 6823};
 6824
 6825static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6826						 struct list_head *adj_list)
 6827{
 6828	struct netdev_adjacent *adj;
 6829
 6830	list_for_each_entry(adj, adj_list, list) {
 6831		if (adj->dev == adj_dev)
 6832			return adj;
 6833	}
 6834	return NULL;
 6835}
 6836
 6837static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 6838				    struct netdev_nested_priv *priv)
 6839{
 6840	struct net_device *dev = (struct net_device *)priv->data;
 6841
 6842	return upper_dev == dev;
 6843}
 6844
 6845/**
 6846 * netdev_has_upper_dev - Check if device is linked to an upper device
 6847 * @dev: device
 6848 * @upper_dev: upper device to check
 6849 *
 6850 * Find out if a device is linked to specified upper device and return true
 6851 * in case it is. Note that this checks only immediate upper device,
 6852 * not through a complete stack of devices. The caller must hold the RTNL lock.
 6853 */
 6854bool netdev_has_upper_dev(struct net_device *dev,
 6855			  struct net_device *upper_dev)
 6856{
 6857	struct netdev_nested_priv priv = {
 6858		.data = (void *)upper_dev,
 6859	};
 6860
 6861	ASSERT_RTNL();
 6862
 6863	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6864					     &priv);
 6865}
 6866EXPORT_SYMBOL(netdev_has_upper_dev);
 6867
 6868/**
 6869 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 6870 * @dev: device
 6871 * @upper_dev: upper device to check
 6872 *
 6873 * Find out if a device is linked to specified upper device and return true
 6874 * in case it is. Note that this checks the entire upper device chain.
 6875 * The caller must hold rcu lock.
 6876 */
 6877
 6878bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6879				  struct net_device *upper_dev)
 6880{
 6881	struct netdev_nested_priv priv = {
 6882		.data = (void *)upper_dev,
 6883	};
 6884
 6885	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6886					       &priv);
 6887}
 6888EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6889
 6890/**
 6891 * netdev_has_any_upper_dev - Check if device is linked to some device
 6892 * @dev: device
 6893 *
 6894 * Find out if a device is linked to an upper device and return true in case
 6895 * it is. The caller must hold the RTNL lock.
 6896 */
 6897bool netdev_has_any_upper_dev(struct net_device *dev)
 6898{
 6899	ASSERT_RTNL();
 6900
 6901	return !list_empty(&dev->adj_list.upper);
 6902}
 6903EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6904
 6905/**
 6906 * netdev_master_upper_dev_get - Get master upper device
 6907 * @dev: device
 6908 *
 6909 * Find a master upper device and return pointer to it or NULL in case
 6910 * it's not there. The caller must hold the RTNL lock.
 6911 */
 6912struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6913{
 6914	struct netdev_adjacent *upper;
 6915
 6916	ASSERT_RTNL();
 6917
 6918	if (list_empty(&dev->adj_list.upper))
 6919		return NULL;
 6920
 6921	upper = list_first_entry(&dev->adj_list.upper,
 6922				 struct netdev_adjacent, list);
 6923	if (likely(upper->master))
 6924		return upper->dev;
 6925	return NULL;
 6926}
 6927EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6928
 6929static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6930{
 6931	struct netdev_adjacent *upper;
 6932
 6933	ASSERT_RTNL();
 6934
 6935	if (list_empty(&dev->adj_list.upper))
 6936		return NULL;
 6937
 6938	upper = list_first_entry(&dev->adj_list.upper,
 6939				 struct netdev_adjacent, list);
 6940	if (likely(upper->master) && !upper->ignore)
 6941		return upper->dev;
 6942	return NULL;
 6943}
 6944
 6945/**
 6946 * netdev_has_any_lower_dev - Check if device is linked to some device
 6947 * @dev: device
 6948 *
 6949 * Find out if a device is linked to a lower device and return true in case
 6950 * it is. The caller must hold the RTNL lock.
 6951 */
 6952static bool netdev_has_any_lower_dev(struct net_device *dev)
 6953{
 6954	ASSERT_RTNL();
 6955
 6956	return !list_empty(&dev->adj_list.lower);
 6957}
 6958
 6959void *netdev_adjacent_get_private(struct list_head *adj_list)
 6960{
 6961	struct netdev_adjacent *adj;
 6962
 6963	adj = list_entry(adj_list, struct netdev_adjacent, list);
 6964
 6965	return adj->private;
 6966}
 6967EXPORT_SYMBOL(netdev_adjacent_get_private);
 6968
 6969/**
 6970 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 6971 * @dev: device
 6972 * @iter: list_head ** of the current position
 6973 *
 6974 * Gets the next device from the dev's upper list, starting from iter
 6975 * position. The caller must hold RCU read lock.
 6976 */
 6977struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 6978						 struct list_head **iter)
 6979{
 6980	struct netdev_adjacent *upper;
 6981
 6982	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6983
 6984	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6985
 6986	if (&upper->list == &dev->adj_list.upper)
 6987		return NULL;
 6988
 6989	*iter = &upper->list;
 6990
 6991	return upper->dev;
 6992}
 6993EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 6994
 6995static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 6996						  struct list_head **iter,
 6997						  bool *ignore)
 6998{
 6999	struct netdev_adjacent *upper;
 7000
 7001	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 7002
 7003	if (&upper->list == &dev->adj_list.upper)
 7004		return NULL;
 7005
 7006	*iter = &upper->list;
 7007	*ignore = upper->ignore;
 7008
 7009	return upper->dev;
 7010}
 7011
 7012static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 7013						    struct list_head **iter)
 7014{
 7015	struct netdev_adjacent *upper;
 7016
 7017	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7018
 7019	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7020
 7021	if (&upper->list == &dev->adj_list.upper)
 7022		return NULL;
 7023
 7024	*iter = &upper->list;
 7025
 7026	return upper->dev;
 7027}
 7028
 7029static int __netdev_walk_all_upper_dev(struct net_device *dev,
 7030				       int (*fn)(struct net_device *dev,
 7031					 struct netdev_nested_priv *priv),
 7032				       struct netdev_nested_priv *priv)
 7033{
 7034	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7035	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7036	int ret, cur = 0;
 7037	bool ignore;
 7038
 7039	now = dev;
 7040	iter = &dev->adj_list.upper;
 7041
 7042	while (1) {
 7043		if (now != dev) {
 7044			ret = fn(now, priv);
 7045			if (ret)
 7046				return ret;
 7047		}
 7048
 7049		next = NULL;
 7050		while (1) {
 7051			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7052			if (!udev)
 7053				break;
 7054			if (ignore)
 7055				continue;
 7056
 7057			next = udev;
 7058			niter = &udev->adj_list.upper;
 7059			dev_stack[cur] = now;
 7060			iter_stack[cur++] = iter;
 7061			break;
 7062		}
 7063
 7064		if (!next) {
 7065			if (!cur)
 7066				return 0;
 7067			next = dev_stack[--cur];
 7068			niter = iter_stack[cur];
 7069		}
 7070
 7071		now = next;
 7072		iter = niter;
 7073	}
 7074
 7075	return 0;
 7076}
 7077
 7078int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7079				  int (*fn)(struct net_device *dev,
 7080					    struct netdev_nested_priv *priv),
 7081				  struct netdev_nested_priv *priv)
 7082{
 7083	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7084	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7085	int ret, cur = 0;
 7086
 7087	now = dev;
 7088	iter = &dev->adj_list.upper;
 7089
 7090	while (1) {
 7091		if (now != dev) {
 7092			ret = fn(now, priv);
 7093			if (ret)
 7094				return ret;
 7095		}
 7096
 7097		next = NULL;
 7098		while (1) {
 7099			udev = netdev_next_upper_dev_rcu(now, &iter);
 7100			if (!udev)
 7101				break;
 7102
 7103			next = udev;
 7104			niter = &udev->adj_list.upper;
 7105			dev_stack[cur] = now;
 7106			iter_stack[cur++] = iter;
 7107			break;
 7108		}
 7109
 7110		if (!next) {
 7111			if (!cur)
 7112				return 0;
 7113			next = dev_stack[--cur];
 7114			niter = iter_stack[cur];
 7115		}
 
 
 7116
 7117		now = next;
 7118		iter = niter;
 
 
 7119	}
 7120
 7121	return 0;
 7122}
 7123EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 7124
 7125static bool __netdev_has_upper_dev(struct net_device *dev,
 7126				   struct net_device *upper_dev)
 7127{
 7128	struct netdev_nested_priv priv = {
 7129		.flags = 0,
 7130		.data = (void *)upper_dev,
 7131	};
 7132
 7133	ASSERT_RTNL();
 7134
 7135	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7136					   &priv);
 7137}
 7138
 7139/**
 7140 * netdev_lower_get_next_private - Get the next ->private from the
 7141 *				   lower neighbour list
 7142 * @dev: device
 7143 * @iter: list_head ** of the current position
 7144 *
 7145 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7146 * list, starting from iter position. The caller must hold either hold the
 7147 * RTNL lock or its own locking that guarantees that the neighbour lower
 7148 * list will remain unchanged.
 7149 */
 7150void *netdev_lower_get_next_private(struct net_device *dev,
 7151				    struct list_head **iter)
 7152{
 7153	struct netdev_adjacent *lower;
 7154
 7155	lower = list_entry(*iter, struct netdev_adjacent, list);
 7156
 7157	if (&lower->list == &dev->adj_list.lower)
 7158		return NULL;
 7159
 7160	*iter = lower->list.next;
 7161
 7162	return lower->private;
 7163}
 7164EXPORT_SYMBOL(netdev_lower_get_next_private);
 7165
 7166/**
 7167 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7168 *				       lower neighbour list, RCU
 7169 *				       variant
 7170 * @dev: device
 7171 * @iter: list_head ** of the current position
 7172 *
 7173 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7174 * list, starting from iter position. The caller must hold RCU read lock.
 7175 */
 7176void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7177					struct list_head **iter)
 7178{
 7179	struct netdev_adjacent *lower;
 7180
 7181	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
 7182
 7183	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7184
 7185	if (&lower->list == &dev->adj_list.lower)
 7186		return NULL;
 7187
 7188	*iter = &lower->list;
 7189
 7190	return lower->private;
 7191}
 7192EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7193
 7194/**
 7195 * netdev_lower_get_next - Get the next device from the lower neighbour
 7196 *                         list
 7197 * @dev: device
 7198 * @iter: list_head ** of the current position
 7199 *
 7200 * Gets the next netdev_adjacent from the dev's lower neighbour
 7201 * list, starting from iter position. The caller must hold RTNL lock or
 7202 * its own locking that guarantees that the neighbour lower
 7203 * list will remain unchanged.
 7204 */
 7205void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7206{
 7207	struct netdev_adjacent *lower;
 7208
 7209	lower = list_entry(*iter, struct netdev_adjacent, list);
 7210
 7211	if (&lower->list == &dev->adj_list.lower)
 7212		return NULL;
 7213
 7214	*iter = lower->list.next;
 7215
 7216	return lower->dev;
 7217}
 7218EXPORT_SYMBOL(netdev_lower_get_next);
 7219
 7220static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7221						struct list_head **iter)
 7222{
 7223	struct netdev_adjacent *lower;
 7224
 7225	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7226
 7227	if (&lower->list == &dev->adj_list.lower)
 7228		return NULL;
 7229
 7230	*iter = &lower->list;
 7231
 7232	return lower->dev;
 7233}
 7234
 7235static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7236						  struct list_head **iter,
 7237						  bool *ignore)
 7238{
 7239	struct netdev_adjacent *lower;
 7240
 7241	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7242
 7243	if (&lower->list == &dev->adj_list.lower)
 7244		return NULL;
 7245
 7246	*iter = &lower->list;
 7247	*ignore = lower->ignore;
 7248
 7249	return lower->dev;
 7250}
 7251
 7252int netdev_walk_all_lower_dev(struct net_device *dev,
 7253			      int (*fn)(struct net_device *dev,
 7254					struct netdev_nested_priv *priv),
 7255			      struct netdev_nested_priv *priv)
 7256{
 7257	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7258	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7259	int ret, cur = 0;
 7260
 7261	now = dev;
 7262	iter = &dev->adj_list.lower;
 7263
 7264	while (1) {
 7265		if (now != dev) {
 7266			ret = fn(now, priv);
 7267			if (ret)
 7268				return ret;
 7269		}
 7270
 7271		next = NULL;
 7272		while (1) {
 7273			ldev = netdev_next_lower_dev(now, &iter);
 7274			if (!ldev)
 7275				break;
 7276
 7277			next = ldev;
 7278			niter = &ldev->adj_list.lower;
 7279			dev_stack[cur] = now;
 7280			iter_stack[cur++] = iter;
 7281			break;
 7282		}
 7283
 7284		if (!next) {
 7285			if (!cur)
 7286				return 0;
 7287			next = dev_stack[--cur];
 7288			niter = iter_stack[cur];
 7289		}
 
 
 7290
 7291		now = next;
 7292		iter = niter;
 
 
 7293	}
 7294
 7295	return 0;
 7296}
 7297EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7298
 7299static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7300				       int (*fn)(struct net_device *dev,
 7301					 struct netdev_nested_priv *priv),
 7302				       struct netdev_nested_priv *priv)
 7303{
 7304	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7305	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7306	int ret, cur = 0;
 7307	bool ignore;
 7308
 7309	now = dev;
 7310	iter = &dev->adj_list.lower;
 7311
 7312	while (1) {
 7313		if (now != dev) {
 7314			ret = fn(now, priv);
 7315			if (ret)
 7316				return ret;
 7317		}
 7318
 7319		next = NULL;
 7320		while (1) {
 7321			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7322			if (!ldev)
 7323				break;
 7324			if (ignore)
 7325				continue;
 7326
 7327			next = ldev;
 7328			niter = &ldev->adj_list.lower;
 7329			dev_stack[cur] = now;
 7330			iter_stack[cur++] = iter;
 7331			break;
 7332		}
 7333
 7334		if (!next) {
 7335			if (!cur)
 7336				return 0;
 7337			next = dev_stack[--cur];
 7338			niter = iter_stack[cur];
 7339		}
 7340
 7341		now = next;
 7342		iter = niter;
 7343	}
 7344
 7345	return 0;
 7346}
 7347
 7348struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7349					     struct list_head **iter)
 7350{
 7351	struct netdev_adjacent *lower;
 7352
 7353	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7354	if (&lower->list == &dev->adj_list.lower)
 7355		return NULL;
 7356
 7357	*iter = &lower->list;
 7358
 7359	return lower->dev;
 7360}
 7361EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7362
 7363static u8 __netdev_upper_depth(struct net_device *dev)
 7364{
 7365	struct net_device *udev;
 7366	struct list_head *iter;
 7367	u8 max_depth = 0;
 7368	bool ignore;
 7369
 7370	for (iter = &dev->adj_list.upper,
 7371	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7372	     udev;
 7373	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7374		if (ignore)
 7375			continue;
 7376		if (max_depth < udev->upper_level)
 7377			max_depth = udev->upper_level;
 7378	}
 7379
 7380	return max_depth;
 7381}
 7382
 7383static u8 __netdev_lower_depth(struct net_device *dev)
 7384{
 7385	struct net_device *ldev;
 7386	struct list_head *iter;
 7387	u8 max_depth = 0;
 7388	bool ignore;
 7389
 7390	for (iter = &dev->adj_list.lower,
 7391	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7392	     ldev;
 7393	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7394		if (ignore)
 7395			continue;
 7396		if (max_depth < ldev->lower_level)
 7397			max_depth = ldev->lower_level;
 7398	}
 7399
 7400	return max_depth;
 7401}
 7402
 7403static int __netdev_update_upper_level(struct net_device *dev,
 7404				       struct netdev_nested_priv *__unused)
 7405{
 7406	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7407	return 0;
 7408}
 7409
 7410#ifdef CONFIG_LOCKDEP
 7411static LIST_HEAD(net_unlink_list);
 7412
 7413static void net_unlink_todo(struct net_device *dev)
 7414{
 7415	if (list_empty(&dev->unlink_list))
 7416		list_add_tail(&dev->unlink_list, &net_unlink_list);
 7417}
 7418#endif
 7419
 7420static int __netdev_update_lower_level(struct net_device *dev,
 7421				       struct netdev_nested_priv *priv)
 7422{
 7423	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7424
 7425#ifdef CONFIG_LOCKDEP
 7426	if (!priv)
 7427		return 0;
 7428
 7429	if (priv->flags & NESTED_SYNC_IMM)
 7430		dev->nested_level = dev->lower_level - 1;
 7431	if (priv->flags & NESTED_SYNC_TODO)
 7432		net_unlink_todo(dev);
 7433#endif
 7434	return 0;
 7435}
 7436
 7437int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7438				  int (*fn)(struct net_device *dev,
 7439					    struct netdev_nested_priv *priv),
 7440				  struct netdev_nested_priv *priv)
 7441{
 7442	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7443	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7444	int ret, cur = 0;
 7445
 7446	now = dev;
 7447	iter = &dev->adj_list.lower;
 7448
 7449	while (1) {
 7450		if (now != dev) {
 7451			ret = fn(now, priv);
 7452			if (ret)
 7453				return ret;
 7454		}
 7455
 7456		next = NULL;
 7457		while (1) {
 7458			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7459			if (!ldev)
 7460				break;
 7461
 7462			next = ldev;
 7463			niter = &ldev->adj_list.lower;
 7464			dev_stack[cur] = now;
 7465			iter_stack[cur++] = iter;
 7466			break;
 7467		}
 7468
 7469		if (!next) {
 7470			if (!cur)
 7471				return 0;
 7472			next = dev_stack[--cur];
 7473			niter = iter_stack[cur];
 7474		}
 7475
 7476		now = next;
 7477		iter = niter;
 
 
 7478	}
 7479
 7480	return 0;
 7481}
 7482EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7483
 7484/**
 7485 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7486 *				       lower neighbour list, RCU
 7487 *				       variant
 7488 * @dev: device
 7489 *
 7490 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7491 * list. The caller must hold RCU read lock.
 7492 */
 7493void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7494{
 7495	struct netdev_adjacent *lower;
 7496
 7497	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7498			struct netdev_adjacent, list);
 7499	if (lower)
 7500		return lower->private;
 7501	return NULL;
 7502}
 7503EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7504
 7505/**
 7506 * netdev_master_upper_dev_get_rcu - Get master upper device
 7507 * @dev: device
 7508 *
 7509 * Find a master upper device and return pointer to it or NULL in case
 7510 * it's not there. The caller must hold the RCU read lock.
 7511 */
 7512struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7513{
 7514	struct netdev_adjacent *upper;
 7515
 7516	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7517				       struct netdev_adjacent, list);
 7518	if (upper && likely(upper->master))
 7519		return upper->dev;
 7520	return NULL;
 7521}
 7522EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7523
 7524static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7525			      struct net_device *adj_dev,
 7526			      struct list_head *dev_list)
 7527{
 7528	char linkname[IFNAMSIZ+7];
 7529
 7530	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7531		"upper_%s" : "lower_%s", adj_dev->name);
 7532	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7533				 linkname);
 7534}
 7535static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7536			       char *name,
 7537			       struct list_head *dev_list)
 7538{
 7539	char linkname[IFNAMSIZ+7];
 7540
 7541	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7542		"upper_%s" : "lower_%s", name);
 7543	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7544}
 7545
 7546static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7547						 struct net_device *adj_dev,
 7548						 struct list_head *dev_list)
 7549{
 7550	return (dev_list == &dev->adj_list.upper ||
 7551		dev_list == &dev->adj_list.lower) &&
 7552		net_eq(dev_net(dev), dev_net(adj_dev));
 7553}
 7554
 7555static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7556					struct net_device *adj_dev,
 7557					struct list_head *dev_list,
 7558					void *private, bool master)
 7559{
 7560	struct netdev_adjacent *adj;
 7561	int ret;
 7562
 7563	adj = __netdev_find_adj(adj_dev, dev_list);
 7564
 7565	if (adj) {
 7566		adj->ref_nr += 1;
 7567		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7568			 dev->name, adj_dev->name, adj->ref_nr);
 7569
 7570		return 0;
 7571	}
 7572
 7573	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7574	if (!adj)
 7575		return -ENOMEM;
 7576
 7577	adj->dev = adj_dev;
 7578	adj->master = master;
 7579	adj->ref_nr = 1;
 7580	adj->private = private;
 7581	adj->ignore = false;
 7582	netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
 7583
 7584	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7585		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7586
 7587	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7588		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7589		if (ret)
 7590			goto free_adj;
 7591	}
 7592
 7593	/* Ensure that master link is always the first item in list. */
 7594	if (master) {
 7595		ret = sysfs_create_link(&(dev->dev.kobj),
 7596					&(adj_dev->dev.kobj), "master");
 7597		if (ret)
 7598			goto remove_symlinks;
 7599
 7600		list_add_rcu(&adj->list, dev_list);
 7601	} else {
 7602		list_add_tail_rcu(&adj->list, dev_list);
 7603	}
 7604
 7605	return 0;
 7606
 7607remove_symlinks:
 7608	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7609		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7610free_adj:
 7611	netdev_put(adj_dev, &adj->dev_tracker);
 7612	kfree(adj);
 
 7613
 7614	return ret;
 7615}
 7616
 7617static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7618					 struct net_device *adj_dev,
 7619					 u16 ref_nr,
 7620					 struct list_head *dev_list)
 7621{
 7622	struct netdev_adjacent *adj;
 7623
 7624	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7625		 dev->name, adj_dev->name, ref_nr);
 7626
 7627	adj = __netdev_find_adj(adj_dev, dev_list);
 7628
 7629	if (!adj) {
 7630		pr_err("Adjacency does not exist for device %s from %s\n",
 7631		       dev->name, adj_dev->name);
 7632		WARN_ON(1);
 7633		return;
 7634	}
 7635
 7636	if (adj->ref_nr > ref_nr) {
 7637		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7638			 dev->name, adj_dev->name, ref_nr,
 7639			 adj->ref_nr - ref_nr);
 7640		adj->ref_nr -= ref_nr;
 7641		return;
 7642	}
 7643
 7644	if (adj->master)
 7645		sysfs_remove_link(&(dev->dev.kobj), "master");
 7646
 7647	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7648		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7649
 7650	list_del_rcu(&adj->list);
 7651	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7652		 adj_dev->name, dev->name, adj_dev->name);
 7653	netdev_put(adj_dev, &adj->dev_tracker);
 7654	kfree_rcu(adj, rcu);
 7655}
 7656
 7657static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7658					    struct net_device *upper_dev,
 7659					    struct list_head *up_list,
 7660					    struct list_head *down_list,
 7661					    void *private, bool master)
 7662{
 7663	int ret;
 7664
 7665	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7666					   private, master);
 7667	if (ret)
 7668		return ret;
 7669
 7670	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7671					   private, false);
 7672	if (ret) {
 7673		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7674		return ret;
 7675	}
 7676
 7677	return 0;
 7678}
 7679
 7680static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7681					       struct net_device *upper_dev,
 7682					       u16 ref_nr,
 7683					       struct list_head *up_list,
 7684					       struct list_head *down_list)
 7685{
 7686	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7687	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7688}
 7689
 7690static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7691						struct net_device *upper_dev,
 7692						void *private, bool master)
 7693{
 7694	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7695						&dev->adj_list.upper,
 7696						&upper_dev->adj_list.lower,
 7697						private, master);
 7698}
 7699
 7700static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7701						   struct net_device *upper_dev)
 7702{
 7703	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7704					   &dev->adj_list.upper,
 7705					   &upper_dev->adj_list.lower);
 7706}
 7707
 7708static int __netdev_upper_dev_link(struct net_device *dev,
 7709				   struct net_device *upper_dev, bool master,
 7710				   void *upper_priv, void *upper_info,
 7711				   struct netdev_nested_priv *priv,
 7712				   struct netlink_ext_ack *extack)
 7713{
 7714	struct netdev_notifier_changeupper_info changeupper_info = {
 7715		.info = {
 7716			.dev = dev,
 7717			.extack = extack,
 7718		},
 7719		.upper_dev = upper_dev,
 7720		.master = master,
 7721		.linking = true,
 7722		.upper_info = upper_info,
 7723	};
 7724	struct net_device *master_dev;
 7725	int ret = 0;
 7726
 7727	ASSERT_RTNL();
 7728
 7729	if (dev == upper_dev)
 7730		return -EBUSY;
 7731
 7732	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7733	if (__netdev_has_upper_dev(upper_dev, dev))
 7734		return -EBUSY;
 7735
 7736	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7737		return -EMLINK;
 7738
 7739	if (!master) {
 7740		if (__netdev_has_upper_dev(dev, upper_dev))
 7741			return -EEXIST;
 7742	} else {
 7743		master_dev = __netdev_master_upper_dev_get(dev);
 7744		if (master_dev)
 7745			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7746	}
 7747
 7748	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7749					    &changeupper_info.info);
 7750	ret = notifier_to_errno(ret);
 7751	if (ret)
 7752		return ret;
 7753
 7754	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7755						   master);
 7756	if (ret)
 7757		return ret;
 7758
 7759	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7760					    &changeupper_info.info);
 7761	ret = notifier_to_errno(ret);
 7762	if (ret)
 7763		goto rollback;
 7764
 7765	__netdev_update_upper_level(dev, NULL);
 7766	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7767
 7768	__netdev_update_lower_level(upper_dev, priv);
 7769	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7770				    priv);
 7771
 7772	return 0;
 7773
 7774rollback:
 7775	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7776
 7777	return ret;
 7778}
 7779
 7780/**
 7781 * netdev_upper_dev_link - Add a link to the upper device
 7782 * @dev: device
 7783 * @upper_dev: new upper device
 7784 * @extack: netlink extended ack
 7785 *
 7786 * Adds a link to device which is upper to this one. The caller must hold
 7787 * the RTNL lock. On a failure a negative errno code is returned.
 7788 * On success the reference counts are adjusted and the function
 7789 * returns zero.
 7790 */
 7791int netdev_upper_dev_link(struct net_device *dev,
 7792			  struct net_device *upper_dev,
 7793			  struct netlink_ext_ack *extack)
 7794{
 7795	struct netdev_nested_priv priv = {
 7796		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7797		.data = NULL,
 7798	};
 7799
 7800	return __netdev_upper_dev_link(dev, upper_dev, false,
 7801				       NULL, NULL, &priv, extack);
 7802}
 7803EXPORT_SYMBOL(netdev_upper_dev_link);
 7804
 7805/**
 7806 * netdev_master_upper_dev_link - Add a master link to the upper device
 7807 * @dev: device
 7808 * @upper_dev: new upper device
 7809 * @upper_priv: upper device private
 7810 * @upper_info: upper info to be passed down via notifier
 7811 * @extack: netlink extended ack
 7812 *
 7813 * Adds a link to device which is upper to this one. In this case, only
 7814 * one master upper device can be linked, although other non-master devices
 7815 * might be linked as well. The caller must hold the RTNL lock.
 7816 * On a failure a negative errno code is returned. On success the reference
 7817 * counts are adjusted and the function returns zero.
 7818 */
 7819int netdev_master_upper_dev_link(struct net_device *dev,
 7820				 struct net_device *upper_dev,
 7821				 void *upper_priv, void *upper_info,
 7822				 struct netlink_ext_ack *extack)
 7823{
 7824	struct netdev_nested_priv priv = {
 7825		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7826		.data = NULL,
 7827	};
 7828
 7829	return __netdev_upper_dev_link(dev, upper_dev, true,
 7830				       upper_priv, upper_info, &priv, extack);
 7831}
 7832EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7833
 7834static void __netdev_upper_dev_unlink(struct net_device *dev,
 7835				      struct net_device *upper_dev,
 7836				      struct netdev_nested_priv *priv)
 7837{
 7838	struct netdev_notifier_changeupper_info changeupper_info = {
 7839		.info = {
 7840			.dev = dev,
 7841		},
 7842		.upper_dev = upper_dev,
 7843		.linking = false,
 7844	};
 7845
 7846	ASSERT_RTNL();
 7847
 7848	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7849
 7850	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7851				      &changeupper_info.info);
 7852
 7853	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7854
 7855	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7856				      &changeupper_info.info);
 7857
 7858	__netdev_update_upper_level(dev, NULL);
 7859	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7860
 7861	__netdev_update_lower_level(upper_dev, priv);
 7862	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7863				    priv);
 7864}
 7865
 7866/**
 7867 * netdev_upper_dev_unlink - Removes a link to upper device
 7868 * @dev: device
 7869 * @upper_dev: new upper device
 7870 *
 7871 * Removes a link to device which is upper to this one. The caller must hold
 7872 * the RTNL lock.
 7873 */
 7874void netdev_upper_dev_unlink(struct net_device *dev,
 7875			     struct net_device *upper_dev)
 7876{
 7877	struct netdev_nested_priv priv = {
 7878		.flags = NESTED_SYNC_TODO,
 7879		.data = NULL,
 7880	};
 7881
 7882	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 7883}
 7884EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7885
 7886static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7887				      struct net_device *lower_dev,
 7888				      bool val)
 7889{
 7890	struct netdev_adjacent *adj;
 7891
 7892	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7893	if (adj)
 7894		adj->ignore = val;
 7895
 7896	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7897	if (adj)
 7898		adj->ignore = val;
 7899}
 7900
 7901static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7902					struct net_device *lower_dev)
 7903{
 7904	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7905}
 7906
 7907static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7908				       struct net_device *lower_dev)
 7909{
 7910	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7911}
 7912
 7913int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7914				   struct net_device *new_dev,
 7915				   struct net_device *dev,
 7916				   struct netlink_ext_ack *extack)
 7917{
 7918	struct netdev_nested_priv priv = {
 7919		.flags = 0,
 7920		.data = NULL,
 7921	};
 7922	int err;
 7923
 7924	if (!new_dev)
 7925		return 0;
 7926
 7927	if (old_dev && new_dev != old_dev)
 7928		netdev_adjacent_dev_disable(dev, old_dev);
 7929	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 7930				      extack);
 7931	if (err) {
 7932		if (old_dev && new_dev != old_dev)
 7933			netdev_adjacent_dev_enable(dev, old_dev);
 7934		return err;
 7935	}
 7936
 7937	return 0;
 7938}
 7939EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7940
 7941void netdev_adjacent_change_commit(struct net_device *old_dev,
 7942				   struct net_device *new_dev,
 7943				   struct net_device *dev)
 7944{
 7945	struct netdev_nested_priv priv = {
 7946		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7947		.data = NULL,
 7948	};
 7949
 7950	if (!new_dev || !old_dev)
 7951		return;
 7952
 7953	if (new_dev == old_dev)
 7954		return;
 7955
 7956	netdev_adjacent_dev_enable(dev, old_dev);
 7957	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 7958}
 7959EXPORT_SYMBOL(netdev_adjacent_change_commit);
 7960
 7961void netdev_adjacent_change_abort(struct net_device *old_dev,
 7962				  struct net_device *new_dev,
 7963				  struct net_device *dev)
 7964{
 7965	struct netdev_nested_priv priv = {
 7966		.flags = 0,
 7967		.data = NULL,
 7968	};
 7969
 7970	if (!new_dev)
 7971		return;
 7972
 7973	if (old_dev && new_dev != old_dev)
 7974		netdev_adjacent_dev_enable(dev, old_dev);
 7975
 7976	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 
 7977}
 7978EXPORT_SYMBOL(netdev_adjacent_change_abort);
 7979
 7980/**
 7981 * netdev_bonding_info_change - Dispatch event about slave change
 7982 * @dev: device
 7983 * @bonding_info: info to dispatch
 7984 *
 7985 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 7986 * The caller must hold the RTNL lock.
 7987 */
 7988void netdev_bonding_info_change(struct net_device *dev,
 7989				struct netdev_bonding_info *bonding_info)
 7990{
 7991	struct netdev_notifier_bonding_info info = {
 7992		.info.dev = dev,
 7993	};
 7994
 7995	memcpy(&info.bonding_info, bonding_info,
 7996	       sizeof(struct netdev_bonding_info));
 7997	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 7998				      &info.info);
 7999}
 8000EXPORT_SYMBOL(netdev_bonding_info_change);
 8001
 8002static int netdev_offload_xstats_enable_l3(struct net_device *dev,
 8003					   struct netlink_ext_ack *extack)
 8004{
 8005	struct netdev_notifier_offload_xstats_info info = {
 8006		.info.dev = dev,
 8007		.info.extack = extack,
 8008		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 8009	};
 8010	int err;
 8011	int rc;
 8012
 8013	dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
 8014					 GFP_KERNEL);
 8015	if (!dev->offload_xstats_l3)
 8016		return -ENOMEM;
 8017
 8018	rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
 8019						  NETDEV_OFFLOAD_XSTATS_DISABLE,
 8020						  &info.info);
 8021	err = notifier_to_errno(rc);
 8022	if (err)
 8023		goto free_stats;
 8024
 8025	return 0;
 8026
 8027free_stats:
 8028	kfree(dev->offload_xstats_l3);
 8029	dev->offload_xstats_l3 = NULL;
 8030	return err;
 8031}
 8032
 8033int netdev_offload_xstats_enable(struct net_device *dev,
 8034				 enum netdev_offload_xstats_type type,
 8035				 struct netlink_ext_ack *extack)
 8036{
 8037	ASSERT_RTNL();
 8038
 8039	if (netdev_offload_xstats_enabled(dev, type))
 8040		return -EALREADY;
 8041
 8042	switch (type) {
 8043	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8044		return netdev_offload_xstats_enable_l3(dev, extack);
 8045	}
 8046
 8047	WARN_ON(1);
 8048	return -EINVAL;
 8049}
 8050EXPORT_SYMBOL(netdev_offload_xstats_enable);
 8051
 8052static void netdev_offload_xstats_disable_l3(struct net_device *dev)
 8053{
 8054	struct netdev_notifier_offload_xstats_info info = {
 8055		.info.dev = dev,
 8056		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 8057	};
 8058
 8059	call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
 8060				      &info.info);
 8061	kfree(dev->offload_xstats_l3);
 8062	dev->offload_xstats_l3 = NULL;
 8063}
 8064
 8065int netdev_offload_xstats_disable(struct net_device *dev,
 8066				  enum netdev_offload_xstats_type type)
 8067{
 8068	ASSERT_RTNL();
 8069
 8070	if (!netdev_offload_xstats_enabled(dev, type))
 8071		return -EALREADY;
 8072
 8073	switch (type) {
 8074	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8075		netdev_offload_xstats_disable_l3(dev);
 8076		return 0;
 8077	}
 8078
 8079	WARN_ON(1);
 8080	return -EINVAL;
 8081}
 8082EXPORT_SYMBOL(netdev_offload_xstats_disable);
 8083
 8084static void netdev_offload_xstats_disable_all(struct net_device *dev)
 8085{
 8086	netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
 8087}
 8088
 8089static struct rtnl_hw_stats64 *
 8090netdev_offload_xstats_get_ptr(const struct net_device *dev,
 8091			      enum netdev_offload_xstats_type type)
 8092{
 8093	switch (type) {
 8094	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8095		return dev->offload_xstats_l3;
 8096	}
 8097
 8098	WARN_ON(1);
 8099	return NULL;
 8100}
 8101
 8102bool netdev_offload_xstats_enabled(const struct net_device *dev,
 8103				   enum netdev_offload_xstats_type type)
 8104{
 8105	ASSERT_RTNL();
 8106
 8107	return netdev_offload_xstats_get_ptr(dev, type);
 8108}
 8109EXPORT_SYMBOL(netdev_offload_xstats_enabled);
 8110
 8111struct netdev_notifier_offload_xstats_ru {
 8112	bool used;
 8113};
 8114
 8115struct netdev_notifier_offload_xstats_rd {
 8116	struct rtnl_hw_stats64 stats;
 8117	bool used;
 8118};
 8119
 8120static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
 8121				  const struct rtnl_hw_stats64 *src)
 8122{
 8123	dest->rx_packets	  += src->rx_packets;
 8124	dest->tx_packets	  += src->tx_packets;
 8125	dest->rx_bytes		  += src->rx_bytes;
 8126	dest->tx_bytes		  += src->tx_bytes;
 8127	dest->rx_errors		  += src->rx_errors;
 8128	dest->tx_errors		  += src->tx_errors;
 8129	dest->rx_dropped	  += src->rx_dropped;
 8130	dest->tx_dropped	  += src->tx_dropped;
 8131	dest->multicast		  += src->multicast;
 8132}
 8133
 8134static int netdev_offload_xstats_get_used(struct net_device *dev,
 8135					  enum netdev_offload_xstats_type type,
 8136					  bool *p_used,
 8137					  struct netlink_ext_ack *extack)
 8138{
 8139	struct netdev_notifier_offload_xstats_ru report_used = {};
 8140	struct netdev_notifier_offload_xstats_info info = {
 8141		.info.dev = dev,
 8142		.info.extack = extack,
 8143		.type = type,
 8144		.report_used = &report_used,
 8145	};
 8146	int rc;
 8147
 8148	WARN_ON(!netdev_offload_xstats_enabled(dev, type));
 8149	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 8150					   &info.info);
 8151	*p_used = report_used.used;
 8152	return notifier_to_errno(rc);
 8153}
 8154
 8155static int netdev_offload_xstats_get_stats(struct net_device *dev,
 8156					   enum netdev_offload_xstats_type type,
 8157					   struct rtnl_hw_stats64 *p_stats,
 8158					   bool *p_used,
 8159					   struct netlink_ext_ack *extack)
 8160{
 8161	struct netdev_notifier_offload_xstats_rd report_delta = {};
 8162	struct netdev_notifier_offload_xstats_info info = {
 8163		.info.dev = dev,
 8164		.info.extack = extack,
 8165		.type = type,
 8166		.report_delta = &report_delta,
 8167	};
 8168	struct rtnl_hw_stats64 *stats;
 8169	int rc;
 8170
 8171	stats = netdev_offload_xstats_get_ptr(dev, type);
 8172	if (WARN_ON(!stats))
 8173		return -EINVAL;
 8174
 8175	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 8176					   &info.info);
 8177
 8178	/* Cache whatever we got, even if there was an error, otherwise the
 8179	 * successful stats retrievals would get lost.
 8180	 */
 8181	netdev_hw_stats64_add(stats, &report_delta.stats);
 8182
 8183	if (p_stats)
 8184		*p_stats = *stats;
 8185	*p_used = report_delta.used;
 8186
 8187	return notifier_to_errno(rc);
 8188}
 8189
 8190int netdev_offload_xstats_get(struct net_device *dev,
 8191			      enum netdev_offload_xstats_type type,
 8192			      struct rtnl_hw_stats64 *p_stats, bool *p_used,
 8193			      struct netlink_ext_ack *extack)
 8194{
 8195	ASSERT_RTNL();
 8196
 8197	if (p_stats)
 8198		return netdev_offload_xstats_get_stats(dev, type, p_stats,
 8199						       p_used, extack);
 8200	else
 8201		return netdev_offload_xstats_get_used(dev, type, p_used,
 8202						      extack);
 8203}
 8204EXPORT_SYMBOL(netdev_offload_xstats_get);
 8205
 8206void
 8207netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
 8208				   const struct rtnl_hw_stats64 *stats)
 8209{
 8210	report_delta->used = true;
 8211	netdev_hw_stats64_add(&report_delta->stats, stats);
 8212}
 8213EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
 8214
 8215void
 8216netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
 8217{
 8218	report_used->used = true;
 8219}
 8220EXPORT_SYMBOL(netdev_offload_xstats_report_used);
 8221
 8222void netdev_offload_xstats_push_delta(struct net_device *dev,
 8223				      enum netdev_offload_xstats_type type,
 8224				      const struct rtnl_hw_stats64 *p_stats)
 8225{
 8226	struct rtnl_hw_stats64 *stats;
 8227
 8228	ASSERT_RTNL();
 8229
 8230	stats = netdev_offload_xstats_get_ptr(dev, type);
 8231	if (WARN_ON(!stats))
 8232		return;
 8233
 8234	netdev_hw_stats64_add(stats, p_stats);
 8235}
 8236EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
 8237
 8238/**
 8239 * netdev_get_xmit_slave - Get the xmit slave of master device
 8240 * @dev: device
 8241 * @skb: The packet
 8242 * @all_slaves: assume all the slaves are active
 8243 *
 8244 * The reference counters are not incremented so the caller must be
 8245 * careful with locks. The caller must hold RCU lock.
 8246 * %NULL is returned if no slave is found.
 8247 */
 8248
 8249struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 8250					 struct sk_buff *skb,
 8251					 bool all_slaves)
 8252{
 8253	const struct net_device_ops *ops = dev->netdev_ops;
 8254
 8255	if (!ops->ndo_get_xmit_slave)
 8256		return NULL;
 8257	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 8258}
 8259EXPORT_SYMBOL(netdev_get_xmit_slave);
 8260
 8261static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
 8262						  struct sock *sk)
 8263{
 8264	const struct net_device_ops *ops = dev->netdev_ops;
 8265
 8266	if (!ops->ndo_sk_get_lower_dev)
 8267		return NULL;
 8268	return ops->ndo_sk_get_lower_dev(dev, sk);
 8269}
 8270
 8271/**
 8272 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
 8273 * @dev: device
 8274 * @sk: the socket
 8275 *
 8276 * %NULL is returned if no lower device is found.
 8277 */
 8278
 8279struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
 8280					    struct sock *sk)
 8281{
 8282	struct net_device *lower;
 8283
 8284	lower = netdev_sk_get_lower_dev(dev, sk);
 8285	while (lower) {
 8286		dev = lower;
 8287		lower = netdev_sk_get_lower_dev(dev, sk);
 8288	}
 8289
 8290	return dev;
 8291}
 8292EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
 8293
 8294static void netdev_adjacent_add_links(struct net_device *dev)
 8295{
 8296	struct netdev_adjacent *iter;
 8297
 8298	struct net *net = dev_net(dev);
 8299
 8300	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8301		if (!net_eq(net, dev_net(iter->dev)))
 8302			continue;
 8303		netdev_adjacent_sysfs_add(iter->dev, dev,
 8304					  &iter->dev->adj_list.lower);
 8305		netdev_adjacent_sysfs_add(dev, iter->dev,
 8306					  &dev->adj_list.upper);
 8307	}
 8308
 8309	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8310		if (!net_eq(net, dev_net(iter->dev)))
 8311			continue;
 8312		netdev_adjacent_sysfs_add(iter->dev, dev,
 8313					  &iter->dev->adj_list.upper);
 8314		netdev_adjacent_sysfs_add(dev, iter->dev,
 8315					  &dev->adj_list.lower);
 8316	}
 8317}
 8318
 8319static void netdev_adjacent_del_links(struct net_device *dev)
 8320{
 8321	struct netdev_adjacent *iter;
 8322
 8323	struct net *net = dev_net(dev);
 8324
 8325	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8326		if (!net_eq(net, dev_net(iter->dev)))
 8327			continue;
 8328		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8329					  &iter->dev->adj_list.lower);
 8330		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8331					  &dev->adj_list.upper);
 8332	}
 8333
 8334	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8335		if (!net_eq(net, dev_net(iter->dev)))
 8336			continue;
 8337		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8338					  &iter->dev->adj_list.upper);
 8339		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8340					  &dev->adj_list.lower);
 8341	}
 8342}
 8343
 8344void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8345{
 8346	struct netdev_adjacent *iter;
 8347
 8348	struct net *net = dev_net(dev);
 8349
 8350	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8351		if (!net_eq(net, dev_net(iter->dev)))
 8352			continue;
 8353		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8354					  &iter->dev->adj_list.lower);
 8355		netdev_adjacent_sysfs_add(iter->dev, dev,
 8356					  &iter->dev->adj_list.lower);
 8357	}
 8358
 8359	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8360		if (!net_eq(net, dev_net(iter->dev)))
 8361			continue;
 8362		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8363					  &iter->dev->adj_list.upper);
 8364		netdev_adjacent_sysfs_add(iter->dev, dev,
 8365					  &iter->dev->adj_list.upper);
 8366	}
 8367}
 8368
 8369void *netdev_lower_dev_get_private(struct net_device *dev,
 8370				   struct net_device *lower_dev)
 8371{
 8372	struct netdev_adjacent *lower;
 8373
 8374	if (!lower_dev)
 8375		return NULL;
 8376	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8377	if (!lower)
 8378		return NULL;
 8379
 8380	return lower->private;
 8381}
 8382EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8383
 8384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8385/**
 8386 * netdev_lower_state_changed - Dispatch event about lower device state change
 8387 * @lower_dev: device
 8388 * @lower_state_info: state to dispatch
 8389 *
 8390 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8391 * The caller must hold the RTNL lock.
 8392 */
 8393void netdev_lower_state_changed(struct net_device *lower_dev,
 8394				void *lower_state_info)
 8395{
 8396	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8397		.info.dev = lower_dev,
 8398	};
 8399
 8400	ASSERT_RTNL();
 8401	changelowerstate_info.lower_state_info = lower_state_info;
 8402	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8403				      &changelowerstate_info.info);
 8404}
 8405EXPORT_SYMBOL(netdev_lower_state_changed);
 8406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8407static void dev_change_rx_flags(struct net_device *dev, int flags)
 8408{
 8409	const struct net_device_ops *ops = dev->netdev_ops;
 8410
 8411	if (ops->ndo_change_rx_flags)
 8412		ops->ndo_change_rx_flags(dev, flags);
 8413}
 8414
 8415static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8416{
 8417	unsigned int old_flags = dev->flags;
 8418	kuid_t uid;
 8419	kgid_t gid;
 8420
 8421	ASSERT_RTNL();
 8422
 8423	dev->flags |= IFF_PROMISC;
 8424	dev->promiscuity += inc;
 8425	if (dev->promiscuity == 0) {
 8426		/*
 8427		 * Avoid overflow.
 8428		 * If inc causes overflow, untouch promisc and return error.
 8429		 */
 8430		if (inc < 0)
 8431			dev->flags &= ~IFF_PROMISC;
 8432		else {
 8433			dev->promiscuity -= inc;
 8434			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
 
 8435			return -EOVERFLOW;
 8436		}
 8437	}
 8438	if (dev->flags != old_flags) {
 8439		netdev_info(dev, "%s promiscuous mode\n",
 8440			    dev->flags & IFF_PROMISC ? "entered" : "left");
 
 8441		if (audit_enabled) {
 8442			current_uid_gid(&uid, &gid);
 8443			audit_log(audit_context(), GFP_ATOMIC,
 8444				  AUDIT_ANOM_PROMISCUOUS,
 8445				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8446				  dev->name, (dev->flags & IFF_PROMISC),
 8447				  (old_flags & IFF_PROMISC),
 8448				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8449				  from_kuid(&init_user_ns, uid),
 8450				  from_kgid(&init_user_ns, gid),
 8451				  audit_get_sessionid(current));
 8452		}
 8453
 8454		dev_change_rx_flags(dev, IFF_PROMISC);
 8455	}
 8456	if (notify)
 8457		__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
 8458	return 0;
 8459}
 8460
 8461/**
 8462 *	dev_set_promiscuity	- update promiscuity count on a device
 8463 *	@dev: device
 8464 *	@inc: modifier
 8465 *
 8466 *	Add or remove promiscuity from a device. While the count in the device
 8467 *	remains above zero the interface remains promiscuous. Once it hits zero
 8468 *	the device reverts back to normal filtering operation. A negative inc
 8469 *	value is used to drop promiscuity on the device.
 8470 *	Return 0 if successful or a negative errno code on error.
 8471 */
 8472int dev_set_promiscuity(struct net_device *dev, int inc)
 8473{
 8474	unsigned int old_flags = dev->flags;
 8475	int err;
 8476
 8477	err = __dev_set_promiscuity(dev, inc, true);
 8478	if (err < 0)
 8479		return err;
 8480	if (dev->flags != old_flags)
 8481		dev_set_rx_mode(dev);
 8482	return err;
 8483}
 8484EXPORT_SYMBOL(dev_set_promiscuity);
 8485
 8486static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 8487{
 8488	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8489
 8490	ASSERT_RTNL();
 8491
 8492	dev->flags |= IFF_ALLMULTI;
 8493	dev->allmulti += inc;
 8494	if (dev->allmulti == 0) {
 8495		/*
 8496		 * Avoid overflow.
 8497		 * If inc causes overflow, untouch allmulti and return error.
 8498		 */
 8499		if (inc < 0)
 8500			dev->flags &= ~IFF_ALLMULTI;
 8501		else {
 8502			dev->allmulti -= inc;
 8503			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
 
 8504			return -EOVERFLOW;
 8505		}
 8506	}
 8507	if (dev->flags ^ old_flags) {
 8508		netdev_info(dev, "%s allmulticast mode\n",
 8509			    dev->flags & IFF_ALLMULTI ? "entered" : "left");
 8510		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8511		dev_set_rx_mode(dev);
 8512		if (notify)
 8513			__dev_notify_flags(dev, old_flags,
 8514					   dev->gflags ^ old_gflags, 0, NULL);
 8515	}
 8516	return 0;
 8517}
 8518
 8519/**
 8520 *	dev_set_allmulti	- update allmulti count on a device
 8521 *	@dev: device
 8522 *	@inc: modifier
 8523 *
 8524 *	Add or remove reception of all multicast frames to a device. While the
 8525 *	count in the device remains above zero the interface remains listening
 8526 *	to all interfaces. Once it hits zero the device reverts back to normal
 8527 *	filtering operation. A negative @inc value is used to drop the counter
 8528 *	when releasing a resource needing all multicasts.
 8529 *	Return 0 if successful or a negative errno code on error.
 8530 */
 8531
 8532int dev_set_allmulti(struct net_device *dev, int inc)
 8533{
 8534	return __dev_set_allmulti(dev, inc, true);
 8535}
 8536EXPORT_SYMBOL(dev_set_allmulti);
 8537
 8538/*
 8539 *	Upload unicast and multicast address lists to device and
 8540 *	configure RX filtering. When the device doesn't support unicast
 8541 *	filtering it is put in promiscuous mode while unicast addresses
 8542 *	are present.
 8543 */
 8544void __dev_set_rx_mode(struct net_device *dev)
 8545{
 8546	const struct net_device_ops *ops = dev->netdev_ops;
 8547
 8548	/* dev_open will call this function so the list will stay sane. */
 8549	if (!(dev->flags&IFF_UP))
 8550		return;
 8551
 8552	if (!netif_device_present(dev))
 8553		return;
 8554
 8555	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8556		/* Unicast addresses changes may only happen under the rtnl,
 8557		 * therefore calling __dev_set_promiscuity here is safe.
 8558		 */
 8559		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8560			__dev_set_promiscuity(dev, 1, false);
 8561			dev->uc_promisc = true;
 8562		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8563			__dev_set_promiscuity(dev, -1, false);
 8564			dev->uc_promisc = false;
 8565		}
 8566	}
 8567
 8568	if (ops->ndo_set_rx_mode)
 8569		ops->ndo_set_rx_mode(dev);
 8570}
 8571
 8572void dev_set_rx_mode(struct net_device *dev)
 8573{
 8574	netif_addr_lock_bh(dev);
 8575	__dev_set_rx_mode(dev);
 8576	netif_addr_unlock_bh(dev);
 8577}
 8578
 8579/**
 8580 *	dev_get_flags - get flags reported to userspace
 8581 *	@dev: device
 8582 *
 8583 *	Get the combination of flag bits exported through APIs to userspace.
 8584 */
 8585unsigned int dev_get_flags(const struct net_device *dev)
 8586{
 8587	unsigned int flags;
 8588
 8589	flags = (dev->flags & ~(IFF_PROMISC |
 8590				IFF_ALLMULTI |
 8591				IFF_RUNNING |
 8592				IFF_LOWER_UP |
 8593				IFF_DORMANT)) |
 8594		(dev->gflags & (IFF_PROMISC |
 8595				IFF_ALLMULTI));
 8596
 8597	if (netif_running(dev)) {
 8598		if (netif_oper_up(dev))
 8599			flags |= IFF_RUNNING;
 8600		if (netif_carrier_ok(dev))
 8601			flags |= IFF_LOWER_UP;
 8602		if (netif_dormant(dev))
 8603			flags |= IFF_DORMANT;
 8604	}
 8605
 8606	return flags;
 8607}
 8608EXPORT_SYMBOL(dev_get_flags);
 8609
 8610int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8611		       struct netlink_ext_ack *extack)
 8612{
 8613	unsigned int old_flags = dev->flags;
 8614	int ret;
 8615
 8616	ASSERT_RTNL();
 8617
 8618	/*
 8619	 *	Set the flags on our device.
 8620	 */
 8621
 8622	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8623			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8624			       IFF_AUTOMEDIA)) |
 8625		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8626				    IFF_ALLMULTI));
 8627
 8628	/*
 8629	 *	Load in the correct multicast list now the flags have changed.
 8630	 */
 8631
 8632	if ((old_flags ^ flags) & IFF_MULTICAST)
 8633		dev_change_rx_flags(dev, IFF_MULTICAST);
 8634
 8635	dev_set_rx_mode(dev);
 8636
 8637	/*
 8638	 *	Have we downed the interface. We handle IFF_UP ourselves
 8639	 *	according to user attempts to set it, rather than blindly
 8640	 *	setting it.
 8641	 */
 8642
 8643	ret = 0;
 8644	if ((old_flags ^ flags) & IFF_UP) {
 8645		if (old_flags & IFF_UP)
 8646			__dev_close(dev);
 8647		else
 8648			ret = __dev_open(dev, extack);
 8649	}
 8650
 8651	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 8652		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 8653		unsigned int old_flags = dev->flags;
 8654
 8655		dev->gflags ^= IFF_PROMISC;
 8656
 8657		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 8658			if (dev->flags != old_flags)
 8659				dev_set_rx_mode(dev);
 8660	}
 8661
 8662	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 8663	 * is important. Some (broken) drivers set IFF_PROMISC, when
 8664	 * IFF_ALLMULTI is requested not asking us and not reporting.
 8665	 */
 8666	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 8667		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 8668
 8669		dev->gflags ^= IFF_ALLMULTI;
 8670		__dev_set_allmulti(dev, inc, false);
 8671	}
 8672
 8673	return ret;
 8674}
 8675
 8676void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8677			unsigned int gchanges, u32 portid,
 8678			const struct nlmsghdr *nlh)
 8679{
 8680	unsigned int changes = dev->flags ^ old_flags;
 8681
 8682	if (gchanges)
 8683		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
 8684
 8685	if (changes & IFF_UP) {
 8686		if (dev->flags & IFF_UP)
 8687			call_netdevice_notifiers(NETDEV_UP, dev);
 8688		else
 8689			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8690	}
 8691
 8692	if (dev->flags & IFF_UP &&
 8693	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8694		struct netdev_notifier_change_info change_info = {
 8695			.info = {
 8696				.dev = dev,
 8697			},
 8698			.flags_changed = changes,
 8699		};
 8700
 8701		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 
 
 8702	}
 8703}
 8704
 8705/**
 8706 *	dev_change_flags - change device settings
 8707 *	@dev: device
 8708 *	@flags: device state flags
 8709 *	@extack: netlink extended ack
 8710 *
 8711 *	Change settings on device based state flags. The flags are
 8712 *	in the userspace exported format.
 8713 */
 8714int dev_change_flags(struct net_device *dev, unsigned int flags,
 8715		     struct netlink_ext_ack *extack)
 8716{
 8717	int ret;
 8718	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8719
 8720	ret = __dev_change_flags(dev, flags, extack);
 8721	if (ret < 0)
 8722		return ret;
 8723
 8724	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8725	__dev_notify_flags(dev, old_flags, changes, 0, NULL);
 8726	return ret;
 8727}
 8728EXPORT_SYMBOL(dev_change_flags);
 8729
 8730int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8731{
 8732	const struct net_device_ops *ops = dev->netdev_ops;
 8733
 8734	if (ops->ndo_change_mtu)
 8735		return ops->ndo_change_mtu(dev, new_mtu);
 8736
 8737	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8738	WRITE_ONCE(dev->mtu, new_mtu);
 8739	return 0;
 8740}
 8741EXPORT_SYMBOL(__dev_set_mtu);
 8742
 8743int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8744		     struct netlink_ext_ack *extack)
 8745{
 8746	/* MTU must be positive, and in range */
 8747	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8748		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8749		return -EINVAL;
 8750	}
 8751
 8752	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8753		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8754		return -EINVAL;
 8755	}
 8756	return 0;
 8757}
 8758
 8759/**
 8760 *	dev_set_mtu_ext - Change maximum transfer unit
 8761 *	@dev: device
 8762 *	@new_mtu: new transfer unit
 8763 *	@extack: netlink extended ack
 8764 *
 8765 *	Change the maximum transfer size of the network device.
 8766 */
 8767int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8768		    struct netlink_ext_ack *extack)
 8769{
 8770	int err, orig_mtu;
 8771
 8772	if (new_mtu == dev->mtu)
 8773		return 0;
 8774
 8775	err = dev_validate_mtu(dev, new_mtu, extack);
 8776	if (err)
 8777		return err;
 
 
 
 
 
 
 
 
 
 8778
 8779	if (!netif_device_present(dev))
 8780		return -ENODEV;
 8781
 8782	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8783	err = notifier_to_errno(err);
 8784	if (err)
 8785		return err;
 8786
 8787	orig_mtu = dev->mtu;
 8788	err = __dev_set_mtu(dev, new_mtu);
 8789
 8790	if (!err) {
 8791		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8792						   orig_mtu);
 8793		err = notifier_to_errno(err);
 8794		if (err) {
 8795			/* setting mtu back and notifying everyone again,
 8796			 * so that they have a chance to revert changes.
 8797			 */
 8798			__dev_set_mtu(dev, orig_mtu);
 8799			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8800						     new_mtu);
 8801		}
 8802	}
 8803	return err;
 8804}
 8805
 8806int dev_set_mtu(struct net_device *dev, int new_mtu)
 8807{
 8808	struct netlink_ext_ack extack;
 8809	int err;
 8810
 8811	memset(&extack, 0, sizeof(extack));
 8812	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8813	if (err && extack._msg)
 8814		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8815	return err;
 8816}
 8817EXPORT_SYMBOL(dev_set_mtu);
 8818
 8819/**
 8820 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8821 *	@dev: device
 8822 *	@new_len: new tx queue length
 8823 */
 8824int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8825{
 8826	unsigned int orig_len = dev->tx_queue_len;
 8827	int res;
 8828
 8829	if (new_len != (unsigned int)new_len)
 8830		return -ERANGE;
 8831
 8832	if (new_len != orig_len) {
 8833		dev->tx_queue_len = new_len;
 8834		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8835		res = notifier_to_errno(res);
 8836		if (res)
 8837			goto err_rollback;
 8838		res = dev_qdisc_change_tx_queue_len(dev);
 8839		if (res)
 8840			goto err_rollback;
 8841	}
 8842
 8843	return 0;
 8844
 8845err_rollback:
 8846	netdev_err(dev, "refused to change device tx_queue_len\n");
 8847	dev->tx_queue_len = orig_len;
 8848	return res;
 8849}
 8850
 8851/**
 8852 *	dev_set_group - Change group this device belongs to
 8853 *	@dev: device
 8854 *	@new_group: group this device should belong to
 8855 */
 8856void dev_set_group(struct net_device *dev, int new_group)
 8857{
 8858	dev->group = new_group;
 8859}
 8860
 8861/**
 8862 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8863 *	@dev: device
 8864 *	@addr: new address
 8865 *	@extack: netlink extended ack
 8866 */
 8867int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8868			      struct netlink_ext_ack *extack)
 8869{
 8870	struct netdev_notifier_pre_changeaddr_info info = {
 8871		.info.dev = dev,
 8872		.info.extack = extack,
 8873		.dev_addr = addr,
 8874	};
 8875	int rc;
 8876
 8877	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8878	return notifier_to_errno(rc);
 8879}
 8880EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8881
 8882/**
 8883 *	dev_set_mac_address - Change Media Access Control Address
 8884 *	@dev: device
 8885 *	@sa: new address
 8886 *	@extack: netlink extended ack
 8887 *
 8888 *	Change the hardware (MAC) address of the device
 8889 */
 8890int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8891			struct netlink_ext_ack *extack)
 8892{
 8893	const struct net_device_ops *ops = dev->netdev_ops;
 8894	int err;
 8895
 8896	if (!ops->ndo_set_mac_address)
 8897		return -EOPNOTSUPP;
 8898	if (sa->sa_family != dev->type)
 8899		return -EINVAL;
 8900	if (!netif_device_present(dev))
 8901		return -ENODEV;
 8902	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8903	if (err)
 8904		return err;
 8905	if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
 8906		err = ops->ndo_set_mac_address(dev, sa);
 8907		if (err)
 8908			return err;
 8909	}
 8910	dev->addr_assign_type = NET_ADDR_SET;
 8911	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8912	add_device_randomness(dev->dev_addr, dev->addr_len);
 8913	return 0;
 8914}
 8915EXPORT_SYMBOL(dev_set_mac_address);
 8916
 8917static DECLARE_RWSEM(dev_addr_sem);
 8918
 8919int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 8920			     struct netlink_ext_ack *extack)
 8921{
 8922	int ret;
 8923
 8924	down_write(&dev_addr_sem);
 8925	ret = dev_set_mac_address(dev, sa, extack);
 8926	up_write(&dev_addr_sem);
 8927	return ret;
 8928}
 8929EXPORT_SYMBOL(dev_set_mac_address_user);
 8930
 8931int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 8932{
 8933	size_t size = sizeof(sa->sa_data_min);
 8934	struct net_device *dev;
 8935	int ret = 0;
 8936
 8937	down_read(&dev_addr_sem);
 8938	rcu_read_lock();
 8939
 8940	dev = dev_get_by_name_rcu(net, dev_name);
 8941	if (!dev) {
 8942		ret = -ENODEV;
 8943		goto unlock;
 8944	}
 8945	if (!dev->addr_len)
 8946		memset(sa->sa_data, 0, size);
 8947	else
 8948		memcpy(sa->sa_data, dev->dev_addr,
 8949		       min_t(size_t, size, dev->addr_len));
 8950	sa->sa_family = dev->type;
 8951
 8952unlock:
 8953	rcu_read_unlock();
 8954	up_read(&dev_addr_sem);
 8955	return ret;
 8956}
 8957EXPORT_SYMBOL(dev_get_mac_address);
 8958
 8959/**
 8960 *	dev_change_carrier - Change device carrier
 8961 *	@dev: device
 8962 *	@new_carrier: new value
 8963 *
 8964 *	Change device carrier
 8965 */
 8966int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8967{
 8968	const struct net_device_ops *ops = dev->netdev_ops;
 8969
 8970	if (!ops->ndo_change_carrier)
 8971		return -EOPNOTSUPP;
 8972	if (!netif_device_present(dev))
 8973		return -ENODEV;
 8974	return ops->ndo_change_carrier(dev, new_carrier);
 8975}
 
 8976
 8977/**
 8978 *	dev_get_phys_port_id - Get device physical port ID
 8979 *	@dev: device
 8980 *	@ppid: port ID
 8981 *
 8982 *	Get device physical port ID
 8983 */
 8984int dev_get_phys_port_id(struct net_device *dev,
 8985			 struct netdev_phys_item_id *ppid)
 8986{
 8987	const struct net_device_ops *ops = dev->netdev_ops;
 8988
 8989	if (!ops->ndo_get_phys_port_id)
 8990		return -EOPNOTSUPP;
 8991	return ops->ndo_get_phys_port_id(dev, ppid);
 8992}
 
 8993
 8994/**
 8995 *	dev_get_phys_port_name - Get device physical port name
 8996 *	@dev: device
 8997 *	@name: port name
 8998 *	@len: limit of bytes to copy to name
 8999 *
 9000 *	Get device physical port name
 9001 */
 9002int dev_get_phys_port_name(struct net_device *dev,
 9003			   char *name, size_t len)
 9004{
 9005	const struct net_device_ops *ops = dev->netdev_ops;
 9006	int err;
 9007
 9008	if (ops->ndo_get_phys_port_name) {
 9009		err = ops->ndo_get_phys_port_name(dev, name, len);
 9010		if (err != -EOPNOTSUPP)
 9011			return err;
 9012	}
 9013	return devlink_compat_phys_port_name_get(dev, name, len);
 9014}
 9015
 9016/**
 9017 *	dev_get_port_parent_id - Get the device's port parent identifier
 9018 *	@dev: network device
 9019 *	@ppid: pointer to a storage for the port's parent identifier
 9020 *	@recurse: allow/disallow recursion to lower devices
 9021 *
 9022 *	Get the devices's port parent identifier
 9023 */
 9024int dev_get_port_parent_id(struct net_device *dev,
 9025			   struct netdev_phys_item_id *ppid,
 9026			   bool recurse)
 9027{
 9028	const struct net_device_ops *ops = dev->netdev_ops;
 9029	struct netdev_phys_item_id first = { };
 9030	struct net_device *lower_dev;
 9031	struct list_head *iter;
 9032	int err;
 9033
 9034	if (ops->ndo_get_port_parent_id) {
 9035		err = ops->ndo_get_port_parent_id(dev, ppid);
 9036		if (err != -EOPNOTSUPP)
 9037			return err;
 9038	}
 9039
 9040	err = devlink_compat_switch_id_get(dev, ppid);
 9041	if (!recurse || err != -EOPNOTSUPP)
 9042		return err;
 9043
 9044	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 9045		err = dev_get_port_parent_id(lower_dev, ppid, true);
 9046		if (err)
 9047			break;
 9048		if (!first.id_len)
 9049			first = *ppid;
 9050		else if (memcmp(&first, ppid, sizeof(*ppid)))
 9051			return -EOPNOTSUPP;
 9052	}
 9053
 9054	return err;
 9055}
 9056EXPORT_SYMBOL(dev_get_port_parent_id);
 9057
 9058/**
 9059 *	netdev_port_same_parent_id - Indicate if two network devices have
 9060 *	the same port parent identifier
 9061 *	@a: first network device
 9062 *	@b: second network device
 9063 */
 9064bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 9065{
 9066	struct netdev_phys_item_id a_id = { };
 9067	struct netdev_phys_item_id b_id = { };
 9068
 9069	if (dev_get_port_parent_id(a, &a_id, true) ||
 9070	    dev_get_port_parent_id(b, &b_id, true))
 9071		return false;
 9072
 9073	return netdev_phys_item_id_same(&a_id, &b_id);
 9074}
 9075EXPORT_SYMBOL(netdev_port_same_parent_id);
 9076
 9077/**
 9078 *	dev_change_proto_down - set carrier according to proto_down.
 9079 *
 9080 *	@dev: device
 9081 *	@proto_down: new value
 
 
 
 9082 */
 9083int dev_change_proto_down(struct net_device *dev, bool proto_down)
 9084{
 9085	if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
 
 
 9086		return -EOPNOTSUPP;
 9087	if (!netif_device_present(dev))
 9088		return -ENODEV;
 9089	if (proto_down)
 9090		netif_carrier_off(dev);
 9091	else
 9092		netif_carrier_on(dev);
 9093	dev->proto_down = proto_down;
 9094	return 0;
 9095}
 
 9096
 9097/**
 9098 *	dev_change_proto_down_reason - proto down reason
 9099 *
 9100 *	@dev: device
 9101 *	@mask: proto down mask
 9102 *	@value: proto down value
 
 
 9103 */
 9104void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 9105				  u32 value)
 9106{
 9107	int b;
 9108
 9109	if (!mask) {
 9110		dev->proto_down_reason = value;
 9111	} else {
 9112		for_each_set_bit(b, &mask, 32) {
 9113			if (value & (1 << b))
 9114				dev->proto_down_reason |= BIT(b);
 9115			else
 9116				dev->proto_down_reason &= ~BIT(b);
 9117		}
 9118	}
 9119}
 9120
/* BPF link that attaches an XDP program to a network device. */
struct bpf_xdp_link {
	struct bpf_link link;	/* generic BPF link; link.prog is the attached program */
	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
	int flags;		/* XDP flags the link is attached with (see dev_xdp_attach_link()) */
};
 9126
 9127static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 9128{
 9129	if (flags & XDP_FLAGS_HW_MODE)
 9130		return XDP_MODE_HW;
 9131	if (flags & XDP_FLAGS_DRV_MODE)
 9132		return XDP_MODE_DRV;
 9133	if (flags & XDP_FLAGS_SKB_MODE)
 9134		return XDP_MODE_SKB;
 9135	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 9136}
 9137
 9138static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 9139{
 9140	switch (mode) {
 9141	case XDP_MODE_SKB:
 9142		return generic_xdp_install;
 9143	case XDP_MODE_DRV:
 9144	case XDP_MODE_HW:
 9145		return dev->netdev_ops->ndo_bpf;
 9146	default:
 9147		return NULL;
 9148	}
 9149}
 9150
 9151static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
 9152					 enum bpf_xdp_mode mode)
 9153{
 9154	return dev->xdp_state[mode].link;
 9155}
 9156
 9157static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 9158				     enum bpf_xdp_mode mode)
 9159{
 9160	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 9161
 9162	if (link)
 9163		return link->link.prog;
 9164	return dev->xdp_state[mode].prog;
 9165}
 9166
 9167u8 dev_xdp_prog_count(struct net_device *dev)
 9168{
 9169	u8 count = 0;
 9170	int i;
 9171
 9172	for (i = 0; i < __MAX_XDP_MODE; i++)
 9173		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
 9174			count++;
 9175	return count;
 9176}
 9177EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
 9178
 9179u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 9180{
 9181	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 9182
 9183	return prog ? prog->aux->id : 0;
 9184}
 9185
 9186static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
 9187			     struct bpf_xdp_link *link)
 9188{
 9189	dev->xdp_state[mode].link = link;
 9190	dev->xdp_state[mode].prog = NULL;
 9191}
 9192
 9193static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 9194			     struct bpf_prog *prog)
 9195{
 9196	dev->xdp_state[mode].link = NULL;
 9197	dev->xdp_state[mode].prog = prog;
 9198}
 9199
 9200static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 9201			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
 9202			   u32 flags, struct bpf_prog *prog)
 9203{
 9204	struct netdev_bpf xdp;
 9205	int err;
 9206
 9207	memset(&xdp, 0, sizeof(xdp));
 9208	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 9209	xdp.extack = extack;
 9210	xdp.flags = flags;
 9211	xdp.prog = prog;
 9212
 9213	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
 9214	 * "moved" into driver), so they don't increment it on their own, but
 9215	 * they do decrement refcnt when program is detached or replaced.
 9216	 * Given net_device also owns link/prog, we need to bump refcnt here
 9217	 * to prevent drivers from underflowing it.
 9218	 */
 9219	if (prog)
 9220		bpf_prog_inc(prog);
 9221	err = bpf_op(dev, &xdp);
 9222	if (err) {
 9223		if (prog)
 9224			bpf_prog_put(prog);
 9225		return err;
 9226	}
 9227
 9228	if (mode != XDP_MODE_HW)
 9229		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
 9230
 9231	return 0;
 9232}
 9233
 9234static void dev_xdp_uninstall(struct net_device *dev)
 9235{
 9236	struct bpf_xdp_link *link;
 9237	struct bpf_prog *prog;
 9238	enum bpf_xdp_mode mode;
 9239	bpf_op_t bpf_op;
 9240
 9241	ASSERT_RTNL();
 9242
 9243	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
 9244		prog = dev_xdp_prog(dev, mode);
 9245		if (!prog)
 9246			continue;
 9247
 9248		bpf_op = dev_xdp_bpf_op(dev, mode);
 9249		if (!bpf_op)
 9250			continue;
 9251
 9252		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9253
 9254		/* auto-detach link from net device */
 9255		link = dev_xdp_link(dev, mode);
 9256		if (link)
 9257			link->dev = NULL;
 9258		else
 9259			bpf_prog_put(prog);
 9260
 9261		dev_xdp_set_link(dev, mode, NULL);
 9262	}
 9263}
 9264
/* Attach an XDP program to @dev, either directly (@new_prog, optionally
 * replacing @old_prog) or through a BPF link (@link); the two forms are
 * mutually exclusive.
 *
 * @dev:      target device
 * @extack:   receives a human-readable reason on failure
 * @link:     BPF link to attach, or NULL for a plain program attachment
 * @new_prog: program to install; NULL installs no program for the mode
 * @old_prog: program expected to be attached (used with XDP_FLAGS_REPLACE)
 * @flags:    XDP_FLAGS_* controlling mode selection and replace semantics
 *
 * All validation happens before the driver is called, so a rejected
 * request leaves the device state untouched. Must be called with RTNL
 * held. Returns 0 on success or a negative errno.
 */
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
			  struct bpf_prog *old_prog, u32 flags)
{
	/* number of explicit mode bits requested by the caller */
	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
	struct bpf_prog *cur_prog;
	struct net_device *upper;
	struct list_head *iter;
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;
	int err;

	ASSERT_RTNL();

	/* either link or prog attachment, never both */
	if (link && (new_prog || old_prog))
		return -EINVAL;
	/* link supports only XDP mode flags */
	if (link && (flags & ~XDP_FLAGS_MODES)) {
		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
		return -EINVAL;
	}
	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
	if (num_modes > 1) {
		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
		return -EINVAL;
	}
	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
		NL_SET_ERR_MSG(extack,
			       "More than one program loaded, unset mode is ambiguous");
		return -EINVAL;
	}
	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
		return -EINVAL;
	}

	mode = dev_xdp_mode(dev, flags);
	/* can't replace attached link */
	if (dev_xdp_link(dev, mode)) {
		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
		return -EBUSY;
	}

	/* don't allow if an upper device already has a program */
	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
		if (dev_xdp_prog_count(upper) > 0) {
			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
			return -EEXIST;
		}
	}

	/* program currently in effect for the selected mode, if any */
	cur_prog = dev_xdp_prog(dev, mode);
	/* can't replace attached prog with link */
	if (link && cur_prog) {
		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
		return -EBUSY;
	}
	/* with REPLACE the caller must name the program actually attached */
	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
		NL_SET_ERR_MSG(extack, "Active program does not match expected");
		return -EEXIST;
	}

	/* put effective new program into new_prog */
	if (link)
		new_prog = link->link.prog;

	/* checks that only apply when a program is actually being installed */
	if (new_prog) {
		bool offload = mode == XDP_MODE_HW;
		/* the non-offload mode that conflicts with the selected one */
		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
					       ? XDP_MODE_DRV : XDP_MODE_SKB;

		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
			NL_SET_ERR_MSG(extack, "XDP program already attached");
			return -EBUSY;
		}
		if (!offload && dev_xdp_prog(dev, other_mode)) {
			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
			return -EEXIST;
		}
		if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
			NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
			return -EINVAL;
		}
		if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
			NL_SET_ERR_MSG(extack, "Program bound to different device");
			return -EINVAL;
		}
		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
			return -EINVAL;
		}
		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
			return -EINVAL;
		}
	}

	/* don't call drivers if the effective program didn't change */
	if (new_prog != cur_prog) {
		bpf_op = dev_xdp_bpf_op(dev, mode);
		if (!bpf_op) {
			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
			return -EOPNOTSUPP;
		}

		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
		if (err)
			return err;
	}

	/* record the new attachment in the device's per-mode state */
	if (link)
		dev_xdp_set_link(dev, mode, link);
	else
		dev_xdp_set_prog(dev, mode, new_prog);
	/* drop the reference on the program that was in effect before */
	if (cur_prog)
		bpf_prog_put(cur_prog);

	return 0;
}
 9387
/* Attach a BPF XDP link to @dev.
 *
 * Thin wrapper around dev_xdp_attach(): the link is passed as-is, no
 * standalone new or expected-old program is supplied (both NULL), and the
 * attach flags are taken from link->flags.  Returns whatever
 * dev_xdp_attach() returns.
 */
static int dev_xdp_attach_link(struct net_device *dev,
			       struct netlink_ext_ack *extack,
			       struct bpf_xdp_link *link)
{
	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
}
 9394
 9395static int dev_xdp_detach_link(struct net_device *dev,
 9396			       struct netlink_ext_ack *extack,
 9397			       struct bpf_xdp_link *link)
 9398{
 9399	enum bpf_xdp_mode mode;
 9400	bpf_op_t bpf_op;
 9401
 9402	ASSERT_RTNL();
 
 
 9403
 9404	mode = dev_xdp_mode(dev, link->flags);
 9405	if (dev_xdp_link(dev, mode) != link)
 9406		return -EINVAL;
 9407
 9408	bpf_op = dev_xdp_bpf_op(dev, mode);
 9409	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9410	dev_xdp_set_link(dev, mode, NULL);
 9411	return 0;
 9412}
 
 9413
 9414static void bpf_xdp_link_release(struct bpf_link *link)
 
 
 
 
 
 
 
 
 9415{
 9416	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9417
 9418	rtnl_lock();
 9419
 9420	/* if racing with net_device's tear down, xdp_link->dev might be
 9421	 * already NULL, in which case link was already auto-detached
 9422	 */
 9423	if (xdp_link->dev) {
 9424		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
 9425		xdp_link->dev = NULL;
 9426	}
 9427
 9428	rtnl_unlock();
 9429}
 9430
/* Detach handler for XDP links (wired up as .detach in bpf_xdp_link_lops).
 *
 * Performs exactly the same work as bpf_xdp_link_release() and always
 * reports success.
 */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
	bpf_xdp_link_release(link);
	return 0;
}
 9436
 9437static void bpf_xdp_link_dealloc(struct bpf_link *link)
 9438{
 9439	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9440
 9441	kfree(xdp_link);
 9442}
 9443
 9444static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9445				     struct seq_file *seq)
 9446{
 9447	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9448	u32 ifindex = 0;
 9449
 9450	rtnl_lock();
 9451	if (xdp_link->dev)
 9452		ifindex = xdp_link->dev->ifindex;
 9453	rtnl_unlock();
 9454
 9455	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9456}
 9457
 9458static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9459				       struct bpf_link_info *info)
 9460{
 9461	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9462	u32 ifindex = 0;
 9463
 9464	rtnl_lock();
 9465	if (xdp_link->dev)
 9466		ifindex = xdp_link->dev->ifindex;
 9467	rtnl_unlock();
 9468
 9469	info->xdp.ifindex = ifindex;
 9470	return 0;
 9471}
 9472
/* bpf_link_ops.update_prog callback: replace the program behind an XDP link.
 *
 * Runs entirely under RTNL.  Fails with
 *   -ENOLINK  if the link no longer has a device,
 *   -EPERM    if @old_prog is given but is not the link's current program,
 *   -EINVAL   if type or expected_attach_type differ between the current
 *             and the new program,
 * or with whatever dev_xdp_install() returns.  On success @new_prog is
 * stored in link->prog and the reference on the previous program is dropped.
 *
 * Returns 0 on success, including the no-op case where @new_prog already is
 * the link's program.
 */
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
			       struct bpf_prog *old_prog)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;
	int err = 0;

	rtnl_lock();

	/* link might have been auto-released already, so fail */
	if (!xdp_link->dev) {
		err = -ENOLINK;
		goto out_unlock;
	}

	/* caller asked for a compare-and-swap against a specific program */
	if (old_prog && link->prog != old_prog) {
		err = -EPERM;
		goto out_unlock;
	}
	/* from here on old_prog always means "what the link currently runs" */
	old_prog = link->prog;
	if (old_prog->type != new_prog->type ||
	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (old_prog == new_prog) {
		/* no-op, don't disturb drivers */
		/* err is still 0 here; drop the reference passed in with new_prog */
		bpf_prog_put(new_prog);
		goto out_unlock;
	}

	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
			      xdp_link->flags, new_prog);
	if (err)
		goto out_unlock;

	/* publish the new program, then release the one it replaced */
	old_prog = xchg(&link->prog, new_prog);
	bpf_prog_put(old_prog);

out_unlock:
	rtnl_unlock();
	return err;
}
 9520
 9521static const struct bpf_link_ops bpf_xdp_link_lops = {
 9522	.release = bpf_xdp_link_release,
 9523	.dealloc = bpf_xdp_link_dealloc,
 9524	.detach = bpf_xdp_link_detach,
 9525	.show_fdinfo = bpf_xdp_link_show_fdinfo,
 9526	.fill_link_info = bpf_xdp_link_fill_link_info,
 9527	.update_prog = bpf_xdp_link_update,
 9528};
 9529
/* Create a BPF XDP link for @prog and attach it to a network device.
 *
 * The device is looked up by attr->link_create.target_ifindex in the network
 * namespace of the current task; attr->link_create.flags become the link's
 * flags.
 *
 * Returns the new link's file descriptor (the result of bpf_link_settle())
 * on success.  On failure returns -EINVAL if the device does not exist,
 * -ENOMEM if the link cannot be allocated, or the error reported by
 * bpf_link_prime() / dev_xdp_attach_link().
 */
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	struct net *net = current->nsproxy->net_ns;
	struct bpf_link_primer link_primer;
	struct netlink_ext_ack extack = {};
	struct bpf_xdp_link *link;
	struct net_device *dev;
	int err, fd;

	rtnl_lock();
	/* takes a device reference that every exit path below drops again */
	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
	if (!dev) {
		rtnl_unlock();
		return -EINVAL;
	}

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto unlock;
	}

	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
	link->dev = dev;
	link->flags = attr->link_create.flags;

	err = bpf_link_prime(&link->link, &link_primer);
	if (err) {
		/* priming failed: the link is freed directly here */
		kfree(link);
		goto unlock;
	}

	err = dev_xdp_attach_link(dev, &extack, link);
	rtnl_unlock();

	if (err) {
		/* Clear the device pointer first; with dev == NULL,
		 * bpf_xdp_link_release() skips the detach step should it run
		 * as part of the cleanup.
		 */
		link->dev = NULL;
		bpf_link_cleanup(&link_primer);
		trace_bpf_xdp_link_attach_failed(extack._msg);
		goto out_put_dev;
	}

	fd = bpf_link_settle(&link_primer);
	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
	dev_put(dev);
	return fd;

unlock:
	rtnl_unlock();

out_put_dev:
	dev_put(dev);
	return err;
}
 9584
 9585/**
 9586 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 9587 *	@dev: device
 9588 *	@extack: netlink extended ack
 9589 *	@fd: new program fd or negative value to clear
 9590 *	@expected_fd: old program fd that userspace expects to replace or clear
 9591 *	@flags: xdp-related flags
 9592 *
 9593 *	Set or clear a bpf program for a device
 9594 */
 9595int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 9596		      int fd, int expected_fd, u32 flags)
 9597{
 9598	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 9599	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 9600	int err;
 9601
 9602	ASSERT_RTNL();
 
 
 
 9603
 9604	if (fd >= 0) {
 9605		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 9606						 mode != XDP_MODE_SKB);
 9607		if (IS_ERR(new_prog))
 9608			return PTR_ERR(new_prog);
 9609	}
 9610
 9611	if (expected_fd >= 0) {
 9612		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
 9613						 mode != XDP_MODE_SKB);
 9614		if (IS_ERR(old_prog)) {
 9615			err = PTR_ERR(old_prog);
 9616			old_prog = NULL;
 9617			goto err_out;
 9618		}
 9619	}
 9620
 9621	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
 
 9622
 9623err_out:
 9624	if (err && new_prog)
 9625		bpf_prog_put(new_prog);
 9626	if (old_prog)
 9627		bpf_prog_put(old_prog);
 9628	return err;
 9629}
 9630
 9631/**
 9632 * dev_index_reserve() - allocate an ifindex in a namespace
 9633 * @net: the applicable net namespace
 9634 * @ifindex: requested ifindex, pass %0 to get one allocated
 9635 *
 9636 * Allocate a ifindex for a new device. Caller must either use the ifindex
 9637 * to store the device (via list_netdevice()) or call dev_index_release()
 9638 * to give the index up.
 9639 *
 9640 * Return: a suitable unique value for a new device interface number or -errno.
 9641 */
 9642static int dev_index_reserve(struct net *net, u32 ifindex)
 9643{
 9644	int err;
 9645
 9646	if (ifindex > INT_MAX) {
 9647		DEBUG_NET_WARN_ON_ONCE(1);
 9648		return -EINVAL;
 
 
 
 9649	}
 9650
 9651	if (!ifindex)
 9652		err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
 9653				      xa_limit_31b, &net->ifindex, GFP_KERNEL);
 9654	else
 9655		err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
 9656	if (err < 0)
 9657		return err;
 9658
 9659	return ifindex;
 
 9660}
 9661
 9662static void dev_index_release(struct net *net, int ifindex)
 9663{
 9664	/* Expect only unused indexes, unlist_netdevice() removes the used */
 9665	WARN_ON(xa_erase(&net->dev_by_index, ifindex));
 9666}
 9667
/* Delayed registration/unregistration */
/* List that net_set_todo() appends devices to via their todo_list member. */
LIST_HEAD(net_todo_list);
/* NOTE(review): presumably waited on by code that needs pending
 * unregistrations to finish - waiters are not visible in this section.
 */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 9671
 9672static void net_set_todo(struct net_device *dev)
 9673{
 9674	list_add_tail(&dev->todo_list, &net_todo_list);
 9675	atomic_inc(&dev_net(dev)->dev_unreg_count);
 9676}
 9677
 9678static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9679	struct net_device *upper, netdev_features_t features)
 9680{
 9681	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9682	netdev_features_t feature;
 9683	int feature_bit;
 9684
 9685	for_each_netdev_feature(upper_disables, feature_bit) {
 9686		feature = __NETIF_F_BIT(feature_bit);
 9687		if (!(upper->wanted_features & feature)
 9688		    && (features & feature)) {
 9689			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9690				   &feature, upper->name);
 9691			features &= ~feature;
 9692		}
 9693	}
 9694
 9695	return features;
 9696}
 9697
 9698static void netdev_sync_lower_features(struct net_device *upper,
 9699	struct net_device *lower, netdev_features_t features)
 9700{
 9701	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9702	netdev_features_t feature;
 9703	int feature_bit;
 9704
 9705	for_each_netdev_feature(upper_disables, feature_bit) {
 9706		feature = __NETIF_F_BIT(feature_bit);
 9707		if (!(features & feature) && (lower->features & feature)) {
 9708			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9709				   &feature, lower->name);
 9710			lower->wanted_features &= ~feature;
 9711			__netdev_update_features(lower);
 9712
 9713			if (unlikely(lower->features & feature))
 9714				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9715					    &feature, lower->name);
 9716			else
 9717				netdev_features_change(lower);
 9718		}
 9719	}
 9720}
 9721
/* Resolve inconsistent feature combinations in @features for @dev.
 *
 * Each check below clears features whose prerequisites are missing or that
 * conflict with another requested feature; bits are only ever cleared, never
 * set.  The checks run in sequence on the same mask, so an earlier drop can
 * cause a later one.  Returns the corrected feature set.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	/* IPv4 TSO needs either generic (HW) or IPv4 checksum offload */
	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	/* IPv6 TSO needs either generic (HW) or IPv6 checksum offload */
	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
		features &= ~NETIF_F_TSO_MANGLEID;

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* GSO partial features require GSO partial be set */
	if ((features & dev->gso_partial_features) &&
	    !(features & NETIF_F_GSO_PARTIAL)) {
		netdev_dbg(dev,
			   "Dropping partially supported GSO features since no GSO partial.\n");
		features &= ~dev->gso_partial_features;
	}

	if (!(features & NETIF_F_RXCSUM)) {
		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
		 * successfully merged by hardware must also have the
		 * checksum verified by hardware.  If the user does not
		 * want to enable RXCSUM, logically, we should disable GRO_HW.
		 */
		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
			features &= ~NETIF_F_GRO_HW;
		}
	}

	/* LRO/HW-GRO features cannot be combined with RX-FCS */
	if (features & NETIF_F_RXFCS) {
		if (features & NETIF_F_LRO) {
			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_LRO;
		}

		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_GRO_HW;
		}
	}

	/* HW-GRO and LRO are mutually exclusive; HW-GRO wins */
	if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
		features &= ~NETIF_F_LRO;
	}

	/* TLS TX offload needs HW checksumming, or both IPv4 and IPv6 checksumming */
	if (features & NETIF_F_HW_TLS_TX) {
		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
		bool hw_csum = features & NETIF_F_HW_CSUM;

		if (!ip_csum && !hw_csum) {
			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
			features &= ~NETIF_F_HW_TLS_TX;
		}
	}

	/* TLS RX offload needs RX checksum offload */
	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
		features &= ~NETIF_F_HW_TLS_RX;
	}

	return features;
}
 9821
/* Recompute and apply the feature set of @dev.  Caller must hold RTNL.
 *
 * The wanted features are passed through the driver's ndo_fix_features()
 * (if any), netdev_fix_features() and the constraints of all upper devices.
 * If the result differs from dev->features it is handed to
 * ndo_set_features() (if any), and lower devices are then synchronised.
 *
 * Return value, as consumed by netdev_update_features():
 *   -1  ndo_set_features() failed (non-zero so the caller still notifies),
 *    0  nothing changed, or a VLAN filter sync below reported an error,
 *    1  otherwise.
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	/* stays -1 when the feature set turns out to be unchanged */
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	/* only when ndo_set_features() returned exactly 0 (or is absent) is
	 * dev->features updated here
	 */
	if (!err) {
		netdev_features_t diff = features ^ dev->features;

		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
			/* udp_tunnel_{get,drop}_rx_info both need
			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
			 * device, or they won't do anything.
			 * Thus we need to update dev->features
			 * *before* calling udp_tunnel_get_rx_info,
			 * but *after* calling udp_tunnel_drop_rx_info.
			 */
			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
				dev->features = features;
				udp_tunnel_get_rx_info(dev);
			} else {
				udp_tunnel_drop_rx_info(dev);
			}
		}

		/* same ordering as above: set dev->features before "get",
		 * leave it untouched until after "drop"
		 */
		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_ctag_filter_info(dev);
			} else {
				vlan_drop_rx_ctag_filter_info(dev);
			}
		}

		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_stag_filter_info(dev);
			} else {
				vlan_drop_rx_stag_filter_info(dev);
			}
		}

		dev->features = features;
	}

	return err < 0 ? 0 : 1;
}
 9913
 9914/**
 9915 *	netdev_update_features - recalculate device features
 9916 *	@dev: the device to check
 9917 *
 9918 *	Recalculate dev->features set and send notifications if it
 9919 *	has changed. Should be called after driver or hardware dependent
 9920 *	conditions might have changed that influence the features.
 9921 */
 9922void netdev_update_features(struct net_device *dev)
 9923{
 9924	if (__netdev_update_features(dev))
 9925		netdev_features_change(dev);
 9926}
 9927EXPORT_SYMBOL(netdev_update_features);
 9928
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* return value deliberately ignored: the notification is unconditional */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
 9945
 9946/**
 9947 *	netif_stacked_transfer_operstate -	transfer operstate
 9948 *	@rootdev: the root or lower level device to transfer state from
 9949 *	@dev: the device to transfer operstate to
 9950 *
 9951 *	Transfer operational state from root to device. This is normally
 9952 *	called when a stacking relationship exists between the root
 9953 *	device and the device(a leaf device).
 9954 */
 9955void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 9956					struct net_device *dev)
 9957{
 9958	if (rootdev->operstate == IF_OPER_DORMANT)
 9959		netif_dormant_on(dev);
 9960	else
 9961		netif_dormant_off(dev);
 9962
 9963	if (rootdev->operstate == IF_OPER_TESTING)
 9964		netif_testing_on(dev);
 9965	else
 9966		netif_testing_off(dev);
 9967
 9968	if (netif_carrier_ok(rootdev))
 9969		netif_carrier_on(dev);
 9970	else
 9971		netif_carrier_off(dev);
 9972}
 9973EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 9974
 
 9975static int netif_alloc_rx_queues(struct net_device *dev)
 9976{
 9977	unsigned int i, count = dev->num_rx_queues;
 9978	struct netdev_rx_queue *rx;
 9979	size_t sz = count * sizeof(*rx);
 9980	int err = 0;
 9981
 9982	BUG_ON(count < 1);
 9983
 9984	rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 9985	if (!rx)
 9986		return -ENOMEM;
 9987
 
 
 9988	dev->_rx = rx;
 9989
 9990	for (i = 0; i < count; i++) {
 9991		rx[i].dev = dev;
 9992
 9993		/* XDP RX-queue setup */
 9994		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
 9995		if (err < 0)
 9996			goto err_rxq_info;
 9997	}
 9998	return 0;
 9999
10000err_rxq_info:
10001	/* Rollback successful reg's and free other resources */
10002	while (i--)
10003		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10004	kvfree(dev->_rx);
10005	dev->_rx = NULL;
10006	return err;
10007}
10008
10009static void netif_free_rx_queues(struct net_device *dev)
10010{
10011	unsigned int i, count = dev->num_rx_queues;
10012
10013	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
10014	if (!dev->_rx)
10015		return;
10016
10017	for (i = 0; i < count; i++)
10018		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10019
10020	kvfree(dev->_rx);
10021}
 
10022
10023static void netdev_init_one_queue(struct net_device *dev,
10024				  struct netdev_queue *queue, void *_unused)
10025{
10026	/* Initialize queue lock */
10027	spin_lock_init(&queue->_xmit_lock);
10028	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10029	queue->xmit_lock_owner = -1;
10030	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10031	queue->dev = dev;
10032#ifdef CONFIG_BQL
10033	dql_init(&queue->dql, HZ);
10034#endif
10035}
10036
/* Free the TX queue array of @dev (dev->_tx, allocated with kvzalloc() in
 * netif_alloc_netdev_queues()).  The pointer itself is not reset.
 */
static void netif_free_tx_queues(struct net_device *dev)
{
	kvfree(dev->_tx);
}
10041
10042static int netif_alloc_netdev_queues(struct net_device *dev)
10043{
10044	unsigned int count = dev->num_tx_queues;
10045	struct netdev_queue *tx;
10046	size_t sz = count * sizeof(*tx);
10047
10048	if (count < 1 || count > 0xffff)
10049		return -EINVAL;
10050
10051	tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10052	if (!tx)
10053		return -ENOMEM;
10054
 
 
10055	dev->_tx = tx;
10056
10057	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10058	spin_lock_init(&dev->tx_global_lock);
10059
10060	return 0;
10061}
10062
10063void netif_tx_stop_all_queues(struct net_device *dev)
10064{
10065	unsigned int i;
10066
10067	for (i = 0; i < dev->num_tx_queues; i++) {
10068		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10069
10070		netif_tx_stop_queue(txq);
10071	}
10072}
10073EXPORT_SYMBOL(netif_tx_stop_all_queues);
10074
10075static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10076{
10077	void __percpu *v;
10078
10079	/* Drivers implementing ndo_get_peer_dev must support tstat
10080	 * accounting, so that skb_do_redirect() can bump the dev's
10081	 * RX stats upon network namespace switch.
10082	 */
10083	if (dev->netdev_ops->ndo_get_peer_dev &&
10084	    dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10085		return -EOPNOTSUPP;
10086
10087	switch (dev->pcpu_stat_type) {
10088	case NETDEV_PCPU_STAT_NONE:
10089		return 0;
10090	case NETDEV_PCPU_STAT_LSTATS:
10091		v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10092		break;
10093	case NETDEV_PCPU_STAT_TSTATS:
10094		v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10095		break;
10096	case NETDEV_PCPU_STAT_DSTATS:
10097		v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10098		break;
10099	default:
10100		return -EINVAL;
10101	}
10102
10103	return v ? 0 : -ENOMEM;
10104}
10105
10106static void netdev_do_free_pcpu_stats(struct net_device *dev)
10107{
10108	switch (dev->pcpu_stat_type) {
10109	case NETDEV_PCPU_STAT_NONE:
10110		return;
10111	case NETDEV_PCPU_STAT_LSTATS:
10112		free_percpu(dev->lstats);
10113		break;
10114	case NETDEV_PCPU_STAT_TSTATS:
10115		free_percpu(dev->tstats);
10116		break;
10117	case NETDEV_PCPU_STAT_DSTATS:
10118		free_percpu(dev->dstats);
10119		break;
10120	}
10121}
10122
/**
 * register_netdevice() - register a network device
 * @dev: device to register
 *
 * Take a prepared network device structure and make it externally accessible.
 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 * Callers must hold the rtnl lock - you may want register_netdev()
 * instead of this.
 *
 * Return: 0 on success or a negative errno.  A positive return value from
 * the driver's ndo_init() is reported as -EIO.
 */
int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
		     NETDEV_FEATURE_COUNT);
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	ret = ethtool_check_ops(dev->ethtool_ops);
	if (ret)
		return ret;

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	ret = -ENOMEM;
	dev->name_node = netdev_name_node_head_alloc(dev);
	if (!dev->name_node)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			if (ret > 0)
				ret = -EIO;
			goto err_free_name;
		}
	}

	/* advertising CTAG filtering without both VLAN vid callbacks is a driver bug */
	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	ret = netdev_do_alloc_pcpu_stats(dev);
	if (ret)
		goto err_uninit;

	/* dev->ifindex == 0 asks dev_index_reserve() to pick an index */
	ret = dev_index_reserve(net, dev->ifindex);
	if (ret < 0)
		goto err_free_pcpu;
	dev->ifindex = ret;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
	dev->features |= NETIF_F_SOFT_FEATURES;

	if (dev->udp_tunnel_nic_info) {
		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
	}

	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK))
		dev->hw_features |= NETIF_F_NOCACHE_COPY;

	/* If IPv4 TCP segmentation offload is supported we should also
	 * allow the device to enable segmenting the frame with the option
	 * of ignoring a static IP ID value.  This doesn't enable the
	 * feature itself but allows the user to enable it later.
	 */
	if (dev->hw_features & NETIF_F_TSO)
		dev->hw_features |= NETIF_F_TSO_MANGLEID;
	if (dev->vlan_features & NETIF_F_TSO)
		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
	if (dev->mpls_features & NETIF_F_TSO)
		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
	if (dev->hw_enc_features & NETIF_F_TSO)
		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_ifindex_release;

	/* reg_state is written under dev_base_lock whether or not the
	 * kobject registration succeeded
	 */
	ret = netdev_register_kobject(dev);
	write_lock(&dev_base_lock);
	dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
	write_unlock(&dev_base_lock);
	if (ret)
		goto err_uninit_notify;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);

	netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
	list_netdevice(dev);

	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* Expect explicit free_netdev() on failure */
		dev->needs_free_netdev = false;
		/* the device is already listed, so it is unregistered rather
		 * than unwound through the error labels below
		 */
		unregister_netdevice_queue(dev, NULL);
		goto out;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

out:
	return ret;

	/* Error unwinding: each label undoes one setup step and falls through
	 * to the labels for the steps that came before it.
	 */
err_uninit_notify:
	call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_ifindex_release:
	dev_index_release(net, dev->ifindex);
err_free_pcpu:
	netdev_do_free_pcpu_stats(dev);
err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	if (dev->priv_destructor)
		dev->priv_destructor(dev);
err_free_name:
	netdev_name_node_free(dev->name_node);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
10307
10308/**
10309 *	init_dummy_netdev	- init a dummy network device for NAPI
10310 *	@dev: device to init
10311 *
10312 *	This takes a network device structure and initialize the minimum
10313 *	amount of fields so it can be used to schedule NAPI polls without
10314 *	registering a full blown interface. This is to be used by drivers
10315 *	that need to tie several hardware interfaces to a single NAPI
10316 *	poll scheduler due to HW limitations.
10317 */
10318int init_dummy_netdev(struct net_device *dev)
10319{
10320	/* Clear everything. Note we don't initialize spinlocks
10321	 * are they aren't supposed to be taken by any of the
10322	 * NAPI code and this dummy netdev is supposed to be
10323	 * only ever used for NAPI polls
10324	 */
10325	memset(dev, 0, sizeof(struct net_device));
10326
10327	/* make sure we BUG if trying to hit standard
10328	 * register/unregister code path
10329	 */
10330	dev->reg_state = NETREG_DUMMY;
10331
10332	/* NAPI wants this */
10333	INIT_LIST_HEAD(&dev->napi_list);
10334
10335	/* a dummy interface is started by default */
10336	set_bit(__LINK_STATE_PRESENT, &dev->state);
10337	set_bit(__LINK_STATE_START, &dev->state);
10338
10339	/* napi_busy_loop stats accounting wants this */
10340	dev_net_set(dev, &init_net);
10341
10342	/* Note : We dont allocate pcpu_refcnt for dummy devices,
10343	 * because users of this 'device' dont need to change
10344	 * its refcount.
10345	 */
10346
10347	return 0;
10348}
10349EXPORT_SYMBOL_GPL(init_dummy_netdev);
10350
10351
10352/**
10353 *	register_netdev	- register a network device
10354 *	@dev: device to register
10355 *
10356 *	Take a completed network device structure and add it to the kernel
10357 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10358 *	chain. 0 is returned on success. A negative errno code is returned
10359 *	on a failure to set up the device, or if the name is a duplicate.
10360 *
10361 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10362 *	and expands the device name if you passed a format string to
10363 *	alloc_netdev.
10364 */
10365int register_netdev(struct net_device *dev)
10366{
10367	int err;
10368
10369	if (rtnl_lock_killable())
10370		return -EINTR;
10371	err = register_netdevice(dev);
10372	rtnl_unlock();
10373	return err;
10374}
10375EXPORT_SYMBOL(register_netdev);
10376
10377int netdev_refcnt_read(const struct net_device *dev)
10378{
10379#ifdef CONFIG_PCPU_DEV_REFCNT
10380	int i, refcnt = 0;
10381
10382	for_each_possible_cpu(i)
10383		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10384	return refcnt;
10385#else
10386	return refcount_read(&dev->dev_refcnt);
10387#endif
10388}
10389EXPORT_SYMBOL(netdev_refcnt_read);
10390
10391int netdev_unregister_timeout_secs __read_mostly = 10;
10392
10393#define WAIT_REFS_MIN_MSECS 1
10394#define WAIT_REFS_MAX_MSECS 250
10395/**
10396 * netdev_wait_allrefs_any - wait until all references are gone.
10397 * @list: list of net_devices to wait on
10398 *
10399 * This is called when unregistering network devices.
10400 *
10401 * Any protocol or device that holds a reference should register
10402 * for netdevice notification, and cleanup and put back the
10403 * reference if they receive an UNREGISTER event.
10404 * We can get stuck here if buggy protocols don't correctly
10405 * call dev_put.
10406 */
10407static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10408{
10409	unsigned long rebroadcast_time, warning_time;
10410	struct net_device *dev;
10411	int wait = 0;
10412
10413	rebroadcast_time = warning_time = jiffies;
10414
10415	list_for_each_entry(dev, list, todo_list)
10416		if (netdev_refcnt_read(dev) == 1)
10417			return dev;
10418
10419	while (true) {
10420		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10421			rtnl_lock();
10422
10423			/* Rebroadcast unregister notification */
10424			list_for_each_entry(dev, list, todo_list)
10425				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10426
10427			__rtnl_unlock();
10428			rcu_barrier();
10429			rtnl_lock();
10430
10431			list_for_each_entry(dev, list, todo_list)
10432				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10433					     &dev->state)) {
10434					/* We must not have linkwatch events
10435					 * pending on unregister. If this
10436					 * happens, we simply run the queue
10437					 * unscheduled, resulting in a noop
10438					 * for this device.
10439					 */
10440					linkwatch_run_queue();
10441					break;
10442				}
10443
10444			__rtnl_unlock();
10445
10446			rebroadcast_time = jiffies;
10447		}
10448
10449		if (!wait) {
10450			rcu_barrier();
10451			wait = WAIT_REFS_MIN_MSECS;
10452		} else {
10453			msleep(wait);
10454			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10455		}
10456
10457		list_for_each_entry(dev, list, todo_list)
10458			if (netdev_refcnt_read(dev) == 1)
10459				return dev;
10460
10461		if (time_after(jiffies, warning_time +
10462			       READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10463			list_for_each_entry(dev, list, todo_list) {
10464				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10465					 dev->name, netdev_refcnt_read(dev));
10466				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10467			}
10468
 
 
 
10469			warning_time = jiffies;
10470		}
10471	}
10472}
10473
10474/* The sequence is:
10475 *
10476 *	rtnl_lock();
10477 *	...
10478 *	register_netdevice(x1);
10479 *	register_netdevice(x2);
10480 *	...
10481 *	unregister_netdevice(y1);
10482 *	unregister_netdevice(y2);
10483 *      ...
10484 *	rtnl_unlock();
10485 *	free_netdev(y1);
10486 *	free_netdev(y2);
10487 *
10488 * We are invoked by rtnl_unlock().
10489 * This allows us to deal with problems:
10490 * 1) We can delete sysfs objects which invoke hotplug
10491 *    without deadlocking with linkwatch via keventd.
10492 * 2) Since we run with the RTNL semaphore not held, we can sleep
10493 *    safely in order to wait for the netdev refcnt to drop to zero.
10494 *
10495 * We must not return until all unregister events added during
10496 * the interval the lock was held have been completed.
10497 */
10498void netdev_run_todo(void)
10499{
10500	struct net_device *dev, *tmp;
10501	struct list_head list;
10502#ifdef CONFIG_LOCKDEP
10503	struct list_head unlink_list;
10504
10505	list_replace_init(&net_unlink_list, &unlink_list);
10506
10507	while (!list_empty(&unlink_list)) {
10508		struct net_device *dev = list_first_entry(&unlink_list,
10509							  struct net_device,
10510							  unlink_list);
10511		list_del_init(&dev->unlink_list);
10512		dev->nested_level = dev->lower_level - 1;
10513	}
10514#endif
10515
10516	/* Snapshot list, allow later requests */
10517	list_replace_init(&net_todo_list, &list);
10518
10519	__rtnl_unlock();
10520
 
10521	/* Wait for rcu callbacks to finish before next phase */
10522	if (!list_empty(&list))
10523		rcu_barrier();
10524
10525	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
 
 
 
 
 
 
 
 
10526		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10527			netdev_WARN(dev, "run_todo but not unregistering\n");
10528			list_del(&dev->todo_list);
 
10529			continue;
10530		}
10531
10532		write_lock(&dev_base_lock);
10533		dev->reg_state = NETREG_UNREGISTERED;
10534		write_unlock(&dev_base_lock);
10535		linkwatch_sync_dev(dev);
10536	}
10537
10538	while (!list_empty(&list)) {
10539		dev = netdev_wait_allrefs_any(&list);
10540		list_del(&dev->todo_list);
10541
10542		/* paranoia */
10543		BUG_ON(netdev_refcnt_read(dev) != 1);
10544		BUG_ON(!list_empty(&dev->ptype_all));
10545		BUG_ON(!list_empty(&dev->ptype_specific));
10546		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10547		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 
10548
10549		netdev_do_free_pcpu_stats(dev);
10550		if (dev->priv_destructor)
10551			dev->priv_destructor(dev);
10552		if (dev->needs_free_netdev)
10553			free_netdev(dev);
10554
10555		if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
10556			wake_up(&netdev_unregistering_wq);
 
 
 
10557
10558		/* Free network device */
10559		kobject_put(&dev->dev.kobj);
10560	}
10561}
10562
10563/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10564 * all the same fields in the same order as net_device_stats, with only
10565 * the type differing, but rtnl_link_stats64 may have additional fields
10566 * at the end for newer counters.
10567 */
10568void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10569			     const struct net_device_stats *netdev_stats)
10570{
10571	size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10572	const atomic_long_t *src = (atomic_long_t *)netdev_stats;
 
 
 
 
 
 
 
10573	u64 *dst = (u64 *)stats64;
10574
10575	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10576	for (i = 0; i < n; i++)
10577		dst[i] = (unsigned long)atomic_long_read(&src[i]);
10578	/* zero out counters that only exist in rtnl_link_stats64 */
10579	memset((char *)stats64 + n * sizeof(u64), 0,
10580	       sizeof(*stats64) - n * sizeof(u64));
 
10581}
10582EXPORT_SYMBOL(netdev_stats_to_stats64);
10583
10584static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
10585		struct net_device *dev)
10586{
10587	struct net_device_core_stats __percpu *p;
10588
10589	p = alloc_percpu_gfp(struct net_device_core_stats,
10590			     GFP_ATOMIC | __GFP_NOWARN);
10591
10592	if (p && cmpxchg(&dev->core_stats, NULL, p))
10593		free_percpu(p);
10594
10595	/* This READ_ONCE() pairs with the cmpxchg() above */
10596	return READ_ONCE(dev->core_stats);
10597}
10598
10599noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
10600{
10601	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10602	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
10603	unsigned long __percpu *field;
10604
10605	if (unlikely(!p)) {
10606		p = netdev_core_stats_alloc(dev);
10607		if (!p)
10608			return;
10609	}
10610
10611	field = (__force unsigned long __percpu *)((__force void *)p + offset);
10612	this_cpu_inc(*field);
10613}
10614EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
10615
10616/**
10617 *	dev_get_stats	- get network device statistics
10618 *	@dev: device to get statistics from
10619 *	@storage: place to store stats
10620 *
10621 *	Get network statistics from device. Return @storage.
10622 *	The device driver may provide its own method by setting
10623 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10624 *	otherwise the internal statistics structure is used.
10625 */
10626struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10627					struct rtnl_link_stats64 *storage)
10628{
10629	const struct net_device_ops *ops = dev->netdev_ops;
10630	const struct net_device_core_stats __percpu *p;
10631
10632	if (ops->ndo_get_stats64) {
10633		memset(storage, 0, sizeof(*storage));
10634		ops->ndo_get_stats64(dev, storage);
10635	} else if (ops->ndo_get_stats) {
10636		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10637	} else {
10638		netdev_stats_to_stats64(storage, &dev->stats);
10639	}
10640
10641	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10642	p = READ_ONCE(dev->core_stats);
10643	if (p) {
10644		const struct net_device_core_stats *core_stats;
10645		int i;
10646
10647		for_each_possible_cpu(i) {
10648			core_stats = per_cpu_ptr(p, i);
10649			storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10650			storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10651			storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10652			storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10653		}
10654	}
10655	return storage;
10656}
10657EXPORT_SYMBOL(dev_get_stats);
10658
10659/**
10660 *	dev_fetch_sw_netstats - get per-cpu network device statistics
10661 *	@s: place to store stats
10662 *	@netstats: per-cpu network stats to read from
10663 *
10664 *	Read per-cpu network statistics and populate the related fields in @s.
10665 */
10666void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10667			   const struct pcpu_sw_netstats __percpu *netstats)
10668{
10669	int cpu;
10670
10671	for_each_possible_cpu(cpu) {
10672		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10673		const struct pcpu_sw_netstats *stats;
10674		unsigned int start;
10675
10676		stats = per_cpu_ptr(netstats, cpu);
10677		do {
10678			start = u64_stats_fetch_begin(&stats->syncp);
10679			rx_packets = u64_stats_read(&stats->rx_packets);
10680			rx_bytes   = u64_stats_read(&stats->rx_bytes);
10681			tx_packets = u64_stats_read(&stats->tx_packets);
10682			tx_bytes   = u64_stats_read(&stats->tx_bytes);
10683		} while (u64_stats_fetch_retry(&stats->syncp, start));
10684
10685		s->rx_packets += rx_packets;
10686		s->rx_bytes   += rx_bytes;
10687		s->tx_packets += tx_packets;
10688		s->tx_bytes   += tx_bytes;
10689	}
10690}
10691EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10692
10693/**
10694 *	dev_get_tstats64 - ndo_get_stats64 implementation
10695 *	@dev: device to get statistics from
10696 *	@s: place to store stats
10697 *
10698 *	Populate @s from dev->stats and dev->tstats. Can be used as
10699 *	ndo_get_stats64() callback.
10700 */
10701void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10702{
10703	netdev_stats_to_stats64(s, &dev->stats);
10704	dev_fetch_sw_netstats(s, dev->tstats);
10705}
10706EXPORT_SYMBOL_GPL(dev_get_tstats64);
10707
/* Return the ingress queue of @dev, creating it first if needed.
 *
 * Creation only happens with CONFIG_NET_CLS_ACT: a new queue is
 * allocated, attached to noop_qdisc and published in dev->ingress_queue.
 * Returns NULL if there is no queue and none could be created.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
10725
10726static const struct ethtool_ops default_ethtool_ops;
10727
10728void netdev_set_default_ethtool_ops(struct net_device *dev,
10729				    const struct ethtool_ops *ops)
10730{
10731	if (dev->ethtool_ops == &default_ethtool_ops)
10732		dev->ethtool_ops = ops;
10733}
10734EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10735
10736/**
10737 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10738 * @dev: netdev to enable the IRQ coalescing on
10739 *
10740 * Sets a conservative default for SW IRQ coalescing. Users can use
10741 * sysfs attributes to override the default values.
10742 */
10743void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10744{
10745	WARN_ON(dev->reg_state == NETREG_REGISTERED);
10746
10747	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
10748		dev->gro_flush_timeout = 20000;
10749		dev->napi_defer_hard_irqs = 1;
10750	}
10751}
10752EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
10753
10754void netdev_freemem(struct net_device *dev)
10755{
10756	char *addr = (char *)dev - dev->padded;
10757
10758	kvfree(addr);
10759}
10760
10761/**
10762 * alloc_netdev_mqs - allocate network device
10763 * @sizeof_priv: size of private data to allocate space for
10764 * @name: device name format string
10765 * @name_assign_type: origin of device name
10766 * @setup: callback to initialize device
10767 * @txqs: the number of TX subqueues to allocate
10768 * @rxqs: the number of RX subqueues to allocate
10769 *
10770 * Allocates a struct net_device with private data area for driver use
10771 * and performs basic initialization.  Also allocates subqueue structs
10772 * for each queue on the device.
10773 */
10774struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10775		unsigned char name_assign_type,
10776		void (*setup)(struct net_device *),
10777		unsigned int txqs, unsigned int rxqs)
10778{
10779	struct net_device *dev;
10780	unsigned int alloc_size;
10781	struct net_device *p;
10782
10783	BUG_ON(strlen(name) >= sizeof(dev->name));
10784
10785	if (txqs < 1) {
10786		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10787		return NULL;
10788	}
10789
 
10790	if (rxqs < 1) {
10791		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10792		return NULL;
10793	}
 
10794
10795	alloc_size = sizeof(struct net_device);
10796	if (sizeof_priv) {
10797		/* ensure 32-byte alignment of private area */
10798		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10799		alloc_size += sizeof_priv;
10800	}
10801	/* ensure 32-byte alignment of whole construct */
10802	alloc_size += NETDEV_ALIGN - 1;
10803
10804	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 
 
10805	if (!p)
10806		return NULL;
10807
10808	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10809	dev->padded = (char *)dev - (char *)p;
10810
10811	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
10812#ifdef CONFIG_PCPU_DEV_REFCNT
10813	dev->pcpu_refcnt = alloc_percpu(int);
10814	if (!dev->pcpu_refcnt)
10815		goto free_dev;
10816	__dev_hold(dev);
10817#else
10818	refcount_set(&dev->dev_refcnt, 1);
10819#endif
10820
10821	if (dev_addr_init(dev))
10822		goto free_pcpu;
10823
10824	dev_mc_init(dev);
10825	dev_uc_init(dev);
10826
10827	dev_net_set(dev, &init_net);
10828
10829	dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
10830	dev->xdp_zc_max_segs = 1;
10831	dev->gso_max_segs = GSO_MAX_SEGS;
10832	dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
10833	dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
10834	dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
10835	dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
10836	dev->tso_max_segs = TSO_MAX_SEGS;
10837	dev->upper_level = 1;
10838	dev->lower_level = 1;
10839#ifdef CONFIG_LOCKDEP
10840	dev->nested_level = 0;
10841	INIT_LIST_HEAD(&dev->unlink_list);
10842#endif
10843
10844	INIT_LIST_HEAD(&dev->napi_list);
10845	INIT_LIST_HEAD(&dev->unreg_list);
10846	INIT_LIST_HEAD(&dev->close_list);
10847	INIT_LIST_HEAD(&dev->link_watch_list);
10848	INIT_LIST_HEAD(&dev->adj_list.upper);
10849	INIT_LIST_HEAD(&dev->adj_list.lower);
10850	INIT_LIST_HEAD(&dev->ptype_all);
10851	INIT_LIST_HEAD(&dev->ptype_specific);
10852	INIT_LIST_HEAD(&dev->net_notifier_list);
10853#ifdef CONFIG_NET_SCHED
10854	hash_init(dev->qdisc_hash);
10855#endif
10856	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10857	setup(dev);
10858
10859	if (!dev->tx_queue_len) {
10860		dev->priv_flags |= IFF_NO_QUEUE;
10861		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10862	}
10863
10864	dev->num_tx_queues = txqs;
10865	dev->real_num_tx_queues = txqs;
10866	if (netif_alloc_netdev_queues(dev))
10867		goto free_all;
10868
 
10869	dev->num_rx_queues = rxqs;
10870	dev->real_num_rx_queues = rxqs;
10871	if (netif_alloc_rx_queues(dev))
10872		goto free_all;
 
10873
10874	strcpy(dev->name, name);
10875	dev->name_assign_type = name_assign_type;
10876	dev->group = INIT_NETDEV_GROUP;
10877	if (!dev->ethtool_ops)
10878		dev->ethtool_ops = &default_ethtool_ops;
10879
10880	nf_hook_netdev_init(dev);
10881
10882	return dev;
10883
10884free_all:
10885	free_netdev(dev);
10886	return NULL;
10887
10888free_pcpu:
10889#ifdef CONFIG_PCPU_DEV_REFCNT
10890	free_percpu(dev->pcpu_refcnt);
10891free_dev:
10892#endif
10893	netdev_freemem(dev);
10894	return NULL;
10895}
10896EXPORT_SYMBOL(alloc_netdev_mqs);
10897
10898/**
10899 * free_netdev - free network device
10900 * @dev: device
10901 *
10902 * This function does the last stage of destroying an allocated device
10903 * interface. The reference to the device object is released. If this
10904 * is the last reference then it will be freed.Must be called in process
10905 * context.
10906 */
10907void free_netdev(struct net_device *dev)
10908{
10909	struct napi_struct *p, *n;
10910
10911	might_sleep();
10912
10913	/* When called immediately after register_netdevice() failed the unwind
10914	 * handling may still be dismantling the device. Handle that case by
10915	 * deferring the free.
10916	 */
10917	if (dev->reg_state == NETREG_UNREGISTERING) {
10918		ASSERT_RTNL();
10919		dev->needs_free_netdev = true;
10920		return;
10921	}
10922
10923	netif_free_tx_queues(dev);
10924	netif_free_rx_queues(dev);
 
 
10925
10926	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10927
10928	/* Flush device addresses */
10929	dev_addr_flush(dev);
10930
10931	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10932		netif_napi_del(p);
10933
10934	ref_tracker_dir_exit(&dev->refcnt_tracker);
10935#ifdef CONFIG_PCPU_DEV_REFCNT
10936	free_percpu(dev->pcpu_refcnt);
10937	dev->pcpu_refcnt = NULL;
10938#endif
10939	free_percpu(dev->core_stats);
10940	dev->core_stats = NULL;
10941	free_percpu(dev->xdp_bulkq);
10942	dev->xdp_bulkq = NULL;
10943
10944	/*  Compatibility with error handling in drivers */
10945	if (dev->reg_state == NETREG_UNINITIALIZED) {
10946		netdev_freemem(dev);
10947		return;
10948	}
10949
10950	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10951	dev->reg_state = NETREG_RELEASED;
10952
10953	/* will free via device release */
10954	put_device(&dev->dev);
10955}
10956EXPORT_SYMBOL(free_netdev);
10957
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used while the rtnl semaphore
 *	is locked, the regular one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10973
10974/**
10975 *	unregister_netdevice_queue - remove device from the kernel
10976 *	@dev: device
10977 *	@head: list
10978 *
10979 *	This function shuts down a device interface and removes it
10980 *	from the kernel tables.
10981 *	If head not NULL, device is queued to be unregistered later.
10982 *
10983 *	Callers must hold the rtnl semaphore.  You may want
10984 *	unregister_netdev() instead of this.
10985 */
10986
10987void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10988{
10989	ASSERT_RTNL();
10990
10991	if (head) {
10992		list_move_tail(&dev->unreg_list, head);
10993	} else {
10994		LIST_HEAD(single);
10995
10996		list_add(&dev->unreg_list, &single);
10997		unregister_netdevice_many(&single);
10998	}
10999}
11000EXPORT_SYMBOL(unregister_netdevice_queue);
11001
11002void unregister_netdevice_many_notify(struct list_head *head,
11003				      u32 portid, const struct nlmsghdr *nlh)
11004{
11005	struct net_device *dev, *tmp;
11006	LIST_HEAD(close_head);
11007
11008	BUG_ON(dev_boot_phase);
11009	ASSERT_RTNL();
11010
11011	if (list_empty(head))
11012		return;
11013
11014	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11015		/* Some devices call without registering
11016		 * for initialization unwind. Remove those
11017		 * devices and proceed with the remaining.
11018		 */
11019		if (dev->reg_state == NETREG_UNINITIALIZED) {
11020			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11021				 dev->name, dev);
11022
11023			WARN_ON(1);
11024			list_del(&dev->unreg_list);
11025			continue;
11026		}
11027		dev->dismantle = true;
11028		BUG_ON(dev->reg_state != NETREG_REGISTERED);
11029	}
11030
11031	/* If device is running, close it first. */
11032	list_for_each_entry(dev, head, unreg_list)
11033		list_add_tail(&dev->close_list, &close_head);
11034	dev_close_many(&close_head, true);
11035
11036	list_for_each_entry(dev, head, unreg_list) {
11037		/* And unlink it from device chain. */
11038		write_lock(&dev_base_lock);
11039		unlist_netdevice(dev, false);
11040		dev->reg_state = NETREG_UNREGISTERING;
11041		write_unlock(&dev_base_lock);
11042	}
11043	flush_all_backlogs();
11044
11045	synchronize_net();
11046
11047	list_for_each_entry(dev, head, unreg_list) {
11048		struct sk_buff *skb = NULL;
11049
11050		/* Shutdown queueing discipline. */
11051		dev_shutdown(dev);
11052		dev_tcx_uninstall(dev);
11053		dev_xdp_uninstall(dev);
11054		bpf_dev_bound_netdev_unregister(dev);
11055
11056		netdev_offload_xstats_disable_all(dev);
11057
11058		/* Notify protocols, that we are about to destroy
11059		 * this device. They should clean all the things.
11060		 */
11061		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11062
11063		if (!dev->rtnl_link_ops ||
11064		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11065			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11066						     GFP_KERNEL, NULL, 0,
11067						     portid, nlh);
11068
11069		/*
11070		 *	Flush the unicast and multicast chains
11071		 */
11072		dev_uc_flush(dev);
11073		dev_mc_flush(dev);
11074
11075		netdev_name_node_alt_flush(dev);
11076		netdev_name_node_free(dev->name_node);
11077
11078		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11079
11080		if (dev->netdev_ops->ndo_uninit)
11081			dev->netdev_ops->ndo_uninit(dev);
11082
11083		if (skb)
11084			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11085
11086		/* Notifier chain MUST detach us all upper devices. */
11087		WARN_ON(netdev_has_any_upper_dev(dev));
11088		WARN_ON(netdev_has_any_lower_dev(dev));
11089
11090		/* Remove entries from kobject tree */
11091		netdev_unregister_kobject(dev);
11092#ifdef CONFIG_XPS
11093		/* Remove XPS queueing entries */
11094		netif_reset_xps_queues_gt(dev, 0);
11095#endif
11096	}
11097
11098	synchronize_net();
11099
11100	list_for_each_entry(dev, head, unreg_list) {
11101		netdev_put(dev, &dev->dev_registered_tracker);
11102		net_set_todo(dev);
11103	}
11104
11105	list_del(head);
11106}
 
11107
11108/**
11109 *	unregister_netdevice_many - unregister many devices
11110 *	@head: list of devices
11111 *
11112 *  Note: As most callers use a stack allocated list_head,
11113 *  we force a list_del() to make sure stack wont be corrupted later.
11114 */
11115void unregister_netdevice_many(struct list_head *head)
11116{
11117	unregister_netdevice_many_notify(head, 0, NULL);
 
 
 
 
 
 
 
11118}
11119EXPORT_SYMBOL(unregister_netdevice_many);
11120
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore around the call.  In general you want to use
 *	this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
11139
11140/**
11141 *	__dev_change_net_namespace - move device to different nethost namespace
11142 *	@dev: device
11143 *	@net: network namespace
11144 *	@pat: If not NULL name pattern to try if the current device name
11145 *	      is already taken in the destination network namespace.
11146 *	@new_ifindex: If not zero, specifies device index in the target
11147 *	              namespace.
11148 *
11149 *	This function shuts down a device interface and moves it
11150 *	to a new network namespace. On success 0 is returned, on
11151 *	a failure a netagive errno code is returned.
11152 *
11153 *	Callers must hold the rtnl semaphore.
11154 */
11155
11156int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11157			       const char *pat, int new_ifindex)
11158{
11159	struct netdev_name_node *name_node;
11160	struct net *net_old = dev_net(dev);
11161	char new_name[IFNAMSIZ] = {};
11162	int err, new_nsid;
11163
11164	ASSERT_RTNL();
11165
11166	/* Don't allow namespace local devices to be moved. */
11167	err = -EINVAL;
11168	if (dev->features & NETIF_F_NETNS_LOCAL)
11169		goto out;
11170
11171	/* Ensure the device has been registrered */
11172	if (dev->reg_state != NETREG_REGISTERED)
11173		goto out;
11174
11175	/* Get out if there is nothing todo */
11176	err = 0;
11177	if (net_eq(net_old, net))
11178		goto out;
11179
11180	/* Pick the destination device name, and ensure
11181	 * we can use it in the destination network namespace.
11182	 */
11183	err = -EEXIST;
11184	if (netdev_name_in_use(net, dev->name)) {
11185		/* We get here if we can't use the current device name */
11186		if (!pat)
11187			goto out;
11188		err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
11189		if (err < 0)
11190			goto out;
11191	}
11192	/* Check that none of the altnames conflicts. */
11193	err = -EEXIST;
11194	netdev_for_each_altname(dev, name_node)
11195		if (netdev_name_in_use(net, name_node->name))
11196			goto out;
11197
11198	/* Check that new_ifindex isn't used yet. */
11199	if (new_ifindex) {
11200		err = dev_index_reserve(net, new_ifindex);
11201		if (err < 0)
11202			goto out;
11203	} else {
11204		/* If there is an ifindex conflict assign a new one */
11205		err = dev_index_reserve(net, dev->ifindex);
11206		if (err == -EBUSY)
11207			err = dev_index_reserve(net, 0);
11208		if (err < 0)
11209			goto out;
11210		new_ifindex = err;
11211	}
11212
11213	/*
11214	 * And now a mini version of register_netdevice unregister_netdevice.
11215	 */
11216
11217	/* If device is running close it first. */
11218	dev_close(dev);
11219
11220	/* And unlink it from device chain */
11221	unlist_netdevice(dev, true);
 
11222
11223	synchronize_net();
11224
11225	/* Shutdown queueing discipline. */
11226	dev_shutdown(dev);
11227
11228	/* Notify protocols, that we are about to destroy
11229	 * this device. They should clean all the things.
11230	 *
11231	 * Note that dev->reg_state stays at NETREG_REGISTERED.
11232	 * This is wanted because this way 8021q and macvlan know
11233	 * the device is just moving and can keep their slaves up.
11234	 */
11235	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11236	rcu_barrier();
11237
11238	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11239
11240	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11241			    new_ifindex);
11242
11243	/*
11244	 *	Flush the unicast and multicast chains
11245	 */
11246	dev_uc_flush(dev);
11247	dev_mc_flush(dev);
11248
11249	/* Send a netdev-removed uevent to the old namespace */
11250	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11251	netdev_adjacent_del_links(dev);
11252
11253	/* Move per-net netdevice notifiers that are following the netdevice */
11254	move_netdevice_notifiers_dev_net(dev, net);
11255
11256	/* Actually switch the network namespace */
11257	dev_net_set(dev, net);
11258	dev->ifindex = new_ifindex;
11259
11260	if (new_name[0]) /* Rename the netdev to prepared name */
11261		strscpy(dev->name, new_name, IFNAMSIZ);
11262
11263	/* Fixup kobjects */
11264	dev_set_uevent_suppress(&dev->dev, 1);
11265	err = device_rename(&dev->dev, dev->name);
11266	dev_set_uevent_suppress(&dev->dev, 0);
11267	WARN_ON(err);
11268
11269	/* Send a netdev-add uevent to the new namespace */
11270	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11271	netdev_adjacent_add_links(dev);
11272
11273	/* Adapt owner in case owning user namespace of target network
11274	 * namespace is different from the original one.
11275	 */
11276	err = netdev_change_owner(dev, net_old, net);
11277	WARN_ON(err);
11278
11279	/* Add the device back in the hashes */
11280	list_netdevice(dev);
11281
11282	/* Notify protocols, that a new device appeared. */
11283	call_netdevice_notifiers(NETDEV_REGISTER, dev);
11284
11285	/*
11286	 *	Prevent userspace races by waiting until the network
11287	 *	device is fully setup before sending notifications.
11288	 */
11289	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11290
11291	synchronize_net();
11292	err = 0;
11293out:
11294	return err;
11295}
11296EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11297
11298static int dev_cpu_dead(unsigned int oldcpu)
11299{
11300	struct sk_buff **list_skb;
11301	struct sk_buff *skb;
11302	unsigned int cpu;
11303	struct softnet_data *sd, *oldsd, *remsd = NULL;
11304
11305	local_irq_disable();
11306	cpu = smp_processor_id();
11307	sd = &per_cpu(softnet_data, cpu);
11308	oldsd = &per_cpu(softnet_data, oldcpu);
11309
11310	/* Find end of our completion_queue. */
11311	list_skb = &sd->completion_queue;
11312	while (*list_skb)
11313		list_skb = &(*list_skb)->next;
11314	/* Append completion queue from offline CPU. */
11315	*list_skb = oldsd->completion_queue;
11316	oldsd->completion_queue = NULL;
11317
11318	/* Append output queue from offline CPU. */
11319	if (oldsd->output_queue) {
11320		*sd->output_queue_tailp = oldsd->output_queue;
11321		sd->output_queue_tailp = oldsd->output_queue_tailp;
11322		oldsd->output_queue = NULL;
11323		oldsd->output_queue_tailp = &oldsd->output_queue;
11324	}
11325	/* Append NAPI poll list from offline CPU, with one exception :
11326	 * process_backlog() must be called by cpu owning percpu backlog.
11327	 * We properly handle process_queue & input_pkt_queue later.
11328	 */
11329	while (!list_empty(&oldsd->poll_list)) {
11330		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11331							    struct napi_struct,
11332							    poll_list);
11333
11334		list_del_init(&napi->poll_list);
11335		if (napi->poll == process_backlog)
11336			napi->state = 0;
11337		else
11338			____napi_schedule(sd, napi);
11339	}
11340
11341	raise_softirq_irqoff(NET_TX_SOFTIRQ);
11342	local_irq_enable();
11343
11344#ifdef CONFIG_RPS
11345	remsd = oldsd->rps_ipi_list;
11346	oldsd->rps_ipi_list = NULL;
11347#endif
11348	/* send out pending IPI's on offline CPU */
11349	net_rps_send_ipi(remsd);
11350
11351	/* Process offline CPU's input_pkt_queue */
11352	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11353		netif_rx(skb);
11354		input_queue_head_incr(oldsd);
11355	}
11356	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11357		netif_rx(skb);
11358		input_queue_head_incr(oldsd);
11359	}
11360
11361	return 0;
11362}
11363
11364/**
11365 *	netdev_increment_features - increment feature set by one
11366 *	@all: current feature set
11367 *	@one: new feature set
11368 *	@mask: mask feature set
11369 *
11370 *	Computes a new feature set after adding a device with feature set
11371 *	@one to the master device with current feature set @all.  Will not
11372 *	enable anything that is off in @mask. Returns the new feature set.
11373 */
11374netdev_features_t netdev_increment_features(netdev_features_t all,
11375	netdev_features_t one, netdev_features_t mask)
11376{
11377	if (mask & NETIF_F_HW_CSUM)
11378		mask |= NETIF_F_CSUM_MASK;
11379	mask |= NETIF_F_VLAN_CHALLENGED;
11380
11381	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11382	all &= one | ~NETIF_F_ALL_FOR_ALL;
11383
11384	/* If one device supports hw checksumming, set for all. */
11385	if (all & NETIF_F_HW_CSUM)
11386		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11387
11388	return all;
11389}
11390EXPORT_SYMBOL(netdev_increment_features);
11391
11392static struct hlist_head * __net_init netdev_create_hash(void)
11393{
11394	int i;
11395	struct hlist_head *hash;
11396
11397	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11398	if (hash != NULL)
11399		for (i = 0; i < NETDEV_HASHENTRIES; i++)
11400			INIT_HLIST_HEAD(&hash[i]);
11401
11402	return hash;
11403}
11404
11405/* Initialize per network namespace state */
11406static int __net_init netdev_init(struct net *net)
11407{
11408	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11409		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11410
11411	INIT_LIST_HEAD(&net->dev_base_head);
11412
11413	net->dev_name_head = netdev_create_hash();
11414	if (net->dev_name_head == NULL)
11415		goto err_name;
11416
11417	net->dev_index_head = netdev_create_hash();
11418	if (net->dev_index_head == NULL)
11419		goto err_idx;
11420
11421	xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11422
11423	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11424
11425	return 0;
11426
11427err_idx:
11428	kfree(net->dev_name_head);
11429err_name:
11430	return -ENOMEM;
11431}
11432
11433/**
11434 *	netdev_drivername - network driver for the device
11435 *	@dev: network device
11436 *
11437 *	Determine network driver for device.
11438 */
11439const char *netdev_drivername(const struct net_device *dev)
11440{
11441	const struct device_driver *driver;
11442	const struct device *parent;
11443	const char *empty = "";
11444
11445	parent = dev->dev.parent;
11446	if (!parent)
11447		return empty;
11448
11449	driver = parent->driver;
11450	if (driver && driver->name)
11451		return driver->name;
11452	return empty;
11453}
11454
11455static void __netdev_printk(const char *level, const struct net_device *dev,
11456			    struct va_format *vaf)
11457{
11458	if (dev && dev->dev.parent) {
11459		dev_printk_emit(level[1] - '0',
11460				dev->dev.parent,
11461				"%s %s %s%s: %pV",
11462				dev_driver_string(dev->dev.parent),
11463				dev_name(dev->dev.parent),
11464				netdev_name(dev), netdev_reg_state(dev),
11465				vaf);
11466	} else if (dev) {
11467		printk("%s%s%s: %pV",
11468		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
11469	} else {
11470		printk("%s(NULL net_device): %pV", level, vaf);
11471	}
11472}
11473
11474void netdev_printk(const char *level, const struct net_device *dev,
11475		   const char *format, ...)
11476{
11477	struct va_format vaf;
11478	va_list args;
11479
11480	va_start(args, format);
11481
11482	vaf.fmt = format;
11483	vaf.va = &args;
11484
11485	__netdev_printk(level, dev, &vaf);
11486
11487	va_end(args);
11488}
11489EXPORT_SYMBOL(netdev_printk);
11490
11491#define define_netdev_printk_level(func, level)			\
11492void func(const struct net_device *dev, const char *fmt, ...)	\
11493{								\
11494	struct va_format vaf;					\
11495	va_list args;						\
11496								\
11497	va_start(args, fmt);					\
11498								\
11499	vaf.fmt = fmt;						\
11500	vaf.va = &args;						\
11501								\
11502	__netdev_printk(level, dev, &vaf);			\
11503								\
11504	va_end(args);						\
11505}								\
11506EXPORT_SYMBOL(func);
11507
11508define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11509define_netdev_printk_level(netdev_alert, KERN_ALERT);
11510define_netdev_printk_level(netdev_crit, KERN_CRIT);
11511define_netdev_printk_level(netdev_err, KERN_ERR);
11512define_netdev_printk_level(netdev_warn, KERN_WARNING);
11513define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11514define_netdev_printk_level(netdev_info, KERN_INFO);
11515
11516static void __net_exit netdev_exit(struct net *net)
11517{
11518	kfree(net->dev_name_head);
11519	kfree(net->dev_index_head);
11520	xa_destroy(&net->dev_by_index);
11521	if (net != &init_net)
11522		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11523}
11524
/*
 * Per-network-namespace operations for the core device state:
 * netdev_init() is the .init hook and netdev_exit() the .exit hook.
 * Registered from net_dev_init() with register_pernet_subsys().
 */
11525static struct pernet_operations __net_initdata netdev_net_ops = {
11526	.init = netdev_init,
11527	.exit = netdev_exit,
11528};
11529
11530static void __net_exit default_device_exit_net(struct net *net)
11531{
11532	struct netdev_name_node *name_node, *tmp;
11533	struct net_device *dev, *aux;
11534	/*
11535	 * Push all migratable network devices back to the
11536	 * initial network namespace
11537	 */
	/* The caller must hold RTNL; a device that cannot be moved is fatal. */
11538	ASSERT_RTNL();
11539	for_each_netdev_safe(net, dev, aux) {
11540		int err;
11541		char fb_name[IFNAMSIZ];
11542
11543		/* Ignore unmoveable devices (i.e. loopback) */
11544		if (dev->features & NETIF_F_NETNS_LOCAL)
11545			continue;
11546
11547		/* Leave virtual devices for the generic cleanup */
11548		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11549			continue;
11550
11551		/* Push remaining network devices to init_net */
		/*
		 * Fallback name is "dev<ifindex>".  If that name is already
		 * taken in init_net, pass the literal pattern "dev%d" instead
		 * ("%%" prints a single '%'); presumably
		 * dev_change_net_namespace() then picks a free number for it
		 * -- confirm against that function.
		 */
11552		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11553		if (netdev_name_in_use(&init_net, fb_name))
11554			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11555
		/*
		 * Drop every alternative name of this device that is already
		 * in use in init_net.  The node is unlinked first, then
		 * synchronize_rcu() runs before the node is destroyed.
		 */
11556		netdev_for_each_altname_safe(dev, name_node, tmp)
11557			if (netdev_name_in_use(&init_net, name_node->name)) {
11558				netdev_name_node_del(name_node);
11559				synchronize_rcu();
11560				__netdev_name_node_alt_destroy(name_node);
11561			}
11562
11563		err = dev_change_net_namespace(dev, &init_net, fb_name);
11564		if (err) {
			/* No recovery path: log at emergency level and BUG() */
11565			pr_emerg("%s: failed to move %s to init_net: %d\n",
11566				 __func__, dev->name, err);
11567			BUG();
11568		}
11569	}
11570}
11571
11572static void __net_exit default_device_exit_batch(struct list_head *net_list)
11573{
11574	/* At exit all network devices must be removed from a network
11575	 * namespace.  Do this in the reverse order of registration.
11576	 * Do this across as many network namespaces as possible to
11577	 * improve batching efficiency.
11578	 */
11579	struct net_device *dev;
11580	struct net *net;
11581	LIST_HEAD(dev_kill_list);
11582
	/* RTNL is held across both passes and the final batch unregister */
11583	rtnl_lock();
	/* Pass 1: hand migratable devices back to init_net, one namespace at a time */
11584	list_for_each_entry(net, net_list, exit_list) {
11585		default_device_exit_net(net);
11586		cond_resched();
11587	}
11588
	/*
	 * Pass 2: queue every remaining device on dev_kill_list, walking
	 * each namespace's device list in reverse.  Devices whose
	 * rtnl_link_ops provide ->dellink are queued through it; all others
	 * go through unregister_netdevice_queue().
	 */
11589	list_for_each_entry(net, net_list, exit_list) {
11590		for_each_netdev_reverse(net, dev) {
11591			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11592				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11593			else
11594				unregister_netdevice_queue(dev, &dev_kill_list);
11595		}
11596	}
	/* Unregister everything that was queued in one call */
11597	unregister_netdevice_many(&dev_kill_list);
11598	rtnl_unlock();
11599}
11600
/*
 * Per-namespace operations with only a batched exit hook,
 * default_device_exit_batch().  Registered from net_dev_init() with
 * register_pernet_device().
 */
11601static struct pernet_operations __net_initdata default_device_ops = {
11602	.exit_batch = default_device_exit_batch,
11603};
11604
11605static void __init net_dev_struct_check(void)
11606{
	/*
	 * Layout checks for struct net_device, called first thing from
	 * net_dev_init().  Each CACHELINE_ASSERT_GROUP_MEMBER() names a
	 * member expected in the given group (net_device_read_tx,
	 * net_device_read_txrx, net_device_read_rx); each
	 * CACHELINE_ASSERT_GROUP_SIZE() gives a number for the whole group
	 * (presumably a size in bytes checked at build time -- confirm
	 * against the macro definitions).  Members that exist only under a
	 * config option are checked under the same #ifdef.
	 */
11607	/* TX read-mostly hotpath */
11608	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
11609	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
11610	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
11611	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
11612	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
11613	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
11614	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
11615	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
11616	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
11617	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
11618	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
11619	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
11620	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
11621#ifdef CONFIG_XPS
11622	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
11623#endif
11624#ifdef CONFIG_NETFILTER_EGRESS
11625	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
11626#endif
11627#ifdef CONFIG_NET_XGRESS
11628	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
11629#endif
11630	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
11631
11632	/* TXRX read-mostly hotpath */
11633	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
11634	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
11635	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
11636	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
11637	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
11638	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 38);
11639
11640	/* RX read-mostly hotpath */
11641	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
11642	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
11643	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
11644	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
11645	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
11646	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
11647	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
11648	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
11649	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
11650	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
11651	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
11652#ifdef CONFIG_NETPOLL
11653	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
11654#endif
11655#ifdef CONFIG_NET_XGRESS
11656	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
11657#endif
11658	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
11659}
11660
11661/*
11662 *	Initialize the DEV module. At boot time this walks the device list and
11663 *	unhooks any devices that fail to initialise (normally hardware not
11664 *	present) and leaves us with a valid list of present and active devices.
11665 *
11666 */
11667
11668/*
11669 *       This is called single threaded during boot, so no need
11670 *       to take the rtnl semaphore.
11671 */
11672static int __init net_dev_init(void)
11673{
	/* rc stays -ENOMEM for every early "goto out" failure path below */
11674	int i, rc = -ENOMEM;
11675
11676	BUG_ON(!dev_boot_phase);
11677
11678	net_dev_struct_check();
11679
11680	if (dev_proc_init())
11681		goto out;
11682
11683	if (netdev_kobject_init())
11684		goto out;
11685
	/* Start with empty packet-type lists: the tap list and every hash bucket */
11686	INIT_LIST_HEAD(&ptype_all);
11687	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11688		INIT_LIST_HEAD(&ptype_base[i]);
11689
11690	if (register_pernet_subsys(&netdev_net_ops))
11691		goto out;
11692
11693	/*
11694	 *	Initialise the packet receive queues.
11695	 */
11696
	/* Done for every possible CPU, not only the ones currently online */
11697	for_each_possible_cpu(i) {
11698		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11699		struct softnet_data *sd = &per_cpu(softnet_data, i);
11700
11701		INIT_WORK(flush, flush_backlog);
11702
11703		skb_queue_head_init(&sd->input_pkt_queue);
11704		skb_queue_head_init(&sd->process_queue);
11705#ifdef CONFIG_XFRM_OFFLOAD
11706		skb_queue_head_init(&sd->xfrm_backlog);
11707#endif
11708		INIT_LIST_HEAD(&sd->poll_list);
		/* Empty output queue: the tail pointer refers to the head itself */
11709		sd->output_queue_tailp = &sd->output_queue;
11710#ifdef CONFIG_RPS
11711		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11712		sd->cpu = i;
11713#endif
11714		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
11715		spin_lock_init(&sd->defer_lock);
11716
		/* The per-CPU backlog is a NAPI instance polled by process_backlog() */
11717		init_gro_hash(&sd->backlog);
11718		sd->backlog.poll = process_backlog;
11719		sd->backlog.weight = weight_p;
11720	}
11721
11722	dev_boot_phase = 0;
11723
11724	/* The loopback device is special: if any other network device
11725	 * is present in a network namespace, the loopback device must
11726	 * be present too. Since we now dynamically allocate and free the
11727	 * loopback device, ensure this invariant is maintained by
11728	 * keeping the loopback device as the first device on the
11729	 * list of network devices.  This ensures the loopback device
11730	 * is the first device that appears and the last network device
11731	 * that disappears.
11732	 */
11733	if (register_pernet_device(&loopback_net_ops))
11734		goto out;
11735
11736	if (register_pernet_device(&default_device_ops))
11737		goto out;
11738
11739	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11740	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11741
	/*
	 * A failure to register the CPU-hotplug "dead" callback only
	 * triggers a warning; rc is reset to 0 right after, so the
	 * initcall still reports success.
	 */
11742	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11743				       NULL, dev_cpu_dead);
11744	WARN_ON(rc < 0);
11745	rc = 0;
11746out:
11747	return rc;
11748}
11749
11750subsys_initcall(net_dev_init);
v4.10.11
 
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
 
  84#include <linux/mutex.h>
 
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
 
  97#include <linux/bpf.h>
 
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <net/busy_poll.h>
 101#include <linux/rtnetlink.h>
 102#include <linux/stat.h>
 
 103#include <net/dst.h>
 104#include <net/dst_metadata.h>
 
 105#include <net/pkt_sched.h>
 
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/iw_handler.h>
 115#include <asm/current.h>
 116#include <linux/audit.h>
 117#include <linux/dmaengine.h>
 118#include <linux/err.h>
 119#include <linux/ctype.h>
 120#include <linux/if_arp.h>
 121#include <linux/if_vlan.h>
 122#include <linux/ip.h>
 123#include <net/ip.h>
 124#include <net/mpls.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/static_key.h>
 136#include <linux/hashtable.h>
 137#include <linux/vmalloc.h>
 138#include <linux/if_macvlan.h>
 139#include <linux/errqueue.h>
 140#include <linux/hrtimer.h>
 141#include <linux/netfilter_ingress.h>
 142#include <linux/crash_dump.h>
 
 
 
 
 
 
 
 
 
 143
 
 144#include "net-sysfs.h"
 145
 146/* Instead of increasing this, you should create a hash table. */
 147#define MAX_GRO_SKBS 8
 148
 149/* This should be increased if a protocol with a bigger head is added. */
 150#define GRO_MAX_HEAD (MAX_HEADER + 128)
 151
 152static DEFINE_SPINLOCK(ptype_lock);	/* held by dev_add_pack()/__dev_remove_pack() around list updates */
 153static DEFINE_SPINLOCK(offload_lock);	/* held by dev_add_offload()/__dev_remove_offload() around offload_base updates */
 154struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;	/* indexed by ntohs(type) & PTYPE_HASH_MASK, see ptype_head() */
 155struct list_head ptype_all __read_mostly;	/* Taps */
 156static struct list_head offload_base __read_mostly;	/* packet_offload list, ordered by priority in dev_add_offload() */
 157
 158static int netif_rx_internal(struct sk_buff *skb);
 159static int call_netdevice_notifiers_info(unsigned long val,
 160					 struct net_device *dev,
 161					 struct netdev_notifier_info *info);
 162
 163/*
 164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 165 * semaphore.
 166 *
 167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 168 *
 169 * Writers must hold the rtnl semaphore while they loop through the
 170 * dev_base_head list, and hold dev_base_lock for writing when they do the
 171 * actual updates.  This allows pure readers to access the list even
 172 * while a writer is preparing to update it.
 173 *
 174 * To put it another way, dev_base_lock is held for writing only to
 175 * protect against pure readers; the rtnl semaphore provides the
 176 * protection against other writers.
 177 *
 178 * See, for example usages, register_netdevice() and
 179 * unregister_netdevice(), which must be called with the rtnl
 180 * semaphore held.
 181 */
 182DEFINE_RWLOCK(dev_base_lock);
 183EXPORT_SYMBOL(dev_base_lock);
 184
 
 
 185/* protects napi_hash addition/deletion and napi_gen_id */
 186static DEFINE_SPINLOCK(napi_hash_lock);
 187
 188static unsigned int napi_gen_id = NR_CPUS;
 189static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 190
 191static seqcount_t devnet_rename_seq;
 192
 193static inline void dev_base_seq_inc(struct net *net)
 194{
 195	while (++net->dev_base_seq == 0);
 
 196}
 197
 198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 199{
 200	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 201
 202	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208}
 209
 210static inline void rps_lock(struct softnet_data *sd)
 211{
	/*
	 * With CONFIG_RPS, take the spinlock embedded in
	 * sd->input_pkt_queue.  Without it this function is empty.
	 * Paired with rps_unlock().
	 */
 212#ifdef CONFIG_RPS
 213	spin_lock(&sd->input_pkt_queue.lock);
 214#endif
 215}
 216
 217static inline void rps_unlock(struct softnet_data *sd)
 218{
	/*
	 * Counterpart of rps_lock(): with CONFIG_RPS, release the spinlock
	 * embedded in sd->input_pkt_queue.  Without it this function is
	 * empty.
	 */
 219#ifdef CONFIG_RPS
 220	spin_unlock(&sd->input_pkt_queue.lock);
 221#endif
 222}
 223
 224/* Device list insertion
 * Adds dev to its namespace's device list, name hash and ifindex hash.
 * The caller must hold RTNL (asserted below).
 */
 225static void list_netdevice(struct net_device *dev)
 226{
 227	struct net *net = dev_net(dev);
 228
 229	ASSERT_RTNL();
 230
	/* All three insertions happen inside one dev_base_lock write section */
 231	write_lock_bh(&dev_base_lock);
 232	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234	hlist_add_head_rcu(&dev->index_hlist,
 235			   dev_index_hash(net, dev->ifindex));
 236	write_unlock_bh(&dev_base_lock);
 237
	/* Bump the namespace's device-list sequence number after the change */
 238	dev_base_seq_inc(net);
 239}
 240
 241/* Device list removal
 242 * The caller must hold RTNL (asserted below) and must respect an RCU
 243 * grace period before freeing/reusing dev.
 */
 244static void unlist_netdevice(struct net_device *dev)
 245{
 246	ASSERT_RTNL();
 247
 248	/* Unlink dev from the device chain and from both hash tables */
 249	write_lock_bh(&dev_base_lock);
 250	list_del_rcu(&dev->dev_list);
 251	hlist_del_rcu(&dev->name_hlist);
 252	hlist_del_rcu(&dev->index_hlist);
 253	write_unlock_bh(&dev_base_lock);
 254
	/* Bump the namespace's device-list sequence number after the change */
 255	dev_base_seq_inc(dev_net(dev));
 256}
 257
 258/*
 259 *	Our notifier list
 260 */
 261
 262static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264/*
 265 *	Device drivers call our routines to queue packets here. We empty the
 266 *	queue in the local softnet handler.
 267 */
 268
 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272#ifdef CONFIG_LOCKDEP
 273/*
 274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275 * according to dev->type
 276 */
 277static const unsigned short netdev_lock_type[] =
 278	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 291	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 292	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 293
 294static const char *const netdev_lock_name[] =
 295	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 308	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 309	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 310
 311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315{
 316	int i;
 317
 318	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319		if (netdev_lock_type[i] == dev_type)
 320			return i;
 321	/* the last key is used by default */
 322	return ARRAY_SIZE(netdev_lock_type) - 1;
 323}
 324
 325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326						 unsigned short dev_type)
 327{
 328	int i;
 329
 330	i = netdev_lock_pos(dev_type);
 331	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332				   netdev_lock_name[i]);
 333}
 334
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev->type);
 340	lockdep_set_class_and_name(&dev->addr_list_lock,
 341				   &netdev_addr_lock_key[i],
 342				   netdev_lock_name[i]);
 343}
 344#else
 345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 346						 unsigned short dev_type)
 347{
 348}
 
 349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 350{
 351}
 352#endif
 353
 354/*******************************************************************************
 
 
 
 
 355
 356		Protocol management and registration routines
 357
 358*******************************************************************************/
 359
 360/*
 361 *	Add a protocol ID to the list. Now that the input handler is
 362 *	smarter we can dispense with all the messy stuff that used to be
 363 *	here.
 364 *
 365 *	BEWARE!!! Protocol handlers, mangling input packets,
 366 *	MUST BE last in hash buckets and checking protocol handlers
 367 *	MUST start from promiscuous ptype_all chain in net_bh.
 368 *	It is true now, do not change it.
 369 *	Explanation follows: if protocol handler, mangling packet, will
 370 *	be the first on list, it is not able to sense, that packet
 371 *	is cloned and should be copied-on-write, so that it will
 372 *	change it and subsequent readers will get broken packet.
 373 *							--ANK (980803)
 374 */
 375
 376static inline struct list_head *ptype_head(const struct packet_type *pt)
 377{
 378	if (pt->type == htons(ETH_P_ALL))
 379		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 380	else
 381		return pt->dev ? &pt->dev->ptype_specific :
 382				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383}
 384
 385/**
 386 *	dev_add_pack - add packet handler
 387 *	@pt: packet type declaration
 388 *
 389 *	Add a protocol handler to the networking stack. The passed &packet_type
 390 *	is linked into kernel lists and may not be freed until it has been
 391 *	removed from the kernel lists.
 392 *
 393 *	This call does not sleep, therefore it cannot guarantee that
 394 *	all CPUs that are in the middle of receiving packets
 395 *	will see the new packet type (until the next received packet).
 396 */
 397
 398void dev_add_pack(struct packet_type *pt)
 399{
	/* ptype_head() picks the per-device or global list for this handler */
 400	struct list_head *head = ptype_head(pt);
 401
	/* The insertion itself is done under ptype_lock */
 402	spin_lock(&ptype_lock);
 403	list_add_rcu(&pt->list, head);
 404	spin_unlock(&ptype_lock);
 405}
 406EXPORT_SYMBOL(dev_add_pack);
 407
 408/**
 409 *	__dev_remove_pack	 - remove packet handler
 410 *	@pt: packet type declaration
 411 *
 412 *	Remove a protocol handler that was previously added to the kernel
 413 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414 *	from the kernel lists and can be freed or reused once this function
 415 *	returns.
 416 *
 417 *      The packet type might still be in use by receivers
 418 *	and must not be freed until after all the CPU's have gone
 419 *	through a quiescent state.
 420 */
 421void __dev_remove_pack(struct packet_type *pt)
 422{
 423	struct list_head *head = ptype_head(pt);
 424	struct packet_type *pt1;
 425
 426	spin_lock(&ptype_lock);
 427
 428	list_for_each_entry(pt1, head, list) {
 429		if (pt == pt1) {
 430			list_del_rcu(&pt->list);
 431			goto out;
 432		}
 433	}
 434
 435	pr_warn("dev_remove_pack: %p not found\n", pt);
 436out:
 437	spin_unlock(&ptype_lock);
 438}
 439EXPORT_SYMBOL(__dev_remove_pack);
 440
 441/**
 442 *	dev_remove_pack	 - remove packet handler
 443 *	@pt: packet type declaration
 444 *
 445 *	Remove a protocol handler that was previously added to the kernel
 446 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447 *	from the kernel lists and can be freed or reused once this function
 448 *	returns.
 449 *
 450 *	This call sleeps to guarantee that no CPU is looking at the packet
 451 *	type after return.
 452 */
 453void dev_remove_pack(struct packet_type *pt)
 454{
	/* Unlink first, then wait in synchronize_net() before returning */
 455	__dev_remove_pack(pt);
 456
 457	synchronize_net();
 458}
 459EXPORT_SYMBOL(dev_remove_pack);
 460
 461
 462/**
 463 *	dev_add_offload - register offload handlers
 464 *	@po: protocol offload declaration
 465 *
 466 *	Add protocol offload handlers to the networking stack. The passed
 467 *	&proto_offload is linked into kernel lists and may not be freed until
 468 *	it has been removed from the kernel lists.
 469 *
 470 *	This call does not sleep therefore it can not
 471 *	guarantee all CPU's that are in middle of receiving packets
 472 *	will see the new offload handlers (until the next received packet).
 473 */
 474void dev_add_offload(struct packet_offload *po)
 475{
 476	struct packet_offload *elem;
 477
 478	spin_lock(&offload_lock);
 479	list_for_each_entry(elem, &offload_base, list) {
 480		if (po->priority < elem->priority)
 481			break;
 482	}
 483	list_add_rcu(&po->list, elem->list.prev);
 484	spin_unlock(&offload_lock);
 485}
 486EXPORT_SYMBOL(dev_add_offload);
 487
 488/**
 489 *	__dev_remove_offload	 - remove offload handler
 490 *	@po: packet offload declaration
 491 *
 492 *	Remove a protocol offload handler that was previously added to the
 493 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 494 *	is removed from the kernel lists and can be freed or reused once this
 495 *	function returns.
 496 *
 497 *      The packet type might still be in use by receivers
 498 *	and must not be freed until after all the CPU's have gone
 499 *	through a quiescent state.
 500 */
 501static void __dev_remove_offload(struct packet_offload *po)
 502{
 503	struct list_head *head = &offload_base;
 504	struct packet_offload *po1;
 505
 506	spin_lock(&offload_lock);
 507
 508	list_for_each_entry(po1, head, list) {
 509		if (po == po1) {
 510			list_del_rcu(&po->list);
 511			goto out;
 512		}
 513	}
 514
 515	pr_warn("dev_remove_offload: %p not found\n", po);
 516out:
 517	spin_unlock(&offload_lock);
 518}
 519
 520/**
 521 *	dev_remove_offload	 - remove packet offload handler
 522 *	@po: packet offload declaration
 523 *
 524 *	Remove a packet offload handler that was previously added to the kernel
 525 *	offload handlers by dev_add_offload(). The passed &offload_type is
 526 *	removed from the kernel lists and can be freed or reused once this
 527 *	function returns.
 528 *
 529 *	This call sleeps to guarantee that no CPU is looking at the packet
 530 *	type after return.
 531 */
 532void dev_remove_offload(struct packet_offload *po)
 533{
 534	__dev_remove_offload(po);
 535
 536	synchronize_net();
 537}
 538EXPORT_SYMBOL(dev_remove_offload);
 539
 540/******************************************************************************
 541
 542		      Device Boot-time Settings Routines
 543
 544*******************************************************************************/
 545
 546/* Boot time configuration table */
 547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 548
 549/**
 550 *	netdev_boot_setup_add	- add new setup entry
 551 *	@name: name of the device
 552 *	@map: configured settings for the device
 553 *
 554 *	Adds new setup entry to the dev_boot_setup list.  The function
 555 *	returns 0 on error and 1 on success.  This is a generic routine to
 556 *	all netdevices.
 557 */
 558static int netdev_boot_setup_add(char *name, struct ifmap *map)
 559{
 560	struct netdev_boot_setup *s;
 561	int i;
 562
 563	s = dev_boot_setup;
 564	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 566			memset(s[i].name, 0, sizeof(s[i].name));
 567			strlcpy(s[i].name, name, IFNAMSIZ);
 568			memcpy(&s[i].map, map, sizeof(s[i].map));
 569			break;
 570		}
 571	}
 572
 573	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 574}
 575
 576/**
 577 *	netdev_boot_setup_check	- check boot time settings
 578 *	@dev: the netdevice
 579 *
 580 * 	Check boot time settings for the device.
 581 *	The found settings are set for the device to be used
 582 *	later in the device probing.
 583 *	Returns 0 if no settings found, 1 if they are.
 584 */
 585int netdev_boot_setup_check(struct net_device *dev)
 586{
 587	struct netdev_boot_setup *s = dev_boot_setup;
 588	int i;
 589
 590	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 591		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 592		    !strcmp(dev->name, s[i].name)) {
 593			dev->irq 	= s[i].map.irq;
 594			dev->base_addr 	= s[i].map.base_addr;
 595			dev->mem_start 	= s[i].map.mem_start;
 596			dev->mem_end 	= s[i].map.mem_end;
 597			return 1;
 598		}
 599	}
 600	return 0;
 601}
 602EXPORT_SYMBOL(netdev_boot_setup_check);
 603
 604
 605/**
 606 *	netdev_boot_base	- get address from boot time settings
 607 *	@prefix: prefix for network device
 608 *	@unit: id for network device
 609 *
 610 * 	Check boot time settings for the base address of device.
 611 *	The found settings are set for the device to be used
 612 *	later in the device probing.
 613 *	Returns 0 if no settings found.
 614 */
 615unsigned long netdev_boot_base(const char *prefix, int unit)
 616{
 617	const struct netdev_boot_setup *s = dev_boot_setup;
 618	char name[IFNAMSIZ];
 619	int i;
 620
 621	sprintf(name, "%s%d", prefix, unit);
 622
 623	/*
 624	 * If device already registered then return base of 1
 625	 * to indicate not to probe for this interface
 626	 */
 627	if (__dev_get_by_name(&init_net, name))
 628		return 1;
 629
 630	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 631		if (!strcmp(name, s[i].name))
 632			return s[i].map.base_addr;
 633	return 0;
 634}
 635
 636/*
 637 * Saves at boot time configured settings for any netdevice.
 638 */
 639int __init netdev_boot_setup(char *str)
 640{
 641	int ints[5];
 642	struct ifmap map;
 643
 644	str = get_options(str, ARRAY_SIZE(ints), ints);
 645	if (!str || !*str)
 646		return 0;
 647
 648	/* Save settings */
 649	memset(&map, 0, sizeof(map));
 650	if (ints[0] > 0)
 651		map.irq = ints[1];
 652	if (ints[0] > 1)
 653		map.base_addr = ints[2];
 654	if (ints[0] > 2)
 655		map.mem_start = ints[3];
 656	if (ints[0] > 3)
 657		map.mem_end = ints[4];
 658
 659	/* Add new entry to the list */
 660	return netdev_boot_setup_add(str, &map);
 661}
 662
 663__setup("netdev=", netdev_boot_setup);
 664
 665/*******************************************************************************
 666
 667			    Device Interface Subroutines
 668
 669*******************************************************************************/
 670
 671/**
 672 *	dev_get_iflink	- get 'iflink' value of a interface
 673 *	@dev: targeted interface
 674 *
 675 *	Indicates the ifindex the interface is linked to.
 676 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 677 */
 678
 679int dev_get_iflink(const struct net_device *dev)
 680{
 681	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 682		return dev->netdev_ops->ndo_get_iflink(dev);
 683
 684	return dev->ifindex;
 685}
 686EXPORT_SYMBOL(dev_get_iflink);
 687
 688/**
 689 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 690 *	@dev: targeted interface
 691 *	@skb: The packet.
 692 *
 693 *	For better visibility of tunnel traffic OVS needs to retrieve
 694 *	egress tunnel information for a packet. Following API allows
 695 *	user to get this info.
 696 */
 697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 698{
 699	struct ip_tunnel_info *info;
 700
 701	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 702		return -EINVAL;
 703
 704	info = skb_tunnel_info_unclone(skb);
 705	if (!info)
 706		return -ENOMEM;
 707	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 708		return -EINVAL;
 709
 710	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 711}
 712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 714/**
 715 *	__dev_get_by_name	- find a device by its name
 716 *	@net: the applicable net namespace
 717 *	@name: name to find
 718 *
 719 *	Find an interface by name. Must be called under RTNL semaphore
 720 *	or @dev_base_lock. If the name is found a pointer to the device
 721 *	is returned. If the name is not found then %NULL is returned. The
 722 *	reference counters are not incremented so the caller must be
 723 *	careful with locks.
 724 */
 725
 726struct net_device *__dev_get_by_name(struct net *net, const char *name)
 727{
 728	struct net_device *dev;
 729	struct hlist_head *head = dev_name_hash(net, name);
 730
 731	hlist_for_each_entry(dev, head, name_hlist)
 732		if (!strncmp(dev->name, name, IFNAMSIZ))
 733			return dev;
 734
 735	return NULL;
 
 736}
 737EXPORT_SYMBOL(__dev_get_by_name);
 738
 739/**
 740 *	dev_get_by_name_rcu	- find a device by its name
 741 *	@net: the applicable net namespace
 742 *	@name: name to find
 743 *
 744 *	Find an interface by name.
 745 *	If the name is found a pointer to the device is returned.
 746 * 	If the name is not found then %NULL is returned.
 747 *	The reference counters are not incremented so the caller must be
 748 *	careful with locks. The caller must hold RCU lock.
 749 */
 750
 751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 752{
 
 
 
 
 
 
 
 
 
 
 753	struct net_device *dev;
 754	struct hlist_head *head = dev_name_hash(net, name);
 755
 756	hlist_for_each_entry_rcu(dev, head, name_hlist)
 757		if (!strncmp(dev->name, name, IFNAMSIZ))
 758			return dev;
 759
 760	return NULL;
 761}
 762EXPORT_SYMBOL(dev_get_by_name_rcu);
 763
 764/**
 765 *	dev_get_by_name		- find a device by its name
 766 *	@net: the applicable net namespace
 767 *	@name: name to find
 
 
 768 *
 769 *	Find an interface by name. This can be called from any
 770 *	context and does its own locking. The returned handle has
 771 *	the usage count incremented and the caller must use dev_put() to
 772 *	release it when it is no longer needed. %NULL is returned if no
 773 *	matching device is found.
 774 */
 775
 776struct net_device *dev_get_by_name(struct net *net, const char *name)
 777{
 778	struct net_device *dev;
 779
 780	rcu_read_lock();
 781	dev = dev_get_by_name_rcu(net, name);
 782	if (dev)
 783		dev_hold(dev);
 784	rcu_read_unlock();
 785	return dev;
 786}
 787EXPORT_SYMBOL(dev_get_by_name);
 788
 789/**
 790 *	__dev_get_by_index - find a device by its ifindex
 791 *	@net: the applicable net namespace
 792 *	@ifindex: index of device
 793 *
 794 *	Search for an interface by index. Returns %NULL if the device
 795 *	is not found or a pointer to the device. The device has not
 796 *	had its reference counter increased so the caller must be careful
 797 *	about locking. The caller must hold either the RTNL semaphore
 798 *	or @dev_base_lock.
 799 */
 800
 801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 802{
 803	struct net_device *dev;
 804	struct hlist_head *head = dev_index_hash(net, ifindex);
 805
 806	hlist_for_each_entry(dev, head, index_hlist)
 807		if (dev->ifindex == ifindex)
 808			return dev;
 809
 810	return NULL;
 811}
 812EXPORT_SYMBOL(__dev_get_by_index);
 813
 814/**
 815 *	dev_get_by_index_rcu - find a device by its ifindex
 816 *	@net: the applicable net namespace
 817 *	@ifindex: index of device
 818 *
 819 *	Search for an interface by index. Returns %NULL if the device
 820 *	is not found or a pointer to the device. The device has not
 821 *	had its reference counter increased so the caller must be careful
 822 *	about locking. The caller must hold RCU lock.
 823 */
 824
 825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 826{
 827	struct net_device *dev;
 828	struct hlist_head *head = dev_index_hash(net, ifindex);
 829
 830	hlist_for_each_entry_rcu(dev, head, index_hlist)
 831		if (dev->ifindex == ifindex)
 832			return dev;
 833
 834	return NULL;
 835}
 836EXPORT_SYMBOL(dev_get_by_index_rcu);
 837
 
 
 
 
 
 
 
 
 
 
 
 
 838
 839/**
 840 *	dev_get_by_index - find a device by its ifindex
 841 *	@net: the applicable net namespace
 842 *	@ifindex: index of device
 
 
 843 *
 844 *	Search for an interface by index. Returns NULL if the device
 845 *	is not found or a pointer to the device. The device returned has
 846 *	had a reference added and the pointer is safe until the user calls
 847 *	dev_put to indicate they have finished with it.
 848 */
 849
 850struct net_device *dev_get_by_index(struct net *net, int ifindex)
 851{
 852	struct net_device *dev;
 853
 854	rcu_read_lock();
 855	dev = dev_get_by_index_rcu(net, ifindex);
 856	if (dev)
 857		dev_hold(dev);
 858	rcu_read_unlock();
 859	return dev;
 860}
 861EXPORT_SYMBOL(dev_get_by_index);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 862
 863/**
 864 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 865 *	@net: network namespace
 866 *	@name: a pointer to the buffer where the name will be stored.
 867 *	@ifindex: the ifindex of the interface to get the name from.
 868 *
 869 *	The use of raw_seqcount_begin() and cond_resched() before
 870 *	retrying is required as we want to give the writers a chance
 871 *	to complete when CONFIG_PREEMPT is not set.
 872 */
 873int netdev_get_name(struct net *net, char *name, int ifindex)
 874{
 875	struct net_device *dev;
 876	unsigned int seq;
 877
 878retry:
 879	seq = raw_seqcount_begin(&devnet_rename_seq);
 880	rcu_read_lock();
 
 881	dev = dev_get_by_index_rcu(net, ifindex);
 882	if (!dev) {
 883		rcu_read_unlock();
 884		return -ENODEV;
 885	}
 886
 887	strcpy(name, dev->name);
 
 
 
 888	rcu_read_unlock();
 889	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 890		cond_resched();
 891		goto retry;
 892	}
 893
 894	return 0;
 895}
 896
 897/**
 898 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 899 *	@net: the applicable net namespace
 900 *	@type: media type of device
 901 *	@ha: hardware address
 902 *
 903 *	Search for an interface by MAC address. Returns NULL if the device
 904 *	is not found or a pointer to the device.
 905 *	The caller must hold RCU or RTNL.
 906 *	The returned device has not had its ref count increased
 907 *	and the caller must therefore be careful about locking
 908 *
 909 */
 910
 911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 912				       const char *ha)
 913{
 914	struct net_device *dev;
 915
 916	for_each_netdev_rcu(net, dev)
 917		if (dev->type == type &&
 918		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 919			return dev;
 920
 921	return NULL;
 922}
 923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 924
 925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 926{
 927	struct net_device *dev;
 928
 929	ASSERT_RTNL();
 930	for_each_netdev(net, dev)
 931		if (dev->type == type)
 932			return dev;
 933
 934	return NULL;
 935}
 936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 937
 938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 939{
 940	struct net_device *dev, *ret = NULL;
 941
 942	rcu_read_lock();
 943	for_each_netdev_rcu(net, dev)
 944		if (dev->type == type) {
 945			dev_hold(dev);
 946			ret = dev;
 947			break;
 948		}
 949	rcu_read_unlock();
 950	return ret;
 951}
 952EXPORT_SYMBOL(dev_getfirstbyhwtype);
 953
 954/**
 955 *	__dev_get_by_flags - find any device with given flags
 956 *	@net: the applicable net namespace
 957 *	@if_flags: IFF_* values
 958 *	@mask: bitmask of bits in if_flags to check
 959 *
 960 *	Search for any interface with the given flags. Returns NULL if a device
 961 *	is not found or a pointer to the device. Must be called inside
 962 *	rtnl_lock(), and result refcount is unchanged.
 963 */
 964
 965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 966				      unsigned short mask)
 967{
 968	struct net_device *dev, *ret;
 969
 970	ASSERT_RTNL();
 971
 972	ret = NULL;
 973	for_each_netdev(net, dev) {
 974		if (((dev->flags ^ if_flags) & mask) == 0) {
 975			ret = dev;
 976			break;
 977		}
 978	}
 979	return ret;
 980}
 981EXPORT_SYMBOL(__dev_get_by_flags);
 982
 983/**
 984 *	dev_valid_name - check if name is okay for network device
 985 *	@name: name string
 986 *
 987 *	Network device names need to be valid file names to
 988 *	to allow sysfs to work.  We also disallow any kind of
 989 *	whitespace.
 990 */
 991bool dev_valid_name(const char *name)
 992{
 993	if (*name == '\0')
 994		return false;
 995	if (strlen(name) >= IFNAMSIZ)
 996		return false;
 997	if (!strcmp(name, ".") || !strcmp(name, ".."))
 998		return false;
 999
1000	while (*name) {
1001		if (*name == '/' || *name == ':' || isspace(*name))
1002			return false;
1003		name++;
1004	}
1005	return true;
1006}
1007EXPORT_SYMBOL(dev_valid_name);
1008
1009/**
1010 *	__dev_alloc_name - allocate a name for a device
1011 *	@net: network namespace to allocate the device name in
1012 *	@name: name format string
1013 *	@buf:  scratch buffer and result name string
1014 *
1015 *	Passed a format string - eg "lt%d" it will try and find a suitable
1016 *	id. It scans list of devices to build up a free map, then chooses
1017 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018 *	while allocating the name and adding the device in order to avoid
1019 *	duplicates.
1020 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 *	Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025{
1026	int i = 0;
1027	const char *p;
1028	const int max_netdevices = 8*PAGE_SIZE;
1029	unsigned long *inuse;
1030	struct net_device *d;
 
 
 
 
 
 
 
 
1031
1032	p = strnchr(name, IFNAMSIZ-1, '%');
1033	if (p) {
1034		/*
1035		 * Verify the string as this thing may have come from
1036		 * the user.  There must be either one "%d" and no other "%"
1037		 * characters.
1038		 */
1039		if (p[1] != 'd' || strchr(p + 2, '%'))
1040			return -EINVAL;
1041
1042		/* Use one page as a bit array of possible slots */
1043		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044		if (!inuse)
1045			return -ENOMEM;
1046
1047		for_each_netdev(net, d) {
1048			if (!sscanf(d->name, name, &i))
1049				continue;
1050			if (i < 0 || i >= max_netdevices)
1051				continue;
1052
1053			/*  avoid cases where sscanf is not exact inverse of printf */
1054			snprintf(buf, IFNAMSIZ, name, i);
1055			if (!strncmp(buf, d->name, IFNAMSIZ))
1056				set_bit(i, inuse);
1057		}
 
 
 
 
1058
1059		i = find_first_zero_bit(inuse, max_netdevices);
1060		free_page((unsigned long) inuse);
 
 
1061	}
1062
1063	if (buf != name)
1064		snprintf(buf, IFNAMSIZ, name, i);
1065	if (!__dev_get_by_name(net, buf))
1066		return i;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1067
1068	/* It is possible to run out of possible slots
1069	 * when the name is long and there isn't enough space left
1070	 * for the digits, or if all bits are used.
1071	 */
1072	return -ENFILE;
1073}
1074
1075/**
1076 *	dev_alloc_name - allocate a name for a device
1077 *	@dev: device
1078 *	@name: name format string
1079 *
1080 *	Passed a format string - eg "lt%d" it will try and find a suitable
1081 *	id. It scans list of devices to build up a free map, then chooses
1082 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083 *	while allocating the name and adding the device in order to avoid
1084 *	duplicates.
1085 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 *	Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091	char buf[IFNAMSIZ];
1092	struct net *net;
1093	int ret;
1094
1095	BUG_ON(!dev_net(dev));
1096	net = dev_net(dev);
1097	ret = __dev_alloc_name(net, name, buf);
1098	if (ret >= 0)
1099		strlcpy(dev->name, buf, IFNAMSIZ);
1100	return ret;
1101}
1102EXPORT_SYMBOL(dev_alloc_name);
1103
1104static int dev_alloc_name_ns(struct net *net,
1105			     struct net_device *dev,
1106			     const char *name)
1107{
1108	char buf[IFNAMSIZ];
1109	int ret;
1110
1111	ret = __dev_alloc_name(net, name, buf);
1112	if (ret >= 0)
1113		strlcpy(dev->name, buf, IFNAMSIZ);
1114	return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118			      struct net_device *dev,
1119			      const char *name)
1120{
1121	BUG_ON(!net);
1122
1123	if (!dev_valid_name(name))
1124		return -EINVAL;
1125
1126	if (strchr(name, '%'))
1127		return dev_alloc_name_ns(net, dev, name);
1128	else if (__dev_get_by_name(net, name))
1129		return -EEXIST;
1130	else if (dev->name != name)
1131		strlcpy(dev->name, name, IFNAMSIZ);
1132
1133	return 0;
1134}
1135
1136/**
1137 *	dev_change_name - change name of a device
1138 *	@dev: device
1139 *	@newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141 *	Change name of a device, can pass format strings "eth%d".
1142 *	for wildcarding.
1143 */
1144int dev_change_name(struct net_device *dev, const char *newname)
1145{
1146	unsigned char old_assign_type;
1147	char oldname[IFNAMSIZ];
1148	int err = 0;
1149	int ret;
1150	struct net *net;
1151
1152	ASSERT_RTNL();
1153	BUG_ON(!dev_net(dev));
1154
1155	net = dev_net(dev);
1156	if (dev->flags & IFF_UP)
1157		return -EBUSY;
1158
1159	write_seqcount_begin(&devnet_rename_seq);
1160
1161	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162		write_seqcount_end(&devnet_rename_seq);
1163		return 0;
1164	}
1165
1166	memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168	err = dev_get_valid_name(net, dev, newname);
1169	if (err < 0) {
1170		write_seqcount_end(&devnet_rename_seq);
1171		return err;
1172	}
1173
1174	if (oldname[0] && !strchr(oldname, '%'))
1175		netdev_info(dev, "renamed from %s\n", oldname);
 
1176
1177	old_assign_type = dev->name_assign_type;
1178	dev->name_assign_type = NET_NAME_RENAMED;
1179
1180rollback:
1181	ret = device_rename(&dev->dev, dev->name);
1182	if (ret) {
1183		memcpy(dev->name, oldname, IFNAMSIZ);
1184		dev->name_assign_type = old_assign_type;
1185		write_seqcount_end(&devnet_rename_seq);
1186		return ret;
1187	}
1188
1189	write_seqcount_end(&devnet_rename_seq);
1190
1191	netdev_adjacent_rename_links(dev, oldname);
1192
1193	write_lock_bh(&dev_base_lock);
1194	hlist_del_rcu(&dev->name_hlist);
1195	write_unlock_bh(&dev_base_lock);
1196
1197	synchronize_rcu();
1198
1199	write_lock_bh(&dev_base_lock);
1200	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201	write_unlock_bh(&dev_base_lock);
1202
1203	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204	ret = notifier_to_errno(ret);
1205
1206	if (ret) {
1207		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208		if (err >= 0) {
1209			err = ret;
1210			write_seqcount_begin(&devnet_rename_seq);
1211			memcpy(dev->name, oldname, IFNAMSIZ);
1212			memcpy(oldname, newname, IFNAMSIZ);
1213			dev->name_assign_type = old_assign_type;
1214			old_assign_type = NET_NAME_RENAMED;
1215			goto rollback;
1216		} else {
1217			pr_err("%s: name change rollback failed: %d\n",
1218			       dev->name, ret);
1219		}
1220	}
1221
1222	return err;
1223}
1224
1225/**
1226 *	dev_set_alias - change ifalias of a device
1227 *	@dev: device
1228 *	@alias: name up to IFALIASZ
1229 *	@len: limit of bytes to copy from info
1230 *
1231 *	Set ifalias for a device,
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
1235	char *new_ifalias;
1236
1237	ASSERT_RTNL();
1238
1239	if (len >= IFALIASZ)
1240		return -EINVAL;
1241
1242	if (!len) {
1243		kfree(dev->ifalias);
1244		dev->ifalias = NULL;
1245		return 0;
 
 
 
1246	}
1247
1248	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249	if (!new_ifalias)
1250		return -ENOMEM;
1251	dev->ifalias = new_ifalias;
 
 
 
1252
1253	strlcpy(dev->ifalias, alias, len+1);
1254	return len;
1255}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1256
 
 
1257
1258/**
1259 *	netdev_features_change - device changes features
1260 *	@dev: device to cause notification
1261 *
1262 *	Called to indicate a device has changed features.
1263 */
1264void netdev_features_change(struct net_device *dev)
1265{
1266	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267}
1268EXPORT_SYMBOL(netdev_features_change);
1269
1270/**
1271 *	netdev_state_change - device changes state
1272 *	@dev: device to cause notification
1273 *
1274 *	Called to indicate a device has changed state. This function calls
1275 *	the notifier chains for netdev_chain and sends a NEWLINK message
1276 *	to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280	if (dev->flags & IFF_UP) {
1281		struct netdev_notifier_change_info change_info;
 
 
1282
1283		change_info.flags_changed = 0;
1284		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285					      &change_info.info);
1286		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287	}
1288}
1289EXPORT_SYMBOL(netdev_state_change);
1290
1291/**
1292 * 	netdev_notify_peers - notify network peers about existence of @dev
1293 * 	@dev: network device
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301void netdev_notify_peers(struct net_device *dev)
1302{
1303	rtnl_lock();
1304	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305	rtnl_unlock();
1306}
1307EXPORT_SYMBOL(netdev_notify_peers);
1308
1309static int __dev_open(struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1310{
1311	const struct net_device_ops *ops = dev->netdev_ops;
1312	int ret;
1313
1314	ASSERT_RTNL();
 
1315
1316	if (!netif_device_present(dev))
1317		return -ENODEV;
 
 
 
 
 
1318
1319	/* Block netpoll from trying to do any rx path servicing.
1320	 * If we don't do this there is a chance ndo_poll_controller
1321	 * or ndo_poll may be running while we open the device
1322	 */
1323	netpoll_poll_disable(dev);
1324
1325	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326	ret = notifier_to_errno(ret);
1327	if (ret)
1328		return ret;
1329
1330	set_bit(__LINK_STATE_START, &dev->state);
1331
1332	if (ops->ndo_validate_addr)
1333		ret = ops->ndo_validate_addr(dev);
1334
1335	if (!ret && ops->ndo_open)
1336		ret = ops->ndo_open(dev);
1337
1338	netpoll_poll_enable(dev);
1339
1340	if (ret)
1341		clear_bit(__LINK_STATE_START, &dev->state);
1342	else {
1343		dev->flags |= IFF_UP;
1344		dev_set_rx_mode(dev);
1345		dev_activate(dev);
1346		add_device_randomness(dev->dev_addr, dev->addr_len);
1347	}
1348
1349	return ret;
1350}
1351
1352/**
1353 *	dev_open	- prepare an interface for use.
1354 *	@dev:	device to open
 
1355 *
1356 *	Takes a device from down to up state. The device's private open
1357 *	function is invoked and then the multicast lists are loaded. Finally
1358 *	the device is moved into the up state and a %NETDEV_UP message is
1359 *	sent to the netdev notifier chain.
1360 *
1361 *	Calling this function on an active interface is a nop. On a failure
1362 *	a negative errno code is returned.
1363 */
1364int dev_open(struct net_device *dev)
1365{
1366	int ret;
1367
1368	if (dev->flags & IFF_UP)
1369		return 0;
1370
1371	ret = __dev_open(dev);
1372	if (ret < 0)
1373		return ret;
1374
1375	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376	call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378	return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
1381
1382static int __dev_close_many(struct list_head *head)
1383{
1384	struct net_device *dev;
1385
1386	ASSERT_RTNL();
1387	might_sleep();
1388
1389	list_for_each_entry(dev, head, close_list) {
1390		/* Temporarily disable netpoll until the interface is down */
1391		netpoll_poll_disable(dev);
1392
1393		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395		clear_bit(__LINK_STATE_START, &dev->state);
1396
1397		/* Synchronize to scheduled poll. We cannot touch poll list, it
1398		 * can be even on different cpu. So just clear netif_running().
1399		 *
1400		 * dev->stop() will invoke napi_disable() on all of it's
1401		 * napi_struct instances on this device.
1402		 */
1403		smp_mb__after_atomic(); /* Commit netif_running(). */
1404	}
1405
1406	dev_deactivate_many(head);
1407
1408	list_for_each_entry(dev, head, close_list) {
1409		const struct net_device_ops *ops = dev->netdev_ops;
1410
1411		/*
1412		 *	Call the device specific close. This cannot fail.
1413		 *	Only if device is UP
1414		 *
1415		 *	We allow it to be called even after a DETACH hot-plug
1416		 *	event.
1417		 */
1418		if (ops->ndo_stop)
1419			ops->ndo_stop(dev);
1420
1421		dev->flags &= ~IFF_UP;
1422		netpoll_poll_enable(dev);
1423	}
1424
1425	return 0;
1426}
1427
1428static int __dev_close(struct net_device *dev)
1429{
1430	int retval;
1431	LIST_HEAD(single);
1432
1433	list_add(&dev->close_list, &single);
1434	retval = __dev_close_many(&single);
1435	list_del(&single);
1436
1437	return retval;
1438}
1439
1440int dev_close_many(struct list_head *head, bool unlink)
1441{
1442	struct net_device *dev, *tmp;
1443
1444	/* Remove the devices that don't need to be closed */
1445	list_for_each_entry_safe(dev, tmp, head, close_list)
1446		if (!(dev->flags & IFF_UP))
1447			list_del_init(&dev->close_list);
1448
1449	__dev_close_many(head);
1450
1451	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454		if (unlink)
1455			list_del_init(&dev->close_list);
1456	}
1457
1458	return 0;
1459}
1460EXPORT_SYMBOL(dev_close_many);
1461
1462/**
1463 *	dev_close - shutdown an interface.
1464 *	@dev: device to shutdown
1465 *
1466 *	This function moves an active device into down state. A
1467 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 *	chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
1473	if (dev->flags & IFF_UP) {
1474		LIST_HEAD(single);
1475
1476		list_add(&dev->close_list, &single);
1477		dev_close_many(&single, true);
1478		list_del(&single);
1479	}
1480	return 0;
1481}
1482EXPORT_SYMBOL(dev_close);
1483
1484
1485/**
1486 *	dev_disable_lro - disable Large Receive Offload on a device
1487 *	@dev: device
1488 *
1489 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490 *	called under RTNL.  This is needed if received packets may be
1491 *	forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
1495	struct net_device *lower_dev;
1496	struct list_head *iter;
1497
1498	dev->wanted_features &= ~NETIF_F_LRO;
1499	netdev_update_features(dev);
1500
1501	if (unlikely(dev->features & NETIF_F_LRO))
1502		netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505		dev_disable_lro(lower_dev);
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1509static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510				   struct net_device *dev)
1511{
1512	struct netdev_notifier_info info;
 
 
1513
1514	netdev_notifier_info_init(&info, dev);
1515	return nb->notifier_call(nb, val, &info);
1516}
1517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1518static int dev_boot_phase = 1;
1519
1520/**
1521 *	register_netdevice_notifier - register a network notifier block
1522 *	@nb: notifier
1523 *
1524 *	Register a notifier to be called when network device events occur.
1525 *	The notifier passed is linked into the kernel structures and must
1526 *	not be reused until it has been unregistered. A negative errno code
1527 *	is returned on a failure.
1528 *
1529 * 	When registered all registration and up events are replayed
1530 *	to the new notifier to allow device to have a race free
1531 *	view of the network device list.
1532 */
1533
1534int register_netdevice_notifier(struct notifier_block *nb)
1535{
1536	struct net_device *dev;
1537	struct net_device *last;
1538	struct net *net;
1539	int err;
1540
 
 
1541	rtnl_lock();
1542	err = raw_notifier_chain_register(&netdev_chain, nb);
1543	if (err)
1544		goto unlock;
1545	if (dev_boot_phase)
1546		goto unlock;
1547	for_each_net(net) {
1548		for_each_netdev(net, dev) {
1549			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550			err = notifier_to_errno(err);
1551			if (err)
1552				goto rollback;
1553
1554			if (!(dev->flags & IFF_UP))
1555				continue;
1556
1557			call_netdevice_notifier(nb, NETDEV_UP, dev);
1558		}
1559	}
1560
1561unlock:
1562	rtnl_unlock();
 
1563	return err;
1564
1565rollback:
1566	last = dev;
1567	for_each_net(net) {
1568		for_each_netdev(net, dev) {
1569			if (dev == last)
1570				goto outroll;
1571
1572			if (dev->flags & IFF_UP) {
1573				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574							dev);
1575				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576			}
1577			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578		}
1579	}
1580
1581outroll:
1582	raw_notifier_chain_unregister(&netdev_chain, nb);
1583	goto unlock;
1584}
1585EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587/**
1588 *	unregister_netdevice_notifier - unregister a network notifier block
1589 *	@nb: notifier
1590 *
1591 *	Unregister a notifier previously registered by
1592 *	register_netdevice_notifier(). The notifier is unlinked into the
1593 *	kernel structures and may then be reused. A negative errno code
1594 *	is returned on a failure.
1595 *
1596 * 	After unregistering unregister and down device events are synthesized
1597 *	for all devices on the device list to the removed notifier to remove
1598 *	the need for special case cleanup code.
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
1603	struct net_device *dev;
1604	struct net *net;
1605	int err;
1606
 
 
1607	rtnl_lock();
1608	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609	if (err)
1610		goto unlock;
1611
1612	for_each_net(net) {
1613		for_each_netdev(net, dev) {
1614			if (dev->flags & IFF_UP) {
1615				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616							dev);
1617				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618			}
1619			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620		}
1621	}
1622unlock:
1623	rtnl_unlock();
 
1624	return err;
1625}
1626EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1628/**
1629 *	call_netdevice_notifiers_info - call all network notifier blocks
1630 *	@val: value passed unmodified to notifier function
1631 *	@dev: net_device pointer passed unmodified to notifier function
1632 *	@info: notifier information data
1633 *
1634 *	Call all network notifier blocks.  Parameters and return value
1635 *	are as for raw_notifier_call_chain().
1636 */
1637
1638static int call_netdevice_notifiers_info(unsigned long val,
1639					 struct net_device *dev,
1640					 struct netdev_notifier_info *info)
1641{
 
 
 
1642	ASSERT_RTNL();
1643	netdev_notifier_info_init(info, dev);
 
 
 
 
 
 
 
1644	return raw_notifier_call_chain(&netdev_chain, val, info);
1645}
1646
1647/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1648 *	call_netdevice_notifiers - call all network notifier blocks
1649 *      @val: value passed unmodified to notifier function
1650 *      @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 *	Call all network notifier blocks.  Parameters and return value
1653 *	are as for raw_notifier_call_chain().
1654 */
1655
1656int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657{
1658	struct netdev_notifier_info info;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1659
1660	return call_netdevice_notifiers_info(val, dev, &info);
1661}
1662EXPORT_SYMBOL(call_netdevice_notifiers);
1663
#ifdef CONFIG_NET_INGRESS
/* Static key counting the users that need ingress processing. */
static struct static_key ingress_needed __read_mostly;

/* Take one reference on the ingress static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on the ingress static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1679
#ifdef CONFIG_NET_EGRESS
/* Static key counting the users that need egress processing. */
static struct static_key egress_needed __read_mostly;

/* Take one reference on the egress static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on the egress static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1695
1696static struct static_key netstamp_needed __read_mostly;
1697#ifdef HAVE_JUMP_LABEL
 
1698static atomic_t netstamp_needed_deferred;
1699static atomic_t netstamp_wanted;
1700static void netstamp_clear(struct work_struct *work)
1701{
1702	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1703	int wanted;
1704
1705	wanted = atomic_add_return(deferred, &netstamp_wanted);
1706	if (wanted > 0)
1707		static_key_enable(&netstamp_needed);
1708	else
1709		static_key_disable(&netstamp_needed);
1710}
1711static DECLARE_WORK(netstamp_work, netstamp_clear);
1712#endif
1713
1714void net_enable_timestamp(void)
1715{
1716#ifdef HAVE_JUMP_LABEL
1717	int wanted;
1718
1719	while (1) {
1720		wanted = atomic_read(&netstamp_wanted);
1721		if (wanted <= 0)
1722			break;
1723		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1724			return;
1725	}
1726	atomic_inc(&netstamp_needed_deferred);
1727	schedule_work(&netstamp_work);
1728#else
1729	static_key_slow_inc(&netstamp_needed);
1730#endif
1731}
1732EXPORT_SYMBOL(net_enable_timestamp);
1733
1734void net_disable_timestamp(void)
1735{
1736#ifdef HAVE_JUMP_LABEL
1737	int wanted;
1738
1739	while (1) {
1740		wanted = atomic_read(&netstamp_wanted);
1741		if (wanted <= 1)
1742			break;
1743		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1744			return;
1745	}
1746	atomic_dec(&netstamp_needed_deferred);
1747	schedule_work(&netstamp_work);
1748#else
1749	static_key_slow_dec(&netstamp_needed);
1750#endif
1751}
1752EXPORT_SYMBOL(net_disable_timestamp);
1753
1754static inline void net_timestamp_set(struct sk_buff *skb)
1755{
1756	skb->tstamp = 0;
1757	if (static_key_false(&netstamp_needed))
1758		__net_timestamp(skb);
 
1759}
1760
/*
 * net_timestamp_check - stamp SKB when timestamping is enabled, COND holds
 * and SKB does not carry a timestamp yet.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement:
 * it can be used as the body of an if/else without capturing a following
 * "else", and it no longer ends in a line continuation that would pull the
 * next source line into the macro. Invoke it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp)		\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1766
1767bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1768{
1769	unsigned int len;
1770
1771	if (!(dev->flags & IFF_UP))
1772		return false;
1773
1774	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1775	if (skb->len <= len)
1776		return true;
1777
1778	/* if TSO is enabled, we don't care about the length as the packet
1779	 * could be forwarded without being segmented before
1780	 */
1781	if (skb_is_gso(skb))
1782		return true;
1783
1784	return false;
1785}
1786EXPORT_SYMBOL_GPL(is_skb_forwardable);
1787
1788int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 
1789{
1790	int ret = ____dev_forward_skb(dev, skb);
1791
1792	if (likely(!ret)) {
1793		skb->protocol = eth_type_trans(skb, dev);
1794		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1795	}
1796
1797	return ret;
1798}
 
 
 
 
 
1799EXPORT_SYMBOL_GPL(__dev_forward_skb);
1800
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * Typical use is injecting an skb from the start_xmit function of one
 * device into the receive queue of another device.
 *
 * The receiving device may live in another namespace, so all
 * information in the skb that could impact namespace isolation has to
 * be cleared on the way.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	/* Only queue the packet for reception if preparation succeeded. */
	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1824
 
 
 
 
 
1825static inline int deliver_skb(struct sk_buff *skb,
1826			      struct packet_type *pt_prev,
1827			      struct net_device *orig_dev)
1828{
1829	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1830		return -ENOMEM;
1831	atomic_inc(&skb->users);
1832	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1833}
1834
1835static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1836					  struct packet_type **pt,
1837					  struct net_device *orig_dev,
1838					  __be16 type,
1839					  struct list_head *ptype_list)
1840{
1841	struct packet_type *ptype, *pt_prev = *pt;
1842
1843	list_for_each_entry_rcu(ptype, ptype_list, list) {
1844		if (ptype->type != type)
1845			continue;
1846		if (pt_prev)
1847			deliver_skb(skb, pt_prev, orig_dev);
1848		pt_prev = ptype;
1849	}
1850	*pt = pt_prev;
1851}
1852
1853static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1854{
1855	if (!ptype->af_packet_priv || !skb->sk)
1856		return false;
1857
1858	if (ptype->id_match)
1859		return ptype->id_match(ptype, skb->sk);
1860	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1861		return true;
1862
1863	return false;
1864}
1865
 
 
 
 
 
 
 
 
 
 
 
1866/*
1867 *	Support routine. Sends outgoing frames to any network
1868 *	taps currently in use.
1869 */
1870
1871void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1872{
1873	struct packet_type *ptype;
1874	struct sk_buff *skb2 = NULL;
1875	struct packet_type *pt_prev = NULL;
1876	struct list_head *ptype_list = &ptype_all;
1877
1878	rcu_read_lock();
1879again:
1880	list_for_each_entry_rcu(ptype, ptype_list, list) {
 
 
 
1881		/* Never send packets back to the socket
1882		 * they originated from - MvS (miquels@drinkel.ow.org)
1883		 */
1884		if (skb_loop_sk(ptype, skb))
1885			continue;
1886
1887		if (pt_prev) {
1888			deliver_skb(skb2, pt_prev, skb->dev);
1889			pt_prev = ptype;
1890			continue;
1891		}
1892
1893		/* need to clone skb, done only once */
1894		skb2 = skb_clone(skb, GFP_ATOMIC);
1895		if (!skb2)
1896			goto out_unlock;
1897
1898		net_timestamp_set(skb2);
1899
1900		/* skb->nh should be correctly
1901		 * set by sender, so that the second statement is
1902		 * just protection against buggy protocols.
1903		 */
1904		skb_reset_mac_header(skb2);
1905
1906		if (skb_network_header(skb2) < skb2->data ||
1907		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1908			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1909					     ntohs(skb2->protocol),
1910					     dev->name);
1911			skb_reset_network_header(skb2);
1912		}
1913
1914		skb2->transport_header = skb2->network_header;
1915		skb2->pkt_type = PACKET_OUTGOING;
1916		pt_prev = ptype;
1917	}
1918
1919	if (ptype_list == &ptype_all) {
1920		ptype_list = &dev->ptype_all;
1921		goto again;
1922	}
1923out_unlock:
1924	if (pt_prev)
1925		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 
 
 
 
1926	rcu_read_unlock();
1927}
1928EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1929
1930/**
1931 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1932 * @dev: Network device
1933 * @txq: number of queues available
1934 *
1935 * If real_num_tx_queues is changed the tc mappings may no longer be
1936 * valid. To resolve this verify the tc mapping remains valid and if
1937 * not NULL the mapping. With no priorities mapping to this
1938 * offset/count pair it will no longer be used. In the worst case TC0
1939 * is invalid nothing can be done so disable priority mappings. If is
1940 * expected that drivers will fix this mapping if they can before
1941 * calling netif_set_real_num_tx_queues.
1942 */
1943static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944{
1945	int i;
1946	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1947
1948	/* If TC0 is invalidated disable TC mapping */
1949	if (tc->offset + tc->count > txq) {
1950		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1951		dev->num_tc = 0;
1952		return;
1953	}
1954
1955	/* Invalidated prio to tc mappings set to TC0 */
1956	for (i = 1; i < TC_BITMASK + 1; i++) {
1957		int q = netdev_get_prio_tc_map(dev, i);
1958
1959		tc = &dev->tc_to_txq[q];
1960		if (tc->offset + tc->count > txq) {
1961			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1962				i, q);
1963			netdev_set_prio_tc_map(dev, i, 0);
1964		}
1965	}
1966}
1967
1968int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1969{
1970	if (dev->num_tc) {
1971		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972		int i;
1973
 
1974		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1975			if ((txq - tc->offset) < tc->count)
1976				return i;
1977		}
1978
 
1979		return -1;
1980	}
1981
1982	return 0;
1983}
 
1984
1985#ifdef CONFIG_XPS
 
 
1986static DEFINE_MUTEX(xps_map_mutex);
1987#define xmap_dereference(P)		\
1988	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1989
1990static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1991			     int tci, u16 index)
1992{
1993	struct xps_map *map = NULL;
1994	int pos;
1995
1996	if (dev_maps)
1997		map = xmap_dereference(dev_maps->cpu_map[tci]);
1998	if (!map)
1999		return false;
2000
2001	for (pos = map->len; pos--;) {
2002		if (map->queues[pos] != index)
2003			continue;
2004
2005		if (map->len > 1) {
2006			map->queues[pos] = map->queues[--map->len];
2007			break;
2008		}
2009
2010		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
 
 
2011		kfree_rcu(map, rcu);
2012		return false;
2013	}
2014
2015	return true;
2016}
2017
2018static bool remove_xps_queue_cpu(struct net_device *dev,
2019				 struct xps_dev_maps *dev_maps,
2020				 int cpu, u16 offset, u16 count)
2021{
2022	int num_tc = dev->num_tc ? : 1;
2023	bool active = false;
2024	int tci;
2025
2026	for (tci = cpu * num_tc; num_tc--; tci++) {
2027		int i, j;
2028
2029		for (i = count, j = offset; i--; j++) {
2030			if (!remove_xps_queue(dev_maps, cpu, j))
2031				break;
2032		}
2033
2034		active |= i < 0;
2035	}
2036
2037	return active;
2038}
2039
2040static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2041				   u16 count)
 
 
 
 
 
 
 
 
 
 
 
 
 
2042{
2043	struct xps_dev_maps *dev_maps;
2044	int cpu, i;
2045	bool active = false;
 
2046
2047	mutex_lock(&xps_map_mutex);
2048	dev_maps = xmap_dereference(dev->xps_maps);
2049
2050	if (!dev_maps)
2051		goto out_no_maps;
2052
2053	for_each_possible_cpu(cpu)
2054		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2055					       offset, count);
2056
2057	if (!active) {
2058		RCU_INIT_POINTER(dev->xps_maps, NULL);
2059		kfree_rcu(dev_maps, rcu);
 
 
2060	}
 
2061
2062	for (i = offset + (count - 1); count--; i--)
2063		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2064					     NUMA_NO_NODE);
 
 
 
 
 
 
 
 
 
 
2065
2066out_no_maps:
2067	mutex_unlock(&xps_map_mutex);
 
2068}
2069
2070static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2071{
2072	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2073}
2074
2075static struct xps_map *expand_xps_map(struct xps_map *map,
2076				      int cpu, u16 index)
2077{
2078	struct xps_map *new_map;
2079	int alloc_len = XPS_MIN_MAP_ALLOC;
2080	int i, pos;
2081
2082	for (pos = 0; map && pos < map->len; pos++) {
2083		if (map->queues[pos] != index)
2084			continue;
2085		return map;
2086	}
2087
2088	/* Need to add queue to this CPU's existing map */
2089	if (map) {
2090		if (pos < map->alloc_len)
2091			return map;
2092
2093		alloc_len = map->alloc_len * 2;
2094	}
2095
2096	/* Need to allocate new map to store queue on this CPU's map */
2097	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2098			       cpu_to_node(cpu));
 
 
 
 
 
2099	if (!new_map)
2100		return NULL;
2101
2102	for (i = 0; i < pos; i++)
2103		new_map->queues[i] = map->queues[i];
2104	new_map->alloc_len = alloc_len;
2105	new_map->len = pos;
2106
2107	return new_map;
2108}
2109
2110int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2111			u16 index)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2112{
2113	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2114	int i, cpu, tci, numa_node_id = -2;
 
 
2115	int maps_sz, num_tc = 1, tc = 0;
2116	struct xps_map *map, *new_map;
2117	bool active = false;
 
 
2118
2119	if (dev->num_tc) {
 
2120		num_tc = dev->num_tc;
 
 
 
 
 
 
2121		tc = netdev_txq_to_tc(dev, index);
2122		if (tc < 0)
2123			return -EINVAL;
2124	}
2125
2126	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
 
 
 
 
 
 
 
 
 
 
 
 
2127	if (maps_sz < L1_CACHE_BYTES)
2128		maps_sz = L1_CACHE_BYTES;
2129
2130	mutex_lock(&xps_map_mutex);
2131
2132	dev_maps = xmap_dereference(dev->xps_maps);
 
 
 
 
 
2133
2134	/* allocate memory for queue storage */
2135	for_each_cpu_and(cpu, cpu_online_mask, mask) {
2136		if (!new_dev_maps)
 
2137			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2138		if (!new_dev_maps) {
2139			mutex_unlock(&xps_map_mutex);
2140			return -ENOMEM;
 
 
 
 
2141		}
2142
2143		tci = cpu * num_tc + tc;
2144		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2145				 NULL;
2146
2147		map = expand_xps_map(map, cpu, index);
2148		if (!map)
2149			goto error;
2150
2151		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2152	}
2153
2154	if (!new_dev_maps)
2155		goto out_no_new_maps;
2156
2157	for_each_possible_cpu(cpu) {
2158		/* copy maps belonging to foreign traffic classes */
2159		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2160			/* fill in the new device map from the old device map */
2161			map = xmap_dereference(dev_maps->cpu_map[tci]);
2162			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163		}
2164
2165		/* We need to explicitly update tci as prevous loop
2166		 * could break out early if dev_maps is NULL.
2167		 */
2168		tci = cpu * num_tc + tc;
 
 
 
 
2169
2170		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2171			/* add queue to CPU maps */
2172			int pos = 0;
2173
2174			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2175			while ((pos < map->len) && (map->queues[pos] != index))
2176				pos++;
2177
2178			if (pos == map->len)
2179				map->queues[map->len++] = index;
2180#ifdef CONFIG_NUMA
2181			if (numa_node_id == -2)
2182				numa_node_id = cpu_to_node(cpu);
2183			else if (numa_node_id != cpu_to_node(cpu))
2184				numa_node_id = -1;
 
 
2185#endif
2186		} else if (dev_maps) {
2187			/* fill in the new device map from the old device map */
2188			map = xmap_dereference(dev_maps->cpu_map[tci]);
2189			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2190		}
2191
2192		/* copy maps belonging to foreign traffic classes */
2193		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2194			/* fill in the new device map from the old device map */
2195			map = xmap_dereference(dev_maps->cpu_map[tci]);
2196			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2197		}
2198	}
2199
2200	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2201
2202	/* Cleanup old maps */
2203	if (!dev_maps)
2204		goto out_no_old_maps;
2205
2206	for_each_possible_cpu(cpu) {
2207		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2208			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2209			map = xmap_dereference(dev_maps->cpu_map[tci]);
2210			if (map && map != new_map)
2211				kfree_rcu(map, rcu);
 
 
 
 
 
 
 
 
2212		}
2213	}
2214
2215	kfree_rcu(dev_maps, rcu);
2216
2217out_no_old_maps:
2218	dev_maps = new_dev_maps;
2219	active = true;
2220
2221out_no_new_maps:
2222	/* update Tx queue numa node */
2223	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2224				     (numa_node_id >= 0) ? numa_node_id :
2225				     NUMA_NO_NODE);
 
2226
2227	if (!dev_maps)
2228		goto out_no_maps;
2229
2230	/* removes queue from unused CPUs */
2231	for_each_possible_cpu(cpu) {
2232		for (i = tc, tci = cpu * num_tc; i--; tci++)
2233			active |= remove_xps_queue(dev_maps, tci, index);
2234		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2235			active |= remove_xps_queue(dev_maps, tci, index);
2236		for (i = num_tc - tc, tci++; --i; tci++)
2237			active |= remove_xps_queue(dev_maps, tci, index);
 
 
 
 
 
 
2238	}
2239
 
 
 
2240	/* free map if not active */
2241	if (!active) {
2242		RCU_INIT_POINTER(dev->xps_maps, NULL);
2243		kfree_rcu(dev_maps, rcu);
2244	}
2245
2246out_no_maps:
2247	mutex_unlock(&xps_map_mutex);
2248
2249	return 0;
2250error:
2251	/* remove any maps that we added */
2252	for_each_possible_cpu(cpu) {
2253		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2254			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2255			map = dev_maps ?
2256			      xmap_dereference(dev_maps->cpu_map[tci]) :
2257			      NULL;
2258			if (new_map && new_map != map)
2259				kfree(new_map);
2260		}
2261	}
2262
2263	mutex_unlock(&xps_map_mutex);
2264
2265	kfree(new_dev_maps);
2266	return -ENOMEM;
2267}
 
 
 
 
 
 
 
 
 
 
 
 
 
2268EXPORT_SYMBOL(netif_set_xps_queue);
2269
2270#endif
 
 
 
 
 
 
 
 
 
 
 
2271void netdev_reset_tc(struct net_device *dev)
2272{
2273#ifdef CONFIG_XPS
2274	netif_reset_xps_queues_gt(dev, 0);
2275#endif
 
 
 
2276	dev->num_tc = 0;
2277	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2278	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2279}
2280EXPORT_SYMBOL(netdev_reset_tc);
2281
2282int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2283{
2284	if (tc >= dev->num_tc)
2285		return -EINVAL;
2286
2287#ifdef CONFIG_XPS
2288	netif_reset_xps_queues(dev, offset, count);
2289#endif
2290	dev->tc_to_txq[tc].count = count;
2291	dev->tc_to_txq[tc].offset = offset;
2292	return 0;
2293}
2294EXPORT_SYMBOL(netdev_set_tc_queue);
2295
2296int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2297{
2298	if (num_tc > TC_MAX_QUEUE)
2299		return -EINVAL;
2300
2301#ifdef CONFIG_XPS
2302	netif_reset_xps_queues_gt(dev, 0);
2303#endif
 
 
2304	dev->num_tc = num_tc;
2305	return 0;
2306}
2307EXPORT_SYMBOL(netdev_set_num_tc);
2308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2309/*
2310 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2311 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2312 */
2313int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2314{
 
2315	int rc;
2316
 
 
2317	if (txq < 1 || txq > dev->num_tx_queues)
2318		return -EINVAL;
2319
2320	if (dev->reg_state == NETREG_REGISTERED ||
2321	    dev->reg_state == NETREG_UNREGISTERING) {
2322		ASSERT_RTNL();
2323
2324		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2325						  txq);
2326		if (rc)
2327			return rc;
2328
2329		if (dev->num_tc)
2330			netif_setup_tc(dev, txq);
2331
2332		if (txq < dev->real_num_tx_queues) {
 
 
 
 
 
2333			qdisc_reset_all_tx_gt(dev, txq);
2334#ifdef CONFIG_XPS
2335			netif_reset_xps_queues_gt(dev, txq);
2336#endif
2337		}
 
 
2338	}
2339
2340	dev->real_num_tx_queues = txq;
2341	return 0;
2342}
2343EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2344
2345#ifdef CONFIG_SYSFS
2346/**
2347 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2348 *	@dev: Network device
2349 *	@rxq: Actual number of RX queues
2350 *
2351 *	This must be called either with the rtnl_lock held or before
2352 *	registration of the net device.  Returns 0 on success, or a
2353 *	negative error code.  If called before registration, it always
2354 *	succeeds.
2355 */
2356int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2357{
2358	int rc;
2359
2360	if (rxq < 1 || rxq > dev->num_rx_queues)
2361		return -EINVAL;
2362
2363	if (dev->reg_state == NETREG_REGISTERED) {
2364		ASSERT_RTNL();
2365
2366		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2367						  rxq);
2368		if (rc)
2369			return rc;
2370	}
2371
2372	dev->real_num_rx_queues = rxq;
2373	return 0;
2374}
2375EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2376#endif
2377
2378/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2379 * netif_get_num_default_rss_queues - default number of RSS queues
2380 *
2381 * This routine should set an upper limit on the number of RSS queues
2382 * used by default by multiqueue devices.
2383 */
2384int netif_get_num_default_rss_queues(void)
2385{
2386	return is_kdump_kernel() ?
2387		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 
 
 
 
 
 
 
 
 
 
 
 
2388}
2389EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2390
2391static void __netif_reschedule(struct Qdisc *q)
2392{
2393	struct softnet_data *sd;
2394	unsigned long flags;
2395
2396	local_irq_save(flags);
2397	sd = this_cpu_ptr(&softnet_data);
2398	q->next_sched = NULL;
2399	*sd->output_queue_tailp = q;
2400	sd->output_queue_tailp = &q->next_sched;
2401	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2402	local_irq_restore(flags);
2403}
2404
2405void __netif_schedule(struct Qdisc *q)
2406{
2407	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2408		__netif_reschedule(q);
2409}
2410EXPORT_SYMBOL(__netif_schedule);
2411
2412struct dev_kfree_skb_cb {
2413	enum skb_free_reason reason;
2414};
2415
2416static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2417{
2418	return (struct dev_kfree_skb_cb *)skb->cb;
2419}
2420
2421void netif_schedule_queue(struct netdev_queue *txq)
2422{
2423	rcu_read_lock();
2424	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2425		struct Qdisc *q = rcu_dereference(txq->qdisc);
2426
2427		__netif_schedule(q);
2428	}
2429	rcu_read_unlock();
2430}
2431EXPORT_SYMBOL(netif_schedule_queue);
2432
2433/**
2434 *	netif_wake_subqueue - allow sending packets on subqueue
2435 *	@dev: network device
2436 *	@queue_index: sub queue index
2437 *
2438 * Resume individual transmit queue of a device with multiple transmit queues.
2439 */
2440void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2441{
2442	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2443
2444	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2445		struct Qdisc *q;
2446
2447		rcu_read_lock();
2448		q = rcu_dereference(txq->qdisc);
2449		__netif_schedule(q);
2450		rcu_read_unlock();
2451	}
2452}
2453EXPORT_SYMBOL(netif_wake_subqueue);
2454
2455void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2456{
2457	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2458		struct Qdisc *q;
2459
2460		rcu_read_lock();
2461		q = rcu_dereference(dev_queue->qdisc);
2462		__netif_schedule(q);
2463		rcu_read_unlock();
2464	}
2465}
2466EXPORT_SYMBOL(netif_tx_wake_queue);
2467
2468void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2469{
2470	unsigned long flags;
2471
2472	if (likely(atomic_read(&skb->users) == 1)) {
 
 
 
2473		smp_rmb();
2474		atomic_set(&skb->users, 0);
2475	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2476		return;
2477	}
2478	get_kfree_skb_cb(skb)->reason = reason;
2479	local_irq_save(flags);
2480	skb->next = __this_cpu_read(softnet_data.completion_queue);
2481	__this_cpu_write(softnet_data.completion_queue, skb);
2482	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2483	local_irq_restore(flags);
2484}
2485EXPORT_SYMBOL(__dev_kfree_skb_irq);
2486
2487void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2488{
2489	if (in_irq() || irqs_disabled())
2490		__dev_kfree_skb_irq(skb, reason);
2491	else
2492		dev_kfree_skb(skb);
2493}
2494EXPORT_SYMBOL(__dev_kfree_skb_any);
2495
2496
2497/**
2498 * netif_device_detach - mark device as removed
2499 * @dev: network device
2500 *
2501 * Mark device as removed from system and therefore no longer available.
2502 */
2503void netif_device_detach(struct net_device *dev)
2504{
2505	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2506	    netif_running(dev)) {
2507		netif_tx_stop_all_queues(dev);
2508	}
2509}
2510EXPORT_SYMBOL(netif_device_detach);
2511
2512/**
2513 * netif_device_attach - mark device as attached
2514 * @dev: network device
2515 *
2516 * Mark device as attached from system and restart if needed.
2517 */
2518void netif_device_attach(struct net_device *dev)
2519{
2520	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2521	    netif_running(dev)) {
2522		netif_tx_wake_all_queues(dev);
2523		__netdev_watchdog_up(dev);
2524	}
2525}
2526EXPORT_SYMBOL(netif_device_attach);
2527
2528/*
2529 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2530 * to be used as a distribution range.
2531 */
2532u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2533		  unsigned int num_tx_queues)
 
2534{
2535	u32 hash;
2536	u16 qoffset = 0;
2537	u16 qcount = num_tx_queues;
 
 
 
 
 
 
 
 
 
 
 
 
 
2538
2539	if (skb_rx_queue_recorded(skb)) {
 
2540		hash = skb_get_rx_queue(skb);
2541		while (unlikely(hash >= num_tx_queues))
2542			hash -= num_tx_queues;
2543		return hash;
2544	}
2545
2546	if (dev->num_tc) {
2547		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2548		qoffset = dev->tc_to_txq[tc].offset;
2549		qcount = dev->tc_to_txq[tc].count;
2550	}
2551
2552	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2553}
2554EXPORT_SYMBOL(__skb_tx_hash);
2555
2556static void skb_warn_bad_offload(const struct sk_buff *skb)
2557{
2558	static const netdev_features_t null_features;
2559	struct net_device *dev = skb->dev;
2560	const char *name = "";
2561
2562	if (!net_ratelimit())
2563		return;
2564
2565	if (dev) {
2566		if (dev->dev.parent)
2567			name = dev_driver_string(dev->dev.parent);
2568		else
2569			name = netdev_name(dev);
2570	}
2571	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2572	     "gso_type=%d ip_summed=%d\n",
2573	     name, dev ? &dev->features : &null_features,
2574	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2575	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2576	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2577}
2578
2579/*
2580 * Invalidate hardware checksum when packet is to be mangled, and
2581 * complete checksum manually on outgoing path.
2582 */
2583int skb_checksum_help(struct sk_buff *skb)
2584{
2585	__wsum csum;
2586	int ret = 0, offset;
2587
2588	if (skb->ip_summed == CHECKSUM_COMPLETE)
2589		goto out_set_summed;
2590
2591	if (unlikely(skb_shinfo(skb)->gso_size)) {
2592		skb_warn_bad_offload(skb);
2593		return -EINVAL;
2594	}
2595
2596	/* Before computing a checksum, we should make sure no frag could
2597	 * be modified by an external entity : checksum could be wrong.
2598	 */
2599	if (skb_has_shared_frag(skb)) {
2600		ret = __skb_linearize(skb);
2601		if (ret)
2602			goto out;
2603	}
2604
2605	offset = skb_checksum_start_offset(skb);
2606	BUG_ON(offset >= skb_headlen(skb));
 
 
 
 
 
 
2607	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2608
2609	offset += skb->csum_offset;
2610	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2611
2612	if (skb_cloned(skb) &&
2613	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2614		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2615		if (ret)
2616			goto out;
2617	}
 
 
 
2618
2619	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2620out_set_summed:
2621	skb->ip_summed = CHECKSUM_NONE;
2622out:
2623	return ret;
2624}
2625EXPORT_SYMBOL(skb_checksum_help);
2626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629	__be16 type = skb->protocol;
2630
2631	/* Tunnel gso handlers can set protocol to ethernet. */
2632	if (type == htons(ETH_P_TEB)) {
2633		struct ethhdr *eth;
2634
2635		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636			return 0;
2637
2638		eth = (struct ethhdr *)skb_mac_header(skb);
2639		type = eth->h_proto;
2640	}
2641
2642	return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 *	skb_mac_gso_segment - mac layer segmentation handler.
2647 *	@skb: buffer to segment
2648 *	@features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651				    netdev_features_t features)
2652{
2653	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654	struct packet_offload *ptype;
2655	int vlan_depth = skb->mac_len;
2656	__be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658	if (unlikely(!type))
2659		return ERR_PTR(-EINVAL);
2660
2661	__skb_pull(skb, vlan_depth);
2662
2663	rcu_read_lock();
2664	list_for_each_entry_rcu(ptype, &offload_base, list) {
2665		if (ptype->type == type && ptype->callbacks.gso_segment) {
2666			segs = ptype->callbacks.gso_segment(skb, features);
2667			break;
2668		}
2669	}
2670	rcu_read_unlock();
2671
2672	__skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674	return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683	if (tx_path)
2684		return skb->ip_summed != CHECKSUM_PARTIAL;
2685	else
2686		return skb->ip_summed == CHECKSUM_NONE;
2687}
2688
2689/**
2690 *	__skb_gso_segment - Perform segmentation on skb.
2691 *	@skb: buffer to segment
2692 *	@features: features for the output path (see dev->features)
2693 *	@tx_path: whether it is called in TX path
2694 *
2695 *	This function segments the given skb and returns a list of segments.
2696 *
2697 *	It may return NULL if the skb requires no segmentation.  This is
2698 *	only possible when GSO is used for verifying header integrity.
2699 *
2700 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703				  netdev_features_t features, bool tx_path)
2704{
2705	if (unlikely(skb_needs_check(skb, tx_path))) {
2706		int err;
2707
2708		skb_warn_bad_offload(skb);
2709
2710		err = skb_cow_head(skb, 0);
2711		if (err < 0)
2712			return ERR_PTR(err);
2713	}
2714
2715	/* Only report GSO partial support if it will enable us to
2716	 * support segmentation on this frame without needing additional
2717	 * work.
2718	 */
2719	if (features & NETIF_F_GSO_PARTIAL) {
2720		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721		struct net_device *dev = skb->dev;
2722
2723		partial_features |= dev->features & dev->gso_partial_features;
2724		if (!skb_gso_ok(skb, features | partial_features))
2725			features &= ~NETIF_F_GSO_PARTIAL;
2726	}
2727
2728	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732	SKB_GSO_CB(skb)->encap_level = 0;
2733
2734	skb_reset_mac_header(skb);
2735	skb_reset_mac_len(skb);
2736
2737	return skb_mac_gso_segment(skb, features);
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
2740
/* Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2752
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb has a page fragment that @dev must not be handed:
 * a highmem page while the device lacks NETIF_F_HIGHDMA, or (when
 * PCI_DMA_BUS_IS_PHYS) a page lying beyond the parent device's dma_mask
 * or a parent without a dma_mask.  Returns 0 otherwise, and always 0
 * without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *parent;
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (idx = 0; idx < shinfo->nr_frags; idx++) {
		skb_frag_t *frag = &shinfo->frags[idx];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2785
2786/* If MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of standard features for the netdev.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791					   netdev_features_t features,
2792					   __be16 type)
2793{
2794	if (eth_p_mpls(type))
2795		features &= skb->dev->mpls_features;
2796
2797	return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801					   netdev_features_t features,
2802					   __be16 type)
2803{
2804	return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809	netdev_features_t features)
2810{
2811	int tmp;
2812	__be16 type;
2813
2814	type = skb_network_protocol(skb, &tmp);
2815	features = net_mpls_features(skb, features, type);
2816
2817	if (skb->ip_summed != CHECKSUM_NONE &&
2818	    !can_checksum_protocol(features, type)) {
2819		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820	}
2821	if (illegal_highdma(skb->dev, skb))
2822		features &= ~NETIF_F_SG;
2823
2824	return features;
2825}
2826
2827netdev_features_t passthru_features_check(struct sk_buff *skb,
2828					  struct net_device *dev,
2829					  netdev_features_t features)
2830{
2831	return features;
2832}
2833EXPORT_SYMBOL(passthru_features_check);
2834
2835static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836					     struct net_device *dev,
2837					     netdev_features_t features)
2838{
2839	return vlan_features_check(skb, features);
2840}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843					    struct net_device *dev,
2844					    netdev_features_t features)
2845{
2846	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848	if (gso_segs > dev->gso_max_segs)
2849		return features & ~NETIF_F_GSO_MASK;
2850
 
 
 
 
 
 
 
 
2851	/* Support for GSO partial features requires software
2852	 * intervention before we can actually process the packets
2853	 * so we need to strip support for any partial features now
2854	 * and we can pull them back in after we have partially
2855	 * segmented the frame.
2856	 */
2857	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858		features &= ~dev->gso_partial_features;
2859
2860	/* Make sure to clear the IPv4 ID mangling feature if the
2861	 * IPv4 header has the potential to be fragmented.
2862	 */
2863	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864		struct iphdr *iph = skb->encapsulation ?
2865				    inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867		if (!(iph->frag_off & htons(IP_DF)))
2868			features &= ~NETIF_F_TSO_MANGLEID;
2869	}
2870
2871	return features;
2872}
2873
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876	struct net_device *dev = skb->dev;
2877	netdev_features_t features = dev->features;
2878
2879	if (skb_is_gso(skb))
2880		features = gso_features_check(skb, dev, features);
2881
2882	/* If encapsulation offload request, verify we are testing
2883	 * hardware encapsulation features instead of standard
2884	 * features for the netdev
2885	 */
2886	if (skb->encapsulation)
2887		features &= dev->hw_enc_features;
2888
2889	if (skb_vlan_tagged(skb))
2890		features = netdev_intersect_features(features,
2891						     dev->vlan_features |
2892						     NETIF_F_HW_VLAN_CTAG_TX |
2893						     NETIF_F_HW_VLAN_STAG_TX);
2894
2895	if (dev->netdev_ops->ndo_features_check)
2896		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897								features);
2898	else
2899		features &= dflt_features_check(skb, dev, features);
2900
2901	return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906		    struct netdev_queue *txq, bool more)
2907{
2908	unsigned int len;
2909	int rc;
2910
2911	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912		dev_queue_xmit_nit(skb, dev);
2913
2914	len = skb->len;
2915	trace_net_dev_start_xmit(skb, dev);
2916	rc = netdev_start_xmit(skb, dev, txq, more);
2917	trace_net_dev_xmit(skb, rc, dev, len);
2918
2919	return rc;
2920}
2921
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923				    struct netdev_queue *txq, int *ret)
2924{
2925	struct sk_buff *skb = first;
2926	int rc = NETDEV_TX_OK;
2927
2928	while (skb) {
2929		struct sk_buff *next = skb->next;
2930
2931		skb->next = NULL;
2932		rc = xmit_one(skb, dev, txq, next != NULL);
2933		if (unlikely(!dev_xmit_complete(rc))) {
2934			skb->next = next;
2935			goto out;
2936		}
2937
2938		skb = next;
2939		if (netif_xmit_stopped(txq) && skb) {
2940			rc = NETDEV_TX_BUSY;
2941			break;
2942		}
2943	}
2944
2945out:
2946	*ret = rc;
2947	return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951					  netdev_features_t features)
2952{
2953	if (skb_vlan_tag_present(skb) &&
2954	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2955		skb = __vlan_hwaccel_push_inside(skb);
2956	return skb;
2957}
2958
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2960{
2961	netdev_features_t features;
2962
2963	features = netif_skb_features(skb);
2964	skb = validate_xmit_vlan(skb, features);
2965	if (unlikely(!skb))
2966		goto out_null;
2967
 
 
 
 
2968	if (netif_needs_gso(skb, features)) {
2969		struct sk_buff *segs;
2970
2971		segs = skb_gso_segment(skb, features);
2972		if (IS_ERR(segs)) {
2973			goto out_kfree_skb;
2974		} else if (segs) {
2975			consume_skb(skb);
2976			skb = segs;
2977		}
2978	} else {
2979		if (skb_needs_linearize(skb, features) &&
2980		    __skb_linearize(skb))
2981			goto out_kfree_skb;
2982
2983		/* If packet is not checksummed and device does not
2984		 * support checksumming for this protocol, complete
2985		 * checksumming here.
2986		 */
2987		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988			if (skb->encapsulation)
2989				skb_set_inner_transport_header(skb,
2990							       skb_checksum_start_offset(skb));
2991			else
2992				skb_set_transport_header(skb,
2993							 skb_checksum_start_offset(skb));
2994			if (!(features & NETIF_F_CSUM_MASK) &&
2995			    skb_checksum_help(skb))
2996				goto out_kfree_skb;
2997		}
2998	}
2999
 
 
3000	return skb;
3001
3002out_kfree_skb:
3003	kfree_skb(skb);
3004out_null:
3005	atomic_long_inc(&dev->tx_dropped);
3006	return NULL;
3007}
3008
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
3011	struct sk_buff *next, *head = NULL, *tail;
3012
3013	for (; skb != NULL; skb = next) {
3014		next = skb->next;
3015		skb->next = NULL;
3016
3017		/* in case skb wont be segmented, point to itself */
3018		skb->prev = skb;
3019
3020		skb = validate_xmit_skb(skb, dev);
3021		if (!skb)
3022			continue;
3023
3024		if (!head)
3025			head = skb;
3026		else
3027			tail->next = skb;
3028		/* If skb was segmented, skb->prev points to
3029		 * the last segment. If not, it still contains skb.
3030		 */
3031		tail = skb->prev;
3032	}
3033	return head;
3034}
3035EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3036
3037static void qdisc_pkt_len_init(struct sk_buff *skb)
3038{
3039	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3040
3041	qdisc_skb_cb(skb)->pkt_len = skb->len;
3042
3043	/* To get more precise estimation of bytes sent on wire,
3044	 * we add to pkt_len the headers size of all segments
3045	 */
3046	if (shinfo->gso_size)  {
 
3047		unsigned int hdr_len;
3048		u16 gso_segs = shinfo->gso_segs;
3049
3050		/* mac layer + network layer */
3051		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3052
3053		/* + transport layer */
3054		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3055			hdr_len += tcp_hdrlen(skb);
3056		else
3057			hdr_len += sizeof(struct udphdr);
 
 
 
 
 
 
 
 
 
 
 
3058
3059		if (shinfo->gso_type & SKB_GSO_DODGY)
3060			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3061						shinfo->gso_size);
3062
3063		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3064	}
3065}
3066
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * __dev_xmit_skb - hand @skb to qdisc @q under the qdisc root lock.
 *
 * One of three things happens while the root lock is held:
 *  - the qdisc is deactivated: the skb is dropped;
 *  - the qdisc has TCQ_F_CAN_BYPASS, is empty and qdisc_run_begin()
 *    succeeds: the skb is sent directly with sch_direct_xmit();
 *  - otherwise the skb is enqueued with q->enqueue() and the qdisc is
 *    run if qdisc_run_begin() succeeds.
 *
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the q->enqueue() result
 * masked with NET_XMIT_MASK, respectively.
 */
3067static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3068				 struct net_device *dev,
3069				 struct netdev_queue *txq)
3070{
3071	spinlock_t *root_lock = qdisc_lock(q);
3072	struct sk_buff *to_free = NULL;
3073	bool contended;
3074	int rc;
3075
3076	qdisc_calculate_pkt_len(skb, q);
3077	/*
3078	 * Heuristic to force contended enqueues to serialize on a
3079	 * separate lock before trying to get qdisc main lock.
3080	 * This permits qdisc->running owner to get the lock more
3081	 * often and dequeue packets faster.
3082	 */
3083	contended = qdisc_is_running(q);
3084	if (unlikely(contended))
3085		spin_lock(&q->busylock);
3086
3087	spin_lock(root_lock);
3088	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3089		__qdisc_drop(skb, &to_free);
3090		rc = NET_XMIT_DROP;
3091	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3092		   qdisc_run_begin(q)) {
3093		/*
3094		 * This is a work-conserving queue; there are no old skbs
3095		 * waiting to be sent out; and the qdisc is not running -
3096		 * xmit the skb directly.
3097		 */
3098
3099		qdisc_bstats_update(q, skb);
3100
3101		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* busylock is released before running the qdisc */
3102			if (unlikely(contended)) {
3103				spin_unlock(&q->busylock);
3104				contended = false;
3105			}
3106			__qdisc_run(q);
3107		} else
3108			qdisc_run_end(q);
3109
3110		rc = NET_XMIT_SUCCESS;
3111	} else {
3112		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3113		if (qdisc_run_begin(q)) {
3114			if (unlikely(contended)) {
3115				spin_unlock(&q->busylock);
3116				contended = false;
3117			}
3118			__qdisc_run(q);
3119		}
3120	}
3121	spin_unlock(root_lock);
	/* buffers collected on to_free are freed after the root lock is dropped */
3122	if (unlikely(to_free))
3123		kfree_skb_list(to_free);
	/* still set only if busylock was taken and not released above */
3124	if (unlikely(contended))
3125		spin_unlock(&q->busylock);
3126	return rc;
3127}
3128
3129#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3130static void skb_update_prio(struct sk_buff *skb)
3131{
3132	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
 
 
3133
3134	if (!skb->priority && skb->sk && map) {
3135		unsigned int prioidx =
3136			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
 
 
 
 
 
3137
3138		if (prioidx < map->priomap_len)
3139			skb->priority = map->priomap[prioidx];
3140	}
 
3141}
3142#else
3143#define skb_update_prio(skb)
3144#endif
3145
/* Per-CPU depth counter for nested transmits on queue-less devices:
 * __dev_queue_xmit() increments it around dev_hard_start_xmit() and
 * refuses to transmit once it exceeds XMIT_RECURSION_LIMIT.
 */
3146DEFINE_PER_CPU(int, xmit_recursion);
3147EXPORT_SYMBOL(xmit_recursion);
3148
3149/**
3150 *	dev_loopback_xmit - loop back @skb
3151 *	@net: network namespace this loopback is happening in
3152 *	@sk:  sk needed to be a netfilter okfn
3153 *	@skb: buffer to transmit
3154 */
3155int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3156{
3157	skb_reset_mac_header(skb);
3158	__skb_pull(skb, skb_network_offset(skb));
3159	skb->pkt_type = PACKET_LOOPBACK;
3160	skb->ip_summed = CHECKSUM_UNNECESSARY;
3161	WARN_ON(!skb_dst(skb));
 
3162	skb_dst_force(skb);
3163	netif_rx_ni(skb);
3164	return 0;
3165}
3166EXPORT_SYMBOL(dev_loopback_xmit);
3167
#ifdef CONFIG_NET_EGRESS
/* Run the egress classifier chain of @dev on @skb.
 *
 * Returns @skb when transmission should continue (no chain, or the
 * verdict is OK/RECLASSIFY/anything unrecognised).  Returns NULL when the
 * buffer was dropped, consumed or redirected; in that case *@ret holds
 * the status the caller should report.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *chain = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result res;
	int verdict;

	if (!chain)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(chain->q, skb);

	verdict = tc_classify(skb, chain, &res, false);

	if (verdict == TC_ACT_OK || verdict == TC_ACT_RECLASSIFY) {
		skb->tc_index = TC_H_MIN(res.classid);
		return skb;
	}

	if (verdict == TC_ACT_SHOT) {
		qdisc_qstats_cpu_drop(chain->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	}

	if (verdict == TC_ACT_STOLEN || verdict == TC_ACT_QUEUED) {
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	}

	if (verdict == TC_ACT_REDIRECT) {
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3210
/* Look up a transmit queue for @skb in the XPS maps of @dev.
 *
 * The map is indexed by the sending CPU (skb->sender_cpu - 1), scaled
 * and offset by traffic class when the device has num_tc set.  A map
 * with several queues is indexed by the flow hash.
 *
 * Returns the queue index, or -1 when XPS is not compiled in, no map
 * applies, or the mapped queue is not below real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *qmap;
	unsigned int tci;
	int picked = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	tci = skb->sender_cpu - 1;
	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	qmap = rcu_dereference(maps->cpu_map[tci]);
	if (!qmap)
		goto unlock;

	if (qmap->len == 1)
		picked = qmap->queues[0];
	else
		picked = qmap->queues[reciprocal_scale(skb_get_hash(skb),
						       qmap->len)];

	if (unlikely(picked >= dev->real_num_tx_queues))
		picked = -1;

unlock:
	rcu_read_unlock();

	return picked;
#else
	return -1;
#endif
}
3246
3247static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3248{
3249	struct sock *sk = skb->sk;
3250	int queue_index = sk_tx_queue_get(sk);
3251
 
 
3252	if (queue_index < 0 || skb->ooo_okay ||
3253	    queue_index >= dev->real_num_tx_queues) {
3254		int new_index = get_xps_queue(dev, skb);
 
3255		if (new_index < 0)
3256			new_index = skb_tx_hash(dev, skb);
3257
3258		if (queue_index != new_index && sk &&
3259		    sk_fullsock(sk) &&
3260		    rcu_access_pointer(sk->sk_dst_cache))
3261			sk_tx_queue_set(sk, new_index);
3262
3263		queue_index = new_index;
3264	}
3265
3266	return queue_index;
3267}
 
3268
3269struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3270				    struct sk_buff *skb,
3271				    void *accel_priv)
3272{
3273	int queue_index = 0;
3274
3275#ifdef CONFIG_XPS
3276	u32 sender_cpu = skb->sender_cpu - 1;
3277
3278	if (sender_cpu >= (u32)NR_CPUS)
3279		skb->sender_cpu = raw_smp_processor_id() + 1;
3280#endif
3281
3282	if (dev->real_num_tx_queues != 1) {
3283		const struct net_device_ops *ops = dev->netdev_ops;
 
3284		if (ops->ndo_select_queue)
3285			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3286							    __netdev_pick_tx);
3287		else
3288			queue_index = __netdev_pick_tx(dev, skb);
3289
3290		if (!accel_priv)
3291			queue_index = netdev_cap_txqueue(dev, queue_index);
3292	}
3293
3294	skb_set_queue_mapping(skb, queue_index);
3295	return netdev_get_tx_queue(dev, queue_index);
3296}
3297
3298/**
3299 *	__dev_queue_xmit - transmit a buffer
3300 *	@skb: buffer to transmit
3301 *	@accel_priv: private data used for L2 forwarding offload
3302 *
3303 *	Queue a buffer for transmission to a network device. The caller must
3304 *	have set the device and priority and built the buffer before calling
3305 *	this function. The function can be called from an interrupt.
3306 *
3307 *	A negative errno code is returned on a failure. A success does not
3308 *	guarantee the frame will be transmitted as it may be dropped due
3309 *	to congestion or traffic shaping.
3310 *
3311 * -----------------------------------------------------------------------------------
3312 *      I notice this method can also return errors from the queue disciplines,
3313 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3314 *      be positive.
3315 *
3316 *      Regardless of the return value, the skb is consumed, so it is currently
3317 *      difficult to retry a send to this method.  (You can bump the ref count
3318 *      before sending to hold a reference for retry if you are careful.)
3319 *
3320 *      When calling this method, interrupts MUST be enabled.  This is because
3321 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3322 *          --BLG
 *
 *	-ENETDOWN is returned, the buffer freed and dev->tx_dropped
 *	incremented, when a queue-less device is not up, the transmit
 *	recursion limit is hit, or the queue-less transmit does not complete.
3323 */
3324static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3325{
3326	struct net_device *dev = skb->dev;
3327	struct netdev_queue *txq;
3328	struct Qdisc *q;
3329	int rc = -ENOMEM;
3330
3331	skb_reset_mac_header(skb);
3332
3333	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3334		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3335
3336	/* Disable soft irqs for various locks below. Also
3337	 * stops preemption for RCU.
3338	 */
3339	rcu_read_lock_bh();
3340
3341	skb_update_prio(skb);
3342
3343	qdisc_pkt_len_init(skb);
3344#ifdef CONFIG_NET_CLS_ACT
3345	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3346# ifdef CONFIG_NET_EGRESS
3347	if (static_key_false(&egress_needed)) {
		/* a NULL return means the classifier took the skb; rc was set */
3348		skb = sch_handle_egress(skb, &rc, dev);
3349		if (!skb)
3350			goto out;
3351	}
3352# endif
3353#endif
3354	/* If device/qdisc don't need skb->dst, release it right now while
3355	 * it's hot in this cpu cache.
3356	 */
3357	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3358		skb_dst_drop(skb);
3359	else
3360		skb_dst_force(skb);
3361
3362	txq = netdev_pick_tx(dev, skb, accel_priv);
3363	q = rcu_dereference_bh(txq->qdisc);
3364
3365	trace_net_dev_queue(skb);
	/* a qdisc with an enqueue method takes the normal queueing path */
3366	if (q->enqueue) {
3367		rc = __dev_xmit_skb(skb, q, dev, txq);
3368		goto out;
3369	}
3370
3371	/* The device has no queue. Common case for software devices:
3372	   loopback, all the sorts of tunnels...
3373
3374	   Really, it is unlikely that netif_tx_lock protection is necessary
3375	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3376	   counters.)
3377	   However, it is possible, that they rely on protection
3378	   made by us here.
3379
3380	   Check this and shot the lock. It is not prone from deadlocks.
3381	   Either shot noqueue qdisc, it is even simpler 8)
3382	 */
3383	if (dev->flags & IFF_UP) {
3384		int cpu = smp_processor_id(); /* ok because BHs are off */
3385
		/* this CPU already owning the xmit lock means we recursed */
3386		if (txq->xmit_lock_owner != cpu) {
3387			if (unlikely(__this_cpu_read(xmit_recursion) >
3388				     XMIT_RECURSION_LIMIT))
3389				goto recursion_alert;
3390
3391			skb = validate_xmit_skb(skb, dev);
3392			if (!skb)
3393				goto out;
3394
3395			HARD_TX_LOCK(dev, txq, cpu);
3396
3397			if (!netif_xmit_stopped(txq)) {
3398				__this_cpu_inc(xmit_recursion);
3399				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3400				__this_cpu_dec(xmit_recursion);
3401				if (dev_xmit_complete(rc)) {
3402					HARD_TX_UNLOCK(dev, txq);
3403					goto out;
3404				}
3405			}
3406			HARD_TX_UNLOCK(dev, txq);
3407			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3408					     dev->name);
3409		} else {
3410			/* Recursion is detected! It is possible,
3411			 * unfortunately
3412			 */
3413recursion_alert:
3414			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3415					     dev->name);
3416		}
3417	}
3418
	/* failure exit: whatever is left of the skb list is freed here */
3419	rc = -ENETDOWN;
3420	rcu_read_unlock_bh();
3421
3422	atomic_long_inc(&dev->tx_dropped);
3423	kfree_skb_list(skb);
3424	return rc;
3425out:
3426	rcu_read_unlock_bh();
3427	return rc;
3428}
 
3429
3430int dev_queue_xmit(struct sk_buff *skb)
3431{
3432	return __dev_queue_xmit(skb, NULL);
3433}
3434EXPORT_SYMBOL(dev_queue_xmit);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3435
/**
 *	dev_queue_xmit_accel - transmit a buffer with offload private data
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Calls __dev_queue_xmit() with @accel_priv and returns its result.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int status = __dev_queue_xmit(skb, accel_priv);

	return status;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3441
3442
3443/*=======================================================================
3444			Receiver routines
3445  =======================================================================*/
3446
/* Upper bound on the length of a per-CPU input_pkt_queue: once the queue
 * is longer than this, enqueue_to_backlog() drops.  skb_flow_limit() only
 * starts limiting once the queue has reached half of this value.
 */
3447int netdev_max_backlog __read_mostly = 1000;
3448EXPORT_SYMBOL(netdev_max_backlog);
3449
/* Passed to net_timestamp_check() by netif_rx_internal(). */
3450int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): netdev_budget and weight_p are not used in this part of
 * the file; presumably receive softirq budget and poll weight - confirm
 * against their users.
 */
3451int netdev_budget __read_mostly = 300;
3452int weight_p __read_mostly = 64;            /* old backlog weight */
 
 
 
 
 
 
3453
3454/* Called with irq disabled */
3455static inline void ____napi_schedule(struct softnet_data *sd,
3456				     struct napi_struct *napi)
3457{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3458	list_add_tail(&napi->poll_list, &sd->poll_list);
3459	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 
 
 
 
3460}
3461
3462#ifdef CONFIG_RPS
3463
3464/* One global table that all flow-based protocols share. */
3465struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3466EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask splitting a sock flow table entry: get_rps_cpu() takes the bits
 * under the mask as the desired CPU and compares the remaining bits with
 * the flow hash.
 */
3467u32 rps_cpu_mask __read_mostly;
3468EXPORT_SYMBOL(rps_cpu_mask);
3469
/* NOTE(review): static keys presumably enabled while RPS / RFS is
 * configured somewhere; their users are outside this part of the file.
 */
3470struct static_key rps_needed __read_mostly;
3471EXPORT_SYMBOL(rps_needed);
3472struct static_key rfs_needed __read_mostly;
3473EXPORT_SYMBOL(rfs_needed);
3474
/*
 * set_rps_cpu - make @next_cpu the CPU recorded for a flow.
 *
 * When @next_cpu is a valid CPU id, the flow entry's last_qtail is set to
 * that CPU's current input_queue_head.  With CONFIG_RFS_ACCEL the driver
 * may first be asked, via ndo_rx_flow_steer(), to steer the flow to the
 * rx queue that cpu_rmap_lookup_index() gives for @next_cpu; if it
 * accepts, the flow entry of that queue's table replaces @rflow and
 * records the returned filter id.
 *
 * Returns the flow entry that was updated (@rflow or its replacement).
 */
3475static struct rps_dev_flow *
3476set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477	    struct rps_dev_flow *rflow, u16 next_cpu)
3478{
3479	if (next_cpu < nr_cpu_ids) {
3480#ifdef CONFIG_RFS_ACCEL
3481		struct netdev_rx_queue *rxqueue;
3482		struct rps_dev_flow_table *flow_table;
3483		struct rps_dev_flow *old_rflow;
3484		u32 flow_id;
3485		u16 rxq_index;
3486		int rc;
3487
3488		/* Should we steer this flow to a different hardware queue? */
3489		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490		    !(dev->features & NETIF_F_NTUPLE))
3491			goto out;
3492		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* already arriving on the queue mapped to next_cpu */
3493		if (rxq_index == skb_get_rx_queue(skb))
3494			goto out;
3495
3496		rxqueue = dev->_rx + rxq_index;
3497		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498		if (!flow_table)
3499			goto out;
3500		flow_id = skb_get_hash(skb) & flow_table->mask;
3501		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502							rxq_index, flow_id);
3503		if (rc < 0)
3504			goto out;
		/* a non-negative return value is stored as the filter id */
3505		old_rflow = rflow;
3506		rflow = &flow_table->flows[flow_id];
3507		rflow->filter = rc;
3508		if (old_rflow->filter == rflow->filter)
3509			old_rflow->filter = RPS_NO_FILTER;
3510	out:
3511#endif
3512		rflow->last_qtail =
3513			per_cpu(softnet_data, next_cpu).input_queue_head;
3514	}
3515
3516	rflow->cpu = next_cpu;
3517	return rflow;
3518}
3519
3520/*
3521 * get_rps_cpu is called from netif_receive_skb and returns the target
3522 * CPU from the RPS map of the receiving queue for a given skb.
3523 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no CPU is selected: the recorded rx queue is out of
 * range, neither a flow table nor an RPS map is set on the queue, the
 * flow hash is 0, or the candidate CPU is not online.
 * *rflowp is written only when the CPU comes from the flow tables.
3524 */
3525static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526		       struct rps_dev_flow **rflowp)
3527{
3528	const struct rps_sock_flow_table *sock_flow_table;
3529	struct netdev_rx_queue *rxqueue = dev->_rx;
3530	struct rps_dev_flow_table *flow_table;
3531	struct rps_map *map;
3532	int cpu = -1;
3533	u32 tcpu;
3534	u32 hash;
3535
3536	if (skb_rx_queue_recorded(skb)) {
3537		u16 index = skb_get_rx_queue(skb);
3538
3539		if (unlikely(index >= dev->real_num_rx_queues)) {
3540			WARN_ONCE(dev->real_num_rx_queues > 1,
3541				  "%s received packet on queue %u, but number "
3542				  "of RX queues is %u\n",
3543				  dev->name, index, dev->real_num_rx_queues);
3544			goto done;
3545		}
3546		rxqueue += index;
3547	}
3548
3549	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552	map = rcu_dereference(rxqueue->rps_map);
3553	if (!flow_table && !map)
3554		goto done;
3555
3556	skb_reset_network_header(skb);
3557	hash = skb_get_hash(skb);
3558	if (!hash)
3559		goto done;
3560
3561	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562	if (flow_table && sock_flow_table) {
3563		struct rps_dev_flow *rflow;
3564		u32 next_cpu;
3565		u32 ident;
3566
3567		/* First check into global flow table if there is a match */
3568		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* bits outside rps_cpu_mask must equal those of the hash */
3569		if ((ident ^ hash) & ~rps_cpu_mask)
3570			goto try_rps;
3571
3572		next_cpu = ident & rps_cpu_mask;
3573
3574		/* OK, now we know there is a match,
3575		 * we can look at the local (per receive queue) flow table
3576		 */
3577		rflow = &flow_table->flows[hash & flow_table->mask];
3578		tcpu = rflow->cpu;
3579
3580		/*
3581		 * If the desired CPU (where last recvmsg was done) is
3582		 * different from current CPU (one in the rx-queue flow
3583		 * table entry), switch if one of the following holds:
3584		 *   - Current CPU is unset (>= nr_cpu_ids).
3585		 *   - Current CPU is offline.
3586		 *   - The current CPU's queue tail has advanced beyond the
3587		 *     last packet that was enqueued using this table entry.
3588		 *     This guarantees that all previous packets for the flow
3589		 *     have been dequeued, thus preserving in order delivery.
3590		 */
3591		if (unlikely(tcpu != next_cpu) &&
3592		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594		      rflow->last_qtail)) >= 0)) {
3595			tcpu = next_cpu;
3596			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597		}
3598
3599		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600			*rflowp = rflow;
3601			cpu = tcpu;
3602			goto done;
3603		}
3604	}
3605
3606try_rps:
3607
	/* fall back to the RPS map of the queue, indexed by the flow hash */
3608	if (map) {
3609		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610		if (cpu_online(tcpu)) {
3611			cpu = tcpu;
3612			goto done;
3613		}
3614	}
3615
3616done:
3617	return cpu;
3618}
3619
#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 *
 * The filter is kept (%false) only while the flow entry still records
 * @filter_id with a valid CPU, and that CPU's input queue head has
 * advanced less than ten times the flow table mask past the entry's
 * last_qtail.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *table;
	struct rps_dev_flow *flow;
	bool may_expire = true;
	unsigned int cpu;
	int progress;

	rcu_read_lock();

	table = rcu_dereference(rxqueue->rps_flow_table);
	if (!table || flow_id > table->mask)
		goto unlock;

	flow = &table->flows[flow_id];
	cpu = ACCESS_ONCE(flow->cpu);
	if (flow->filter != filter_id || cpu >= nr_cpu_ids)
		goto unlock;

	progress = (int)(per_cpu(softnet_data, cpu).input_queue_head -
			 flow->last_qtail);
	if (progress < (int)(10 * table->mask))
		may_expire = false;

unlock:
	rcu_read_unlock();

	return may_expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */
3659
3660/* Called from hardirq (IPI) context */
3661static void rps_trigger_softirq(void *data)
3662{
3663	struct softnet_data *sd = data;
3664
3665	____napi_schedule(sd, &sd->backlog);
3666	sd->received_rps++;
3667}
3668
3669#endif /* CONFIG_RPS */
3670
 
 
 
 
 
 
 
 
 
/*
 * Check whether @sd belongs to another CPU.
 * If it does, link it at the front of this CPU's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.
 * If it is this CPU's own structure, or RPS is not compiled in, return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3691
3692#ifdef CONFIG_NET_FLOW_LIMIT
3693int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694#endif
3695
3696static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697{
3698#ifdef CONFIG_NET_FLOW_LIMIT
3699	struct sd_flow_limit *fl;
3700	struct softnet_data *sd;
3701	unsigned int old_flow, new_flow;
3702
3703	if (qlen < (netdev_max_backlog >> 1))
3704		return false;
3705
3706	sd = this_cpu_ptr(&softnet_data);
3707
3708	rcu_read_lock();
3709	fl = rcu_dereference(sd->flow_limit);
3710	if (fl) {
3711		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712		old_flow = fl->history[fl->history_head];
3713		fl->history[fl->history_head] = new_flow;
3714
3715		fl->history_head++;
3716		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718		if (likely(fl->buckets[old_flow]))
3719			fl->buckets[old_flow]--;
3720
3721		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722			fl->count++;
3723			rcu_read_unlock();
3724			return true;
3725		}
3726	}
3727	rcu_read_unlock();
3728#endif
3729	return false;
3730}
3731
3732/*
3733 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3734 * queue (may be a remote CPU queue).
3735 */
3736static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737			      unsigned int *qtail)
3738{
 
3739	struct softnet_data *sd;
3740	unsigned long flags;
3741	unsigned int qlen;
3742
 
3743	sd = &per_cpu(softnet_data, cpu);
3744
3745	local_irq_save(flags);
3746
3747	rps_lock(sd);
3748	if (!netif_running(skb->dev))
3749		goto drop;
3750	qlen = skb_queue_len(&sd->input_pkt_queue);
3751	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3752		if (qlen) {
3753enqueue:
3754			__skb_queue_tail(&sd->input_pkt_queue, skb);
3755			input_queue_tail_incr_save(sd, qtail);
3756			rps_unlock(sd);
3757			local_irq_restore(flags);
3758			return NET_RX_SUCCESS;
3759		}
3760
3761		/* Schedule NAPI for backlog device
3762		 * We can use non atomic operation since we own the queue lock
3763		 */
3764		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765			if (!rps_ipi_queued(sd))
3766				____napi_schedule(sd, &sd->backlog);
3767		}
3768		goto enqueue;
3769	}
 
3770
3771drop:
3772	sd->dropped++;
3773	rps_unlock(sd);
3774
3775	local_irq_restore(flags);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3776
3777	atomic_long_inc(&skb->dev->rx_dropped);
3778	kfree_skb(skb);
3779	return NET_RX_DROP;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3780}
 
3781
3782static int netif_rx_internal(struct sk_buff *skb)
3783{
3784	int ret;
3785
3786	net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788	trace_netif_rx(skb);
 
3789#ifdef CONFIG_RPS
3790	if (static_key_false(&rps_needed)) {
3791		struct rps_dev_flow voidflow, *rflow = &voidflow;
3792		int cpu;
3793
3794		preempt_disable();
3795		rcu_read_lock();
3796
3797		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798		if (cpu < 0)
3799			cpu = smp_processor_id();
3800
3801		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803		rcu_read_unlock();
3804		preempt_enable();
3805	} else
3806#endif
3807	{
3808		unsigned int qtail;
3809		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810		put_cpu();
3811	}
3812	return ret;
3813}
3814
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Accepts a packet from a device driver and queues it so the upper
 *	(protocol) levels can process it.  The buffer may still be dropped
 *	for congestion control or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int verdict;

	trace_netif_rx_entry(skb);

	verdict = netif_rx_internal(skb);
	return verdict;
}
EXPORT_SYMBOL(netif_rx);
3837
/*
 * Variant of netif_rx() that, with preemption disabled, queues the packet
 * and then runs any softirq that became pending.
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int verdict;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	verdict = netif_rx_internal(skb);
	/* process pending softirqs right away instead of waiting */
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return verdict;
}
EXPORT_SYMBOL(netif_rx_ni);
3853
3854static __latent_entropy void net_tx_action(struct softirq_action *h)
3855{
3856	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858	if (sd->completion_queue) {
3859		struct sk_buff *clist;
3860
3861		local_irq_disable();
3862		clist = sd->completion_queue;
3863		sd->completion_queue = NULL;
3864		local_irq_enable();
3865
3866		while (clist) {
3867			struct sk_buff *skb = clist;
 
3868			clist = clist->next;
3869
3870			WARN_ON(atomic_read(&skb->users));
3871			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872				trace_consume_skb(skb);
3873			else
3874				trace_kfree_skb(skb, net_tx_action);
 
3875
3876			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877				__kfree_skb(skb);
3878			else
3879				__kfree_skb_defer(skb);
 
3880		}
3881
3882		__kfree_skb_flush();
3883	}
3884
3885	if (sd->output_queue) {
3886		struct Qdisc *head;
3887
3888		local_irq_disable();
3889		head = sd->output_queue;
3890		sd->output_queue = NULL;
3891		sd->output_queue_tailp = &sd->output_queue;
3892		local_irq_enable();
3893
 
 
3894		while (head) {
3895			struct Qdisc *q = head;
3896			spinlock_t *root_lock;
3897
3898			head = head->next_sched;
3899
3900			root_lock = qdisc_lock(q);
3901			spin_lock(root_lock);
3902			/* We need to make sure head->next_sched is read
3903			 * before clearing __QDISC_STATE_SCHED
3904			 */
3905			smp_mb__before_atomic();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3906			clear_bit(__QDISC_STATE_SCHED, &q->state);
3907			qdisc_run(q);
3908			spin_unlock(root_lock);
 
3909		}
 
 
3910	}
 
 
3911}
3912
3913#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3914/* This hook is defined here for ATM LANE */
3915int (*br_fdb_test_addr_hook)(struct net_device *dev,
3916			     unsigned char *addr) __read_mostly;
3917EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3918#endif
3919
/*
 * Run the ingress classifier attached to skb->dev, if any.
 *
 * A packet_type pending in *@pt_prev is delivered first (result stored in
 * *@ret) and the slot is cleared.  Returns @skb when processing should
 * continue, or NULL when the classifier verdict consumed, dropped or
 * redirected the packet.  Without CONFIG_NET_CLS_ACT this is a no-op.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *filters = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result res;
	int verdict;

	/* The static key only says that some device has an ingress qdisc;
	 * devices without one leave here.
	 */
	if (!filters)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(filters->q, skb);

	verdict = tc_classify(skb, filters, &res, false);
	switch (verdict) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filters->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* cls/act_bpf already checked skb_mac_header, so the L2
		 * header can be pushed back before redirecting to another
		 * netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3971
3972/**
3973 *	netdev_is_rx_handler_busy - check if receive handler is registered
3974 *	@dev: device to check
3975 *
3976 *	Check if a receive handler is already registered for a given device.
3977 *	Return true if there one.
3978 *
3979 *	The caller must hold the rtnl_mutex.
3980 */
3981bool netdev_is_rx_handler_busy(struct net_device *dev)
3982{
3983	ASSERT_RTNL();
3984	return dev && rtnl_dereference(dev->rx_handler);
3985}
3986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3987
3988/**
3989 *	netdev_rx_handler_register - register receive handler
3990 *	@dev: device to register a handler for
3991 *	@rx_handler: receive handler to register
3992 *	@rx_handler_data: data pointer that is used by rx handler
3993 *
3994 *	Register a receive handler for a device. This handler will then be
3995 *	called from __netif_receive_skb. A negative errno code is returned
3996 *	on a failure.
3997 *
3998 *	The caller must hold the rtnl_mutex.
3999 *
4000 *	For a general description of rx_handler, see enum rx_handler_result.
4001 */
4002int netdev_rx_handler_register(struct net_device *dev,
4003			       rx_handler_func_t *rx_handler,
4004			       void *rx_handler_data)
4005{
4006	ASSERT_RTNL();
 
4007
4008	if (dev->rx_handler)
4009		return -EBUSY;
4010
4011	/* Note: rx_handler_data must be set before rx_handler */
4012	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4013	rcu_assign_pointer(dev->rx_handler, rx_handler);
4014
4015	return 0;
4016}
4017EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4018
4019/**
4020 *	netdev_rx_handler_unregister - unregister receive handler
4021 *	@dev: device to unregister a handler from
4022 *
4023 *	Unregister a receive handler from a device.
4024 *
4025 *	The caller must hold the rtnl_mutex.
4026 */
4027void netdev_rx_handler_unregister(struct net_device *dev)
4028{
4029
4030	ASSERT_RTNL();
4031	RCU_INIT_POINTER(dev->rx_handler, NULL);
4032	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4033	 * section has a guarantee to see a non NULL rx_handler_data
4034	 * as well.
4035	 */
4036	synchronize_net();
4037	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4038}
4039EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4040
4041/*
4042 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4043 * the special handling of PFMEMALLOC skbs.
4044 */
4045static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4046{
4047	switch (skb->protocol) {
4048	case htons(ETH_P_ARP):
4049	case htons(ETH_P_IP):
4050	case htons(ETH_P_IPV6):
4051	case htons(ETH_P_8021Q):
4052	case htons(ETH_P_8021AD):
4053		return true;
4054	default:
4055		return false;
4056	}
4057}
4058
/*
 * Run the netfilter ingress hook for @skb when one is active.
 *
 * A packet_type pending in *@pt_prev is delivered first (result stored in
 * *@ret) and the slot is cleared.  Returns the value of
 * nf_hook_ingress(), or 0 when no hook is active or
 * CONFIG_NETFILTER_INGRESS is not set.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int hook_verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	hook_verdict = nf_hook_ingress(skb);
	rcu_read_unlock();

	return hook_verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4079
4080static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 
4081{
4082	struct packet_type *ptype, *pt_prev;
4083	rx_handler_func_t *rx_handler;
 
4084	struct net_device *orig_dev;
4085	bool deliver_exact = false;
4086	int ret = NET_RX_DROP;
4087	__be16 type;
4088
4089	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4090
4091	trace_netif_receive_skb(skb);
4092
4093	orig_dev = skb->dev;
4094
4095	skb_reset_network_header(skb);
4096	if (!skb_transport_header_was_set(skb))
4097		skb_reset_transport_header(skb);
4098	skb_reset_mac_len(skb);
4099
4100	pt_prev = NULL;
4101
4102another_round:
4103	skb->skb_iif = skb->dev->ifindex;
4104
4105	__this_cpu_inc(softnet_data.processed);
4106
4107	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4108	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 
 
 
 
 
 
 
 
 
 
 
 
4109		skb = skb_vlan_untag(skb);
4110		if (unlikely(!skb))
4111			goto out;
4112	}
4113
4114#ifdef CONFIG_NET_CLS_ACT
4115	if (skb->tc_verd & TC_NCLS) {
4116		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4117		goto ncls;
4118	}
4119#endif
4120
4121	if (pfmemalloc)
4122		goto skip_taps;
4123
4124	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4125		if (pt_prev)
4126			ret = deliver_skb(skb, pt_prev, orig_dev);
4127		pt_prev = ptype;
4128	}
4129
4130	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4131		if (pt_prev)
4132			ret = deliver_skb(skb, pt_prev, orig_dev);
4133		pt_prev = ptype;
4134	}
4135
4136skip_taps:
4137#ifdef CONFIG_NET_INGRESS
4138	if (static_key_false(&ingress_needed)) {
4139		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 
 
 
 
 
 
4140		if (!skb)
4141			goto out;
4142
 
4143		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4144			goto out;
4145	}
4146#endif
4147#ifdef CONFIG_NET_CLS_ACT
4148	skb->tc_verd = 0;
4149ncls:
4150#endif
4151	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4152		goto drop;
4153
4154	if (skb_vlan_tag_present(skb)) {
4155		if (pt_prev) {
4156			ret = deliver_skb(skb, pt_prev, orig_dev);
4157			pt_prev = NULL;
4158		}
4159		if (vlan_do_receive(&skb))
4160			goto another_round;
4161		else if (unlikely(!skb))
4162			goto out;
4163	}
4164
4165	rx_handler = rcu_dereference(skb->dev->rx_handler);
4166	if (rx_handler) {
4167		if (pt_prev) {
4168			ret = deliver_skb(skb, pt_prev, orig_dev);
4169			pt_prev = NULL;
4170		}
4171		switch (rx_handler(&skb)) {
4172		case RX_HANDLER_CONSUMED:
4173			ret = NET_RX_SUCCESS;
4174			goto out;
4175		case RX_HANDLER_ANOTHER:
4176			goto another_round;
4177		case RX_HANDLER_EXACT:
4178			deliver_exact = true;
 
4179		case RX_HANDLER_PASS:
4180			break;
4181		default:
4182			BUG();
4183		}
4184	}
4185
4186	if (unlikely(skb_vlan_tag_present(skb))) {
4187		if (skb_vlan_tag_get_id(skb))
 
 
 
 
4188			skb->pkt_type = PACKET_OTHERHOST;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4189		/* Note: we might in the future use prio bits
4190		 * and set skb->priority like in vlan_do_receive()
4191		 * For the time being, just ignore Priority Code Point
4192		 */
4193		skb->vlan_tci = 0;
4194	}
4195
4196	type = skb->protocol;
4197
4198	/* deliver only exact match when indicated */
4199	if (likely(!deliver_exact)) {
4200		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4201				       &ptype_base[ntohs(type) &
4202						   PTYPE_HASH_MASK]);
4203	}
4204
4205	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4206			       &orig_dev->ptype_specific);
4207
4208	if (unlikely(skb->dev != orig_dev)) {
4209		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4210				       &skb->dev->ptype_specific);
4211	}
4212
4213	if (pt_prev) {
4214		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4215			goto drop;
4216		else
4217			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4218	} else {
4219drop:
4220		if (!deliver_exact)
4221			atomic_long_inc(&skb->dev->rx_dropped);
4222		else
4223			atomic_long_inc(&skb->dev->rx_nohandler);
4224		kfree_skb(skb);
4225		/* Jamal, now you will not able to escape explaining
4226		 * me how you were going to use this. :-)
4227		 */
4228		ret = NET_RX_DROP;
4229	}
4230
4231out:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4232	return ret;
4233}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4234
4235static int __netif_receive_skb(struct sk_buff *skb)
4236{
4237	int ret;
4238
4239	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4240		unsigned long pflags = current->flags;
4241
4242		/*
4243		 * PFMEMALLOC skbs are special, they should
4244		 * - be delivered to SOCK_MEMALLOC sockets only
4245		 * - stay away from userspace
4246		 * - have bounded memory usage
4247		 *
4248		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4249		 * context down to all allocation sites.
4250		 */
4251		current->flags |= PF_MEMALLOC;
4252		ret = __netif_receive_skb_core(skb, true);
4253		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4254	} else
4255		ret = __netif_receive_skb_core(skb, false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4256
4257	return ret;
4258}
4259
4260static int netif_receive_skb_internal(struct sk_buff *skb)
4261{
4262	int ret;
4263
4264	net_timestamp_check(netdev_tstamp_prequeue, skb);
4265
4266	if (skb_defer_rx_timestamp(skb))
4267		return NET_RX_SUCCESS;
4268
4269	rcu_read_lock();
4270
4271#ifdef CONFIG_RPS
4272	if (static_key_false(&rps_needed)) {
4273		struct rps_dev_flow voidflow, *rflow = &voidflow;
4274		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4275
4276		if (cpu >= 0) {
4277			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4278			rcu_read_unlock();
4279			return ret;
4280		}
4281	}
4282#endif
4283	ret = __netif_receive_skb(skb);
4284	rcu_read_unlock();
4285	return ret;
4286}
4287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for receive data processing.  The buffer may be
 *	dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int verdict;

	trace_netif_receive_skb_entry(skb);

	verdict = netif_receive_skb_internal(skb);
	return verdict;
}
EXPORT_SYMBOL(netif_receive_skb);
4310
4311DEFINE_PER_CPU(struct work_struct, flush_works);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4312
4313/* Network device is going away, flush any packets still pending */
4314static void flush_backlog(struct work_struct *work)
4315{
4316	struct sk_buff *skb, *tmp;
4317	struct softnet_data *sd;
4318
4319	local_bh_disable();
4320	sd = this_cpu_ptr(&softnet_data);
4321
4322	local_irq_disable();
4323	rps_lock(sd);
4324	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4325		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4326			__skb_unlink(skb, &sd->input_pkt_queue);
4327			kfree_skb(skb);
4328			input_queue_head_incr(sd);
4329		}
4330	}
4331	rps_unlock(sd);
4332	local_irq_enable();
4333
4334	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4335		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4336			__skb_unlink(skb, &sd->process_queue);
4337			kfree_skb(skb);
4338			input_queue_head_incr(sd);
4339		}
4340	}
4341	local_bh_enable();
4342}
4343
4344static void flush_all_backlogs(void)
4345{
4346	unsigned int cpu;
 
 
4347
4348	get_online_cpus();
4349
4350	for_each_online_cpu(cpu)
4351		queue_work_on(cpu, system_highpri_wq,
4352			      per_cpu_ptr(&flush_works, cpu));
 
 
 
4353
4354	for_each_online_cpu(cpu)
4355		flush_work(per_cpu_ptr(&flush_works, cpu));
4356
4357	put_online_cpus();
 
 
 
 
4358}
4359
4360static int napi_gro_complete(struct sk_buff *skb)
4361{
4362	struct packet_offload *ptype;
4363	__be16 type = skb->protocol;
4364	struct list_head *head = &offload_base;
4365	int err = -ENOENT;
4366
4367	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4368
4369	if (NAPI_GRO_CB(skb)->count == 1) {
4370		skb_shinfo(skb)->gso_size = 0;
4371		goto out;
4372	}
4373
4374	rcu_read_lock();
4375	list_for_each_entry_rcu(ptype, head, list) {
4376		if (ptype->type != type || !ptype->callbacks.gro_complete)
4377			continue;
4378
4379		err = ptype->callbacks.gro_complete(skb, 0);
4380		break;
4381	}
4382	rcu_read_unlock();
4383
4384	if (err) {
4385		WARN_ON(&ptype->list == head);
4386		kfree_skb(skb);
4387		return NET_RX_SUCCESS;
4388	}
4389
4390out:
4391	return netif_receive_skb_internal(skb);
4392}
4393
4394/* napi->gro_list contains packets ordered by age.
4395 * youngest packets at the head of it.
4396 * Complete skbs in reverse order to reduce latencies.
4397 */
4398void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4399{
4400	struct sk_buff *skb, *prev = NULL;
4401
4402	/* scan list and build reverse chain */
4403	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4404		skb->prev = prev;
4405		prev = skb;
4406	}
4407
4408	for (skb = prev; skb; skb = prev) {
4409		skb->next = NULL;
4410
4411		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4412			return;
4413
4414		prev = skb->prev;
4415		napi_gro_complete(skb);
4416		napi->gro_count--;
4417	}
4418
4419	napi->gro_list = NULL;
4420}
4421EXPORT_SYMBOL(napi_gro_flush);
4422
4423static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4424{
4425	struct sk_buff *p;
4426	unsigned int maclen = skb->dev->hard_header_len;
4427	u32 hash = skb_get_hash_raw(skb);
4428
4429	for (p = napi->gro_list; p; p = p->next) {
4430		unsigned long diffs;
4431
4432		NAPI_GRO_CB(p)->flush = 0;
4433
4434		if (hash != skb_get_hash_raw(p)) {
4435			NAPI_GRO_CB(p)->same_flow = 0;
4436			continue;
4437		}
4438
4439		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4440		diffs |= p->vlan_tci ^ skb->vlan_tci;
4441		diffs |= skb_metadata_dst_cmp(p, skb);
4442		if (maclen == ETH_HLEN)
4443			diffs |= compare_ether_header(skb_mac_header(p),
4444						      skb_mac_header(skb));
4445		else if (!diffs)
4446			diffs = memcmp(skb_mac_header(p),
4447				       skb_mac_header(skb),
4448				       maclen);
4449		NAPI_GRO_CB(p)->same_flow = !diffs;
4450	}
4451}
4452
4453static void skb_gro_reset_offset(struct sk_buff *skb)
4454{
4455	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4456	const skb_frag_t *frag0 = &pinfo->frags[0];
4457
4458	NAPI_GRO_CB(skb)->data_offset = 0;
4459	NAPI_GRO_CB(skb)->frag0 = NULL;
4460	NAPI_GRO_CB(skb)->frag0_len = 0;
4461
4462	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4463	    pinfo->nr_frags &&
4464	    !PageHighMem(skb_frag_page(frag0))) {
4465		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4466		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4467						    skb_frag_size(frag0),
4468						    skb->end - skb->tail);
4469	}
4470}
4471
4472static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4473{
4474	struct skb_shared_info *pinfo = skb_shinfo(skb);
4475
4476	BUG_ON(skb->end - skb->tail < grow);
4477
4478	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4479
4480	skb->data_len -= grow;
4481	skb->tail += grow;
4482
4483	pinfo->frags[0].page_offset += grow;
4484	skb_frag_size_sub(&pinfo->frags[0], grow);
4485
4486	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4487		skb_frag_unref(skb, 0);
4488		memmove(pinfo->frags, pinfo->frags + 1,
4489			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4490	}
4491}
4492
4493static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4494{
4495	struct sk_buff **pp = NULL;
4496	struct packet_offload *ptype;
4497	__be16 type = skb->protocol;
4498	struct list_head *head = &offload_base;
4499	int same_flow;
4500	enum gro_result ret;
4501	int grow;
4502
4503	if (!(skb->dev->features & NETIF_F_GRO))
4504		goto normal;
4505
4506	if (skb->csum_bad)
4507		goto normal;
4508
4509	gro_list_prepare(napi, skb);
4510
4511	rcu_read_lock();
4512	list_for_each_entry_rcu(ptype, head, list) {
4513		if (ptype->type != type || !ptype->callbacks.gro_receive)
4514			continue;
4515
4516		skb_set_network_header(skb, skb_gro_offset(skb));
4517		skb_reset_mac_len(skb);
4518		NAPI_GRO_CB(skb)->same_flow = 0;
4519		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4520		NAPI_GRO_CB(skb)->free = 0;
4521		NAPI_GRO_CB(skb)->encap_mark = 0;
4522		NAPI_GRO_CB(skb)->recursion_counter = 0;
4523		NAPI_GRO_CB(skb)->is_fou = 0;
4524		NAPI_GRO_CB(skb)->is_atomic = 1;
4525		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4526
4527		/* Setup for GRO checksum validation */
4528		switch (skb->ip_summed) {
4529		case CHECKSUM_COMPLETE:
4530			NAPI_GRO_CB(skb)->csum = skb->csum;
4531			NAPI_GRO_CB(skb)->csum_valid = 1;
4532			NAPI_GRO_CB(skb)->csum_cnt = 0;
4533			break;
4534		case CHECKSUM_UNNECESSARY:
4535			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4536			NAPI_GRO_CB(skb)->csum_valid = 0;
4537			break;
4538		default:
4539			NAPI_GRO_CB(skb)->csum_cnt = 0;
4540			NAPI_GRO_CB(skb)->csum_valid = 0;
4541		}
4542
4543		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4544		break;
4545	}
4546	rcu_read_unlock();
4547
4548	if (&ptype->list == head)
4549		goto normal;
4550
4551	same_flow = NAPI_GRO_CB(skb)->same_flow;
4552	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4553
4554	if (pp) {
4555		struct sk_buff *nskb = *pp;
4556
4557		*pp = nskb->next;
4558		nskb->next = NULL;
4559		napi_gro_complete(nskb);
4560		napi->gro_count--;
4561	}
4562
4563	if (same_flow)
4564		goto ok;
4565
4566	if (NAPI_GRO_CB(skb)->flush)
4567		goto normal;
4568
4569	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4570		struct sk_buff *nskb = napi->gro_list;
4571
4572		/* locate the end of the list to select the 'oldest' flow */
4573		while (nskb->next) {
4574			pp = &nskb->next;
4575			nskb = *pp;
4576		}
4577		*pp = NULL;
4578		nskb->next = NULL;
4579		napi_gro_complete(nskb);
4580	} else {
4581		napi->gro_count++;
4582	}
4583	NAPI_GRO_CB(skb)->count = 1;
4584	NAPI_GRO_CB(skb)->age = jiffies;
4585	NAPI_GRO_CB(skb)->last = skb;
4586	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4587	skb->next = napi->gro_list;
4588	napi->gro_list = skb;
4589	ret = GRO_HELD;
4590
4591pull:
4592	grow = skb_gro_offset(skb) - skb_headlen(skb);
4593	if (grow > 0)
4594		gro_pull_from_frag0(skb, grow);
4595ok:
4596	return ret;
4597
4598normal:
4599	ret = GRO_NORMAL;
4600	goto pull;
4601}
4602
4603struct packet_offload *gro_find_receive_by_type(__be16 type)
4604{
4605	struct list_head *offload_head = &offload_base;
4606	struct packet_offload *ptype;
4607
4608	list_for_each_entry_rcu(ptype, offload_head, list) {
4609		if (ptype->type != type || !ptype->callbacks.gro_receive)
4610			continue;
4611		return ptype;
4612	}
4613	return NULL;
4614}
4615EXPORT_SYMBOL(gro_find_receive_by_type);
4616
4617struct packet_offload *gro_find_complete_by_type(__be16 type)
4618{
4619	struct list_head *offload_head = &offload_base;
4620	struct packet_offload *ptype;
4621
4622	list_for_each_entry_rcu(ptype, offload_head, list) {
4623		if (ptype->type != type || !ptype->callbacks.gro_complete)
4624			continue;
4625		return ptype;
4626	}
4627	return NULL;
4628}
4629EXPORT_SYMBOL(gro_find_complete_by_type);
4630
4631static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4632{
4633	switch (ret) {
4634	case GRO_NORMAL:
4635		if (netif_receive_skb_internal(skb))
4636			ret = GRO_DROP;
4637		break;
4638
4639	case GRO_DROP:
4640		kfree_skb(skb);
4641		break;
4642
4643	case GRO_MERGED_FREE:
4644		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4645			skb_dst_drop(skb);
4646			kmem_cache_free(skbuff_head_cache, skb);
4647		} else {
4648			__kfree_skb(skb);
4649		}
4650		break;
4651
4652	case GRO_HELD:
4653	case GRO_MERGED:
4654		break;
4655	}
4656
4657	return ret;
4658}
4659
4660gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4661{
4662	skb_mark_napi_id(skb, napi);
4663	trace_napi_gro_receive_entry(skb);
4664
4665	skb_gro_reset_offset(skb);
4666
4667	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4668}
4669EXPORT_SYMBOL(napi_gro_receive);
4670
4671static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4672{
4673	if (unlikely(skb->pfmemalloc)) {
4674		consume_skb(skb);
4675		return;
4676	}
4677	__skb_pull(skb, skb_headlen(skb));
4678	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4679	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4680	skb->vlan_tci = 0;
4681	skb->dev = napi->dev;
4682	skb->skb_iif = 0;
4683	skb->encapsulation = 0;
4684	skb_shinfo(skb)->gso_type = 0;
4685	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4686
4687	napi->skb = skb;
4688}
4689
4690struct sk_buff *napi_get_frags(struct napi_struct *napi)
4691{
4692	struct sk_buff *skb = napi->skb;
4693
4694	if (!skb) {
4695		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4696		if (skb) {
4697			napi->skb = skb;
4698			skb_mark_napi_id(skb, napi);
4699		}
4700	}
4701	return skb;
4702}
4703EXPORT_SYMBOL(napi_get_frags);
4704
4705static gro_result_t napi_frags_finish(struct napi_struct *napi,
4706				      struct sk_buff *skb,
4707				      gro_result_t ret)
4708{
4709	switch (ret) {
4710	case GRO_NORMAL:
4711	case GRO_HELD:
4712		__skb_push(skb, ETH_HLEN);
4713		skb->protocol = eth_type_trans(skb, skb->dev);
4714		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4715			ret = GRO_DROP;
4716		break;
4717
4718	case GRO_DROP:
4719	case GRO_MERGED_FREE:
4720		napi_reuse_skb(napi, skb);
4721		break;
4722
4723	case GRO_MERGED:
4724		break;
4725	}
4726
4727	return ret;
4728}
4729
4730/* Upper GRO stack assumes network header starts at gro_offset=0
4731 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4732 * We copy ethernet header into skb->data to have a common layout.
4733 */
4734static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4735{
4736	struct sk_buff *skb = napi->skb;
4737	const struct ethhdr *eth;
4738	unsigned int hlen = sizeof(*eth);
4739
4740	napi->skb = NULL;
4741
4742	skb_reset_mac_header(skb);
4743	skb_gro_reset_offset(skb);
4744
4745	eth = skb_gro_header_fast(skb, 0);
4746	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4747		eth = skb_gro_header_slow(skb, hlen, 0);
4748		if (unlikely(!eth)) {
4749			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4750					     __func__, napi->dev->name);
4751			napi_reuse_skb(napi, skb);
4752			return NULL;
4753		}
4754	} else {
4755		gro_pull_from_frag0(skb, hlen);
4756		NAPI_GRO_CB(skb)->frag0 += hlen;
4757		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4758	}
4759	__skb_pull(skb, hlen);
4760
4761	/*
4762	 * This works because the only protocols we care about don't require
4763	 * special handling.
4764	 * We'll fix it up properly in napi_frags_finish()
4765	 */
4766	skb->protocol = eth->h_proto;
 
4767
4768	return skb;
4769}
4770
4771gro_result_t napi_gro_frags(struct napi_struct *napi)
4772{
4773	struct sk_buff *skb = napi_frags_skb(napi);
4774
4775	if (!skb)
4776		return GRO_DROP;
4777
4778	trace_napi_gro_frags_entry(skb);
4779
4780	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4781}
4782EXPORT_SYMBOL(napi_gro_frags);
4783
4784/* Compute the checksum from gro_offset and return the folded value
4785 * after adding in any pseudo checksum.
4786 */
4787__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4788{
4789	__wsum wsum;
4790	__sum16 sum;
4791
4792	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4793
4794	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4795	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4796	if (likely(!sum)) {
4797		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4798		    !skb->csum_complete_sw)
4799			netdev_rx_csum_fault(skb->dev);
4800	}
4801
4802	NAPI_GRO_CB(skb)->csum = wsum;
4803	NAPI_GRO_CB(skb)->csum_valid = 1;
4804
4805	return sum;
4806}
4807EXPORT_SYMBOL(__skb_gro_checksum_complete);
4808
/*
 * net_rps_action_and_irq_enable - send the IPIs pending for RPS.
 *
 * Note: must be called with local irqs disabled; always returns with
 * local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remote = sd->rps_ipi_list;

	if (remote) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* kick RPS processing on each remote CPU still online */
		do {
			struct softnet_data *following = remote->rps_ipi_next;

			if (cpu_online(remote->cpu))
				smp_call_function_single_async(remote->cpu,
							   &remote->csd);
			remote = following;
		} while (remote);
		return;
	}
#endif
	local_irq_enable();
}
4836
4837static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4838{
4839#ifdef CONFIG_RPS
4840	return sd->rps_ipi_list != NULL;
4841#else
4842	return false;
4843#endif
4844}
4845
4846static int process_backlog(struct napi_struct *napi, int quota)
4847{
4848	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4849	bool again = true;
4850	int work = 0;
4851
4852	/* Check if we have pending ipi, its better to send them now,
4853	 * not waiting net_rx_action() end.
4854	 */
4855	if (sd_has_rps_ipi_waiting(sd)) {
4856		local_irq_disable();
4857		net_rps_action_and_irq_enable(sd);
4858	}
4859
4860	napi->weight = weight_p;
4861	while (again) {
4862		struct sk_buff *skb;
4863
4864		while ((skb = __skb_dequeue(&sd->process_queue))) {
4865			rcu_read_lock();
4866			__netif_receive_skb(skb);
4867			rcu_read_unlock();
4868			input_queue_head_incr(sd);
4869			if (++work >= quota)
4870				return work;
4871
4872		}
4873
4874		local_irq_disable();
4875		rps_lock(sd);
4876		if (skb_queue_empty(&sd->input_pkt_queue)) {
4877			/*
4878			 * Inline a custom version of __napi_complete().
4879			 * only current cpu owns and manipulates this napi,
4880			 * and NAPI_STATE_SCHED is the only possible flag set
4881			 * on backlog.
4882			 * We can use a plain write instead of clear_bit(),
4883			 * and we dont need an smp_mb() memory barrier.
4884			 */
4885			napi->state = 0;
4886			again = false;
4887		} else {
4888			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4889						   &sd->process_queue);
4890		}
4891		rps_unlock(sd);
4892		local_irq_enable();
4893	}
4894
4895	return work;
4896}
4897
4898/**
4899 * __napi_schedule - schedule for receive
4900 * @n: entry to schedule
4901 *
4902 * The entry's receive function will be scheduled to run.
4903 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4904 */
4905void __napi_schedule(struct napi_struct *n)
4906{
4907	unsigned long flags;
4908
4909	local_irq_save(flags);
4910	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4911	local_irq_restore(flags);
4912}
4913EXPORT_SYMBOL(__napi_schedule);
4914
4915/**
4916 *	napi_schedule_prep - check if napi can be scheduled
4917 *	@n: napi context
4918 *
4919 * Test if NAPI routine is already running, and if not mark
4920 * it as running.  This is used as a condition variable
4921 * insure only one NAPI poll instance runs.  We also make
4922 * sure there is no pending NAPI disable.
4923 */
4924bool napi_schedule_prep(struct napi_struct *n)
4925{
4926	unsigned long val, new;
4927
4928	do {
4929		val = READ_ONCE(n->state);
4930		if (unlikely(val & NAPIF_STATE_DISABLE))
4931			return false;
4932		new = val | NAPIF_STATE_SCHED;
4933
4934		/* Sets STATE_MISSED bit if STATE_SCHED was already set
4935		 * This was suggested by Alexander Duyck, as compiler
4936		 * emits better code than :
4937		 * if (val & NAPIF_STATE_SCHED)
4938		 *     new |= NAPIF_STATE_MISSED;
4939		 */
4940		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4941						   NAPIF_STATE_MISSED;
4942	} while (cmpxchg(&n->state, val, new) != val);
4943
4944	return !(val & NAPIF_STATE_SCHED);
4945}
4946EXPORT_SYMBOL(napi_schedule_prep);
4947
4948/**
4949 * __napi_schedule_irqoff - schedule for receive
4950 * @n: entry to schedule
4951 *
4952 * Variant of __napi_schedule() assuming hard irqs are masked
 
 
 
 
4953 */
4954void __napi_schedule_irqoff(struct napi_struct *n)
4955{
4956	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 
 
 
4957}
4958EXPORT_SYMBOL(__napi_schedule_irqoff);
4959
4960bool __napi_complete(struct napi_struct *n)
4961{
4962	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4963
4964	/* Some drivers call us directly, instead of calling
4965	 * napi_complete_done().
4966	 */
4967	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4968		return false;
4969
4970	list_del_init(&n->poll_list);
4971	smp_mb__before_atomic();
4972	clear_bit(NAPI_STATE_SCHED, &n->state);
4973	return true;
4974}
4975EXPORT_SYMBOL(__napi_complete);
4976
4977bool napi_complete_done(struct napi_struct *n, int work_done)
4978{
4979	unsigned long flags, val, new;
 
4980
4981	/*
4982	 * 1) Don't let napi dequeue from the cpu poll list
4983	 *    just in case its running on a different cpu.
4984	 * 2) If we are busy polling, do nothing here, we have
4985	 *    the guarantee we will be called later.
4986	 */
4987	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4988				 NAPIF_STATE_IN_BUSY_POLL)))
4989		return false;
4990
4991	if (n->gro_list) {
4992		unsigned long timeout = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4993
4994		if (work_done)
4995			timeout = n->dev->gro_flush_timeout;
4996
4997		if (timeout)
4998			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4999				      HRTIMER_MODE_REL_PINNED);
5000		else
5001			napi_gro_flush(n, false);
5002	}
5003	if (unlikely(!list_empty(&n->poll_list))) {
5004		/* If n->poll_list is not empty, we need to mask irqs */
5005		local_irq_save(flags);
5006		list_del_init(&n->poll_list);
5007		local_irq_restore(flags);
5008	}
 
5009
 
5010	do {
5011		val = READ_ONCE(n->state);
5012
5013		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5014
5015		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
 
 
5016
5017		/* If STATE_MISSED was set, leave STATE_SCHED set,
5018		 * because we will call napi->poll() one more time.
5019		 * This C code was suggested by Alexander Duyck to help gcc.
5020		 */
5021		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5022						    NAPIF_STATE_SCHED;
5023	} while (cmpxchg(&n->state, val, new) != val);
5024
5025	if (unlikely(val & NAPIF_STATE_MISSED)) {
5026		__napi_schedule(n);
5027		return false;
5028	}
5029
5030	return true;
 
 
 
5031}
5032EXPORT_SYMBOL(napi_complete_done);
5033
5034/* must be called under rcu_read_lock(), as we dont take a reference */
5035static struct napi_struct *napi_by_id(unsigned int napi_id)
5036{
5037	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5038	struct napi_struct *napi;
5039
5040	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5041		if (napi->napi_id == napi_id)
5042			return napi;
5043
5044	return NULL;
5045}
5046
5047#if defined(CONFIG_NET_RX_BUSY_POLL)
5048
5049#define BUSY_POLL_BUDGET 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5050
5051static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 
5052{
 
 
5053	int rc;
5054
5055	/* Busy polling means there is a high chance device driver hard irq
5056	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5057	 * set in napi_schedule_prep().
5058	 * Since we are about to call napi->poll() once more, we can safely
5059	 * clear NAPI_STATE_MISSED.
5060	 *
5061	 * Note: x86 could use a single "lock and ..." instruction
5062	 * to perform these two clear_bit()
5063	 */
5064	clear_bit(NAPI_STATE_MISSED, &napi->state);
5065	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5066
5067	local_bh_disable();
5068
 
 
 
 
 
 
 
 
 
5069	/* All we really want here is to re-enable device interrupts.
5070	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5071	 */
5072	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 
 
 
 
 
5073	netpoll_poll_unlock(have_poll_lock);
5074	if (rc == BUSY_POLL_BUDGET)
5075		__napi_schedule(napi);
5076	local_bh_enable();
5077	if (local_softirq_pending())
5078		do_softirq();
5079}
5080
5081bool sk_busy_loop(struct sock *sk, int nonblock)
 
 
5082{
5083	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5084	int (*napi_poll)(struct napi_struct *napi, int budget);
5085	int (*busy_poll)(struct napi_struct *dev);
5086	void *have_poll_lock = NULL;
5087	struct napi_struct *napi;
5088	int rc;
5089
5090restart:
5091	rc = false;
5092	napi_poll = NULL;
5093
5094	rcu_read_lock();
5095
5096	napi = napi_by_id(sk->sk_napi_id);
5097	if (!napi)
5098		goto out;
5099
5100	/* Note: ndo_busy_poll method is optional in linux-4.5 */
5101	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
 
5102
5103	preempt_disable();
5104	for (;;) {
5105		rc = 0;
5106		local_bh_disable();
5107		if (busy_poll) {
5108			rc = busy_poll(napi);
5109			goto count;
5110		}
5111		if (!napi_poll) {
5112			unsigned long val = READ_ONCE(napi->state);
5113
5114			/* If multiple threads are competing for this napi,
5115			 * we avoid dirtying napi->state as much as we can.
5116			 */
5117			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5118				   NAPIF_STATE_IN_BUSY_POLL))
 
 
5119				goto count;
 
5120			if (cmpxchg(&napi->state, val,
5121				    val | NAPIF_STATE_IN_BUSY_POLL |
5122					  NAPIF_STATE_SCHED) != val)
 
 
5123				goto count;
 
5124			have_poll_lock = netpoll_poll_lock(napi);
5125			napi_poll = napi->poll;
5126		}
5127		rc = napi_poll(napi, BUSY_POLL_BUDGET);
5128		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
 
5129count:
5130		if (rc > 0)
5131			__NET_ADD_STATS(sock_net(sk),
5132					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5133		local_bh_enable();
5134
5135		if (rc == LL_FLUSH_FAILED)
5136			break; /* permanent failure */
5137
5138		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5139		    busy_loop_timeout(end_time))
5140			break;
5141
5142		if (unlikely(need_resched())) {
5143			if (napi_poll)
5144				busy_poll_stop(napi, have_poll_lock);
5145			preempt_enable();
 
5146			rcu_read_unlock();
5147			cond_resched();
5148			rc = !skb_queue_empty(&sk->sk_receive_queue);
5149			if (rc || busy_loop_timeout(end_time))
5150				return rc;
5151			goto restart;
5152		}
5153		cpu_relax();
5154	}
5155	if (napi_poll)
5156		busy_poll_stop(napi, have_poll_lock);
5157	preempt_enable();
5158	rc = !skb_queue_empty(&sk->sk_receive_queue);
5159out:
5160	rcu_read_unlock();
5161	return rc;
5162}
5163EXPORT_SYMBOL(sk_busy_loop);
5164
5165#endif /* CONFIG_NET_RX_BUSY_POLL */
5166
5167static void napi_hash_add(struct napi_struct *napi)
5168{
5169	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5170	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5171		return;
5172
5173	spin_lock(&napi_hash_lock);
5174
5175	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5176	do {
5177		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5178			napi_gen_id = NR_CPUS + 1;
5179	} while (napi_by_id(napi_gen_id));
5180	napi->napi_id = napi_gen_id;
5181
5182	hlist_add_head_rcu(&napi->napi_hash_node,
5183			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5184
5185	spin_unlock(&napi_hash_lock);
5186}
5187
5188/* Warning : caller is responsible to make sure rcu grace period
5189 * is respected before freeing memory containing @napi
5190 */
5191bool napi_hash_del(struct napi_struct *napi)
5192{
5193	bool rcu_sync_needed = false;
5194
5195	spin_lock(&napi_hash_lock);
5196
5197	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5198		rcu_sync_needed = true;
5199		hlist_del_rcu(&napi->napi_hash_node);
5200	}
5201	spin_unlock(&napi_hash_lock);
5202	return rcu_sync_needed;
5203}
5204EXPORT_SYMBOL_GPL(napi_hash_del);
5205
5206static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5207{
5208	struct napi_struct *napi;
5209
5210	napi = container_of(timer, struct napi_struct, timer);
5211
5212	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
5213	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5214	 */
5215	if (napi->gro_list && !napi_disable_pending(napi) &&
5216	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 
5217		__napi_schedule_irqoff(napi);
 
5218
5219	return HRTIMER_NORESTART;
5220}
5221
5222void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5223		    int (*poll)(struct napi_struct *, int), int weight)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5224{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5225	INIT_LIST_HEAD(&napi->poll_list);
 
5226	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5227	napi->timer.function = napi_watchdog;
5228	napi->gro_count = 0;
5229	napi->gro_list = NULL;
5230	napi->skb = NULL;
 
 
5231	napi->poll = poll;
5232	if (weight > NAPI_POLL_WEIGHT)
5233		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5234			    weight, dev->name);
5235	napi->weight = weight;
5236	list_add(&napi->dev_list, &dev->napi_list);
5237	napi->dev = dev;
5238#ifdef CONFIG_NETPOLL
5239	napi->poll_owner = -1;
5240#endif
 
5241	set_bit(NAPI_STATE_SCHED, &napi->state);
 
 
5242	napi_hash_add(napi);
 
 
 
 
 
 
 
 
5243}
5244EXPORT_SYMBOL(netif_napi_add);
5245
5246void napi_disable(struct napi_struct *n)
5247{
 
 
5248	might_sleep();
5249	set_bit(NAPI_STATE_DISABLE, &n->state);
5250
5251	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5252		msleep(1);
5253	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5254		msleep(1);
 
 
 
 
 
 
5255
5256	hrtimer_cancel(&n->timer);
5257
5258	clear_bit(NAPI_STATE_DISABLE, &n->state);
5259}
5260EXPORT_SYMBOL(napi_disable);
5261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5262/* Must be called in process context */
5263void netif_napi_del(struct napi_struct *napi)
5264{
5265	might_sleep();
5266	if (napi_hash_del(napi))
5267		synchronize_net();
5268	list_del_init(&napi->dev_list);
 
5269	napi_free_frags(napi);
5270
5271	kfree_skb_list(napi->gro_list);
5272	napi->gro_list = NULL;
5273	napi->gro_count = 0;
 
 
 
 
5274}
5275EXPORT_SYMBOL(netif_napi_del);
5276
5277static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5278{
5279	void *have;
5280	int work, weight;
5281
5282	list_del_init(&n->poll_list);
5283
5284	have = netpoll_poll_lock(n);
5285
5286	weight = n->weight;
5287
5288	/* This NAPI_STATE_SCHED test is for avoiding a race
5289	 * with netpoll's poll_napi().  Only the entity which
5290	 * obtains the lock and sees NAPI_STATE_SCHED set will
5291	 * actually make the ->poll() call.  Therefore we avoid
5292	 * accidentally calling ->poll() when NAPI is not scheduled.
5293	 */
5294	work = 0;
5295	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5296		work = n->poll(n, weight);
5297		trace_napi_poll(n, work, weight);
 
 
5298	}
5299
5300	WARN_ON_ONCE(work > weight);
 
 
5301
5302	if (likely(work < weight))
5303		goto out_unlock;
5304
5305	/* Drivers must not modify the NAPI state if they
5306	 * consume the entire weight.  In such cases this code
5307	 * still "owns" the NAPI instance and therefore can
5308	 * move the instance around on the list at-will.
5309	 */
5310	if (unlikely(napi_disable_pending(n))) {
5311		napi_complete(n);
5312		goto out_unlock;
 
 
 
 
 
 
 
 
 
 
 
 
 
5313	}
5314
5315	if (n->gro_list) {
5316		/* flush too old packets
5317		 * If HZ < 1000, flush all packets.
5318		 */
5319		napi_gro_flush(n, HZ >= 1000);
5320	}
5321
 
 
5322	/* Some drivers may have called napi_schedule
5323	 * prior to exhausting their budget.
5324	 */
5325	if (unlikely(!list_empty(&n->poll_list))) {
5326		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5327			     n->dev ? n->dev->name : "backlog");
5328		goto out_unlock;
5329	}
5330
5331	list_add_tail(&n->poll_list, repoll);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5332
5333out_unlock:
5334	netpoll_poll_unlock(have);
5335
5336	return work;
5337}
5338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5339static __latent_entropy void net_rx_action(struct softirq_action *h)
5340{
5341	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5342	unsigned long time_limit = jiffies + 2;
5343	int budget = netdev_budget;
 
5344	LIST_HEAD(list);
5345	LIST_HEAD(repoll);
5346
 
 
5347	local_irq_disable();
5348	list_splice_init(&sd->poll_list, &list);
5349	local_irq_enable();
5350
5351	for (;;) {
5352		struct napi_struct *n;
5353
 
 
5354		if (list_empty(&list)) {
5355			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5356				goto out;
 
 
 
 
 
 
 
 
 
 
5357			break;
5358		}
5359
5360		n = list_first_entry(&list, struct napi_struct, poll_list);
5361		budget -= napi_poll(n, &repoll);
5362
5363		/* If softirq window is exhausted then punt.
5364		 * Allow this to run for 2 jiffies since which will allow
5365		 * an average latency of 1.5/HZ.
5366		 */
5367		if (unlikely(budget <= 0 ||
5368			     time_after_eq(jiffies, time_limit))) {
5369			sd->time_squeeze++;
5370			break;
5371		}
5372	}
5373
5374	local_irq_disable();
5375
5376	list_splice_tail_init(&sd->poll_list, &list);
5377	list_splice_tail(&repoll, &list);
5378	list_splice(&list, &sd->poll_list);
5379	if (!list_empty(&sd->poll_list))
5380		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 
5381
5382	net_rps_action_and_irq_enable(sd);
5383out:
5384	__kfree_skb_flush();
5385}
5386
5387struct netdev_adjacent {
5388	struct net_device *dev;
 
5389
5390	/* upper master flag, there can only be one master device per list */
5391	bool master;
5392
 
 
 
5393	/* counter for the number of times this device was added to us */
5394	u16 ref_nr;
5395
5396	/* private field for the users */
5397	void *private;
5398
5399	struct list_head list;
5400	struct rcu_head rcu;
5401};
5402
5403static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5404						 struct list_head *adj_list)
5405{
5406	struct netdev_adjacent *adj;
5407
5408	list_for_each_entry(adj, adj_list, list) {
5409		if (adj->dev == adj_dev)
5410			return adj;
5411	}
5412	return NULL;
5413}
5414
/* Walker callback: non-zero when @upper_dev is the device passed in @data. */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	struct net_device *wanted = data;

	return wanted == upper_dev;
}
5421
5422/**
5423 * netdev_has_upper_dev - Check if device is linked to an upper device
5424 * @dev: device
5425 * @upper_dev: upper device to check
5426 *
5427 * Find out if a device is linked to specified upper device and return true
5428 * in case it is. Note that this checks only immediate upper device,
5429 * not through a complete stack of devices. The caller must hold the RTNL lock.
5430 */
5431bool netdev_has_upper_dev(struct net_device *dev,
5432			  struct net_device *upper_dev)
5433{
 
 
 
 
5434	ASSERT_RTNL();
5435
5436	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5437					     upper_dev);
5438}
5439EXPORT_SYMBOL(netdev_has_upper_dev);
5440
5441/**
5442 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5443 * @dev: device
5444 * @upper_dev: upper device to check
5445 *
5446 * Find out if a device is linked to specified upper device and return true
5447 * in case it is. Note that this checks the entire upper device chain.
5448 * The caller must hold rcu lock.
5449 */
5450
5451bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5452				  struct net_device *upper_dev)
5453{
5454	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5455					       upper_dev);
 
 
 
 
5456}
5457EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5458
5459/**
5460 * netdev_has_any_upper_dev - Check if device is linked to some device
5461 * @dev: device
5462 *
5463 * Find out if a device is linked to an upper device and return true in case
5464 * it is. The caller must hold the RTNL lock.
5465 */
5466static bool netdev_has_any_upper_dev(struct net_device *dev)
5467{
5468	ASSERT_RTNL();
5469
5470	return !list_empty(&dev->adj_list.upper);
5471}
 
5472
5473/**
5474 * netdev_master_upper_dev_get - Get master upper device
5475 * @dev: device
5476 *
5477 * Find a master upper device and return pointer to it or NULL in case
5478 * it's not there. The caller must hold the RTNL lock.
5479 */
5480struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5481{
5482	struct netdev_adjacent *upper;
5483
5484	ASSERT_RTNL();
5485
5486	if (list_empty(&dev->adj_list.upper))
5487		return NULL;
5488
5489	upper = list_first_entry(&dev->adj_list.upper,
5490				 struct netdev_adjacent, list);
5491	if (likely(upper->master))
5492		return upper->dev;
5493	return NULL;
5494}
5495EXPORT_SYMBOL(netdev_master_upper_dev_get);
5496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5497/**
5498 * netdev_has_any_lower_dev - Check if device is linked to some device
5499 * @dev: device
5500 *
5501 * Find out if a device is linked to a lower device and return true in case
5502 * it is. The caller must hold the RTNL lock.
5503 */
5504static bool netdev_has_any_lower_dev(struct net_device *dev)
5505{
5506	ASSERT_RTNL();
5507
5508	return !list_empty(&dev->adj_list.lower);
5509}
5510
5511void *netdev_adjacent_get_private(struct list_head *adj_list)
5512{
5513	struct netdev_adjacent *adj;
5514
5515	adj = list_entry(adj_list, struct netdev_adjacent, list);
5516
5517	return adj->private;
5518}
5519EXPORT_SYMBOL(netdev_adjacent_get_private);
5520
5521/**
5522 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5523 * @dev: device
5524 * @iter: list_head ** of the current position
5525 *
5526 * Gets the next device from the dev's upper list, starting from iter
5527 * position. The caller must hold RCU read lock.
5528 */
5529struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5530						 struct list_head **iter)
5531{
5532	struct netdev_adjacent *upper;
5533
5534	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5535
5536	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5537
5538	if (&upper->list == &dev->adj_list.upper)
5539		return NULL;
5540
5541	*iter = &upper->list;
5542
5543	return upper->dev;
5544}
5545EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5546
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5547static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5548						    struct list_head **iter)
5549{
5550	struct netdev_adjacent *upper;
5551
5552	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5553
5554	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5555
5556	if (&upper->list == &dev->adj_list.upper)
5557		return NULL;
5558
5559	*iter = &upper->list;
5560
5561	return upper->dev;
5562}
5563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5564int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5565				  int (*fn)(struct net_device *dev,
5566					    void *data),
5567				  void *data)
5568{
5569	struct net_device *udev;
5570	struct list_head *iter;
5571	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5572
5573	for (iter = &dev->adj_list.upper,
5574	     udev = netdev_next_upper_dev_rcu(dev, &iter);
5575	     udev;
5576	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5577		/* first is the upper device itself */
5578		ret = fn(udev, data);
5579		if (ret)
5580			return ret;
5581
5582		/* then look at all of its upper devices */
5583		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5584		if (ret)
5585			return ret;
5586	}
5587
5588	return 0;
5589}
5590EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5592/**
5593 * netdev_lower_get_next_private - Get the next ->private from the
5594 *				   lower neighbour list
5595 * @dev: device
5596 * @iter: list_head ** of the current position
5597 *
5598 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5599 * list, starting from iter position. The caller must hold either hold the
5600 * RTNL lock or its own locking that guarantees that the neighbour lower
5601 * list will remain unchanged.
5602 */
5603void *netdev_lower_get_next_private(struct net_device *dev,
5604				    struct list_head **iter)
5605{
5606	struct netdev_adjacent *lower;
5607
5608	lower = list_entry(*iter, struct netdev_adjacent, list);
5609
5610	if (&lower->list == &dev->adj_list.lower)
5611		return NULL;
5612
5613	*iter = lower->list.next;
5614
5615	return lower->private;
5616}
5617EXPORT_SYMBOL(netdev_lower_get_next_private);
5618
5619/**
5620 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5621 *				       lower neighbour list, RCU
5622 *				       variant
5623 * @dev: device
5624 * @iter: list_head ** of the current position
5625 *
5626 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5627 * list, starting from iter position. The caller must hold RCU read lock.
5628 */
5629void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5630					struct list_head **iter)
5631{
5632	struct netdev_adjacent *lower;
5633
5634	WARN_ON_ONCE(!rcu_read_lock_held());
5635
5636	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5637
5638	if (&lower->list == &dev->adj_list.lower)
5639		return NULL;
5640
5641	*iter = &lower->list;
5642
5643	return lower->private;
5644}
5645EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5646
5647/**
5648 * netdev_lower_get_next - Get the next device from the lower neighbour
5649 *                         list
5650 * @dev: device
5651 * @iter: list_head ** of the current position
5652 *
5653 * Gets the next netdev_adjacent from the dev's lower neighbour
5654 * list, starting from iter position. The caller must hold RTNL lock or
5655 * its own locking that guarantees that the neighbour lower
5656 * list will remain unchanged.
5657 */
5658void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5659{
5660	struct netdev_adjacent *lower;
5661
5662	lower = list_entry(*iter, struct netdev_adjacent, list);
5663
5664	if (&lower->list == &dev->adj_list.lower)
5665		return NULL;
5666
5667	*iter = lower->list.next;
5668
5669	return lower->dev;
5670}
5671EXPORT_SYMBOL(netdev_lower_get_next);
5672
5673static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5674						struct list_head **iter)
5675{
5676	struct netdev_adjacent *lower;
5677
5678	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5679
5680	if (&lower->list == &dev->adj_list.lower)
5681		return NULL;
5682
5683	*iter = &lower->list;
5684
5685	return lower->dev;
5686}
5687
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5688int netdev_walk_all_lower_dev(struct net_device *dev,
5689			      int (*fn)(struct net_device *dev,
5690					void *data),
5691			      void *data)
5692{
5693	struct net_device *ldev;
5694	struct list_head *iter;
5695	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5696
5697	for (iter = &dev->adj_list.lower,
5698	     ldev = netdev_next_lower_dev(dev, &iter);
5699	     ldev;
5700	     ldev = netdev_next_lower_dev(dev, &iter)) {
5701		/* first is the lower device itself */
5702		ret = fn(ldev, data);
5703		if (ret)
5704			return ret;
5705
5706		/* then look at all of its lower devices */
5707		ret = netdev_walk_all_lower_dev(ldev, fn, data);
5708		if (ret)
5709			return ret;
5710	}
5711
5712	return 0;
5713}
5714EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5715
5716static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5717						    struct list_head **iter)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5718{
5719	struct netdev_adjacent *lower;
5720
5721	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5722	if (&lower->list == &dev->adj_list.lower)
5723		return NULL;
5724
5725	*iter = &lower->list;
5726
5727	return lower->dev;
5728}
 
5729
5730int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5731				  int (*fn)(struct net_device *dev,
5732					    void *data),
5733				  void *data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5734{
5735	struct net_device *ldev;
5736	struct list_head *iter;
5737	int ret;
 
5738
5739	for (iter = &dev->adj_list.lower,
5740	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
5741	     ldev;
5742	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5743		/* first is the lower device itself */
5744		ret = fn(ldev, data);
5745		if (ret)
5746			return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5747
5748		/* then look at all of its lower devices */
5749		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5750		if (ret)
5751			return ret;
5752	}
5753
5754	return 0;
5755}
5756EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5757
5758/**
5759 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5760 *				       lower neighbour list, RCU
5761 *				       variant
5762 * @dev: device
5763 *
5764 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5765 * list. The caller must hold RCU read lock.
5766 */
5767void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5768{
5769	struct netdev_adjacent *lower;
5770
5771	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5772			struct netdev_adjacent, list);
5773	if (lower)
5774		return lower->private;
5775	return NULL;
5776}
5777EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5778
5779/**
5780 * netdev_master_upper_dev_get_rcu - Get master upper device
5781 * @dev: device
5782 *
5783 * Find a master upper device and return pointer to it or NULL in case
5784 * it's not there. The caller must hold the RCU read lock.
5785 */
5786struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5787{
5788	struct netdev_adjacent *upper;
5789
5790	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5791				       struct netdev_adjacent, list);
5792	if (upper && likely(upper->master))
5793		return upper->dev;
5794	return NULL;
5795}
5796EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5797
5798static int netdev_adjacent_sysfs_add(struct net_device *dev,
5799			      struct net_device *adj_dev,
5800			      struct list_head *dev_list)
5801{
5802	char linkname[IFNAMSIZ+7];
 
5803	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5804		"upper_%s" : "lower_%s", adj_dev->name);
5805	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5806				 linkname);
5807}
5808static void netdev_adjacent_sysfs_del(struct net_device *dev,
5809			       char *name,
5810			       struct list_head *dev_list)
5811{
5812	char linkname[IFNAMSIZ+7];
 
5813	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5814		"upper_%s" : "lower_%s", name);
5815	sysfs_remove_link(&(dev->dev.kobj), linkname);
5816}
5817
5818static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5819						 struct net_device *adj_dev,
5820						 struct list_head *dev_list)
5821{
5822	return (dev_list == &dev->adj_list.upper ||
5823		dev_list == &dev->adj_list.lower) &&
5824		net_eq(dev_net(dev), dev_net(adj_dev));
5825}
5826
5827static int __netdev_adjacent_dev_insert(struct net_device *dev,
5828					struct net_device *adj_dev,
5829					struct list_head *dev_list,
5830					void *private, bool master)
5831{
5832	struct netdev_adjacent *adj;
5833	int ret;
5834
5835	adj = __netdev_find_adj(adj_dev, dev_list);
5836
5837	if (adj) {
5838		adj->ref_nr += 1;
5839		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5840			 dev->name, adj_dev->name, adj->ref_nr);
5841
5842		return 0;
5843	}
5844
5845	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5846	if (!adj)
5847		return -ENOMEM;
5848
5849	adj->dev = adj_dev;
5850	adj->master = master;
5851	adj->ref_nr = 1;
5852	adj->private = private;
5853	dev_hold(adj_dev);
 
5854
5855	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5856		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5857
5858	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5859		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5860		if (ret)
5861			goto free_adj;
5862	}
5863
5864	/* Ensure that master link is always the first item in list. */
5865	if (master) {
5866		ret = sysfs_create_link(&(dev->dev.kobj),
5867					&(adj_dev->dev.kobj), "master");
5868		if (ret)
5869			goto remove_symlinks;
5870
5871		list_add_rcu(&adj->list, dev_list);
5872	} else {
5873		list_add_tail_rcu(&adj->list, dev_list);
5874	}
5875
5876	return 0;
5877
5878remove_symlinks:
5879	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5880		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5881free_adj:
 
5882	kfree(adj);
5883	dev_put(adj_dev);
5884
5885	return ret;
5886}
5887
5888static void __netdev_adjacent_dev_remove(struct net_device *dev,
5889					 struct net_device *adj_dev,
5890					 u16 ref_nr,
5891					 struct list_head *dev_list)
5892{
5893	struct netdev_adjacent *adj;
5894
5895	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5896		 dev->name, adj_dev->name, ref_nr);
5897
5898	adj = __netdev_find_adj(adj_dev, dev_list);
5899
5900	if (!adj) {
5901		pr_err("Adjacency does not exist for device %s from %s\n",
5902		       dev->name, adj_dev->name);
5903		WARN_ON(1);
5904		return;
5905	}
5906
5907	if (adj->ref_nr > ref_nr) {
5908		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5909			 dev->name, adj_dev->name, ref_nr,
5910			 adj->ref_nr - ref_nr);
5911		adj->ref_nr -= ref_nr;
5912		return;
5913	}
5914
5915	if (adj->master)
5916		sysfs_remove_link(&(dev->dev.kobj), "master");
5917
5918	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5919		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5920
5921	list_del_rcu(&adj->list);
5922	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5923		 adj_dev->name, dev->name, adj_dev->name);
5924	dev_put(adj_dev);
5925	kfree_rcu(adj, rcu);
5926}
5927
5928static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5929					    struct net_device *upper_dev,
5930					    struct list_head *up_list,
5931					    struct list_head *down_list,
5932					    void *private, bool master)
5933{
5934	int ret;
5935
5936	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5937					   private, master);
5938	if (ret)
5939		return ret;
5940
5941	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5942					   private, false);
5943	if (ret) {
5944		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5945		return ret;
5946	}
5947
5948	return 0;
5949}
5950
5951static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5952					       struct net_device *upper_dev,
5953					       u16 ref_nr,
5954					       struct list_head *up_list,
5955					       struct list_head *down_list)
5956{
5957	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5958	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5959}
5960
5961static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5962						struct net_device *upper_dev,
5963						void *private, bool master)
5964{
5965	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5966						&dev->adj_list.upper,
5967						&upper_dev->adj_list.lower,
5968						private, master);
5969}
5970
5971static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5972						   struct net_device *upper_dev)
5973{
5974	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5975					   &dev->adj_list.upper,
5976					   &upper_dev->adj_list.lower);
5977}
5978
5979static int __netdev_upper_dev_link(struct net_device *dev,
5980				   struct net_device *upper_dev, bool master,
5981				   void *upper_priv, void *upper_info)
5982{
5983	struct netdev_notifier_changeupper_info changeupper_info;
 
 
 
 
 
 
 
 
 
 
 
 
5984	int ret = 0;
5985
5986	ASSERT_RTNL();
5987
5988	if (dev == upper_dev)
5989		return -EBUSY;
5990
5991	/* To prevent loops, check if dev is not upper device to upper_dev. */
5992	if (netdev_has_upper_dev(upper_dev, dev))
5993		return -EBUSY;
5994
5995	if (netdev_has_upper_dev(dev, upper_dev))
5996		return -EEXIST;
5997
5998	if (master && netdev_master_upper_dev_get(dev))
5999		return -EBUSY;
6000
6001	changeupper_info.upper_dev = upper_dev;
6002	changeupper_info.master = master;
6003	changeupper_info.linking = true;
6004	changeupper_info.upper_info = upper_info;
 
6005
6006	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6007					    &changeupper_info.info);
6008	ret = notifier_to_errno(ret);
6009	if (ret)
6010		return ret;
6011
6012	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6013						   master);
6014	if (ret)
6015		return ret;
6016
6017	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6018					    &changeupper_info.info);
6019	ret = notifier_to_errno(ret);
6020	if (ret)
6021		goto rollback;
6022
 
 
 
 
 
 
 
6023	return 0;
6024
6025rollback:
6026	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6027
6028	return ret;
6029}
6030
6031/**
6032 * netdev_upper_dev_link - Add a link to the upper device
6033 * @dev: device
6034 * @upper_dev: new upper device
 
6035 *
6036 * Adds a link to device which is upper to this one. The caller must hold
6037 * the RTNL lock. On a failure a negative errno code is returned.
6038 * On success the reference counts are adjusted and the function
6039 * returns zero.
6040 */
6041int netdev_upper_dev_link(struct net_device *dev,
6042			  struct net_device *upper_dev)
 
6043{
6044	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
 
 
 
 
 
 
6045}
6046EXPORT_SYMBOL(netdev_upper_dev_link);
6047
6048/**
6049 * netdev_master_upper_dev_link - Add a master link to the upper device
6050 * @dev: device
6051 * @upper_dev: new upper device
6052 * @upper_priv: upper device private
6053 * @upper_info: upper info to be passed down via notifier
 
6054 *
6055 * Adds a link to device which is upper to this one. In this case, only
6056 * one master upper device can be linked, although other non-master devices
6057 * might be linked as well. The caller must hold the RTNL lock.
6058 * On a failure a negative errno code is returned. On success the reference
6059 * counts are adjusted and the function returns zero.
6060 */
6061int netdev_master_upper_dev_link(struct net_device *dev,
6062				 struct net_device *upper_dev,
6063				 void *upper_priv, void *upper_info)
 
6064{
 
 
 
 
 
6065	return __netdev_upper_dev_link(dev, upper_dev, true,
6066				       upper_priv, upper_info);
6067}
6068EXPORT_SYMBOL(netdev_master_upper_dev_link);
6069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6070/**
6071 * netdev_upper_dev_unlink - Removes a link to upper device
6072 * @dev: device
6073 * @upper_dev: new upper device
6074 *
6075 * Removes a link to device which is upper to this one. The caller must hold
6076 * the RTNL lock.
6077 */
6078void netdev_upper_dev_unlink(struct net_device *dev,
6079			     struct net_device *upper_dev)
6080{
6081	struct netdev_notifier_changeupper_info changeupper_info;
6082	ASSERT_RTNL();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6083
6084	changeupper_info.upper_dev = upper_dev;
6085	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6086	changeupper_info.linking = false;
 
 
 
 
 
6087
6088	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6089				      &changeupper_info.info);
6090
6091	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
6092
6093	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6094				      &changeupper_info.info);
6095}
6096EXPORT_SYMBOL(netdev_upper_dev_unlink);
6097
6098/**
6099 * netdev_bonding_info_change - Dispatch event about slave change
6100 * @dev: device
6101 * @bonding_info: info to dispatch
6102 *
6103 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6104 * The caller must hold the RTNL lock.
6105 */
6106void netdev_bonding_info_change(struct net_device *dev,
6107				struct netdev_bonding_info *bonding_info)
6108{
6109	struct netdev_notifier_bonding_info	info;
 
 
6110
6111	memcpy(&info.bonding_info, bonding_info,
6112	       sizeof(struct netdev_bonding_info));
6113	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6114				      &info.info);
6115}
6116EXPORT_SYMBOL(netdev_bonding_info_change);
6117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6118static void netdev_adjacent_add_links(struct net_device *dev)
6119{
6120	struct netdev_adjacent *iter;
6121
6122	struct net *net = dev_net(dev);
6123
6124	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6125		if (!net_eq(net, dev_net(iter->dev)))
6126			continue;
6127		netdev_adjacent_sysfs_add(iter->dev, dev,
6128					  &iter->dev->adj_list.lower);
6129		netdev_adjacent_sysfs_add(dev, iter->dev,
6130					  &dev->adj_list.upper);
6131	}
6132
6133	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6134		if (!net_eq(net, dev_net(iter->dev)))
6135			continue;
6136		netdev_adjacent_sysfs_add(iter->dev, dev,
6137					  &iter->dev->adj_list.upper);
6138		netdev_adjacent_sysfs_add(dev, iter->dev,
6139					  &dev->adj_list.lower);
6140	}
6141}
6142
6143static void netdev_adjacent_del_links(struct net_device *dev)
6144{
6145	struct netdev_adjacent *iter;
6146
6147	struct net *net = dev_net(dev);
6148
6149	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6150		if (!net_eq(net, dev_net(iter->dev)))
6151			continue;
6152		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6153					  &iter->dev->adj_list.lower);
6154		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6155					  &dev->adj_list.upper);
6156	}
6157
6158	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6159		if (!net_eq(net, dev_net(iter->dev)))
6160			continue;
6161		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6162					  &iter->dev->adj_list.upper);
6163		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6164					  &dev->adj_list.lower);
6165	}
6166}
6167
6168void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6169{
6170	struct netdev_adjacent *iter;
6171
6172	struct net *net = dev_net(dev);
6173
6174	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6175		if (!net_eq(net, dev_net(iter->dev)))
6176			continue;
6177		netdev_adjacent_sysfs_del(iter->dev, oldname,
6178					  &iter->dev->adj_list.lower);
6179		netdev_adjacent_sysfs_add(iter->dev, dev,
6180					  &iter->dev->adj_list.lower);
6181	}
6182
6183	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6184		if (!net_eq(net, dev_net(iter->dev)))
6185			continue;
6186		netdev_adjacent_sysfs_del(iter->dev, oldname,
6187					  &iter->dev->adj_list.upper);
6188		netdev_adjacent_sysfs_add(iter->dev, dev,
6189					  &iter->dev->adj_list.upper);
6190	}
6191}
6192
6193void *netdev_lower_dev_get_private(struct net_device *dev,
6194				   struct net_device *lower_dev)
6195{
6196	struct netdev_adjacent *lower;
6197
6198	if (!lower_dev)
6199		return NULL;
6200	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6201	if (!lower)
6202		return NULL;
6203
6204	return lower->private;
6205}
6206EXPORT_SYMBOL(netdev_lower_dev_get_private);
6207
6208
6209int dev_get_nest_level(struct net_device *dev)
6210{
6211	struct net_device *lower = NULL;
6212	struct list_head *iter;
6213	int max_nest = -1;
6214	int nest;
6215
6216	ASSERT_RTNL();
6217
6218	netdev_for_each_lower_dev(dev, lower, iter) {
6219		nest = dev_get_nest_level(lower);
6220		if (max_nest < nest)
6221			max_nest = nest;
6222	}
6223
6224	return max_nest + 1;
6225}
6226EXPORT_SYMBOL(dev_get_nest_level);
6227
6228/**
6229 * netdev_lower_change - Dispatch event about lower device state change
6230 * @lower_dev: device
6231 * @lower_state_info: state to dispatch
6232 *
6233 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6234 * The caller must hold the RTNL lock.
6235 */
6236void netdev_lower_state_changed(struct net_device *lower_dev,
6237				void *lower_state_info)
6238{
6239	struct netdev_notifier_changelowerstate_info changelowerstate_info;
 
 
6240
6241	ASSERT_RTNL();
6242	changelowerstate_info.lower_state_info = lower_state_info;
6243	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6244				      &changelowerstate_info.info);
6245}
6246EXPORT_SYMBOL(netdev_lower_state_changed);
6247
6248int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6249					   struct neighbour *n)
6250{
6251	struct net_device *lower_dev, *stop_dev;
6252	struct list_head *iter;
6253	int err;
6254
6255	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6256		if (!lower_dev->netdev_ops->ndo_neigh_construct)
6257			continue;
6258		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6259		if (err) {
6260			stop_dev = lower_dev;
6261			goto rollback;
6262		}
6263	}
6264	return 0;
6265
6266rollback:
6267	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6268		if (lower_dev == stop_dev)
6269			break;
6270		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6271			continue;
6272		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6273	}
6274	return err;
6275}
6276EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6277
6278void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6279					  struct neighbour *n)
6280{
6281	struct net_device *lower_dev;
6282	struct list_head *iter;
6283
6284	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6285		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6286			continue;
6287		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6288	}
6289}
6290EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6291
6292static void dev_change_rx_flags(struct net_device *dev, int flags)
6293{
6294	const struct net_device_ops *ops = dev->netdev_ops;
6295
6296	if (ops->ndo_change_rx_flags)
6297		ops->ndo_change_rx_flags(dev, flags);
6298}
6299
6300static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6301{
6302	unsigned int old_flags = dev->flags;
6303	kuid_t uid;
6304	kgid_t gid;
6305
6306	ASSERT_RTNL();
6307
6308	dev->flags |= IFF_PROMISC;
6309	dev->promiscuity += inc;
6310	if (dev->promiscuity == 0) {
6311		/*
6312		 * Avoid overflow.
6313		 * If inc causes overflow, untouch promisc and return error.
6314		 */
6315		if (inc < 0)
6316			dev->flags &= ~IFF_PROMISC;
6317		else {
6318			dev->promiscuity -= inc;
6319			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6320				dev->name);
6321			return -EOVERFLOW;
6322		}
6323	}
6324	if (dev->flags != old_flags) {
6325		pr_info("device %s %s promiscuous mode\n",
6326			dev->name,
6327			dev->flags & IFF_PROMISC ? "entered" : "left");
6328		if (audit_enabled) {
6329			current_uid_gid(&uid, &gid);
6330			audit_log(current->audit_context, GFP_ATOMIC,
6331				AUDIT_ANOM_PROMISCUOUS,
6332				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6333				dev->name, (dev->flags & IFF_PROMISC),
6334				(old_flags & IFF_PROMISC),
6335				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6336				from_kuid(&init_user_ns, uid),
6337				from_kgid(&init_user_ns, gid),
6338				audit_get_sessionid(current));
6339		}
6340
6341		dev_change_rx_flags(dev, IFF_PROMISC);
6342	}
6343	if (notify)
6344		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6345	return 0;
6346}
6347
6348/**
6349 *	dev_set_promiscuity	- update promiscuity count on a device
6350 *	@dev: device
6351 *	@inc: modifier
6352 *
6353 *	Add or remove promiscuity from a device. While the count in the device
6354 *	remains above zero the interface remains promiscuous. Once it hits zero
6355 *	the device reverts back to normal filtering operation. A negative inc
6356 *	value is used to drop promiscuity on the device.
6357 *	Return 0 if successful or a negative errno code on error.
6358 */
6359int dev_set_promiscuity(struct net_device *dev, int inc)
6360{
6361	unsigned int old_flags = dev->flags;
6362	int err;
6363
6364	err = __dev_set_promiscuity(dev, inc, true);
6365	if (err < 0)
6366		return err;
6367	if (dev->flags != old_flags)
6368		dev_set_rx_mode(dev);
6369	return err;
6370}
6371EXPORT_SYMBOL(dev_set_promiscuity);
6372
6373static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6374{
6375	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6376
6377	ASSERT_RTNL();
6378
6379	dev->flags |= IFF_ALLMULTI;
6380	dev->allmulti += inc;
6381	if (dev->allmulti == 0) {
6382		/*
6383		 * Avoid overflow.
6384		 * If inc causes overflow, untouch allmulti and return error.
6385		 */
6386		if (inc < 0)
6387			dev->flags &= ~IFF_ALLMULTI;
6388		else {
6389			dev->allmulti -= inc;
6390			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6391				dev->name);
6392			return -EOVERFLOW;
6393		}
6394	}
6395	if (dev->flags ^ old_flags) {
 
 
6396		dev_change_rx_flags(dev, IFF_ALLMULTI);
6397		dev_set_rx_mode(dev);
6398		if (notify)
6399			__dev_notify_flags(dev, old_flags,
6400					   dev->gflags ^ old_gflags);
6401	}
6402	return 0;
6403}
6404
6405/**
6406 *	dev_set_allmulti	- update allmulti count on a device
6407 *	@dev: device
6408 *	@inc: modifier
6409 *
6410 *	Add or remove reception of all multicast frames to a device. While the
6411 *	count in the device remains above zero the interface remains listening
6412 *	to all interfaces. Once it hits zero the device reverts back to normal
6413 *	filtering operation. A negative @inc value is used to drop the counter
6414 *	when releasing a resource needing all multicasts.
6415 *	Return 0 if successful or a negative errno code on error.
6416 */
6417
6418int dev_set_allmulti(struct net_device *dev, int inc)
6419{
6420	return __dev_set_allmulti(dev, inc, true);
6421}
6422EXPORT_SYMBOL(dev_set_allmulti);
6423
6424/*
6425 *	Upload unicast and multicast address lists to device and
6426 *	configure RX filtering. When the device doesn't support unicast
6427 *	filtering it is put in promiscuous mode while unicast addresses
6428 *	are present.
6429 */
6430void __dev_set_rx_mode(struct net_device *dev)
6431{
6432	const struct net_device_ops *ops = dev->netdev_ops;
6433
6434	/* dev_open will call this function so the list will stay sane. */
6435	if (!(dev->flags&IFF_UP))
6436		return;
6437
6438	if (!netif_device_present(dev))
6439		return;
6440
6441	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6442		/* Unicast addresses changes may only happen under the rtnl,
6443		 * therefore calling __dev_set_promiscuity here is safe.
6444		 */
6445		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6446			__dev_set_promiscuity(dev, 1, false);
6447			dev->uc_promisc = true;
6448		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6449			__dev_set_promiscuity(dev, -1, false);
6450			dev->uc_promisc = false;
6451		}
6452	}
6453
6454	if (ops->ndo_set_rx_mode)
6455		ops->ndo_set_rx_mode(dev);
6456}
6457
/*
 *	Locked wrapper: run __dev_set_rx_mode() with the device's address
 *	lock held (bottom halves disabled).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6464
6465/**
6466 *	dev_get_flags - get flags reported to userspace
6467 *	@dev: device
6468 *
6469 *	Get the combination of flag bits exported through APIs to userspace.
6470 */
6471unsigned int dev_get_flags(const struct net_device *dev)
6472{
6473	unsigned int flags;
6474
6475	flags = (dev->flags & ~(IFF_PROMISC |
6476				IFF_ALLMULTI |
6477				IFF_RUNNING |
6478				IFF_LOWER_UP |
6479				IFF_DORMANT)) |
6480		(dev->gflags & (IFF_PROMISC |
6481				IFF_ALLMULTI));
6482
6483	if (netif_running(dev)) {
6484		if (netif_oper_up(dev))
6485			flags |= IFF_RUNNING;
6486		if (netif_carrier_ok(dev))
6487			flags |= IFF_LOWER_UP;
6488		if (netif_dormant(dev))
6489			flags |= IFF_DORMANT;
6490	}
6491
6492	return flags;
6493}
6494EXPORT_SYMBOL(dev_get_flags);
6495
6496int __dev_change_flags(struct net_device *dev, unsigned int flags)
 
6497{
6498	unsigned int old_flags = dev->flags;
6499	int ret;
6500
6501	ASSERT_RTNL();
6502
6503	/*
6504	 *	Set the flags on our device.
6505	 */
6506
6507	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6508			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6509			       IFF_AUTOMEDIA)) |
6510		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6511				    IFF_ALLMULTI));
6512
6513	/*
6514	 *	Load in the correct multicast list now the flags have changed.
6515	 */
6516
6517	if ((old_flags ^ flags) & IFF_MULTICAST)
6518		dev_change_rx_flags(dev, IFF_MULTICAST);
6519
6520	dev_set_rx_mode(dev);
6521
6522	/*
6523	 *	Have we downed the interface. We handle IFF_UP ourselves
6524	 *	according to user attempts to set it, rather than blindly
6525	 *	setting it.
6526	 */
6527
6528	ret = 0;
6529	if ((old_flags ^ flags) & IFF_UP)
6530		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
 
 
 
 
6531
6532	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6533		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6534		unsigned int old_flags = dev->flags;
6535
6536		dev->gflags ^= IFF_PROMISC;
6537
6538		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6539			if (dev->flags != old_flags)
6540				dev_set_rx_mode(dev);
6541	}
6542
6543	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6544	   is important. Some (broken) drivers set IFF_PROMISC, when
6545	   IFF_ALLMULTI is requested not asking us and not reporting.
6546	 */
6547	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6548		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6549
6550		dev->gflags ^= IFF_ALLMULTI;
6551		__dev_set_allmulti(dev, inc, false);
6552	}
6553
6554	return ret;
6555}
6556
6557void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6558			unsigned int gchanges)
 
6559{
6560	unsigned int changes = dev->flags ^ old_flags;
6561
6562	if (gchanges)
6563		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6564
6565	if (changes & IFF_UP) {
6566		if (dev->flags & IFF_UP)
6567			call_netdevice_notifiers(NETDEV_UP, dev);
6568		else
6569			call_netdevice_notifiers(NETDEV_DOWN, dev);
6570	}
6571
6572	if (dev->flags & IFF_UP &&
6573	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6574		struct netdev_notifier_change_info change_info;
 
 
 
 
 
6575
6576		change_info.flags_changed = changes;
6577		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6578					      &change_info.info);
6579	}
6580}
6581
6582/**
6583 *	dev_change_flags - change device settings
6584 *	@dev: device
6585 *	@flags: device state flags
 
6586 *
6587 *	Change settings on device based state flags. The flags are
6588 *	in the userspace exported format.
6589 */
6590int dev_change_flags(struct net_device *dev, unsigned int flags)
 
6591{
6592	int ret;
6593	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6594
6595	ret = __dev_change_flags(dev, flags);
6596	if (ret < 0)
6597		return ret;
6598
6599	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6600	__dev_notify_flags(dev, old_flags, changes);
6601	return ret;
6602}
6603EXPORT_SYMBOL(dev_change_flags);
6604
6605static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6606{
6607	const struct net_device_ops *ops = dev->netdev_ops;
6608
6609	if (ops->ndo_change_mtu)
6610		return ops->ndo_change_mtu(dev, new_mtu);
6611
6612	dev->mtu = new_mtu;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6613	return 0;
6614}
6615
6616/**
6617 *	dev_set_mtu - Change maximum transfer unit
6618 *	@dev: device
6619 *	@new_mtu: new transfer unit
 
6620 *
6621 *	Change the maximum transfer size of the network device.
6622 */
6623int dev_set_mtu(struct net_device *dev, int new_mtu)
 
6624{
6625	int err, orig_mtu;
6626
6627	if (new_mtu == dev->mtu)
6628		return 0;
6629
6630	/* MTU must be positive, and in range */
6631	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6632		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6633				    dev->name, new_mtu, dev->min_mtu);
6634		return -EINVAL;
6635	}
6636
6637	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6638		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6639				    dev->name, new_mtu, dev->max_mtu);
6640		return -EINVAL;
6641	}
6642
6643	if (!netif_device_present(dev))
6644		return -ENODEV;
6645
6646	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6647	err = notifier_to_errno(err);
6648	if (err)
6649		return err;
6650
6651	orig_mtu = dev->mtu;
6652	err = __dev_set_mtu(dev, new_mtu);
6653
6654	if (!err) {
6655		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
6656		err = notifier_to_errno(err);
6657		if (err) {
6658			/* setting mtu back and notifying everyone again,
6659			 * so that they have a chance to revert changes.
6660			 */
6661			__dev_set_mtu(dev, orig_mtu);
6662			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
6663		}
6664	}
6665	return err;
6666}
 
 
 
 
 
 
 
 
 
 
 
 
6667EXPORT_SYMBOL(dev_set_mtu);
6668
6669/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6670 *	dev_set_group - Change group this device belongs to
6671 *	@dev: device
6672 *	@new_group: group this device should belong to
6673 */
6674void dev_set_group(struct net_device *dev, int new_group)
6675{
6676	dev->group = new_group;
6677}
6678EXPORT_SYMBOL(dev_set_group);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6679
6680/**
6681 *	dev_set_mac_address - Change Media Access Control Address
6682 *	@dev: device
6683 *	@sa: new address
 
6684 *
6685 *	Change the hardware (MAC) address of the device
6686 */
6687int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 
6688{
6689	const struct net_device_ops *ops = dev->netdev_ops;
6690	int err;
6691
6692	if (!ops->ndo_set_mac_address)
6693		return -EOPNOTSUPP;
6694	if (sa->sa_family != dev->type)
6695		return -EINVAL;
6696	if (!netif_device_present(dev))
6697		return -ENODEV;
6698	err = ops->ndo_set_mac_address(dev, sa);
6699	if (err)
6700		return err;
 
 
 
 
 
6701	dev->addr_assign_type = NET_ADDR_SET;
6702	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6703	add_device_randomness(dev->dev_addr, dev->addr_len);
6704	return 0;
6705}
6706EXPORT_SYMBOL(dev_set_mac_address);
6707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6708/**
6709 *	dev_change_carrier - Change device carrier
6710 *	@dev: device
6711 *	@new_carrier: new value
6712 *
6713 *	Change device carrier
6714 */
6715int dev_change_carrier(struct net_device *dev, bool new_carrier)
6716{
6717	const struct net_device_ops *ops = dev->netdev_ops;
6718
6719	if (!ops->ndo_change_carrier)
6720		return -EOPNOTSUPP;
6721	if (!netif_device_present(dev))
6722		return -ENODEV;
6723	return ops->ndo_change_carrier(dev, new_carrier);
6724}
6725EXPORT_SYMBOL(dev_change_carrier);
6726
6727/**
6728 *	dev_get_phys_port_id - Get device physical port ID
6729 *	@dev: device
6730 *	@ppid: port ID
6731 *
6732 *	Get device physical port ID
6733 */
6734int dev_get_phys_port_id(struct net_device *dev,
6735			 struct netdev_phys_item_id *ppid)
6736{
6737	const struct net_device_ops *ops = dev->netdev_ops;
6738
6739	if (!ops->ndo_get_phys_port_id)
6740		return -EOPNOTSUPP;
6741	return ops->ndo_get_phys_port_id(dev, ppid);
6742}
6743EXPORT_SYMBOL(dev_get_phys_port_id);
6744
6745/**
6746 *	dev_get_phys_port_name - Get device physical port name
6747 *	@dev: device
6748 *	@name: port name
6749 *	@len: limit of bytes to copy to name
6750 *
6751 *	Get device physical port name
6752 */
6753int dev_get_phys_port_name(struct net_device *dev,
6754			   char *name, size_t len)
6755{
6756	const struct net_device_ops *ops = dev->netdev_ops;
 
 
 
 
 
 
 
 
 
6757
6758	if (!ops->ndo_get_phys_port_name)
6759		return -EOPNOTSUPP;
6760	return ops->ndo_get_phys_port_name(dev, name, len);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6761}
6762EXPORT_SYMBOL(dev_get_phys_port_name);
6763
6764/**
6765 *	dev_change_proto_down - update protocol port state information
 
6766 *	@dev: device
6767 *	@proto_down: new value
6768 *
6769 *	This info can be used by switch drivers to set the phys state of the
6770 *	port.
6771 */
6772int dev_change_proto_down(struct net_device *dev, bool proto_down)
6773{
6774	const struct net_device_ops *ops = dev->netdev_ops;
6775
6776	if (!ops->ndo_change_proto_down)
6777		return -EOPNOTSUPP;
6778	if (!netif_device_present(dev))
6779		return -ENODEV;
6780	return ops->ndo_change_proto_down(dev, proto_down);
 
 
 
 
 
6781}
6782EXPORT_SYMBOL(dev_change_proto_down);
6783
6784/**
6785 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 
6786 *	@dev: device
6787 *	@fd: new program fd or negative value to clear
6788 *	@flags: xdp-related flags
6789 *
6790 *	Set or clear a bpf program for a device
6791 */
6792int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6793{
6794	const struct net_device_ops *ops = dev->netdev_ops;
6795	struct bpf_prog *prog = NULL;
6796	struct netdev_xdp xdp;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6797	int err;
6798
6799	ASSERT_RTNL();
6800
6801	if (!ops->ndo_xdp)
6802		return -EOPNOTSUPP;
6803	if (fd >= 0) {
6804		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6805			memset(&xdp, 0, sizeof(xdp));
6806			xdp.command = XDP_QUERY_PROG;
6807
6808			err = ops->ndo_xdp(dev, &xdp);
6809			if (err < 0)
6810				return err;
6811			if (xdp.prog_attached)
6812				return -EBUSY;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6813		}
6814
6815		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6816		if (IS_ERR(prog))
6817			return PTR_ERR(prog);
6818	}
6819
6820	memset(&xdp, 0, sizeof(xdp));
6821	xdp.command = XDP_SETUP_PROG;
6822	xdp.prog = prog;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6823
6824	err = ops->ndo_xdp(dev, &xdp);
6825	if (err < 0 && prog)
6826		bpf_prog_put(prog);
6827
6828	return err;
 
 
 
 
 
 
 
6829}
6830EXPORT_SYMBOL(dev_change_xdp_fd);
6831
6832/**
6833 *	dev_new_index	-	allocate an ifindex
6834 *	@net: the applicable net namespace
6835 *
6836 *	Returns a suitable unique value for a new device interface
6837 *	number.  The caller must hold the rtnl semaphore or the
6838 *	dev_base_lock to be sure it remains unique.
6839 */
6840static int dev_new_index(struct net *net)
6841{
6842	int ifindex = net->ifindex;
6843	for (;;) {
6844		if (++ifindex <= 0)
6845			ifindex = 1;
6846		if (!__dev_get_by_index(net, ifindex))
6847			return net->ifindex = ifindex;
 
 
 
 
6848	}
 
 
 
 
 
 
 
 
6849}
6850
/*
 * Delayed registration/unregistration.
 * net_todo_list collects the devices queued by net_set_todo().
 */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6854
6855static void net_set_todo(struct net_device *dev)
 
6856{
6857	list_add_tail(&dev->todo_list, &net_todo_list);
6858	dev_net(dev)->dev_unreg_count++;
 
 
 
 
 
 
 
 
6859}
6860
6861static void rollback_registered_many(struct list_head *head)
 
6862{
6863	struct net_device *dev, *tmp;
6864	LIST_HEAD(close_head);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6865
6866	BUG_ON(dev_boot_phase);
6867	ASSERT_RTNL();
 
 
 
 
 
 
6868
6869	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6870		/* Some devices call without registering
6871		 * for initialization unwind. Remove those
6872		 * devices and proceed with the remaining.
6873		 */
6874		if (dev->reg_state == NETREG_UNINITIALIZED) {
6875			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6876				 dev->name, dev);
6877
6878			WARN_ON(1);
6879			list_del(&dev->unreg_list);
6880			continue;
6881		}
6882		dev->dismantle = true;
6883		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6884	}
6885
6886	/* If device is running, close it first. */
6887	list_for_each_entry(dev, head, unreg_list)
6888		list_add_tail(&dev->close_list, &close_head);
6889	dev_close_many(&close_head, true);
 
 
 
 
 
6890
6891	list_for_each_entry(dev, head, unreg_list) {
6892		/* And unlink it from device chain. */
6893		unlist_netdevice(dev);
6894
6895		dev->reg_state = NETREG_UNREGISTERING;
 
 
 
 
6896	}
6897	flush_all_backlogs();
6898
6899	synchronize_net();
 
 
 
6900
6901	list_for_each_entry(dev, head, unreg_list) {
6902		struct sk_buff *skb = NULL;
6903
6904		/* Shutdown queueing discipline. */
6905		dev_shutdown(dev);
 
 
6906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6907
6908		/* Notify protocols, that we are about to destroy
6909		   this device. They should clean all the things.
6910		*/
6911		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6912
6913		if (!dev->rtnl_link_ops ||
6914		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6915			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6916						     GFP_KERNEL);
 
 
6917
6918		/*
6919		 *	Flush the unicast and multicast chains
6920		 */
6921		dev_uc_flush(dev);
6922		dev_mc_flush(dev);
 
 
 
 
6923
6924		if (dev->netdev_ops->ndo_uninit)
6925			dev->netdev_ops->ndo_uninit(dev);
6926
6927		if (skb)
6928			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 
 
 
 
 
6929
6930		/* Notifier chain MUST detach us all upper devices. */
6931		WARN_ON(netdev_has_any_upper_dev(dev));
6932		WARN_ON(netdev_has_any_lower_dev(dev));
 
 
 
 
 
 
 
 
 
 
 
6933
6934		/* Remove entries from kobject tree */
6935		netdev_unregister_kobject(dev);
6936#ifdef CONFIG_XPS
6937		/* Remove XPS queueing entries */
6938		netif_reset_xps_queues_gt(dev, 0);
6939#endif
6940	}
6941
6942	synchronize_net();
 
 
 
 
 
 
6943
6944	list_for_each_entry(dev, head, unreg_list)
6945		dev_put(dev);
6946}
6947
6948static void rollback_registered(struct net_device *dev)
6949{
6950	LIST_HEAD(single);
 
 
 
 
 
 
6951
6952	list_add(&dev->unreg_list, &single);
6953	rollback_registered_many(&single);
6954	list_del(&single);
 
6955}
6956
6957static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6958	struct net_device *upper, netdev_features_t features)
6959{
6960	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6961	netdev_features_t feature;
6962	int feature_bit;
6963
6964	for_each_netdev_feature(&upper_disables, feature_bit) {
6965		feature = __NETIF_F_BIT(feature_bit);
6966		if (!(upper->wanted_features & feature)
6967		    && (features & feature)) {
6968			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6969				   &feature, upper->name);
6970			features &= ~feature;
6971		}
6972	}
6973
6974	return features;
6975}
6976
6977static void netdev_sync_lower_features(struct net_device *upper,
6978	struct net_device *lower, netdev_features_t features)
6979{
6980	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6981	netdev_features_t feature;
6982	int feature_bit;
6983
6984	for_each_netdev_feature(&upper_disables, feature_bit) {
6985		feature = __NETIF_F_BIT(feature_bit);
6986		if (!(features & feature) && (lower->features & feature)) {
6987			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6988				   &feature, lower->name);
6989			lower->wanted_features &= ~feature;
6990			netdev_update_features(lower);
6991
6992			if (unlikely(lower->features & feature))
6993				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6994					    &feature, lower->name);
 
 
6995		}
6996	}
6997}
6998
6999static netdev_features_t netdev_fix_features(struct net_device *dev,
7000	netdev_features_t features)
7001{
7002	/* Fix illegal checksum combinations */
7003	if ((features & NETIF_F_HW_CSUM) &&
7004	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7005		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7006		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7007	}
7008
7009	/* TSO requires that SG is present as well. */
7010	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7011		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7012		features &= ~NETIF_F_ALL_TSO;
7013	}
7014
7015	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7016					!(features & NETIF_F_IP_CSUM)) {
7017		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7018		features &= ~NETIF_F_TSO;
7019		features &= ~NETIF_F_TSO_ECN;
7020	}
7021
7022	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7023					 !(features & NETIF_F_IPV6_CSUM)) {
7024		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7025		features &= ~NETIF_F_TSO6;
7026	}
7027
7028	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7029	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7030		features &= ~NETIF_F_TSO_MANGLEID;
7031
7032	/* TSO ECN requires that TSO is present as well. */
7033	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7034		features &= ~NETIF_F_TSO_ECN;
7035
7036	/* Software GSO depends on SG. */
7037	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7038		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7039		features &= ~NETIF_F_GSO;
7040	}
7041
7042	/* UFO needs SG and checksumming */
7043	if (features & NETIF_F_UFO) {
7044		/* maybe split UFO into V4 and V6? */
7045		if (!(features & NETIF_F_HW_CSUM) &&
7046		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
7047		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
7048			netdev_dbg(dev,
7049				"Dropping NETIF_F_UFO since no checksum offload features.\n");
7050			features &= ~NETIF_F_UFO;
7051		}
7052
7053		if (!(features & NETIF_F_SG)) {
7054			netdev_dbg(dev,
7055				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
7056			features &= ~NETIF_F_UFO;
7057		}
7058	}
7059
7060	/* GSO partial features require GSO partial be set */
7061	if ((features & dev->gso_partial_features) &&
7062	    !(features & NETIF_F_GSO_PARTIAL)) {
7063		netdev_dbg(dev,
7064			   "Dropping partially supported GSO features since no GSO partial.\n");
7065		features &= ~dev->gso_partial_features;
7066	}
7067
7068#ifdef CONFIG_NET_RX_BUSY_POLL
7069	if (dev->netdev_ops->ndo_busy_poll)
7070		features |= NETIF_F_BUSY_POLL;
7071	else
7072#endif
7073		features &= ~NETIF_F_BUSY_POLL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7074
7075	return features;
7076}
7077
7078int __netdev_update_features(struct net_device *dev)
7079{
7080	struct net_device *upper, *lower;
7081	netdev_features_t features;
7082	struct list_head *iter;
7083	int err = -1;
7084
7085	ASSERT_RTNL();
7086
7087	features = netdev_get_wanted_features(dev);
7088
7089	if (dev->netdev_ops->ndo_fix_features)
7090		features = dev->netdev_ops->ndo_fix_features(dev, features);
7091
7092	/* driver might be less strict about feature dependencies */
7093	features = netdev_fix_features(dev, features);
7094
7095	/* some features can't be enabled if they're off an an upper device */
7096	netdev_for_each_upper_dev_rcu(dev, upper, iter)
7097		features = netdev_sync_upper_features(dev, upper, features);
7098
7099	if (dev->features == features)
7100		goto sync_lower;
7101
7102	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7103		&dev->features, &features);
7104
7105	if (dev->netdev_ops->ndo_set_features)
7106		err = dev->netdev_ops->ndo_set_features(dev, features);
7107	else
7108		err = 0;
7109
7110	if (unlikely(err < 0)) {
7111		netdev_err(dev,
7112			"set_features() failed (%d); wanted %pNF, left %pNF\n",
7113			err, &features, &dev->features);
7114		/* return non-0 since some features might have changed and
7115		 * it's better to fire a spurious notification than miss it
7116		 */
7117		return -1;
7118	}
7119
7120sync_lower:
7121	/* some features must be disabled on lower devices when disabled
7122	 * on an upper device (think: bonding master or bridge)
7123	 */
7124	netdev_for_each_lower_dev(dev, lower, iter)
7125		netdev_sync_lower_features(dev, lower, features);
7126
7127	if (!err)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7128		dev->features = features;
 
7129
7130	return err < 0 ? 0 : 1;
7131}
7132
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate the dev->features set and notify listeners when
 *	__netdev_update_features() reports a change. Call this after
 *	driver or hardware dependent conditions that influence the
 *	features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7147
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate the dev->features set and always send the change
 *	notification, whether or not the set changed. Use this rather
 *	than netdev_update_features() when dev->vlan_features might have
 *	changed too, so that the change reaches stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The "changed" result is deliberately ignored here. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7164
7165/**
7166 *	netif_stacked_transfer_operstate -	transfer operstate
7167 *	@rootdev: the root or lower level device to transfer state from
7168 *	@dev: the device to transfer operstate to
7169 *
7170 *	Transfer operational state from root to device. This is normally
7171 *	called when a stacking relationship exists between the root
7172 *	device and the device(a leaf device).
7173 */
7174void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7175					struct net_device *dev)
7176{
7177	if (rootdev->operstate == IF_OPER_DORMANT)
7178		netif_dormant_on(dev);
7179	else
7180		netif_dormant_off(dev);
7181
7182	if (netif_carrier_ok(rootdev)) {
7183		if (!netif_carrier_ok(dev))
7184			netif_carrier_on(dev);
7185	} else {
7186		if (netif_carrier_ok(dev))
7187			netif_carrier_off(dev);
7188	}
 
 
7189}
7190EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7191
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.  kzalloc()
 * is tried first, with vzalloc() as the fallback.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	size_t bytes = nr_queues * sizeof(*queues);
	unsigned int q;

	BUG_ON(nr_queues < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (q = 0; q < nr_queues; q++)
		queues[q].dev = dev;

	return 0;
}
#endif
7214
7215static void netdev_init_one_queue(struct net_device *dev,
7216				  struct netdev_queue *queue, void *_unused)
7217{
7218	/* Initialize queue lock */
7219	spin_lock_init(&queue->_xmit_lock);
7220	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7221	queue->xmit_lock_owner = -1;
7222	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7223	queue->dev = dev;
7224#ifdef CONFIG_BQL
7225	dql_init(&queue->dql, HZ);
7226#endif
7227}
7228
7229static void netif_free_tx_queues(struct net_device *dev)
7230{
7231	kvfree(dev->_tx);
7232}
7233
7234static int netif_alloc_netdev_queues(struct net_device *dev)
7235{
7236	unsigned int count = dev->num_tx_queues;
7237	struct netdev_queue *tx;
7238	size_t sz = count * sizeof(*tx);
7239
7240	if (count < 1 || count > 0xffff)
7241		return -EINVAL;
7242
7243	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7244	if (!tx) {
7245		tx = vzalloc(sz);
7246		if (!tx)
7247			return -ENOMEM;
7248	}
7249	dev->_tx = tx;
7250
7251	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7252	spin_lock_init(&dev->tx_global_lock);
7253
7254	return 0;
7255}
7256
7257void netif_tx_stop_all_queues(struct net_device *dev)
7258{
7259	unsigned int i;
7260
7261	for (i = 0; i < dev->num_tx_queues; i++) {
7262		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
7263		netif_tx_stop_queue(txq);
7264	}
7265}
7266EXPORT_SYMBOL(netif_tx_stop_all_queues);
7267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7268/**
7269 *	register_netdevice	- register a network device
7270 *	@dev: device to register
7271 *
7272 *	Take a completed network device structure and add it to the kernel
7273 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7274 *	chain. 0 is returned on success. A negative errno code is returned
7275 *	on a failure to set up the device, or if the name is a duplicate.
7276 *
7277 *	Callers must hold the rtnl semaphore. You may want
7278 *	register_netdev() instead of this.
7279 *
7280 *	BUGS:
7281 *	The locking appears insufficient to guarantee two parallel registers
7282 *	will not get the same name.
7283 */
7284
7285int register_netdevice(struct net_device *dev)
7286{
7287	int ret;
7288	struct net *net = dev_net(dev);
7289
 
 
7290	BUG_ON(dev_boot_phase);
7291	ASSERT_RTNL();
7292
7293	might_sleep();
7294
7295	/* When net_device's are persistent, this will be fatal. */
7296	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7297	BUG_ON(!net);
7298
 
 
 
 
7299	spin_lock_init(&dev->addr_list_lock);
7300	netdev_set_addr_lockdep_class(dev);
7301
7302	ret = dev_get_valid_name(net, dev, dev->name);
7303	if (ret < 0)
7304		goto out;
7305
 
 
 
 
 
7306	/* Init, if this function is available */
7307	if (dev->netdev_ops->ndo_init) {
7308		ret = dev->netdev_ops->ndo_init(dev);
7309		if (ret) {
7310			if (ret > 0)
7311				ret = -EIO;
7312			goto out;
7313		}
7314	}
7315
7316	if (((dev->hw_features | dev->features) &
7317	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7318	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7319	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7320		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7321		ret = -EINVAL;
7322		goto err_uninit;
7323	}
7324
7325	ret = -EBUSY;
7326	if (!dev->ifindex)
7327		dev->ifindex = dev_new_index(net);
7328	else if (__dev_get_by_index(net, dev->ifindex))
7329		goto err_uninit;
7330
 
 
 
 
 
7331	/* Transfer changeable features to wanted_features and enable
7332	 * software offloads (GSO and GRO).
7333	 */
7334	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7335	dev->features |= NETIF_F_SOFT_FEATURES;
 
 
 
 
 
 
7336	dev->wanted_features = dev->features & dev->hw_features;
7337
7338	if (!(dev->flags & IFF_LOOPBACK))
7339		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7340
7341	/* If IPv4 TCP segmentation offload is supported we should also
7342	 * allow the device to enable segmenting the frame with the option
7343	 * of ignoring a static IP ID value.  This doesn't enable the
7344	 * feature itself but allows the user to enable it later.
7345	 */
7346	if (dev->hw_features & NETIF_F_TSO)
7347		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7348	if (dev->vlan_features & NETIF_F_TSO)
7349		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7350	if (dev->mpls_features & NETIF_F_TSO)
7351		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7352	if (dev->hw_enc_features & NETIF_F_TSO)
7353		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7354
7355	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7356	 */
7357	dev->vlan_features |= NETIF_F_HIGHDMA;
7358
7359	/* Make NETIF_F_SG inheritable to tunnel devices.
7360	 */
7361	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7362
7363	/* Make NETIF_F_SG inheritable to MPLS.
7364	 */
7365	dev->mpls_features |= NETIF_F_SG;
7366
7367	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7368	ret = notifier_to_errno(ret);
7369	if (ret)
7370		goto err_uninit;
7371
7372	ret = netdev_register_kobject(dev);
 
 
 
7373	if (ret)
7374		goto err_uninit;
7375	dev->reg_state = NETREG_REGISTERED;
7376
7377	__netdev_update_features(dev);
7378
7379	/*
7380	 *	Default initial state at registry is that the
7381	 *	device is present.
7382	 */
7383
7384	set_bit(__LINK_STATE_PRESENT, &dev->state);
7385
7386	linkwatch_init_dev(dev);
7387
7388	dev_init_scheduler(dev);
7389	dev_hold(dev);
 
7390	list_netdevice(dev);
 
7391	add_device_randomness(dev->dev_addr, dev->addr_len);
7392
7393	/* If the device has permanent device address, driver should
7394	 * set dev_addr and also addr_assign_type should be set to
7395	 * NET_ADDR_PERM (default value).
7396	 */
7397	if (dev->addr_assign_type == NET_ADDR_PERM)
7398		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7399
7400	/* Notify protocols, that a new device appeared. */
7401	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7402	ret = notifier_to_errno(ret);
7403	if (ret) {
7404		rollback_registered(dev);
7405		dev->reg_state = NETREG_UNREGISTERED;
 
 
7406	}
7407	/*
7408	 *	Prevent userspace races by waiting until the network
7409	 *	device is fully setup before sending notifications.
7410	 */
7411	if (!dev->rtnl_link_ops ||
7412	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7413		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7414
7415out:
7416	return ret;
7417
 
 
 
 
 
 
7418err_uninit:
7419	if (dev->netdev_ops->ndo_uninit)
7420		dev->netdev_ops->ndo_uninit(dev);
 
 
 
 
7421	goto out;
7422}
7423EXPORT_SYMBOL(register_netdevice);
7424
7425/**
7426 *	init_dummy_netdev	- init a dummy network device for NAPI
7427 *	@dev: device to init
7428 *
7429 *	This takes a network device structure and initialize the minimum
7430 *	amount of fields so it can be used to schedule NAPI polls without
7431 *	registering a full blown interface. This is to be used by drivers
7432 *	that need to tie several hardware interfaces to a single NAPI
7433 *	poll scheduler due to HW limitations.
7434 */
7435int init_dummy_netdev(struct net_device *dev)
7436{
7437	/* Clear everything. Note we don't initialize spinlocks
7438	 * are they aren't supposed to be taken by any of the
7439	 * NAPI code and this dummy netdev is supposed to be
7440	 * only ever used for NAPI polls
7441	 */
7442	memset(dev, 0, sizeof(struct net_device));
7443
7444	/* make sure we BUG if trying to hit standard
7445	 * register/unregister code path
7446	 */
7447	dev->reg_state = NETREG_DUMMY;
7448
7449	/* NAPI wants this */
7450	INIT_LIST_HEAD(&dev->napi_list);
7451
7452	/* a dummy interface is started by default */
7453	set_bit(__LINK_STATE_PRESENT, &dev->state);
7454	set_bit(__LINK_STATE_START, &dev->state);
7455
 
 
 
7456	/* Note : We dont allocate pcpu_refcnt for dummy devices,
7457	 * because users of this 'device' dont need to change
7458	 * its refcount.
7459	 */
7460
7461	return 0;
7462}
7463EXPORT_SYMBOL_GPL(init_dummy_netdev);
7464
7465
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked wrapper around register_netdevice(): takes the rtnl
 *	semaphore, registers @dev and releases the semaphore again.
 *	Returns whatever register_netdevice() returned, i.e. 0 on success
 *	or a negative errno code on failure.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
7489
7490int netdev_refcnt_read(const struct net_device *dev)
7491{
 
7492	int i, refcnt = 0;
7493
7494	for_each_possible_cpu(i)
7495		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7496	return refcnt;
 
 
 
7497}
7498EXPORT_SYMBOL(netdev_refcnt_read);
7499
 
 
 
 
7500/**
7501 * netdev_wait_allrefs - wait until all references are gone.
7502 * @dev: target net_device
7503 *
7504 * This is called when unregistering network devices.
7505 *
7506 * Any protocol or device that holds a reference should register
7507 * for netdevice notification, and cleanup and put back the
7508 * reference if they receive an UNREGISTER event.
7509 * We can get stuck here if buggy protocols don't correctly
7510 * call dev_put.
7511 */
7512static void netdev_wait_allrefs(struct net_device *dev)
7513{
7514	unsigned long rebroadcast_time, warning_time;
7515	int refcnt;
 
7516
7517	linkwatch_forget_dev(dev);
7518
7519	rebroadcast_time = warning_time = jiffies;
7520	refcnt = netdev_refcnt_read(dev);
 
7521
7522	while (refcnt != 0) {
7523		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7524			rtnl_lock();
7525
7526			/* Rebroadcast unregister notification */
7527			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
7528
7529			__rtnl_unlock();
7530			rcu_barrier();
7531			rtnl_lock();
7532
7533			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7534			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7535				     &dev->state)) {
7536				/* We must not have linkwatch events
7537				 * pending on unregister. If this
7538				 * happens, we simply run the queue
7539				 * unscheduled, resulting in a noop
7540				 * for this device.
7541				 */
7542				linkwatch_run_queue();
7543			}
 
7544
7545			__rtnl_unlock();
7546
7547			rebroadcast_time = jiffies;
7548		}
7549
7550		msleep(250);
 
 
 
 
 
 
7551
7552		refcnt = netdev_refcnt_read(dev);
 
 
 
 
 
 
 
 
 
 
7553
7554		if (time_after(jiffies, warning_time + 10 * HZ)) {
7555			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7556				 dev->name, refcnt);
7557			warning_time = jiffies;
7558		}
7559	}
7560}
7561
7562/* The sequence is:
7563 *
7564 *	rtnl_lock();
7565 *	...
7566 *	register_netdevice(x1);
7567 *	register_netdevice(x2);
7568 *	...
7569 *	unregister_netdevice(y1);
7570 *	unregister_netdevice(y2);
7571 *      ...
7572 *	rtnl_unlock();
7573 *	free_netdev(y1);
7574 *	free_netdev(y2);
7575 *
7576 * We are invoked by rtnl_unlock().
7577 * This allows us to deal with problems:
7578 * 1) We can delete sysfs objects which invoke hotplug
7579 *    without deadlocking with linkwatch via keventd.
7580 * 2) Since we run with the RTNL semaphore not held, we can sleep
7581 *    safely in order to wait for the netdev refcnt to drop to zero.
7582 *
7583 * We must not return until all unregister events added during
7584 * the interval the lock was held have been completed.
7585 */
7586void netdev_run_todo(void)
7587{
 
7588	struct list_head list;
 
 
 
 
 
 
 
 
 
 
 
 
 
7589
7590	/* Snapshot list, allow later requests */
7591	list_replace_init(&net_todo_list, &list);
7592
7593	__rtnl_unlock();
7594
7595
7596	/* Wait for rcu callbacks to finish before next phase */
7597	if (!list_empty(&list))
7598		rcu_barrier();
7599
7600	while (!list_empty(&list)) {
7601		struct net_device *dev
7602			= list_first_entry(&list, struct net_device, todo_list);
7603		list_del(&dev->todo_list);
7604
7605		rtnl_lock();
7606		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7607		__rtnl_unlock();
7608
7609		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7610			pr_err("network todo '%s' but state %d\n",
7611			       dev->name, dev->reg_state);
7612			dump_stack();
7613			continue;
7614		}
7615
 
7616		dev->reg_state = NETREG_UNREGISTERED;
 
 
 
7617
7618		netdev_wait_allrefs(dev);
 
 
7619
7620		/* paranoia */
7621		BUG_ON(netdev_refcnt_read(dev));
7622		BUG_ON(!list_empty(&dev->ptype_all));
7623		BUG_ON(!list_empty(&dev->ptype_specific));
7624		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7625		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7626		WARN_ON(dev->dn_ptr);
7627
7628		if (dev->destructor)
7629			dev->destructor(dev);
 
 
 
7630
7631		/* Report a network device has been unregistered */
7632		rtnl_lock();
7633		dev_net(dev)->dev_unreg_count--;
7634		__rtnl_unlock();
7635		wake_up(&netdev_unregistering_wq);
7636
7637		/* Free network device */
7638		kobject_put(&dev->dev.kobj);
7639	}
7640}
7641
7642/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7643 * all the same fields in the same order as net_device_stats, with only
7644 * the type differing, but rtnl_link_stats64 may have additional fields
7645 * at the end for newer counters.
7646 */
7647void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7648			     const struct net_device_stats *netdev_stats)
7649{
7650#if BITS_PER_LONG == 64
7651	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7652	memcpy(stats64, netdev_stats, sizeof(*stats64));
7653	/* zero out counters that only exist in rtnl_link_stats64 */
7654	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7655	       sizeof(*stats64) - sizeof(*netdev_stats));
7656#else
7657	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7658	const unsigned long *src = (const unsigned long *)netdev_stats;
7659	u64 *dst = (u64 *)stats64;
7660
7661	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7662	for (i = 0; i < n; i++)
7663		dst[i] = src[i];
7664	/* zero out counters that only exist in rtnl_link_stats64 */
7665	memset((char *)stats64 + n * sizeof(u64), 0,
7666	       sizeof(*stats64) - n * sizeof(u64));
7667#endif
7668}
7669EXPORT_SYMBOL(netdev_stats_to_stats64);
7670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7671/**
7672 *	dev_get_stats	- get network device statistics
7673 *	@dev: device to get statistics from
7674 *	@storage: place to store stats
7675 *
7676 *	Get network statistics from device. Return @storage.
7677 *	The device driver may provide its own method by setting
7678 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7679 *	otherwise the internal statistics structure is used.
7680 */
7681struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7682					struct rtnl_link_stats64 *storage)
7683{
7684	const struct net_device_ops *ops = dev->netdev_ops;
 
7685
7686	if (ops->ndo_get_stats64) {
7687		memset(storage, 0, sizeof(*storage));
7688		ops->ndo_get_stats64(dev, storage);
7689	} else if (ops->ndo_get_stats) {
7690		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7691	} else {
7692		netdev_stats_to_stats64(storage, &dev->stats);
7693	}
7694	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7695	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7696	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
 
 
 
 
 
 
 
 
 
 
 
 
7697	return storage;
7698}
7699EXPORT_SYMBOL(dev_get_stats);
7700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
7718
7719static const struct ethtool_ops default_ethtool_ops;
7720
7721void netdev_set_default_ethtool_ops(struct net_device *dev,
7722				    const struct ethtool_ops *ops)
7723{
7724	if (dev->ethtool_ops == &default_ethtool_ops)
7725		dev->ethtool_ops = ops;
7726}
7727EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7729void netdev_freemem(struct net_device *dev)
7730{
7731	char *addr = (char *)dev - dev->padded;
7732
7733	kvfree(addr);
7734}
7735
7736/**
7737 *	alloc_netdev_mqs - allocate network device
7738 *	@sizeof_priv:		size of private data to allocate space for
7739 *	@name:			device name format string
7740 *	@name_assign_type: 	origin of device name
7741 *	@setup:			callback to initialize device
7742 *	@txqs:			the number of TX subqueues to allocate
7743 *	@rxqs:			the number of RX subqueues to allocate
7744 *
7745 *	Allocates a struct net_device with private data area for driver use
7746 *	and performs basic initialization.  Also allocates subqueue structs
7747 *	for each queue on the device.
7748 */
7749struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7750		unsigned char name_assign_type,
7751		void (*setup)(struct net_device *),
7752		unsigned int txqs, unsigned int rxqs)
7753{
7754	struct net_device *dev;
7755	size_t alloc_size;
7756	struct net_device *p;
7757
7758	BUG_ON(strlen(name) >= sizeof(dev->name));
7759
7760	if (txqs < 1) {
7761		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7762		return NULL;
7763	}
7764
7765#ifdef CONFIG_SYSFS
7766	if (rxqs < 1) {
7767		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7768		return NULL;
7769	}
7770#endif
7771
7772	alloc_size = sizeof(struct net_device);
7773	if (sizeof_priv) {
7774		/* ensure 32-byte alignment of private area */
7775		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7776		alloc_size += sizeof_priv;
7777	}
7778	/* ensure 32-byte alignment of whole construct */
7779	alloc_size += NETDEV_ALIGN - 1;
7780
7781	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7782	if (!p)
7783		p = vzalloc(alloc_size);
7784	if (!p)
7785		return NULL;
7786
7787	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7788	dev->padded = (char *)dev - (char *)p;
7789
 
 
7790	dev->pcpu_refcnt = alloc_percpu(int);
7791	if (!dev->pcpu_refcnt)
7792		goto free_dev;
 
 
 
 
7793
7794	if (dev_addr_init(dev))
7795		goto free_pcpu;
7796
7797	dev_mc_init(dev);
7798	dev_uc_init(dev);
7799
7800	dev_net_set(dev, &init_net);
7801
7802	dev->gso_max_size = GSO_MAX_SIZE;
 
7803	dev->gso_max_segs = GSO_MAX_SEGS;
 
 
 
 
 
 
 
 
 
 
 
7804
7805	INIT_LIST_HEAD(&dev->napi_list);
7806	INIT_LIST_HEAD(&dev->unreg_list);
7807	INIT_LIST_HEAD(&dev->close_list);
7808	INIT_LIST_HEAD(&dev->link_watch_list);
7809	INIT_LIST_HEAD(&dev->adj_list.upper);
7810	INIT_LIST_HEAD(&dev->adj_list.lower);
7811	INIT_LIST_HEAD(&dev->ptype_all);
7812	INIT_LIST_HEAD(&dev->ptype_specific);
 
7813#ifdef CONFIG_NET_SCHED
7814	hash_init(dev->qdisc_hash);
7815#endif
7816	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7817	setup(dev);
7818
7819	if (!dev->tx_queue_len) {
7820		dev->priv_flags |= IFF_NO_QUEUE;
7821		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7822	}
7823
7824	dev->num_tx_queues = txqs;
7825	dev->real_num_tx_queues = txqs;
7826	if (netif_alloc_netdev_queues(dev))
7827		goto free_all;
7828
7829#ifdef CONFIG_SYSFS
7830	dev->num_rx_queues = rxqs;
7831	dev->real_num_rx_queues = rxqs;
7832	if (netif_alloc_rx_queues(dev))
7833		goto free_all;
7834#endif
7835
7836	strcpy(dev->name, name);
7837	dev->name_assign_type = name_assign_type;
7838	dev->group = INIT_NETDEV_GROUP;
7839	if (!dev->ethtool_ops)
7840		dev->ethtool_ops = &default_ethtool_ops;
7841
7842	nf_hook_ingress_init(dev);
7843
7844	return dev;
7845
7846free_all:
7847	free_netdev(dev);
7848	return NULL;
7849
7850free_pcpu:
 
7851	free_percpu(dev->pcpu_refcnt);
7852free_dev:
 
7853	netdev_freemem(dev);
7854	return NULL;
7855}
7856EXPORT_SYMBOL(alloc_netdev_mqs);
7857
7858/**
7859 *	free_netdev - free network device
7860 *	@dev: device
7861 *
7862 *	This function does the last stage of destroying an allocated device
7863 * 	interface. The reference to the device object is released.
7864 *	If this is the last reference then it will be freed.
7865 *	Must be called in process context.
7866 */
7867void free_netdev(struct net_device *dev)
7868{
7869	struct napi_struct *p, *n;
7870
7871	might_sleep();
 
 
 
 
 
 
 
 
 
 
 
7872	netif_free_tx_queues(dev);
7873#ifdef CONFIG_SYSFS
7874	kvfree(dev->_rx);
7875#endif
7876
7877	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7878
7879	/* Flush device addresses */
7880	dev_addr_flush(dev);
7881
7882	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7883		netif_napi_del(p);
7884
 
 
7885	free_percpu(dev->pcpu_refcnt);
7886	dev->pcpu_refcnt = NULL;
 
 
 
 
 
7887
7888	/*  Compatibility with error handling in drivers */
7889	if (dev->reg_state == NETREG_UNINITIALIZED) {
7890		netdev_freemem(dev);
7891		return;
7892	}
7893
7894	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7895	dev->reg_state = NETREG_RELEASED;
7896
7897	/* will free via device release */
7898	put_device(&dev->dev);
7899}
7900EXPORT_SYMBOL(free_netdev);
7901
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait until packets that are currently being received are done
 *	with. Packets arriving later are not held back. The expedited
 *	RCU grace period is used when RTNL is locked.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7917
7918/**
7919 *	unregister_netdevice_queue - remove device from the kernel
7920 *	@dev: device
7921 *	@head: list
7922 *
7923 *	This function shuts down a device interface and removes it
7924 *	from the kernel tables.
7925 *	If head not NULL, device is queued to be unregistered later.
7926 *
7927 *	Callers must hold the rtnl semaphore.  You may want
7928 *	unregister_netdev() instead of this.
7929 */
7930
7931void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7932{
7933	ASSERT_RTNL();
7934
7935	if (head) {
7936		list_move_tail(&dev->unreg_list, head);
7937	} else {
7938		rollback_registered(dev);
7939		/* Finish processing unregister after unlock */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7940		net_set_todo(dev);
7941	}
 
 
7942}
7943EXPORT_SYMBOL(unregister_netdevice_queue);
7944
7945/**
7946 *	unregister_netdevice_many - unregister many devices
7947 *	@head: list of devices
7948 *
7949 *  Note: As most callers use a stack allocated list_head,
7950 *  we force a list_del() to make sure stack wont be corrupted later.
7951 */
7952void unregister_netdevice_many(struct list_head *head)
7953{
7954	struct net_device *dev;
7955
7956	if (!list_empty(head)) {
7957		rollback_registered_many(head);
7958		list_for_each_entry(dev, head, unreg_list)
7959			net_set_todo(dev);
7960		list_del(head);
7961	}
7962}
7963EXPORT_SYMBOL(unregister_netdevice_many);
7964
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 *
 *	The rtnl lock is taken and released here, so presumably the
 *	caller must not already hold it (confirm against rtnl_lock()).
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7983
7984/**
7985 *	dev_change_net_namespace - move device to different nethost namespace
7986 *	@dev: device
7987 *	@net: network namespace
7988 *	@pat: If not NULL name pattern to try if the current device name
7989 *	      is already taken in the destination network namespace.
 
 
7990 *
7991 *	This function shuts down a device interface and moves it
7992 *	to a new network namespace. On success 0 is returned, on
7993 *	a failure a netagive errno code is returned.
7994 *
7995 *	Callers must hold the rtnl semaphore.
7996 */
7997
7998int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 
7999{
8000	int err;
 
 
 
8001
8002	ASSERT_RTNL();
8003
8004	/* Don't allow namespace local devices to be moved. */
8005	err = -EINVAL;
8006	if (dev->features & NETIF_F_NETNS_LOCAL)
8007		goto out;
8008
8009	/* Ensure the device has been registrered */
8010	if (dev->reg_state != NETREG_REGISTERED)
8011		goto out;
8012
8013	/* Get out if there is nothing todo */
8014	err = 0;
8015	if (net_eq(dev_net(dev), net))
8016		goto out;
8017
8018	/* Pick the destination device name, and ensure
8019	 * we can use it in the destination network namespace.
8020	 */
8021	err = -EEXIST;
8022	if (__dev_get_by_name(net, dev->name)) {
8023		/* We get here if we can't use the current device name */
8024		if (!pat)
8025			goto out;
8026		if (dev_get_valid_name(net, dev, pat) < 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8027			goto out;
 
8028	}
8029
8030	/*
8031	 * And now a mini version of register_netdevice unregister_netdevice.
8032	 */
8033
8034	/* If device is running close it first. */
8035	dev_close(dev);
8036
8037	/* And unlink it from device chain */
8038	err = -ENODEV;
8039	unlist_netdevice(dev);
8040
8041	synchronize_net();
8042
8043	/* Shutdown queueing discipline. */
8044	dev_shutdown(dev);
8045
8046	/* Notify protocols, that we are about to destroy
8047	   this device. They should clean all the things.
8048
8049	   Note that dev->reg_state stays at NETREG_REGISTERED.
8050	   This is wanted because this way 8021q and macvlan know
8051	   the device is just moving and can keep their slaves up.
8052	*/
8053	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8054	rcu_barrier();
8055	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8056	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
 
 
 
8057
8058	/*
8059	 *	Flush the unicast and multicast chains
8060	 */
8061	dev_uc_flush(dev);
8062	dev_mc_flush(dev);
8063
8064	/* Send a netdev-removed uevent to the old namespace */
8065	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8066	netdev_adjacent_del_links(dev);
8067
 
 
 
8068	/* Actually switch the network namespace */
8069	dev_net_set(dev, net);
 
8070
8071	/* If there is an ifindex conflict assign a new one */
8072	if (__dev_get_by_index(net, dev->ifindex))
8073		dev->ifindex = dev_new_index(net);
 
 
 
 
 
8074
8075	/* Send a netdev-add uevent to the new namespace */
8076	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8077	netdev_adjacent_add_links(dev);
8078
8079	/* Fixup kobjects */
8080	err = device_rename(&dev->dev, dev->name);
 
 
8081	WARN_ON(err);
8082
8083	/* Add the device back in the hashes */
8084	list_netdevice(dev);
8085
8086	/* Notify protocols, that a new device appeared. */
8087	call_netdevice_notifiers(NETDEV_REGISTER, dev);
8088
8089	/*
8090	 *	Prevent userspace races by waiting until the network
8091	 *	device is fully setup before sending notifications.
8092	 */
8093	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8094
8095	synchronize_net();
8096	err = 0;
8097out:
8098	return err;
8099}
8100EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8101
8102static int dev_cpu_dead(unsigned int oldcpu)
8103{
8104	struct sk_buff **list_skb;
8105	struct sk_buff *skb;
8106	unsigned int cpu;
8107	struct softnet_data *sd, *oldsd;
8108
8109	local_irq_disable();
8110	cpu = smp_processor_id();
8111	sd = &per_cpu(softnet_data, cpu);
8112	oldsd = &per_cpu(softnet_data, oldcpu);
8113
8114	/* Find end of our completion_queue. */
8115	list_skb = &sd->completion_queue;
8116	while (*list_skb)
8117		list_skb = &(*list_skb)->next;
8118	/* Append completion queue from offline CPU. */
8119	*list_skb = oldsd->completion_queue;
8120	oldsd->completion_queue = NULL;
8121
8122	/* Append output queue from offline CPU. */
8123	if (oldsd->output_queue) {
8124		*sd->output_queue_tailp = oldsd->output_queue;
8125		sd->output_queue_tailp = oldsd->output_queue_tailp;
8126		oldsd->output_queue = NULL;
8127		oldsd->output_queue_tailp = &oldsd->output_queue;
8128	}
8129	/* Append NAPI poll list from offline CPU, with one exception :
8130	 * process_backlog() must be called by cpu owning percpu backlog.
8131	 * We properly handle process_queue & input_pkt_queue later.
8132	 */
8133	while (!list_empty(&oldsd->poll_list)) {
8134		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8135							    struct napi_struct,
8136							    poll_list);
8137
8138		list_del_init(&napi->poll_list);
8139		if (napi->poll == process_backlog)
8140			napi->state = 0;
8141		else
8142			____napi_schedule(sd, napi);
8143	}
8144
8145	raise_softirq_irqoff(NET_TX_SOFTIRQ);
8146	local_irq_enable();
8147
 
 
 
 
 
 
 
8148	/* Process offline CPU's input_pkt_queue */
8149	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8150		netif_rx_ni(skb);
8151		input_queue_head_incr(oldsd);
8152	}
8153	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8154		netif_rx_ni(skb);
8155		input_queue_head_incr(oldsd);
8156	}
8157
8158	return 0;
8159}
8160
8161/**
8162 *	netdev_increment_features - increment feature set by one
8163 *	@all: current feature set
8164 *	@one: new feature set
8165 *	@mask: mask feature set
8166 *
8167 *	Computes a new feature set after adding a device with feature set
8168 *	@one to the master device with current feature set @all.  Will not
8169 *	enable anything that is off in @mask. Returns the new feature set.
8170 */
8171netdev_features_t netdev_increment_features(netdev_features_t all,
8172	netdev_features_t one, netdev_features_t mask)
8173{
8174	if (mask & NETIF_F_HW_CSUM)
8175		mask |= NETIF_F_CSUM_MASK;
8176	mask |= NETIF_F_VLAN_CHALLENGED;
8177
8178	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8179	all &= one | ~NETIF_F_ALL_FOR_ALL;
8180
8181	/* If one device supports hw checksumming, set for all. */
8182	if (all & NETIF_F_HW_CSUM)
8183		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8184
8185	return all;
8186}
8187EXPORT_SYMBOL(netdev_increment_features);
8188
8189static struct hlist_head * __net_init netdev_create_hash(void)
8190{
8191	int i;
8192	struct hlist_head *hash;
8193
8194	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8195	if (hash != NULL)
8196		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8197			INIT_HLIST_HEAD(&hash[i]);
8198
8199	return hash;
8200}
8201
8202/* Initialize per network namespace state */
8203static int __net_init netdev_init(struct net *net)
8204{
8205	if (net != &init_net)
8206		INIT_LIST_HEAD(&net->dev_base_head);
 
 
8207
8208	net->dev_name_head = netdev_create_hash();
8209	if (net->dev_name_head == NULL)
8210		goto err_name;
8211
8212	net->dev_index_head = netdev_create_hash();
8213	if (net->dev_index_head == NULL)
8214		goto err_idx;
8215
 
 
 
 
8216	return 0;
8217
8218err_idx:
8219	kfree(net->dev_name_head);
8220err_name:
8221	return -ENOMEM;
8222}
8223
8224/**
8225 *	netdev_drivername - network driver for the device
8226 *	@dev: network device
8227 *
8228 *	Determine network driver for device.
8229 */
8230const char *netdev_drivername(const struct net_device *dev)
8231{
8232	const struct device_driver *driver;
8233	const struct device *parent;
8234	const char *empty = "";
8235
8236	parent = dev->dev.parent;
8237	if (!parent)
8238		return empty;
8239
8240	driver = parent->driver;
8241	if (driver && driver->name)
8242		return driver->name;
8243	return empty;
8244}
8245
8246static void __netdev_printk(const char *level, const struct net_device *dev,
8247			    struct va_format *vaf)
8248{
8249	if (dev && dev->dev.parent) {
8250		dev_printk_emit(level[1] - '0',
8251				dev->dev.parent,
8252				"%s %s %s%s: %pV",
8253				dev_driver_string(dev->dev.parent),
8254				dev_name(dev->dev.parent),
8255				netdev_name(dev), netdev_reg_state(dev),
8256				vaf);
8257	} else if (dev) {
8258		printk("%s%s%s: %pV",
8259		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8260	} else {
8261		printk("%s(NULL net_device): %pV", level, vaf);
8262	}
8263}
8264
8265void netdev_printk(const char *level, const struct net_device *dev,
8266		   const char *format, ...)
8267{
8268	struct va_format vaf;
8269	va_list args;
8270
8271	va_start(args, format);
8272
8273	vaf.fmt = format;
8274	vaf.va = &args;
8275
8276	__netdev_printk(level, dev, &vaf);
8277
8278	va_end(args);
8279}
8280EXPORT_SYMBOL(netdev_printk);
8281
8282#define define_netdev_printk_level(func, level)			\
8283void func(const struct net_device *dev, const char *fmt, ...)	\
8284{								\
8285	struct va_format vaf;					\
8286	va_list args;						\
8287								\
8288	va_start(args, fmt);					\
8289								\
8290	vaf.fmt = fmt;						\
8291	vaf.va = &args;						\
8292								\
8293	__netdev_printk(level, dev, &vaf);			\
8294								\
8295	va_end(args);						\
8296}								\
8297EXPORT_SYMBOL(func);
8298
8299define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8300define_netdev_printk_level(netdev_alert, KERN_ALERT);
8301define_netdev_printk_level(netdev_crit, KERN_CRIT);
8302define_netdev_printk_level(netdev_err, KERN_ERR);
8303define_netdev_printk_level(netdev_warn, KERN_WARNING);
8304define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8305define_netdev_printk_level(netdev_info, KERN_INFO);
8306
/*
 * Per network namespace teardown: release the by-name and by-index
 * hash tables that netdev_init() allocated for @net.
 */
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}
8312
/*
 * Per-namespace hooks that create and free the device hash tables;
 * registered with register_pernet_subsys() from net_dev_init().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
8317
8318static void __net_exit default_device_exit(struct net *net)
8319{
 
8320	struct net_device *dev, *aux;
8321	/*
8322	 * Push all migratable network devices back to the
8323	 * initial network namespace
8324	 */
8325	rtnl_lock();
8326	for_each_netdev_safe(net, dev, aux) {
8327		int err;
8328		char fb_name[IFNAMSIZ];
8329
8330		/* Ignore unmoveable devices (i.e. loopback) */
8331		if (dev->features & NETIF_F_NETNS_LOCAL)
8332			continue;
8333
8334		/* Leave virtual devices for the generic cleanup */
8335		if (dev->rtnl_link_ops)
8336			continue;
8337
8338		/* Push remaining network devices to init_net */
8339		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 
 
 
 
 
 
 
 
 
 
8340		err = dev_change_net_namespace(dev, &init_net, fb_name);
8341		if (err) {
8342			pr_emerg("%s: failed to move %s to init_net: %d\n",
8343				 __func__, dev->name, err);
8344			BUG();
8345		}
8346	}
8347	rtnl_unlock();
8348}
8349
8350static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8351{
8352	/* Return with the rtnl_lock held when there are no network
8353	 * devices unregistering in any network namespace in net_list.
8354	 */
8355	struct net *net;
8356	bool unregistering;
8357	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8358
8359	add_wait_queue(&netdev_unregistering_wq, &wait);
8360	for (;;) {
8361		unregistering = false;
8362		rtnl_lock();
8363		list_for_each_entry(net, net_list, exit_list) {
8364			if (net->dev_unreg_count > 0) {
8365				unregistering = true;
8366				break;
8367			}
8368		}
8369		if (!unregistering)
8370			break;
8371		__rtnl_unlock();
8372
8373		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8374	}
8375	remove_wait_queue(&netdev_unregistering_wq, &wait);
8376}
8377
8378static void __net_exit default_device_exit_batch(struct list_head *net_list)
8379{
8380	/* At exit all network devices most be removed from a network
8381	 * namespace.  Do this in the reverse order of registration.
8382	 * Do this across as many network namespaces as possible to
8383	 * improve batching efficiency.
8384	 */
8385	struct net_device *dev;
8386	struct net *net;
8387	LIST_HEAD(dev_kill_list);
8388
8389	/* To prevent network device cleanup code from dereferencing
8390	 * loopback devices or network devices that have been freed
8391	 * wait here for all pending unregistrations to complete,
8392	 * before unregistring the loopback device and allowing the
8393	 * network namespace be freed.
8394	 *
8395	 * The netdev todo list containing all network devices
8396	 * unregistrations that happen in default_device_exit_batch
8397	 * will run in the rtnl_unlock() at the end of
8398	 * default_device_exit_batch.
8399	 */
8400	rtnl_lock_unregistering(net_list);
8401	list_for_each_entry(net, net_list, exit_list) {
8402		for_each_netdev_reverse(net, dev) {
8403			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8404				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8405			else
8406				unregister_netdevice_queue(dev, &dev_kill_list);
8407		}
8408	}
8409	unregister_netdevice_many(&dev_kill_list);
8410	rtnl_unlock();
8411}
8412
/*
 * Namespace exit hooks for network devices: .exit moves migratable
 * devices back to init_net, .exit_batch unregisters whatever is left.
 * Registered with register_pernet_device() from net_dev_init().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
8417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8418/*
8419 *	Initialize the DEV module. At boot time this walks the device list and
8420 *	unhooks any devices that fail to initialise (normally hardware not
8421 *	present) and leaves us with a valid list of present and active devices.
8422 *
8423 */
8424
8425/*
8426 *       This is called single threaded during boot, so no need
8427 *       to take the rtnl semaphore.
8428 */
8429static int __init net_dev_init(void)
8430{
8431	int i, rc = -ENOMEM;
8432
8433	BUG_ON(!dev_boot_phase);
8434
 
 
8435	if (dev_proc_init())
8436		goto out;
8437
8438	if (netdev_kobject_init())
8439		goto out;
8440
8441	INIT_LIST_HEAD(&ptype_all);
8442	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8443		INIT_LIST_HEAD(&ptype_base[i]);
8444
8445	INIT_LIST_HEAD(&offload_base);
8446
8447	if (register_pernet_subsys(&netdev_net_ops))
8448		goto out;
8449
8450	/*
8451	 *	Initialise the packet receive queues.
8452	 */
8453
8454	for_each_possible_cpu(i) {
8455		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8456		struct softnet_data *sd = &per_cpu(softnet_data, i);
8457
8458		INIT_WORK(flush, flush_backlog);
8459
8460		skb_queue_head_init(&sd->input_pkt_queue);
8461		skb_queue_head_init(&sd->process_queue);
 
 
 
8462		INIT_LIST_HEAD(&sd->poll_list);
8463		sd->output_queue_tailp = &sd->output_queue;
8464#ifdef CONFIG_RPS
8465		sd->csd.func = rps_trigger_softirq;
8466		sd->csd.info = sd;
8467		sd->cpu = i;
8468#endif
 
 
8469
 
8470		sd->backlog.poll = process_backlog;
8471		sd->backlog.weight = weight_p;
8472	}
8473
8474	dev_boot_phase = 0;
8475
8476	/* The loopback device is special if any other network devices
8477	 * is present in a network namespace the loopback device must
8478	 * be present. Since we now dynamically allocate and free the
8479	 * loopback device ensure this invariant is maintained by
8480	 * keeping the loopback device as the first device on the
8481	 * list of network devices.  Ensuring the loopback devices
8482	 * is the first device that appears and the last network device
8483	 * that disappears.
8484	 */
8485	if (register_pernet_device(&loopback_net_ops))
8486		goto out;
8487
8488	if (register_pernet_device(&default_device_ops))
8489		goto out;
8490
8491	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8492	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8493
8494	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8495				       NULL, dev_cpu_dead);
8496	WARN_ON(rc < 0);
8497	dst_subsys_init();
8498	rc = 0;
8499out:
8500	return rc;
8501}
8502
8503subsys_initcall(net_dev_init);