Linux Audio

Check our new training course

Loading...
v6.9.4
    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
 
 
 
 
 
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitmap.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
 
   93#include <linux/skbuff.h>
   94#include <linux/kthread.h>
   95#include <linux/bpf.h>
   96#include <linux/bpf_trace.h>
   97#include <net/net_namespace.h>
   98#include <net/sock.h>
   99#include <net/busy_poll.h>
  100#include <linux/rtnetlink.h>
  101#include <linux/stat.h>
  102#include <net/dsa.h>
  103#include <net/dst.h>
  104#include <net/dst_metadata.h>
  105#include <net/gro.h>
  106#include <net/pkt_sched.h>
  107#include <net/pkt_cls.h>
  108#include <net/checksum.h>
  109#include <net/xfrm.h>
  110#include <net/tcx.h>
  111#include <linux/highmem.h>
  112#include <linux/init.h>
  113#include <linux/module.h>
  114#include <linux/netpoll.h>
  115#include <linux/rcupdate.h>
  116#include <linux/delay.h>
  117#include <net/iw_handler.h>
  118#include <asm/current.h>
  119#include <linux/audit.h>
  120#include <linux/dmaengine.h>
  121#include <linux/err.h>
  122#include <linux/ctype.h>
  123#include <linux/if_arp.h>
  124#include <linux/if_vlan.h>
  125#include <linux/ip.h>
  126#include <net/ip.h>
  127#include <net/mpls.h>
  128#include <linux/ipv6.h>
  129#include <linux/in.h>
  130#include <linux/jhash.h>
  131#include <linux/random.h>
  132#include <trace/events/napi.h>
  133#include <trace/events/net.h>
  134#include <trace/events/skb.h>
  135#include <trace/events/qdisc.h>
  136#include <trace/events/xdp.h>
  137#include <linux/inetdevice.h>
  138#include <linux/cpu_rmap.h>
  139#include <linux/static_key.h>
  140#include <linux/hashtable.h>
  141#include <linux/vmalloc.h>
  142#include <linux/if_macvlan.h>
  143#include <linux/errqueue.h>
  144#include <linux/hrtimer.h>
  145#include <linux/netfilter_netdev.h>
  146#include <linux/crash_dump.h>
  147#include <linux/sctp.h>
  148#include <net/udp_tunnel.h>
  149#include <linux/net_namespace.h>
  150#include <linux/indirect_call_wrapper.h>
  151#include <net/devlink.h>
  152#include <linux/pm_runtime.h>
  153#include <linux/prandom.h>
  154#include <linux/once_lite.h>
  155#include <net/netdev_rx_queue.h>
  156#include <net/page_pool/types.h>
  157#include <net/page_pool/helpers.h>
  158#include <net/rps.h>
  159
  160#include "dev.h"
  161#include "net-sysfs.h"
  162
 
 
 
 
 
 
/* Taken by dev_add_pack() and __dev_remove_pack() around their RCU list
 * insert/remove of a packet_type, i.e. it serializes writers of the
 * packet-type handler lists.
 */
  163static DEFINE_SPINLOCK(ptype_lock);

/* Hash table of protocol handler lists. ptype_head() picks the bucket
 * ntohs(type) & PTYPE_HASH_MASK for handlers that are neither ETH_P_ALL
 * nor bound to a specific device.
 */
  164struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 
 
  165
  166static int netif_rx_internal(struct sk_buff *skb);
  167static int call_netdevice_notifiers_extack(unsigned long val,
  168					   struct net_device *dev,
  169					   struct netlink_ext_ack *extack);
  170
  171static DEFINE_MUTEX(ifalias_mutex);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  172
  173/* protects napi_hash addition/deletion and napi_gen_id */
  174static DEFINE_SPINLOCK(napi_hash_lock);
  175
  176static unsigned int napi_gen_id = NR_CPUS;
  177static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  178
  179static DECLARE_RWSEM(devnet_rename_sem);
  180
  181static inline void dev_base_seq_inc(struct net *net)
  182{
  183	unsigned int val = net->dev_base_seq + 1;
  184
  185	WRITE_ONCE(net->dev_base_seq, val ?: 1);
  186}
  187
  188static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  189{
  190	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  191
  192	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  193}
  194
  195static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  196{
  197	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  198}
  199
  200static inline void rps_lock_irqsave(struct softnet_data *sd,
  201				    unsigned long *flags)
  202{
  203	if (IS_ENABLED(CONFIG_RPS))
  204		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
  205	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  206		local_irq_save(*flags);
  207}
  208
  209static inline void rps_lock_irq_disable(struct softnet_data *sd)
  210{
  211	if (IS_ENABLED(CONFIG_RPS))
  212		spin_lock_irq(&sd->input_pkt_queue.lock);
  213	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  214		local_irq_disable();
  215}
  216
  217static inline void rps_unlock_irq_restore(struct softnet_data *sd,
  218					  unsigned long *flags)
  219{
  220	if (IS_ENABLED(CONFIG_RPS))
  221		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
  222	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  223		local_irq_restore(*flags);
  224}
  225
  226static inline void rps_unlock_irq_enable(struct softnet_data *sd)
  227{
  228	if (IS_ENABLED(CONFIG_RPS))
  229		spin_unlock_irq(&sd->input_pkt_queue.lock);
  230	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  231		local_irq_enable();
  232}
  233
  234static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  235						       const char *name)
  236{
  237	struct netdev_name_node *name_node;
  238
  239	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  240	if (!name_node)
  241		return NULL;
  242	INIT_HLIST_NODE(&name_node->hlist);
  243	name_node->dev = dev;
  244	name_node->name = name;
  245	return name_node;
  246}
  247
  248static struct netdev_name_node *
  249netdev_name_node_head_alloc(struct net_device *dev)
  250{
  251	struct netdev_name_node *name_node;
  252
  253	name_node = netdev_name_node_alloc(dev, dev->name);
  254	if (!name_node)
  255		return NULL;
  256	INIT_LIST_HEAD(&name_node->list);
  257	return name_node;
  258}
  259
  260static void netdev_name_node_free(struct netdev_name_node *name_node)
  261{
  262	kfree(name_node);
  263}
  264
  265static void netdev_name_node_add(struct net *net,
  266				 struct netdev_name_node *name_node)
  267{
  268	hlist_add_head_rcu(&name_node->hlist,
  269			   dev_name_hash(net, name_node->name));
  270}
  271
  272static void netdev_name_node_del(struct netdev_name_node *name_node)
  273{
  274	hlist_del_rcu(&name_node->hlist);
  275}
  276
  277static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  278							const char *name)
  279{
  280	struct hlist_head *head = dev_name_hash(net, name);
  281	struct netdev_name_node *name_node;
  282
  283	hlist_for_each_entry(name_node, head, hlist)
  284		if (!strcmp(name_node->name, name))
  285			return name_node;
  286	return NULL;
  287}
  288
  289static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  290							    const char *name)
  291{
  292	struct hlist_head *head = dev_name_hash(net, name);
  293	struct netdev_name_node *name_node;
  294
  295	hlist_for_each_entry_rcu(name_node, head, hlist)
  296		if (!strcmp(name_node->name, name))
  297			return name_node;
  298	return NULL;
  299}
  300
  301bool netdev_name_in_use(struct net *net, const char *name)
  302{
  303	return netdev_name_node_lookup(net, name);
  304}
  305EXPORT_SYMBOL(netdev_name_in_use);
  306
  307int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  308{
  309	struct netdev_name_node *name_node;
  310	struct net *net = dev_net(dev);
  311
  312	name_node = netdev_name_node_lookup(net, name);
  313	if (name_node)
  314		return -EEXIST;
  315	name_node = netdev_name_node_alloc(dev, name);
  316	if (!name_node)
  317		return -ENOMEM;
  318	netdev_name_node_add(net, name_node);
  319	/* The node that holds dev->name acts as a head of per-device list. */
  320	list_add_tail_rcu(&name_node->list, &dev->name_node->list);
  321
  322	return 0;
  323}
  324
  325static void netdev_name_node_alt_free(struct rcu_head *head)
  326{
  327	struct netdev_name_node *name_node =
  328		container_of(head, struct netdev_name_node, rcu);
  329
  330	kfree(name_node->name);
  331	netdev_name_node_free(name_node);
  332}
  333
  334static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  335{
  336	netdev_name_node_del(name_node);
  337	list_del(&name_node->list);
  338	call_rcu(&name_node->rcu, netdev_name_node_alt_free);
  339}
  340
  341int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  342{
  343	struct netdev_name_node *name_node;
  344	struct net *net = dev_net(dev);
  345
  346	name_node = netdev_name_node_lookup(net, name);
  347	if (!name_node)
  348		return -ENOENT;
  349	/* lookup might have found our primary name or a name belonging
  350	 * to another device.
  351	 */
  352	if (name_node == dev->name_node || name_node->dev != dev)
  353		return -EINVAL;
  354
  355	__netdev_name_node_alt_destroy(name_node);
  356	return 0;
  357}
  358
  359static void netdev_name_node_alt_flush(struct net_device *dev)
  360{
  361	struct netdev_name_node *name_node, *tmp;
  362
  363	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
  364		list_del(&name_node->list);
  365		netdev_name_node_alt_free(&name_node->rcu);
  366	}
  367}
  368
  369/* Device list insertion */
  370static void list_netdevice(struct net_device *dev)
  371{
  372	struct netdev_name_node *name_node;
  373	struct net *net = dev_net(dev);
  374
  375	ASSERT_RTNL();
  376
 
  377	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  378	netdev_name_node_add(net, dev->name_node);
  379	hlist_add_head_rcu(&dev->index_hlist,
  380			   dev_index_hash(net, dev->ifindex));
  381
  382	netdev_for_each_altname(dev, name_node)
  383		netdev_name_node_add(net, name_node);
  384
  385	/* We reserved the ifindex, this can't fail */
  386	WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
  387
  388	dev_base_seq_inc(net);
  389}
  390
  391/* Device list removal
  392 * caller must respect a RCU grace period before freeing/reusing dev
  393 */
  394static void unlist_netdevice(struct net_device *dev)
  395{
  396	struct netdev_name_node *name_node;
  397	struct net *net = dev_net(dev);
  398
  399	ASSERT_RTNL();
  400
  401	xa_erase(&net->dev_by_index, dev->ifindex);
  402
  403	netdev_for_each_altname(dev, name_node)
  404		netdev_name_node_del(name_node);
  405
  406	/* Unlink dev from the device chain */
 
  407	list_del_rcu(&dev->dev_list);
  408	netdev_name_node_del(dev->name_node);
  409	hlist_del_rcu(&dev->index_hlist);
 
  410
  411	dev_base_seq_inc(dev_net(dev));
  412}
  413
  414/*
  415 *	Our notifier list
  416 */
  417
  418static RAW_NOTIFIER_HEAD(netdev_chain);
  419
  420/*
  421 *	Device drivers call our routines to queue packets here. We empty the
  422 *	queue in the local softnet handler.
  423 */
  424
  425DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  426EXPORT_PER_CPU_SYMBOL(softnet_data);
  427
  428/* Page_pool has a lockless array/stack to alloc/recycle pages.
  429 * PP consumers must pay attention to run APIs in the appropriate context
  430 * (e.g. NAPI context).
  431 */
  432static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
  433
  434#ifdef CONFIG_LOCKDEP
  435/*
  436 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  437 * according to dev->type
  438 */
/* Device types that get their own lockdep class. netdev_lock_pos() returns
 * the index of a type in this table and falls back to the last entry
 * (ARPHRD_NONE) for any type that is not listed.
 */
  439static const unsigned short netdev_lock_type[] = {
  440	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  441	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  442	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  443	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  444	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  445	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  446	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  447	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  448	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  449	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  450	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  451	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  452	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  453	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  454	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  455
/* Lockdep class names, indexed in parallel with netdev_lock_type[]: entry i
 * names the class used for device type netdev_lock_type[i]. Both tables
 * must be kept in the same order and at the same length.
 */
  456static const char *const netdev_lock_name[] = {
  457	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  458	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  459	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  460	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  461	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  462	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  463	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  464	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  465	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  466	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  467	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  468	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  469	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  470	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  471	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  472
/* One lock class key per table entry: the first array is used for the
 * transmit lock (netdev_set_xmit_lockdep_class()), the second for the
 * device's addr_list_lock (netdev_set_addr_lockdep_class()).
 */
  473static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  474static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  475
  476static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  477{
  478	int i;
  479
  480	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  481		if (netdev_lock_type[i] == dev_type)
  482			return i;
  483	/* the last key is used by default */
  484	return ARRAY_SIZE(netdev_lock_type) - 1;
  485}
  486
  487static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  488						 unsigned short dev_type)
  489{
  490	int i;
  491
  492	i = netdev_lock_pos(dev_type);
  493	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  494				   netdev_lock_name[i]);
  495}
  496
  497static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  498{
  499	int i;
  500
  501	i = netdev_lock_pos(dev->type);
  502	lockdep_set_class_and_name(&dev->addr_list_lock,
  503				   &netdev_addr_lock_key[i],
  504				   netdev_lock_name[i]);
  505}
  506#else
  507static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  508						 unsigned short dev_type)
  509{
  510}
  511
  512static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  513{
  514}
  515#endif
  516
  517/*******************************************************************************
  518 *
  519 *		Protocol management and registration routines
  520 *
  521 *******************************************************************************/
  522
 
 
 
  523
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets MUST BE
 *	last in their hash buckets, and the check of protocol handlers
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets is first
 *	on the list, it is not able to sense that the packet is cloned
 *	and should be copied-on-write, so it will change the packet and
 *	subsequent readers will get a broken packet.
 *							--ANK (980803)
 */
  539
  540static inline struct list_head *ptype_head(const struct packet_type *pt)
  541{
  542	if (pt->type == htons(ETH_P_ALL))
  543		return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
  544	else
  545		return pt->dev ? &pt->dev->ptype_specific :
  546				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  547}
  548
  549/**
  550 *	dev_add_pack - add packet handler
  551 *	@pt: packet type declaration
  552 *
  553 *	Add a protocol handler to the networking stack. The passed &packet_type
  554 *	is linked into kernel lists and may not be freed until it has been
  555 *	removed from the kernel lists.
  556 *
  557 *	This call does not sleep therefore it can not
  558 *	guarantee all CPU's that are in middle of receiving packets
  559 *	will see the new packet type (until the next received packet).
  560 */
  561
  562void dev_add_pack(struct packet_type *pt)
  563{
  564	struct list_head *head = ptype_head(pt);
  565
  566	spin_lock(&ptype_lock);
  567	list_add_rcu(&pt->list, head);
  568	spin_unlock(&ptype_lock);
  569}
  570EXPORT_SYMBOL(dev_add_pack);
  571
  572/**
  573 *	__dev_remove_pack	 - remove packet handler
  574 *	@pt: packet type declaration
  575 *
  576 *	Remove a protocol handler that was previously added to the kernel
  577 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  578 *	from the kernel lists and can be freed or reused once this function
  579 *	returns.
  580 *
  581 *      The packet type might still be in use by receivers
  582 *	and must not be freed until after all the CPU's have gone
  583 *	through a quiescent state.
  584 */
  585void __dev_remove_pack(struct packet_type *pt)
  586{
  587	struct list_head *head = ptype_head(pt);
  588	struct packet_type *pt1;
  589
  590	spin_lock(&ptype_lock);
  591
  592	list_for_each_entry(pt1, head, list) {
  593		if (pt == pt1) {
  594			list_del_rcu(&pt->list);
  595			goto out;
  596		}
  597	}
  598
  599	pr_warn("dev_remove_pack: %p not found\n", pt);
  600out:
  601	spin_unlock(&ptype_lock);
  602}
  603EXPORT_SYMBOL(__dev_remove_pack);
  604
  605/**
  606 *	dev_remove_pack	 - remove packet handler
  607 *	@pt: packet type declaration
  608 *
  609 *	Remove a protocol handler that was previously added to the kernel
  610 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  611 *	from the kernel lists and can be freed or reused once this function
  612 *	returns.
  613 *
  614 *	This call sleeps to guarantee that no CPU is looking at the packet
  615 *	type after return.
  616 */
  617void dev_remove_pack(struct packet_type *pt)
  618{
  619	__dev_remove_pack(pt);
  620
  621	synchronize_net();
  622}
  623EXPORT_SYMBOL(dev_remove_pack);
  624
  625
  626/*******************************************************************************
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  627 *
  628 *			    Device Interface Subroutines
 
 
 
  629 *
  630 *******************************************************************************/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  631
  632/**
  633 *	dev_get_iflink	- get 'iflink' value of a interface
  634 *	@dev: targeted interface
  635 *
  636 *	Indicates the ifindex the interface is linked to.
  637 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  638 */
  639
  640int dev_get_iflink(const struct net_device *dev)
  641{
  642	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  643		return dev->netdev_ops->ndo_get_iflink(dev);
  644
  645	return READ_ONCE(dev->ifindex);
  646}
  647EXPORT_SYMBOL(dev_get_iflink);
  648
  649/**
  650 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  651 *	@dev: targeted interface
  652 *	@skb: The packet.
  653 *
  654 *	For better visibility of tunnel traffic OVS needs to retrieve
  655 *	egress tunnel information for a packet. Following API allows
  656 *	user to get this info.
  657 */
  658int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  659{
  660	struct ip_tunnel_info *info;
  661
  662	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  663		return -EINVAL;
  664
  665	info = skb_tunnel_info_unclone(skb);
  666	if (!info)
  667		return -ENOMEM;
  668	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  669		return -EINVAL;
  670
  671	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  672}
  673EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  674
  675static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
  676{
  677	int k = stack->num_paths++;
  678
  679	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
  680		return NULL;
  681
  682	return &stack->path[k];
  683}
  684
  685int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
  686			  struct net_device_path_stack *stack)
  687{
  688	const struct net_device *last_dev;
  689	struct net_device_path_ctx ctx = {
  690		.dev	= dev,
  691	};
  692	struct net_device_path *path;
  693	int ret = 0;
  694
  695	memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
  696	stack->num_paths = 0;
  697	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
  698		last_dev = ctx.dev;
  699		path = dev_fwd_path(stack);
  700		if (!path)
  701			return -1;
  702
  703		memset(path, 0, sizeof(struct net_device_path));
  704		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
  705		if (ret < 0)
  706			return -1;
  707
  708		if (WARN_ON_ONCE(last_dev == ctx.dev))
  709			return -1;
  710	}
  711
  712	if (!ctx.dev)
  713		return ret;
  714
  715	path = dev_fwd_path(stack);
  716	if (!path)
  717		return -1;
  718	path->type = DEV_PATH_ETHERNET;
  719	path->dev = ctx.dev;
  720
  721	return ret;
  722}
  723EXPORT_SYMBOL_GPL(dev_fill_forward_path);
  724
  725/**
  726 *	__dev_get_by_name	- find a device by its name
  727 *	@net: the applicable net namespace
  728 *	@name: name to find
  729 *
  730 *	Find an interface by name. Must be called under RTNL semaphore.
  731 *	If the name is found a pointer to the device is returned.
  732 *	If the name is not found then %NULL is returned. The
  733 *	reference counters are not incremented so the caller must be
  734 *	careful with locks.
  735 */
  736
  737struct net_device *__dev_get_by_name(struct net *net, const char *name)
  738{
  739	struct netdev_name_node *node_name;
 
  740
  741	node_name = netdev_name_node_lookup(net, name);
  742	return node_name ? node_name->dev : NULL;
 
 
 
  743}
  744EXPORT_SYMBOL(__dev_get_by_name);
  745
  746/**
  747 * dev_get_by_name_rcu	- find a device by its name
  748 * @net: the applicable net namespace
  749 * @name: name to find
  750 *
  751 * Find an interface by name.
  752 * If the name is found a pointer to the device is returned.
  753 * If the name is not found then %NULL is returned.
  754 * The reference counters are not incremented so the caller must be
  755 * careful with locks. The caller must hold RCU lock.
  756 */
  757
  758struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  759{
  760	struct netdev_name_node *node_name;
  761
  762	node_name = netdev_name_node_lookup_rcu(net, name);
  763	return node_name ? node_name->dev : NULL;
  764}
  765EXPORT_SYMBOL(dev_get_by_name_rcu);
  766
  767/* Deprecated for new users, call netdev_get_by_name() instead */
  768struct net_device *dev_get_by_name(struct net *net, const char *name)
  769{
  770	struct net_device *dev;
 
  771
  772	rcu_read_lock();
  773	dev = dev_get_by_name_rcu(net, name);
  774	dev_hold(dev);
  775	rcu_read_unlock();
  776	return dev;
  777}
  778EXPORT_SYMBOL(dev_get_by_name);
  779
  780/**
  781 *	netdev_get_by_name() - find a device by its name
  782 *	@net: the applicable net namespace
  783 *	@name: name to find
  784 *	@tracker: tracking object for the acquired reference
  785 *	@gfp: allocation flags for the tracker
  786 *
  787 *	Find an interface by name. This can be called from any
  788 *	context and does its own locking. The returned handle has
  789 *	the usage count incremented and the caller must use netdev_put() to
  790 *	release it when it is no longer needed. %NULL is returned if no
  791 *	matching device is found.
  792 */
  793struct net_device *netdev_get_by_name(struct net *net, const char *name,
  794				      netdevice_tracker *tracker, gfp_t gfp)
  795{
  796	struct net_device *dev;
  797
  798	dev = dev_get_by_name(net, name);
 
  799	if (dev)
  800		netdev_tracker_alloc(dev, tracker, gfp);
 
  801	return dev;
  802}
  803EXPORT_SYMBOL(netdev_get_by_name);
  804
  805/**
  806 *	__dev_get_by_index - find a device by its ifindex
  807 *	@net: the applicable net namespace
  808 *	@ifindex: index of device
  809 *
  810 *	Search for an interface by index. Returns %NULL if the device
  811 *	is not found or a pointer to the device. The device has not
  812 *	had its reference counter increased so the caller must be careful
  813 *	about locking. The caller must hold the RTNL semaphore.
 
  814 */
  815
  816struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  817{
  818	struct net_device *dev;
  819	struct hlist_head *head = dev_index_hash(net, ifindex);
  820
  821	hlist_for_each_entry(dev, head, index_hlist)
  822		if (dev->ifindex == ifindex)
  823			return dev;
  824
  825	return NULL;
  826}
  827EXPORT_SYMBOL(__dev_get_by_index);
  828
  829/**
  830 *	dev_get_by_index_rcu - find a device by its ifindex
  831 *	@net: the applicable net namespace
  832 *	@ifindex: index of device
  833 *
  834 *	Search for an interface by index. Returns %NULL if the device
  835 *	is not found or a pointer to the device. The device has not
  836 *	had its reference counter increased so the caller must be careful
  837 *	about locking. The caller must hold RCU lock.
  838 */
  839
  840struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  841{
  842	struct net_device *dev;
  843	struct hlist_head *head = dev_index_hash(net, ifindex);
  844
  845	hlist_for_each_entry_rcu(dev, head, index_hlist)
  846		if (dev->ifindex == ifindex)
  847			return dev;
  848
  849	return NULL;
  850}
  851EXPORT_SYMBOL(dev_get_by_index_rcu);
  852
  853/* Deprecated for new users, call netdev_get_by_index() instead */
  854struct net_device *dev_get_by_index(struct net *net, int ifindex)
  855{
  856	struct net_device *dev;
  857
  858	rcu_read_lock();
  859	dev = dev_get_by_index_rcu(net, ifindex);
  860	dev_hold(dev);
  861	rcu_read_unlock();
  862	return dev;
  863}
  864EXPORT_SYMBOL(dev_get_by_index);
  865
  866/**
  867 *	netdev_get_by_index() - find a device by its ifindex
  868 *	@net: the applicable net namespace
  869 *	@ifindex: index of device
  870 *	@tracker: tracking object for the acquired reference
  871 *	@gfp: allocation flags for the tracker
  872 *
  873 *	Search for an interface by index. Returns NULL if the device
  874 *	is not found or a pointer to the device. The device returned has
  875 *	had a reference added and the pointer is safe until the user calls
  876 *	netdev_put() to indicate they have finished with it.
  877 */
  878struct net_device *netdev_get_by_index(struct net *net, int ifindex,
  879				       netdevice_tracker *tracker, gfp_t gfp)
  880{
  881	struct net_device *dev;
  882
  883	dev = dev_get_by_index(net, ifindex);
 
  884	if (dev)
  885		netdev_tracker_alloc(dev, tracker, gfp);
 
  886	return dev;
  887}
  888EXPORT_SYMBOL(netdev_get_by_index);
  889
  890/**
  891 *	dev_get_by_napi_id - find a device by napi_id
  892 *	@napi_id: ID of the NAPI struct
  893 *
  894 *	Search for an interface by NAPI ID. Returns %NULL if the device
  895 *	is not found or a pointer to the device. The device has not had
  896 *	its reference counter increased so the caller must be careful
  897 *	about locking. The caller must hold RCU lock.
  898 */
  899
  900struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  901{
  902	struct napi_struct *napi;
  903
  904	WARN_ON_ONCE(!rcu_read_lock_held());
  905
  906	if (napi_id < MIN_NAPI_ID)
  907		return NULL;
  908
  909	napi = napi_by_id(napi_id);
  910
  911	return napi ? napi->dev : NULL;
  912}
  913EXPORT_SYMBOL(dev_get_by_napi_id);
  914
  915/**
  916 *	netdev_get_name - get a netdevice name, knowing its ifindex.
  917 *	@net: network namespace
  918 *	@name: a pointer to the buffer where the name will be stored.
  919 *	@ifindex: the ifindex of the interface to get the name from.
 
 
 
 
  920 */
  921int netdev_get_name(struct net *net, char *name, int ifindex)
  922{
  923	struct net_device *dev;
  924	int ret;
  925
  926	down_read(&devnet_rename_sem);
 
  927	rcu_read_lock();
  928
  929	dev = dev_get_by_index_rcu(net, ifindex);
  930	if (!dev) {
  931		ret = -ENODEV;
  932		goto out;
  933	}
  934
  935	strcpy(name, dev->name);
  936
  937	ret = 0;
  938out:
  939	rcu_read_unlock();
  940	up_read(&devnet_rename_sem);
  941	return ret;
 
 
 
 
  942}
  943
  944/**
  945 *	dev_getbyhwaddr_rcu - find a device by its hardware address
  946 *	@net: the applicable net namespace
  947 *	@type: media type of device
  948 *	@ha: hardware address
  949 *
  950 *	Search for an interface by MAC address. Returns NULL if the device
  951 *	is not found or a pointer to the device.
  952 *	The caller must hold RCU or RTNL.
  953 *	The returned device has not had its ref count increased
  954 *	and the caller must therefore be careful about locking
  955 *
  956 */
  957
  958struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
  959				       const char *ha)
  960{
  961	struct net_device *dev;
  962
  963	for_each_netdev_rcu(net, dev)
  964		if (dev->type == type &&
  965		    !memcmp(dev->dev_addr, ha, dev->addr_len))
  966			return dev;
  967
  968	return NULL;
  969}
  970EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
  971
 
 
 
 
 
 
 
 
 
 
 
 
 
  972struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
  973{
  974	struct net_device *dev, *ret = NULL;
  975
  976	rcu_read_lock();
  977	for_each_netdev_rcu(net, dev)
  978		if (dev->type == type) {
  979			dev_hold(dev);
  980			ret = dev;
  981			break;
  982		}
  983	rcu_read_unlock();
  984	return ret;
  985}
  986EXPORT_SYMBOL(dev_getfirstbyhwtype);
  987
  988/**
  989 *	__dev_get_by_flags - find any device with given flags
  990 *	@net: the applicable net namespace
  991 *	@if_flags: IFF_* values
  992 *	@mask: bitmask of bits in if_flags to check
  993 *
  994 *	Search for any interface with the given flags. Returns NULL if a device
  995 *	is not found or a pointer to the device. Must be called inside
  996 *	rtnl_lock(), and result refcount is unchanged.
  997 */
  998
  999struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1000				      unsigned short mask)
 1001{
 1002	struct net_device *dev, *ret;
 1003
 1004	ASSERT_RTNL();
 1005
 1006	ret = NULL;
 1007	for_each_netdev(net, dev) {
 1008		if (((dev->flags ^ if_flags) & mask) == 0) {
 1009			ret = dev;
 1010			break;
 1011		}
 1012	}
 1013	return ret;
 1014}
 1015EXPORT_SYMBOL(__dev_get_by_flags);
 1016
 1017/**
 1018 *	dev_valid_name - check if name is okay for network device
 1019 *	@name: name string
 1020 *
 1021 *	Network device names need to be valid file names to
 1022 *	allow sysfs to work.  We also disallow any kind of
 1023 *	whitespace.
 1024 */
 1025bool dev_valid_name(const char *name)
 1026{
 1027	if (*name == '\0')
 1028		return false;
 1029	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1030		return false;
 1031	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1032		return false;
 1033
 1034	while (*name) {
 1035		if (*name == '/' || *name == ':' || isspace(*name))
 1036			return false;
 1037		name++;
 1038	}
 1039	return true;
 1040}
 1041EXPORT_SYMBOL(dev_valid_name);
 1042
 1043/**
 1044 *	__dev_alloc_name - allocate a name for a device
 1045 *	@net: network namespace to allocate the device name in
 1046 *	@name: name format string
 1047 *	@res: result name string
 1048 *
 1049 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1050 *	id. It scans list of devices to build up a free map, then chooses
 1051 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1052 *	while allocating the name and adding the device in order to avoid
 1053 *	duplicates.
 1054 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1055 *	Returns the number of the unit assigned or a negative errno code.
 1056 */
 1057
 1058static int __dev_alloc_name(struct net *net, const char *name, char *res)
 1059{
 1060	int i = 0;
 1061	const char *p;
 1062	const int max_netdevices = 8*PAGE_SIZE;
 1063	unsigned long *inuse;
 1064	struct net_device *d;
 1065	char buf[IFNAMSIZ];
 1066
 1067	/* Verify the string as this thing may have come from the user.
 1068	 * There must be one "%d" and no other "%" characters.
 1069	 */
 1070	p = strchr(name, '%');
 1071	if (!p || p[1] != 'd' || strchr(p + 2, '%'))
 1072		return -EINVAL;
 1073
 1074	/* Use one page as a bit array of possible slots */
 1075	inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
 1076	if (!inuse)
 1077		return -ENOMEM;
 
 
 
 
 
 1078
 1079	for_each_netdev(net, d) {
 1080		struct netdev_name_node *name_node;
 
 
 1081
 1082		netdev_for_each_altname(d, name_node) {
 1083			if (!sscanf(name_node->name, name, &i))
 1084				continue;
 1085			if (i < 0 || i >= max_netdevices)
 1086				continue;
 1087
 1088			/* avoid cases where sscanf is not exact inverse of printf */
 1089			snprintf(buf, IFNAMSIZ, name, i);
 1090			if (!strncmp(buf, name_node->name, IFNAMSIZ))
 1091				__set_bit(i, inuse);
 1092		}
 1093		if (!sscanf(d->name, name, &i))
 1094			continue;
 1095		if (i < 0 || i >= max_netdevices)
 1096			continue;
 1097
 1098		/* avoid cases where sscanf is not exact inverse of printf */
 1099		snprintf(buf, IFNAMSIZ, name, i);
 1100		if (!strncmp(buf, d->name, IFNAMSIZ))
 1101			__set_bit(i, inuse);
 1102	}
 1103
 1104	i = find_first_zero_bit(inuse, max_netdevices);
 1105	bitmap_free(inuse);
 1106	if (i == max_netdevices)
 1107		return -ENFILE;
 1108
 1109	/* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
 1110	strscpy(buf, name, IFNAMSIZ);
 1111	snprintf(res, IFNAMSIZ, buf, i);
 1112	return i;
 1113}
 1114
 1115/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
 1116static int dev_prep_valid_name(struct net *net, struct net_device *dev,
 1117			       const char *want_name, char *out_name,
 1118			       int dup_errno)
 1119{
 1120	if (!dev_valid_name(want_name))
 1121		return -EINVAL;
 1122
 1123	if (strchr(want_name, '%'))
 1124		return __dev_alloc_name(net, want_name, out_name);
 1125
 1126	if (netdev_name_in_use(net, want_name))
 1127		return -dup_errno;
 1128	if (out_name != want_name)
 1129		strscpy(out_name, want_name, IFNAMSIZ);
 1130	return 0;
 1131}
 1132
 1133/**
 1134 *	dev_alloc_name - allocate a name for a device
 1135 *	@dev: device
 1136 *	@name: name format string
 1137 *
 1138 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1139 *	id. It scans list of devices to build up a free map, then chooses
 1140 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1141 *	while allocating the name and adding the device in order to avoid
 1142 *	duplicates.
 1143 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1144 *	Returns the number of the unit assigned or a negative errno code.
 1145 */
 1146
 1147int dev_alloc_name(struct net_device *dev, const char *name)
 1148{
 1149	return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
 
 
 
 
 
 
 
 
 
 1150}
 1151EXPORT_SYMBOL(dev_alloc_name);
 1152
 1153static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1154			      const char *name)
 
 1155{
 
 1156	int ret;
 1157
 1158	ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
 1159	return ret < 0 ? ret : 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 1160}
 1161
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: new name (or format string); the buffer must be at least
 *		  IFNAMSIZ bytes long
 *
 *	Change the name of a device. Format strings such as "eth%d" can be
 *	passed for wildcarding.
 *
 *	Returns 0 on success (including when the name is unchanged), otherwise
 *	the error from name validation, from device_rename(), or from the
 *	NETDEV_CHANGENAME notifier chain.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);

	down_write(&devnet_rename_sem);

	/* Renaming to the current name is a no-op. */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		up_write(&devnet_rename_sem);
		return 0;
	}

	/* Keep the old name around so it can be restored on failure. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* On success the resolved name has been written into dev->name. */
	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		up_write(&devnet_rename_sem);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s%s\n", oldname,
			    dev->flags & IFF_UP ? " (while UP)" : "");

	old_assign_type = dev->name_assign_type;
	WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);

	/* Reached a second time, with the names swapped, when a notifier
	 * rejects the rename and the change has to be undone.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		WRITE_ONCE(dev->name_assign_type, old_assign_type);
		up_write(&devnet_rename_sem);
		return ret;
	}

	up_write(&devnet_rename_sem);

	netdev_adjacent_rename_links(dev, oldname);

	/* The name node is removed and re-added around synchronize_net(). */
	netdev_name_node_del(dev->name_node);

	synchronize_net();

	netdev_name_node_add(net, dev->name_node);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First notifier failure: remember it, put the old
			 * name back into dev->name and stash the rejected
			 * name in oldname, then redo the rename steps.
			 */
			err = ret;
			down_write(&devnet_rename_sem);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			WRITE_ONCE(dev->name_assign_type, old_assign_type);
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			/* The rollback itself was rejected: only log it. */
			netdev_err(dev, "name change rollback failed: %d\n",
				   ret);
		}
	}

	return err;
}
 1245
 1246/**
 1247 *	dev_set_alias - change ifalias of a device
 1248 *	@dev: device
 1249 *	@alias: name up to IFALIASZ
 1250 *	@len: limit of bytes to copy from info
 1251 *
 1252 *	Set ifalias for a device,
 1253 */
 1254int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1255{
 1256	struct dev_ifalias *new_alias = NULL;
 
 
 1257
 1258	if (len >= IFALIASZ)
 1259		return -EINVAL;
 1260
 1261	if (len) {
 1262		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1263		if (!new_alias)
 1264			return -ENOMEM;
 1265
 1266		memcpy(new_alias->ifalias, alias, len);
 1267		new_alias->ifalias[len] = 0;
 1268	}
 1269
 1270	mutex_lock(&ifalias_mutex);
 1271	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1272					mutex_is_locked(&ifalias_mutex));
 1273	mutex_unlock(&ifalias_mutex);
 1274
 1275	if (new_alias)
 1276		kfree_rcu(new_alias, rcuhead);
 1277
 
 1278	return len;
 1279}
 1280EXPORT_SYMBOL(dev_set_alias);
 1281
 1282/**
 1283 *	dev_get_alias - get ifalias of a device
 1284 *	@dev: device
 1285 *	@name: buffer to store name of ifalias
 1286 *	@len: size of buffer
 1287 *
 1288 *	get ifalias for a device.  Caller must make sure dev cannot go
 1289 *	away,  e.g. rcu read lock or own a reference count to device.
 1290 */
 1291int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1292{
 1293	const struct dev_ifalias *alias;
 1294	int ret = 0;
 1295
 1296	rcu_read_lock();
 1297	alias = rcu_dereference(dev->ifalias);
 1298	if (alias)
 1299		ret = snprintf(name, len, "%s", alias->ifalias);
 1300	rcu_read_unlock();
 1301
 1302	return ret;
 1303}
 1304
 1305/**
 1306 *	netdev_features_change - device changes features
 1307 *	@dev: device to cause notification
 1308 *
 1309 *	Called to indicate a device has changed features.
 1310 */
void netdev_features_change(struct net_device *dev)
{
	/* Raise NETDEV_FEAT_CHANGE for @dev; the chain's result is ignored. */
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
 1316
 1317/**
 1318 *	netdev_state_change - device changes state
 1319 *	@dev: device to cause notification
 1320 *
 1321 *	Called to indicate a device has changed state. This function calls
 1322 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1323 *	to the routing socket.
 1324 */
 1325void netdev_state_change(struct net_device *dev)
 1326{
 1327	if (dev->flags & IFF_UP) {
 1328		struct netdev_notifier_change_info change_info = {
 1329			.info.dev = dev,
 1330		};
 1331
 1332		call_netdevice_notifiers_info(NETDEV_CHANGE,
 
 1333					      &change_info.info);
 1334		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
 1335	}
 1336}
 1337EXPORT_SYMBOL(netdev_state_change);
 1338
 1339/**
 1340 * __netdev_notify_peers - notify network peers about existence of @dev,
 1341 * to be called when rtnl lock is already held.
 1342 * @dev: network device
 1343 *
 1344 * Generate traffic such that interested network peers are aware of
 1345 * @dev, such as by generating a gratuitous ARP. This may be used when
 1346 * a device wants to inform the rest of the network about some sort of
 1347 * reconfiguration such as a failover event or virtual machine
 1348 * migration.
 1349 */
void __netdev_notify_peers(struct net_device *dev)
{
	/* The caller must already hold RTNL; it is not taken here. */
	ASSERT_RTNL();
	/* Two events are raised back to back; their results are ignored. */
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
EXPORT_SYMBOL(__netdev_notify_peers);
 1357
 1358/**
 1359 * netdev_notify_peers - notify network peers about existence of @dev
 1360 * @dev: network device
 1361 *
 1362 * Generate traffic such that interested network peers are aware of
 1363 * @dev, such as by generating a gratuitous ARP. This may be used when
 1364 * a device wants to inform the rest of the network about some sort of
 1365 * reconfiguration such as a failover event or virtual machine
 1366 * migration.
 1367 */
void netdev_notify_peers(struct net_device *dev)
{
	/* Locked variant: take RTNL around __netdev_notify_peers(). */
	rtnl_lock();
	__netdev_notify_peers(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
 1375
 1376static int napi_threaded_poll(void *data);
 1377
 1378static int napi_kthread_create(struct napi_struct *n)
 1379{
 1380	int err = 0;
 1381
 1382	/* Create and wake up the kthread once to put it in
 1383	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
 1384	 * warning and work with loadavg.
 1385	 */
 1386	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
 1387				n->dev->name, n->napi_id);
 1388	if (IS_ERR(n->thread)) {
 1389		err = PTR_ERR(n->thread);
 1390		pr_err("kthread_run failed with err %d\n", err);
 1391		n->thread = NULL;
 1392	}
 1393
 1394	return err;
 1395}
 1396
 1397static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1398{
 1399	const struct net_device_ops *ops = dev->netdev_ops;
 1400	int ret;
 1401
 1402	ASSERT_RTNL();
 1403	dev_addr_check(dev);
 1404
 1405	if (!netif_device_present(dev)) {
 1406		/* may be detached because parent is runtime-suspended */
 1407		if (dev->dev.parent)
 1408			pm_runtime_resume(dev->dev.parent);
 1409		if (!netif_device_present(dev))
 1410			return -ENODEV;
 1411	}
 1412
 1413	/* Block netpoll from trying to do any rx path servicing.
 1414	 * If we don't do this there is a chance ndo_poll_controller
 1415	 * or ndo_poll may be running while we open the device
 1416	 */
 1417	netpoll_poll_disable(dev);
 1418
 1419	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1420	ret = notifier_to_errno(ret);
 1421	if (ret)
 1422		return ret;
 1423
 1424	set_bit(__LINK_STATE_START, &dev->state);
 1425
 1426	if (ops->ndo_validate_addr)
 1427		ret = ops->ndo_validate_addr(dev);
 1428
 1429	if (!ret && ops->ndo_open)
 1430		ret = ops->ndo_open(dev);
 1431
 1432	netpoll_poll_enable(dev);
 1433
 1434	if (ret)
 1435		clear_bit(__LINK_STATE_START, &dev->state);
 1436	else {
 1437		dev->flags |= IFF_UP;
 1438		dev_set_rx_mode(dev);
 1439		dev_activate(dev);
 1440		add_device_randomness(dev->dev_addr, dev->addr_len);
 1441	}
 1442
 1443	return ret;
 1444}
 1445
 1446/**
 1447 *	dev_open	- prepare an interface for use.
 1448 *	@dev: device to open
 1449 *	@extack: netlink extended ack
 1450 *
 1451 *	Takes a device from down to up state. The device's private open
 1452 *	function is invoked and then the multicast lists are loaded. Finally
 1453 *	the device is moved into the up state and a %NETDEV_UP message is
 1454 *	sent to the netdev notifier chain.
 1455 *
 1456 *	Calling this function on an active interface is a nop. On a failure
 1457 *	a negative errno code is returned.
 1458 */
 1459int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1460{
 1461	int ret;
 1462
 1463	if (dev->flags & IFF_UP)
 1464		return 0;
 1465
 1466	ret = __dev_open(dev, extack);
 1467	if (ret < 0)
 1468		return ret;
 1469
 1470	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 1471	call_netdevice_notifiers(NETDEV_UP, dev);
 1472
 1473	return ret;
 1474}
 1475EXPORT_SYMBOL(dev_open);
 1476
/*
 * Take every device linked on @head (through close_list) down, without
 * sending the RTM_NEWLINK / NETDEV_DOWN announcements.
 *
 * Runs in three passes: announce NETDEV_GOING_DOWN and clear
 * __LINK_STATE_START on each device, deactivate all of them in one
 * dev_deactivate_many() call, then call each driver's ndo_stop() and clear
 * IFF_UP. Must be called under RTNL and may sleep.
 */
static void __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		/* Pairs with the netpoll_poll_disable() in the first pass. */
		netpoll_poll_enable(dev);
	}
}
 1520
 1521static void __dev_close(struct net_device *dev)
 1522{
 
 1523	LIST_HEAD(single);
 1524
 1525	list_add(&dev->close_list, &single);
 1526	__dev_close_many(&single);
 1527	list_del(&single);
 
 
 1528}
 1529
 1530void dev_close_many(struct list_head *head, bool unlink)
 1531{
 1532	struct net_device *dev, *tmp;
 1533
 1534	/* Remove the devices that don't need to be closed */
 1535	list_for_each_entry_safe(dev, tmp, head, close_list)
 1536		if (!(dev->flags & IFF_UP))
 1537			list_del_init(&dev->close_list);
 1538
 1539	__dev_close_many(head);
 1540
 1541	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1542		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 1543		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1544		if (unlink)
 1545			list_del_init(&dev->close_list);
 1546	}
 
 
 1547}
 1548EXPORT_SYMBOL(dev_close_many);
 1549
 1550/**
 1551 *	dev_close - shutdown an interface.
 1552 *	@dev: device to shutdown
 1553 *
 1554 *	This function moves an active device into down state. A
 1555 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1556 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1557 *	chain.
 1558 */
 1559void dev_close(struct net_device *dev)
 1560{
 1561	if (dev->flags & IFF_UP) {
 1562		LIST_HEAD(single);
 1563
 1564		list_add(&dev->close_list, &single);
 1565		dev_close_many(&single, true);
 1566		list_del(&single);
 1567	}
 
 1568}
 1569EXPORT_SYMBOL(dev_close);
 1570
 1571
 1572/**
 1573 *	dev_disable_lro - disable Large Receive Offload on a device
 1574 *	@dev: device
 1575 *
 1576 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1577 *	called under RTNL.  This is needed if received packets may be
 1578 *	forwarded to another interface.
 1579 */
 1580void dev_disable_lro(struct net_device *dev)
 1581{
 1582	struct net_device *lower_dev;
 1583	struct list_head *iter;
 1584
 1585	dev->wanted_features &= ~NETIF_F_LRO;
 1586	netdev_update_features(dev);
 1587
 1588	if (unlikely(dev->features & NETIF_F_LRO))
 1589		netdev_WARN(dev, "failed to disable LRO!\n");
 1590
 1591	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1592		dev_disable_lro(lower_dev);
 1593}
 1594EXPORT_SYMBOL(dev_disable_lro);
 1595
 1596/**
 1597 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1598 *	@dev: device
 1599 *
 1600 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1601 *	called under RTNL.  This is needed if Generic XDP is installed on
 1602 *	the device.
 1603 */
static void dev_disable_gro_hw(struct net_device *dev)
{
	/* Stop asking for GRO_HW, then have the feature set re-evaluated. */
	dev->wanted_features &= ~NETIF_F_GRO_HW;
	netdev_update_features(dev);

	/* Warn if the bit is still set in dev->features afterwards. */
	if (unlikely(dev->features & NETIF_F_GRO_HW))
		netdev_WARN(dev, "failed to disable GRO_HW!\n");
}
 1612
/*
 * Translate a netdev notifier event code into its symbolic name.
 *
 * Each N(val) below expands to a case label for NETDEV_<val> that returns
 * the string "NETDEV_<val>". Any @cmd not listed falls out of the switch
 * and yields "UNKNOWN_NETDEV_EVENT".
 */
const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
#define N(val) 						\
	case NETDEV_##val:				\
		return "NETDEV_" __stringify(val);
	switch (cmd) {
	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
	N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
	N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
	N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
	N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
	N(XDP_FEAT_CHANGE)
	}
#undef N
	return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1636
/*
 * Deliver event @val for @dev to the single notifier block @nb by calling
 * its callback directly, without walking any notifier chain. Returns the
 * callback's raw return value.
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	/* Only .dev is filled in; the remaining info fields are zeroed. */
	struct netdev_notifier_info info = {
		.dev = dev,
	};

	return nb->notifier_call(nb, val, &info);
}
 1646
 1647static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1648					     struct net_device *dev)
 1649{
 1650	int err;
 1651
 1652	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1653	err = notifier_to_errno(err);
 1654	if (err)
 1655		return err;
 1656
 1657	if (!(dev->flags & IFF_UP))
 1658		return 0;
 1659
 1660	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1661	return 0;
 1662}
 1663
 1664static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1665						struct net_device *dev)
 1666{
 1667	if (dev->flags & IFF_UP) {
 1668		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1669					dev);
 1670		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1671	}
 1672	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1673}
 1674
 1675static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1676						 struct net *net)
 1677{
 1678	struct net_device *dev;
 1679	int err;
 1680
 1681	for_each_netdev(net, dev) {
 1682		err = call_netdevice_register_notifiers(nb, dev);
 1683		if (err)
 1684			goto rollback;
 1685	}
 1686	return 0;
 1687
 1688rollback:
 1689	for_each_netdev_continue_reverse(net, dev)
 1690		call_netdevice_unregister_notifiers(nb, dev);
 1691	return err;
 1692}
 1693
/* Replay the teardown events to @nb for every device in @net. */
static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
						    struct net *net)
{
	struct net_device *dev;

	for_each_netdev(net, dev)
		call_netdevice_unregister_notifiers(nb, dev);
}
 1702
/*
 * While non-zero, the notifier registration paths add the block to its chain
 * but skip the replay of register/up events for existing devices.
 * NOTE(review): presumably cleared once boot-time device init is done; the
 * place where that happens is not visible from here.
 */
static int dev_boot_phase = 1;
 1704
 1705/**
 1706 * register_netdevice_notifier - register a network notifier block
 1707 * @nb: notifier
 1708 *
 1709 * Register a notifier to be called when network device events occur.
 1710 * The notifier passed is linked into the kernel structures and must
 1711 * not be reused until it has been unregistered. A negative errno code
 1712 * is returned on a failure.
 1713 *
 1714 * When registered all registration and up events are replayed
 1715 * to the new notifier to allow device to have a race free
 1716 * view of the network device list.
 1717 */
 1718
 1719int register_netdevice_notifier(struct notifier_block *nb)
 1720{
 
 
 1721	struct net *net;
 1722	int err;
 1723
 1724	/* Close race with setup_net() and cleanup_net() */
 1725	down_write(&pernet_ops_rwsem);
 1726	rtnl_lock();
 1727	err = raw_notifier_chain_register(&netdev_chain, nb);
 1728	if (err)
 1729		goto unlock;
 1730	if (dev_boot_phase)
 1731		goto unlock;
 1732	for_each_net(net) {
 1733		err = call_netdevice_register_net_notifiers(nb, net);
 1734		if (err)
 1735			goto rollback;
 
 
 
 
 
 
 
 
 1736	}
 1737
 1738unlock:
 1739	rtnl_unlock();
 1740	up_write(&pernet_ops_rwsem);
 1741	return err;
 1742
 1743rollback:
 1744	for_each_net_continue_reverse(net)
 1745		call_netdevice_unregister_net_notifiers(nb, net);
 
 
 
 
 
 
 
 
 
 
 
 
 1746
 
 1747	raw_notifier_chain_unregister(&netdev_chain, nb);
 1748	goto unlock;
 1749}
 1750EXPORT_SYMBOL(register_netdevice_notifier);
 1751
 1752/**
 1753 * unregister_netdevice_notifier - unregister a network notifier block
 1754 * @nb: notifier
 1755 *
 1756 * Unregister a notifier previously registered by
 1757 * register_netdevice_notifier(). The notifier is unlinked into the
 1758 * kernel structures and may then be reused. A negative errno code
 1759 * is returned on a failure.
 1760 *
 1761 * After unregistering unregister and down device events are synthesized
 1762 * for all devices on the device list to the removed notifier to remove
 1763 * the need for special case cleanup code.
 1764 */
 1765
 1766int unregister_netdevice_notifier(struct notifier_block *nb)
 1767{
 
 1768	struct net *net;
 1769	int err;
 1770
 1771	/* Close race with setup_net() and cleanup_net() */
 1772	down_write(&pernet_ops_rwsem);
 1773	rtnl_lock();
 1774	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1775	if (err)
 1776		goto unlock;
 1777
 1778	for_each_net(net)
 1779		call_netdevice_unregister_net_notifiers(nb, net);
 1780
 
 
 
 
 
 
 
 1781unlock:
 1782	rtnl_unlock();
 1783	up_write(&pernet_ops_rwsem);
 1784	return err;
 1785}
 1786EXPORT_SYMBOL(unregister_netdevice_notifier);
 1787
 1788static int __register_netdevice_notifier_net(struct net *net,
 1789					     struct notifier_block *nb,
 1790					     bool ignore_call_fail)
 1791{
 1792	int err;
 1793
 1794	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1795	if (err)
 1796		return err;
 1797	if (dev_boot_phase)
 1798		return 0;
 1799
 1800	err = call_netdevice_register_net_notifiers(nb, net);
 1801	if (err && !ignore_call_fail)
 1802		goto chain_unregister;
 1803
 1804	return 0;
 1805
 1806chain_unregister:
 1807	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1808	return err;
 1809}
 1810
 1811static int __unregister_netdevice_notifier_net(struct net *net,
 1812					       struct notifier_block *nb)
 1813{
 1814	int err;
 1815
 1816	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1817	if (err)
 1818		return err;
 1819
 1820	call_netdevice_unregister_net_notifiers(nb, net);
 1821	return 0;
 1822}
 1823
 1824/**
 1825 * register_netdevice_notifier_net - register a per-netns network notifier block
 1826 * @net: network namespace
 1827 * @nb: notifier
 1828 *
 1829 * Register a notifier to be called when network device events occur.
 1830 * The notifier passed is linked into the kernel structures and must
 1831 * not be reused until it has been unregistered. A negative errno code
 1832 * is returned on a failure.
 1833 *
 1834 * When registered all registration and up events are replayed
 1835 * to the new notifier to allow device to have a race free
 1836 * view of the network device list.
 1837 */
 1838
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
	int err;

	/* RTNL-locked wrapper; replay failures are reported (not ignored). */
	rtnl_lock();
	err = __register_netdevice_notifier_net(net, nb, false);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);
 1849
 1850/**
 1851 * unregister_netdevice_notifier_net - unregister a per-netns
 1852 *                                     network notifier block
 1853 * @net: network namespace
 1854 * @nb: notifier
 1855 *
 1856 * Unregister a notifier previously registered by
 1857 * register_netdevice_notifier_net(). The notifier is unlinked from the
 1858 * kernel structures and may then be reused. A negative errno code
 1859 * is returned on a failure.
 1860 *
 1861 * After unregistering unregister and down device events are synthesized
 1862 * for all devices on the device list to the removed notifier to remove
 1863 * the need for special case cleanup code.
 1864 */
 1865
int unregister_netdevice_notifier_net(struct net *net,
				      struct notifier_block *nb)
{
	int err;

	/* RTNL-locked wrapper around the unlocked helper. */
	rtnl_lock();
	err = __unregister_netdevice_notifier_net(net, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1877
/*
 * Move @nb from @src_net's notifier chain to @dst_net's. The return values
 * of both steps are discarded; registration passes ignore_call_fail = true,
 * so a failing event replay in @dst_net does not take @nb off the chain.
 */
static void __move_netdevice_notifier_net(struct net *src_net,
					  struct net *dst_net,
					  struct notifier_block *nb)
{
	__unregister_netdevice_notifier_net(src_net, nb);
	__register_netdevice_notifier_net(dst_net, nb, true);
}
 1885
 1886int register_netdevice_notifier_dev_net(struct net_device *dev,
 1887					struct notifier_block *nb,
 1888					struct netdev_net_notifier *nn)
 1889{
 1890	int err;
 1891
 1892	rtnl_lock();
 1893	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1894	if (!err) {
 1895		nn->nb = nb;
 1896		list_add(&nn->list, &dev->net_notifier_list);
 1897	}
 1898	rtnl_unlock();
 1899	return err;
 1900}
 1901EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1902
/*
 * Undo register_netdevice_notifier_dev_net(): unlink @nn from its list and
 * remove @nb from the notifier chain of @dev's current netns. Takes RTNL
 * itself. Returns the result of the per-netns unregistration.
 */
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
					  struct notifier_block *nb,
					  struct netdev_net_notifier *nn)
{
	int err;

	rtnl_lock();
	/* @nn is unlinked unconditionally, even if the chain removal fails. */
	list_del(&nn->list);
	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 1916
/*
 * Move every notifier recorded on @dev's net_notifier_list from the
 * device's current netns chain to the chain of @net.
 */
static void move_netdevice_notifiers_dev_net(struct net_device *dev,
					     struct net *net)
{
	struct netdev_net_notifier *nn;

	list_for_each_entry(nn, &dev->net_notifier_list, list)
		__move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
}
 1925
 1926/**
 1927 *	call_netdevice_notifiers_info - call all network notifier blocks
 1928 *	@val: value passed unmodified to notifier function
 
 1929 *	@info: notifier information data
 1930 *
 1931 *	Call all network notifier blocks.  Parameters and return value
 1932 *	are as for raw_notifier_call_chain().
 1933 */
 1934
 1935int call_netdevice_notifiers_info(unsigned long val,
 1936				  struct netdev_notifier_info *info)
 
 1937{
 1938	struct net *net = dev_net(info->dev);
 1939	int ret;
 1940
 1941	ASSERT_RTNL();
 1942
 1943	/* Run per-netns notifier block chain first, then run the global one.
 1944	 * Hopefully, one day, the global one is going to be removed after
 1945	 * all notifier block registrators get converted to be per-netns.
 1946	 */
 1947	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 1948	if (ret & NOTIFY_STOP_MASK)
 1949		return ret;
 1950	return raw_notifier_call_chain(&netdev_chain, val, info);
 1951}
 1952
/**
 *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 *	                                       and roll back on error
 *	@val_up: value passed unmodified to notifier function
 *	@val_down: value passed unmodified to the notifier function when
 *	           recovering from an error on @val_up
 *	@info: notifier information data
 *
 *	Call all per-netns network notifier blocks, but not notifier blocks on
 *	the global notifier chain. Parameters and return value are as for
 *	raw_notifier_call_chain_robust().
 */

static int
call_netdevice_notifiers_info_robust(unsigned long val_up,
				     unsigned long val_down,
				     struct netdev_notifier_info *info)
{
	/* The chain used is the one of the netns that info->dev lives in. */
	struct net *net = dev_net(info->dev);

	ASSERT_RTNL();

	return raw_notifier_call_chain_robust(&net->netdev_chain,
					      val_up, val_down, info);
}
 1978
 1979static int call_netdevice_notifiers_extack(unsigned long val,
 1980					   struct net_device *dev,
 1981					   struct netlink_ext_ack *extack)
 1982{
 1983	struct netdev_notifier_info info = {
 1984		.dev = dev,
 1985		.extack = extack,
 1986	};
 1987
 1988	return call_netdevice_notifiers_info(val, &info);
 1989}
 1990
 1991/**
 1992 *	call_netdevice_notifiers - call all network notifier blocks
 1993 *      @val: value passed unmodified to notifier function
 1994 *      @dev: net_device pointer passed unmodified to notifier function
 1995 *
 1996 *	Call all network notifier blocks.  Parameters and return value
 1997 *	are as for raw_notifier_call_chain().
 1998 */
 1999
 2000int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 2001{
 2002	return call_netdevice_notifiers_extack(val, dev, NULL);
 2003}
 2004EXPORT_SYMBOL(call_netdevice_notifiers);
 2005
 2006/**
 2007 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2008 *	@val: value passed unmodified to notifier function
 2009 *	@dev: net_device pointer passed unmodified to notifier function
 2010 *	@arg: additional u32 argument passed to the notifier function
 2011 *
 2012 *	Call all network notifier blocks.  Parameters and return value
 2013 *	are as for raw_notifier_call_chain().
 2014 */
 2015static int call_netdevice_notifiers_mtu(unsigned long val,
 2016					struct net_device *dev, u32 arg)
 2017{
 2018	struct netdev_notifier_info_ext info = {
 2019		.info.dev = dev,
 2020		.ext.mtu = arg,
 2021	};
 2022
 2023	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2024
 2025	return call_netdevice_notifiers_info(val, &info.info);
 2026}
 
 2027
#ifdef CONFIG_NET_INGRESS
/* Static key, initially false, counted up and down by the helpers below. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Take one count on ingress_needed_key (static_branch_inc()). */
void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one count on ingress_needed_key (static_branch_dec()). */
void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
 2043
#ifdef CONFIG_NET_EGRESS
/* Static key, initially false, counted up and down by the helpers below. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Take one count on egress_needed_key (static_branch_inc()). */
void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one count on egress_needed_key (static_branch_dec()). */
void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
 2059
 2060DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2061EXPORT_SYMBOL(netstamp_needed_key);
 2062#ifdef CONFIG_JUMP_LABEL
 2063static atomic_t netstamp_needed_deferred;
 2064static atomic_t netstamp_wanted;
 2065static void netstamp_clear(struct work_struct *work)
 2066{
 2067	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2068	int wanted;
 2069
 2070	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2071	if (wanted > 0)
 2072		static_branch_enable(&netstamp_needed_key);
 2073	else
 2074		static_branch_disable(&netstamp_needed_key);
 2075}
 2076static DECLARE_WORK(netstamp_work, netstamp_clear);
 2077#endif
 2078
 2079void net_enable_timestamp(void)
 2080{
 2081#ifdef CONFIG_JUMP_LABEL
 2082	int wanted = atomic_read(&netstamp_wanted);
 2083
 2084	while (wanted > 0) {
 2085		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
 
 
 
 2086			return;
 2087	}
 2088	atomic_inc(&netstamp_needed_deferred);
 2089	schedule_work(&netstamp_work);
 2090#else
 2091	static_branch_inc(&netstamp_needed_key);
 2092#endif
 2093}
 2094EXPORT_SYMBOL(net_enable_timestamp);
 2095
 2096void net_disable_timestamp(void)
 2097{
 2098#ifdef CONFIG_JUMP_LABEL
 2099	int wanted = atomic_read(&netstamp_wanted);
 2100
 2101	while (wanted > 1) {
 2102		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
 
 
 
 2103			return;
 2104	}
 2105	atomic_dec(&netstamp_needed_deferred);
 2106	schedule_work(&netstamp_work);
 2107#else
 2108	static_branch_dec(&netstamp_needed_key);
 2109#endif
 2110}
 2111EXPORT_SYMBOL(net_disable_timestamp);
 2112
 2113static inline void net_timestamp_set(struct sk_buff *skb)
 2114{
 2115	skb->tstamp = 0;
 2116	skb->mono_delivery_time = 0;
 2117	if (static_branch_unlikely(&netstamp_needed_key))
 2118		skb->tstamp = ktime_get_real();
 2119}
 2120
 2121#define net_timestamp_check(COND, SKB)				\
 2122	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2123		if ((COND) && !(SKB)->tstamp)			\
 2124			(SKB)->tstamp = ktime_get_real();	\
 2125	}							\
 2126
/*
 * Exported wrapper around __is_skb_forwardable() that always passes true
 * as the third argument.
 * NOTE(review): the third argument presumably requests the MTU check,
 * matching the check_mtu flag used by the forwarding helpers in this file;
 * confirm against the __is_skb_forwardable() definition.
 */
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2132
 2133static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
 2134			      bool check_mtu)
 2135{
 2136	int ret = ____dev_forward_skb(dev, skb, check_mtu);
 2137
 2138	if (likely(!ret)) {
 2139		skb->protocol = eth_type_trans(skb, dev);
 2140		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2141	}
 2142
 2143	return ret;
 2144}
 2145
/*
 * Prepare @skb for reception on @dev via __dev_forward_skb2() with
 * check_mtu set to true. Returns that helper's result unchanged.
 */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb2(dev, skb, true);
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2151
 2152/**
 2153 * dev_forward_skb - loopback an skb to another netif
 2154 *
 2155 * @dev: destination network device
 2156 * @skb: buffer to forward
 2157 *
 2158 * return values:
 2159 *	NET_RX_SUCCESS	(no congestion)
 2160 *	NET_RX_DROP     (packet was dropped, but freed)
 2161 *
 2162 * dev_forward_skb can be used for injecting an skb from the
 2163 * start_xmit function of one device into the receive queue
 2164 * of another device.
 2165 *
 2166 * The receiving device may be in another namespace, so
 2167 * we have to clear all information in the skb that could
 2168 * impact namespace isolation.
 2169 */
 2170int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2171{
 2172	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 2173}
 2174EXPORT_SYMBOL_GPL(dev_forward_skb);
 2175
 2176int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
 2177{
 2178	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
 2179}
 2180
 2181static inline int deliver_skb(struct sk_buff *skb,
 2182			      struct packet_type *pt_prev,
 2183			      struct net_device *orig_dev)
 2184{
 2185	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2186		return -ENOMEM;
 2187	refcount_inc(&skb->users);
 2188	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2189}
 2190
 2191static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2192					  struct packet_type **pt,
 2193					  struct net_device *orig_dev,
 2194					  __be16 type,
 2195					  struct list_head *ptype_list)
 2196{
 2197	struct packet_type *ptype, *pt_prev = *pt;
 2198
 2199	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2200		if (ptype->type != type)
 2201			continue;
 2202		if (pt_prev)
 2203			deliver_skb(skb, pt_prev, orig_dev);
 2204		pt_prev = ptype;
 2205	}
 2206	*pt = pt_prev;
 2207}
 2208
 2209static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2210{
 2211	if (!ptype->af_packet_priv || !skb->sk)
 2212		return false;
 2213
 2214	if (ptype->id_match)
 2215		return ptype->id_match(ptype, skb->sk);
 2216	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2217		return true;
 2218
 2219	return false;
 2220}
 2221
 2222/**
 2223 * dev_nit_active - return true if any network interface taps are in use
 2224 *
 2225 * @dev: network device to check for the presence of taps
 2226 */
 2227bool dev_nit_active(struct net_device *dev)
 2228{
 2229	return !list_empty(&net_hotdata.ptype_all) ||
 2230	       !list_empty(&dev->ptype_all);
 2231}
 2232EXPORT_SYMBOL_GPL(dev_nit_active);
 2233
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	Walks net_hotdata.ptype_all first and dev->ptype_all second, all
 *	under rcu_read_lock(). @skb is cloned at most once; the clone is
 *	what the tap handlers receive, @skb itself is never passed to a
 *	handler here. Nothing is delivered if the clone cannot be allocated.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct list_head *ptype_list = &net_hotdata.ptype_all;
	struct packet_type *ptype, *pt_prev = NULL;
	struct sk_buff *skb2 = NULL;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* handler opted out of outgoing traffic */
		if (READ_ONCE(ptype->ignore_outgoing))
			continue;

		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		/* Delivery lags one handler behind: the previous match gets
		 * the clone through deliver_skb() (which takes an extra
		 * reference), the current one is remembered for later.
		 */
		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	/* second pass over the per-device tap list, then fall through */
	if (ptype_list == &net_hotdata.ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* Last matching handler: called directly, without the extra
	 * reference deliver_skb() would take. The clone is freed here if
	 * skb_orphan_frags_rx() fails. pt_prev is still NULL when the clone
	 * allocation failed, so this is skipped in that case.
	 */
	if (pt_prev) {
		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
		else
			kfree_skb(skb2);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2303
 2304/**
 2305 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2306 * @dev: Network device
 2307 * @txq: number of queues available
 2308 *
 2309 * If real_num_tx_queues is changed the tc mappings may no longer be
 2310 * valid. To resolve this verify the tc mapping remains valid and if
 2311 * not NULL the mapping. With no priorities mapping to this
 2312 * offset/count pair it will no longer be used. In the worst case TC0
 2313 * is invalid nothing can be done so disable priority mappings. If is
 2314 * expected that drivers will fix this mapping if they can before
 2315 * calling netif_set_real_num_tx_queues.
 2316 */
 2317static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2318{
 2319	int i;
 2320	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2321
 2322	/* If TC0 is invalidated disable TC mapping */
 2323	if (tc->offset + tc->count > txq) {
 2324		netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2325		dev->num_tc = 0;
 2326		return;
 2327	}
 2328
 2329	/* Invalidated prio to tc mappings set to TC0 */
 2330	for (i = 1; i < TC_BITMASK + 1; i++) {
 2331		int q = netdev_get_prio_tc_map(dev, i);
 2332
 2333		tc = &dev->tc_to_txq[q];
 2334		if (tc->offset + tc->count > txq) {
 2335			netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2336				    i, q);
 2337			netdev_set_prio_tc_map(dev, i, 0);
 2338		}
 2339	}
 2340}
 2341
 2342int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2343{
 2344	if (dev->num_tc) {
 2345		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2346		int i;
 2347
 2348		/* walk through the TCs and see if it falls into any of them */
 2349		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2350			if ((txq - tc->offset) < tc->count)
 2351				return i;
 2352		}
 2353
 2354		/* didn't find it, just return -1 to indicate no match */
 2355		return -1;
 2356	}
 2357
 2358	return 0;
 2359}
 2360EXPORT_SYMBOL(netdev_txq_to_tc);
 2361
 2362#ifdef CONFIG_XPS
/* Static keys raised in __netif_set_xps_queue() when a device gets its
 * first map of a type and lowered again in reset_xps_maps().
 */
static struct static_key xps_needed __read_mostly;
static struct static_key xps_rxqs_needed __read_mostly;
/* Held around every read and update of the XPS maps in this file. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an RCU-protected XPS pointer; the caller holds xps_map_mutex. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2368
/*
 * Remove tx queue @index from the map stored at dev_maps->attr_map[@tci].
 *
 * When @index is the only entry left, the slot is cleared in @dev_maps (and
 * in @old_maps if given) and the map is freed after a grace period.
 *
 * Returns true when the slot still holds a map on exit, whether or not
 * @index was found in it. Returns false when the slot was already empty or
 * the map was just freed.
 */
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
			     struct xps_dev_maps *old_maps, int tci, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	map = xmap_dereference(dev_maps->attr_map[tci]);
	if (!map)
		return false;

	/* scan from the last entry down to the first */
	for (pos = map->len; pos--;) {
		if (map->queues[pos] != index)
			continue;

		/* more than one entry: overwrite with the last one, shrink */
		if (map->len > 1) {
			map->queues[pos] = map->queues[--map->len];
			break;
		}

		/* last entry: drop the whole map */
		if (old_maps)
			RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
		kfree_rcu(map, rcu);
		return false;
	}

	return true;
}
 2397
/*
 * Remove tx queues [@offset, @offset + @count) from every traffic class
 * slot belonging to id @cpu in @dev_maps.
 *
 * For each slot the removal stops at the first remove_xps_queue() call that
 * reports the slot as empty. Returns true if at least one slot still held a
 * map after all @count queues were processed. @dev is not used.
 */
static bool remove_xps_queue_cpu(struct net_device *dev,
				 struct xps_dev_maps *dev_maps,
				 int cpu, u16 offset, u16 count)
{
	int num_tc = dev_maps->num_tc;
	bool active = false;
	int tci;

	/* slots of one id are contiguous: cpu * num_tc .. + num_tc - 1 */
	for (tci = cpu * num_tc; num_tc--; tci++) {
		int i, j;

		for (i = count, j = offset; i--; j++) {
			if (!remove_xps_queue(dev_maps, NULL, tci, j))
				break;
		}

		/* i is -1 only when the inner loop ran to completion */
		active |= i < 0;
	}

	return active;
}
 2419
/*
 * Detach @dev_maps from @dev for map @type: lower the static key(s) raised
 * when the map was installed, clear dev->xps_maps[@type] and free @dev_maps
 * after a grace period.
 */
static void reset_xps_maps(struct net_device *dev,
			   struct xps_dev_maps *dev_maps,
			   enum xps_map_type type)
{
	static_key_slow_dec_cpuslocked(&xps_needed);
	if (type == XPS_RXQS)
		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);

	RCU_INIT_POINTER(dev->xps_maps[type], NULL);

	kfree_rcu(dev_maps, rcu);
}
 2432
/*
 * Remove tx queues [@offset, @offset + @count) from the @type XPS maps of
 * @dev. Does nothing when the device has no maps of that type.
 *
 * If no id keeps an active map afterwards the whole map set is released
 * through reset_xps_maps(). For XPS_CPUS the NUMA node of each affected tx
 * queue is additionally reset to NUMA_NO_NODE.
 */
static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
			   u16 offset, u16 count)
{
	struct xps_dev_maps *dev_maps;
	bool active = false;
	int i, j;

	dev_maps = xmap_dereference(dev->xps_maps[type]);
	if (!dev_maps)
		return;

	for (j = 0; j < dev_maps->nr_ids; j++)
		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
	if (!active)
		reset_xps_maps(dev, dev_maps, type);

	if (type == XPS_CPUS) {
		/* walks the range from its last queue down to @offset */
		for (i = offset + (count - 1); count--; i--)
			netdev_queue_numa_node_write(
				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
	}
}
 2455
/*
 * Remove tx queues [@offset, @offset + @count) of @dev from all XPS maps.
 *
 * Returns immediately when the xps_needed static key is off. Otherwise
 * takes cpus_read_lock() and then xps_map_mutex, cleans the rx-queue maps
 * (only if xps_rxqs_needed is on) and the CPU maps, and releases the locks
 * in reverse order.
 */
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
				   u16 count)
{
	if (!static_key_false(&xps_needed))
		return;

	cpus_read_lock();
	mutex_lock(&xps_map_mutex);

	if (static_key_false(&xps_rxqs_needed))
		clean_xps_maps(dev, XPS_RXQS, offset, count);

	clean_xps_maps(dev, XPS_CPUS, offset, count);

	mutex_unlock(&xps_map_mutex);
	cpus_read_unlock();
}
 2473
 2474static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2475{
 2476	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2477}
 2478
 2479static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2480				      u16 index, bool is_rxqs_map)
 2481{
 2482	struct xps_map *new_map;
 2483	int alloc_len = XPS_MIN_MAP_ALLOC;
 2484	int i, pos;
 2485
 2486	for (pos = 0; map && pos < map->len; pos++) {
 2487		if (map->queues[pos] != index)
 2488			continue;
 2489		return map;
 2490	}
 2491
 2492	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2493	if (map) {
 2494		if (pos < map->alloc_len)
 2495			return map;
 2496
 2497		alloc_len = map->alloc_len * 2;
 2498	}
 2499
 2500	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2501	 *  map
 2502	 */
 2503	if (is_rxqs_map)
 2504		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2505	else
 2506		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2507				       cpu_to_node(attr_index));
 2508	if (!new_map)
 2509		return NULL;
 2510
 2511	for (i = 0; i < pos; i++)
 2512		new_map->queues[i] = map->queues[i];
 2513	new_map->alloc_len = alloc_len;
 2514	new_map->len = pos;
 2515
 2516	return new_map;
 2517}
 2518
 2519/* Copy xps maps at a given index */
 2520static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
 2521			      struct xps_dev_maps *new_dev_maps, int index,
 2522			      int tc, bool skip_tc)
 2523{
 2524	int i, tci = index * dev_maps->num_tc;
 2525	struct xps_map *map;
 2526
 2527	/* copy maps belonging to foreign traffic classes */
 2528	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 2529		if (i == tc && skip_tc)
 2530			continue;
 2531
 2532		/* fill in the new device map from the old device map */
 2533		map = xmap_dereference(dev_maps->attr_map[tci]);
 2534		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2535	}
 2536}
 2537
/*
 * __netif_set_xps_queue - assign tx queue @index to the ids set in @mask
 * @dev:   network device owning the queue
 * @mask:  bitmap of CPUs (XPS_CPUS) or rx queues (XPS_RXQS) to map to @index
 * @index: tx queue index
 * @type:  which XPS map to update
 *
 * Builds a fresh xps_dev_maps for @type, adds @index to the slot of every
 * id selected by @mask (and, for CPU maps on multi-CPU systems, also
 * online), publishes it with rcu_assign_pointer() and then retires the old
 * maps. @index is removed from every slot it should no longer be in, and
 * the map set is released entirely if nothing remains active.
 *
 * Must be called under cpus_read_lock. Takes xps_map_mutex internally.
 *
 * Returns 0 on success (including the case where @mask selects no id),
 * -EINVAL when @dev is a subordinate device (num_tc < 0) or @index belongs
 * to no traffic class, and -ENOMEM on allocation failure.
 */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
			  u16 index, enum xps_map_type type)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
	const unsigned long *online_mask = NULL;
	bool active = false, copy = false;
	/* numa_node_id: -2 = no CPU seen yet, -1 = CPUs span several nodes */
	int i, j, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	unsigned int nr_ids;

	WARN_ON_ONCE(index >= dev->num_tx_queues);

	if (dev->num_tc) {
		/* Do not allow XPS on subordinate device directly */
		num_tc = dev->num_tc;
		if (num_tc < 0)
			return -EINVAL;

		/* If queue belongs to subordinate dev use its map */
		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps[type]);
	if (type == XPS_RXQS) {
		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
		nr_ids = dev->num_rx_queues;
	} else {
		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
		/* online_mask stays NULL on single-CPU systems */
		if (num_possible_cpus() > 1)
			online_mask = cpumask_bits(cpu_online_mask);
		nr_ids = nr_cpu_ids;
	}

	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	/* The old dev_maps could be larger or smaller than the one we're
	 * setting up now, as dev->num_tc or nr_ids could have been updated in
	 * between. We could try to be smart, but let's be safe instead and only
	 * copy foreign traffic classes if the two map sizes match.
	 */
	if (dev_maps &&
	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
		copy = true;

	/* allocate memory for queue storage */
	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
	     j < nr_ids;) {
		/* the container is only allocated once an id is selected */
		if (!new_dev_maps) {
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
			if (!new_dev_maps) {
				mutex_unlock(&xps_map_mutex);
				return -ENOMEM;
			}

			new_dev_maps->nr_ids = nr_ids;
			new_dev_maps->num_tc = num_tc;
		}

		tci = j * num_tc + tc;
		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;

		/* may return the old map itself when it already has room */
		map = expand_xps_map(map, j, index, type == XPS_RXQS);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
	}

	/* @mask selected nothing: only removal from existing maps remains */
	if (!new_dev_maps)
		goto out_no_new_maps;

	if (!dev_maps) {
		/* Increment static keys at most once per type */
		static_key_slow_inc_cpuslocked(&xps_needed);
		if (type == XPS_RXQS)
			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
	}

	for (j = 0; j < nr_ids; j++) {
		bool skip_tc = false;

		tci = j * num_tc + tc;
		if (netif_attr_test_mask(j, mask, nr_ids) &&
		    netif_attr_test_online(j, online_mask, nr_ids)) {
			/* add tx-queue to CPU/rx-queue maps */
			int pos = 0;

			skip_tc = true;

			map = xmap_dereference(new_dev_maps->attr_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (type == XPS_CPUS) {
				if (numa_node_id == -2)
					numa_node_id = cpu_to_node(j);
				else if (numa_node_id != cpu_to_node(j))
					numa_node_id = -1;
			}
#endif
		}

		/* carry over the slots this call does not own */
		if (copy)
			xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
					  skip_tc);
	}

	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	/* NOTE(review): the inner bound is the new num_tc while the stride is
	 * the old dev_maps->num_tc. The two agree whenever copy is true;
	 * confirm the mismatching case cannot walk past the old array.
	 */
	for (j = 0; j < dev_maps->nr_ids; j++) {
		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
			map = xmap_dereference(dev_maps->attr_map[tci]);
			if (!map)
				continue;

			/* still referenced by the new maps: keep it */
			if (copy) {
				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
				if (map == new_map)
					continue;
			}

			RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
			kfree_rcu(map, rcu);
		}
	}

	old_dev_maps = dev_maps;

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	if (type == XPS_CPUS)
		/* update Tx queue numa node */
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
					     (numa_node_id >= 0) ?
					     numa_node_id : NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes tx-queue from unused CPUs/rx-queues */
	for (j = 0; j < dev_maps->nr_ids; j++) {
		tci = j * dev_maps->num_tc;

		for (i = 0; i < dev_maps->num_tc; i++, tci++) {
			/* slots just populated for this queue are kept */
			if (i == tc &&
			    netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
			    netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
				continue;

			active |= remove_xps_queue(dev_maps,
						   copy ? old_dev_maps : NULL,
						   tci, index);
		}
	}

	if (old_dev_maps)
		kfree_rcu(old_dev_maps, rcu);

	/* free map if not active */
	if (!active)
		reset_xps_maps(dev, dev_maps, type);

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for (j = 0; j < nr_ids; j++) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = copy ?
			      xmap_dereference(dev_maps->attr_map[tci]) :
			      NULL;
			/* maps shared with the old set must survive */
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2742
 2743int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2744			u16 index)
 2745{
 2746	int ret;
 2747
 2748	cpus_read_lock();
 2749	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
 2750	cpus_read_unlock();
 2751
 2752	return ret;
 2753}
 2754EXPORT_SYMBOL(netif_set_xps_queue);
 2755
 2756#endif
 2757static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2758{
 2759	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2760
 2761	/* Unbind any subordinate channels */
 2762	while (txq-- != &dev->_tx[0]) {
 2763		if (txq->sb_dev)
 2764			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2765	}
 2766}
 2767
 2768void netdev_reset_tc(struct net_device *dev)
 2769{
 2770#ifdef CONFIG_XPS
 2771	netif_reset_xps_queues_gt(dev, 0);
 2772#endif
 2773	netdev_unbind_all_sb_channels(dev);
 2774
 2775	/* Reset TC configuration of device */
 2776	dev->num_tc = 0;
 2777	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2778	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2779}
 2780EXPORT_SYMBOL(netdev_reset_tc);
 2781
 2782int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2783{
 2784	if (tc >= dev->num_tc)
 2785		return -EINVAL;
 2786
 2787#ifdef CONFIG_XPS
 2788	netif_reset_xps_queues(dev, offset, count);
 2789#endif
 2790	dev->tc_to_txq[tc].count = count;
 2791	dev->tc_to_txq[tc].offset = offset;
 2792	return 0;
 2793}
 2794EXPORT_SYMBOL(netdev_set_tc_queue);
 2795
/*
 * Set the number of traffic classes of @dev to @num_tc.
 *
 * Before the new value is stored, the XPS state of all tx queues is reset
 * (when CONFIG_XPS is set) and every subordinate channel is unbound.
 *
 * Returns 0 on success, -EINVAL when @num_tc exceeds TC_MAX_QUEUE.
 */
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
	if (num_tc > TC_MAX_QUEUE)
		return -EINVAL;

#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	dev->num_tc = num_tc;
	return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);
 2810
 2811void netdev_unbind_sb_channel(struct net_device *dev,
 2812			      struct net_device *sb_dev)
 2813{
 2814	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2815
 2816#ifdef CONFIG_XPS
 2817	netif_reset_xps_queues_gt(sb_dev, 0);
 2818#endif
 2819	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2820	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2821
 2822	while (txq-- != &dev->_tx[0]) {
 2823		if (txq->sb_dev == sb_dev)
 2824			txq->sb_dev = NULL;
 2825	}
 2826}
 2827EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2828
 2829int netdev_bind_sb_channel_queue(struct net_device *dev,
 2830				 struct net_device *sb_dev,
 2831				 u8 tc, u16 count, u16 offset)
 2832{
 2833	/* Make certain the sb_dev and dev are already configured */
 2834	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2835		return -EINVAL;
 2836
 2837	/* We cannot hand out queues we don't have */
 2838	if ((offset + count) > dev->real_num_tx_queues)
 2839		return -EINVAL;
 2840
 2841	/* Record the mapping */
 2842	sb_dev->tc_to_txq[tc].count = count;
 2843	sb_dev->tc_to_txq[tc].offset = offset;
 2844
 2845	/* Provide a way for Tx queue to find the tc_to_txq map or
 2846	 * XPS map for itself.
 2847	 */
 2848	while (count--)
 2849		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2850
 2851	return 0;
 2852}
 2853EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2854
/*
 * Mark @dev as subordinate channel @channel by storing the negated channel
 * number in dev->num_tc (0 leaves num_tc at 0).
 *
 * Returns 0 on success, -ENODEV when @dev is multiqueue, -EINVAL when
 * @channel exceeds S16_MAX.
 */
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
	/* Do not use a multiqueue device to represent a subordinate channel */
	if (netif_is_multiqueue(dev))
		return -ENODEV;

	/* We allow channels 1 - 32767 to be used for subordinate channels.
	 * Channel 0 is meant to be "native" mode and used only to represent
	 * the main root device. We allow writing 0 to reset the device back
	 * to normal mode after being used as a subordinate channel.
	 */
	if (channel > S16_MAX)
		return -EINVAL;

	/* a negative num_tc is what marks a subordinate device */
	dev->num_tc = -channel;

	return 0;
}
EXPORT_SYMBOL(netdev_set_sb_channel);
 2874
 2875/*
 2876 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2877 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2878 */
 2879int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2880{
 2881	bool disabling;
 2882	int rc;
 2883
 2884	disabling = txq < dev->real_num_tx_queues;
 2885
 2886	if (txq < 1 || txq > dev->num_tx_queues)
 2887		return -EINVAL;
 2888
 2889	if (dev->reg_state == NETREG_REGISTERED ||
 2890	    dev->reg_state == NETREG_UNREGISTERING) {
 2891		ASSERT_RTNL();
 2892
 2893		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2894						  txq);
 2895		if (rc)
 2896			return rc;
 2897
 2898		if (dev->num_tc)
 2899			netif_setup_tc(dev, txq);
 2900
 2901		dev_qdisc_change_real_num_tx(dev, txq);
 2902
 2903		dev->real_num_tx_queues = txq;
 2904
 2905		if (disabling) {
 2906			synchronize_net();
 2907			qdisc_reset_all_tx_gt(dev, txq);
 2908#ifdef CONFIG_XPS
 2909			netif_reset_xps_queues_gt(dev, txq);
 2910#endif
 2911		}
 2912	} else {
 2913		dev->real_num_tx_queues = txq;
 2914	}
 2915
 
 2916	return 0;
 2917}
 2918EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2919
 2920#ifdef CONFIG_SYSFS
 2921/**
 2922 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2923 *	@dev: Network device
 2924 *	@rxq: Actual number of RX queues
 2925 *
 2926 *	This must be called either with the rtnl_lock held or before
 2927 *	registration of the net device.  Returns 0 on success, or a
 2928 *	negative error code.  If called before registration, it always
 2929 *	succeeds.
 2930 */
 2931int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2932{
 2933	int rc;
 2934
 2935	if (rxq < 1 || rxq > dev->num_rx_queues)
 2936		return -EINVAL;
 2937
 2938	if (dev->reg_state == NETREG_REGISTERED) {
 2939		ASSERT_RTNL();
 2940
 2941		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 2942						  rxq);
 2943		if (rc)
 2944			return rc;
 2945	}
 2946
 2947	dev->real_num_rx_queues = rxq;
 2948	return 0;
 2949}
 2950EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 2951#endif
 2952
 2953/**
 2954 *	netif_set_real_num_queues - set actual number of RX and TX queues used
 2955 *	@dev: Network device
 2956 *	@txq: Actual number of TX queues
 2957 *	@rxq: Actual number of RX queues
 2958 *
 2959 *	Set the real number of both TX and RX queues.
 2960 *	Does nothing if the number of queues is already correct.
 2961 */
 2962int netif_set_real_num_queues(struct net_device *dev,
 2963			      unsigned int txq, unsigned int rxq)
 2964{
 2965	unsigned int old_rxq = dev->real_num_rx_queues;
 2966	int err;
 2967
 2968	if (txq < 1 || txq > dev->num_tx_queues ||
 2969	    rxq < 1 || rxq > dev->num_rx_queues)
 2970		return -EINVAL;
 2971
 2972	/* Start from increases, so the error path only does decreases -
 2973	 * decreases can't fail.
 2974	 */
 2975	if (rxq > dev->real_num_rx_queues) {
 2976		err = netif_set_real_num_rx_queues(dev, rxq);
 2977		if (err)
 2978			return err;
 2979	}
 2980	if (txq > dev->real_num_tx_queues) {
 2981		err = netif_set_real_num_tx_queues(dev, txq);
 2982		if (err)
 2983			goto undo_rx;
 2984	}
 2985	if (rxq < dev->real_num_rx_queues)
 2986		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
 2987	if (txq < dev->real_num_tx_queues)
 2988		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
 2989
 2990	return 0;
 2991undo_rx:
 2992	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
 2993	return err;
 2994}
 2995EXPORT_SYMBOL(netif_set_real_num_queues);
 2996
/**
 * netif_set_tso_max_size() - set the max size of TSO frames supported
 * @dev:	netdev to update
 * @size:	max skb->len of a TSO frame
 *
 * Set the limit on the size of TSO super-frames the device can handle.
 * Unless explicitly set the stack will assume the value of
 * %GSO_LEGACY_MAX_SIZE.
 *
 * The stored limit is capped at %GSO_MAX_SIZE. The GSO setters are called
 * with @size only when @size is below the device's current gso_max_size
 * and gso_ipv4_max_size respectively.
 */
void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
{
	/* never record more than GSO_MAX_SIZE */
	dev->tso_max_size = min(GSO_MAX_SIZE, size);
	/* the comparisons below use the caller's @size, not the capped value */
	if (size < READ_ONCE(dev->gso_max_size))
		netif_set_gso_max_size(dev, size);
	if (size < READ_ONCE(dev->gso_ipv4_max_size))
		netif_set_gso_ipv4_max_size(dev, size);
}
EXPORT_SYMBOL(netif_set_tso_max_size);
 3015
 3016/**
 3017 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
 3018 * @dev:	netdev to update
 3019 * @segs:	max number of TCP segments
 3020 *
 3021 * Set the limit on the number of TCP segments the device can generate from
 3022 * a single TSO super-frame.
 3023 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
 3024 */
 3025void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
 3026{
 3027	dev->tso_max_segs = segs;
 3028	if (segs < READ_ONCE(dev->gso_max_segs))
 3029		netif_set_gso_max_segs(dev, segs);
 3030}
 3031EXPORT_SYMBOL(netif_set_tso_max_segs);
 3032
/**
 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
 * @to:		netdev to update
 * @from:	netdev from which to copy the limits
 *
 * Applies @from's tso_max_size and tso_max_segs to @to through the regular
 * setters.
 */
void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
{
	netif_set_tso_max_size(to, from->tso_max_size);
	netif_set_tso_max_segs(to, from->tso_max_segs);
}
 3043EXPORT_SYMBOL(netif_inherit_tso_max);
 3044
 3045/**
 3046 * netif_get_num_default_rss_queues - default number of RSS queues
 3047 *
 3048 * Default value is the number of physical cores if there are only 1 or 2, or
 3049 * divided by 2 if there are more.
 3050 */
 3051int netif_get_num_default_rss_queues(void)
 3052{
 3053	cpumask_var_t cpus;
 3054	int cpu, count = 0;
 3055
 3056	if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
 3057		return 1;
 3058
 3059	cpumask_copy(cpus, cpu_online_mask);
 3060	for_each_cpu(cpu, cpus) {
 3061		++count;
 3062		cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
 3063	}
 3064	free_cpumask_var(cpus);
 3065
 3066	return count > 2 ? DIV_ROUND_UP(count, 2) : count;
 3067}
 3068EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3069
 3070static void __netif_reschedule(struct Qdisc *q)
 3071{
 3072	struct softnet_data *sd;
 3073	unsigned long flags;
 3074
 3075	local_irq_save(flags);
 3076	sd = this_cpu_ptr(&softnet_data);
 3077	q->next_sched = NULL;
 3078	*sd->output_queue_tailp = q;
 3079	sd->output_queue_tailp = &q->next_sched;
 3080	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3081	local_irq_restore(flags);
 3082}
 3083
 3084void __netif_schedule(struct Qdisc *q)
 3085{
 3086	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3087		__netif_reschedule(q);
 3088}
 3089EXPORT_SYMBOL(__netif_schedule);
 3090
/* Layout of skb->cb while an skb waits on the completion queue. */
struct dev_kfree_skb_cb {
	/* Drop reason recorded when the free was deferred. */
	enum skb_drop_reason reason;
};
 3094
 3095static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3096{
 3097	return (struct dev_kfree_skb_cb *)skb->cb;
 3098}
 3099
 3100void netif_schedule_queue(struct netdev_queue *txq)
 3101{
 3102	rcu_read_lock();
 3103	if (!netif_xmit_stopped(txq)) {
 3104		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3105
 3106		__netif_schedule(q);
 3107	}
 3108	rcu_read_unlock();
 3109}
 3110EXPORT_SYMBOL(netif_schedule_queue);
 3111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3112void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3113{
 3114	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3115		struct Qdisc *q;
 3116
 3117		rcu_read_lock();
 3118		q = rcu_dereference(dev_queue->qdisc);
 3119		__netif_schedule(q);
 3120		rcu_read_unlock();
 3121	}
 3122}
 3123EXPORT_SYMBOL(netif_tx_wake_queue);
 3124
/*
 * Drop one reference on @skb and, if it was the last one, defer the actual
 * free: @reason is stored in skb->cb, the skb is pushed onto this CPU's
 * softnet completion queue and NET_TX_SOFTIRQ is raised.
 * A NULL @skb is ignored.
 */
void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	unsigned long flags;

	if (unlikely(!skb))
		return;

	/* Fast path: a single user means no atomic decrement is needed, the
	 * count is simply set to zero after a read barrier.
	 */
	if (likely(refcount_read(&skb->users) == 1)) {
		smp_rmb();
		refcount_set(&skb->users, 0);
	} else if (likely(!refcount_dec_and_test(&skb->users))) {
		/* Other references remain; nothing to free yet. */
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	/* Link the skb at the head of the per-CPU list with IRQs off. */
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
 3145EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
 3146
 3147void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 3148{
 3149	if (in_hardirq() || irqs_disabled())
 3150		dev_kfree_skb_irq_reason(skb, reason);
 3151	else
 3152		kfree_skb_reason(skb, reason);
 3153}
 3154EXPORT_SYMBOL(dev_kfree_skb_any_reason);
 3155
 3156
 3157/**
 3158 * netif_device_detach - mark device as removed
 3159 * @dev: network device
 3160 *
 3161 * Mark device as removed from system and therefore no longer available.
 3162 */
 3163void netif_device_detach(struct net_device *dev)
 3164{
 3165	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3166	    netif_running(dev)) {
 3167		netif_tx_stop_all_queues(dev);
 3168	}
 3169}
 3170EXPORT_SYMBOL(netif_device_detach);
 3171
 3172/**
 3173 * netif_device_attach - mark device as attached
 3174 * @dev: network device
 3175 *
 3176 * Mark device as attached from system and restart if needed.
 3177 */
 3178void netif_device_attach(struct net_device *dev)
 3179{
 3180	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3181	    netif_running(dev)) {
 3182		netif_tx_wake_all_queues(dev);
 3183		__netdev_watchdog_up(dev);
 3184	}
 3185}
 3186EXPORT_SYMBOL(netif_device_attach);
 3187
 3188/*
 3189 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3190 * to be used as a distribution range.
 3191 */
 3192static u16 skb_tx_hash(const struct net_device *dev,
 3193		       const struct net_device *sb_dev,
 3194		       struct sk_buff *skb)
 3195{
 3196	u32 hash;
 3197	u16 qoffset = 0;
 3198	u16 qcount = dev->real_num_tx_queues;
 3199
 3200	if (dev->num_tc) {
 3201		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3202
 3203		qoffset = sb_dev->tc_to_txq[tc].offset;
 3204		qcount = sb_dev->tc_to_txq[tc].count;
 3205		if (unlikely(!qcount)) {
 3206			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
 3207					     sb_dev->name, qoffset, tc);
 3208			qoffset = 0;
 3209			qcount = dev->real_num_tx_queues;
 3210		}
 3211	}
 3212
 3213	if (skb_rx_queue_recorded(skb)) {
 3214		DEBUG_NET_WARN_ON_ONCE(qcount == 0);
 3215		hash = skb_get_rx_queue(skb);
 3216		if (hash >= qoffset)
 3217			hash -= qoffset;
 3218		while (unlikely(hash >= qcount))
 3219			hash -= qcount;
 3220		return hash + qoffset;
 
 
 
 
 3221	}
 3222
 3223	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3224}
 
 3225
 3226void skb_warn_bad_offload(const struct sk_buff *skb)
 3227{
 3228	static const netdev_features_t null_features;
 3229	struct net_device *dev = skb->dev;
 3230	const char *name = "";
 3231
 3232	if (!net_ratelimit())
 3233		return;
 3234
 3235	if (dev) {
 3236		if (dev->dev.parent)
 3237			name = dev_driver_string(dev->dev.parent);
 3238		else
 3239			name = netdev_name(dev);
 3240	}
 3241	skb_dump(KERN_WARNING, skb, false);
 3242	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3243	     name, dev ? &dev->features : &null_features,
 3244	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 
 
 3245}
 3246
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * For CHECKSUM_COMPLETE the skb is only switched to CHECKSUM_NONE.
 * Otherwise the checksum is computed in software from the checksum start
 * offset to the end of the packet and stored at start + skb->csum_offset,
 * after which the skb is marked CHECKSUM_NONE.
 *
 * Returns 0 on success, -EINVAL for GSO packets or when the checksum
 * offsets fall outside the linear area, or the error from
 * __skb_linearize() / skb_ensure_writable().
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	/* A GSO packet cannot be checksummed here as a single unit. */
	if (unlikely(skb_is_gso(skb))) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	/* Both range checks below bail out with -EINVAL. */
	ret = -EINVAL;
	if (unlikely(offset >= skb_headlen(skb))) {
		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
		WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
			  offset, skb_headlen(skb));
		goto out;
	}
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* From here on, offset is where the checksum field itself lives. */
	offset += skb->csum_offset;
	if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
		WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
			  offset + sizeof(__sum16), skb_headlen(skb));
		goto out;
	}
	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
	if (ret)
		goto out;

	/* A folded result of zero is stored as CSUM_MANGLED_0 instead. */
	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
 3300EXPORT_SYMBOL(skb_checksum_help);
 3301
 3302int skb_crc32c_csum_help(struct sk_buff *skb)
 3303{
 3304	__le32 crc32c_csum;
 3305	int ret = 0, offset, start;
 3306
 3307	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3308		goto out;
 3309
 3310	if (unlikely(skb_is_gso(skb)))
 3311		goto out;
 3312
 3313	/* Before computing a checksum, we should make sure no frag could
 3314	 * be modified by an external entity : checksum could be wrong.
 3315	 */
 3316	if (unlikely(skb_has_shared_frag(skb))) {
 3317		ret = __skb_linearize(skb);
 3318		if (ret)
 3319			goto out;
 3320	}
 3321	start = skb_checksum_start_offset(skb);
 3322	offset = start + offsetof(struct sctphdr, checksum);
 3323	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3324		ret = -EINVAL;
 3325		goto out;
 3326	}
 3327
 3328	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3329	if (ret)
 3330		goto out;
 3331
 3332	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3333						  skb->len - start, ~(__u32)0,
 3334						  crc32c_csum_stub));
 3335	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3336	skb_reset_csum_not_inet(skb);
 3337out:
 3338	return ret;
 3339}
 3340
 3341__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3342{
 3343	__be16 type = skb->protocol;
 3344
 3345	/* Tunnel gso handlers can set protocol to ethernet. */
 3346	if (type == htons(ETH_P_TEB)) {
 3347		struct ethhdr *eth;
 3348
 3349		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3350			return 0;
 3351
 3352		eth = (struct ethhdr *)skb->data;
 3353		type = eth->h_proto;
 3354	}
 3355
 3356	return vlan_get_protocol_and_depth(skb, type, depth);
 3357}
 3358
 
 
 
 
 
 
 
 
 
 
 
 
 3359
 3360/* Take action when hardware reception checksum errors are detected. */
 3361#ifdef CONFIG_BUG
/*
 * Report a hardware receive checksum failure on @dev: log an error, dump
 * @skb in full and print the current stack.
 */
static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
	netdev_err(dev, "hw csum failure\n");
	skb_dump(KERN_ERR, skb, true);
	dump_stack();
}
 3368
/*
 * Entry point for reporting a hardware receive checksum failure; the
 * report itself goes through DO_ONCE_LITE().
 */
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
	DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
 3373EXPORT_SYMBOL(netdev_rx_csum_fault);
 3374#endif
 3375
 3376/* XXX: check that highmem exists at all on the given machine. */
 
 
 
 
 3377static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3378{
 3379#ifdef CONFIG_HIGHMEM
 3380	int i;
 3381
 3382	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3383		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3384			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3385
 3386			if (PageHighMem(skb_frag_page(frag)))
 3387				return 1;
 3388		}
 3389	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 3390#endif
 3391	return 0;
 3392}
 3393
 3394/* If MPLS offload request, verify we are testing hardware MPLS features
 3395 * instead of standard features for the netdev.
 3396 */
 3397#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3398static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3399					   netdev_features_t features,
 3400					   __be16 type)
 3401{
 3402	if (eth_p_mpls(type))
 3403		features &= skb->dev->mpls_features;
 3404
 3405	return features;
 3406}
 3407#else
 3408static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3409					   netdev_features_t features,
 3410					   __be16 type)
 3411{
 3412	return features;
 3413}
 3414#endif
 3415
 3416static netdev_features_t harmonize_features(struct sk_buff *skb,
 3417	netdev_features_t features)
 3418{
 
 3419	__be16 type;
 3420
 3421	type = skb_network_protocol(skb, NULL);
 3422	features = net_mpls_features(skb, features, type);
 3423
 3424	if (skb->ip_summed != CHECKSUM_NONE &&
 3425	    !can_checksum_protocol(features, type)) {
 3426		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3427	}
 3428	if (illegal_highdma(skb->dev, skb))
 3429		features &= ~NETIF_F_SG;
 3430
 3431	return features;
 3432}
 3433
/*
 * Features-check callback that restricts nothing: @features is returned
 * as given; @skb and @dev are not looked at.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
 3440EXPORT_SYMBOL(passthru_features_check);
 3441
/*
 * Default features check: only the VLAN-related restrictions computed by
 * vlan_features_check() are applied; @dev is unused.
 */
static netdev_features_t dflt_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
 3448
 3449static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3450					    struct net_device *dev,
 3451					    netdev_features_t features)
 3452{
 3453	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3454
 3455	if (gso_segs > READ_ONCE(dev->gso_max_segs))
 3456		return features & ~NETIF_F_GSO_MASK;
 3457
 3458	if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
 3459		return features & ~NETIF_F_GSO_MASK;
 3460
 3461	if (!skb_shinfo(skb)->gso_type) {
 3462		skb_warn_bad_offload(skb);
 3463		return features & ~NETIF_F_GSO_MASK;
 3464	}
 3465
 3466	/* Support for GSO partial features requires software
 3467	 * intervention before we can actually process the packets
 3468	 * so we need to strip support for any partial features now
 3469	 * and we can pull them back in after we have partially
 3470	 * segmented the frame.
 3471	 */
 3472	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3473		features &= ~dev->gso_partial_features;
 3474
 3475	/* Make sure to clear the IPv4 ID mangling feature if the
 3476	 * IPv4 header has the potential to be fragmented.
 3477	 */
 3478	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3479		struct iphdr *iph = skb->encapsulation ?
 3480				    inner_ip_hdr(skb) : ip_hdr(skb);
 3481
 3482		if (!(iph->frag_off & htons(IP_DF)))
 3483			features &= ~NETIF_F_TSO_MANGLEID;
 3484	}
 3485
 3486	return features;
 3487}
 3488
 3489netdev_features_t netif_skb_features(struct sk_buff *skb)
 3490{
 3491	struct net_device *dev = skb->dev;
 3492	netdev_features_t features = dev->features;
 3493
 3494	if (skb_is_gso(skb))
 3495		features = gso_features_check(skb, dev, features);
 3496
 3497	/* If encapsulation offload request, verify we are testing
 3498	 * hardware encapsulation features instead of standard
 3499	 * features for the netdev
 3500	 */
 3501	if (skb->encapsulation)
 3502		features &= dev->hw_enc_features;
 3503
 3504	if (skb_vlan_tagged(skb))
 3505		features = netdev_intersect_features(features,
 3506						     dev->vlan_features |
 3507						     NETIF_F_HW_VLAN_CTAG_TX |
 3508						     NETIF_F_HW_VLAN_STAG_TX);
 3509
 3510	if (dev->netdev_ops->ndo_features_check)
 3511		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3512								features);
 3513	else
 3514		features &= dflt_features_check(skb, dev, features);
 3515
 3516	return harmonize_features(skb, features);
 3517}
 3518EXPORT_SYMBOL(netif_skb_features);
 3519
 3520static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3521		    struct netdev_queue *txq, bool more)
 3522{
 3523	unsigned int len;
 3524	int rc;
 3525
 3526	if (dev_nit_active(dev))
 3527		dev_queue_xmit_nit(skb, dev);
 3528
 3529	len = skb->len;
 3530	trace_net_dev_start_xmit(skb, dev);
 3531	rc = netdev_start_xmit(skb, dev, txq, more);
 3532	trace_net_dev_xmit(skb, rc, dev, len);
 3533
 3534	return rc;
 3535}
 3536
 3537struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3538				    struct netdev_queue *txq, int *ret)
 3539{
 3540	struct sk_buff *skb = first;
 3541	int rc = NETDEV_TX_OK;
 3542
 3543	while (skb) {
 3544		struct sk_buff *next = skb->next;
 3545
 3546		skb_mark_not_on_list(skb);
 3547		rc = xmit_one(skb, dev, txq, next != NULL);
 3548		if (unlikely(!dev_xmit_complete(rc))) {
 3549			skb->next = next;
 3550			goto out;
 3551		}
 3552
 3553		skb = next;
 3554		if (netif_tx_queue_stopped(txq) && skb) {
 3555			rc = NETDEV_TX_BUSY;
 3556			break;
 3557		}
 3558	}
 3559
 3560out:
 3561	*ret = rc;
 3562	return skb;
 3563}
 3564
 3565static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3566					  netdev_features_t features)
 3567{
 3568	if (skb_vlan_tag_present(skb) &&
 3569	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3570		skb = __vlan_hwaccel_push_inside(skb);
 3571	return skb;
 3572}
 3573
 3574int skb_csum_hwoffload_help(struct sk_buff *skb,
 3575			    const netdev_features_t features)
 3576{
 3577	if (unlikely(skb_csum_is_sctp(skb)))
 3578		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3579			skb_crc32c_csum_help(skb);
 3580
 3581	if (features & NETIF_F_HW_CSUM)
 3582		return 0;
 3583
 3584	if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
 3585		switch (skb->csum_offset) {
 3586		case offsetof(struct tcphdr, check):
 3587		case offsetof(struct udphdr, check):
 3588			return 0;
 3589		}
 3590	}
 3591
 3592	return skb_checksum_help(skb);
 3593}
 3594EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3595
 3596static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3597{
 3598	netdev_features_t features;
 3599
 3600	features = netif_skb_features(skb);
 3601	skb = validate_xmit_vlan(skb, features);
 3602	if (unlikely(!skb))
 3603		goto out_null;
 3604
 3605	skb = sk_validate_xmit_skb(skb, dev);
 3606	if (unlikely(!skb))
 3607		goto out_null;
 3608
 3609	if (netif_needs_gso(skb, features)) {
 3610		struct sk_buff *segs;
 3611
 3612		segs = skb_gso_segment(skb, features);
 3613		if (IS_ERR(segs)) {
 3614			goto out_kfree_skb;
 3615		} else if (segs) {
 3616			consume_skb(skb);
 3617			skb = segs;
 3618		}
 3619	} else {
 3620		if (skb_needs_linearize(skb, features) &&
 3621		    __skb_linearize(skb))
 3622			goto out_kfree_skb;
 3623
 3624		/* If packet is not checksummed and device does not
 3625		 * support checksumming for this protocol, complete
 3626		 * checksumming here.
 3627		 */
 3628		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3629			if (skb->encapsulation)
 3630				skb_set_inner_transport_header(skb,
 3631							       skb_checksum_start_offset(skb));
 3632			else
 3633				skb_set_transport_header(skb,
 3634							 skb_checksum_start_offset(skb));
 3635			if (skb_csum_hwoffload_help(skb, features))
 
 3636				goto out_kfree_skb;
 3637		}
 3638	}
 3639
 3640	skb = validate_xmit_xfrm(skb, features, again);
 3641
 3642	return skb;
 3643
 3644out_kfree_skb:
 3645	kfree_skb(skb);
 3646out_null:
 3647	dev_core_stats_tx_dropped_inc(dev);
 3648	return NULL;
 3649}
 3650
 3651struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3652{
 3653	struct sk_buff *next, *head = NULL, *tail;
 3654
 3655	for (; skb != NULL; skb = next) {
 3656		next = skb->next;
 3657		skb_mark_not_on_list(skb);
 3658
 3659		/* in case skb wont be segmented, point to itself */
 3660		skb->prev = skb;
 3661
 3662		skb = validate_xmit_skb(skb, dev, again);
 3663		if (!skb)
 3664			continue;
 3665
 3666		if (!head)
 3667			head = skb;
 3668		else
 3669			tail->next = skb;
 3670		/* If skb was segmented, skb->prev points to
 3671		 * the last segment. If not, it still contains skb.
 3672		 */
 3673		tail = skb->prev;
 3674	}
 3675	return head;
 3676}
 3677EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3678
 3679static void qdisc_pkt_len_init(struct sk_buff *skb)
 3680{
 3681	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3682
 3683	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3684
 3685	/* To get more precise estimation of bytes sent on wire,
 3686	 * we add to pkt_len the headers size of all segments
 3687	 */
 3688	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3689		u16 gso_segs = shinfo->gso_segs;
 3690		unsigned int hdr_len;
 
 3691
 3692		/* mac layer + network layer */
 3693		hdr_len = skb_transport_offset(skb);
 3694
 3695		/* + transport layer */
 3696		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3697			const struct tcphdr *th;
 3698			struct tcphdr _tcphdr;
 3699
 3700			th = skb_header_pointer(skb, hdr_len,
 3701						sizeof(_tcphdr), &_tcphdr);
 3702			if (likely(th))
 3703				hdr_len += __tcp_hdrlen(th);
 3704		} else {
 3705			struct udphdr _udphdr;
 3706
 3707			if (skb_header_pointer(skb, hdr_len,
 3708					       sizeof(_udphdr), &_udphdr))
 3709				hdr_len += sizeof(struct udphdr);
 3710		}
 3711
 3712		if (shinfo->gso_type & SKB_GSO_DODGY)
 3713			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3714						shinfo->gso_size);
 3715
 3716		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3717	}
 3718}
 3719
 3720static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
 3721			     struct sk_buff **to_free,
 3722			     struct netdev_queue *txq)
 3723{
 3724	int rc;
 3725
 3726	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
 3727	if (rc == NET_XMIT_SUCCESS)
 3728		trace_qdisc_enqueue(q, txq, skb);
 3729	return rc;
 3730}
 3731
/*
 * Pass @skb to qdisc @q of queue @txq on @dev: the packet is either
 * enqueued on the qdisc, or, when the qdisc permits bypass and is empty
 * and not running, transmitted directly. Returns a NET_XMIT_* code.
 *
 * Two variants are handled: qdiscs flagged TCQ_F_NOLOCK, which are driven
 * without the root lock, and all others, which are serialized on
 * qdisc_lock(q).
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	/* skbs dropped while enqueueing are collected here and freed later */
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);

	tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);

	if (q->flags & TCQ_F_NOLOCK) {
		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
		    qdisc_run_begin(q)) {
			/* Retest nolock_qdisc_is_empty() within the protection
			 * of q->seqlock to protect from racing with requeuing.
			 */
			if (unlikely(!nolock_qdisc_is_empty(q))) {
				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
				__qdisc_run(q);
				qdisc_run_end(q);

				goto no_lock_out;
			}

			/* Empty qdisc: account the skb and send it directly. */
			qdisc_bstats_cpu_update(q, skb);
			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
			    !nolock_qdisc_is_empty(q))
				__qdisc_run(q);

			qdisc_run_end(q);
			return NET_XMIT_SUCCESS;
		}

		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
		qdisc_run(q);

no_lock_out:
		if (unlikely(to_free))
			kfree_skb_list_reason(to_free,
					      tcf_get_drop_reason(to_free));
		return rc;
	}

	/* q->owner holds this CPU's id only while this CPU is inside the
	 * enqueue call below, so a match here means we re-entered for the
	 * same qdisc: drop the packet as a reclassify loop.
	 */
	if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
		kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
		return NET_XMIT_DROP;
	}
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
	 * and then other tasks will only enqueue packets. The packets will be
	 * sent after the qdisc owner is scheduled again. To prevent this
	 * scenario the task always serialize on the lock.
	 */
	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		/* Qdisc has been deactivated: the packet cannot be queued. */
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* Release busylock before the longer qdisc run. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}

		qdisc_run_end(q);
		rc = NET_XMIT_SUCCESS;
	} else {
		/* Mark this CPU as owner for the duration of the enqueue. */
		WRITE_ONCE(q->owner, smp_processor_id());
		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
		WRITE_ONCE(q->owner, -1);
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
			qdisc_run_end(q);
		}
	}
	spin_unlock(root_lock);
	/* Free dropped skbs only after the root lock has been released. */
	if (unlikely(to_free))
		kfree_skb_list_reason(to_free,
				      tcf_get_drop_reason(to_free));
	/* busylock is still held if no qdisc run released it above. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
 3841
 3842#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3843static void skb_update_prio(struct sk_buff *skb)
 3844{
 3845	const struct netprio_map *map;
 3846	const struct sock *sk;
 3847	unsigned int prioidx;
 3848
 3849	if (skb->priority)
 3850		return;
 3851	map = rcu_dereference_bh(skb->dev->priomap);
 3852	if (!map)
 3853		return;
 3854	sk = skb_to_full_sk(skb);
 3855	if (!sk)
 3856		return;
 3857
 3858	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 
 
 3859
 3860	if (prioidx < map->priomap_len)
 3861		skb->priority = map->priomap[prioidx];
 
 3862}
 3863#else
 3864#define skb_update_prio(skb)
 3865#endif
 3866
 
 
 
/**
 *	dev_loopback_xmit - loop back @skb
 *	@net: network namespace this loopback is happening in
 *	@sk:  sk needed to be a netfilter okfn
 *	@skb: buffer to transmit
 *
 *	Turns an outgoing @skb into a received one and feeds it to
 *	netif_rx(). Always returns 0.
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* Present the packet as it would look on receive: MAC header at the
	 * current data pointer, data advanced to the network header.
	 */
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	/* A packet without a checksum is marked as not needing verification. */
	if (skb->ip_summed == CHECKSUM_NONE)
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx(skb);
	return 0;
}
 3885EXPORT_SYMBOL(dev_loopback_xmit);
 3886
 3887#ifdef CONFIG_NET_EGRESS
 3888static struct netdev_queue *
 3889netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
 3890{
 3891	int qm = skb_get_queue_mapping(skb);
 3892
 3893	return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
 3894}
 3895
/* Read this CPU's softnet_data.xmit.skip_txqueue flag. */
static bool netdev_xmit_txqueue_skipped(void)
{
	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
}
 3900
/* Set this CPU's softnet_data.xmit.skip_txqueue flag to @skip. */
void netdev_xmit_skip_txqueue(bool skip)
{
	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}
 3905EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
 3906#endif /* CONFIG_NET_EGRESS */
 3907
 3908#ifdef CONFIG_NET_XGRESS
 3909static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
 3910		  enum skb_drop_reason *drop_reason)
 3911{
 3912	int ret = TC_ACT_UNSPEC;
 3913#ifdef CONFIG_NET_CLS_ACT
 3914	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
 3915	struct tcf_result res;
 3916
 3917	if (!miniq)
 3918		return ret;
 
 
 3919
 3920	tc_skb_cb(skb)->mru = 0;
 3921	tc_skb_cb(skb)->post_ct = false;
 3922	tcf_set_drop_reason(skb, *drop_reason);
 3923
 3924	mini_qdisc_bstats_cpu_update(miniq, skb);
 3925	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
 3926	/* Only tcf related quirks below. */
 3927	switch (ret) {
 3928	case TC_ACT_SHOT:
 3929		*drop_reason = tcf_get_drop_reason(skb);
 3930		mini_qdisc_qstats_cpu_drop(miniq);
 3931		break;
 3932	case TC_ACT_OK:
 3933	case TC_ACT_RECLASSIFY:
 3934		skb->tc_index = TC_H_MIN(res.classid);
 3935		break;
 3936	}
 3937#endif /* CONFIG_NET_CLS_ACT */
 3938	return ret;
 3939}
 3940
 3941static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
 3942
/* Take one reference on the tcx_needed_key static branch. */
void tcx_inc(void)
{
	static_branch_inc(&tcx_needed_key);
}
 3947
/* Drop one reference on the tcx_needed_key static branch. */
void tcx_dec(void)
{
	static_branch_dec(&tcx_needed_key);
}
 3952
 3953static __always_inline enum tcx_action_base
 3954tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
 3955	const bool needs_mac)
 3956{
 3957	const struct bpf_mprog_fp *fp;
 3958	const struct bpf_prog *prog;
 3959	int ret = TCX_NEXT;
 3960
 3961	if (needs_mac)
 3962		__skb_push(skb, skb->mac_len);
 3963	bpf_mprog_foreach_prog(entry, fp, prog) {
 3964		bpf_compute_data_pointers(skb);
 3965		ret = bpf_prog_run(prog, skb);
 3966		if (ret != TCX_NEXT)
 3967			break;
 3968	}
 3969	if (needs_mac)
 3970		__skb_pull(skb, skb->mac_len);
 3971	return tcx_action_code(skb, ret);
 3972}
 3973
/*
 * Run the ingress tcx/tc hooks of skb->dev on @skb.
 *
 * Returns @skb when processing should continue with it, or NULL when the
 * packet was redirected, dropped or otherwise consumed; in the NULL case
 * *@ret holds the NET_RX_* result. *@another is set when a redirect asked
 * for the packet to be processed again (-EAGAIN from skb_do_redirect()).
 */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev, bool *another)
{
	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
	int sch_ret;

	/* Nothing attached on ingress for this device. */
	if (!entry)
		return skb;
	/* Deliver to the pending packet handler before running the hooks. */
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	tcx_set_ingress(skb, true);

	/* tcx BPF programs run first; tc_run() is only consulted when they
	 * leave the verdict unspecified.
	 */
	if (static_branch_unlikely(&tcx_needed_key)) {
		sch_ret = tcx_run(entry, skb, true);
		if (sch_ret != TC_ACT_UNSPEC)
			goto ingress_verdict;
	}
	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
ingress_verdict:
	switch (sch_ret) {
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by BPF, so we can safely
		 * push the L2 header back before redirecting to another
		 * netdev.
		 */
		__skb_push(skb, skb->mac_len);
		if (skb_do_redirect(skb) == -EAGAIN) {
			/* Undo the push and let the caller go around again. */
			__skb_pull(skb, skb->mac_len);
			*another = true;
			break;
		}
		*ret = NET_RX_SUCCESS;
		return NULL;
	case TC_ACT_SHOT:
		kfree_skb_reason(skb, drop_reason);
		*ret = NET_RX_DROP;
		return NULL;
	/* used by tc_run */
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		consume_skb(skb);
		fallthrough;
	case TC_ACT_CONSUMED:
		/* skb is already gone; only report success. */
		*ret = NET_RX_SUCCESS;
		return NULL;
	}

	return skb;
}
 4030
/*
 * Run the egress tcx/tc hooks of @dev on @skb.
 *
 * Returns @skb when transmission should continue with it, or NULL when
 * the packet was redirected, dropped or otherwise consumed; in the NULL
 * case *@ret holds the NET_XMIT_* result.
 */
static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
	int sch_ret;

	/* Nothing attached on egress for this device. */
	if (!entry)
		return skb;

	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
	 * already set by the caller.
	 */
	/* tcx BPF programs run first; tc_run() is only consulted when they
	 * leave the verdict unspecified.
	 */
	if (static_branch_unlikely(&tcx_needed_key)) {
		sch_ret = tcx_run(entry, skb, false);
		if (sch_ret != TC_ACT_UNSPEC)
			goto egress_verdict;
	}
	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
egress_verdict:
	switch (sch_ret) {
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	case TC_ACT_SHOT:
		kfree_skb_reason(skb, drop_reason);
		*ret = NET_XMIT_DROP;
		return NULL;
	/* used by tc_run */
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		consume_skb(skb);
		fallthrough;
	case TC_ACT_CONSUMED:
		/* skb is already gone; only report success. */
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	}

	return skb;
}
 4074#else
/* Stub used without CONFIG_NET_XGRESS: @skb is passed through untouched. */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev, bool *another)
{
	return skb;
}
 4081
 4082static __always_inline struct sk_buff *
 4083sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 4084{
 4085	return skb;
 4086}
 4087#endif /* CONFIG_NET_XGRESS */
 4088
 4089#ifdef CONFIG_XPS
 4090static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 4091			       struct xps_dev_maps *dev_maps, unsigned int tci)
 4092{
 4093	int tc = netdev_get_prio_tc_map(dev, skb->priority);
 4094	struct xps_map *map;
 4095	int queue_index = -1;
 4096
 4097	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
 4098		return queue_index;
 4099
 4100	tci *= dev_maps->num_tc;
 4101	tci += tc;
 4102
 4103	map = rcu_dereference(dev_maps->attr_map[tci]);
 4104	if (map) {
 4105		if (map->len == 1)
 4106			queue_index = map->queues[0];
 4107		else
 4108			queue_index = map->queues[reciprocal_scale(
 4109						skb_get_hash(skb), map->len)];
 4110		if (unlikely(queue_index >= dev->real_num_tx_queues))
 4111			queue_index = -1;
 4112	}
 4113	return queue_index;
 4114}
 4115#endif
 4116
 4117static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 4118			 struct sk_buff *skb)
 4119{
 4120#ifdef CONFIG_XPS
 4121	struct xps_dev_maps *dev_maps;
 4122	struct sock *sk = skb->sk;
 4123	int queue_index = -1;
 4124
 4125	if (!static_key_false(&xps_needed))
 4126		return -1;
 4127
 4128	rcu_read_lock();
 4129	if (!static_key_false(&xps_rxqs_needed))
 4130		goto get_cpus_map;
 4131
 4132	dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
 4133	if (dev_maps) {
 4134		int tci = sk_rx_queue_get(sk);
 4135
 4136		if (tci >= 0)
 4137			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 4138							  tci);
 4139	}
 4140
 4141get_cpus_map:
 4142	if (queue_index < 0) {
 4143		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
 4144		if (dev_maps) {
 4145			unsigned int tci = skb->sender_cpu - 1;
 4146
 4147			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 4148							  tci);
 
 
 
 
 
 
 
 4149		}
 4150	}
 4151	rcu_read_unlock();
 4152
 4153	return queue_index;
 4154#else
 4155	return -1;
 4156#endif
 4157}
 4158
 4159u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 4160		     struct net_device *sb_dev)
 4161{
 4162	return 0;
 4163}
 4164EXPORT_SYMBOL(dev_pick_tx_zero);
 4165
 4166u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 4167		       struct net_device *sb_dev)
 4168{
 4169	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 4170}
 4171EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 4172
 4173u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 4174		     struct net_device *sb_dev)
 4175{
 4176	struct sock *sk = skb->sk;
 4177	int queue_index = sk_tx_queue_get(sk);
 4178
 4179	sb_dev = sb_dev ? : dev;
 4180
 4181	if (queue_index < 0 || skb->ooo_okay ||
 4182	    queue_index >= dev->real_num_tx_queues) {
 4183		int new_index = get_xps_queue(dev, sb_dev, skb);
 4184
 4185		if (new_index < 0)
 4186			new_index = skb_tx_hash(dev, sb_dev, skb);
 4187
 4188		if (queue_index != new_index && sk &&
 4189		    sk_fullsock(sk) &&
 4190		    rcu_access_pointer(sk->sk_dst_cache))
 4191			sk_tx_queue_set(sk, new_index);
 4192
 4193		queue_index = new_index;
 4194	}
 4195
 4196	return queue_index;
 4197}
 4198EXPORT_SYMBOL(netdev_pick_tx);
 4199
 4200struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4201					 struct sk_buff *skb,
 4202					 struct net_device *sb_dev)
 4203{
 4204	int queue_index = 0;
 4205
 4206#ifdef CONFIG_XPS
 4207	u32 sender_cpu = skb->sender_cpu - 1;
 4208
 4209	if (sender_cpu >= (u32)NR_CPUS)
 4210		skb->sender_cpu = raw_smp_processor_id() + 1;
 4211#endif
 4212
 4213	if (dev->real_num_tx_queues != 1) {
 4214		const struct net_device_ops *ops = dev->netdev_ops;
 4215
 4216		if (ops->ndo_select_queue)
 4217			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 
 4218		else
 4219			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4220
 4221		queue_index = netdev_cap_txqueue(dev, queue_index);
 
 4222	}
 4223
 4224	skb_set_queue_mapping(skb, queue_index);
 4225	return netdev_get_tx_queue(dev, queue_index);
 4226}
 4227
 4228/**
 4229 * __dev_queue_xmit() - transmit a buffer
 4230 * @skb:	buffer to transmit
 4231 * @sb_dev:	suboordinate device used for L2 forwarding offload
 4232 *
 4233 * Queue a buffer for transmission to a network device. The caller must
 4234 * have set the device and priority and built the buffer before calling
 4235 * this function. The function can be called from an interrupt.
 4236 *
 4237 * When calling this method, interrupts MUST be enabled. This is because
 4238 * the BH enable code must have IRQs enabled so that it will not deadlock.
 4239 *
 4240 * Regardless of the return value, the skb is consumed, so it is currently
 4241 * difficult to retry a send to this method. (You can bump the ref count
 4242 * before sending to hold a reference for retry if you are careful.)
 4243 *
 4244 * Return:
 4245 * * 0				- buffer successfully transmitted
 4246 * * positive qdisc return code	- NET_XMIT_DROP etc.
 4247 * * negative errno		- other errors
 
 
 
 
 
 4248 */
 4249int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 4250{
 4251	struct net_device *dev = skb->dev;
 4252	struct netdev_queue *txq = NULL;
 4253	struct Qdisc *q;
 4254	int rc = -ENOMEM;
 4255	bool again = false;
 4256
 4257	skb_reset_mac_header(skb);
 4258	skb_assert_len(skb);
 4259
 4260	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
 4261		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
 4262
 4263	/* Disable soft irqs for various locks below. Also
 4264	 * stops preemption for RCU.
 4265	 */
 4266	rcu_read_lock_bh();
 4267
 4268	skb_update_prio(skb);
 4269
 4270	qdisc_pkt_len_init(skb);
 4271	tcx_set_ingress(skb, false);
 4272#ifdef CONFIG_NET_EGRESS
 4273	if (static_branch_unlikely(&egress_needed_key)) {
 4274		if (nf_hook_egress_active()) {
 4275			skb = nf_hook_egress(skb, &rc, dev);
 4276			if (!skb)
 4277				goto out;
 4278		}
 4279
 4280		netdev_xmit_skip_txqueue(false);
 4281
 4282		nf_skip_egress(skb, true);
 4283		skb = sch_handle_egress(skb, &rc, dev);
 4284		if (!skb)
 4285			goto out;
 4286		nf_skip_egress(skb, false);
 4287
 4288		if (netdev_xmit_txqueue_skipped())
 4289			txq = netdev_tx_queue_mapping(dev, skb);
 4290	}
 
 4291#endif
 4292	/* If device/qdisc don't need skb->dst, release it right now while
 4293	 * its hot in this cpu cache.
 4294	 */
 4295	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
 4296		skb_dst_drop(skb);
 4297	else
 4298		skb_dst_force(skb);
 4299
 4300	if (!txq)
 4301		txq = netdev_core_pick_tx(dev, skb, sb_dev);
 4302
 4303	q = rcu_dereference_bh(txq->qdisc);
 4304
 4305	trace_net_dev_queue(skb);
 4306	if (q->enqueue) {
 4307		rc = __dev_xmit_skb(skb, q, dev, txq);
 4308		goto out;
 4309	}
 4310
 4311	/* The device has no queue. Common case for software devices:
 4312	 * loopback, all the sorts of tunnels...
 4313
 4314	 * Really, it is unlikely that netif_tx_lock protection is necessary
 4315	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
 4316	 * counters.)
 4317	 * However, it is possible, that they rely on protection
 4318	 * made by us here.
 4319
 4320	 * Check this and shot the lock. It is not prone from deadlocks.
 4321	 *Either shot noqueue qdisc, it is even simpler 8)
 4322	 */
 4323	if (dev->flags & IFF_UP) {
 4324		int cpu = smp_processor_id(); /* ok because BHs are off */
 4325
 4326		/* Other cpus might concurrently change txq->xmit_lock_owner
 4327		 * to -1 or to their cpu id, but not to our id.
 4328		 */
 4329		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
 4330			if (dev_xmit_recursion())
 4331				goto recursion_alert;
 4332
 4333			skb = validate_xmit_skb(skb, dev, &again);
 4334			if (!skb)
 4335				goto out;
 4336
 4337			HARD_TX_LOCK(dev, txq, cpu);
 4338
 4339			if (!netif_xmit_stopped(txq)) {
 4340				dev_xmit_recursion_inc();
 4341				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
 4342				dev_xmit_recursion_dec();
 4343				if (dev_xmit_complete(rc)) {
 4344					HARD_TX_UNLOCK(dev, txq);
 4345					goto out;
 4346				}
 4347			}
 4348			HARD_TX_UNLOCK(dev, txq);
 4349			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
 4350					     dev->name);
 4351		} else {
 4352			/* Recursion is detected! It is possible,
 4353			 * unfortunately
 4354			 */
 4355recursion_alert:
 4356			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
 4357					     dev->name);
 4358		}
 4359	}
 4360
 4361	rc = -ENETDOWN;
 4362	rcu_read_unlock_bh();
 4363
 4364	dev_core_stats_tx_dropped_inc(dev);
 4365	kfree_skb_list(skb);
 4366	return rc;
 4367out:
 4368	rcu_read_unlock_bh();
 4369	return rc;
 4370}
 4371EXPORT_SYMBOL(__dev_queue_xmit);
 4372
 4373int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4374{
 4375	struct net_device *dev = skb->dev;
 4376	struct sk_buff *orig_skb = skb;
 4377	struct netdev_queue *txq;
 4378	int ret = NETDEV_TX_BUSY;
 4379	bool again = false;
 4380
 4381	if (unlikely(!netif_running(dev) ||
 4382		     !netif_carrier_ok(dev)))
 4383		goto drop;
 4384
 4385	skb = validate_xmit_skb_list(skb, dev, &again);
 4386	if (skb != orig_skb)
 4387		goto drop;
 4388
 4389	skb_set_queue_mapping(skb, queue_id);
 4390	txq = skb_get_tx_queue(dev, skb);
 
 
 
 4391
 4392	local_bh_disable();
 4393
 4394	dev_xmit_recursion_inc();
 4395	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4396	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4397		ret = netdev_start_xmit(skb, dev, txq, false);
 4398	HARD_TX_UNLOCK(dev, txq);
 4399	dev_xmit_recursion_dec();
 4400
 4401	local_bh_enable();
 4402	return ret;
 4403drop:
 4404	dev_core_stats_tx_dropped_inc(dev);
 4405	kfree_skb_list(skb);
 4406	return NET_XMIT_DROP;
 4407}
 4408EXPORT_SYMBOL(__dev_direct_xmit);
 4409
 4410/*************************************************************************
 4411 *			Receiver routines
 4412 *************************************************************************/
 4413
 4414unsigned int sysctl_skb_defer_max __read_mostly = 64;
 4415int weight_p __read_mostly = 64;           /* old backlog weight */
 4416int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 4417int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 4418
 4419/* Called with irq disabled */
 4420static inline void ____napi_schedule(struct softnet_data *sd,
 4421				     struct napi_struct *napi)
 4422{
 4423	struct task_struct *thread;
 4424
 4425	lockdep_assert_irqs_disabled();
 4426
 4427	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 4428		/* Paired with smp_mb__before_atomic() in
 4429		 * napi_enable()/dev_set_threaded().
 4430		 * Use READ_ONCE() to guarantee a complete
 4431		 * read on napi->thread. Only call
 4432		 * wake_up_process() when it's not NULL.
 4433		 */
 4434		thread = READ_ONCE(napi->thread);
 4435		if (thread) {
 4436			/* Avoid doing set_bit() if the thread is in
 4437			 * INTERRUPTIBLE state, cause napi_thread_wait()
 4438			 * makes sure to proceed with napi polling
 4439			 * if the thread is explicitly woken from here.
 4440			 */
 4441			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
 4442				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 4443			wake_up_process(thread);
 4444			return;
 4445		}
 4446	}
 4447
 4448	list_add_tail(&napi->poll_list, &sd->poll_list);
 4449	WRITE_ONCE(napi->list_owner, smp_processor_id());
 4450	/* If not called from net_rx_action()
 4451	 * we have to raise NET_RX_SOFTIRQ.
 4452	 */
 4453	if (!sd->in_net_rx_action)
 4454		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4455}
 4456
 4457#ifdef CONFIG_RPS
 4458
 4459struct static_key_false rps_needed __read_mostly;
 
 
 
 
 
 
 4460EXPORT_SYMBOL(rps_needed);
 4461struct static_key_false rfs_needed __read_mostly;
 4462EXPORT_SYMBOL(rfs_needed);
 4463
 4464static struct rps_dev_flow *
 4465set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4466	    struct rps_dev_flow *rflow, u16 next_cpu)
 4467{
 4468	if (next_cpu < nr_cpu_ids) {
 4469#ifdef CONFIG_RFS_ACCEL
 4470		struct netdev_rx_queue *rxqueue;
 4471		struct rps_dev_flow_table *flow_table;
 4472		struct rps_dev_flow *old_rflow;
 4473		u32 flow_id;
 4474		u16 rxq_index;
 4475		int rc;
 4476
 4477		/* Should we steer this flow to a different hardware queue? */
 4478		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 4479		    !(dev->features & NETIF_F_NTUPLE))
 4480			goto out;
 4481		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 4482		if (rxq_index == skb_get_rx_queue(skb))
 4483			goto out;
 4484
 4485		rxqueue = dev->_rx + rxq_index;
 4486		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4487		if (!flow_table)
 4488			goto out;
 4489		flow_id = skb_get_hash(skb) & flow_table->mask;
 4490		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 4491							rxq_index, flow_id);
 4492		if (rc < 0)
 4493			goto out;
 4494		old_rflow = rflow;
 4495		rflow = &flow_table->flows[flow_id];
 4496		rflow->filter = rc;
 4497		if (old_rflow->filter == rflow->filter)
 4498			old_rflow->filter = RPS_NO_FILTER;
 4499	out:
 4500#endif
 4501		rflow->last_qtail =
 4502			per_cpu(softnet_data, next_cpu).input_queue_head;
 4503	}
 4504
 4505	rflow->cpu = next_cpu;
 4506	return rflow;
 4507}
 4508
 4509/*
 4510 * get_rps_cpu is called from netif_receive_skb and returns the target
 4511 * CPU from the RPS map of the receiving queue for a given skb.
 4512 * rcu_read_lock must be held on entry.
 4513 */
 4514static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4515		       struct rps_dev_flow **rflowp)
 4516{
 4517	const struct rps_sock_flow_table *sock_flow_table;
 4518	struct netdev_rx_queue *rxqueue = dev->_rx;
 4519	struct rps_dev_flow_table *flow_table;
 4520	struct rps_map *map;
 4521	int cpu = -1;
 4522	u32 tcpu;
 4523	u32 hash;
 4524
 4525	if (skb_rx_queue_recorded(skb)) {
 4526		u16 index = skb_get_rx_queue(skb);
 4527
 4528		if (unlikely(index >= dev->real_num_rx_queues)) {
 4529			WARN_ONCE(dev->real_num_rx_queues > 1,
 4530				  "%s received packet on queue %u, but number "
 4531				  "of RX queues is %u\n",
 4532				  dev->name, index, dev->real_num_rx_queues);
 4533			goto done;
 4534		}
 4535		rxqueue += index;
 4536	}
 4537
 4538	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 4539
 4540	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4541	map = rcu_dereference(rxqueue->rps_map);
 4542	if (!flow_table && !map)
 4543		goto done;
 4544
 4545	skb_reset_network_header(skb);
 4546	hash = skb_get_hash(skb);
 4547	if (!hash)
 4548		goto done;
 4549
 4550	sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
 4551	if (flow_table && sock_flow_table) {
 4552		struct rps_dev_flow *rflow;
 4553		u32 next_cpu;
 4554		u32 ident;
 4555
 4556		/* First check into global flow table if there is a match.
 4557		 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
 4558		 */
 4559		ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
 4560		if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
 4561			goto try_rps;
 4562
 4563		next_cpu = ident & net_hotdata.rps_cpu_mask;
 4564
 4565		/* OK, now we know there is a match,
 4566		 * we can look at the local (per receive queue) flow table
 4567		 */
 4568		rflow = &flow_table->flows[hash & flow_table->mask];
 4569		tcpu = rflow->cpu;
 4570
 4571		/*
 4572		 * If the desired CPU (where last recvmsg was done) is
 4573		 * different from current CPU (one in the rx-queue flow
 4574		 * table entry), switch if one of the following holds:
 4575		 *   - Current CPU is unset (>= nr_cpu_ids).
 4576		 *   - Current CPU is offline.
 4577		 *   - The current CPU's queue tail has advanced beyond the
 4578		 *     last packet that was enqueued using this table entry.
 4579		 *     This guarantees that all previous packets for the flow
 4580		 *     have been dequeued, thus preserving in order delivery.
 4581		 */
 4582		if (unlikely(tcpu != next_cpu) &&
 4583		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4584		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4585		      rflow->last_qtail)) >= 0)) {
 4586			tcpu = next_cpu;
 4587			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4588		}
 4589
 4590		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4591			*rflowp = rflow;
 4592			cpu = tcpu;
 4593			goto done;
 4594		}
 4595	}
 4596
 4597try_rps:
 4598
 4599	if (map) {
 4600		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4601		if (cpu_online(tcpu)) {
 4602			cpu = tcpu;
 4603			goto done;
 4604		}
 4605	}
 4606
 4607done:
 4608	return cpu;
 4609}
 4610
 4611#ifdef CONFIG_RFS_ACCEL
 4612
 4613/**
 4614 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4615 * @dev: Device on which the filter was set
 4616 * @rxq_index: RX queue index
 4617 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4618 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4619 *
 4620 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4621 * this function for each installed filter and remove the filters for
 4622 * which it returns %true.
 4623 */
 4624bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4625			 u32 flow_id, u16 filter_id)
 4626{
 4627	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4628	struct rps_dev_flow_table *flow_table;
 4629	struct rps_dev_flow *rflow;
 4630	bool expire = true;
 4631	unsigned int cpu;
 4632
 4633	rcu_read_lock();
 4634	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4635	if (flow_table && flow_id <= flow_table->mask) {
 4636		rflow = &flow_table->flows[flow_id];
 4637		cpu = READ_ONCE(rflow->cpu);
 4638		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4639		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4640			   rflow->last_qtail) <
 4641		     (int)(10 * flow_table->mask)))
 4642			expire = false;
 4643	}
 4644	rcu_read_unlock();
 4645	return expire;
 4646}
 4647EXPORT_SYMBOL(rps_may_expire_flow);
 4648
 4649#endif /* CONFIG_RFS_ACCEL */
 4650
 4651/* Called from hardirq (IPI) context */
 4652static void rps_trigger_softirq(void *data)
 4653{
 4654	struct softnet_data *sd = data;
 4655
 4656	____napi_schedule(sd, &sd->backlog);
 4657	sd->received_rps++;
 4658}
 4659
 4660#endif /* CONFIG_RPS */
 4661
 4662/* Called from hardirq (IPI) context */
 4663static void trigger_rx_softirq(void *data)
 4664{
 4665	struct softnet_data *sd = data;
 4666
 4667	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4668	smp_store_release(&sd->defer_ipi_scheduled, 0);
 4669}
 4670
 4671/*
 4672 * After we queued a packet into sd->input_pkt_queue,
 4673 * we need to make sure this queue is serviced soon.
 4674 *
 4675 * - If this is another cpu queue, link it to our rps_ipi_list,
 4676 *   and make sure we will process rps_ipi_list from net_rx_action().
 4677 *
 4678 * - If this is our own queue, NAPI schedule our backlog.
 4679 *   Note that this also raises NET_RX_SOFTIRQ.
 4680 */
 4681static void napi_schedule_rps(struct softnet_data *sd)
 4682{
 
 4683	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4684
 4685#ifdef CONFIG_RPS
 4686	if (sd != mysd) {
 4687		sd->rps_ipi_next = mysd->rps_ipi_list;
 4688		mysd->rps_ipi_list = sd;
 4689
 4690		/* If not called from net_rx_action() or napi_threaded_poll()
 4691		 * we have to raise NET_RX_SOFTIRQ.
 4692		 */
 4693		if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
 4694			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4695		return;
 4696	}
 4697#endif /* CONFIG_RPS */
 4698	__napi_schedule_irqoff(&mysd->backlog);
 4699}
 4700
 4701#ifdef CONFIG_NET_FLOW_LIMIT
 4702int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 4703#endif
 4704
 4705static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4706{
 4707#ifdef CONFIG_NET_FLOW_LIMIT
 4708	struct sd_flow_limit *fl;
 4709	struct softnet_data *sd;
 4710	unsigned int old_flow, new_flow;
 4711
 4712	if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
 4713		return false;
 4714
 4715	sd = this_cpu_ptr(&softnet_data);
 4716
 4717	rcu_read_lock();
 4718	fl = rcu_dereference(sd->flow_limit);
 4719	if (fl) {
 4720		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4721		old_flow = fl->history[fl->history_head];
 4722		fl->history[fl->history_head] = new_flow;
 4723
 4724		fl->history_head++;
 4725		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4726
 4727		if (likely(fl->buckets[old_flow]))
 4728			fl->buckets[old_flow]--;
 4729
 4730		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4731			fl->count++;
 4732			rcu_read_unlock();
 4733			return true;
 4734		}
 4735	}
 4736	rcu_read_unlock();
 4737#endif
 4738	return false;
 4739}
 4740
 4741/*
 4742 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4743 * queue (may be a remote CPU queue).
 4744 */
 4745static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4746			      unsigned int *qtail)
 4747{
 4748	enum skb_drop_reason reason;
 4749	struct softnet_data *sd;
 4750	unsigned long flags;
 4751	unsigned int qlen;
 4752
 4753	reason = SKB_DROP_REASON_NOT_SPECIFIED;
 4754	sd = &per_cpu(softnet_data, cpu);
 4755
 4756	rps_lock_irqsave(sd, &flags);
 
 
 4757	if (!netif_running(skb->dev))
 4758		goto drop;
 4759	qlen = skb_queue_len(&sd->input_pkt_queue);
 4760	if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
 4761	    !skb_flow_limit(skb, qlen)) {
 4762		if (qlen) {
 4763enqueue:
 4764			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4765			input_queue_tail_incr_save(sd, qtail);
 4766			rps_unlock_irq_restore(sd, &flags);
 
 4767			return NET_RX_SUCCESS;
 4768		}
 4769
 4770		/* Schedule NAPI for backlog device
 4771		 * We can use non atomic operation since we own the queue lock
 4772		 */
 4773		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
 4774			napi_schedule_rps(sd);
 
 
 4775		goto enqueue;
 4776	}
 4777	reason = SKB_DROP_REASON_CPU_BACKLOG;
 4778
 4779drop:
 4780	sd->dropped++;
 4781	rps_unlock_irq_restore(sd, &flags);
 4782
 4783	dev_core_stats_rx_dropped_inc(skb->dev);
 4784	kfree_skb_reason(skb, reason);
 4785	return NET_RX_DROP;
 4786}
 4787
 4788static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4789{
 4790	struct net_device *dev = skb->dev;
 4791	struct netdev_rx_queue *rxqueue;
 4792
 4793	rxqueue = dev->_rx;
 4794
 4795	if (skb_rx_queue_recorded(skb)) {
 4796		u16 index = skb_get_rx_queue(skb);
 4797
 4798		if (unlikely(index >= dev->real_num_rx_queues)) {
 4799			WARN_ONCE(dev->real_num_rx_queues > 1,
 4800				  "%s received packet on queue %u, but number "
 4801				  "of RX queues is %u\n",
 4802				  dev->name, index, dev->real_num_rx_queues);
 4803
 4804			return rxqueue; /* Return first rxqueue */
 4805		}
 4806		rxqueue += index;
 4807	}
 4808	return rxqueue;
 4809}
 4810
 4811u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 4812			     struct bpf_prog *xdp_prog)
 4813{
 4814	void *orig_data, *orig_data_end, *hard_start;
 4815	struct netdev_rx_queue *rxqueue;
 4816	bool orig_bcast, orig_host;
 4817	u32 mac_len, frame_sz;
 4818	__be16 orig_eth_type;
 4819	struct ethhdr *eth;
 4820	u32 metalen, act;
 4821	int off;
 4822
 4823	/* The XDP program wants to see the packet starting at the MAC
 4824	 * header.
 4825	 */
 4826	mac_len = skb->data - skb_mac_header(skb);
 4827	hard_start = skb->data - skb_headroom(skb);
 4828
 4829	/* SKB "head" area always have tailroom for skb_shared_info */
 4830	frame_sz = (void *)skb_end_pointer(skb) - hard_start;
 4831	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 4832
 4833	rxqueue = netif_get_rxqueue(skb);
 4834	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
 4835	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
 4836			 skb_headlen(skb) + mac_len, true);
 4837	if (skb_is_nonlinear(skb)) {
 4838		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
 4839		xdp_buff_set_frags_flag(xdp);
 4840	} else {
 4841		xdp_buff_clear_frags_flag(xdp);
 4842	}
 4843
 4844	orig_data_end = xdp->data_end;
 4845	orig_data = xdp->data;
 4846	eth = (struct ethhdr *)xdp->data;
 4847	orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
 4848	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4849	orig_eth_type = eth->h_proto;
 4850
 4851	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4852
 4853	/* check if bpf_xdp_adjust_head was used */
 4854	off = xdp->data - orig_data;
 4855	if (off) {
 4856		if (off > 0)
 4857			__skb_pull(skb, off);
 4858		else if (off < 0)
 4859			__skb_push(skb, -off);
 4860
 4861		skb->mac_header += off;
 4862		skb_reset_network_header(skb);
 4863	}
 4864
 4865	/* check if bpf_xdp_adjust_tail was used */
 4866	off = xdp->data_end - orig_data_end;
 4867	if (off != 0) {
 4868		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4869		skb->len += off; /* positive on grow, negative on shrink */
 4870	}
 4871
 4872	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
 4873	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
 4874	 */
 4875	if (xdp_buff_has_frags(xdp))
 4876		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
 4877	else
 4878		skb->data_len = 0;
 4879
 4880	/* check if XDP changed eth hdr such SKB needs update */
 4881	eth = (struct ethhdr *)xdp->data;
 4882	if ((orig_eth_type != eth->h_proto) ||
 4883	    (orig_host != ether_addr_equal_64bits(eth->h_dest,
 4884						  skb->dev->dev_addr)) ||
 4885	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 4886		__skb_push(skb, ETH_HLEN);
 4887		skb->pkt_type = PACKET_HOST;
 4888		skb->protocol = eth_type_trans(skb, skb->dev);
 4889	}
 4890
 4891	/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
 4892	 * before calling us again on redirect path. We do not call do_redirect
 4893	 * as we leave that up to the caller.
 4894	 *
 4895	 * Caller is responsible for managing lifetime of skb (i.e. calling
 4896	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
 4897	 */
 4898	switch (act) {
 4899	case XDP_REDIRECT:
 4900	case XDP_TX:
 4901		__skb_push(skb, mac_len);
 4902		break;
 4903	case XDP_PASS:
 4904		metalen = xdp->data - xdp->data_meta;
 4905		if (metalen)
 4906			skb_metadata_set(skb, metalen);
 4907		break;
 4908	}
 4909
 4910	return act;
 4911}
 4912
 4913static int
 4914netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
 4915{
 4916	struct sk_buff *skb = *pskb;
 4917	int err, hroom, troom;
 4918
 4919	if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
 4920		return 0;
 4921
 4922	/* In case we have to go down the path and also linearize,
 4923	 * then lets do the pskb_expand_head() work just once here.
 4924	 */
 4925	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4926	troom = skb->tail + skb->data_len - skb->end;
 4927	err = pskb_expand_head(skb,
 4928			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4929			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
 4930	if (err)
 4931		return err;
 4932
 4933	return skb_linearize(skb);
 4934}
 4935
 4936static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 4937				     struct xdp_buff *xdp,
 4938				     struct bpf_prog *xdp_prog)
 4939{
 4940	struct sk_buff *skb = *pskb;
 4941	u32 mac_len, act = XDP_DROP;
 4942
 4943	/* Reinjected packets coming from act_mirred or similar should
 4944	 * not get XDP generic processing.
 4945	 */
 4946	if (skb_is_redirected(skb))
 4947		return XDP_PASS;
 4948
 4949	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
 4950	 * bytes. This is the guarantee that also native XDP provides,
 4951	 * thus we need to do it here as well.
 4952	 */
 4953	mac_len = skb->data - skb_mac_header(skb);
 4954	__skb_push(skb, mac_len);
 4955
 4956	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 4957	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4958		if (netif_skb_check_for_xdp(pskb, xdp_prog))
 4959			goto do_drop;
 4960	}
 4961
 4962	__skb_pull(*pskb, mac_len);
 4963
 4964	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
 4965	switch (act) {
 4966	case XDP_REDIRECT:
 4967	case XDP_TX:
 4968	case XDP_PASS:
 4969		break;
 4970	default:
 4971		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
 4972		fallthrough;
 4973	case XDP_ABORTED:
 4974		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
 4975		fallthrough;
 4976	case XDP_DROP:
 4977	do_drop:
 4978		kfree_skb(*pskb);
 4979		break;
 4980	}
 4981
 4982	return act;
 4983}
 4984
 4985/* When doing generic XDP we have to bypass the qdisc layer and the
 4986 * network taps in order to match in-driver-XDP behavior. This also means
 4987 * that XDP packets are able to starve other packets going through a qdisc,
 4988 * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
 4989 * queues, so they do not have this starvation issue.
 4990 */
 4991void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4992{
 4993	struct net_device *dev = skb->dev;
 4994	struct netdev_queue *txq;
 4995	bool free_skb = true;
 4996	int cpu, rc;
 4997
 4998	txq = netdev_core_pick_tx(dev, skb, NULL);
 4999	cpu = smp_processor_id();
 5000	HARD_TX_LOCK(dev, txq, cpu);
 5001	if (!netif_xmit_frozen_or_drv_stopped(txq)) {
 5002		rc = netdev_start_xmit(skb, dev, txq, 0);
 5003		if (dev_xmit_complete(rc))
 5004			free_skb = false;
 5005	}
 5006	HARD_TX_UNLOCK(dev, txq);
 5007	if (free_skb) {
 5008		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 5009		dev_core_stats_tx_dropped_inc(dev);
 5010		kfree_skb(skb);
 5011	}
 5012}
 5013
 5014static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 5015
 5016int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
 5017{
 5018	if (xdp_prog) {
 5019		struct xdp_buff xdp;
 5020		u32 act;
 5021		int err;
 5022
 5023		act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
 5024		if (act != XDP_PASS) {
 5025			switch (act) {
 5026			case XDP_REDIRECT:
 5027				err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
 5028							      &xdp, xdp_prog);
 5029				if (err)
 5030					goto out_redir;
 5031				break;
 5032			case XDP_TX:
 5033				generic_xdp_tx(*pskb, xdp_prog);
 5034				break;
 5035			}
 5036			return XDP_DROP;
 5037		}
 5038	}
 5039	return XDP_PASS;
 5040out_redir:
 5041	kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
 5042	return XDP_DROP;
 5043}
 5044EXPORT_SYMBOL_GPL(do_xdp_generic);
 5045
 5046static int netif_rx_internal(struct sk_buff *skb)
 5047{
 5048	int ret;
 5049
 5050	net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
 5051
 5052	trace_netif_rx(skb);
 5053
 5054#ifdef CONFIG_RPS
 5055	if (static_branch_unlikely(&rps_needed)) {
 5056		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5057		int cpu;
 5058
 
 5059		rcu_read_lock();
 5060
 5061		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5062		if (cpu < 0)
 5063			cpu = smp_processor_id();
 5064
 5065		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5066
 5067		rcu_read_unlock();
 
 5068	} else
 5069#endif
 5070	{
 5071		unsigned int qtail;
 5072
 5073		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
 5074	}
 5075	return ret;
 5076}
 5077
 5078/**
 5079 *	__netif_rx	-	Slightly optimized version of netif_rx
 5080 *	@skb: buffer to post
 5081 *
 5082 *	This behaves as netif_rx except that it does not disable bottom halves.
 5083 *	As a result this function may only be invoked from the interrupt context
 5084 *	(either hard or soft interrupt).
 5085 */
 5086int __netif_rx(struct sk_buff *skb)
 5087{
 5088	int ret;
 5089
 5090	lockdep_assert_once(hardirq_count() | softirq_count());
 5091
 5092	trace_netif_rx_entry(skb);
 5093	ret = netif_rx_internal(skb);
 5094	trace_netif_rx_exit(ret);
 5095	return ret;
 5096}
 5097EXPORT_SYMBOL(__netif_rx);
 5098
 5099/**
 5100 *	netif_rx	-	post buffer to the network code
 5101 *	@skb: buffer to post
 5102 *
 5103 *	This function receives a packet from a device driver and queues it for
 5104 *	the upper (protocol) levels to process via the backlog NAPI device. It
 5105 *	always succeeds. The buffer may be dropped during processing for
 5106 *	congestion control or by the protocol layers.
 5107 *	The network buffer is passed via the backlog NAPI device. Modern NIC
 5108 *	driver should use NAPI and GRO.
 5109 *	This function can used from interrupt and from process context. The
 5110 *	caller from process context must not disable interrupts before invoking
 5111 *	this function.
 5112 *
 5113 *	return values:
 5114 *	NET_RX_SUCCESS	(no congestion)
 5115 *	NET_RX_DROP     (packet was dropped)
 5116 *
 5117 */
 
 5118int netif_rx(struct sk_buff *skb)
 5119{
 5120	bool need_bh_off = !(hardirq_count() | softirq_count());
 5121	int ret;
 5122
 5123	if (need_bh_off)
 5124		local_bh_disable();
 5125	trace_netif_rx_entry(skb);
 5126	ret = netif_rx_internal(skb);
 5127	trace_netif_rx_exit(ret);
 5128	if (need_bh_off)
 5129		local_bh_enable();
 5130	return ret;
 5131}
 5132EXPORT_SYMBOL(netif_rx);
 5133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * net_tx_action - drain this CPU's deferred transmit-side work.
 * @h: softirq action descriptor (unused in this function)
 *
 * Three steps, all operating on the local CPU's softnet_data:
 *  1. free every skb parked on sd->completion_queue,
 *  2. run every qdisc chained on sd->output_queue,
 *  3. hand over to xfrm_dev_backlog().
 *
 * NOTE(review): presumably registered as the NET_TX softirq handler; the
 * registration is not in this part of the file.
 */
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with irqs off, then walk it with
		 * irqs enabled again.
		 */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			/* Advance first: skb is released below. */
			clist = clist->next;

			/* Queued skbs are expected to have no users left. */
			WARN_ON(refcount_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
				trace_consume_skb(skb, net_tx_action);
			else
				trace_kfree_skb(skb, net_tx_action,
						get_kfree_skb_cb(skb)->reason);

			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
				__kfree_skb(skb);
			else
				__napi_kfree_skb(skb,
						 get_kfree_skb_cb(skb)->reason);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the qdisc chain and reset the tail pointer so new
		 * entries start a fresh list.
		 */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		rcu_read_lock();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock = NULL;

			head = head->next_sched;

			/* We need to make sure head->next_sched is read
			 * before clearing __QDISC_STATE_SCHED
			 */
			smp_mb__before_atomic();

			if (!(q->flags & TCQ_F_NOLOCK)) {
				/* Locked qdisc: run it under its root lock. */
				root_lock = qdisc_lock(q);
				spin_lock(root_lock);
			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
						     &q->state))) {
				/* There is a synchronize_net() between
				 * STATE_DEACTIVATED flag being set and
				 * qdisc_reset()/some_qdisc_is_busy() in
				 * dev_deactivate(), so we can safely bail out
				 * early here to avoid data race between
				 * qdisc_deactivate() and some_qdisc_is_busy()
				 * for lockless qdisc.
				 */
				clear_bit(__QDISC_STATE_SCHED, &q->state);
				continue;
			}

			clear_bit(__QDISC_STATE_SCHED, &q->state);
			qdisc_run(q);
			if (root_lock)
				spin_unlock(root_lock);
		}

		rcu_read_unlock();
	}

	xfrm_dev_backlog(sd);
}
 5216
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It is a function pointer taking a device and an address; it is only
 * built when both the bridge and ATM LANE options are enabled.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
 5223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5224/**
 5225 *	netdev_is_rx_handler_busy - check if receive handler is registered
 5226 *	@dev: device to check
 5227 *
 5228 *	Check if a receive handler is already registered for a given device.
 5229 *	Return true if there one.
 5230 *
 5231 *	The caller must hold the rtnl_mutex.
 5232 */
 5233bool netdev_is_rx_handler_busy(struct net_device *dev)
 5234{
 5235	ASSERT_RTNL();
 5236	return dev && rtnl_dereference(dev->rx_handler);
 5237}
 5238EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 5239
 5240/**
 5241 *	netdev_rx_handler_register - register receive handler
 5242 *	@dev: device to register a handler for
 5243 *	@rx_handler: receive handler to register
 5244 *	@rx_handler_data: data pointer that is used by rx handler
 5245 *
 5246 *	Register a receive handler for a device. This handler will then be
 5247 *	called from __netif_receive_skb. A negative errno code is returned
 5248 *	on a failure.
 5249 *
 5250 *	The caller must hold the rtnl_mutex.
 5251 *
 5252 *	For a general description of rx_handler, see enum rx_handler_result.
 5253 */
 5254int netdev_rx_handler_register(struct net_device *dev,
 5255			       rx_handler_func_t *rx_handler,
 5256			       void *rx_handler_data)
 5257{
 5258	if (netdev_is_rx_handler_busy(dev))
 5259		return -EBUSY;
 5260
 5261	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5262		return -EINVAL;
 5263
 5264	/* Note: rx_handler_data must be set before rx_handler */
 5265	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5266	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5267
 5268	return 0;
 5269}
 5270EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5271
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device. The handler pointer is
 *	cleared first and the data pointer only after synchronize_net(),
 *	mirroring the publication order used when registering.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5293
 5294/*
 5295 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5296 * the special handling of PFMEMALLOC skbs.
 5297 */
 5298static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5299{
 5300	switch (skb->protocol) {
 5301	case htons(ETH_P_ARP):
 5302	case htons(ETH_P_IP):
 5303	case htons(ETH_P_IPV6):
 5304	case htons(ETH_P_8021Q):
 5305	case htons(ETH_P_8021AD):
 5306		return true;
 5307	default:
 5308		return false;
 5309	}
 5310}
 5311
 5312static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5313			     int *ret, struct net_device *orig_dev)
 5314{
 
 5315	if (nf_hook_ingress_active(skb)) {
 5316		int ingress_retval;
 5317
 5318		if (*pt_prev) {
 5319			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5320			*pt_prev = NULL;
 5321		}
 5322
 5323		rcu_read_lock();
 5324		ingress_retval = nf_hook_ingress(skb);
 5325		rcu_read_unlock();
 5326		return ingress_retval;
 5327	}
 
 5328	return 0;
 5329}
 5330
/*
 * __netif_receive_skb_core - central per-packet receive dispatch.
 * @pskb: in/out skb pointer; on return it holds the (possibly replaced or
 *	  NULL) skb that processing ended with
 * @pfmemalloc: when true, the tap lists are skipped and only protocols
 *		accepted by skb_pfmemalloc_protocol() are let through
 * @ppt_prev: out parameter; receives the last matching packet_type whose
 *	      delivery is left to the caller (only written when one exists)
 *
 * Processing order: generic XDP, VLAN untagging, taps (ptype_all lists),
 * ingress classification and netfilter ingress, VLAN device lookup, the
 * device rx_handler, and finally the protocol handler lists. Several
 * steps may restart processing at another_round after changing skb->dev.
 *
 * Returns NET_RX_DROP unless a delivery or a consuming rx_handler set a
 * different value.
 */
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
				    struct packet_type **ppt_prev)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct sk_buff *skb = *pskb;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);

	trace_netif_receive_skb(skb);

	/* Remember the device the packet arrived on; skb->dev may change. */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (static_branch_unlikely(&generic_xdp_needed_key)) {
		int ret2;

		migrate_disable();
		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
				      &skb);
		migrate_enable();

		/* Anything but XDP_PASS ends processing here. */
		if (ret2 != XDP_PASS) {
			ret = NET_RX_DROP;
			goto out;
		}
	}

	if (eth_type_vlan(skb->protocol)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

	if (skb_skip_tc_classify(skb))
		goto skip_classify;

	if (pfmemalloc)
		goto skip_taps;

	/* Deliver to the taps one step behind, so that the last matching
	 * entry stays pending in pt_prev.
	 */
	list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_branch_unlikely(&ingress_needed_key)) {
		bool another = false;

		nf_skip_egress(skb, true);
		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
					 &another);
		if (another)
			goto another_round;
		if (!skb)
			goto out;

		nf_skip_egress(skb, false);
		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
	skb_reset_redirect(skb);
skip_classify:
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			break;
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
		if (skb_vlan_tag_get_id(skb)) {
			/* Vlan id is non 0 and vlan_do_receive() above couldn't
			 * find vlan device.
			 */
			skb->pkt_type = PACKET_OTHERHOST;
		} else if (eth_type_vlan(skb->protocol)) {
			/* Outer header is 802.1P with vlan 0, inner header is
			 * 802.1Q or 802.1AD and vlan_do_receive() above could
			 * not find vlan dev for vlan id 0.
			 */
			__vlan_hwaccel_clear_tag(skb);
			skb = skb_vlan_untag(skb);
			if (unlikely(!skb))
				goto out;
			if (vlan_do_receive(&skb))
				/* After stripping off 802.1P header with vlan 0
				 * vlan dev is found for inner header.
				 */
				goto another_round;
			else if (unlikely(!skb))
				goto out;
			else
				/* We have stripped outer 802.1P vlan 0 header.
				 * But could not find vlan dev.
				 * check again for vlan id to set OTHERHOST.
				 */
				goto check_vlan_id;
		}
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		__vlan_hwaccel_clear_tag(skb);
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
			goto drop;
		/* Leave the final delivery to the caller. */
		*ppt_prev = pt_prev;
	} else {
drop:
		if (!deliver_exact)
			dev_core_stats_rx_dropped_inc(skb->dev);
		else
			dev_core_stats_rx_nohandler_inc(skb->dev);
		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	/* The invariant here is that if *ppt_prev is not NULL
	 * then skb should also be non-NULL.
	 *
	 * Apparently *ppt_prev assignment above holds this invariant due to
	 * skb dereferencing near it.
	 */
	*pskb = skb;
	return ret;
}
 5535
 5536static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5537{
 5538	struct net_device *orig_dev = skb->dev;
 5539	struct packet_type *pt_prev = NULL;
 5540	int ret;
 5541
 5542	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5543	if (pt_prev)
 5544		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5545					 skb->dev, pt_prev, orig_dev);
 5546	return ret;
 5547}
 5548
 5549/**
 5550 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5551 *	@skb: buffer to process
 5552 *
 5553 *	More direct receive version of netif_receive_skb().  It should
 5554 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5555 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5556 *
 5557 *	This function may only be called from softirq context and interrupts
 5558 *	should be enabled.
 5559 *
 5560 *	Return values (usually ignored):
 5561 *	NET_RX_SUCCESS: no congestion
 5562 *	NET_RX_DROP: packet was dropped
 5563 */
 5564int netif_receive_skb_core(struct sk_buff *skb)
 5565{
 5566	int ret;
 5567
 5568	rcu_read_lock();
 5569	ret = __netif_receive_skb_one_core(skb, false);
 5570	rcu_read_unlock();
 5571
 5572	return ret;
 5573}
 5574EXPORT_SYMBOL(netif_receive_skb_core);
 5575
 5576static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5577						  struct packet_type *pt_prev,
 5578						  struct net_device *orig_dev)
 5579{
 5580	struct sk_buff *skb, *next;
 5581
 5582	if (!pt_prev)
 5583		return;
 5584	if (list_empty(head))
 5585		return;
 5586	if (pt_prev->list_func != NULL)
 5587		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5588				   ip_list_rcv, head, pt_prev, orig_dev);
 5589	else
 5590		list_for_each_entry_safe(skb, next, head, list) {
 5591			skb_list_del_init(skb);
 5592			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5593		}
 5594}
 5595
 5596static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5597{
 5598	/* Fast-path assumptions:
 5599	 * - There is no RX handler.
 5600	 * - Only one packet_type matches.
 5601	 * If either of these fails, we will end up doing some per-packet
 5602	 * processing in-line, then handling the 'last ptype' for the whole
 5603	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5604	 * because the 'last ptype' must be constant across the sublist, and all
 5605	 * other ptypes are handled per-packet.
 5606	 */
 5607	/* Current (common) ptype of sublist */
 5608	struct packet_type *pt_curr = NULL;
 5609	/* Current (common) orig_dev of sublist */
 5610	struct net_device *od_curr = NULL;
 5611	struct list_head sublist;
 5612	struct sk_buff *skb, *next;
 5613
 5614	INIT_LIST_HEAD(&sublist);
 5615	list_for_each_entry_safe(skb, next, head, list) {
 5616		struct net_device *orig_dev = skb->dev;
 5617		struct packet_type *pt_prev = NULL;
 5618
 5619		skb_list_del_init(skb);
 5620		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5621		if (!pt_prev)
 5622			continue;
 5623		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5624			/* dispatch old sublist */
 5625			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5626			/* start new sublist */
 5627			INIT_LIST_HEAD(&sublist);
 5628			pt_curr = pt_prev;
 5629			od_curr = orig_dev;
 5630		}
 5631		list_add_tail(&skb->list, &sublist);
 5632	}
 5633
 5634	/* dispatch final sublist */
 5635	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5636}
 5637
 5638static int __netif_receive_skb(struct sk_buff *skb)
 5639{
 5640	int ret;
 5641
 5642	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5643		unsigned int noreclaim_flag;
 5644
 5645		/*
 5646		 * PFMEMALLOC skbs are special, they should
 5647		 * - be delivered to SOCK_MEMALLOC sockets only
 5648		 * - stay away from userspace
 5649		 * - have bounded memory usage
 5650		 *
 5651		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5652		 * context down to all allocation sites.
 5653		 */
 5654		noreclaim_flag = memalloc_noreclaim_save();
 5655		ret = __netif_receive_skb_one_core(skb, true);
 5656		memalloc_noreclaim_restore(noreclaim_flag);
 5657	} else
 5658		ret = __netif_receive_skb_one_core(skb, false);
 5659
 5660	return ret;
 5661}
 5662
/*
 * Receive a list of skbs, splitting it into runs that share the same
 * pfmemalloc status. Each run is passed to __netif_receive_skb_list_core()
 * and the noreclaim allocation context is saved/restored whenever the
 * status flips between runs.
 *
 * NOTE(review): noreclaim_flag is unsigned long here while
 * __netif_receive_skb() keeps the same value in an unsigned int.
 */
static void __netif_receive_skb_list(struct list_head *head)
{
	unsigned long noreclaim_flag = 0;
	struct sk_buff *skb, *next;
	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */

	list_for_each_entry_safe(skb, next, head, list) {
		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
			struct list_head sublist;

			/* Handle the previous sublist */
			list_cut_before(&sublist, head, &skb->list);
			if (!list_empty(&sublist))
				__netif_receive_skb_list_core(&sublist, pfmemalloc);
			pfmemalloc = !pfmemalloc;
			/* See comments in __netif_receive_skb */
			if (pfmemalloc)
				noreclaim_flag = memalloc_noreclaim_save();
			else
				memalloc_noreclaim_restore(noreclaim_flag);
		}
	}
	/* Handle the remaining sublist */
	if (!list_empty(head))
		__netif_receive_skb_list_core(head, pfmemalloc);
	/* Restore pflags */
	if (pfmemalloc)
		memalloc_noreclaim_restore(noreclaim_flag);
}
 5692
 5693static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5694{
 5695	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5696	struct bpf_prog *new = xdp->prog;
 5697	int ret = 0;
 5698
 5699	switch (xdp->command) {
 5700	case XDP_SETUP_PROG:
 5701		rcu_assign_pointer(dev->xdp_prog, new);
 5702		if (old)
 5703			bpf_prog_put(old);
 5704
 5705		if (old && !new) {
 5706			static_branch_dec(&generic_xdp_needed_key);
 5707		} else if (new && !old) {
 5708			static_branch_inc(&generic_xdp_needed_key);
 5709			dev_disable_lro(dev);
 5710			dev_disable_gro_hw(dev);
 5711		}
 5712		break;
 5713
 5714	default:
 5715		ret = -EINVAL;
 5716		break;
 5717	}
 5718
 5719	return ret;
 5720}
 5721
 5722static int netif_receive_skb_internal(struct sk_buff *skb)
 5723{
 5724	int ret;
 5725
 5726	net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
 5727
 5728	if (skb_defer_rx_timestamp(skb))
 5729		return NET_RX_SUCCESS;
 5730
 5731	rcu_read_lock();
 
 5732#ifdef CONFIG_RPS
 5733	if (static_branch_unlikely(&rps_needed)) {
 5734		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5735		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5736
 5737		if (cpu >= 0) {
 5738			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5739			rcu_read_unlock();
 5740			return ret;
 5741		}
 5742	}
 5743#endif
 5744	ret = __netif_receive_skb(skb);
 5745	rcu_read_unlock();
 5746	return ret;
 5747}
 5748
 5749void netif_receive_skb_list_internal(struct list_head *head)
 5750{
 5751	struct sk_buff *skb, *next;
 5752	struct list_head sublist;
 5753
 5754	INIT_LIST_HEAD(&sublist);
 5755	list_for_each_entry_safe(skb, next, head, list) {
 5756		net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
 5757				    skb);
 5758		skb_list_del_init(skb);
 5759		if (!skb_defer_rx_timestamp(skb))
 5760			list_add_tail(&skb->list, &sublist);
 5761	}
 5762	list_splice_init(&sublist, head);
 5763
 5764	rcu_read_lock();
 5765#ifdef CONFIG_RPS
 5766	if (static_branch_unlikely(&rps_needed)) {
 5767		list_for_each_entry_safe(skb, next, head, list) {
 5768			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5769			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5770
 5771			if (cpu >= 0) {
 5772				/* Will be handled, remove from list */
 5773				skb_list_del_init(skb);
 5774				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5775			}
 5776		}
 5777	}
 5778#endif
 5779	__netif_receive_skb_list(head);
 5780	rcu_read_unlock();
 5781}
 5782
 5783/**
 5784 *	netif_receive_skb - process receive buffer from network
 5785 *	@skb: buffer to process
 5786 *
 5787 *	netif_receive_skb() is the main receive data processing function.
 5788 *	It always succeeds. The buffer may be dropped during processing
 5789 *	for congestion control or by the protocol layers.
 5790 *
 5791 *	This function may only be called from softirq context and interrupts
 5792 *	should be enabled.
 5793 *
 5794 *	Return values (usually ignored):
 5795 *	NET_RX_SUCCESS: no congestion
 5796 *	NET_RX_DROP: packet was dropped
 5797 */
 5798int netif_receive_skb(struct sk_buff *skb)
 5799{
 5800	int ret;
 5801
 5802	trace_netif_receive_skb_entry(skb);
 5803
 5804	ret = netif_receive_skb_internal(skb);
 5805	trace_netif_receive_skb_exit(ret);
 5806
 5807	return ret;
 5808}
 5809EXPORT_SYMBOL(netif_receive_skb);
 5810
 5811/**
 5812 *	netif_receive_skb_list - process many receive buffers from network
 5813 *	@head: list of skbs to process.
 5814 *
 5815 *	Since return value of netif_receive_skb() is normally ignored, and
 5816 *	wouldn't be meaningful for a list, this function returns void.
 5817 *
 5818 *	This function may only be called from softirq context and interrupts
 5819 *	should be enabled.
 5820 */
 5821void netif_receive_skb_list(struct list_head *head)
 5822{
 5823	struct sk_buff *skb;
 5824
 5825	if (list_empty(head))
 5826		return;
 5827	if (trace_netif_receive_skb_list_entry_enabled()) {
 5828		list_for_each_entry(skb, head, list)
 5829			trace_netif_receive_skb_list_entry(skb);
 5830	}
 5831	netif_receive_skb_list_internal(head);
 5832	trace_netif_receive_skb_list_exit(0);
 5833}
 5834EXPORT_SYMBOL(netif_receive_skb_list);
 5835
/* One work item per CPU; flush_all_backlogs() queues and waits on these.
 * NOTE(review): presumably initialised with flush_backlog() as the work
 * function - the initialisation is not in this part of the file.
 */
static DEFINE_PER_CPU(struct work_struct, flush_works);
 5837
/* Network device is going away, flush any packets still pending.
 *
 * Work function operating on the softnet_data of the CPU it runs on.
 * Every skb in input_pkt_queue or process_queue whose device is in
 * NETREG_UNREGISTERING state is unlinked and freed, and the input queue
 * head counter is advanced for each of them.
 */
static void flush_backlog(struct work_struct *work)
{
	struct sk_buff *skb, *tmp;
	struct softnet_data *sd;

	local_bh_disable();
	sd = this_cpu_ptr(&softnet_data);

	/* input_pkt_queue is walked under the rps lock with irqs off, so
	 * the irq-safe free variant is used inside this section.
	 */
	rps_lock_irq_disable(sd);
	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
			__skb_unlink(skb, &sd->input_pkt_queue);
			dev_kfree_skb_irq(skb);
			input_queue_head_incr(sd);
		}
	}
	rps_unlock_irq_enable(sd);

	/* process_queue is walked with only bottom halves disabled. */
	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
			__skb_unlink(skb, &sd->process_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
	local_bh_enable();
}
 5866
 5867static bool flush_required(int cpu)
 5868{
 5869#if IS_ENABLED(CONFIG_RPS)
 5870	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 5871	bool do_flush;
 5872
 5873	rps_lock_irq_disable(sd);
 5874
 5875	/* as insertion into process_queue happens with the rps lock held,
 5876	 * process_queue access may race only with dequeue
 5877	 */
 5878	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
 5879		   !skb_queue_empty_lockless(&sd->process_queue);
 5880	rps_unlock_irq_enable(sd);
 5881
 5882	return do_flush;
 5883#endif
 5884	/* without RPS we can't safely check input_pkt_queue: during a
 5885	 * concurrent remote skb_queue_splice() we can detect as empty both
 5886	 * input_pkt_queue and process_queue even if the latter could end-up
 5887	 * containing a lot of packets.
 5888	 */
 5889	return true;
 5890}
 5891
/*
 * Flush the backlog queues of all online CPUs that need it.
 *
 * For every online CPU for which flush_required() is true, that CPU's
 * flush_works item is queued on system_highpri_wq; afterwards the function
 * waits for each queued item to finish. CPU hotplug is held off for the
 * whole operation. Must be called with the rtnl lock held.
 */
static void flush_all_backlogs(void)
{
	static cpumask_t flush_cpus;
	unsigned int cpu;

	/* since we are under rtnl lock protection we can use static data
	 * for the cpumask and avoid allocating on stack the possibly
	 * large mask
	 */
	ASSERT_RTNL();

	cpus_read_lock();

	/* Record which CPUs got work queued so only those are waited on. */
	cpumask_clear(&flush_cpus);
	for_each_online_cpu(cpu) {
		if (flush_required(cpu)) {
			queue_work_on(cpu, system_highpri_wq,
				      per_cpu_ptr(&flush_works, cpu));
			cpumask_set_cpu(cpu, &flush_cpus);
		}
	}

	/* we can have in flight packet[s] on the cpus we are not flushing,
	 * synchronize_net() in unregister_netdevice_many() will take care of
	 * them
	 */
	for_each_cpu(cpu, &flush_cpus)
		flush_work(per_cpu_ptr(&flush_works, cpu));

	cpus_read_unlock();
}
 5923
 5924static void net_rps_send_ipi(struct softnet_data *remsd)
 5925{
 5926#ifdef CONFIG_RPS
 5927	while (remsd) {
 5928		struct softnet_data *next = remsd->rps_ipi_next;
 
 5929
 5930		if (cpu_online(remsd->cpu))
 5931			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 5932		remsd = next;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5933	}
 5934#endif
 
 
 
 
 5935}
 
 5936
 5937/*
 5938 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 5939 * Note: called with local irq disabled, but exits with local irq enabled.
 5940 */
 5941static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 5942{
 5943#ifdef CONFIG_RPS
 5944	struct softnet_data *remsd = sd->rps_ipi_list;
 5945
 5946	if (remsd) {
 5947		sd->rps_ipi_list = NULL;
 5948
 5949		local_irq_enable();
 5950
 5951		/* Send pending IPI's to kick RPS processing on remote cpus. */
 5952		net_rps_send_ipi(remsd);
 
 
 
 
 
 
 
 5953	} else
 5954#endif
 5955		local_irq_enable();
 5956}
 5957
 5958static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 5959{
 5960#ifdef CONFIG_RPS
 5961	return sd->rps_ipi_list != NULL;
 5962#else
 5963	return false;
 5964#endif
 5965}
 5966
/*
 * process_backlog - NAPI poll routine of the per-CPU backlog device.
 * @napi: the backlog napi_struct embedded in a softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Packets are taken from process_queue and fed to __netif_receive_skb().
 * When process_queue runs dry, input_pkt_queue is spliced onto it under
 * the rps lock; if that queue is empty too, the NAPI state is cleared and
 * polling stops.
 *
 * Returns the number of packets processed, which equals @quota when the
 * budget was exhausted.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
	bool again = true;
	int work = 0;

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
	while (again) {
		struct sk_buff *skb;

		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			input_queue_head_incr(sd);
			/* Budget exhausted: return without completing NAPI. */
			if (++work >= quota)
				return work;

		}

		rps_lock_irq_disable(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			again = false;
		} else {
			/* Refill process_queue and go around again. */
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
		}
		rps_unlock_irq_enable(sd);
	}

	return work;
}
 6016
 6017/**
 6018 * __napi_schedule - schedule for receive
 6019 * @n: entry to schedule
 6020 *
 6021 * The entry's receive function will be scheduled to run.
 6022 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6023 */
 6024void __napi_schedule(struct napi_struct *n)
 6025{
 6026	unsigned long flags;
 6027
 6028	local_irq_save(flags);
 6029	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6030	local_irq_restore(flags);
 6031}
 6032EXPORT_SYMBOL(__napi_schedule);
 6033
/**
 *	napi_schedule_prep - check if napi can be scheduled
 *	@n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable to
 * ensure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 *
 * Return: true if the caller set NAPIF_STATE_SCHED and should go on to
 * schedule @n; false if a disable is pending or the instance was already
 * scheduled (in which case NAPIF_STATE_MISSED has been set).
 */
bool napi_schedule_prep(struct napi_struct *n)
{
	unsigned long new, val = READ_ONCE(n->state);

	/* Retry until the state word is updated atomically. */
	do {
		if (unlikely(val & NAPIF_STATE_DISABLE))
			return false;
		new = val | NAPIF_STATE_SCHED;

		/* Sets STATE_MISSED bit if STATE_SCHED was already set
		 * This was suggested by Alexander Duyck, as compiler
		 * emits better code than :
		 * if (val & NAPIF_STATE_SCHED)
		 *     new |= NAPIF_STATE_MISSED;
		 */
		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
						   NAPIF_STATE_MISSED;
	} while (!try_cmpxchg(&n->state, &val, new));

	/* val holds the state seen just before our update. */
	return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);
 6065
 6066/**
 6067 * __napi_schedule_irqoff - schedule for receive
 6068 * @n: entry to schedule
 6069 *
 6070 * Variant of __napi_schedule() assuming hard irqs are masked.
 6071 *
 6072 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
 6073 * because the interrupt disabled assumption might not be true
 6074 * due to force-threaded interrupts and spinlock substitution.
 6075 */
 6076void __napi_schedule_irqoff(struct napi_struct *n)
 6077{
 6078	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6079		____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6080	else
 6081		__napi_schedule(n);
 6082}
 6083EXPORT_SYMBOL(__napi_schedule_irqoff);
 6084
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * napi_complete_done - finish a NAPI poll cycle.
 * @n: NAPI context being completed
 * @work_done: amount of work the poll routine reported
 *
 * Flushes GRO state, removes @n from the poll list and clears its
 * scheduling bits. If a schedule request was missed while polling, @n is
 * rescheduled immediately. When a GRO flush timeout applies, the
 * instance's hrtimer is started.
 *
 * Returns false when completion was not performed (netpoll service or
 * busy polling in progress), when @n had to be rescheduled, or when hard
 * irq re-enabling is being deferred with a timeout; true otherwise.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags, val, new, timeout = 0;
	bool ret = true;

	/*
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case its running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
	 */
	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
				 NAPIF_STATE_IN_BUSY_POLL)))
		return false;

	if (work_done) {
		if (n->gro_bitmask)
			timeout = READ_ONCE(n->dev->gro_flush_timeout);
		/* Work was done: reload the deferral counter. */
		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
	}
	if (n->defer_hard_irqs_count > 0) {
		n->defer_hard_irqs_count--;
		timeout = READ_ONCE(n->dev->gro_flush_timeout);
		if (timeout)
			ret = false;
	}
	if (n->gro_bitmask) {
		/* When the NAPI instance uses a timeout and keeps postponing
		 * it, we need to bound somehow the time packets are kept in
		 * the GRO layer
		 */
		napi_gro_flush(n, !!timeout);
	}

	gro_normal_list(n);

	if (unlikely(!list_empty(&n->poll_list))) {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		list_del_init(&n->poll_list);
		local_irq_restore(flags);
	}
	WRITE_ONCE(n->list_owner, -1);

	val = READ_ONCE(n->state);
	do {
		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
			      NAPIF_STATE_SCHED_THREADED |
			      NAPIF_STATE_PREFER_BUSY_POLL);

		/* If STATE_MISSED was set, leave STATE_SCHED set,
		 * because we will call napi->poll() one more time.
		 * This C code was suggested by Alexander Duyck to help gcc.
		 */
		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
						    NAPIF_STATE_SCHED;
	} while (!try_cmpxchg(&n->state, &val, new));

	/* A schedule request arrived while we were polling: go again. */
	if (unlikely(val & NAPIF_STATE_MISSED)) {
		__napi_schedule(n);
		return false;
	}

	if (timeout)
		hrtimer_start(&n->timer, ns_to_ktime(timeout),
			      HRTIMER_MODE_REL_PINNED);
	return ret;
}
EXPORT_SYMBOL(napi_complete_done);
 6156
/* must be called under rcu_read_lock(), as we don't take a reference */
 6158struct napi_struct *napi_by_id(unsigned int napi_id)
 6159{
 6160	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6161	struct napi_struct *napi;
 6162
 6163	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6164		if (napi->napi_id == napi_id)
 6165			return napi;
 6166
 6167	return NULL;
 6168}
 6169
 6170static void skb_defer_free_flush(struct softnet_data *sd)
 6171{
 6172	struct sk_buff *skb, *next;
 6173
 6174	/* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
 6175	if (!READ_ONCE(sd->defer_list))
 6176		return;
 6177
 6178	spin_lock(&sd->defer_lock);
 6179	skb = sd->defer_list;
 6180	sd->defer_list = NULL;
 6181	sd->defer_count = 0;
 6182	spin_unlock(&sd->defer_lock);
 6183
 6184	while (skb != NULL) {
 6185		next = skb->next;
 6186		napi_consume_skb(skb, 1);
 6187		skb = next;
 6188	}
 6189}
 6190
 6191#if defined(CONFIG_NET_RX_BUSY_POLL)
 6192
 6193static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 6194{
 6195	if (!skip_schedule) {
 6196		gro_normal_list(napi);
 6197		__napi_schedule(napi);
 6198		return;
 6199	}
 6200
 6201	if (napi->gro_bitmask) {
 6202		/* flush too old packets
 6203		 * If HZ < 1000, flush all packets.
 6204		 */
 6205		napi_gro_flush(napi, HZ >= 1000);
 6206	}
 6207
 6208	gro_normal_list(napi);
 6209	clear_bit(NAPI_STATE_SCHED, &napi->state);
 6210}
 6211
 6212enum {
 6213	NAPI_F_PREFER_BUSY_POLL	= 1,
 6214	NAPI_F_END_ON_RESCHED	= 2,
 6215};
 6216
 6217static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 6218			   unsigned flags, u16 budget)
 6219{
 6220	bool skip_schedule = false;
 6221	unsigned long timeout;
 6222	int rc;
 6223
 6224	/* Busy polling means there is a high chance device driver hard irq
 6225	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6226	 * set in napi_schedule_prep().
 6227	 * Since we are about to call napi->poll() once more, we can safely
 6228	 * clear NAPI_STATE_MISSED.
 6229	 *
 6230	 * Note: x86 could use a single "lock and ..." instruction
 6231	 * to perform these two clear_bit()
 6232	 */
 6233	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6234	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6235
 6236	local_bh_disable();
 6237
 6238	if (flags & NAPI_F_PREFER_BUSY_POLL) {
 6239		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
 6240		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
 6241		if (napi->defer_hard_irqs_count && timeout) {
 6242			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
 6243			skip_schedule = true;
 6244		}
 6245	}
 6246
 6247	/* All we really want here is to re-enable device interrupts.
 6248	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6249	 */
 6250	rc = napi->poll(napi, budget);
 6251	/* We can't gro_normal_list() here, because napi->poll() might have
 6252	 * rearmed the napi (napi_complete_done()) in which case it could
 6253	 * already be running on another CPU.
 6254	 */
 6255	trace_napi_poll(napi, rc, budget);
 6256	netpoll_poll_unlock(have_poll_lock);
 6257	if (rc == budget)
 6258		__busy_poll_stop(napi, skip_schedule);
 6259	local_bh_enable();
 
 
 6260}
 6261
 6262static void __napi_busy_loop(unsigned int napi_id,
 6263		      bool (*loop_end)(void *, unsigned long),
 6264		      void *loop_end_arg, unsigned flags, u16 budget)
 6265{
 6266	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6267	int (*napi_poll)(struct napi_struct *napi, int budget);
 
 6268	void *have_poll_lock = NULL;
 6269	struct napi_struct *napi;
 6270
 6271	WARN_ON_ONCE(!rcu_read_lock_held());
 6272
 6273restart:
 
 6274	napi_poll = NULL;
 6275
 6276	napi = napi_by_id(napi_id);
 
 
 6277	if (!napi)
 6278		return;
 6279
 6280	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6281		preempt_disable();
 6282	for (;;) {
 6283		int work = 0;
 6284
 
 
 
 6285		local_bh_disable();
 
 
 
 
 6286		if (!napi_poll) {
 6287			unsigned long val = READ_ONCE(napi->state);
 6288
 6289			/* If multiple threads are competing for this napi,
 6290			 * we avoid dirtying napi->state as much as we can.
 6291			 */
 6292			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6293				   NAPIF_STATE_IN_BUSY_POLL)) {
 6294				if (flags & NAPI_F_PREFER_BUSY_POLL)
 6295					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6296				goto count;
 6297			}
 6298			if (cmpxchg(&napi->state, val,
 6299				    val | NAPIF_STATE_IN_BUSY_POLL |
 6300					  NAPIF_STATE_SCHED) != val) {
 6301				if (flags & NAPI_F_PREFER_BUSY_POLL)
 6302					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6303				goto count;
 6304			}
 6305			have_poll_lock = netpoll_poll_lock(napi);
 6306			napi_poll = napi->poll;
 6307		}
 6308		work = napi_poll(napi, budget);
 6309		trace_napi_poll(napi, work, budget);
 6310		gro_normal_list(napi);
 6311count:
 6312		if (work > 0)
 6313			__NET_ADD_STATS(dev_net(napi->dev),
 6314					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6315		skb_defer_free_flush(this_cpu_ptr(&softnet_data));
 6316		local_bh_enable();
 6317
 6318		if (!loop_end || loop_end(loop_end_arg, start_time))
 
 
 
 
 6319			break;
 6320
 6321		if (unlikely(need_resched())) {
 6322			if (flags & NAPI_F_END_ON_RESCHED)
 6323				break;
 6324			if (napi_poll)
 6325				busy_poll_stop(napi, have_poll_lock, flags, budget);
 6326			if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6327				preempt_enable();
 6328			rcu_read_unlock();
 6329			cond_resched();
 6330			rcu_read_lock();
 6331			if (loop_end(loop_end_arg, start_time))
 6332				return;
 6333			goto restart;
 6334		}
 6335		cpu_relax();
 6336	}
 6337	if (napi_poll)
 6338		busy_poll_stop(napi, have_poll_lock, flags, budget);
 6339	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6340		preempt_enable();
 6341}
 6342
 6343void napi_busy_loop_rcu(unsigned int napi_id,
 6344			bool (*loop_end)(void *, unsigned long),
 6345			void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 6346{
 6347	unsigned flags = NAPI_F_END_ON_RESCHED;
 6348
 6349	if (prefer_busy_poll)
 6350		flags |= NAPI_F_PREFER_BUSY_POLL;
 6351
 6352	__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
 6353}
 6354
 6355void napi_busy_loop(unsigned int napi_id,
 6356		    bool (*loop_end)(void *, unsigned long),
 6357		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 6358{
 6359	unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
 6360
 6361	rcu_read_lock();
 6362	__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
 6363	rcu_read_unlock();
 
 6364}
 6365EXPORT_SYMBOL(napi_busy_loop);
 6366
 6367#endif /* CONFIG_NET_RX_BUSY_POLL */
 6368
 6369static void napi_hash_add(struct napi_struct *napi)
 6370{
 6371	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
 
 6372		return;
 6373
 6374	spin_lock(&napi_hash_lock);
 6375
 6376	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6377	do {
 6378		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6379			napi_gen_id = MIN_NAPI_ID;
 6380	} while (napi_by_id(napi_gen_id));
 6381	napi->napi_id = napi_gen_id;
 6382
 6383	hlist_add_head_rcu(&napi->napi_hash_node,
 6384			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6385
 6386	spin_unlock(&napi_hash_lock);
 6387}
 6388
 6389/* Warning : caller is responsible to make sure rcu grace period
 6390 * is respected before freeing memory containing @napi
 6391 */
 6392static void napi_hash_del(struct napi_struct *napi)
 6393{
 6394	spin_lock(&napi_hash_lock);
 6395
 6396	hlist_del_init_rcu(&napi->napi_hash_node);
 6397
 
 
 
 
 6398	spin_unlock(&napi_hash_lock);
 
 6399}
 
 6400
 6401static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6402{
 6403	struct napi_struct *napi;
 6404
 6405	napi = container_of(timer, struct napi_struct, timer);
 6406
 6407	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6408	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6409	 */
 6410	if (!napi_disable_pending(napi) &&
 6411	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
 6412		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6413		__napi_schedule_irqoff(napi);
 6414	}
 6415
 6416	return HRTIMER_NORESTART;
 6417}
 6418
 6419static void init_gro_hash(struct napi_struct *napi)
 6420{
 6421	int i;
 6422
 6423	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6424		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6425		napi->gro_hash[i].count = 0;
 6426	}
 6427	napi->gro_bitmask = 0;
 6428}
 6429
 6430int dev_set_threaded(struct net_device *dev, bool threaded)
 6431{
 6432	struct napi_struct *napi;
 6433	int err = 0;
 6434
 6435	if (dev->threaded == threaded)
 6436		return 0;
 6437
 6438	if (threaded) {
 6439		list_for_each_entry(napi, &dev->napi_list, dev_list) {
 6440			if (!napi->thread) {
 6441				err = napi_kthread_create(napi);
 6442				if (err) {
 6443					threaded = false;
 6444					break;
 6445				}
 6446			}
 6447		}
 6448	}
 6449
 6450	dev->threaded = threaded;
 6451
 6452	/* Make sure kthread is created before THREADED bit
 6453	 * is set.
 6454	 */
 6455	smp_mb__before_atomic();
 6456
 6457	/* Setting/unsetting threaded mode on a napi might not immediately
 6458	 * take effect, if the current napi instance is actively being
 6459	 * polled. In this case, the switch between threaded mode and
 6460	 * softirq mode will happen in the next round of napi_schedule().
 6461	 * This should not cause hiccups/stalls to the live traffic.
 6462	 */
 6463	list_for_each_entry(napi, &dev->napi_list, dev_list)
 6464		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
 6465
 6466	return err;
 6467}
 6468EXPORT_SYMBOL(dev_set_threaded);
 6469
 6470/**
 6471 * netif_queue_set_napi - Associate queue with the napi
 6472 * @dev: device to which NAPI and queue belong
 6473 * @queue_index: Index of queue
 6474 * @type: queue type as RX or TX
 6475 * @napi: NAPI context, pass NULL to clear previously set NAPI
 6476 *
 6477 * Set queue with its corresponding napi context. This should be done after
 6478 * registering the NAPI handler for the queue-vector and the queues have been
 6479 * mapped to the corresponding interrupt vector.
 6480 */
 6481void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 6482			  enum netdev_queue_type type, struct napi_struct *napi)
 6483{
 6484	struct netdev_rx_queue *rxq;
 6485	struct netdev_queue *txq;
 6486
 6487	if (WARN_ON_ONCE(napi && !napi->dev))
 6488		return;
 6489	if (dev->reg_state >= NETREG_REGISTERED)
 6490		ASSERT_RTNL();
 6491
 6492	switch (type) {
 6493	case NETDEV_QUEUE_TYPE_RX:
 6494		rxq = __netif_get_rx_queue(dev, queue_index);
 6495		rxq->napi = napi;
 6496		return;
 6497	case NETDEV_QUEUE_TYPE_TX:
 6498		txq = netdev_get_tx_queue(dev, queue_index);
 6499		txq->napi = napi;
 6500		return;
 6501	default:
 6502		return;
 6503	}
 6504}
 6505EXPORT_SYMBOL(netif_queue_set_napi);
 6506
 6507void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 6508			   int (*poll)(struct napi_struct *, int), int weight)
 6509{
 6510	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
 6511		return;
 6512
 6513	INIT_LIST_HEAD(&napi->poll_list);
 6514	INIT_HLIST_NODE(&napi->napi_hash_node);
 6515	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6516	napi->timer.function = napi_watchdog;
 6517	init_gro_hash(napi);
 
 6518	napi->skb = NULL;
 6519	INIT_LIST_HEAD(&napi->rx_list);
 6520	napi->rx_count = 0;
 6521	napi->poll = poll;
 6522	if (weight > NAPI_POLL_WEIGHT)
 6523		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6524				weight);
 6525	napi->weight = weight;
 
 6526	napi->dev = dev;
 6527#ifdef CONFIG_NETPOLL
 6528	napi->poll_owner = -1;
 6529#endif
 6530	napi->list_owner = -1;
 6531	set_bit(NAPI_STATE_SCHED, &napi->state);
 6532	set_bit(NAPI_STATE_NPSVC, &napi->state);
 6533	list_add_rcu(&napi->dev_list, &dev->napi_list);
 6534	napi_hash_add(napi);
 6535	napi_get_frags_check(napi);
 6536	/* Create kthread for this napi if dev->threaded is set.
 6537	 * Clear dev->threaded if kthread creation failed so that
 6538	 * threaded mode will not be enabled in napi_enable().
 6539	 */
 6540	if (dev->threaded && napi_kthread_create(napi))
 6541		dev->threaded = 0;
 6542	netif_napi_set_irq(napi, -1);
 6543}
 6544EXPORT_SYMBOL(netif_napi_add_weight);
 6545
 6546void napi_disable(struct napi_struct *n)
 6547{
 6548	unsigned long val, new;
 6549
 6550	might_sleep();
 6551	set_bit(NAPI_STATE_DISABLE, &n->state);
 6552
 6553	val = READ_ONCE(n->state);
 6554	do {
 6555		while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
 6556			usleep_range(20, 200);
 6557			val = READ_ONCE(n->state);
 6558		}
 6559
 6560		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
 6561		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
 6562	} while (!try_cmpxchg(&n->state, &val, new));
 6563
 6564	hrtimer_cancel(&n->timer);
 6565
 6566	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6567}
 6568EXPORT_SYMBOL(napi_disable);
 6569
 6570/**
 6571 *	napi_enable - enable NAPI scheduling
 6572 *	@n: NAPI context
 6573 *
 6574 * Resume NAPI from being scheduled on this context.
 6575 * Must be paired with napi_disable.
 6576 */
 6577void napi_enable(struct napi_struct *n)
 6578{
 6579	unsigned long new, val = READ_ONCE(n->state);
 6580
 6581	do {
 6582		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
 6583
 6584		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
 6585		if (n->dev->threaded && n->thread)
 6586			new |= NAPIF_STATE_THREADED;
 6587	} while (!try_cmpxchg(&n->state, &val, new));
 6588}
 6589EXPORT_SYMBOL(napi_enable);
 6590
 6591static void flush_gro_hash(struct napi_struct *napi)
 6592{
 6593	int i;
 6594
 6595	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6596		struct sk_buff *skb, *n;
 6597
 6598		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6599			kfree_skb(skb);
 6600		napi->gro_hash[i].count = 0;
 6601	}
 6602}
 6603
 6604/* Must be called in process context */
 6605void __netif_napi_del(struct napi_struct *napi)
 6606{
 6607	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 6608		return;
 6609
 6610	napi_hash_del(napi);
 6611	list_del_rcu(&napi->dev_list);
 6612	napi_free_frags(napi);
 6613
 6614	flush_gro_hash(napi);
 6615	napi->gro_bitmask = 0;
 6616
 6617	if (napi->thread) {
 6618		kthread_stop(napi->thread);
 6619		napi->thread = NULL;
 6620	}
 6621}
 6622EXPORT_SYMBOL(__netif_napi_del);
 6623
 6624static int __napi_poll(struct napi_struct *n, bool *repoll)
 6625{
 
 6626	int work, weight;
 6627
 
 
 
 
 6628	weight = n->weight;
 6629
 6630	/* This NAPI_STATE_SCHED test is for avoiding a race
 6631	 * with netpoll's poll_napi().  Only the entity which
 6632	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6633	 * actually make the ->poll() call.  Therefore we avoid
 6634	 * accidentally calling ->poll() when NAPI is not scheduled.
 6635	 */
 6636	work = 0;
 6637	if (napi_is_scheduled(n)) {
 6638		work = n->poll(n, weight);
 6639		trace_napi_poll(n, work, weight);
 6640
 6641		xdp_do_check_flushed(n);
 6642	}
 6643
 6644	if (unlikely(work > weight))
 6645		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
 6646				n->poll, work, weight);
 6647
 6648	if (likely(work < weight))
 6649		return work;
 6650
 6651	/* Drivers must not modify the NAPI state if they
 6652	 * consume the entire weight.  In such cases this code
 6653	 * still "owns" the NAPI instance and therefore can
 6654	 * move the instance around on the list at-will.
 6655	 */
 6656	if (unlikely(napi_disable_pending(n))) {
 6657		napi_complete(n);
 6658		return work;
 6659	}
 6660
 6661	/* The NAPI context has more processing work, but busy-polling
 6662	 * is preferred. Exit early.
 6663	 */
 6664	if (napi_prefer_busy_poll(n)) {
 6665		if (napi_complete_done(n, work)) {
 6666			/* If timeout is not set, we need to make sure
 6667			 * that the NAPI is re-scheduled.
 6668			 */
 6669			napi_schedule(n);
 6670		}
 6671		return work;
 6672	}
 6673
 6674	if (n->gro_bitmask) {
 6675		/* flush too old packets
 6676		 * If HZ < 1000, flush all packets.
 6677		 */
 6678		napi_gro_flush(n, HZ >= 1000);
 6679	}
 6680
 6681	gro_normal_list(n);
 6682
 6683	/* Some drivers may have called napi_schedule
 6684	 * prior to exhausting their budget.
 6685	 */
 6686	if (unlikely(!list_empty(&n->poll_list))) {
 6687		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6688			     n->dev ? n->dev->name : "backlog");
 6689		return work;
 6690	}
 6691
 6692	*repoll = true;
 6693
 6694	return work;
 6695}
 6696
 6697static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6698{
 6699	bool do_repoll = false;
 6700	void *have;
 6701	int work;
 6702
 6703	list_del_init(&n->poll_list);
 6704
 6705	have = netpoll_poll_lock(n);
 6706
 6707	work = __napi_poll(n, &do_repoll);
 6708
 6709	if (do_repoll)
 6710		list_add_tail(&n->poll_list, repoll);
 6711
 
 6712	netpoll_poll_unlock(have);
 6713
 6714	return work;
 6715}
 6716
 6717static int napi_thread_wait(struct napi_struct *napi)
 6718{
 6719	bool woken = false;
 6720
 6721	set_current_state(TASK_INTERRUPTIBLE);
 6722
 6723	while (!kthread_should_stop()) {
 6724		/* Testing SCHED_THREADED bit here to make sure the current
 6725		 * kthread owns this napi and could poll on this napi.
 6726		 * Testing SCHED bit is not enough because SCHED bit might be
 6727		 * set by some other busy poll thread or by napi_disable().
 6728		 */
 6729		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
 6730			WARN_ON(!list_empty(&napi->poll_list));
 6731			__set_current_state(TASK_RUNNING);
 6732			return 0;
 6733		}
 6734
 6735		schedule();
 6736		/* woken being true indicates this thread owns this napi. */
 6737		woken = true;
 6738		set_current_state(TASK_INTERRUPTIBLE);
 6739	}
 6740	__set_current_state(TASK_RUNNING);
 6741
 6742	return -1;
 6743}
 6744
 6745static int napi_threaded_poll(void *data)
 6746{
 6747	struct napi_struct *napi = data;
 6748	struct softnet_data *sd;
 6749	void *have;
 6750
 6751	while (!napi_thread_wait(napi)) {
 6752		unsigned long last_qs = jiffies;
 6753
 6754		for (;;) {
 6755			bool repoll = false;
 6756
 6757			local_bh_disable();
 6758			sd = this_cpu_ptr(&softnet_data);
 6759			sd->in_napi_threaded_poll = true;
 6760
 6761			have = netpoll_poll_lock(napi);
 6762			__napi_poll(napi, &repoll);
 6763			netpoll_poll_unlock(have);
 6764
 6765			sd->in_napi_threaded_poll = false;
 6766			barrier();
 6767
 6768			if (sd_has_rps_ipi_waiting(sd)) {
 6769				local_irq_disable();
 6770				net_rps_action_and_irq_enable(sd);
 6771			}
 6772			skb_defer_free_flush(sd);
 6773			local_bh_enable();
 6774
 6775			if (!repoll)
 6776				break;
 6777
 6778			rcu_softirq_qs_periodic(last_qs);
 6779			cond_resched();
 6780		}
 6781	}
 6782	return 0;
 6783}
 6784
 6785static __latent_entropy void net_rx_action(struct softirq_action *h)
 6786{
 6787	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 6788	unsigned long time_limit = jiffies +
 6789		usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
 6790	int budget = READ_ONCE(net_hotdata.netdev_budget);
 6791	LIST_HEAD(list);
 6792	LIST_HEAD(repoll);
 6793
 6794start:
 6795	sd->in_net_rx_action = true;
 6796	local_irq_disable();
 6797	list_splice_init(&sd->poll_list, &list);
 6798	local_irq_enable();
 6799
 6800	for (;;) {
 6801		struct napi_struct *n;
 6802
 6803		skb_defer_free_flush(sd);
 6804
 6805		if (list_empty(&list)) {
 6806			if (list_empty(&repoll)) {
 6807				sd->in_net_rx_action = false;
 6808				barrier();
 6809				/* We need to check if ____napi_schedule()
 6810				 * had refilled poll_list while
 6811				 * sd->in_net_rx_action was true.
 6812				 */
 6813				if (!list_empty(&sd->poll_list))
 6814					goto start;
 6815				if (!sd_has_rps_ipi_waiting(sd))
 6816					goto end;
 6817			}
 6818			break;
 6819		}
 6820
 6821		n = list_first_entry(&list, struct napi_struct, poll_list);
 6822		budget -= napi_poll(n, &repoll);
 6823
 6824		/* If softirq window is exhausted then punt.
 6825		 * Allow this to run for 2 jiffies since which will allow
 6826		 * an average latency of 1.5/HZ.
 6827		 */
 6828		if (unlikely(budget <= 0 ||
 6829			     time_after_eq(jiffies, time_limit))) {
 6830			sd->time_squeeze++;
 6831			break;
 6832		}
 6833	}
 6834
 6835	local_irq_disable();
 6836
 6837	list_splice_tail_init(&sd->poll_list, &list);
 6838	list_splice_tail(&repoll, &list);
 6839	list_splice(&list, &sd->poll_list);
 6840	if (!list_empty(&sd->poll_list))
 6841		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 6842	else
 6843		sd->in_net_rx_action = false;
 6844
 6845	net_rps_action_and_irq_enable(sd);
 6846end:;
 
 6847}
 6848
 6849struct netdev_adjacent {
 6850	struct net_device *dev;
 6851	netdevice_tracker dev_tracker;
 6852
 6853	/* upper master flag, there can only be one master device per list */
 6854	bool master;
 6855
 6856	/* lookup ignore flag */
 6857	bool ignore;
 6858
 6859	/* counter for the number of times this device was added to us */
 6860	u16 ref_nr;
 6861
 6862	/* private field for the users */
 6863	void *private;
 6864
 6865	struct list_head list;
 6866	struct rcu_head rcu;
 6867};
 6868
 6869static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6870						 struct list_head *adj_list)
 6871{
 6872	struct netdev_adjacent *adj;
 6873
 6874	list_for_each_entry(adj, adj_list, list) {
 6875		if (adj->dev == adj_dev)
 6876			return adj;
 6877	}
 6878	return NULL;
 6879}
 6880
 6881static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 6882				    struct netdev_nested_priv *priv)
 6883{
 6884	struct net_device *dev = (struct net_device *)priv->data;
 6885
 6886	return upper_dev == dev;
 6887}
 6888
 6889/**
 6890 * netdev_has_upper_dev - Check if device is linked to an upper device
 6891 * @dev: device
 6892 * @upper_dev: upper device to check
 6893 *
 6894 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this walks the complete stack of upper devices,
 * not only the immediate upper device. The caller must hold the RTNL lock.
 6897 */
 6898bool netdev_has_upper_dev(struct net_device *dev,
 6899			  struct net_device *upper_dev)
 6900{
 6901	struct netdev_nested_priv priv = {
 6902		.data = (void *)upper_dev,
 6903	};
 6904
 6905	ASSERT_RTNL();
 6906
 6907	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6908					     &priv);
 6909}
 6910EXPORT_SYMBOL(netdev_has_upper_dev);
 6911
 6912/**
 6913 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 6914 * @dev: device
 6915 * @upper_dev: upper device to check
 6916 *
 6917 * Find out if a device is linked to specified upper device and return true
 6918 * in case it is. Note that this checks the entire upper device chain.
 6919 * The caller must hold rcu lock.
 6920 */
 6921
 6922bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6923				  struct net_device *upper_dev)
 6924{
 6925	struct netdev_nested_priv priv = {
 6926		.data = (void *)upper_dev,
 6927	};
 6928
 6929	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6930					       &priv);
 6931}
 6932EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6933
 6934/**
 6935 * netdev_has_any_upper_dev - Check if device is linked to some device
 6936 * @dev: device
 6937 *
 6938 * Find out if a device is linked to an upper device and return true in case
 6939 * it is. The caller must hold the RTNL lock.
 6940 */
 6941bool netdev_has_any_upper_dev(struct net_device *dev)
 6942{
 6943	ASSERT_RTNL();
 6944
 6945	return !list_empty(&dev->adj_list.upper);
 6946}
 6947EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6948
 6949/**
 6950 * netdev_master_upper_dev_get - Get master upper device
 6951 * @dev: device
 6952 *
 6953 * Find a master upper device and return pointer to it or NULL in case
 6954 * it's not there. The caller must hold the RTNL lock.
 6955 */
 6956struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6957{
 6958	struct netdev_adjacent *upper;
 6959
 6960	ASSERT_RTNL();
 6961
 6962	if (list_empty(&dev->adj_list.upper))
 6963		return NULL;
 6964
 6965	upper = list_first_entry(&dev->adj_list.upper,
 6966				 struct netdev_adjacent, list);
 6967	if (likely(upper->master))
 6968		return upper->dev;
 6969	return NULL;
 6970}
 6971EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6972
 6973static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6974{
 6975	struct netdev_adjacent *upper;
 6976
 6977	ASSERT_RTNL();
 6978
 6979	if (list_empty(&dev->adj_list.upper))
 6980		return NULL;
 6981
 6982	upper = list_first_entry(&dev->adj_list.upper,
 6983				 struct netdev_adjacent, list);
 6984	if (likely(upper->master) && !upper->ignore)
 6985		return upper->dev;
 6986	return NULL;
 6987}
 6988
 6989/**
 6990 * netdev_has_any_lower_dev - Check if device is linked to some device
 6991 * @dev: device
 6992 *
 6993 * Find out if a device is linked to a lower device and return true in case
 6994 * it is. The caller must hold the RTNL lock.
 6995 */
 6996static bool netdev_has_any_lower_dev(struct net_device *dev)
 6997{
 6998	ASSERT_RTNL();
 6999
 7000	return !list_empty(&dev->adj_list.lower);
 7001}
 7002
 7003void *netdev_adjacent_get_private(struct list_head *adj_list)
 7004{
 7005	struct netdev_adjacent *adj;
 7006
 7007	adj = list_entry(adj_list, struct netdev_adjacent, list);
 7008
 7009	return adj->private;
 7010}
 7011EXPORT_SYMBOL(netdev_adjacent_get_private);
 7012
 7013/**
 7014 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 7015 * @dev: device
 7016 * @iter: list_head ** of the current position
 7017 *
 7018 * Gets the next device from the dev's upper list, starting from iter
 7019 * position. The caller must hold RCU read lock.
 7020 */
 7021struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 7022						 struct list_head **iter)
 7023{
 7024	struct netdev_adjacent *upper;
 7025
 7026	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7027
 7028	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7029
 7030	if (&upper->list == &dev->adj_list.upper)
 7031		return NULL;
 7032
 7033	*iter = &upper->list;
 7034
 7035	return upper->dev;
 7036}
 7037EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 7038
 7039static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 7040						  struct list_head **iter,
 7041						  bool *ignore)
 7042{
 7043	struct netdev_adjacent *upper;
 7044
 7045	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 7046
 7047	if (&upper->list == &dev->adj_list.upper)
 7048		return NULL;
 7049
 7050	*iter = &upper->list;
 7051	*ignore = upper->ignore;
 7052
 7053	return upper->dev;
 7054}
 7055
 7056static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 7057						    struct list_head **iter)
 7058{
 7059	struct netdev_adjacent *upper;
 7060
 7061	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7062
 7063	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7064
 7065	if (&upper->list == &dev->adj_list.upper)
 7066		return NULL;
 7067
 7068	*iter = &upper->list;
 7069
 7070	return upper->dev;
 7071}
 7072
 7073static int __netdev_walk_all_upper_dev(struct net_device *dev,
 7074				       int (*fn)(struct net_device *dev,
 7075					 struct netdev_nested_priv *priv),
 7076				       struct netdev_nested_priv *priv)
 7077{
 7078	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7079	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7080	int ret, cur = 0;
 7081	bool ignore;
 7082
 7083	now = dev;
 7084	iter = &dev->adj_list.upper;
 7085
 7086	while (1) {
 7087		if (now != dev) {
 7088			ret = fn(now, priv);
 7089			if (ret)
 7090				return ret;
 7091		}
 7092
 7093		next = NULL;
 7094		while (1) {
 7095			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7096			if (!udev)
 7097				break;
 7098			if (ignore)
 7099				continue;
 7100
 7101			next = udev;
 7102			niter = &udev->adj_list.upper;
 7103			dev_stack[cur] = now;
 7104			iter_stack[cur++] = iter;
 7105			break;
 7106		}
 7107
 7108		if (!next) {
 7109			if (!cur)
 7110				return 0;
 7111			next = dev_stack[--cur];
 7112			niter = iter_stack[cur];
 7113		}
 7114
 7115		now = next;
 7116		iter = niter;
 7117	}
 7118
 7119	return 0;
 7120}
 7121
 7122int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7123				  int (*fn)(struct net_device *dev,
 7124					    struct netdev_nested_priv *priv),
 7125				  struct netdev_nested_priv *priv)
 7126{
 7127	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7128	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7129	int ret, cur = 0;
 7130
 7131	now = dev;
 7132	iter = &dev->adj_list.upper;
 7133
 7134	while (1) {
 7135		if (now != dev) {
 7136			ret = fn(now, priv);
 7137			if (ret)
 7138				return ret;
 7139		}
 7140
 7141		next = NULL;
 7142		while (1) {
 7143			udev = netdev_next_upper_dev_rcu(now, &iter);
 7144			if (!udev)
 7145				break;
 7146
 7147			next = udev;
 7148			niter = &udev->adj_list.upper;
 7149			dev_stack[cur] = now;
 7150			iter_stack[cur++] = iter;
 7151			break;
 7152		}
 7153
 7154		if (!next) {
 7155			if (!cur)
 7156				return 0;
 7157			next = dev_stack[--cur];
 7158			niter = iter_stack[cur];
 7159		}
 
 
 7160
 7161		now = next;
 7162		iter = niter;
 
 
 7163	}
 7164
 7165	return 0;
 7166}
 7167EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 7168
 7169static bool __netdev_has_upper_dev(struct net_device *dev,
 7170				   struct net_device *upper_dev)
 7171{
 7172	struct netdev_nested_priv priv = {
 7173		.flags = 0,
 7174		.data = (void *)upper_dev,
 7175	};
 7176
 7177	ASSERT_RTNL();
 7178
 7179	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7180					   &priv);
 7181}
 7182
 7183/**
 7184 * netdev_lower_get_next_private - Get the next ->private from the
 7185 *				   lower neighbour list
 7186 * @dev: device
 7187 * @iter: list_head ** of the current position
 7188 *
 7189 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must either hold the
 7191 * RTNL lock or its own locking that guarantees that the neighbour lower
 7192 * list will remain unchanged.
 7193 */
 7194void *netdev_lower_get_next_private(struct net_device *dev,
 7195				    struct list_head **iter)
 7196{
 7197	struct netdev_adjacent *lower;
 7198
 7199	lower = list_entry(*iter, struct netdev_adjacent, list);
 7200
 7201	if (&lower->list == &dev->adj_list.lower)
 7202		return NULL;
 7203
 7204	*iter = lower->list.next;
 7205
 7206	return lower->private;
 7207}
 7208EXPORT_SYMBOL(netdev_lower_get_next_private);
 7209
 7210/**
 7211 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7212 *				       lower neighbour list, RCU
 7213 *				       variant
 7214 * @dev: device
 7215 * @iter: list_head ** of the current position
 7216 *
 7217 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7218 * list, starting from iter position. The caller must hold RCU read lock.
 7219 */
 7220void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7221					struct list_head **iter)
 7222{
 7223	struct netdev_adjacent *lower;
 7224
 7225	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
 7226
 7227	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7228
 7229	if (&lower->list == &dev->adj_list.lower)
 7230		return NULL;
 7231
 7232	*iter = &lower->list;
 7233
 7234	return lower->private;
 7235}
 7236EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7237
 7238/**
 7239 * netdev_lower_get_next - Get the next device from the lower neighbour
 7240 *                         list
 7241 * @dev: device
 7242 * @iter: list_head ** of the current position
 7243 *
 7244 * Gets the next netdev_adjacent from the dev's lower neighbour
 7245 * list, starting from iter position. The caller must hold RTNL lock or
 7246 * its own locking that guarantees that the neighbour lower
 7247 * list will remain unchanged.
 7248 */
 7249void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7250{
 7251	struct netdev_adjacent *lower;
 7252
 7253	lower = list_entry(*iter, struct netdev_adjacent, list);
 7254
 7255	if (&lower->list == &dev->adj_list.lower)
 7256		return NULL;
 7257
 7258	*iter = lower->list.next;
 7259
 7260	return lower->dev;
 7261}
 7262EXPORT_SYMBOL(netdev_lower_get_next);
 7263
 7264static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7265						struct list_head **iter)
 7266{
 7267	struct netdev_adjacent *lower;
 7268
 7269	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7270
 7271	if (&lower->list == &dev->adj_list.lower)
 7272		return NULL;
 7273
 7274	*iter = &lower->list;
 7275
 7276	return lower->dev;
 7277}
 7278
 7279static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7280						  struct list_head **iter,
 7281						  bool *ignore)
 7282{
 7283	struct netdev_adjacent *lower;
 7284
 7285	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7286
 7287	if (&lower->list == &dev->adj_list.lower)
 7288		return NULL;
 7289
 7290	*iter = &lower->list;
 7291	*ignore = lower->ignore;
 7292
 7293	return lower->dev;
 7294}
 7295
 7296int netdev_walk_all_lower_dev(struct net_device *dev,
 7297			      int (*fn)(struct net_device *dev,
 7298					struct netdev_nested_priv *priv),
 7299			      struct netdev_nested_priv *priv)
 7300{
 7301	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7302	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7303	int ret, cur = 0;
 7304
 7305	now = dev;
 7306	iter = &dev->adj_list.lower;
 7307
 7308	while (1) {
 7309		if (now != dev) {
 7310			ret = fn(now, priv);
 7311			if (ret)
 7312				return ret;
 7313		}
 7314
 7315		next = NULL;
 7316		while (1) {
 7317			ldev = netdev_next_lower_dev(now, &iter);
 7318			if (!ldev)
 7319				break;
 7320
 7321			next = ldev;
 7322			niter = &ldev->adj_list.lower;
 7323			dev_stack[cur] = now;
 7324			iter_stack[cur++] = iter;
 7325			break;
 7326		}
 7327
 7328		if (!next) {
 7329			if (!cur)
 7330				return 0;
 7331			next = dev_stack[--cur];
 7332			niter = iter_stack[cur];
 7333		}
 
 
 7334
 7335		now = next;
 7336		iter = niter;
 
 
 7337	}
 7338
 7339	return 0;
 7340}
 7341EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7342
 7343static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7344				       int (*fn)(struct net_device *dev,
 7345					 struct netdev_nested_priv *priv),
 7346				       struct netdev_nested_priv *priv)
 7347{
 7348	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7349	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7350	int ret, cur = 0;
 7351	bool ignore;
 7352
 7353	now = dev;
 7354	iter = &dev->adj_list.lower;
 7355
 7356	while (1) {
 7357		if (now != dev) {
 7358			ret = fn(now, priv);
 7359			if (ret)
 7360				return ret;
 7361		}
 7362
 7363		next = NULL;
 7364		while (1) {
 7365			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7366			if (!ldev)
 7367				break;
 7368			if (ignore)
 7369				continue;
 7370
 7371			next = ldev;
 7372			niter = &ldev->adj_list.lower;
 7373			dev_stack[cur] = now;
 7374			iter_stack[cur++] = iter;
 7375			break;
 7376		}
 7377
 7378		if (!next) {
 7379			if (!cur)
 7380				return 0;
 7381			next = dev_stack[--cur];
 7382			niter = iter_stack[cur];
 7383		}
 7384
 7385		now = next;
 7386		iter = niter;
 7387	}
 7388
 7389	return 0;
 7390}
 7391
 7392struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7393					     struct list_head **iter)
 7394{
 7395	struct netdev_adjacent *lower;
 7396
 7397	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7398	if (&lower->list == &dev->adj_list.lower)
 7399		return NULL;
 7400
 7401	*iter = &lower->list;
 7402
 7403	return lower->dev;
 7404}
 7405EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7406
 7407static u8 __netdev_upper_depth(struct net_device *dev)
 7408{
 7409	struct net_device *udev;
 7410	struct list_head *iter;
 7411	u8 max_depth = 0;
 7412	bool ignore;
 7413
 7414	for (iter = &dev->adj_list.upper,
 7415	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7416	     udev;
 7417	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7418		if (ignore)
 7419			continue;
 7420		if (max_depth < udev->upper_level)
 7421			max_depth = udev->upper_level;
 7422	}
 7423
 7424	return max_depth;
 7425}
 7426
 7427static u8 __netdev_lower_depth(struct net_device *dev)
 
 
 
 7428{
 7429	struct net_device *ldev;
 7430	struct list_head *iter;
 7431	u8 max_depth = 0;
 7432	bool ignore;
 7433
 7434	for (iter = &dev->adj_list.lower,
 7435	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7436	     ldev;
 7437	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7438		if (ignore)
 7439			continue;
 7440		if (max_depth < ldev->lower_level)
 7441			max_depth = ldev->lower_level;
 7442	}
 7443
 7444	return max_depth;
 7445}
 7446
 7447static int __netdev_update_upper_level(struct net_device *dev,
 7448				       struct netdev_nested_priv *__unused)
 7449{
 7450	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7451	return 0;
 7452}
 7453
 7454#ifdef CONFIG_LOCKDEP
 7455static LIST_HEAD(net_unlink_list);
 7456
 7457static void net_unlink_todo(struct net_device *dev)
 7458{
 7459	if (list_empty(&dev->unlink_list))
 7460		list_add_tail(&dev->unlink_list, &net_unlink_list);
 7461}
 7462#endif
 7463
 7464static int __netdev_update_lower_level(struct net_device *dev,
 7465				       struct netdev_nested_priv *priv)
 7466{
 7467	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7468
 7469#ifdef CONFIG_LOCKDEP
 7470	if (!priv)
 7471		return 0;
 7472
 7473	if (priv->flags & NESTED_SYNC_IMM)
 7474		dev->nested_level = dev->lower_level - 1;
 7475	if (priv->flags & NESTED_SYNC_TODO)
 7476		net_unlink_todo(dev);
 7477#endif
 7478	return 0;
 7479}
 7480
 7481int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7482				  int (*fn)(struct net_device *dev,
 7483					    struct netdev_nested_priv *priv),
 7484				  struct netdev_nested_priv *priv)
 7485{
 7486	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7487	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7488	int ret, cur = 0;
 7489
 7490	now = dev;
 7491	iter = &dev->adj_list.lower;
 7492
 7493	while (1) {
 7494		if (now != dev) {
 7495			ret = fn(now, priv);
 7496			if (ret)
 7497				return ret;
 7498		}
 7499
 7500		next = NULL;
 7501		while (1) {
 7502			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7503			if (!ldev)
 7504				break;
 7505
 7506			next = ldev;
 7507			niter = &ldev->adj_list.lower;
 7508			dev_stack[cur] = now;
 7509			iter_stack[cur++] = iter;
 7510			break;
 7511		}
 7512
 7513		if (!next) {
 7514			if (!cur)
 7515				return 0;
 7516			next = dev_stack[--cur];
 7517			niter = iter_stack[cur];
 7518		}
 7519
 7520		now = next;
 7521		iter = niter;
 
 
 7522	}
 7523
 7524	return 0;
 7525}
 7526EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7527
 7528/**
 7529 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7530 *				       lower neighbour list, RCU
 7531 *				       variant
 7532 * @dev: device
 7533 *
 7534 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7535 * list. The caller must hold RCU read lock.
 7536 */
 7537void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7538{
 7539	struct netdev_adjacent *lower;
 7540
 7541	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7542			struct netdev_adjacent, list);
 7543	if (lower)
 7544		return lower->private;
 7545	return NULL;
 7546}
 7547EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7548
 7549/**
 7550 * netdev_master_upper_dev_get_rcu - Get master upper device
 7551 * @dev: device
 7552 *
 7553 * Find a master upper device and return pointer to it or NULL in case
 7554 * it's not there. The caller must hold the RCU read lock.
 7555 */
 7556struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7557{
 7558	struct netdev_adjacent *upper;
 7559
 7560	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7561				       struct netdev_adjacent, list);
 7562	if (upper && likely(upper->master))
 7563		return upper->dev;
 7564	return NULL;
 7565}
 7566EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7567
 7568static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7569			      struct net_device *adj_dev,
 7570			      struct list_head *dev_list)
 7571{
 7572	char linkname[IFNAMSIZ+7];
 7573
 7574	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7575		"upper_%s" : "lower_%s", adj_dev->name);
 7576	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7577				 linkname);
 7578}
 7579static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7580			       char *name,
 7581			       struct list_head *dev_list)
 7582{
 7583	char linkname[IFNAMSIZ+7];
 7584
 7585	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7586		"upper_%s" : "lower_%s", name);
 7587	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7588}
 7589
 7590static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7591						 struct net_device *adj_dev,
 7592						 struct list_head *dev_list)
 7593{
 7594	return (dev_list == &dev->adj_list.upper ||
 7595		dev_list == &dev->adj_list.lower) &&
 7596		net_eq(dev_net(dev), dev_net(adj_dev));
 7597}
 7598
 7599static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7600					struct net_device *adj_dev,
 7601					struct list_head *dev_list,
 7602					void *private, bool master)
 7603{
 7604	struct netdev_adjacent *adj;
 7605	int ret;
 7606
 7607	adj = __netdev_find_adj(adj_dev, dev_list);
 7608
 7609	if (adj) {
 7610		adj->ref_nr += 1;
 7611		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7612			 dev->name, adj_dev->name, adj->ref_nr);
 7613
 7614		return 0;
 7615	}
 7616
 7617	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7618	if (!adj)
 7619		return -ENOMEM;
 7620
 7621	adj->dev = adj_dev;
 7622	adj->master = master;
 7623	adj->ref_nr = 1;
 7624	adj->private = private;
 7625	adj->ignore = false;
 7626	netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
 7627
 7628	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7629		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7630
 7631	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7632		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7633		if (ret)
 7634			goto free_adj;
 7635	}
 7636
 7637	/* Ensure that master link is always the first item in list. */
 7638	if (master) {
 7639		ret = sysfs_create_link(&(dev->dev.kobj),
 7640					&(adj_dev->dev.kobj), "master");
 7641		if (ret)
 7642			goto remove_symlinks;
 7643
 7644		list_add_rcu(&adj->list, dev_list);
 7645	} else {
 7646		list_add_tail_rcu(&adj->list, dev_list);
 7647	}
 7648
 7649	return 0;
 7650
 7651remove_symlinks:
 7652	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7653		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7654free_adj:
 7655	netdev_put(adj_dev, &adj->dev_tracker);
 7656	kfree(adj);
 
 7657
 7658	return ret;
 7659}
 7660
 7661static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7662					 struct net_device *adj_dev,
 7663					 u16 ref_nr,
 7664					 struct list_head *dev_list)
 7665{
 7666	struct netdev_adjacent *adj;
 7667
 7668	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7669		 dev->name, adj_dev->name, ref_nr);
 7670
 7671	adj = __netdev_find_adj(adj_dev, dev_list);
 7672
 7673	if (!adj) {
 7674		pr_err("Adjacency does not exist for device %s from %s\n",
 7675		       dev->name, adj_dev->name);
 7676		WARN_ON(1);
 7677		return;
 7678	}
 7679
 7680	if (adj->ref_nr > ref_nr) {
 7681		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7682			 dev->name, adj_dev->name, ref_nr,
 7683			 adj->ref_nr - ref_nr);
 7684		adj->ref_nr -= ref_nr;
 7685		return;
 7686	}
 7687
 7688	if (adj->master)
 7689		sysfs_remove_link(&(dev->dev.kobj), "master");
 7690
 7691	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7692		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7693
 7694	list_del_rcu(&adj->list);
 7695	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7696		 adj_dev->name, dev->name, adj_dev->name);
 7697	netdev_put(adj_dev, &adj->dev_tracker);
 7698	kfree_rcu(adj, rcu);
 7699}
 7700
 7701static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7702					    struct net_device *upper_dev,
 7703					    struct list_head *up_list,
 7704					    struct list_head *down_list,
 7705					    void *private, bool master)
 7706{
 7707	int ret;
 7708
 7709	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7710					   private, master);
 7711	if (ret)
 7712		return ret;
 7713
 7714	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7715					   private, false);
 7716	if (ret) {
 7717		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7718		return ret;
 7719	}
 7720
 7721	return 0;
 7722}
 7723
 7724static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7725					       struct net_device *upper_dev,
 7726					       u16 ref_nr,
 7727					       struct list_head *up_list,
 7728					       struct list_head *down_list)
 7729{
 7730	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7731	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7732}
 7733
 7734static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7735						struct net_device *upper_dev,
 7736						void *private, bool master)
 7737{
 7738	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7739						&dev->adj_list.upper,
 7740						&upper_dev->adj_list.lower,
 7741						private, master);
 7742}
 7743
 7744static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7745						   struct net_device *upper_dev)
 7746{
 7747	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7748					   &dev->adj_list.upper,
 7749					   &upper_dev->adj_list.lower);
 7750}
 7751
 7752static int __netdev_upper_dev_link(struct net_device *dev,
 7753				   struct net_device *upper_dev, bool master,
 7754				   void *upper_priv, void *upper_info,
 7755				   struct netdev_nested_priv *priv,
 7756				   struct netlink_ext_ack *extack)
 7757{
 7758	struct netdev_notifier_changeupper_info changeupper_info = {
 7759		.info = {
 7760			.dev = dev,
 7761			.extack = extack,
 7762		},
 7763		.upper_dev = upper_dev,
 7764		.master = master,
 7765		.linking = true,
 7766		.upper_info = upper_info,
 7767	};
 7768	struct net_device *master_dev;
 7769	int ret = 0;
 7770
 7771	ASSERT_RTNL();
 7772
 7773	if (dev == upper_dev)
 7774		return -EBUSY;
 7775
 7776	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7777	if (__netdev_has_upper_dev(upper_dev, dev))
 7778		return -EBUSY;
 7779
 7780	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7781		return -EMLINK;
 7782
 7783	if (!master) {
 7784		if (__netdev_has_upper_dev(dev, upper_dev))
 7785			return -EEXIST;
 7786	} else {
 7787		master_dev = __netdev_master_upper_dev_get(dev);
 7788		if (master_dev)
 7789			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7790	}
 7791
 7792	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7793					    &changeupper_info.info);
 7794	ret = notifier_to_errno(ret);
 7795	if (ret)
 7796		return ret;
 7797
 7798	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7799						   master);
 7800	if (ret)
 7801		return ret;
 7802
 7803	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7804					    &changeupper_info.info);
 7805	ret = notifier_to_errno(ret);
 7806	if (ret)
 7807		goto rollback;
 7808
 7809	__netdev_update_upper_level(dev, NULL);
 7810	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7811
 7812	__netdev_update_lower_level(upper_dev, priv);
 7813	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7814				    priv);
 7815
 7816	return 0;
 7817
 7818rollback:
 7819	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7820
 7821	return ret;
 7822}
 7823
 7824/**
 7825 * netdev_upper_dev_link - Add a link to the upper device
 7826 * @dev: device
 7827 * @upper_dev: new upper device
 7828 * @extack: netlink extended ack
 7829 *
 7830 * Adds a link to device which is upper to this one. The caller must hold
 7831 * the RTNL lock. On a failure a negative errno code is returned.
 7832 * On success the reference counts are adjusted and the function
 7833 * returns zero.
 7834 */
 7835int netdev_upper_dev_link(struct net_device *dev,
 7836			  struct net_device *upper_dev,
 7837			  struct netlink_ext_ack *extack)
 7838{
 7839	struct netdev_nested_priv priv = {
 7840		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7841		.data = NULL,
 7842	};
 7843
 7844	return __netdev_upper_dev_link(dev, upper_dev, false,
 7845				       NULL, NULL, &priv, extack);
 7846}
 7847EXPORT_SYMBOL(netdev_upper_dev_link);
 7848
 7849/**
 7850 * netdev_master_upper_dev_link - Add a master link to the upper device
 7851 * @dev: device
 7852 * @upper_dev: new upper device
 7853 * @upper_priv: upper device private
 7854 * @upper_info: upper info to be passed down via notifier
 7855 * @extack: netlink extended ack
 7856 *
 7857 * Adds a link to device which is upper to this one. In this case, only
 7858 * one master upper device can be linked, although other non-master devices
 7859 * might be linked as well. The caller must hold the RTNL lock.
 7860 * On a failure a negative errno code is returned. On success the reference
 7861 * counts are adjusted and the function returns zero.
 7862 */
 7863int netdev_master_upper_dev_link(struct net_device *dev,
 7864				 struct net_device *upper_dev,
 7865				 void *upper_priv, void *upper_info,
 7866				 struct netlink_ext_ack *extack)
 7867{
 7868	struct netdev_nested_priv priv = {
 7869		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7870		.data = NULL,
 7871	};
 7872
 7873	return __netdev_upper_dev_link(dev, upper_dev, true,
 7874				       upper_priv, upper_info, &priv, extack);
 7875}
 7876EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7877
 7878static void __netdev_upper_dev_unlink(struct net_device *dev,
 7879				      struct net_device *upper_dev,
 7880				      struct netdev_nested_priv *priv)
 7881{
 7882	struct netdev_notifier_changeupper_info changeupper_info = {
 7883		.info = {
 7884			.dev = dev,
 7885		},
 7886		.upper_dev = upper_dev,
 7887		.linking = false,
 7888	};
 7889
 7890	ASSERT_RTNL();
 7891
 7892	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7893
 7894	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7895				      &changeupper_info.info);
 7896
 7897	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7898
 7899	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7900				      &changeupper_info.info);
 7901
 7902	__netdev_update_upper_level(dev, NULL);
 7903	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7904
 7905	__netdev_update_lower_level(upper_dev, priv);
 7906	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7907				    priv);
 7908}
 7909
 7910/**
 7911 * netdev_upper_dev_unlink - Removes a link to upper device
 7912 * @dev: device
 7913 * @upper_dev: new upper device
 7914 *
 7915 * Removes a link to device which is upper to this one. The caller must hold
 7916 * the RTNL lock.
 7917 */
 7918void netdev_upper_dev_unlink(struct net_device *dev,
 7919			     struct net_device *upper_dev)
 7920{
 7921	struct netdev_nested_priv priv = {
 7922		.flags = NESTED_SYNC_TODO,
 7923		.data = NULL,
 7924	};
 7925
 7926	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 7927}
 7928EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7929
 7930static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7931				      struct net_device *lower_dev,
 7932				      bool val)
 7933{
 7934	struct netdev_adjacent *adj;
 7935
 7936	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7937	if (adj)
 7938		adj->ignore = val;
 7939
 7940	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7941	if (adj)
 7942		adj->ignore = val;
 7943}
 7944
 7945static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7946					struct net_device *lower_dev)
 7947{
 7948	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7949}
 7950
 7951static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7952				       struct net_device *lower_dev)
 7953{
 7954	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7955}
 7956
 7957int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7958				   struct net_device *new_dev,
 7959				   struct net_device *dev,
 7960				   struct netlink_ext_ack *extack)
 7961{
 7962	struct netdev_nested_priv priv = {
 7963		.flags = 0,
 7964		.data = NULL,
 7965	};
 7966	int err;
 7967
 7968	if (!new_dev)
 7969		return 0;
 7970
 7971	if (old_dev && new_dev != old_dev)
 7972		netdev_adjacent_dev_disable(dev, old_dev);
 7973	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 7974				      extack);
 7975	if (err) {
 7976		if (old_dev && new_dev != old_dev)
 7977			netdev_adjacent_dev_enable(dev, old_dev);
 7978		return err;
 7979	}
 7980
 7981	return 0;
 7982}
 7983EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7984
 7985void netdev_adjacent_change_commit(struct net_device *old_dev,
 7986				   struct net_device *new_dev,
 7987				   struct net_device *dev)
 7988{
 7989	struct netdev_nested_priv priv = {
 7990		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7991		.data = NULL,
 7992	};
 7993
 7994	if (!new_dev || !old_dev)
 7995		return;
 7996
 7997	if (new_dev == old_dev)
 7998		return;
 7999
 8000	netdev_adjacent_dev_enable(dev, old_dev);
 8001	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 8002}
 8003EXPORT_SYMBOL(netdev_adjacent_change_commit);
 8004
 8005void netdev_adjacent_change_abort(struct net_device *old_dev,
 8006				  struct net_device *new_dev,
 8007				  struct net_device *dev)
 8008{
 8009	struct netdev_nested_priv priv = {
 8010		.flags = 0,
 8011		.data = NULL,
 8012	};
 8013
 8014	if (!new_dev)
 8015		return;
 8016
 8017	if (old_dev && new_dev != old_dev)
 8018		netdev_adjacent_dev_enable(dev, old_dev);
 8019
 8020	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 
 8021}
 8022EXPORT_SYMBOL(netdev_adjacent_change_abort);
 8023
 8024/**
 8025 * netdev_bonding_info_change - Dispatch event about slave change
 8026 * @dev: device
 8027 * @bonding_info: info to dispatch
 8028 *
 8029 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 8030 * The caller must hold the RTNL lock.
 8031 */
 8032void netdev_bonding_info_change(struct net_device *dev,
 8033				struct netdev_bonding_info *bonding_info)
 8034{
 8035	struct netdev_notifier_bonding_info info = {
 8036		.info.dev = dev,
 8037	};
 8038
 8039	memcpy(&info.bonding_info, bonding_info,
 8040	       sizeof(struct netdev_bonding_info));
 8041	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 8042				      &info.info);
 8043}
 8044EXPORT_SYMBOL(netdev_bonding_info_change);
 8045
 8046static int netdev_offload_xstats_enable_l3(struct net_device *dev,
 8047					   struct netlink_ext_ack *extack)
 8048{
 8049	struct netdev_notifier_offload_xstats_info info = {
 8050		.info.dev = dev,
 8051		.info.extack = extack,
 8052		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 8053	};
 8054	int err;
 8055	int rc;
 8056
 8057	dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
 8058					 GFP_KERNEL);
 8059	if (!dev->offload_xstats_l3)
 8060		return -ENOMEM;
 8061
 8062	rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
 8063						  NETDEV_OFFLOAD_XSTATS_DISABLE,
 8064						  &info.info);
 8065	err = notifier_to_errno(rc);
 8066	if (err)
 8067		goto free_stats;
 8068
 8069	return 0;
 8070
 8071free_stats:
 8072	kfree(dev->offload_xstats_l3);
 8073	dev->offload_xstats_l3 = NULL;
 8074	return err;
 8075}
 8076
 8077int netdev_offload_xstats_enable(struct net_device *dev,
 8078				 enum netdev_offload_xstats_type type,
 8079				 struct netlink_ext_ack *extack)
 8080{
 8081	ASSERT_RTNL();
 8082
 8083	if (netdev_offload_xstats_enabled(dev, type))
 8084		return -EALREADY;
 8085
 8086	switch (type) {
 8087	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8088		return netdev_offload_xstats_enable_l3(dev, extack);
 8089	}
 8090
 8091	WARN_ON(1);
 8092	return -EINVAL;
 8093}
 8094EXPORT_SYMBOL(netdev_offload_xstats_enable);
 8095
 8096static void netdev_offload_xstats_disable_l3(struct net_device *dev)
 8097{
 8098	struct netdev_notifier_offload_xstats_info info = {
 8099		.info.dev = dev,
 8100		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 8101	};
 8102
 8103	call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
 8104				      &info.info);
 8105	kfree(dev->offload_xstats_l3);
 8106	dev->offload_xstats_l3 = NULL;
 8107}
 8108
 8109int netdev_offload_xstats_disable(struct net_device *dev,
 8110				  enum netdev_offload_xstats_type type)
 8111{
 8112	ASSERT_RTNL();
 8113
 8114	if (!netdev_offload_xstats_enabled(dev, type))
 8115		return -EALREADY;
 8116
 8117	switch (type) {
 8118	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8119		netdev_offload_xstats_disable_l3(dev);
 8120		return 0;
 8121	}
 8122
 8123	WARN_ON(1);
 8124	return -EINVAL;
 8125}
 8126EXPORT_SYMBOL(netdev_offload_xstats_disable);
 8127
 8128static void netdev_offload_xstats_disable_all(struct net_device *dev)
 8129{
 8130	netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
 8131}
 8132
 8133static struct rtnl_hw_stats64 *
 8134netdev_offload_xstats_get_ptr(const struct net_device *dev,
 8135			      enum netdev_offload_xstats_type type)
 8136{
 8137	switch (type) {
 8138	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8139		return dev->offload_xstats_l3;
 8140	}
 8141
 8142	WARN_ON(1);
 8143	return NULL;
 8144}
 8145
 8146bool netdev_offload_xstats_enabled(const struct net_device *dev,
 8147				   enum netdev_offload_xstats_type type)
 8148{
 8149	ASSERT_RTNL();
 8150
 8151	return netdev_offload_xstats_get_ptr(dev, type);
 8152}
 8153EXPORT_SYMBOL(netdev_offload_xstats_enabled);
 8154
 8155struct netdev_notifier_offload_xstats_ru {
 8156	bool used;
 8157};
 8158
 8159struct netdev_notifier_offload_xstats_rd {
 8160	struct rtnl_hw_stats64 stats;
 8161	bool used;
 8162};
 8163
 8164static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
 8165				  const struct rtnl_hw_stats64 *src)
 8166{
 8167	dest->rx_packets	  += src->rx_packets;
 8168	dest->tx_packets	  += src->tx_packets;
 8169	dest->rx_bytes		  += src->rx_bytes;
 8170	dest->tx_bytes		  += src->tx_bytes;
 8171	dest->rx_errors		  += src->rx_errors;
 8172	dest->tx_errors		  += src->tx_errors;
 8173	dest->rx_dropped	  += src->rx_dropped;
 8174	dest->tx_dropped	  += src->tx_dropped;
 8175	dest->multicast		  += src->multicast;
 8176}
 8177
 8178static int netdev_offload_xstats_get_used(struct net_device *dev,
 8179					  enum netdev_offload_xstats_type type,
 8180					  bool *p_used,
 8181					  struct netlink_ext_ack *extack)
 8182{
 8183	struct netdev_notifier_offload_xstats_ru report_used = {};
 8184	struct netdev_notifier_offload_xstats_info info = {
 8185		.info.dev = dev,
 8186		.info.extack = extack,
 8187		.type = type,
 8188		.report_used = &report_used,
 8189	};
 8190	int rc;
 8191
 8192	WARN_ON(!netdev_offload_xstats_enabled(dev, type));
 8193	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 8194					   &info.info);
 8195	*p_used = report_used.used;
 8196	return notifier_to_errno(rc);
 8197}
 8198
 8199static int netdev_offload_xstats_get_stats(struct net_device *dev,
 8200					   enum netdev_offload_xstats_type type,
 8201					   struct rtnl_hw_stats64 *p_stats,
 8202					   bool *p_used,
 8203					   struct netlink_ext_ack *extack)
 8204{
 8205	struct netdev_notifier_offload_xstats_rd report_delta = {};
 8206	struct netdev_notifier_offload_xstats_info info = {
 8207		.info.dev = dev,
 8208		.info.extack = extack,
 8209		.type = type,
 8210		.report_delta = &report_delta,
 8211	};
 8212	struct rtnl_hw_stats64 *stats;
 8213	int rc;
 8214
 8215	stats = netdev_offload_xstats_get_ptr(dev, type);
 8216	if (WARN_ON(!stats))
 8217		return -EINVAL;
 8218
 8219	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 8220					   &info.info);
 8221
 8222	/* Cache whatever we got, even if there was an error, otherwise the
 8223	 * successful stats retrievals would get lost.
 8224	 */
 8225	netdev_hw_stats64_add(stats, &report_delta.stats);
 8226
 8227	if (p_stats)
 8228		*p_stats = *stats;
 8229	*p_used = report_delta.used;
 8230
 8231	return notifier_to_errno(rc);
 8232}
 8233
 8234int netdev_offload_xstats_get(struct net_device *dev,
 8235			      enum netdev_offload_xstats_type type,
 8236			      struct rtnl_hw_stats64 *p_stats, bool *p_used,
 8237			      struct netlink_ext_ack *extack)
 8238{
 8239	ASSERT_RTNL();
 8240
 8241	if (p_stats)
 8242		return netdev_offload_xstats_get_stats(dev, type, p_stats,
 8243						       p_used, extack);
 8244	else
 8245		return netdev_offload_xstats_get_used(dev, type, p_used,
 8246						      extack);
 8247}
 8248EXPORT_SYMBOL(netdev_offload_xstats_get);
 8249
 8250void
 8251netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
 8252				   const struct rtnl_hw_stats64 *stats)
 8253{
 8254	report_delta->used = true;
 8255	netdev_hw_stats64_add(&report_delta->stats, stats);
 8256}
 8257EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
 8258
 8259void
 8260netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
 8261{
 8262	report_used->used = true;
 8263}
 8264EXPORT_SYMBOL(netdev_offload_xstats_report_used);
 8265
 8266void netdev_offload_xstats_push_delta(struct net_device *dev,
 8267				      enum netdev_offload_xstats_type type,
 8268				      const struct rtnl_hw_stats64 *p_stats)
 8269{
 8270	struct rtnl_hw_stats64 *stats;
 8271
 8272	ASSERT_RTNL();
 8273
 8274	stats = netdev_offload_xstats_get_ptr(dev, type);
 8275	if (WARN_ON(!stats))
 8276		return;
 8277
 8278	netdev_hw_stats64_add(stats, p_stats);
 8279}
 8280EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
 8281
 8282/**
 8283 * netdev_get_xmit_slave - Get the xmit slave of master device
 8284 * @dev: device
 8285 * @skb: The packet
 8286 * @all_slaves: assume all the slaves are active
 8287 *
 8288 * The reference counters are not incremented so the caller must be
 8289 * careful with locks. The caller must hold RCU lock.
 8290 * %NULL is returned if no slave is found.
 8291 */
 8292
 8293struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 8294					 struct sk_buff *skb,
 8295					 bool all_slaves)
 8296{
 8297	const struct net_device_ops *ops = dev->netdev_ops;
 8298
 8299	if (!ops->ndo_get_xmit_slave)
 8300		return NULL;
 8301	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 8302}
 8303EXPORT_SYMBOL(netdev_get_xmit_slave);
 8304
 8305static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
 8306						  struct sock *sk)
 8307{
 8308	const struct net_device_ops *ops = dev->netdev_ops;
 8309
 8310	if (!ops->ndo_sk_get_lower_dev)
 8311		return NULL;
 8312	return ops->ndo_sk_get_lower_dev(dev, sk);
 8313}
 8314
 8315/**
 8316 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
 8317 * @dev: device
 8318 * @sk: the socket
 8319 *
 8320 * %NULL is returned if no lower device is found.
 8321 */
 8322
 8323struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
 8324					    struct sock *sk)
 8325{
 8326	struct net_device *lower;
 8327
 8328	lower = netdev_sk_get_lower_dev(dev, sk);
 8329	while (lower) {
 8330		dev = lower;
 8331		lower = netdev_sk_get_lower_dev(dev, sk);
 8332	}
 8333
 8334	return dev;
 8335}
 8336EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
 8337
 8338static void netdev_adjacent_add_links(struct net_device *dev)
 8339{
 8340	struct netdev_adjacent *iter;
 8341
 8342	struct net *net = dev_net(dev);
 8343
 8344	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8345		if (!net_eq(net, dev_net(iter->dev)))
 8346			continue;
 8347		netdev_adjacent_sysfs_add(iter->dev, dev,
 8348					  &iter->dev->adj_list.lower);
 8349		netdev_adjacent_sysfs_add(dev, iter->dev,
 8350					  &dev->adj_list.upper);
 8351	}
 8352
 8353	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8354		if (!net_eq(net, dev_net(iter->dev)))
 8355			continue;
 8356		netdev_adjacent_sysfs_add(iter->dev, dev,
 8357					  &iter->dev->adj_list.upper);
 8358		netdev_adjacent_sysfs_add(dev, iter->dev,
 8359					  &dev->adj_list.lower);
 8360	}
 8361}
 8362
 8363static void netdev_adjacent_del_links(struct net_device *dev)
 8364{
 8365	struct netdev_adjacent *iter;
 8366
 8367	struct net *net = dev_net(dev);
 8368
 8369	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8370		if (!net_eq(net, dev_net(iter->dev)))
 8371			continue;
 8372		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8373					  &iter->dev->adj_list.lower);
 8374		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8375					  &dev->adj_list.upper);
 8376	}
 8377
 8378	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8379		if (!net_eq(net, dev_net(iter->dev)))
 8380			continue;
 8381		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8382					  &iter->dev->adj_list.upper);
 8383		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8384					  &dev->adj_list.lower);
 8385	}
 8386}
 8387
 8388void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8389{
 8390	struct netdev_adjacent *iter;
 8391
 8392	struct net *net = dev_net(dev);
 8393
 8394	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8395		if (!net_eq(net, dev_net(iter->dev)))
 8396			continue;
 8397		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8398					  &iter->dev->adj_list.lower);
 8399		netdev_adjacent_sysfs_add(iter->dev, dev,
 8400					  &iter->dev->adj_list.lower);
 8401	}
 8402
 8403	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8404		if (!net_eq(net, dev_net(iter->dev)))
 8405			continue;
 8406		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8407					  &iter->dev->adj_list.upper);
 8408		netdev_adjacent_sysfs_add(iter->dev, dev,
 8409					  &iter->dev->adj_list.upper);
 8410	}
 8411}
 8412
 8413void *netdev_lower_dev_get_private(struct net_device *dev,
 8414				   struct net_device *lower_dev)
 8415{
 8416	struct netdev_adjacent *lower;
 8417
 8418	if (!lower_dev)
 8419		return NULL;
 8420	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8421	if (!lower)
 8422		return NULL;
 8423
 8424	return lower->private;
 8425}
 8426EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8427
 8428
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8429/**
 8430 * netdev_lower_state_changed - Dispatch event about lower device state change
 8431 * @lower_dev: device
 8432 * @lower_state_info: state to dispatch
 8433 *
 8434 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8435 * The caller must hold the RTNL lock.
 8436 */
 8437void netdev_lower_state_changed(struct net_device *lower_dev,
 8438				void *lower_state_info)
 8439{
 8440	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8441		.info.dev = lower_dev,
 8442	};
 8443
 8444	ASSERT_RTNL();
 8445	changelowerstate_info.lower_state_info = lower_state_info;
 8446	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8447				      &changelowerstate_info.info);
 8448}
 8449EXPORT_SYMBOL(netdev_lower_state_changed);
 8450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8451static void dev_change_rx_flags(struct net_device *dev, int flags)
 8452{
 8453	const struct net_device_ops *ops = dev->netdev_ops;
 8454
 8455	if (ops->ndo_change_rx_flags)
 8456		ops->ndo_change_rx_flags(dev, flags);
 8457}
 8458
 8459static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8460{
 8461	unsigned int old_flags = dev->flags;
 8462	kuid_t uid;
 8463	kgid_t gid;
 8464
 8465	ASSERT_RTNL();
 8466
 8467	dev->flags |= IFF_PROMISC;
 8468	dev->promiscuity += inc;
 8469	if (dev->promiscuity == 0) {
 8470		/*
 8471		 * Avoid overflow.
 8472		 * If inc causes overflow, untouch promisc and return error.
 8473		 */
 8474		if (inc < 0)
 8475			dev->flags &= ~IFF_PROMISC;
 8476		else {
 8477			dev->promiscuity -= inc;
 8478			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
 
 8479			return -EOVERFLOW;
 8480		}
 8481	}
 8482	if (dev->flags != old_flags) {
 8483		netdev_info(dev, "%s promiscuous mode\n",
 8484			    dev->flags & IFF_PROMISC ? "entered" : "left");
 
 8485		if (audit_enabled) {
 8486			current_uid_gid(&uid, &gid);
 8487			audit_log(audit_context(), GFP_ATOMIC,
 8488				  AUDIT_ANOM_PROMISCUOUS,
 8489				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8490				  dev->name, (dev->flags & IFF_PROMISC),
 8491				  (old_flags & IFF_PROMISC),
 8492				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8493				  from_kuid(&init_user_ns, uid),
 8494				  from_kgid(&init_user_ns, gid),
 8495				  audit_get_sessionid(current));
 8496		}
 8497
 8498		dev_change_rx_flags(dev, IFF_PROMISC);
 8499	}
 8500	if (notify)
 8501		__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
 8502	return 0;
 8503}
 8504
 8505/**
 8506 *	dev_set_promiscuity	- update promiscuity count on a device
 8507 *	@dev: device
 8508 *	@inc: modifier
 8509 *
 8510 *	Add or remove promiscuity from a device. While the count in the device
 8511 *	remains above zero the interface remains promiscuous. Once it hits zero
 8512 *	the device reverts back to normal filtering operation. A negative inc
 8513 *	value is used to drop promiscuity on the device.
 8514 *	Return 0 if successful or a negative errno code on error.
 8515 */
 8516int dev_set_promiscuity(struct net_device *dev, int inc)
 8517{
 8518	unsigned int old_flags = dev->flags;
 8519	int err;
 8520
 8521	err = __dev_set_promiscuity(dev, inc, true);
 8522	if (err < 0)
 8523		return err;
 8524	if (dev->flags != old_flags)
 8525		dev_set_rx_mode(dev);
 8526	return err;
 8527}
 8528EXPORT_SYMBOL(dev_set_promiscuity);
 8529
 8530static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 8531{
 8532	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8533
 8534	ASSERT_RTNL();
 8535
 8536	dev->flags |= IFF_ALLMULTI;
 8537	dev->allmulti += inc;
 8538	if (dev->allmulti == 0) {
 8539		/*
 8540		 * Avoid overflow.
 8541		 * If inc causes overflow, untouch allmulti and return error.
 8542		 */
 8543		if (inc < 0)
 8544			dev->flags &= ~IFF_ALLMULTI;
 8545		else {
 8546			dev->allmulti -= inc;
 8547			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
 
 8548			return -EOVERFLOW;
 8549		}
 8550	}
 8551	if (dev->flags ^ old_flags) {
 8552		netdev_info(dev, "%s allmulticast mode\n",
 8553			    dev->flags & IFF_ALLMULTI ? "entered" : "left");
 8554		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8555		dev_set_rx_mode(dev);
 8556		if (notify)
 8557			__dev_notify_flags(dev, old_flags,
 8558					   dev->gflags ^ old_gflags, 0, NULL);
 8559	}
 8560	return 0;
 8561}
 8562
 8563/**
 8564 *	dev_set_allmulti	- update allmulti count on a device
 8565 *	@dev: device
 8566 *	@inc: modifier
 8567 *
 8568 *	Add or remove reception of all multicast frames to a device. While the
 8569 *	count in the device remains above zero the interface remains listening
 8570 *	to all interfaces. Once it hits zero the device reverts back to normal
 8571 *	filtering operation. A negative @inc value is used to drop the counter
 8572 *	when releasing a resource needing all multicasts.
 8573 *	Return 0 if successful or a negative errno code on error.
 8574 */
 8575
 8576int dev_set_allmulti(struct net_device *dev, int inc)
 8577{
 8578	return __dev_set_allmulti(dev, inc, true);
 8579}
 8580EXPORT_SYMBOL(dev_set_allmulti);
 8581
 8582/*
 8583 *	Upload unicast and multicast address lists to device and
 8584 *	configure RX filtering. When the device doesn't support unicast
 8585 *	filtering it is put in promiscuous mode while unicast addresses
 8586 *	are present.
 8587 */
 8588void __dev_set_rx_mode(struct net_device *dev)
 8589{
 8590	const struct net_device_ops *ops = dev->netdev_ops;
 8591
 8592	/* dev_open will call this function so the list will stay sane. */
 8593	if (!(dev->flags&IFF_UP))
 8594		return;
 8595
 8596	if (!netif_device_present(dev))
 8597		return;
 8598
 8599	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8600		/* Unicast addresses changes may only happen under the rtnl,
 8601		 * therefore calling __dev_set_promiscuity here is safe.
 8602		 */
 8603		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8604			__dev_set_promiscuity(dev, 1, false);
 8605			dev->uc_promisc = true;
 8606		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8607			__dev_set_promiscuity(dev, -1, false);
 8608			dev->uc_promisc = false;
 8609		}
 8610	}
 8611
 8612	if (ops->ndo_set_rx_mode)
 8613		ops->ndo_set_rx_mode(dev);
 8614}
 8615
 8616void dev_set_rx_mode(struct net_device *dev)
 8617{
 8618	netif_addr_lock_bh(dev);
 8619	__dev_set_rx_mode(dev);
 8620	netif_addr_unlock_bh(dev);
 8621}
 8622
 8623/**
 8624 *	dev_get_flags - get flags reported to userspace
 8625 *	@dev: device
 8626 *
 8627 *	Get the combination of flag bits exported through APIs to userspace.
 8628 */
 8629unsigned int dev_get_flags(const struct net_device *dev)
 8630{
 8631	unsigned int flags;
 8632
 8633	flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
 8634				IFF_ALLMULTI |
 8635				IFF_RUNNING |
 8636				IFF_LOWER_UP |
 8637				IFF_DORMANT)) |
 8638		(READ_ONCE(dev->gflags) & (IFF_PROMISC |
 8639				IFF_ALLMULTI));
 8640
 8641	if (netif_running(dev)) {
 8642		if (netif_oper_up(dev))
 8643			flags |= IFF_RUNNING;
 8644		if (netif_carrier_ok(dev))
 8645			flags |= IFF_LOWER_UP;
 8646		if (netif_dormant(dev))
 8647			flags |= IFF_DORMANT;
 8648	}
 8649
 8650	return flags;
 8651}
 8652EXPORT_SYMBOL(dev_get_flags);
 8653
 8654int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8655		       struct netlink_ext_ack *extack)
 8656{
 8657	unsigned int old_flags = dev->flags;
 8658	int ret;
 8659
 8660	ASSERT_RTNL();
 8661
 8662	/*
 8663	 *	Set the flags on our device.
 8664	 */
 8665
 8666	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8667			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8668			       IFF_AUTOMEDIA)) |
 8669		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8670				    IFF_ALLMULTI));
 8671
 8672	/*
 8673	 *	Load in the correct multicast list now the flags have changed.
 8674	 */
 8675
 8676	if ((old_flags ^ flags) & IFF_MULTICAST)
 8677		dev_change_rx_flags(dev, IFF_MULTICAST);
 8678
 8679	dev_set_rx_mode(dev);
 8680
 8681	/*
 8682	 *	Have we downed the interface. We handle IFF_UP ourselves
 8683	 *	according to user attempts to set it, rather than blindly
 8684	 *	setting it.
 8685	 */
 8686
 8687	ret = 0;
 8688	if ((old_flags ^ flags) & IFF_UP) {
 8689		if (old_flags & IFF_UP)
 8690			__dev_close(dev);
 8691		else
 8692			ret = __dev_open(dev, extack);
 8693	}
 8694
 8695	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 8696		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 8697		unsigned int old_flags = dev->flags;
 8698
 8699		dev->gflags ^= IFF_PROMISC;
 8700
 8701		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 8702			if (dev->flags != old_flags)
 8703				dev_set_rx_mode(dev);
 8704	}
 8705
 8706	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 8707	 * is important. Some (broken) drivers set IFF_PROMISC, when
 8708	 * IFF_ALLMULTI is requested not asking us and not reporting.
 8709	 */
 8710	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 8711		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 8712
 8713		dev->gflags ^= IFF_ALLMULTI;
 8714		__dev_set_allmulti(dev, inc, false);
 8715	}
 8716
 8717	return ret;
 8718}
 8719
 8720void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8721			unsigned int gchanges, u32 portid,
 8722			const struct nlmsghdr *nlh)
 8723{
 8724	unsigned int changes = dev->flags ^ old_flags;
 8725
 8726	if (gchanges)
 8727		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
 8728
 8729	if (changes & IFF_UP) {
 8730		if (dev->flags & IFF_UP)
 8731			call_netdevice_notifiers(NETDEV_UP, dev);
 8732		else
 8733			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8734	}
 8735
 8736	if (dev->flags & IFF_UP &&
 8737	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8738		struct netdev_notifier_change_info change_info = {
 8739			.info = {
 8740				.dev = dev,
 8741			},
 8742			.flags_changed = changes,
 8743		};
 8744
 8745		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 
 
 8746	}
 8747}
 8748
 8749/**
 8750 *	dev_change_flags - change device settings
 8751 *	@dev: device
 8752 *	@flags: device state flags
 8753 *	@extack: netlink extended ack
 8754 *
 8755 *	Change settings on device based state flags. The flags are
 8756 *	in the userspace exported format.
 8757 */
 8758int dev_change_flags(struct net_device *dev, unsigned int flags,
 8759		     struct netlink_ext_ack *extack)
 8760{
 8761	int ret;
 8762	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8763
 8764	ret = __dev_change_flags(dev, flags, extack);
 8765	if (ret < 0)
 8766		return ret;
 8767
 8768	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8769	__dev_notify_flags(dev, old_flags, changes, 0, NULL);
 8770	return ret;
 8771}
 8772EXPORT_SYMBOL(dev_change_flags);
 8773
 8774int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8775{
 8776	const struct net_device_ops *ops = dev->netdev_ops;
 8777
 8778	if (ops->ndo_change_mtu)
 8779		return ops->ndo_change_mtu(dev, new_mtu);
 8780
 8781	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8782	WRITE_ONCE(dev->mtu, new_mtu);
 8783	return 0;
 8784}
 8785EXPORT_SYMBOL(__dev_set_mtu);
 8786
 8787int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8788		     struct netlink_ext_ack *extack)
 8789{
 8790	/* MTU must be positive, and in range */
 8791	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8792		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8793		return -EINVAL;
 8794	}
 8795
 8796	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8797		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8798		return -EINVAL;
 8799	}
 8800	return 0;
 8801}
 8802
 8803/**
 8804 *	dev_set_mtu_ext - Change maximum transfer unit
 8805 *	@dev: device
 8806 *	@new_mtu: new transfer unit
 8807 *	@extack: netlink extended ack
 8808 *
 8809 *	Change the maximum transfer size of the network device.
 8810 */
 8811int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8812		    struct netlink_ext_ack *extack)
 8813{
 8814	int err, orig_mtu;
 8815
 8816	if (new_mtu == dev->mtu)
 8817		return 0;
 8818
 8819	err = dev_validate_mtu(dev, new_mtu, extack);
 8820	if (err)
 8821		return err;
 
 
 
 
 
 
 
 
 
 8822
 8823	if (!netif_device_present(dev))
 8824		return -ENODEV;
 8825
 8826	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8827	err = notifier_to_errno(err);
 8828	if (err)
 8829		return err;
 8830
 8831	orig_mtu = dev->mtu;
 8832	err = __dev_set_mtu(dev, new_mtu);
 8833
 8834	if (!err) {
 8835		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8836						   orig_mtu);
 8837		err = notifier_to_errno(err);
 8838		if (err) {
 8839			/* setting mtu back and notifying everyone again,
 8840			 * so that they have a chance to revert changes.
 8841			 */
 8842			__dev_set_mtu(dev, orig_mtu);
 8843			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8844						     new_mtu);
 8845		}
 8846	}
 8847	return err;
 8848}
 8849
 8850int dev_set_mtu(struct net_device *dev, int new_mtu)
 8851{
 8852	struct netlink_ext_ack extack;
 8853	int err;
 8854
 8855	memset(&extack, 0, sizeof(extack));
 8856	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8857	if (err && extack._msg)
 8858		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8859	return err;
 8860}
 8861EXPORT_SYMBOL(dev_set_mtu);
 8862
 8863/**
 8864 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8865 *	@dev: device
 8866 *	@new_len: new tx queue length
 8867 */
 8868int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8869{
 8870	unsigned int orig_len = dev->tx_queue_len;
 8871	int res;
 8872
 8873	if (new_len != (unsigned int)new_len)
 8874		return -ERANGE;
 8875
 8876	if (new_len != orig_len) {
 8877		dev->tx_queue_len = new_len;
 8878		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8879		res = notifier_to_errno(res);
 8880		if (res)
 8881			goto err_rollback;
 8882		res = dev_qdisc_change_tx_queue_len(dev);
 8883		if (res)
 8884			goto err_rollback;
 8885	}
 8886
 8887	return 0;
 8888
 8889err_rollback:
 8890	netdev_err(dev, "refused to change device tx_queue_len\n");
 8891	dev->tx_queue_len = orig_len;
 8892	return res;
 8893}
 8894
 8895/**
 8896 *	dev_set_group - Change group this device belongs to
 8897 *	@dev: device
 8898 *	@new_group: group this device should belong to
 8899 */
 8900void dev_set_group(struct net_device *dev, int new_group)
 8901{
 8902	dev->group = new_group;
 8903}
 8904
 8905/**
 8906 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8907 *	@dev: device
 8908 *	@addr: new address
 8909 *	@extack: netlink extended ack
 8910 */
 8911int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8912			      struct netlink_ext_ack *extack)
 8913{
 8914	struct netdev_notifier_pre_changeaddr_info info = {
 8915		.info.dev = dev,
 8916		.info.extack = extack,
 8917		.dev_addr = addr,
 8918	};
 8919	int rc;
 8920
 8921	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8922	return notifier_to_errno(rc);
 8923}
 8924EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8925
 8926/**
 8927 *	dev_set_mac_address - Change Media Access Control Address
 8928 *	@dev: device
 8929 *	@sa: new address
 8930 *	@extack: netlink extended ack
 8931 *
 8932 *	Change the hardware (MAC) address of the device
 8933 */
 8934int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8935			struct netlink_ext_ack *extack)
 8936{
 8937	const struct net_device_ops *ops = dev->netdev_ops;
 8938	int err;
 8939
 8940	if (!ops->ndo_set_mac_address)
 8941		return -EOPNOTSUPP;
 8942	if (sa->sa_family != dev->type)
 8943		return -EINVAL;
 8944	if (!netif_device_present(dev))
 8945		return -ENODEV;
 8946	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8947	if (err)
 8948		return err;
 8949	if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
 8950		err = ops->ndo_set_mac_address(dev, sa);
 8951		if (err)
 8952			return err;
 8953	}
 8954	dev->addr_assign_type = NET_ADDR_SET;
 8955	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8956	add_device_randomness(dev->dev_addr, dev->addr_len);
 8957	return 0;
 8958}
 8959EXPORT_SYMBOL(dev_set_mac_address);
 8960
 8961DECLARE_RWSEM(dev_addr_sem);
 8962
 8963int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 8964			     struct netlink_ext_ack *extack)
 8965{
 8966	int ret;
 8967
 8968	down_write(&dev_addr_sem);
 8969	ret = dev_set_mac_address(dev, sa, extack);
 8970	up_write(&dev_addr_sem);
 8971	return ret;
 8972}
 8973EXPORT_SYMBOL(dev_set_mac_address_user);
 8974
 8975int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 8976{
 8977	size_t size = sizeof(sa->sa_data_min);
 8978	struct net_device *dev;
 8979	int ret = 0;
 8980
 8981	down_read(&dev_addr_sem);
 8982	rcu_read_lock();
 8983
 8984	dev = dev_get_by_name_rcu(net, dev_name);
 8985	if (!dev) {
 8986		ret = -ENODEV;
 8987		goto unlock;
 8988	}
 8989	if (!dev->addr_len)
 8990		memset(sa->sa_data, 0, size);
 8991	else
 8992		memcpy(sa->sa_data, dev->dev_addr,
 8993		       min_t(size_t, size, dev->addr_len));
 8994	sa->sa_family = dev->type;
 8995
 8996unlock:
 8997	rcu_read_unlock();
 8998	up_read(&dev_addr_sem);
 8999	return ret;
 9000}
 9001EXPORT_SYMBOL(dev_get_mac_address);
 9002
 9003/**
 9004 *	dev_change_carrier - Change device carrier
 9005 *	@dev: device
 9006 *	@new_carrier: new value
 9007 *
 9008 *	Change device carrier
 9009 */
 9010int dev_change_carrier(struct net_device *dev, bool new_carrier)
 9011{
 9012	const struct net_device_ops *ops = dev->netdev_ops;
 9013
 9014	if (!ops->ndo_change_carrier)
 9015		return -EOPNOTSUPP;
 9016	if (!netif_device_present(dev))
 9017		return -ENODEV;
 9018	return ops->ndo_change_carrier(dev, new_carrier);
 9019}
 
 9020
 9021/**
 9022 *	dev_get_phys_port_id - Get device physical port ID
 9023 *	@dev: device
 9024 *	@ppid: port ID
 9025 *
 9026 *	Get device physical port ID
 9027 */
 9028int dev_get_phys_port_id(struct net_device *dev,
 9029			 struct netdev_phys_item_id *ppid)
 9030{
 9031	const struct net_device_ops *ops = dev->netdev_ops;
 9032
 9033	if (!ops->ndo_get_phys_port_id)
 9034		return -EOPNOTSUPP;
 9035	return ops->ndo_get_phys_port_id(dev, ppid);
 9036}
 
 9037
 9038/**
 9039 *	dev_get_phys_port_name - Get device physical port name
 9040 *	@dev: device
 9041 *	@name: port name
 9042 *	@len: limit of bytes to copy to name
 9043 *
 9044 *	Get device physical port name
 9045 */
 9046int dev_get_phys_port_name(struct net_device *dev,
 9047			   char *name, size_t len)
 9048{
 9049	const struct net_device_ops *ops = dev->netdev_ops;
 9050	int err;
 9051
 9052	if (ops->ndo_get_phys_port_name) {
 9053		err = ops->ndo_get_phys_port_name(dev, name, len);
 9054		if (err != -EOPNOTSUPP)
 9055			return err;
 9056	}
 9057	return devlink_compat_phys_port_name_get(dev, name, len);
 9058}
 9059
 9060/**
 9061 *	dev_get_port_parent_id - Get the device's port parent identifier
 9062 *	@dev: network device
 9063 *	@ppid: pointer to a storage for the port's parent identifier
 9064 *	@recurse: allow/disallow recursion to lower devices
 9065 *
 9066 *	Get the devices's port parent identifier
 9067 */
 9068int dev_get_port_parent_id(struct net_device *dev,
 9069			   struct netdev_phys_item_id *ppid,
 9070			   bool recurse)
 9071{
 9072	const struct net_device_ops *ops = dev->netdev_ops;
 9073	struct netdev_phys_item_id first = { };
 9074	struct net_device *lower_dev;
 9075	struct list_head *iter;
 9076	int err;
 9077
 9078	if (ops->ndo_get_port_parent_id) {
 9079		err = ops->ndo_get_port_parent_id(dev, ppid);
 9080		if (err != -EOPNOTSUPP)
 9081			return err;
 9082	}
 9083
 9084	err = devlink_compat_switch_id_get(dev, ppid);
 9085	if (!recurse || err != -EOPNOTSUPP)
 9086		return err;
 9087
 9088	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 9089		err = dev_get_port_parent_id(lower_dev, ppid, true);
 9090		if (err)
 9091			break;
 9092		if (!first.id_len)
 9093			first = *ppid;
 9094		else if (memcmp(&first, ppid, sizeof(*ppid)))
 9095			return -EOPNOTSUPP;
 9096	}
 9097
 9098	return err;
 9099}
 9100EXPORT_SYMBOL(dev_get_port_parent_id);
 9101
 9102/**
 9103 *	netdev_port_same_parent_id - Indicate if two network devices have
 9104 *	the same port parent identifier
 9105 *	@a: first network device
 9106 *	@b: second network device
 9107 */
 9108bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 9109{
 9110	struct netdev_phys_item_id a_id = { };
 9111	struct netdev_phys_item_id b_id = { };
 9112
 9113	if (dev_get_port_parent_id(a, &a_id, true) ||
 9114	    dev_get_port_parent_id(b, &b_id, true))
 9115		return false;
 9116
 9117	return netdev_phys_item_id_same(&a_id, &b_id);
 9118}
 9119EXPORT_SYMBOL(netdev_port_same_parent_id);
 9120
 9121/**
 9122 *	dev_change_proto_down - set carrier according to proto_down.
 9123 *
 9124 *	@dev: device
 9125 *	@proto_down: new value
 
 
 
 9126 */
 9127int dev_change_proto_down(struct net_device *dev, bool proto_down)
 9128{
 9129	if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
 
 
 9130		return -EOPNOTSUPP;
 9131	if (!netif_device_present(dev))
 9132		return -ENODEV;
 9133	if (proto_down)
 9134		netif_carrier_off(dev);
 9135	else
 9136		netif_carrier_on(dev);
 9137	dev->proto_down = proto_down;
 9138	return 0;
 9139}
 
 9140
 9141/**
 9142 *	dev_change_proto_down_reason - proto down reason
 9143 *
 9144 *	@dev: device
 9145 *	@mask: proto down mask
 9146 *	@value: proto down value
 
 
 9147 */
 9148void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 9149				  u32 value)
 9150{
 9151	int b;
 9152
 9153	if (!mask) {
 9154		dev->proto_down_reason = value;
 9155	} else {
 9156		for_each_set_bit(b, &mask, 32) {
 9157			if (value & (1 << b))
 9158				dev->proto_down_reason |= BIT(b);
 9159			else
 9160				dev->proto_down_reason &= ~BIT(b);
 9161		}
 9162	}
 9163}
 9164
/* State of an XDP attachment that was made through a BPF link. A pointer to
 * one of these is kept per XDP mode in dev->xdp_state[mode].link.
 */
struct bpf_xdp_link {
	/* Embedded generic link; link.prog is the program reported by
	 * dev_xdp_prog() for a link-based attachment.
	 */
	struct bpf_link link;
	/* Reset to NULL by dev_xdp_uninstall() when the link is auto-detached. */
	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
	/* NOTE(review): presumably the XDP_FLAGS_* given at attach time - confirm. */
	int flags;
};
 9170
 9171static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 9172{
 9173	if (flags & XDP_FLAGS_HW_MODE)
 9174		return XDP_MODE_HW;
 9175	if (flags & XDP_FLAGS_DRV_MODE)
 9176		return XDP_MODE_DRV;
 9177	if (flags & XDP_FLAGS_SKB_MODE)
 9178		return XDP_MODE_SKB;
 9179	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 9180}
 9181
 9182static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 9183{
 9184	switch (mode) {
 9185	case XDP_MODE_SKB:
 9186		return generic_xdp_install;
 9187	case XDP_MODE_DRV:
 9188	case XDP_MODE_HW:
 9189		return dev->netdev_ops->ndo_bpf;
 9190	default:
 9191		return NULL;
 9192	}
 9193}
 9194
 9195static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
 9196					 enum bpf_xdp_mode mode)
 9197{
 9198	return dev->xdp_state[mode].link;
 9199}
 9200
 9201static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 9202				     enum bpf_xdp_mode mode)
 9203{
 9204	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 9205
 9206	if (link)
 9207		return link->link.prog;
 9208	return dev->xdp_state[mode].prog;
 9209}
 9210
 9211u8 dev_xdp_prog_count(struct net_device *dev)
 9212{
 9213	u8 count = 0;
 9214	int i;
 9215
 9216	for (i = 0; i < __MAX_XDP_MODE; i++)
 9217		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
 9218			count++;
 9219	return count;
 9220}
 9221EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
 9222
 9223u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 9224{
 9225	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 9226
 9227	return prog ? prog->aux->id : 0;
 9228}
 9229
 9230static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
 9231			     struct bpf_xdp_link *link)
 9232{
 9233	dev->xdp_state[mode].link = link;
 9234	dev->xdp_state[mode].prog = NULL;
 9235}
 9236
 9237static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 9238			     struct bpf_prog *prog)
 9239{
 9240	dev->xdp_state[mode].link = NULL;
 9241	dev->xdp_state[mode].prog = prog;
 9242}
 9243
 9244static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 9245			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
 9246			   u32 flags, struct bpf_prog *prog)
 9247{
 9248	struct netdev_bpf xdp;
 9249	int err;
 9250
 9251	memset(&xdp, 0, sizeof(xdp));
 9252	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 9253	xdp.extack = extack;
 9254	xdp.flags = flags;
 9255	xdp.prog = prog;
 9256
 9257	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
 9258	 * "moved" into driver), so they don't increment it on their own, but
 9259	 * they do decrement refcnt when program is detached or replaced.
 9260	 * Given net_device also owns link/prog, we need to bump refcnt here
 9261	 * to prevent drivers from underflowing it.
 9262	 */
 9263	if (prog)
 9264		bpf_prog_inc(prog);
 9265	err = bpf_op(dev, &xdp);
 9266	if (err) {
 9267		if (prog)
 9268			bpf_prog_put(prog);
 9269		return err;
 9270	}
 9271
 9272	if (mode != XDP_MODE_HW)
 9273		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
 9274
 9275	return 0;
 9276}
 9277
 9278static void dev_xdp_uninstall(struct net_device *dev)
 9279{
 9280	struct bpf_xdp_link *link;
 9281	struct bpf_prog *prog;
 9282	enum bpf_xdp_mode mode;
 9283	bpf_op_t bpf_op;
 9284
 9285	ASSERT_RTNL();
 9286
 9287	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
 9288		prog = dev_xdp_prog(dev, mode);
 9289		if (!prog)
 9290			continue;
 9291
 9292		bpf_op = dev_xdp_bpf_op(dev, mode);
 9293		if (!bpf_op)
 9294			continue;
 9295
 9296		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9297
 9298		/* auto-detach link from net device */
 9299		link = dev_xdp_link(dev, mode);
 9300		if (link)
 9301			link->dev = NULL;
 9302		else
 9303			bpf_prog_put(prog);
 9304
 9305		dev_xdp_set_link(dev, mode, NULL);
 9306	}
 9307}
 9308
/*
 * dev_xdp_attach - validate and apply an XDP program or BPF link change
 * @dev: net device the change applies to
 * @extack: netlink extended ack that receives the error message on failure
 * @link: BPF XDP link to attach, or NULL for plain program attachment
 * @new_prog: program to make active; ignored (must be NULL) when @link is set
 * @old_prog: program the caller expects to be active; requires
 *	XDP_FLAGS_REPLACE in @flags
 * @flags: XDP_FLAGS_* bits; at most one mode bit may be set
 *
 * Must be called with RTNL held. The checks run in a fixed order and the
 * first one that fails decides the errno and the extack message. The driver
 * callback is only invoked when the effective program differs from the one
 * currently recorded for the selected mode.
 *
 * On success the link or @new_prog is recorded for the mode and the
 * reference on the previously recorded program (if any) is dropped.
 *
 * Return: 0 on success, negative errno otherwise.
 */
 9309static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
 9310			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
 9311			  struct bpf_prog *old_prog, u32 flags)
 9312{
 9313	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
 9314	struct bpf_prog *cur_prog;
 9315	struct net_device *upper;
 9316	struct list_head *iter;
 9317	enum bpf_xdp_mode mode;
 9318	bpf_op_t bpf_op;
 9319	int err;
 9320
 9321	ASSERT_RTNL();
 9322
 9323	/* either link or prog attachment, never both */
 9324	if (link && (new_prog || old_prog))
 9325		return -EINVAL;
 9326	/* link supports only XDP mode flags */
 9327	if (link && (flags & ~XDP_FLAGS_MODES)) {
 9328		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
 9329		return -EINVAL;
 9330	}
 9331	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
 9332	if (num_modes > 1) {
 9333		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
 9334		return -EINVAL;
 9335	}
 9336	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
 9337	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
 9338		NL_SET_ERR_MSG(extack,
 9339			       "More than one program loaded, unset mode is ambiguous");
 9340		return -EINVAL;
 9341	}
 9342	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
 9343	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
 9344		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
 9345		return -EINVAL;
 9346	}
 9347
	/* every check from here on is made against this single resolved mode */
 9348	mode = dev_xdp_mode(dev, flags);
 9349	/* can't replace attached link */
 9350	if (dev_xdp_link(dev, mode)) {
 9351		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
 9352		return -EBUSY;
 9353	}
 9354
 9355	/* don't allow if an upper device already has a program */
 9356	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
 9357		if (dev_xdp_prog_count(upper) > 0) {
 9358			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
 9359			return -EEXIST;
 9360		}
 9361	}
 9362
 9363	cur_prog = dev_xdp_prog(dev, mode);
 9364	/* can't replace attached prog with link */
 9365	if (link && cur_prog) {
 9366		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
 9367		return -EBUSY;
 9368	}
	/* with REPLACE the caller's expectation must match, including NULL */
 9369	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
 9370		NL_SET_ERR_MSG(extack, "Active program does not match expected");
 9371		return -EEXIST;
 9372	}
 9373
 9374	/* put effective new program into new_prog */
 9375	if (link)
 9376		new_prog = link->link.prog;
 9377
	/* the checks below only apply when a program is being installed */
 9378	if (new_prog) {
 9379		bool offload = mode == XDP_MODE_HW;
 9380		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
 9381					       ? XDP_MODE_DRV : XDP_MODE_SKB;
 9382
 9383		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
 9384			NL_SET_ERR_MSG(extack, "XDP program already attached");
 9385			return -EBUSY;
 9386		}
 9387		if (!offload && dev_xdp_prog(dev, other_mode)) {
 9388			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
 9389			return -EEXIST;
 9390		}
 9391		if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
 9392			NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
 9393			return -EINVAL;
 9394		}
 9395		if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
 9396			NL_SET_ERR_MSG(extack, "Program bound to different device");
 9397			return -EINVAL;
 9398		}
 9399		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 9400			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 9401			return -EINVAL;
 9402		}
 9403		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
 9404			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
 9405			return -EINVAL;
 9406		}
 9407	}
 9408
 9409	/* don't call drivers if the effective program didn't change */
 9410	if (new_prog != cur_prog) {
 9411		bpf_op = dev_xdp_bpf_op(dev, mode);
 9412		if (!bpf_op) {
 9413			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
 9414			return -EOPNOTSUPP;
 9415		}
 9416
 9417		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
 9418		if (err)
 9419			return err;
 9420	}
 9421
	/* commit: record the new attachment for this mode, then drop the
	 * reference on the program that was recorded before (if any)
	 */
 9422	if (link)
 9423		dev_xdp_set_link(dev, mode, link);
 9424	else
 9425		dev_xdp_set_prog(dev, mode, new_prog);
 9426	if (cur_prog)
 9427		bpf_prog_put(cur_prog);
 9428
 9429	return 0;
 9430}
 9431
 9432static int dev_xdp_attach_link(struct net_device *dev,
 9433			       struct netlink_ext_ack *extack,
 9434			       struct bpf_xdp_link *link)
 9435{
 9436	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
 9437}
 9438
 9439static int dev_xdp_detach_link(struct net_device *dev,
 9440			       struct netlink_ext_ack *extack,
 9441			       struct bpf_xdp_link *link)
 9442{
 9443	enum bpf_xdp_mode mode;
 9444	bpf_op_t bpf_op;
 9445
 9446	ASSERT_RTNL();
 9447
 9448	mode = dev_xdp_mode(dev, link->flags);
 9449	if (dev_xdp_link(dev, mode) != link)
 9450		return -EINVAL;
 9451
 9452	bpf_op = dev_xdp_bpf_op(dev, mode);
 9453	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9454	dev_xdp_set_link(dev, mode, NULL);
 9455	return 0;
 9456}
 
 9457
 9458static void bpf_xdp_link_release(struct bpf_link *link)
 
 
 
 
 
 
 
 
 9459{
 9460	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9461
 9462	rtnl_lock();
 9463
 9464	/* if racing with net_device's tear down, xdp_link->dev might be
 9465	 * already NULL, in which case link was already auto-detached
 9466	 */
 9467	if (xdp_link->dev) {
 9468		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
 9469		xdp_link->dev = NULL;
 9470	}
 9471
 9472	rtnl_unlock();
 9473}
 9474
 9475static int bpf_xdp_link_detach(struct bpf_link *link)
 9476{
 9477	bpf_xdp_link_release(link);
 9478	return 0;
 9479}
 9480
 9481static void bpf_xdp_link_dealloc(struct bpf_link *link)
 9482{
 9483	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9484
 9485	kfree(xdp_link);
 9486}
 9487
 9488static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9489				     struct seq_file *seq)
 9490{
 9491	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9492	u32 ifindex = 0;
 9493
 9494	rtnl_lock();
 9495	if (xdp_link->dev)
 9496		ifindex = xdp_link->dev->ifindex;
 9497	rtnl_unlock();
 9498
 9499	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9500}
 9501
 9502static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9503				       struct bpf_link_info *info)
 9504{
 9505	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9506	u32 ifindex = 0;
 9507
 9508	rtnl_lock();
 9509	if (xdp_link->dev)
 9510		ifindex = xdp_link->dev->ifindex;
 9511	rtnl_unlock();
 9512
 9513	info->xdp.ifindex = ifindex;
 9514	return 0;
 9515}
 9516
 9517static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 9518			       struct bpf_prog *old_prog)
 9519{
 9520	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9521	enum bpf_xdp_mode mode;
 9522	bpf_op_t bpf_op;
 9523	int err = 0;
 9524
 9525	rtnl_lock();
 9526
 9527	/* link might have been auto-released already, so fail */
 9528	if (!xdp_link->dev) {
 9529		err = -ENOLINK;
 9530		goto out_unlock;
 9531	}
 9532
 9533	if (old_prog && link->prog != old_prog) {
 9534		err = -EPERM;
 9535		goto out_unlock;
 9536	}
 9537	old_prog = link->prog;
 9538	if (old_prog->type != new_prog->type ||
 9539	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
 9540		err = -EINVAL;
 9541		goto out_unlock;
 9542	}
 9543
 9544	if (old_prog == new_prog) {
 9545		/* no-op, don't disturb drivers */
 9546		bpf_prog_put(new_prog);
 9547		goto out_unlock;
 9548	}
 9549
 9550	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
 9551	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
 9552	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
 9553			      xdp_link->flags, new_prog);
 9554	if (err)
 9555		goto out_unlock;
 9556
 9557	old_prog = xchg(&link->prog, new_prog);
 9558	bpf_prog_put(old_prog);
 9559
 9560out_unlock:
 9561	rtnl_unlock();
 9562	return err;
 9563}
 9564
 9565static const struct bpf_link_ops bpf_xdp_link_lops = {
 9566	.release = bpf_xdp_link_release,
 9567	.dealloc = bpf_xdp_link_dealloc,
 9568	.detach = bpf_xdp_link_detach,
 9569	.show_fdinfo = bpf_xdp_link_show_fdinfo,
 9570	.fill_link_info = bpf_xdp_link_fill_link_info,
 9571	.update_prog = bpf_xdp_link_update,
 9572};
 9573
 9574int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 9575{
 9576	struct net *net = current->nsproxy->net_ns;
 9577	struct bpf_link_primer link_primer;
 9578	struct netlink_ext_ack extack = {};
 9579	struct bpf_xdp_link *link;
 9580	struct net_device *dev;
 9581	int err, fd;
 9582
 9583	rtnl_lock();
 9584	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
 9585	if (!dev) {
 9586		rtnl_unlock();
 9587		return -EINVAL;
 9588	}
 
 
 9589
 9590	link = kzalloc(sizeof(*link), GFP_USER);
 9591	if (!link) {
 9592		err = -ENOMEM;
 9593		goto unlock;
 
 
 9594	}
 9595
 9596	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
 9597	link->dev = dev;
 9598	link->flags = attr->link_create.flags;
 9599
 9600	err = bpf_link_prime(&link->link, &link_primer);
 9601	if (err) {
 9602		kfree(link);
 9603		goto unlock;
 9604	}
 9605
 9606	err = dev_xdp_attach_link(dev, &extack, link);
 9607	rtnl_unlock();
 
 9608
 9609	if (err) {
 9610		link->dev = NULL;
 9611		bpf_link_cleanup(&link_primer);
 9612		trace_bpf_xdp_link_attach_failed(extack._msg);
 9613		goto out_put_dev;
 9614	}
 
 9615
 9616	fd = bpf_link_settle(&link_primer);
 9617	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
 9618	dev_put(dev);
 9619	return fd;
 9620
 9621unlock:
 9622	rtnl_unlock();
 9623
 9624out_put_dev:
 9625	dev_put(dev);
 9626	return err;
 9627}
 9628
 9629/**
 9630 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 9631 *	@dev: device
 9632 *	@extack: netlink extended ack
 9633 *	@fd: new program fd or negative value to clear
 9634 *	@expected_fd: old program fd that userspace expects to replace or clear
 9635 *	@flags: xdp-related flags
 9636 *
 9637 *	Set or clear a bpf program for a device
 9638 */
 9639int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 9640		      int fd, int expected_fd, u32 flags)
 9641{
 9642	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 9643	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 9644	int err;
 9645
 9646	ASSERT_RTNL();
 
 
 
 9647
 9648	if (fd >= 0) {
 9649		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 9650						 mode != XDP_MODE_SKB);
 9651		if (IS_ERR(new_prog))
 9652			return PTR_ERR(new_prog);
 9653	}
 9654
 9655	if (expected_fd >= 0) {
 9656		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
 9657						 mode != XDP_MODE_SKB);
 9658		if (IS_ERR(old_prog)) {
 9659			err = PTR_ERR(old_prog);
 9660			old_prog = NULL;
 9661			goto err_out;
 9662		}
 9663	}
 9664
 9665	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
 
 9666
 9667err_out:
 9668	if (err && new_prog)
 9669		bpf_prog_put(new_prog);
 9670	if (old_prog)
 9671		bpf_prog_put(old_prog);
 9672	return err;
 9673}
 9674
 9675/**
 9676 * dev_index_reserve() - allocate an ifindex in a namespace
 9677 * @net: the applicable net namespace
 9678 * @ifindex: requested ifindex, pass %0 to get one allocated
 9679 *
 9680 * Allocate a ifindex for a new device. Caller must either use the ifindex
 9681 * to store the device (via list_netdevice()) or call dev_index_release()
 9682 * to give the index up.
 9683 *
 9684 * Return: a suitable unique value for a new device interface number or -errno.
 9685 */
 9686static int dev_index_reserve(struct net *net, u32 ifindex)
 9687{
 9688	int err;
 9689
 9690	if (ifindex > INT_MAX) {
 9691		DEBUG_NET_WARN_ON_ONCE(1);
 9692		return -EINVAL;
 
 
 
 9693	}
 9694
 9695	if (!ifindex)
 9696		err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
 9697				      xa_limit_31b, &net->ifindex, GFP_KERNEL);
 9698	else
 9699		err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
 9700	if (err < 0)
 9701		return err;
 9702
 9703	return ifindex;
 
 9704}
 9705
 9706static void dev_index_release(struct net *net, int ifindex)
 9707{
 9708	/* Expect only unused indexes, unlist_netdevice() removes the used */
 9709	WARN_ON(xa_erase(&net->dev_by_index, ifindex));
 9710}
 9711
 9712/* Delayed registration/unregisteration */
 9713LIST_HEAD(net_todo_list);
 9714DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 9715atomic_t dev_unreg_count = ATOMIC_INIT(0);
 9716
 9717static void net_set_todo(struct net_device *dev)
 9718{
 9719	list_add_tail(&dev->todo_list, &net_todo_list);
 9720}
 9721
 9722static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9723	struct net_device *upper, netdev_features_t features)
 9724{
 9725	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9726	netdev_features_t feature;
 9727	int feature_bit;
 9728
 9729	for_each_netdev_feature(upper_disables, feature_bit) {
 9730		feature = __NETIF_F_BIT(feature_bit);
 9731		if (!(upper->wanted_features & feature)
 9732		    && (features & feature)) {
 9733			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9734				   &feature, upper->name);
 9735			features &= ~feature;
 9736		}
 9737	}
 9738
 9739	return features;
 9740}
 9741
 9742static void netdev_sync_lower_features(struct net_device *upper,
 9743	struct net_device *lower, netdev_features_t features)
 9744{
 9745	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9746	netdev_features_t feature;
 9747	int feature_bit;
 9748
 9749	for_each_netdev_feature(upper_disables, feature_bit) {
 9750		feature = __NETIF_F_BIT(feature_bit);
 9751		if (!(features & feature) && (lower->features & feature)) {
 9752			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9753				   &feature, lower->name);
 9754			lower->wanted_features &= ~feature;
 9755			__netdev_update_features(lower);
 9756
 9757			if (unlikely(lower->features & feature))
 9758				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9759					    &feature, lower->name);
 9760			else
 9761				netdev_features_change(lower);
 9762		}
 9763	}
 9764}
 9765
 9766static netdev_features_t netdev_fix_features(struct net_device *dev,
 9767	netdev_features_t features)
 9768{
 9769	/* Fix illegal checksum combinations */
 9770	if ((features & NETIF_F_HW_CSUM) &&
 9771	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 9772		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 9773		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 9774	}
 9775
 9776	/* TSO requires that SG is present as well. */
 9777	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 9778		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 9779		features &= ~NETIF_F_ALL_TSO;
 9780	}
 9781
 9782	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 9783					!(features & NETIF_F_IP_CSUM)) {
 9784		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 9785		features &= ~NETIF_F_TSO;
 9786		features &= ~NETIF_F_TSO_ECN;
 9787	}
 9788
 9789	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 9790					 !(features & NETIF_F_IPV6_CSUM)) {
 9791		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 9792		features &= ~NETIF_F_TSO6;
 9793	}
 9794
 9795	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 9796	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 9797		features &= ~NETIF_F_TSO_MANGLEID;
 9798
 9799	/* TSO ECN requires that TSO is present as well. */
 9800	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 9801		features &= ~NETIF_F_TSO_ECN;
 9802
 9803	/* Software GSO depends on SG. */
 9804	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 9805		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 9806		features &= ~NETIF_F_GSO;
 9807	}
 9808
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 9809	/* GSO partial features require GSO partial be set */
 9810	if ((features & dev->gso_partial_features) &&
 9811	    !(features & NETIF_F_GSO_PARTIAL)) {
 9812		netdev_dbg(dev,
 9813			   "Dropping partially supported GSO features since no GSO partial.\n");
 9814		features &= ~dev->gso_partial_features;
 9815	}
 9816
 9817	if (!(features & NETIF_F_RXCSUM)) {
 9818		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 9819		 * successfully merged by hardware must also have the
 9820		 * checksum verified by hardware.  If the user does not
 9821		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 9822		 */
 9823		if (features & NETIF_F_GRO_HW) {
 9824			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 9825			features &= ~NETIF_F_GRO_HW;
 9826		}
 9827	}
 9828
 9829	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 9830	if (features & NETIF_F_RXFCS) {
 9831		if (features & NETIF_F_LRO) {
 9832			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 9833			features &= ~NETIF_F_LRO;
 9834		}
 9835
 9836		if (features & NETIF_F_GRO_HW) {
 9837			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 9838			features &= ~NETIF_F_GRO_HW;
 9839		}
 9840	}
 9841
 9842	if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
 9843		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
 9844		features &= ~NETIF_F_LRO;
 9845	}
 9846
 9847	if (features & NETIF_F_HW_TLS_TX) {
 9848		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
 9849			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
 9850		bool hw_csum = features & NETIF_F_HW_CSUM;
 9851
 9852		if (!ip_csum && !hw_csum) {
 9853			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
 9854			features &= ~NETIF_F_HW_TLS_TX;
 9855		}
 9856	}
 9857
 9858	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
 9859		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
 9860		features &= ~NETIF_F_HW_TLS_RX;
 9861	}
 9862
 9863	return features;
 9864}
 9865
/*
 * __netdev_update_features - recompute and apply a device's feature set
 * @dev: device to update
 *
 * Must be called with RTNL held. Starts from the wanted features, lets the
 * driver's ndo_fix_features() and netdev_fix_features() adjust them, masks
 * out what upper devices have turned off, pushes the result to the driver
 * through ndo_set_features() when it differs from dev->features, and then
 * propagates disabled features to lower devices.
 *
 * Return: 0 when the final status is negative (nothing had to change, or a
 * VLAN filter refresh returned a negative value), 1 when the change was
 * applied with a non-negative status, -1 when ndo_set_features() failed.
 * netdev_update_features() sends a notification for any non-zero result.
 */
 9866int __netdev_update_features(struct net_device *dev)
 9867{
 9868	struct net_device *upper, *lower;
 9869	netdev_features_t features;
 9870	struct list_head *iter;
	/* stays negative on the "nothing changed" path, which skips the
	 * dev->features update below and makes the function return 0
	 */
 9871	int err = -1;
 9872
 9873	ASSERT_RTNL();
 9874
 9875	features = netdev_get_wanted_features(dev);
 9876
 9877	if (dev->netdev_ops->ndo_fix_features)
 9878		features = dev->netdev_ops->ndo_fix_features(dev, features);
 9879
 9880	/* driver might be less strict about feature dependencies */
 9881	features = netdev_fix_features(dev, features);
 9882
 9883	/* some features can't be enabled if they're off on an upper device */
 9884	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 9885		features = netdev_sync_upper_features(dev, upper, features);
 9886
 9887	if (dev->features == features)
 9888		goto sync_lower;
 9889
 9890	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 9891		&dev->features, &features);
 9892
 9893	if (dev->netdev_ops->ndo_set_features)
 9894		err = dev->netdev_ops->ndo_set_features(dev, features);
 9895	else
 9896		err = 0;
 9897
 9898	if (unlikely(err < 0)) {
 9899		netdev_err(dev,
 9900			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 9901			err, &features, &dev->features);
 9902		/* return non-0 since some features might have changed and
 9903		 * it's better to fire a spurious notification than miss it
 9904		 */
 9905		return -1;
 9906	}
 9907
 9908sync_lower:
 9909	/* some features must be disabled on lower devices when disabled
 9910	 * on an upper device (think: bonding master or bridge)
 9911	 */
 9912	netdev_for_each_lower_dev(dev, lower, iter)
 9913		netdev_sync_lower_features(dev, lower, features);
 9914
	/* only when the status is exactly 0 is dev->features written here;
	 * a positive status from ndo_set_features() skips this block
	 */
 9915	if (!err) {
 9916		netdev_features_t diff = features ^ dev->features;
 9917
 9918		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9919			/* udp_tunnel_{get,drop}_rx_info both need
 9920			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 9921			 * device, or they won't do anything.
 9922			 * Thus we need to update dev->features
 9923			 * *before* calling udp_tunnel_get_rx_info,
 9924			 * but *after* calling udp_tunnel_drop_rx_info.
 9925			 */
 9926			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9927				dev->features = features;
 9928				udp_tunnel_get_rx_info(dev);
 9929			} else {
 9930				udp_tunnel_drop_rx_info(dev);
 9931			}
 9932		}
 9933
		/* same ordering as above: features are written before the
		 * "get" call and after the "drop" call
		 */
 9934		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9935			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9936				dev->features = features;
 9937				err |= vlan_get_rx_ctag_filter_info(dev);
 9938			} else {
 9939				vlan_drop_rx_ctag_filter_info(dev);
 9940			}
 9941		}
 9942
 9943		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
 9944			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
 9945				dev->features = features;
 9946				err |= vlan_get_rx_stag_filter_info(dev);
 9947			} else {
 9948				vlan_drop_rx_stag_filter_info(dev);
 9949			}
 9950		}
 9951
 9952		dev->features = features;
 9953	}
 9954
 9955	return err < 0 ? 0 : 1;
 9956}
 9957
 9958/**
 9959 *	netdev_update_features - recalculate device features
 9960 *	@dev: the device to check
 9961 *
 9962 *	Recalculate dev->features set and send notifications if it
 9963 *	has changed. Should be called after driver or hardware dependent
 9964 *	conditions might have changed that influence the features.
 9965 */
 9966void netdev_update_features(struct net_device *dev)
 9967{
 9968	if (__netdev_update_features(dev))
 9969		netdev_features_change(dev);
 9970}
 9971EXPORT_SYMBOL(netdev_update_features);
 9972
 9973/**
 9974 *	netdev_change_features - recalculate device features
 9975 *	@dev: the device to check
 9976 *
 9977 *	Recalculate dev->features set and send notifications even
 9978 *	if they have not changed. Should be called instead of
 9979 *	netdev_update_features() if also dev->vlan_features might
 9980 *	have changed to allow the changes to be propagated to stacked
 9981 *	VLAN devices.
 9982 */
 9983void netdev_change_features(struct net_device *dev)
 9984{
 9985	__netdev_update_features(dev);
 9986	netdev_features_change(dev);
 9987}
 9988EXPORT_SYMBOL(netdev_change_features);
 9989
 9990/**
 9991 *	netif_stacked_transfer_operstate -	transfer operstate
 9992 *	@rootdev: the root or lower level device to transfer state from
 9993 *	@dev: the device to transfer operstate to
 9994 *
 9995 *	Transfer operational state from root to device. This is normally
 9996 *	called when a stacking relationship exists between the root
 9997 *	device and the device(a leaf device).
 9998 */
 9999void netif_stacked_transfer_operstate(const struct net_device *rootdev,
10000					struct net_device *dev)
10001{
10002	if (rootdev->operstate == IF_OPER_DORMANT)
10003		netif_dormant_on(dev);
10004	else
10005		netif_dormant_off(dev);
10006
10007	if (rootdev->operstate == IF_OPER_TESTING)
10008		netif_testing_on(dev);
10009	else
10010		netif_testing_off(dev);
10011
10012	if (netif_carrier_ok(rootdev))
10013		netif_carrier_on(dev);
10014	else
10015		netif_carrier_off(dev);
10016}
10017EXPORT_SYMBOL(netif_stacked_transfer_operstate);
10018
 
10019static int netif_alloc_rx_queues(struct net_device *dev)
10020{
10021	unsigned int i, count = dev->num_rx_queues;
10022	struct netdev_rx_queue *rx;
10023	size_t sz = count * sizeof(*rx);
10024	int err = 0;
10025
10026	BUG_ON(count < 1);
10027
10028	rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10029	if (!rx)
10030		return -ENOMEM;
10031
 
 
10032	dev->_rx = rx;
10033
10034	for (i = 0; i < count; i++) {
10035		rx[i].dev = dev;
10036
10037		/* XDP RX-queue setup */
10038		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
10039		if (err < 0)
10040			goto err_rxq_info;
10041	}
10042	return 0;
10043
10044err_rxq_info:
10045	/* Rollback successful reg's and free other resources */
10046	while (i--)
10047		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10048	kvfree(dev->_rx);
10049	dev->_rx = NULL;
10050	return err;
10051}
10052
10053static void netif_free_rx_queues(struct net_device *dev)
10054{
10055	unsigned int i, count = dev->num_rx_queues;
10056
10057	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
10058	if (!dev->_rx)
10059		return;
10060
10061	for (i = 0; i < count; i++)
10062		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10063
10064	kvfree(dev->_rx);
10065}
 
10066
10067static void netdev_init_one_queue(struct net_device *dev,
10068				  struct netdev_queue *queue, void *_unused)
10069{
10070	/* Initialize queue lock */
10071	spin_lock_init(&queue->_xmit_lock);
10072	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10073	queue->xmit_lock_owner = -1;
10074	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10075	queue->dev = dev;
10076#ifdef CONFIG_BQL
10077	dql_init(&queue->dql, HZ);
10078#endif
10079}
10080
10081static void netif_free_tx_queues(struct net_device *dev)
10082{
10083	kvfree(dev->_tx);
10084}
10085
10086static int netif_alloc_netdev_queues(struct net_device *dev)
10087{
10088	unsigned int count = dev->num_tx_queues;
10089	struct netdev_queue *tx;
10090	size_t sz = count * sizeof(*tx);
10091
10092	if (count < 1 || count > 0xffff)
10093		return -EINVAL;
10094
10095	tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10096	if (!tx)
10097		return -ENOMEM;
10098
 
 
10099	dev->_tx = tx;
10100
10101	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10102	spin_lock_init(&dev->tx_global_lock);
10103
10104	return 0;
10105}
10106
10107void netif_tx_stop_all_queues(struct net_device *dev)
10108{
10109	unsigned int i;
10110
10111	for (i = 0; i < dev->num_tx_queues; i++) {
10112		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10113
10114		netif_tx_stop_queue(txq);
10115	}
10116}
10117EXPORT_SYMBOL(netif_tx_stop_all_queues);
10118
10119static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10120{
10121	void __percpu *v;
10122
10123	/* Drivers implementing ndo_get_peer_dev must support tstat
10124	 * accounting, so that skb_do_redirect() can bump the dev's
10125	 * RX stats upon network namespace switch.
10126	 */
10127	if (dev->netdev_ops->ndo_get_peer_dev &&
10128	    dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10129		return -EOPNOTSUPP;
10130
10131	switch (dev->pcpu_stat_type) {
10132	case NETDEV_PCPU_STAT_NONE:
10133		return 0;
10134	case NETDEV_PCPU_STAT_LSTATS:
10135		v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10136		break;
10137	case NETDEV_PCPU_STAT_TSTATS:
10138		v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10139		break;
10140	case NETDEV_PCPU_STAT_DSTATS:
10141		v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10142		break;
10143	default:
10144		return -EINVAL;
10145	}
10146
10147	return v ? 0 : -ENOMEM;
10148}
10149
10150static void netdev_do_free_pcpu_stats(struct net_device *dev)
10151{
10152	switch (dev->pcpu_stat_type) {
10153	case NETDEV_PCPU_STAT_NONE:
10154		return;
10155	case NETDEV_PCPU_STAT_LSTATS:
10156		free_percpu(dev->lstats);
10157		break;
10158	case NETDEV_PCPU_STAT_TSTATS:
10159		free_percpu(dev->tstats);
10160		break;
10161	case NETDEV_PCPU_STAT_DSTATS:
10162		free_percpu(dev->dstats);
10163		break;
10164	}
10165}
10166
10167/**
10168 * register_netdevice() - register a network device
10169 * @dev: device to register
10170 *
10171 * Take a prepared network device structure and make it externally accessible.
10172 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
10173 * Callers must hold the rtnl lock - you may want register_netdev()
10174 * instead of this.
 *
 * Failures before the device is listed are unwound in reverse order through
 * the error labels at the end of the function. If the %NETDEV_REGISTER
 * notifier chain itself fails, the device is instead queued for
 * unregistration and needs_free_netdev is cleared.
 *
 * Return: 0 on success, negative errno on failure (a positive ndo_init()
 * result is reported as -EIO).
10175 */

10176int register_netdevice(struct net_device *dev)
10177{
10178	int ret;
10179	struct net *net = dev_net(dev);
10180
10181	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10182		     NETDEV_FEATURE_COUNT);
10183	BUG_ON(dev_boot_phase);
10184	ASSERT_RTNL();
10185
10186	might_sleep();
10187
10188	/* When net_device's are persistent, this will be fatal. */
10189	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10190	BUG_ON(!net);
10191
	/* nothing has been set up yet, so a plain return is enough */
10192	ret = ethtool_check_ops(dev->ethtool_ops);
10193	if (ret)
10194		return ret;
10195
10196	spin_lock_init(&dev->addr_list_lock);
10197	netdev_set_addr_lockdep_class(dev);
10198
10199	ret = dev_get_valid_name(net, dev, dev->name);
10200	if (ret < 0)
10201		goto out;
10202
10203	ret = -ENOMEM;
10204	dev->name_node = netdev_name_node_head_alloc(dev);
10205	if (!dev->name_node)
10206		goto out;
10207
10208	/* Init, if this function is available */
10209	if (dev->netdev_ops->ndo_init) {
10210		ret = dev->netdev_ops->ndo_init(dev);
10211		if (ret) {
			/* callers only ever see a negative errno */
10212			if (ret > 0)
10213				ret = -EIO;
10214			goto err_free_name;
10215		}
10216	}
10217
	/* advertising CTAG filtering requires both VLAN add/kill callbacks */
10218	if (((dev->hw_features | dev->features) &
10219	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
10220	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10221	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10222		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10223		ret = -EINVAL;
10224		goto err_uninit;
10225	}
10226
10227	ret = netdev_do_alloc_pcpu_stats(dev);
10228	if (ret)


10229		goto err_uninit;
10230
	/* dev->ifindex of 0 asks for an index to be allocated */
10231	ret = dev_index_reserve(net, dev->ifindex);
10232	if (ret < 0)
10233		goto err_free_pcpu;
10234	dev->ifindex = ret;
10235
10236	/* Transfer changeable features to wanted_features and enable
10237	 * software offloads (GSO and GRO).
10238	 */
10239	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10240	dev->features |= NETIF_F_SOFT_FEATURES;
10241
10242	if (dev->udp_tunnel_nic_info) {
10243		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10244		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10245	}
10246
10247	dev->wanted_features = dev->features & dev->hw_features;
10248
10249	if (!(dev->flags & IFF_LOOPBACK))
10250		dev->hw_features |= NETIF_F_NOCACHE_COPY;
10251
10252	/* If IPv4 TCP segmentation offload is supported we should also
10253	 * allow the device to enable segmenting the frame with the option
10254	 * of ignoring a static IP ID value.  This doesn't enable the
10255	 * feature itself but allows the user to enable it later.
10256	 */
10257	if (dev->hw_features & NETIF_F_TSO)
10258		dev->hw_features |= NETIF_F_TSO_MANGLEID;
10259	if (dev->vlan_features & NETIF_F_TSO)
10260		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10261	if (dev->mpls_features & NETIF_F_TSO)
10262		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10263	if (dev->hw_enc_features & NETIF_F_TSO)
10264		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10265
10266	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10267	 */
10268	dev->vlan_features |= NETIF_F_HIGHDMA;
10269
10270	/* Make NETIF_F_SG inheritable to tunnel devices.
10271	 */
10272	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10273
10274	/* Make NETIF_F_SG inheritable to MPLS.
10275	 */
10276	dev->mpls_features |= NETIF_F_SG;
10277
10278	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10279	ret = notifier_to_errno(ret);
10280	if (ret)
10281		goto err_ifindex_release;
10282
10283	ret = netdev_register_kobject(dev);
10284
	/* reg_state is written on both outcomes, before the error check */
10285	WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
10286
10287	if (ret)
10288		goto err_uninit_notify;

10289
10290	__netdev_update_features(dev);
10291
10292	/*
10293	 *	Default initial state at registry is that the
10294	 *	device is present.
10295	 */
10296
10297	set_bit(__LINK_STATE_PRESENT, &dev->state);
10298
10299	linkwatch_init_dev(dev);
10300
10301	dev_init_scheduler(dev);
10302
10303	netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10304	list_netdevice(dev);
10305
10306	add_device_randomness(dev->dev_addr, dev->addr_len);
10307
10308	/* If the device has permanent device address, driver should
10309	 * set dev_addr and also addr_assign_type should be set to
10310	 * NET_ADDR_PERM (default value).
10311	 */
10312	if (dev->addr_assign_type == NET_ADDR_PERM)
10313		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10314
10315	/* Notify protocols, that a new device appeared. */
10316	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10317	ret = notifier_to_errno(ret);
10318	if (ret) {
10319		/* Expect explicit free_netdev() on failure */
10320		dev->needs_free_netdev = false;
10321		unregister_netdevice_queue(dev, NULL);
10322		goto out;
10323	}
10324	/*
10325	 *	Prevent userspace races by waiting until the network
10326	 *	device is fully setup before sending notifications.
10327	 */
10328	if (!dev->rtnl_link_ops ||
10329	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10330		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10331
10332out:
10333	return ret;
10334
	/* Error unwinding: each label undoes one earlier step and falls
	 * through to the labels for the steps taken before it.
	 */
10335err_uninit_notify:
10336	call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10337err_ifindex_release:
10338	dev_index_release(net, dev->ifindex);
10339err_free_pcpu:
10340	netdev_do_free_pcpu_stats(dev);
10341err_uninit:
10342	if (dev->netdev_ops->ndo_uninit)
10343		dev->netdev_ops->ndo_uninit(dev);
10344	if (dev->priv_destructor)
10345		dev->priv_destructor(dev);
10346err_free_name:
10347	netdev_name_node_free(dev->name_node);
10348	goto out;
10349}
10350EXPORT_SYMBOL(register_netdevice);
10351
10352/**
10353 *	init_dummy_netdev	- init a dummy network device for NAPI
10354 *	@dev: device to init
10355 *
10356 *	This takes a network device structure and initialize the minimum
10357 *	amount of fields so it can be used to schedule NAPI polls without
10358 *	registering a full blown interface. This is to be used by drivers
10359 *	that need to tie several hardware interfaces to a single NAPI
10360 *	poll scheduler due to HW limitations.
10361 */
10362void init_dummy_netdev(struct net_device *dev)
10363{
10364	/* Clear everything. Note we don't initialize spinlocks
10365	 * are they aren't supposed to be taken by any of the
10366	 * NAPI code and this dummy netdev is supposed to be
10367	 * only ever used for NAPI polls
10368	 */
10369	memset(dev, 0, sizeof(struct net_device));
10370
10371	/* make sure we BUG if trying to hit standard
10372	 * register/unregister code path
10373	 */
10374	dev->reg_state = NETREG_DUMMY;
10375
10376	/* NAPI wants this */
10377	INIT_LIST_HEAD(&dev->napi_list);
10378
10379	/* a dummy interface is started by default */
10380	set_bit(__LINK_STATE_PRESENT, &dev->state);
10381	set_bit(__LINK_STATE_START, &dev->state);
10382
10383	/* napi_busy_loop stats accounting wants this */
10384	dev_net_set(dev, &init_net);
10385
10386	/* Note : We dont allocate pcpu_refcnt for dummy devices,
10387	 * because users of this 'device' dont need to change
10388	 * its refcount.
10389	 */
 
 
10390}
10391EXPORT_SYMBOL_GPL(init_dummy_netdev);
10392
10393
10394/**
10395 *	register_netdev	- register a network device
10396 *	@dev: device to register
10397 *
10398 *	Take a completed network device structure and add it to the kernel
10399 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10400 *	chain. 0 is returned on success. A negative errno code is returned
10401 *	on a failure to set up the device, or if the name is a duplicate.
10402 *
10403 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10404 *	and expands the device name if you passed a format string to
10405 *	alloc_netdev.
10406 */
10407int register_netdev(struct net_device *dev)
10408{
10409	int err;
10410
10411	if (rtnl_lock_killable())
10412		return -EINTR;
10413	err = register_netdevice(dev);
10414	rtnl_unlock();
10415	return err;
10416}
10417EXPORT_SYMBOL(register_netdev);
10418
10419int netdev_refcnt_read(const struct net_device *dev)
10420{
10421#ifdef CONFIG_PCPU_DEV_REFCNT
10422	int i, refcnt = 0;
10423
10424	for_each_possible_cpu(i)
10425		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10426	return refcnt;
10427#else
10428	return refcount_read(&dev->dev_refcnt);
10429#endif
10430}
10431EXPORT_SYMBOL(netdev_refcnt_read);
10432
/* Interval, in seconds, between two rounds of "waiting for ... to become
 * free" messages printed by netdev_wait_allrefs_any().
 */
10433int netdev_unregister_timeout_secs __read_mostly = 10;
10434
/* Lower and upper bound, in milliseconds, of the sleep between two
 * refcount polls in netdev_wait_allrefs_any(): the delay starts at the
 * minimum and is doubled until it reaches the maximum.
 */
10435#define WAIT_REFS_MIN_MSECS 1
10436#define WAIT_REFS_MAX_MSECS 250
10437/**
10438 * netdev_wait_allrefs_any - wait until all references are gone.
10439 * @list: list of net_devices to wait on
10440 *
10441 * This is called when unregistering network devices.
10442 *
10443 * Any protocol or device that holds a reference should register
10444 * for netdevice notification, and cleanup and put back the
10445 * reference if they receive an UNREGISTER event.
10446 * We can get stuck here if buggy protocols don't correctly
10447 * call dev_put.
 *
 * Return: the first device found on @list whose reference count reads 1.
 * The function loops until such a device exists.
10448 */
10449static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10450{
10451	unsigned long rebroadcast_time, warning_time;
10452	struct net_device *dev;
10453	int wait = 0;
10454
10455	rebroadcast_time = warning_time = jiffies;
10456
	/* Fast path: a device may already be down to a single reference */
10457	list_for_each_entry(dev, list, todo_list)
10458		if (netdev_refcnt_read(dev) == 1)
10459			return dev;
10460
10461	while (true) {
		/* At most once per second, re-notify under the RTNL */
10462		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10463			rtnl_lock();
10464
10465			/* Rebroadcast unregister notification */
10466			list_for_each_entry(dev, list, todo_list)
10467				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10468
10469			__rtnl_unlock();
10470			rcu_barrier();
10471			rtnl_lock();
10472
10473			list_for_each_entry(dev, list, todo_list)
10474				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10475					     &dev->state)) {
10476					/* We must not have linkwatch events
10477					 * pending on unregister. If this
10478					 * happens, we simply run the queue
10479					 * unscheduled, resulting in a noop
10480					 * for this device.
10481					 */
10482					linkwatch_run_queue();
10483					break;
10484				}
10485
10486			__rtnl_unlock();
10487
10488			rebroadcast_time = jiffies;
10489		}
10490
10491		rcu_barrier();
10492
		/* The first pass re-polls without sleeping; later passes
		 * sleep with an exponentially growing, capped delay.
		 */
10493		if (!wait) {
10494			wait = WAIT_REFS_MIN_MSECS;
10495		} else {
10496			msleep(wait);
10497			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10498		}
10499
10500		list_for_each_entry(dev, list, todo_list)
10501			if (netdev_refcnt_read(dev) == 1)
10502				return dev;
10503
		/* Periodically report every device still being waited on */
10504		if (time_after(jiffies, warning_time +
10505			       READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10506			list_for_each_entry(dev, list, todo_list) {
10507				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10508					 dev->name, netdev_refcnt_read(dev));
10509				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10510			}
10511
 
 
 
10512			warning_time = jiffies;
10513		}
10514	}
10515}
10516
10517/* The sequence is:
10518 *
10519 *	rtnl_lock();
10520 *	...
10521 *	register_netdevice(x1);
10522 *	register_netdevice(x2);
10523 *	...
10524 *	unregister_netdevice(y1);
10525 *	unregister_netdevice(y2);
10526 *      ...
10527 *	rtnl_unlock();
10528 *	free_netdev(y1);
10529 *	free_netdev(y2);
10530 *
10531 * We are invoked by rtnl_unlock().
10532 * This allows us to deal with problems:
10533 * 1) We can delete sysfs objects which invoke hotplug
10534 *    without deadlocking with linkwatch via keventd.
10535 * 2) Since we run with the RTNL semaphore not held, we can sleep
10536 *    safely in order to wait for the netdev refcnt to drop to zero.
10537 *
10538 * We must not return until all unregister events added during
10539 * the interval the lock was held have been completed.
10540 */
10541void netdev_run_todo(void)
10542{
10543	struct net_device *dev, *tmp;
10544	struct list_head list;
10545	int cnt;
10546#ifdef CONFIG_LOCKDEP
10547	struct list_head unlink_list;
10548
	/* Drain net_unlink_list, recomputing each device's nested_level
	 * from its lower_level.
	 */
10549	list_replace_init(&net_unlink_list, &unlink_list);
10550
10551	while (!list_empty(&unlink_list)) {
10552		struct net_device *dev = list_first_entry(&unlink_list,
10553							  struct net_device,
10554							  unlink_list);
10555		list_del_init(&dev->unlink_list);
10556		dev->nested_level = dev->lower_level - 1;
10557	}
10558#endif
10559
10560	/* Snapshot list, allow later requests */
10561	list_replace_init(&net_todo_list, &list);
10562
	/* From here on we run without the RTNL */
10563	__rtnl_unlock();
10564
 
10565	/* Wait for rcu callbacks to finish before next phase */
10566	if (!list_empty(&list))
10567		rcu_barrier();
10568
	/* First pass: move every device to NETREG_UNREGISTERED. A device in
	 * any state other than NETREG_UNREGISTERING is warned about and
	 * dropped from the snapshot.
	 */
10569	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
 
 
 
 
 
 
 
 
10570		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10571			netdev_WARN(dev, "run_todo but not unregistering\n");
10572			list_del(&dev->todo_list);
 
10573			continue;
10574		}
10575
10576		WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
10577		linkwatch_sync_dev(dev);
10578	}
10579
	/* Second pass: finish off devices in whatever order their reference
	 * count drops to 1, counting how many were processed.
	 */
10580	cnt = 0;
10581	while (!list_empty(&list)) {
10582		dev = netdev_wait_allrefs_any(&list);
10583		list_del(&dev->todo_list);
10584
10585		/* paranoia */
10586		BUG_ON(netdev_refcnt_read(dev) != 1);
10587		BUG_ON(!list_empty(&dev->ptype_all));
10588		BUG_ON(!list_empty(&dev->ptype_specific));
10589		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10590		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 
10591
10592		netdev_do_free_pcpu_stats(dev);
10593		if (dev->priv_destructor)
10594			dev->priv_destructor(dev);
10595		if (dev->needs_free_netdev)
10596			free_netdev(dev);
10597
10598		cnt++;
 
 
 
 
10599
10600		/* Free network device */
10601		kobject_put(&dev->dev.kobj);
10602	}
	/* Account for the processed devices; wake waiters on
	 * netdev_unregistering_wq once dev_unreg_count reaches zero.
	 */
10603	if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
10604		wake_up(&netdev_unregistering_wq);
10605}
10606
10607/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10608 * all the same fields in the same order as net_device_stats, with only
10609 * the type differing, but rtnl_link_stats64 may have additional fields
10610 * at the end for newer counters.
10611 */
10612void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10613			     const struct net_device_stats *netdev_stats)
10614{
10615	size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10616	const atomic_long_t *src = (atomic_long_t *)netdev_stats;
 
 
 
 
 
 
 
10617	u64 *dst = (u64 *)stats64;
10618
10619	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10620	for (i = 0; i < n; i++)
10621		dst[i] = (unsigned long)atomic_long_read(&src[i]);
10622	/* zero out counters that only exist in rtnl_link_stats64 */
10623	memset((char *)stats64 + n * sizeof(u64), 0,
10624	       sizeof(*stats64) - n * sizeof(u64));
 
10625}
10626EXPORT_SYMBOL(netdev_stats_to_stats64);
10627
10628static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
10629		struct net_device *dev)
10630{
10631	struct net_device_core_stats __percpu *p;
10632
10633	p = alloc_percpu_gfp(struct net_device_core_stats,
10634			     GFP_ATOMIC | __GFP_NOWARN);
10635
10636	if (p && cmpxchg(&dev->core_stats, NULL, p))
10637		free_percpu(p);
10638
10639	/* This READ_ONCE() pairs with the cmpxchg() above */
10640	return READ_ONCE(dev->core_stats);
10641}
10642
10643noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
10644{
10645	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10646	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
10647	unsigned long __percpu *field;
10648
10649	if (unlikely(!p)) {
10650		p = netdev_core_stats_alloc(dev);
10651		if (!p)
10652			return;
10653	}
10654
10655	field = (__force unsigned long __percpu *)((__force void *)p + offset);
10656	this_cpu_inc(*field);
10657}
10658EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
10659
10660/**
10661 *	dev_get_stats	- get network device statistics
10662 *	@dev: device to get statistics from
10663 *	@storage: place to store stats
10664 *
10665 *	Get network statistics from device. Return @storage.
10666 *	The device driver may provide its own method by setting
10667 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10668 *	otherwise the internal statistics structure is used.
10669 */
10670struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10671					struct rtnl_link_stats64 *storage)
10672{
10673	const struct net_device_ops *ops = dev->netdev_ops;
10674	const struct net_device_core_stats __percpu *p;
10675
10676	if (ops->ndo_get_stats64) {
10677		memset(storage, 0, sizeof(*storage));
10678		ops->ndo_get_stats64(dev, storage);
10679	} else if (ops->ndo_get_stats) {
10680		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10681	} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
10682		dev_get_tstats64(dev, storage);
10683	} else {
10684		netdev_stats_to_stats64(storage, &dev->stats);
10685	}
10686
10687	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10688	p = READ_ONCE(dev->core_stats);
10689	if (p) {
10690		const struct net_device_core_stats *core_stats;
10691		int i;
10692
10693		for_each_possible_cpu(i) {
10694			core_stats = per_cpu_ptr(p, i);
10695			storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10696			storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10697			storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10698			storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10699		}
10700	}
10701	return storage;
10702}
10703EXPORT_SYMBOL(dev_get_stats);
10704
10705/**
10706 *	dev_fetch_sw_netstats - get per-cpu network device statistics
10707 *	@s: place to store stats
10708 *	@netstats: per-cpu network stats to read from
10709 *
10710 *	Read per-cpu network statistics and populate the related fields in @s.
10711 */
10712void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10713			   const struct pcpu_sw_netstats __percpu *netstats)
10714{
10715	int cpu;
10716
10717	for_each_possible_cpu(cpu) {
10718		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10719		const struct pcpu_sw_netstats *stats;
10720		unsigned int start;
10721
10722		stats = per_cpu_ptr(netstats, cpu);
10723		do {
10724			start = u64_stats_fetch_begin(&stats->syncp);
10725			rx_packets = u64_stats_read(&stats->rx_packets);
10726			rx_bytes   = u64_stats_read(&stats->rx_bytes);
10727			tx_packets = u64_stats_read(&stats->tx_packets);
10728			tx_bytes   = u64_stats_read(&stats->tx_bytes);
10729		} while (u64_stats_fetch_retry(&stats->syncp, start));
10730
10731		s->rx_packets += rx_packets;
10732		s->rx_bytes   += rx_bytes;
10733		s->tx_packets += tx_packets;
10734		s->tx_bytes   += tx_bytes;
10735	}
10736}
10737EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10738
10739/**
10740 *	dev_get_tstats64 - ndo_get_stats64 implementation
10741 *	@dev: device to get statistics from
10742 *	@s: place to store stats
10743 *
10744 *	Populate @s from dev->stats and dev->tstats. Can be used as
10745 *	ndo_get_stats64() callback.
10746 */
10747void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10748{
10749	netdev_stats_to_stats64(s, &dev->stats);
10750	dev_fetch_sw_netstats(s, dev->tstats);
10751}
10752EXPORT_SYMBOL_GPL(dev_get_tstats64);
10753
/* Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, attached to noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails. Without
 * CONFIG_NET_CLS_ACT the current value is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			RCU_INIT_POINTER(ingress->qdisc_sleeping, &noop_qdisc);
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
10771
10772static const struct ethtool_ops default_ethtool_ops;
10773
10774void netdev_set_default_ethtool_ops(struct net_device *dev,
10775				    const struct ethtool_ops *ops)
10776{
10777	if (dev->ethtool_ops == &default_ethtool_ops)
10778		dev->ethtool_ops = ops;
10779}
10780EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10781
10782/**
10783 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10784 * @dev: netdev to enable the IRQ coalescing on
10785 *
10786 * Sets a conservative default for SW IRQ coalescing. Users can use
10787 * sysfs attributes to override the default values.
10788 */
10789void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10790{
10791	WARN_ON(dev->reg_state == NETREG_REGISTERED);
10792
10793	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
10794		dev->gro_flush_timeout = 20000;
10795		dev->napi_defer_hard_irqs = 1;
10796	}
10797}
10798EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
10799
10800void netdev_freemem(struct net_device *dev)
10801{
10802	char *addr = (char *)dev - dev->padded;
10803
10804	kvfree(addr);
10805}
10806
10807/**
10808 * alloc_netdev_mqs - allocate network device
10809 * @sizeof_priv: size of private data to allocate space for
10810 * @name: device name format string
10811 * @name_assign_type: origin of device name
10812 * @setup: callback to initialize device
10813 * @txqs: the number of TX subqueues to allocate
10814 * @rxqs: the number of RX subqueues to allocate
10815 *
10816 * Allocates a struct net_device with private data area for driver use
10817 * and performs basic initialization.  Also allocates subqueue structs
10818 * for each queue on the device.
 *
 * Return: the new net_device, or NULL if @txqs or @rxqs is zero or if any
 * of the initialisation steps fails.
10819 */
10820struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10821		unsigned char name_assign_type,
10822		void (*setup)(struct net_device *),
10823		unsigned int txqs, unsigned int rxqs)
10824{
10825	struct net_device *dev;
10826	unsigned int alloc_size;
10827	struct net_device *p;
10828
	/* The name is strcpy()'d into dev->name further down */
10829	BUG_ON(strlen(name) >= sizeof(dev->name));
10830
10831	if (txqs < 1) {
10832		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10833		return NULL;
10834	}
10835
 
10836	if (rxqs < 1) {
10837		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10838		return NULL;
10839	}
 
10840
10841	alloc_size = sizeof(struct net_device);
10842	if (sizeof_priv) {
10843		/* ensure 32-byte alignment of private area */
10844		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10845		alloc_size += sizeof_priv;
10846	}
10847	/* ensure 32-byte alignment of whole construct */
10848	alloc_size += NETDEV_ALIGN - 1;
10849
10850	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
 
 
10851	if (!p)
10852		return NULL;
10853
	/* Remember the alignment offset so that netdev_freemem() can
	 * recover the pointer that was actually allocated.
	 */
10854	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10855	dev->padded = (char *)dev - (char *)p;
10856
10857	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
10858#ifdef CONFIG_PCPU_DEV_REFCNT
10859	dev->pcpu_refcnt = alloc_percpu(int);
10860	if (!dev->pcpu_refcnt)
10861		goto free_dev;
10862	__dev_hold(dev);
10863#else
10864	refcount_set(&dev->dev_refcnt, 1);
10865#endif
10866
10867	if (dev_addr_init(dev))
10868		goto free_pcpu;
10869
10870	dev_mc_init(dev);
10871	dev_uc_init(dev);
10872
10873	dev_net_set(dev, &init_net);
10874
	/* Defaults; all of these are set before setup() runs below */
10875	dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
10876	dev->xdp_zc_max_segs = 1;
10877	dev->gso_max_segs = GSO_MAX_SEGS;
10878	dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
10879	dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
10880	dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
10881	dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
10882	dev->tso_max_segs = TSO_MAX_SEGS;
10883	dev->upper_level = 1;
10884	dev->lower_level = 1;
10885#ifdef CONFIG_LOCKDEP
10886	dev->nested_level = 0;
10887	INIT_LIST_HEAD(&dev->unlink_list);
10888#endif
10889
10890	INIT_LIST_HEAD(&dev->napi_list);
10891	INIT_LIST_HEAD(&dev->unreg_list);
10892	INIT_LIST_HEAD(&dev->close_list);
10893	INIT_LIST_HEAD(&dev->link_watch_list);
10894	INIT_LIST_HEAD(&dev->adj_list.upper);
10895	INIT_LIST_HEAD(&dev->adj_list.lower);
10896	INIT_LIST_HEAD(&dev->ptype_all);
10897	INIT_LIST_HEAD(&dev->ptype_specific);
10898	INIT_LIST_HEAD(&dev->net_notifier_list);
10899#ifdef CONFIG_NET_SCHED
10900	hash_init(dev->qdisc_hash);
10901#endif
10902	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10903	setup(dev);
10904
	/* A zero tx_queue_len after setup() marks the device IFF_NO_QUEUE
	 * and gives it the default length.
	 */
10905	if (!dev->tx_queue_len) {
10906		dev->priv_flags |= IFF_NO_QUEUE;
10907		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10908	}
10909
10910	dev->num_tx_queues = txqs;
10911	dev->real_num_tx_queues = txqs;
10912	if (netif_alloc_netdev_queues(dev))
10913		goto free_all;
10914
 
10915	dev->num_rx_queues = rxqs;
10916	dev->real_num_rx_queues = rxqs;
10917	if (netif_alloc_rx_queues(dev))
10918		goto free_all;
 
10919
10920	strcpy(dev->name, name);
10921	dev->name_assign_type = name_assign_type;
10922	dev->group = INIT_NETDEV_GROUP;
	/* Keep any ethtool_ops that setup() installed */
10923	if (!dev->ethtool_ops)
10924		dev->ethtool_ops = &default_ethtool_ops;
10925
10926	nf_hook_netdev_init(dev);
10927
10928	return dev;
10929
	/* Failure after setup() has run: unwind through free_netdev() */
10930free_all:
10931	free_netdev(dev);
10932	return NULL;
10933
	/* Early failures: only the refcount and the memory need releasing */
10934free_pcpu:
10935#ifdef CONFIG_PCPU_DEV_REFCNT
10936	free_percpu(dev->pcpu_refcnt);
10937free_dev:
10938#endif
10939	netdev_freemem(dev);
10940	return NULL;
10941}
10942EXPORT_SYMBOL(alloc_netdev_mqs);
10943
10944/**
10945 * free_netdev - free network device
10946 * @dev: device
10947 *
10948 * This function does the last stage of destroying an allocated device
10949 * interface. The reference to the device object is released. If this
10950 * is the last reference then it will be freed. Must be called in process
10951 * context.
10952 */
10953void free_netdev(struct net_device *dev)
10954{
10955	struct napi_struct *p, *n;
10956
10957	might_sleep();
10958
10959	/* When called immediately after register_netdevice() failed the unwind
10960	 * handling may still be dismantling the device. Handle that case by
10961	 * deferring the free.
10962	 */
10963	if (dev->reg_state == NETREG_UNREGISTERING) {
10964		ASSERT_RTNL();
		/* netdev_run_todo() calls free_netdev() when this is set */
10965		dev->needs_free_netdev = true;
10966		return;
10967	}
10968
10969	netif_free_tx_queues(dev);
10970	netif_free_rx_queues(dev);
 
 
10971
10972	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10973
10974	/* Flush device addresses */
10975	dev_addr_flush(dev);
10976
	/* Delete every NAPI instance still on the device's napi_list */
10977	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10978		netif_napi_del(p);
10979
10980	ref_tracker_dir_exit(&dev->refcnt_tracker);
10981#ifdef CONFIG_PCPU_DEV_REFCNT
10982	free_percpu(dev->pcpu_refcnt);
10983	dev->pcpu_refcnt = NULL;
10984#endif
10985	free_percpu(dev->core_stats);
10986	dev->core_stats = NULL;
10987	free_percpu(dev->xdp_bulkq);
10988	dev->xdp_bulkq = NULL;
10989
10990	/*  Compatibility with error handling in drivers */
10991	if (dev->reg_state == NETREG_UNINITIALIZED) {
		/* Never registered: release the memory directly */
10992		netdev_freemem(dev);
10993		return;
10994	}
10995
	/* Any other state than NETREG_UNREGISTERED is a bug at this point */
10996	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10997	WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
10998
10999	/* will free via device release */
11000	put_device(&dev->dev);
11001}
11002EXPORT_SYMBOL(free_netdev);
11003
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait until packets that are currently being received have been
 *	dealt with; packets arriving later are not held back. The
 *	expedited RCU grace period is used when the RTNL is locked, the
 *	regular one otherwise. May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
11019
11020/**
11021 *	unregister_netdevice_queue - remove device from the kernel
11022 *	@dev: device
11023 *	@head: list
11024 *
11025 *	This function shuts down a device interface and removes it
11026 *	from the kernel tables.
11027 *	If head not NULL, device is queued to be unregistered later.
11028 *
11029 *	Callers must hold the rtnl semaphore.  You may want
11030 *	unregister_netdev() instead of this.
11031 */
11032
11033void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
11034{
11035	ASSERT_RTNL();
11036
11037	if (head) {
11038		list_move_tail(&dev->unreg_list, head);
11039	} else {
11040		LIST_HEAD(single);
11041
11042		list_add(&dev->unreg_list, &single);
11043		unregister_netdevice_many(&single);
11044	}
11045}
11046EXPORT_SYMBOL(unregister_netdevice_queue);
11047
/**
 *	unregister_netdevice_many_notify - unregister a list of devices
 *	@head: list of devices, linked through their unreg_list member
 *	@portid: forwarded to rtmsg_ifinfo_build_skb() and rtmsg_ifinfo_send()
 *	@nlh: forwarded to rtmsg_ifinfo_build_skb() and rtmsg_ifinfo_send()
 *
 *	Closes every device on @head, unlinks it, notifies the protocols and
 *	queues it on the todo list. Devices that were never registered are
 *	dropped from the list with a warning. @head itself is list_del()'d
 *	before returning. The RTNL must be held.
 */
11048void unregister_netdevice_many_notify(struct list_head *head,
11049				      u32 portid, const struct nlmsghdr *nlh)
11050{
11051	struct net_device *dev, *tmp;
11052	LIST_HEAD(close_head);
11053	int cnt = 0;
11054
11055	BUG_ON(dev_boot_phase);
11056	ASSERT_RTNL();
11057
11058	if (list_empty(head))
11059		return;
11060
11061	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11062		/* Some devices call without registering
11063		 * for initialization unwind. Remove those
11064		 * devices and proceed with the remaining.
11065		 */
11066		if (dev->reg_state == NETREG_UNINITIALIZED) {
11067			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11068				 dev->name, dev);
11069
11070			WARN_ON(1);
11071			list_del(&dev->unreg_list);
11072			continue;
11073		}
11074		dev->dismantle = true;
11075		BUG_ON(dev->reg_state != NETREG_REGISTERED);
11076	}
11077
11078	/* If device is running, close it first. */
11079	list_for_each_entry(dev, head, unreg_list)
11080		list_add_tail(&dev->close_list, &close_head);
11081	dev_close_many(&close_head, true);
11082
11083	list_for_each_entry(dev, head, unreg_list) {
11084		/* And unlink it from device chain. */
11085		unlist_netdevice(dev);
11086		WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
11087	}
11088	flush_all_backlogs();
11089
11090	synchronize_net();
11091
11092	list_for_each_entry(dev, head, unreg_list) {
11093		struct sk_buff *skb = NULL;
11094
11095		/* Shutdown queueing discipline. */
11096		dev_shutdown(dev);
11097		dev_tcx_uninstall(dev);
11098		dev_xdp_uninstall(dev);
11099		bpf_dev_bound_netdev_unregister(dev);
11100
11101		netdev_offload_xstats_disable_all(dev);
11102
11103		/* Notify protocols, that we are about to destroy
11104		 * this device. They should clean all the things.
11105		 */
11106		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11107
		/* The RTM_DELLINK message is built here but only sent after
		 * ndo_uninit() has run, further down.
		 */
11108		if (!dev->rtnl_link_ops ||
11109		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11110			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11111						     GFP_KERNEL, NULL, 0,
11112						     portid, nlh);
11113
11114		/*
11115		 *	Flush the unicast and multicast chains
11116		 */
11117		dev_uc_flush(dev);
11118		dev_mc_flush(dev);
11119
11120		netdev_name_node_alt_flush(dev);
11121		netdev_name_node_free(dev->name_node);
11122
11123		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11124
11125		if (dev->netdev_ops->ndo_uninit)
11126			dev->netdev_ops->ndo_uninit(dev);
11127
11128		if (skb)
11129			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11130
11131		/* Notifier chain MUST detach us all upper devices. */
11132		WARN_ON(netdev_has_any_upper_dev(dev));
11133		WARN_ON(netdev_has_any_lower_dev(dev));
11134
11135		/* Remove entries from kobject tree */
11136		netdev_unregister_kobject(dev);
11137#ifdef CONFIG_XPS
11138		/* Remove XPS queueing entries */
11139		netif_reset_xps_queues_gt(dev, 0);
11140#endif
11141	}
11142
11143	synchronize_net();
11144
	/* Drop the reference tracked by dev_registered_tracker and queue each
	 * device on the todo list; dev_unreg_count grows by the number of
	 * devices queued.
	 */
11145	list_for_each_entry(dev, head, unreg_list) {
11146		netdev_put(dev, &dev->dev_registered_tracker);
11147		net_set_todo(dev);
11148		cnt++;
11149	}
11150	atomic_add(cnt, &dev_unreg_count);
11151
11152	list_del(head);
11153}
 
11154
11155/**
11156 *	unregister_netdevice_many - unregister many devices
11157 *	@head: list of devices
11158 *
11159 *  Note: As most callers use a stack allocated list_head,
11160 *  we force a list_del() to make sure stack wont be corrupted later.
11161 */
11162void unregister_netdevice_many(struct list_head *head)
11163{
11164	unregister_netdevice_many_notify(head, 0, NULL);
 
 
 
 
 
 
 
11165}
11166EXPORT_SYMBOL(unregister_netdevice_many);
11167
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel tables.
 *
 *	This simply calls unregister_netdevice() with the rtnl semaphore
 *	taken around the call. In general this is the function to use
 *	rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
11186
11187/**
11188 *	__dev_change_net_namespace - move device to different network namespace
11189 *	@dev: device
11190 *	@net: network namespace
11191 *	@pat: If not NULL name pattern to try if the current device name
11192 *	      is already taken in the destination network namespace.
11193 *	@new_ifindex: If not zero, specifies device index in the target
11194 *	              namespace.
11195 *
11196 *	This function shuts down a device interface and moves it
11197 *	to a new network namespace. On success 0 is returned, on
11198 *	a failure a negative errno code is returned.
11199 *
11200 *	Callers must hold the rtnl semaphore.
11201 */
11202
11203int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11204			       const char *pat, int new_ifindex)
11205{
11206	struct netdev_name_node *name_node;
11207	struct net *net_old = dev_net(dev);
	/* Stays empty unless a replacement name has to be generated from @pat */
11208	char new_name[IFNAMSIZ] = {};
11209	int err, new_nsid;
11210
11211	ASSERT_RTNL();
11212
11213	/* Don't allow namespace local devices to be moved. */
11214	err = -EINVAL;
11215	if (dev->features & NETIF_F_NETNS_LOCAL)
11216		goto out;
11217
11218	/* Ensure the device has been registered */
11219	if (dev->reg_state != NETREG_REGISTERED)
11220		goto out;
11221
11222	/* Get out if there is nothing to do */
11223	err = 0;
11224	if (net_eq(net_old, net))
11225		goto out;
11226
11227	/* Pick the destination device name, and ensure
11228	 * we can use it in the destination network namespace.
11229	 */
11230	err = -EEXIST;
11231	if (netdev_name_in_use(net, dev->name)) {
11232		/* We get here if we can't use the current device name */
11233		if (!pat)
11234			goto out;
11235		err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
11236		if (err < 0)
11237			goto out;
11238	}
11239	/* Check that none of the altnames conflicts. */
11240	err = -EEXIST;
11241	netdev_for_each_altname(dev, name_node)
11242		if (netdev_name_in_use(net, name_node->name))
11243			goto out;
11244
11245	/* Check that new_ifindex isn't used yet. */
11246	if (new_ifindex) {
11247		err = dev_index_reserve(net, new_ifindex);
11248		if (err < 0)
11249			goto out;
11250	} else {
11251		/* If there is an ifindex conflict assign a new one */
11252		err = dev_index_reserve(net, dev->ifindex);
11253		if (err == -EBUSY)
11254			err = dev_index_reserve(net, 0);
11255		if (err < 0)
11256			goto out;
		/* A non-negative result is the index that was reserved */
11257		new_ifindex = err;
11258	}
11259
11260	/*
11261	 * And now a mini version of register_netdevice unregister_netdevice.
11262	 */
11263
11264	/* If device is running close it first. */
11265	dev_close(dev);
11266
11267	/* And unlink it from device chain */
 
11268	unlist_netdevice(dev);
11269
11270	synchronize_net();
11271
11272	/* Shutdown queueing discipline. */
11273	dev_shutdown(dev);
11274
11275	/* Notify protocols, that we are about to destroy
11276	 * this device. They should clean all the things.
11277	 *
11278	 * Note that dev->reg_state stays at NETREG_REGISTERED.
11279	 * This is wanted because this way 8021q and macvlan know
11280	 * the device is just moving and can keep their slaves up.
11281	 */
11282	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11283	rcu_barrier();
11284
11285	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11286
	/* RTM_DELLINK carries the new namespace id and the new ifindex */
11287	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11288			    new_ifindex);
11289
11290	/*
11291	 *	Flush the unicast and multicast chains
11292	 */
11293	dev_uc_flush(dev);
11294	dev_mc_flush(dev);
11295
11296	/* Send a netdev-removed uevent to the old namespace */
11297	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11298	netdev_adjacent_del_links(dev);
11299
11300	/* Move per-net netdevice notifiers that are following the netdevice */
11301	move_netdevice_notifiers_dev_net(dev, net);
11302
11303	/* Actually switch the network namespace */
11304	dev_net_set(dev, net);
11305	dev->ifindex = new_ifindex;
11306
11307	if (new_name[0]) /* Rename the netdev to prepared name */
11308		strscpy(dev->name, new_name, IFNAMSIZ);
11309
11310	/* Fixup kobjects */
	/* uevents are suppressed while the device is renamed */
11311	dev_set_uevent_suppress(&dev->dev, 1);
11312	err = device_rename(&dev->dev, dev->name);
11313	dev_set_uevent_suppress(&dev->dev, 0);
11314	WARN_ON(err);
11315
11316	/* Send a netdev-add uevent to the new namespace */
11317	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11318	netdev_adjacent_add_links(dev);
11319
11320	/* Adapt owner in case owning user namespace of target network
11321	 * namespace is different from the original one.
11322	 */
11323	err = netdev_change_owner(dev, net_old, net);
11324	WARN_ON(err);
11325
11326	/* Add the device back in the hashes */
11327	list_netdevice(dev);
11328
11329	/* Notify protocols, that a new device appeared. */
11330	call_netdevice_notifiers(NETDEV_REGISTER, dev);
11331
11332	/*
11333	 *	Prevent userspace races by waiting until the network
11334	 *	device is fully setup before sending notifications.
11335	 */
11336	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11337
11338	synchronize_net();
	/* Errors from the rename and owner change above were only warned about */
11339	err = 0;
11340out:
11341	return err;
11342}
11343EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11344
/* Take over the networking work left on the softnet_data of @oldcpu.
 *
 * With local interrupts disabled, the completion queue, the output queue
 * and the NAPI poll list of @oldcpu are moved to the CPU running this
 * function. Afterwards any pending RPS IPIs are sent and the packets left
 * in @oldcpu's process_queue and input_pkt_queue are re-injected with
 * netif_rx(). Always returns 0.
 */
11345static int dev_cpu_dead(unsigned int oldcpu)
11346{
11347	struct sk_buff **list_skb;
11348	struct sk_buff *skb;
11349	unsigned int cpu;
11350	struct softnet_data *sd, *oldsd, *remsd = NULL;
11351
11352	local_irq_disable();
11353	cpu = smp_processor_id();
11354	sd = &per_cpu(softnet_data, cpu);
11355	oldsd = &per_cpu(softnet_data, oldcpu);
11356
11357	/* Find end of our completion_queue. */
11358	list_skb = &sd->completion_queue;
11359	while (*list_skb)
11360		list_skb = &(*list_skb)->next;
11361	/* Append completion queue from offline CPU. */
11362	*list_skb = oldsd->completion_queue;
11363	oldsd->completion_queue = NULL;
11364
11365	/* Append output queue from offline CPU. */
11366	if (oldsd->output_queue) {
11367		*sd->output_queue_tailp = oldsd->output_queue;
11368		sd->output_queue_tailp = oldsd->output_queue_tailp;
11369		oldsd->output_queue = NULL;
		/* An empty queue has its tail pointer aimed at its own head */
11370		oldsd->output_queue_tailp = &oldsd->output_queue;
11371	}
11372	/* Append NAPI poll list from offline CPU, with one exception :
11373	 * process_backlog() must be called by cpu owning percpu backlog.
11374	 * We properly handle process_queue & input_pkt_queue later.
11375	 */
11376	while (!list_empty(&oldsd->poll_list)) {
11377		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11378							    struct napi_struct,
11379							    poll_list);
11380
11381		list_del_init(&napi->poll_list);
		/* The backlog NAPI is not moved; only its state is cleared */
11382		if (napi->poll == process_backlog)
11383			napi->state = 0;
11384		else
11385			____napi_schedule(sd, napi);
11386	}
11387
11388	raise_softirq_irqoff(NET_TX_SOFTIRQ);
11389	local_irq_enable();
11390
11391#ifdef CONFIG_RPS
11392	remsd = oldsd->rps_ipi_list;
11393	oldsd->rps_ipi_list = NULL;
11394#endif
11395	/* send out pending IPI's on offline CPU */
11396	net_rps_send_ipi(remsd);
11397
11398	/* Process offline CPU's process_queue and input_pkt_queue */
11399	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11400		netif_rx(skb);
11401		input_queue_head_incr(oldsd);
11402	}
11403	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11404		netif_rx(skb);
11405		input_queue_head_incr(oldsd);
11406	}
11407
11408	return 0;
11409}
11410
11411/**
11412 *	netdev_increment_features - increment feature set by one
11413 *	@all: current feature set
11414 *	@one: new feature set
11415 *	@mask: mask feature set
11416 *
11417 *	Computes a new feature set after adding a device with feature set
11418 *	@one to the master device with current feature set @all.  Will not
11419 *	enable anything that is off in @mask. Returns the new feature set.
11420 */
11421netdev_features_t netdev_increment_features(netdev_features_t all,
11422	netdev_features_t one, netdev_features_t mask)
11423{
11424	if (mask & NETIF_F_HW_CSUM)
11425		mask |= NETIF_F_CSUM_MASK;
11426	mask |= NETIF_F_VLAN_CHALLENGED;
11427
11428	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11429	all &= one | ~NETIF_F_ALL_FOR_ALL;
11430
11431	/* If one device supports hw checksumming, set for all. */
11432	if (all & NETIF_F_HW_CSUM)
11433		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11434
11435	return all;
11436}
11437EXPORT_SYMBOL(netdev_increment_features);
11438
11439static struct hlist_head * __net_init netdev_create_hash(void)
11440{
11441	int i;
11442	struct hlist_head *hash;
11443
11444	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11445	if (hash != NULL)
11446		for (i = 0; i < NETDEV_HASHENTRIES; i++)
11447			INIT_HLIST_HEAD(&hash[i]);
11448
11449	return hash;
11450}
11451
11452/* Initialize per network namespace state */
11453static int __net_init netdev_init(struct net *net)
11454{
11455	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11456		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11457
11458	INIT_LIST_HEAD(&net->dev_base_head);
11459
11460	net->dev_name_head = netdev_create_hash();
11461	if (net->dev_name_head == NULL)
11462		goto err_name;
11463
11464	net->dev_index_head = netdev_create_hash();
11465	if (net->dev_index_head == NULL)
11466		goto err_idx;
11467
11468	xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11469
11470	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11471
11472	return 0;
11473
11474err_idx:
11475	kfree(net->dev_name_head);
11476err_name:
11477	return -ENOMEM;
11478}
11479
11480/**
11481 *	netdev_drivername - network driver for the device
11482 *	@dev: network device
11483 *
11484 *	Determine network driver for device.
11485 */
11486const char *netdev_drivername(const struct net_device *dev)
11487{
11488	const struct device_driver *driver;
11489	const struct device *parent;
11490	const char *empty = "";
11491
11492	parent = dev->dev.parent;
11493	if (!parent)
11494		return empty;
11495
11496	driver = parent->driver;
11497	if (driver && driver->name)
11498		return driver->name;
11499	return empty;
11500}
11501
11502static void __netdev_printk(const char *level, const struct net_device *dev,
11503			    struct va_format *vaf)
11504{
11505	if (dev && dev->dev.parent) {
11506		dev_printk_emit(level[1] - '0',
11507				dev->dev.parent,
11508				"%s %s %s%s: %pV",
11509				dev_driver_string(dev->dev.parent),
11510				dev_name(dev->dev.parent),
11511				netdev_name(dev), netdev_reg_state(dev),
11512				vaf);
11513	} else if (dev) {
11514		printk("%s%s%s: %pV",
11515		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
11516	} else {
11517		printk("%s(NULL net_device): %pV", level, vaf);
11518	}
11519}
11520
11521void netdev_printk(const char *level, const struct net_device *dev,
11522		   const char *format, ...)
11523{
11524	struct va_format vaf;
11525	va_list args;
11526
11527	va_start(args, format);
11528
11529	vaf.fmt = format;
11530	vaf.va = &args;
11531
11532	__netdev_printk(level, dev, &vaf);
11533
11534	va_end(args);
11535}
11536EXPORT_SYMBOL(netdev_printk);
11537
11538#define define_netdev_printk_level(func, level)			\
11539void func(const struct net_device *dev, const char *fmt, ...)	\
11540{								\
11541	struct va_format vaf;					\
11542	va_list args;						\
11543								\
11544	va_start(args, fmt);					\
11545								\
11546	vaf.fmt = fmt;						\
11547	vaf.va = &args;						\
11548								\
11549	__netdev_printk(level, dev, &vaf);			\
11550								\
11551	va_end(args);						\
11552}								\
11553EXPORT_SYMBOL(func);
11554
11555define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11556define_netdev_printk_level(netdev_alert, KERN_ALERT);
11557define_netdev_printk_level(netdev_crit, KERN_CRIT);
11558define_netdev_printk_level(netdev_err, KERN_ERR);
11559define_netdev_printk_level(netdev_warn, KERN_WARNING);
11560define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11561define_netdev_printk_level(netdev_info, KERN_INFO);
11562
11563static void __net_exit netdev_exit(struct net *net)
11564{
11565	kfree(net->dev_name_head);
11566	kfree(net->dev_index_head);
11567	xa_destroy(&net->dev_by_index);
11568	if (net != &init_net)
11569		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11570}
11571
11572static struct pernet_operations __net_initdata netdev_net_ops = {
11573	.init = netdev_init,
11574	.exit = netdev_exit,
11575};
11576
11577static void __net_exit default_device_exit_net(struct net *net)
11578{
11579	struct netdev_name_node *name_node, *tmp;
11580	struct net_device *dev, *aux;
11581	/*
11582	 * Push all migratable network devices back to the
11583	 * initial network namespace
11584	 */
11585	ASSERT_RTNL();
11586	for_each_netdev_safe(net, dev, aux) {
11587		int err;
11588		char fb_name[IFNAMSIZ];
11589
11590		/* Ignore unmoveable devices (i.e. loopback) */
11591		if (dev->features & NETIF_F_NETNS_LOCAL)
11592			continue;
11593
11594		/* Leave virtual devices for the generic cleanup */
11595		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11596			continue;
11597
11598		/* Push remaining network devices to init_net */
11599		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11600		if (netdev_name_in_use(&init_net, fb_name))
11601			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11602
11603		netdev_for_each_altname_safe(dev, name_node, tmp)
11604			if (netdev_name_in_use(&init_net, name_node->name))
11605				__netdev_name_node_alt_destroy(name_node);
11606
11607		err = dev_change_net_namespace(dev, &init_net, fb_name);
11608		if (err) {
11609			pr_emerg("%s: failed to move %s to init_net: %d\n",
11610				 __func__, dev->name, err);
11611			BUG();
11612		}
11613	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11614}
11615
11616static void __net_exit default_device_exit_batch(struct list_head *net_list)
11617{
11618	/* At exit all network devices most be removed from a network
11619	 * namespace.  Do this in the reverse order of registration.
11620	 * Do this across as many network namespaces as possible to
11621	 * improve batching efficiency.
11622	 */
11623	struct net_device *dev;
11624	struct net *net;
11625	LIST_HEAD(dev_kill_list);
11626
11627	rtnl_lock();
11628	list_for_each_entry(net, net_list, exit_list) {
11629		default_device_exit_net(net);
11630		cond_resched();
11631	}
11632
 
 
 
 
 
 
11633	list_for_each_entry(net, net_list, exit_list) {
11634		for_each_netdev_reverse(net, dev) {
11635			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11636				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11637			else
11638				unregister_netdevice_queue(dev, &dev_kill_list);
11639		}
11640	}
11641	unregister_netdevice_many(&dev_kill_list);
11642	rtnl_unlock();
11643}
11644
11645static struct pernet_operations __net_initdata default_device_ops = {
 
11646	.exit_batch = default_device_exit_batch,
11647};
11648
11649static void __init net_dev_struct_check(void)
11650{
11651	/* TX read-mostly hotpath */
11652	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
11653	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
11654	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
11655	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
11656	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
11657	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
11658	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
11659	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
11660	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
11661	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
11662	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
11663	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
11664	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
11665#ifdef CONFIG_XPS
11666	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
11667#endif
11668#ifdef CONFIG_NETFILTER_EGRESS
11669	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
11670#endif
11671#ifdef CONFIG_NET_XGRESS
11672	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
11673#endif
11674	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
11675
11676	/* TXRX read-mostly hotpath */
11677	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
11678	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
11679	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
11680	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
11681	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
11682	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
11683	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
11684
11685	/* RX read-mostly hotpath */
11686	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
11687	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
11688	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
11689	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
11690	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
11691	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
11692	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
11693	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
11694	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
11695	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
11696	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
11697#ifdef CONFIG_NETPOLL
11698	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
11699#endif
11700#ifdef CONFIG_NET_XGRESS
11701	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
11702#endif
11703	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
11704}
11705
11706/*
11707 *	Initialize the DEV module. At boot time this walks the device list and
11708 *	unhooks any devices that fail to initialise (normally hardware not
11709 *	present) and leaves us with a valid list of present and active devices.
11710 *
11711 */
11712
11713/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
11714#define SYSTEM_PERCPU_PAGE_POOL_SIZE	((1 << 20) / PAGE_SIZE)
11715
11716static int net_page_pool_create(int cpuid)
11717{
11718#if IS_ENABLED(CONFIG_PAGE_POOL)
11719	struct page_pool_params page_pool_params = {
11720		.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
11721		.flags = PP_FLAG_SYSTEM_POOL,
11722		.nid = NUMA_NO_NODE,
11723	};
11724	struct page_pool *pp_ptr;
11725
11726	pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
11727	if (IS_ERR(pp_ptr))
11728		return -ENOMEM;
11729
11730	per_cpu(system_page_pool, cpuid) = pp_ptr;
11731#endif
11732	return 0;
11733}
11734
11735/*
11736 *       This is called single threaded during boot, so no need
11737 *       to take the rtnl semaphore.
11738 */
11739static int __init net_dev_init(void)
11740{
11741	int i, rc = -ENOMEM;
11742
11743	BUG_ON(!dev_boot_phase);
11744
11745	net_dev_struct_check();
11746
11747	if (dev_proc_init())
11748		goto out;
11749
11750	if (netdev_kobject_init())
11751		goto out;
11752
 
11753	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11754		INIT_LIST_HEAD(&ptype_base[i]);
11755
 
 
11756	if (register_pernet_subsys(&netdev_net_ops))
11757		goto out;
11758
11759	/*
11760	 *	Initialise the packet receive queues.
11761	 */
11762
11763	for_each_possible_cpu(i) {
11764		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11765		struct softnet_data *sd = &per_cpu(softnet_data, i);
11766
11767		INIT_WORK(flush, flush_backlog);
11768
11769		skb_queue_head_init(&sd->input_pkt_queue);
11770		skb_queue_head_init(&sd->process_queue);
11771#ifdef CONFIG_XFRM_OFFLOAD
11772		skb_queue_head_init(&sd->xfrm_backlog);
11773#endif
11774		INIT_LIST_HEAD(&sd->poll_list);
11775		sd->output_queue_tailp = &sd->output_queue;
11776#ifdef CONFIG_RPS
11777		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 
11778		sd->cpu = i;
11779#endif
11780		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
11781		spin_lock_init(&sd->defer_lock);
11782
11783		init_gro_hash(&sd->backlog);
11784		sd->backlog.poll = process_backlog;
11785		sd->backlog.weight = weight_p;
11786
11787		if (net_page_pool_create(i))
11788			goto out;
11789	}
11790
11791	dev_boot_phase = 0;
11792
11793	/* The loopback device is special if any other network devices
11794	 * is present in a network namespace the loopback device must
11795	 * be present. Since we now dynamically allocate and free the
11796	 * loopback device ensure this invariant is maintained by
11797	 * keeping the loopback device as the first device on the
11798	 * list of network devices.  Ensuring the loopback devices
11799	 * is the first device that appears and the last network device
11800	 * that disappears.
11801	 */
11802	if (register_pernet_device(&loopback_net_ops))
11803		goto out;
11804
11805	if (register_pernet_device(&default_device_ops))
11806		goto out;
11807
11808	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11809	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11810
11811	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11812				       NULL, dev_cpu_dead);
11813	WARN_ON(rc < 0);
 
11814	rc = 0;
11815out:
11816	if (rc < 0) {
11817		for_each_possible_cpu(i) {
11818			struct page_pool *pp_ptr;
11819
11820			pp_ptr = per_cpu(system_page_pool, i);
11821			if (!pp_ptr)
11822				continue;
11823
11824			page_pool_destroy(pp_ptr);
11825			per_cpu(system_page_pool, i) = NULL;
11826		}
11827	}
11828
11829	return rc;
11830}
11831
11832subsys_initcall(net_dev_init);
v4.10.11
 
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
 
  84#include <linux/mutex.h>
 
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
 
  97#include <linux/bpf.h>
 
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <net/busy_poll.h>
 101#include <linux/rtnetlink.h>
 102#include <linux/stat.h>
 
 103#include <net/dst.h>
 104#include <net/dst_metadata.h>
 
 105#include <net/pkt_sched.h>
 
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/iw_handler.h>
 115#include <asm/current.h>
 116#include <linux/audit.h>
 117#include <linux/dmaengine.h>
 118#include <linux/err.h>
 119#include <linux/ctype.h>
 120#include <linux/if_arp.h>
 121#include <linux/if_vlan.h>
 122#include <linux/ip.h>
 123#include <net/ip.h>
 124#include <net/mpls.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/static_key.h>
 136#include <linux/hashtable.h>
 137#include <linux/vmalloc.h>
 138#include <linux/if_macvlan.h>
 139#include <linux/errqueue.h>
 140#include <linux/hrtimer.h>
 141#include <linux/netfilter_ingress.h>
 142#include <linux/crash_dump.h>
 
 
 
 
 
 
 
 
 
 
 
 
 143
 
 144#include "net-sysfs.h"
 145
 146/* Instead of increasing this, you should create a hash table. */
 147#define MAX_GRO_SKBS 8
 148
 149/* This should be increased if a protocol with a bigger head is added. */
 150#define GRO_MAX_HEAD (MAX_HEADER + 128)
 151
 152static DEFINE_SPINLOCK(ptype_lock);
 153static DEFINE_SPINLOCK(offload_lock);
 154struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 155struct list_head ptype_all __read_mostly;	/* Taps */
 156static struct list_head offload_base __read_mostly;
 157
 158static int netif_rx_internal(struct sk_buff *skb);
 159static int call_netdevice_notifiers_info(unsigned long val,
 160					 struct net_device *dev,
 161					 struct netdev_notifier_info *info);
 162
 163/*
 164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 165 * semaphore.
 166 *
 167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 168 *
 169 * Writers must hold the rtnl semaphore while they loop through the
 170 * dev_base_head list, and hold dev_base_lock for writing when they do the
 171 * actual updates.  This allows pure readers to access the list even
 172 * while a writer is preparing to update it.
 173 *
 174 * To put it another way, dev_base_lock is held for writing only to
 175 * protect against pure readers; the rtnl semaphore provides the
 176 * protection against other writers.
 177 *
 178 * See, for example usages, register_netdevice() and
 179 * unregister_netdevice(), which must be called with the rtnl
 180 * semaphore held.
 181 */
 182DEFINE_RWLOCK(dev_base_lock);
 183EXPORT_SYMBOL(dev_base_lock);
 184
 185/* protects napi_hash addition/deletion and napi_gen_id */
 186static DEFINE_SPINLOCK(napi_hash_lock);
 187
 188static unsigned int napi_gen_id = NR_CPUS;
 189static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 190
 191static seqcount_t devnet_rename_seq;
 192
 193static inline void dev_base_seq_inc(struct net *net)
 194{
 195	while (++net->dev_base_seq == 0);
 
 
 196}
 197
 198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 199{
 200	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 201
 202	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208}
 209
 /* With CONFIG_RPS, take the lock of @sd's input packet queue; otherwise a no-op. */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct sk_buff_head *backlog = &sd->input_pkt_queue;

	spin_lock(&backlog->lock);
#endif
}
 216
 /* Counterpart of rps_lock(): release the input packet queue lock under CONFIG_RPS. */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct sk_buff_head *backlog = &sd->input_pkt_queue;

	spin_unlock(&backlog->lock);
#endif
}
 223
 224/* Device list insertion */
 225static void list_netdevice(struct net_device *dev)
 226{
 
 227	struct net *net = dev_net(dev);
 228
 229	ASSERT_RTNL();
 230
 231	write_lock_bh(&dev_base_lock);
 232	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234	hlist_add_head_rcu(&dev->index_hlist,
 235			   dev_index_hash(net, dev->ifindex));
 236	write_unlock_bh(&dev_base_lock);
 
 
 
 
 
 237
 238	dev_base_seq_inc(net);
 239}
 240
 241/* Device list removal
 242 * caller must respect a RCU grace period before freeing/reusing dev
 243 */
 244static void unlist_netdevice(struct net_device *dev)
 245{
 
 
 
 246	ASSERT_RTNL();
 247
 
 
 
 
 
 248	/* Unlink dev from the device chain */
 249	write_lock_bh(&dev_base_lock);
 250	list_del_rcu(&dev->dev_list);
 251	hlist_del_rcu(&dev->name_hlist);
 252	hlist_del_rcu(&dev->index_hlist);
 253	write_unlock_bh(&dev_base_lock);
 254
 255	dev_base_seq_inc(dev_net(dev));
 256}
 257
 258/*
 259 *	Our notifier list
 260 */
 261
 262static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264/*
 265 *	Device drivers call our routines to queue packets here. We empty the
 266 *	queue in the local softnet handler.
 267 */
 268
 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 
 
 
 
 
 
 272#ifdef CONFIG_LOCKDEP
 273/*
 274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275 * according to dev->type
 276 */
 277static const unsigned short netdev_lock_type[] =
 278	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 291	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 292	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 293
 294static const char *const netdev_lock_name[] =
 295	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 308	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 309	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 310
 311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315{
 316	int i;
 317
 318	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319		if (netdev_lock_type[i] == dev_type)
 320			return i;
 321	/* the last key is used by default */
 322	return ARRAY_SIZE(netdev_lock_type) - 1;
 323}
 324
 325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326						 unsigned short dev_type)
 327{
 328	int i;
 329
 330	i = netdev_lock_pos(dev_type);
 331	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332				   netdev_lock_name[i]);
 333}
 334
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev->type);
 340	lockdep_set_class_and_name(&dev->addr_list_lock,
 341				   &netdev_addr_lock_key[i],
 342				   netdev_lock_name[i]);
 343}
 344#else
 345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 346						 unsigned short dev_type)
 347{
 348}
 
 349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 350{
 351}
 352#endif
 353
 354/*******************************************************************************
 
 
 
 
 355
 356		Protocol management and registration routines
 357
 358*******************************************************************************/
 359
 360/*
 361 *	Add a protocol ID to the list. Now that the input handler is
 362 *	smarter we can dispense with all the messy stuff that used to be
 363 *	here.
 364 *
 365 *	BEWARE!!! Protocol handlers, mangling input packets,
 366 *	MUST BE last in hash buckets and checking protocol handlers
 367 *	MUST start from promiscuous ptype_all chain in net_bh.
 368 *	It is true now, do not change it.
 369 *	Explanation follows: if protocol handler, mangling packet, will
 370 *	be the first on list, it is not able to sense, that packet
 371 *	is cloned and should be copied-on-write, so that it will
 372 *	change it and subsequent readers will get broken packet.
 373 *							--ANK (980803)
 374 */
 375
 376static inline struct list_head *ptype_head(const struct packet_type *pt)
 377{
 378	if (pt->type == htons(ETH_P_ALL))
 379		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 380	else
 381		return pt->dev ? &pt->dev->ptype_specific :
 382				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383}
 384
 385/**
 386 *	dev_add_pack - add packet handler
 387 *	@pt: packet type declaration
 388 *
 389 *	Add a protocol handler to the networking stack. The passed &packet_type
 390 *	is linked into kernel lists and may not be freed until it has been
 391 *	removed from the kernel lists.
 392 *
 393 *	This call does not sleep therefore it can not
 394 *	guarantee all CPU's that are in middle of receiving packets
 395 *	will see the new packet type (until the next received packet).
 396 */
 397
 398void dev_add_pack(struct packet_type *pt)
 399{
 400	struct list_head *head = ptype_head(pt);
 401
 402	spin_lock(&ptype_lock);
 403	list_add_rcu(&pt->list, head);
 404	spin_unlock(&ptype_lock);
 405}
 406EXPORT_SYMBOL(dev_add_pack);
 407
 408/**
 409 *	__dev_remove_pack	 - remove packet handler
 410 *	@pt: packet type declaration
 411 *
 412 *	Remove a protocol handler that was previously added to the kernel
 413 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414 *	from the kernel lists and can be freed or reused once this function
 415 *	returns.
 416 *
 417 *      The packet type might still be in use by receivers
 418 *	and must not be freed until after all the CPU's have gone
 419 *	through a quiescent state.
 420 */
 421void __dev_remove_pack(struct packet_type *pt)
 422{
 423	struct list_head *head = ptype_head(pt);
 424	struct packet_type *pt1;
 425
 426	spin_lock(&ptype_lock);
 427
 428	list_for_each_entry(pt1, head, list) {
 429		if (pt == pt1) {
 430			list_del_rcu(&pt->list);
 431			goto out;
 432		}
 433	}
 434
 435	pr_warn("dev_remove_pack: %p not found\n", pt);
 436out:
 437	spin_unlock(&ptype_lock);
 438}
 439EXPORT_SYMBOL(__dev_remove_pack);
 440
 441/**
 442 *	dev_remove_pack	 - remove packet handler
 443 *	@pt: packet type declaration
 444 *
 445 *	Remove a protocol handler that was previously added to the kernel
 446 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447 *	from the kernel lists and can be freed or reused once this function
 448 *	returns.
 449 *
 450 *	This call sleeps to guarantee that no CPU is looking at the packet
 451 *	type after return.
 452 */
 453void dev_remove_pack(struct packet_type *pt)
 454{
 455	__dev_remove_pack(pt);
 456
 457	synchronize_net();
 458}
 459EXPORT_SYMBOL(dev_remove_pack);
 460
 461
 462/**
 463 *	dev_add_offload - register offload handlers
 464 *	@po: protocol offload declaration
 465 *
 466 *	Add protocol offload handlers to the networking stack. The passed
 467 *	&proto_offload is linked into kernel lists and may not be freed until
 468 *	it has been removed from the kernel lists.
 469 *
 470 *	This call does not sleep therefore it can not
 471 *	guarantee all CPU's that are in middle of receiving packets
 472 *	will see the new offload handlers (until the next received packet).
 473 */
 474void dev_add_offload(struct packet_offload *po)
 475{
 476	struct packet_offload *elem;
 477
 478	spin_lock(&offload_lock);
 479	list_for_each_entry(elem, &offload_base, list) {
 480		if (po->priority < elem->priority)
 481			break;
 482	}
 483	list_add_rcu(&po->list, elem->list.prev);
 484	spin_unlock(&offload_lock);
 485}
 486EXPORT_SYMBOL(dev_add_offload);
 487
 488/**
 489 *	__dev_remove_offload	 - remove offload handler
 490 *	@po: packet offload declaration
 491 *
 492 *	Remove a protocol offload handler that was previously added to the
 493 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 494 *	is removed from the kernel lists and can be freed or reused once this
 495 *	function returns.
 496 *
 497 *      The packet type might still be in use by receivers
 498 *	and must not be freed until after all the CPU's have gone
 499 *	through a quiescent state.
 500 */
 501static void __dev_remove_offload(struct packet_offload *po)
 502{
 503	struct list_head *head = &offload_base;
 504	struct packet_offload *po1;
 505
 506	spin_lock(&offload_lock);
 507
 508	list_for_each_entry(po1, head, list) {
 509		if (po == po1) {
 510			list_del_rcu(&po->list);
 511			goto out;
 512		}
 513	}
 514
 515	pr_warn("dev_remove_offload: %p not found\n", po);
 516out:
 517	spin_unlock(&offload_lock);
 518}
 519
 520/**
 521 *	dev_remove_offload	 - remove packet offload handler
 522 *	@po: packet offload declaration
 523 *
 524 *	Remove a packet offload handler that was previously added to the kernel
 525 *	offload handlers by dev_add_offload(). The passed &offload_type is
 526 *	removed from the kernel lists and can be freed or reused once this
 527 *	function returns.
 528 *
 529 *	This call sleeps to guarantee that no CPU is looking at the packet
 530 *	type after return.
 531 */
 532void dev_remove_offload(struct packet_offload *po)
 533{
 534	__dev_remove_offload(po);
 535
 536	synchronize_net();
 537}
 538EXPORT_SYMBOL(dev_remove_offload);
 539
 540/******************************************************************************
 541
 542		      Device Boot-time Settings Routines
 543
 544*******************************************************************************/
 545
 546/* Boot time configuration table */
 547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 548
 549/**
 550 *	netdev_boot_setup_add	- add new setup entry
 551 *	@name: name of the device
 552 *	@map: configured settings for the device
 553 *
 554 *	Adds new setup entry to the dev_boot_setup list.  The function
 555 *	returns 0 on error and 1 on success.  This is a generic routine to
 556 *	all netdevices.
 557 */
 558static int netdev_boot_setup_add(char *name, struct ifmap *map)
 559{
 560	struct netdev_boot_setup *s;
 561	int i;
 562
 563	s = dev_boot_setup;
 564	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 566			memset(s[i].name, 0, sizeof(s[i].name));
 567			strlcpy(s[i].name, name, IFNAMSIZ);
 568			memcpy(&s[i].map, map, sizeof(s[i].map));
 569			break;
 570		}
 571	}
 572
 573	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 574}
 575
 576/**
 577 *	netdev_boot_setup_check	- check boot time settings
 578 *	@dev: the netdevice
 579 *
 580 * 	Check boot time settings for the device.
 581 *	The found settings are set for the device to be used
 582 *	later in the device probing.
 583 *	Returns 0 if no settings found, 1 if they are.
 584 */
 585int netdev_boot_setup_check(struct net_device *dev)
 586{
 587	struct netdev_boot_setup *s = dev_boot_setup;
 588	int i;
 589
 590	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 591		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 592		    !strcmp(dev->name, s[i].name)) {
 593			dev->irq 	= s[i].map.irq;
 594			dev->base_addr 	= s[i].map.base_addr;
 595			dev->mem_start 	= s[i].map.mem_start;
 596			dev->mem_end 	= s[i].map.mem_end;
 597			return 1;
 598		}
 599	}
 600	return 0;
 601}
 602EXPORT_SYMBOL(netdev_boot_setup_check);
 603
 604
 605/**
 606 *	netdev_boot_base	- get address from boot time settings
 607 *	@prefix: prefix for network device
 608 *	@unit: id for network device
 609 *
 610 * 	Check boot time settings for the base address of device.
 611 *	The found settings are set for the device to be used
 612 *	later in the device probing.
 613 *	Returns 0 if no settings found.
 614 */
 615unsigned long netdev_boot_base(const char *prefix, int unit)
 616{
 617	const struct netdev_boot_setup *s = dev_boot_setup;
 618	char name[IFNAMSIZ];
 619	int i;
 620
 621	sprintf(name, "%s%d", prefix, unit);
 622
 623	/*
 624	 * If device already registered then return base of 1
 625	 * to indicate not to probe for this interface
 626	 */
 627	if (__dev_get_by_name(&init_net, name))
 628		return 1;
 629
 630	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 631		if (!strcmp(name, s[i].name))
 632			return s[i].map.base_addr;
 633	return 0;
 634}
 635
 636/*
 637 * Saves at boot time configured settings for any netdevice.
 638 */
 639int __init netdev_boot_setup(char *str)
 640{
 641	int ints[5];
 642	struct ifmap map;
 643
 644	str = get_options(str, ARRAY_SIZE(ints), ints);
 645	if (!str || !*str)
 646		return 0;
 647
 648	/* Save settings */
 649	memset(&map, 0, sizeof(map));
 650	if (ints[0] > 0)
 651		map.irq = ints[1];
 652	if (ints[0] > 1)
 653		map.base_addr = ints[2];
 654	if (ints[0] > 2)
 655		map.mem_start = ints[3];
 656	if (ints[0] > 3)
 657		map.mem_end = ints[4];
 658
 659	/* Add new entry to the list */
 660	return netdev_boot_setup_add(str, &map);
 661}
 662
 663__setup("netdev=", netdev_boot_setup);
 664
 665/*******************************************************************************
 666
 667			    Device Interface Subroutines
 668
 669*******************************************************************************/
 670
 671/**
 672 *	dev_get_iflink	- get 'iflink' value of a interface
 673 *	@dev: targeted interface
 674 *
 675 *	Indicates the ifindex the interface is linked to.
 676 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 677 */
 678
 679int dev_get_iflink(const struct net_device *dev)
 680{
 681	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 682		return dev->netdev_ops->ndo_get_iflink(dev);
 683
 684	return dev->ifindex;
 685}
 686EXPORT_SYMBOL(dev_get_iflink);
 687
 688/**
 689 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 690 *	@dev: targeted interface
 691 *	@skb: The packet.
 692 *
 693 *	For better visibility of tunnel traffic OVS needs to retrieve
 694 *	egress tunnel information for a packet. Following API allows
 695 *	user to get this info.
 696 */
 697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 698{
 699	struct ip_tunnel_info *info;
 700
 701	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 702		return -EINVAL;
 703
 704	info = skb_tunnel_info_unclone(skb);
 705	if (!info)
 706		return -ENOMEM;
 707	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 708		return -EINVAL;
 709
 710	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 711}
 712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 714/**
 715 *	__dev_get_by_name	- find a device by its name
 716 *	@net: the applicable net namespace
 717 *	@name: name to find
 718 *
 719 *	Find an interface by name. Must be called under RTNL semaphore
 720 *	or @dev_base_lock. If the name is found a pointer to the device
 721 *	is returned. If the name is not found then %NULL is returned. The
 722 *	reference counters are not incremented so the caller must be
 723 *	careful with locks.
 724 */
 725
 726struct net_device *__dev_get_by_name(struct net *net, const char *name)
 727{
 728	struct net_device *dev;
 729	struct hlist_head *head = dev_name_hash(net, name);
 730
 731	hlist_for_each_entry(dev, head, name_hlist)
 732		if (!strncmp(dev->name, name, IFNAMSIZ))
 733			return dev;
 734
 735	return NULL;
 736}
 737EXPORT_SYMBOL(__dev_get_by_name);
 738
 739/**
 740 *	dev_get_by_name_rcu	- find a device by its name
 741 *	@net: the applicable net namespace
 742 *	@name: name to find
 743 *
 744 *	Find an interface by name.
 745 *	If the name is found a pointer to the device is returned.
 746 * 	If the name is not found then %NULL is returned.
 747 *	The reference counters are not incremented so the caller must be
 748 *	careful with locks. The caller must hold RCU lock.
 749 */
 750
 751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 752{
 
 
 
 
 
 
 
 
 
 
 753	struct net_device *dev;
 754	struct hlist_head *head = dev_name_hash(net, name);
 755
 756	hlist_for_each_entry_rcu(dev, head, name_hlist)
 757		if (!strncmp(dev->name, name, IFNAMSIZ))
 758			return dev;
 759
 760	return NULL;
 761}
 762EXPORT_SYMBOL(dev_get_by_name_rcu);
 763
 764/**
 765 *	dev_get_by_name		- find a device by its name
 766 *	@net: the applicable net namespace
 767 *	@name: name to find
 
 
 768 *
 769 *	Find an interface by name. This can be called from any
 770 *	context and does its own locking. The returned handle has
 771 *	the usage count incremented and the caller must use dev_put() to
 772 *	release it when it is no longer needed. %NULL is returned if no
 773 *	matching device is found.
 774 */
 775
 776struct net_device *dev_get_by_name(struct net *net, const char *name)
 777{
 778	struct net_device *dev;
 779
 780	rcu_read_lock();
 781	dev = dev_get_by_name_rcu(net, name);
 782	if (dev)
 783		dev_hold(dev);
 784	rcu_read_unlock();
 785	return dev;
 786}
 787EXPORT_SYMBOL(dev_get_by_name);
 788
 789/**
 790 *	__dev_get_by_index - find a device by its ifindex
 791 *	@net: the applicable net namespace
 792 *	@ifindex: index of device
 793 *
 794 *	Search for an interface by index. Returns %NULL if the device
 795 *	is not found or a pointer to the device. The device has not
 796 *	had its reference counter increased so the caller must be careful
 797 *	about locking. The caller must hold either the RTNL semaphore
 798 *	or @dev_base_lock.
 799 */
 800
 801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 802{
 803	struct net_device *dev;
 804	struct hlist_head *head = dev_index_hash(net, ifindex);
 805
 806	hlist_for_each_entry(dev, head, index_hlist)
 807		if (dev->ifindex == ifindex)
 808			return dev;
 809
 810	return NULL;
 811}
 812EXPORT_SYMBOL(__dev_get_by_index);
 813
 814/**
 815 *	dev_get_by_index_rcu - find a device by its ifindex
 816 *	@net: the applicable net namespace
 817 *	@ifindex: index of device
 818 *
 819 *	Search for an interface by index. Returns %NULL if the device
 820 *	is not found or a pointer to the device. The device has not
 821 *	had its reference counter increased so the caller must be careful
 822 *	about locking. The caller must hold RCU lock.
 823 */
 824
 825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 826{
 827	struct net_device *dev;
 828	struct hlist_head *head = dev_index_hash(net, ifindex);
 829
 830	hlist_for_each_entry_rcu(dev, head, index_hlist)
 831		if (dev->ifindex == ifindex)
 832			return dev;
 833
 834	return NULL;
 835}
 836EXPORT_SYMBOL(dev_get_by_index_rcu);
 837
 
 
 
 
 
 
 
 
 
 
 
 
 838
 839/**
 840 *	dev_get_by_index - find a device by its ifindex
 841 *	@net: the applicable net namespace
 842 *	@ifindex: index of device
 
 
 843 *
 844 *	Search for an interface by index. Returns NULL if the device
 845 *	is not found or a pointer to the device. The device returned has
 846 *	had a reference added and the pointer is safe until the user calls
 847 *	dev_put to indicate they have finished with it.
 848 */
 849
 850struct net_device *dev_get_by_index(struct net *net, int ifindex)
 851{
 852	struct net_device *dev;
 853
 854	rcu_read_lock();
 855	dev = dev_get_by_index_rcu(net, ifindex);
 856	if (dev)
 857		dev_hold(dev);
 858	rcu_read_unlock();
 859	return dev;
 860}
 861EXPORT_SYMBOL(dev_get_by_index);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 862
 863/**
 864 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 865 *	@net: network namespace
 866 *	@name: a pointer to the buffer where the name will be stored.
 867 *	@ifindex: the ifindex of the interface to get the name from.
 868 *
 869 *	The use of raw_seqcount_begin() and cond_resched() before
 870 *	retrying is required as we want to give the writers a chance
 871 *	to complete when CONFIG_PREEMPT is not set.
 872 */
 873int netdev_get_name(struct net *net, char *name, int ifindex)
 874{
 875	struct net_device *dev;
 876	unsigned int seq;
 877
 878retry:
 879	seq = raw_seqcount_begin(&devnet_rename_seq);
 880	rcu_read_lock();
 
 881	dev = dev_get_by_index_rcu(net, ifindex);
 882	if (!dev) {
 883		rcu_read_unlock();
 884		return -ENODEV;
 885	}
 886
 887	strcpy(name, dev->name);
 
 
 
 888	rcu_read_unlock();
 889	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 890		cond_resched();
 891		goto retry;
 892	}
 893
 894	return 0;
 895}
 896
 897/**
 898 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 899 *	@net: the applicable net namespace
 900 *	@type: media type of device
 901 *	@ha: hardware address
 902 *
 903 *	Search for an interface by MAC address. Returns NULL if the device
 904 *	is not found or a pointer to the device.
 905 *	The caller must hold RCU or RTNL.
 906 *	The returned device has not had its ref count increased
 907 *	and the caller must therefore be careful about locking
 908 *
 909 */
 910
 911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 912				       const char *ha)
 913{
 914	struct net_device *dev;
 915
 916	for_each_netdev_rcu(net, dev)
 917		if (dev->type == type &&
 918		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 919			return dev;
 920
 921	return NULL;
 922}
 923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 924
 925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 926{
 927	struct net_device *dev;
 928
 929	ASSERT_RTNL();
 930	for_each_netdev(net, dev)
 931		if (dev->type == type)
 932			return dev;
 933
 934	return NULL;
 935}
 936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 937
 938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 939{
 940	struct net_device *dev, *ret = NULL;
 941
 942	rcu_read_lock();
 943	for_each_netdev_rcu(net, dev)
 944		if (dev->type == type) {
 945			dev_hold(dev);
 946			ret = dev;
 947			break;
 948		}
 949	rcu_read_unlock();
 950	return ret;
 951}
 952EXPORT_SYMBOL(dev_getfirstbyhwtype);
 953
 954/**
 955 *	__dev_get_by_flags - find any device with given flags
 956 *	@net: the applicable net namespace
 957 *	@if_flags: IFF_* values
 958 *	@mask: bitmask of bits in if_flags to check
 959 *
 960 *	Search for any interface with the given flags. Returns NULL if a device
 961 *	is not found or a pointer to the device. Must be called inside
 962 *	rtnl_lock(), and result refcount is unchanged.
 963 */
 964
 965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 966				      unsigned short mask)
 967{
 968	struct net_device *dev, *ret;
 969
 970	ASSERT_RTNL();
 971
 972	ret = NULL;
 973	for_each_netdev(net, dev) {
 974		if (((dev->flags ^ if_flags) & mask) == 0) {
 975			ret = dev;
 976			break;
 977		}
 978	}
 979	return ret;
 980}
 981EXPORT_SYMBOL(__dev_get_by_flags);
 982
 983/**
 984 *	dev_valid_name - check if name is okay for network device
 985 *	@name: name string
 986 *
 987 *	Network device names need to be valid file names to
 988 *	to allow sysfs to work.  We also disallow any kind of
 989 *	whitespace.
 990 */
 991bool dev_valid_name(const char *name)
 992{
 993	if (*name == '\0')
 994		return false;
 995	if (strlen(name) >= IFNAMSIZ)
 996		return false;
 997	if (!strcmp(name, ".") || !strcmp(name, ".."))
 998		return false;
 999
1000	while (*name) {
1001		if (*name == '/' || *name == ':' || isspace(*name))
1002			return false;
1003		name++;
1004	}
1005	return true;
1006}
1007EXPORT_SYMBOL(dev_valid_name);
1008
1009/**
1010 *	__dev_alloc_name - allocate a name for a device
1011 *	@net: network namespace to allocate the device name in
1012 *	@name: name format string
1013 *	@buf:  scratch buffer and result name string
1014 *
1015 *	Passed a format string - eg "lt%d" it will try and find a suitable
1016 *	id. It scans list of devices to build up a free map, then chooses
1017 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018 *	while allocating the name and adding the device in order to avoid
1019 *	duplicates.
1020 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 *	Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025{
1026	int i = 0;
1027	const char *p;
1028	const int max_netdevices = 8*PAGE_SIZE;
1029	unsigned long *inuse;
1030	struct net_device *d;
 
 
 
 
 
 
 
 
1031
1032	p = strnchr(name, IFNAMSIZ-1, '%');
1033	if (p) {
1034		/*
1035		 * Verify the string as this thing may have come from
1036		 * the user.  There must be either one "%d" and no other "%"
1037		 * characters.
1038		 */
1039		if (p[1] != 'd' || strchr(p + 2, '%'))
1040			return -EINVAL;
1041
1042		/* Use one page as a bit array of possible slots */
1043		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044		if (!inuse)
1045			return -ENOMEM;
1046
1047		for_each_netdev(net, d) {
1048			if (!sscanf(d->name, name, &i))
1049				continue;
1050			if (i < 0 || i >= max_netdevices)
1051				continue;
1052
1053			/*  avoid cases where sscanf is not exact inverse of printf */
1054			snprintf(buf, IFNAMSIZ, name, i);
1055			if (!strncmp(buf, d->name, IFNAMSIZ))
1056				set_bit(i, inuse);
1057		}
 
 
 
 
1058
1059		i = find_first_zero_bit(inuse, max_netdevices);
1060		free_page((unsigned long) inuse);
 
 
1061	}
1062
1063	if (buf != name)
1064		snprintf(buf, IFNAMSIZ, name, i);
1065	if (!__dev_get_by_name(net, buf))
1066		return i;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1067
1068	/* It is possible to run out of possible slots
1069	 * when the name is long and there isn't enough space left
1070	 * for the digits, or if all bits are used.
1071	 */
1072	return -ENFILE;
1073}
1074
1075/**
1076 *	dev_alloc_name - allocate a name for a device
1077 *	@dev: device
1078 *	@name: name format string
1079 *
1080 *	Passed a format string - eg "lt%d" it will try and find a suitable
1081 *	id. It scans list of devices to build up a free map, then chooses
1082 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083 *	while allocating the name and adding the device in order to avoid
1084 *	duplicates.
1085 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 *	Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091	char buf[IFNAMSIZ];
1092	struct net *net;
1093	int ret;
1094
1095	BUG_ON(!dev_net(dev));
1096	net = dev_net(dev);
1097	ret = __dev_alloc_name(net, name, buf);
1098	if (ret >= 0)
1099		strlcpy(dev->name, buf, IFNAMSIZ);
1100	return ret;
1101}
1102EXPORT_SYMBOL(dev_alloc_name);
1103
1104static int dev_alloc_name_ns(struct net *net,
1105			     struct net_device *dev,
1106			     const char *name)
1107{
1108	char buf[IFNAMSIZ];
1109	int ret;
1110
1111	ret = __dev_alloc_name(net, name, buf);
1112	if (ret >= 0)
1113		strlcpy(dev->name, buf, IFNAMSIZ);
1114	return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118			      struct net_device *dev,
1119			      const char *name)
1120{
1121	BUG_ON(!net);
1122
1123	if (!dev_valid_name(name))
1124		return -EINVAL;
1125
1126	if (strchr(name, '%'))
1127		return dev_alloc_name_ns(net, dev, name);
1128	else if (__dev_get_by_name(net, name))
1129		return -EEXIST;
1130	else if (dev->name != name)
1131		strlcpy(dev->name, name, IFNAMSIZ);
1132
1133	return 0;
1134}
1135
1136/**
1137 *	dev_change_name - change name of a device
1138 *	@dev: device
1139 *	@newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141 *	Change name of a device, can pass format strings "eth%d".
1142 *	for wildcarding.
1143 */
1144int dev_change_name(struct net_device *dev, const char *newname)
1145{
1146	unsigned char old_assign_type;
1147	char oldname[IFNAMSIZ];
1148	int err = 0;
1149	int ret;
1150	struct net *net;
1151
1152	ASSERT_RTNL();
1153	BUG_ON(!dev_net(dev));
1154
1155	net = dev_net(dev);
1156	if (dev->flags & IFF_UP)
1157		return -EBUSY;
1158
1159	write_seqcount_begin(&devnet_rename_seq);
1160
1161	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162		write_seqcount_end(&devnet_rename_seq);
1163		return 0;
1164	}
1165
1166	memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168	err = dev_get_valid_name(net, dev, newname);
1169	if (err < 0) {
1170		write_seqcount_end(&devnet_rename_seq);
1171		return err;
1172	}
1173
1174	if (oldname[0] && !strchr(oldname, '%'))
1175		netdev_info(dev, "renamed from %s\n", oldname);
 
1176
1177	old_assign_type = dev->name_assign_type;
1178	dev->name_assign_type = NET_NAME_RENAMED;
1179
1180rollback:
1181	ret = device_rename(&dev->dev, dev->name);
1182	if (ret) {
1183		memcpy(dev->name, oldname, IFNAMSIZ);
1184		dev->name_assign_type = old_assign_type;
1185		write_seqcount_end(&devnet_rename_seq);
1186		return ret;
1187	}
1188
1189	write_seqcount_end(&devnet_rename_seq);
1190
1191	netdev_adjacent_rename_links(dev, oldname);
1192
1193	write_lock_bh(&dev_base_lock);
1194	hlist_del_rcu(&dev->name_hlist);
1195	write_unlock_bh(&dev_base_lock);
1196
1197	synchronize_rcu();
1198
1199	write_lock_bh(&dev_base_lock);
1200	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201	write_unlock_bh(&dev_base_lock);
1202
1203	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204	ret = notifier_to_errno(ret);
1205
1206	if (ret) {
1207		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208		if (err >= 0) {
1209			err = ret;
1210			write_seqcount_begin(&devnet_rename_seq);
1211			memcpy(dev->name, oldname, IFNAMSIZ);
1212			memcpy(oldname, newname, IFNAMSIZ);
1213			dev->name_assign_type = old_assign_type;
1214			old_assign_type = NET_NAME_RENAMED;
1215			goto rollback;
1216		} else {
1217			pr_err("%s: name change rollback failed: %d\n",
1218			       dev->name, ret);
1219		}
1220	}
1221
1222	return err;
1223}
1224
1225/**
1226 *	dev_set_alias - change ifalias of a device
1227 *	@dev: device
1228 *	@alias: name up to IFALIASZ
1229 *	@len: limit of bytes to copy from info
1230 *
1231 *	Set ifalias for a device,
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
1235	char *new_ifalias;
1236
1237	ASSERT_RTNL();
1238
1239	if (len >= IFALIASZ)
1240		return -EINVAL;
1241
1242	if (!len) {
1243		kfree(dev->ifalias);
1244		dev->ifalias = NULL;
1245		return 0;
 
 
 
1246	}
1247
1248	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249	if (!new_ifalias)
1250		return -ENOMEM;
1251	dev->ifalias = new_ifalias;
 
 
 
1252
1253	strlcpy(dev->ifalias, alias, len+1);
1254	return len;
1255}
 
1256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1257
1258/**
1259 *	netdev_features_change - device changes features
1260 *	@dev: device to cause notification
1261 *
1262 *	Called to indicate a device has changed features.
1263 */
1264void netdev_features_change(struct net_device *dev)
1265{
1266	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267}
1268EXPORT_SYMBOL(netdev_features_change);
1269
1270/**
1271 *	netdev_state_change - device changes state
1272 *	@dev: device to cause notification
1273 *
1274 *	Called to indicate a device has changed state. This function calls
1275 *	the notifier chains for netdev_chain and sends a NEWLINK message
1276 *	to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280	if (dev->flags & IFF_UP) {
1281		struct netdev_notifier_change_info change_info;
 
 
1282
1283		change_info.flags_changed = 0;
1284		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285					      &change_info.info);
1286		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287	}
1288}
1289EXPORT_SYMBOL(netdev_state_change);
1290
1291/**
1292 * 	netdev_notify_peers - notify network peers about existence of @dev
1293 * 	@dev: network device
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301void netdev_notify_peers(struct net_device *dev)
1302{
1303	rtnl_lock();
1304	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305	rtnl_unlock();
1306}
1307EXPORT_SYMBOL(netdev_notify_peers);
1308
1309static int __dev_open(struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1310{
1311	const struct net_device_ops *ops = dev->netdev_ops;
1312	int ret;
1313
1314	ASSERT_RTNL();
 
1315
1316	if (!netif_device_present(dev))
1317		return -ENODEV;
 
 
 
 
 
1318
1319	/* Block netpoll from trying to do any rx path servicing.
1320	 * If we don't do this there is a chance ndo_poll_controller
1321	 * or ndo_poll may be running while we open the device
1322	 */
1323	netpoll_poll_disable(dev);
1324
1325	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326	ret = notifier_to_errno(ret);
1327	if (ret)
1328		return ret;
1329
1330	set_bit(__LINK_STATE_START, &dev->state);
1331
1332	if (ops->ndo_validate_addr)
1333		ret = ops->ndo_validate_addr(dev);
1334
1335	if (!ret && ops->ndo_open)
1336		ret = ops->ndo_open(dev);
1337
1338	netpoll_poll_enable(dev);
1339
1340	if (ret)
1341		clear_bit(__LINK_STATE_START, &dev->state);
1342	else {
1343		dev->flags |= IFF_UP;
1344		dev_set_rx_mode(dev);
1345		dev_activate(dev);
1346		add_device_randomness(dev->dev_addr, dev->addr_len);
1347	}
1348
1349	return ret;
1350}
1351
1352/**
1353 *	dev_open	- prepare an interface for use.
1354 *	@dev:	device to open
 
1355 *
1356 *	Takes a device from down to up state. The device's private open
1357 *	function is invoked and then the multicast lists are loaded. Finally
1358 *	the device is moved into the up state and a %NETDEV_UP message is
1359 *	sent to the netdev notifier chain.
1360 *
1361 *	Calling this function on an active interface is a nop. On a failure
1362 *	a negative errno code is returned.
1363 */
1364int dev_open(struct net_device *dev)
1365{
1366	int ret;
1367
1368	if (dev->flags & IFF_UP)
1369		return 0;
1370
1371	ret = __dev_open(dev);
1372	if (ret < 0)
1373		return ret;
1374
1375	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376	call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378	return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
1381
1382static int __dev_close_many(struct list_head *head)
1383{
1384	struct net_device *dev;
1385
1386	ASSERT_RTNL();
1387	might_sleep();
1388
1389	list_for_each_entry(dev, head, close_list) {
1390		/* Temporarily disable netpoll until the interface is down */
1391		netpoll_poll_disable(dev);
1392
1393		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395		clear_bit(__LINK_STATE_START, &dev->state);
1396
1397		/* Synchronize to scheduled poll. We cannot touch poll list, it
1398		 * can be even on different cpu. So just clear netif_running().
1399		 *
1400		 * dev->stop() will invoke napi_disable() on all of it's
1401		 * napi_struct instances on this device.
1402		 */
1403		smp_mb__after_atomic(); /* Commit netif_running(). */
1404	}
1405
1406	dev_deactivate_many(head);
1407
1408	list_for_each_entry(dev, head, close_list) {
1409		const struct net_device_ops *ops = dev->netdev_ops;
1410
1411		/*
1412		 *	Call the device specific close. This cannot fail.
1413		 *	Only if device is UP
1414		 *
1415		 *	We allow it to be called even after a DETACH hot-plug
1416		 *	event.
1417		 */
1418		if (ops->ndo_stop)
1419			ops->ndo_stop(dev);
1420
1421		dev->flags &= ~IFF_UP;
1422		netpoll_poll_enable(dev);
1423	}
1424
1425	return 0;
1426}
1427
1428static int __dev_close(struct net_device *dev)
1429{
1430	int retval;
1431	LIST_HEAD(single);
1432
1433	list_add(&dev->close_list, &single);
1434	retval = __dev_close_many(&single);
1435	list_del(&single);
1436
1437	return retval;
1438}
1439
1440int dev_close_many(struct list_head *head, bool unlink)
1441{
1442	struct net_device *dev, *tmp;
1443
1444	/* Remove the devices that don't need to be closed */
1445	list_for_each_entry_safe(dev, tmp, head, close_list)
1446		if (!(dev->flags & IFF_UP))
1447			list_del_init(&dev->close_list);
1448
1449	__dev_close_many(head);
1450
1451	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454		if (unlink)
1455			list_del_init(&dev->close_list);
1456	}
1457
1458	return 0;
1459}
1460EXPORT_SYMBOL(dev_close_many);
1461
1462/**
1463 *	dev_close - shutdown an interface.
1464 *	@dev: device to shutdown
1465 *
1466 *	This function moves an active device into down state. A
1467 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 *	chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
1473	if (dev->flags & IFF_UP) {
1474		LIST_HEAD(single);
1475
1476		list_add(&dev->close_list, &single);
1477		dev_close_many(&single, true);
1478		list_del(&single);
1479	}
1480	return 0;
1481}
1482EXPORT_SYMBOL(dev_close);
1483
1484
1485/**
1486 *	dev_disable_lro - disable Large Receive Offload on a device
1487 *	@dev: device
1488 *
1489 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490 *	called under RTNL.  This is needed if received packets may be
1491 *	forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
1495	struct net_device *lower_dev;
1496	struct list_head *iter;
1497
1498	dev->wanted_features &= ~NETIF_F_LRO;
1499	netdev_update_features(dev);
1500
1501	if (unlikely(dev->features & NETIF_F_LRO))
1502		netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505		dev_disable_lro(lower_dev);
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1509static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510				   struct net_device *dev)
1511{
1512	struct netdev_notifier_info info;
 
 
1513
1514	netdev_notifier_info_init(&info, dev);
1515	return nb->notifier_call(nb, val, &info);
1516}
1517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1518static int dev_boot_phase = 1;
1519
1520/**
1521 *	register_netdevice_notifier - register a network notifier block
1522 *	@nb: notifier
1523 *
1524 *	Register a notifier to be called when network device events occur.
1525 *	The notifier passed is linked into the kernel structures and must
1526 *	not be reused until it has been unregistered. A negative errno code
1527 *	is returned on a failure.
1528 *
1529 * 	When registered all registration and up events are replayed
1530 *	to the new notifier to allow device to have a race free
1531 *	view of the network device list.
1532 */
1533
1534int register_netdevice_notifier(struct notifier_block *nb)
1535{
1536	struct net_device *dev;
1537	struct net_device *last;
1538	struct net *net;
1539	int err;
1540
 
 
1541	rtnl_lock();
1542	err = raw_notifier_chain_register(&netdev_chain, nb);
1543	if (err)
1544		goto unlock;
1545	if (dev_boot_phase)
1546		goto unlock;
1547	for_each_net(net) {
1548		for_each_netdev(net, dev) {
1549			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550			err = notifier_to_errno(err);
1551			if (err)
1552				goto rollback;
1553
1554			if (!(dev->flags & IFF_UP))
1555				continue;
1556
1557			call_netdevice_notifier(nb, NETDEV_UP, dev);
1558		}
1559	}
1560
1561unlock:
1562	rtnl_unlock();
 
1563	return err;
1564
1565rollback:
1566	last = dev;
1567	for_each_net(net) {
1568		for_each_netdev(net, dev) {
1569			if (dev == last)
1570				goto outroll;
1571
1572			if (dev->flags & IFF_UP) {
1573				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574							dev);
1575				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576			}
1577			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578		}
1579	}
1580
1581outroll:
1582	raw_notifier_chain_unregister(&netdev_chain, nb);
1583	goto unlock;
1584}
1585EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587/**
1588 *	unregister_netdevice_notifier - unregister a network notifier block
1589 *	@nb: notifier
1590 *
1591 *	Unregister a notifier previously registered by
1592 *	register_netdevice_notifier(). The notifier is unlinked into the
1593 *	kernel structures and may then be reused. A negative errno code
1594 *	is returned on a failure.
1595 *
1596 * 	After unregistering unregister and down device events are synthesized
1597 *	for all devices on the device list to the removed notifier to remove
1598 *	the need for special case cleanup code.
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
1603	struct net_device *dev;
1604	struct net *net;
1605	int err;
1606
 
 
1607	rtnl_lock();
1608	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609	if (err)
1610		goto unlock;
1611
1612	for_each_net(net) {
1613		for_each_netdev(net, dev) {
1614			if (dev->flags & IFF_UP) {
1615				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616							dev);
1617				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618			}
1619			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620		}
1621	}
1622unlock:
1623	rtnl_unlock();
 
1624	return err;
1625}
1626EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1628/**
1629 *	call_netdevice_notifiers_info - call all network notifier blocks
1630 *	@val: value passed unmodified to notifier function
1631 *	@dev: net_device pointer passed unmodified to notifier function
1632 *	@info: notifier information data
1633 *
1634 *	Call all network notifier blocks.  Parameters and return value
1635 *	are as for raw_notifier_call_chain().
1636 */
1637
1638static int call_netdevice_notifiers_info(unsigned long val,
1639					 struct net_device *dev,
1640					 struct netdev_notifier_info *info)
1641{
 
 
 
1642	ASSERT_RTNL();
1643	netdev_notifier_info_init(info, dev);
 
 
 
 
 
 
 
1644	return raw_notifier_call_chain(&netdev_chain, val, info);
1645}
1646
1647/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1648 *	call_netdevice_notifiers - call all network notifier blocks
1649 *      @val: value passed unmodified to notifier function
1650 *      @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 *	Call all network notifier blocks.  Parameters and return value
1653 *	are as for raw_notifier_call_chain().
1654 */
1655
1656int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657{
1658	struct netdev_notifier_info info;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1659
1660	return call_netdevice_notifiers_info(val, dev, &info);
 
 
1661}
1662EXPORT_SYMBOL(call_netdevice_notifiers);
1663
#ifdef CONFIG_NET_INGRESS
/* Static key counting the users registered through the helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Take one reference on the ingress_needed static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on the ingress_needed static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1679
#ifdef CONFIG_NET_EGRESS
/* Static key counting the users registered through the helpers below. */
static struct static_key egress_needed __read_mostly;

/* Take one reference on the egress_needed static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on the egress_needed static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1695
1696static struct static_key netstamp_needed __read_mostly;
1697#ifdef HAVE_JUMP_LABEL
 
1698static atomic_t netstamp_needed_deferred;
1699static atomic_t netstamp_wanted;
1700static void netstamp_clear(struct work_struct *work)
1701{
1702	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1703	int wanted;
1704
1705	wanted = atomic_add_return(deferred, &netstamp_wanted);
1706	if (wanted > 0)
1707		static_key_enable(&netstamp_needed);
1708	else
1709		static_key_disable(&netstamp_needed);
1710}
1711static DECLARE_WORK(netstamp_work, netstamp_clear);
1712#endif
1713
1714void net_enable_timestamp(void)
1715{
1716#ifdef HAVE_JUMP_LABEL
1717	int wanted;
1718
1719	while (1) {
1720		wanted = atomic_read(&netstamp_wanted);
1721		if (wanted <= 0)
1722			break;
1723		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1724			return;
1725	}
1726	atomic_inc(&netstamp_needed_deferred);
1727	schedule_work(&netstamp_work);
1728#else
1729	static_key_slow_inc(&netstamp_needed);
1730#endif
1731}
1732EXPORT_SYMBOL(net_enable_timestamp);
1733
1734void net_disable_timestamp(void)
1735{
1736#ifdef HAVE_JUMP_LABEL
1737	int wanted;
1738
1739	while (1) {
1740		wanted = atomic_read(&netstamp_wanted);
1741		if (wanted <= 1)
1742			break;
1743		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1744			return;
1745	}
1746	atomic_dec(&netstamp_needed_deferred);
1747	schedule_work(&netstamp_work);
1748#else
1749	static_key_slow_dec(&netstamp_needed);
1750#endif
1751}
1752EXPORT_SYMBOL(net_disable_timestamp);
1753
1754static inline void net_timestamp_set(struct sk_buff *skb)
1755{
1756	skb->tstamp = 0;
1757	if (static_key_false(&netstamp_needed))
1758		__net_timestamp(skb);
 
1759}
1760
/*
 * Stamp SKB through __net_timestamp() when the netstamp_needed key is on,
 * COND holds and the buffer carries no timestamp yet.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement: it cannot swallow an "else" that follows the call site and
 * it no longer ends in a dangling line continuation.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp)		\
				__net_timestamp(SKB);		\
		}						\
	} while (0)

1767bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1768{
1769	unsigned int len;
1770
1771	if (!(dev->flags & IFF_UP))
1772		return false;
1773
1774	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1775	if (skb->len <= len)
1776		return true;
1777
1778	/* if TSO is enabled, we don't care about the length as the packet
1779	 * could be forwarded without being segmented before
1780	 */
1781	if (skb_is_gso(skb))
1782		return true;
1783
1784	return false;
1785}
1786EXPORT_SYMBOL_GPL(is_skb_forwardable);
1787
1788int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 
1789{
1790	int ret = ____dev_forward_skb(dev, skb);
1791
1792	if (likely(!ret)) {
1793		skb->protocol = eth_type_trans(skb, dev);
1794		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1795	}
1796
1797	return ret;
1798}
 
 
 
 
 
1799EXPORT_SYMBOL_GPL(__dev_forward_skb);
1800
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The buffer is first prepared by __dev_forward_skb(); a non-zero
 * result from that step is returned as is, otherwise the buffer is
 * passed on to netif_rx_internal().
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1824
 
 
 
 
 
1825static inline int deliver_skb(struct sk_buff *skb,
1826			      struct packet_type *pt_prev,
1827			      struct net_device *orig_dev)
1828{
1829	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1830		return -ENOMEM;
1831	atomic_inc(&skb->users);
1832	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1833}
1834
1835static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1836					  struct packet_type **pt,
1837					  struct net_device *orig_dev,
1838					  __be16 type,
1839					  struct list_head *ptype_list)
1840{
1841	struct packet_type *ptype, *pt_prev = *pt;
1842
1843	list_for_each_entry_rcu(ptype, ptype_list, list) {
1844		if (ptype->type != type)
1845			continue;
1846		if (pt_prev)
1847			deliver_skb(skb, pt_prev, orig_dev);
1848		pt_prev = ptype;
1849	}
1850	*pt = pt_prev;
1851}
1852
1853static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1854{
1855	if (!ptype->af_packet_priv || !skb->sk)
1856		return false;
1857
1858	if (ptype->id_match)
1859		return ptype->id_match(ptype, skb->sk);
1860	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1861		return true;
1862
1863	return false;
1864}
1865
 
 
 
 
 
 
 
 
 
 
 
 
1866/*
1867 *	Support routine. Sends outgoing frames to any network
1868 *	taps currently in use.
1869 */
1870
1871void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1872{
1873	struct packet_type *ptype;
 
1874	struct sk_buff *skb2 = NULL;
1875	struct packet_type *pt_prev = NULL;
1876	struct list_head *ptype_list = &ptype_all;
1877
1878	rcu_read_lock();
1879again:
1880	list_for_each_entry_rcu(ptype, ptype_list, list) {
 
 
 
1881		/* Never send packets back to the socket
1882		 * they originated from - MvS (miquels@drinkel.ow.org)
1883		 */
1884		if (skb_loop_sk(ptype, skb))
1885			continue;
1886
1887		if (pt_prev) {
1888			deliver_skb(skb2, pt_prev, skb->dev);
1889			pt_prev = ptype;
1890			continue;
1891		}
1892
1893		/* need to clone skb, done only once */
1894		skb2 = skb_clone(skb, GFP_ATOMIC);
1895		if (!skb2)
1896			goto out_unlock;
1897
1898		net_timestamp_set(skb2);
1899
1900		/* skb->nh should be correctly
1901		 * set by sender, so that the second statement is
1902		 * just protection against buggy protocols.
1903		 */
1904		skb_reset_mac_header(skb2);
1905
1906		if (skb_network_header(skb2) < skb2->data ||
1907		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1908			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1909					     ntohs(skb2->protocol),
1910					     dev->name);
1911			skb_reset_network_header(skb2);
1912		}
1913
1914		skb2->transport_header = skb2->network_header;
1915		skb2->pkt_type = PACKET_OUTGOING;
1916		pt_prev = ptype;
1917	}
1918
1919	if (ptype_list == &ptype_all) {
1920		ptype_list = &dev->ptype_all;
1921		goto again;
1922	}
1923out_unlock:
1924	if (pt_prev)
1925		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 
 
 
 
1926	rcu_read_unlock();
1927}
1928EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1929
1930/**
1931 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1932 * @dev: Network device
1933 * @txq: number of queues available
1934 *
1935 * If real_num_tx_queues is changed the tc mappings may no longer be
1936 * valid. To resolve this verify the tc mapping remains valid and if
1937 * not NULL the mapping. With no priorities mapping to this
1938 * offset/count pair it will no longer be used. In the worst case TC0
1939 * is invalid nothing can be done so disable priority mappings. If is
1940 * expected that drivers will fix this mapping if they can before
1941 * calling netif_set_real_num_tx_queues.
1942 */
1943static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944{
1945	int i;
1946	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1947
1948	/* If TC0 is invalidated disable TC mapping */
1949	if (tc->offset + tc->count > txq) {
1950		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1951		dev->num_tc = 0;
1952		return;
1953	}
1954
1955	/* Invalidated prio to tc mappings set to TC0 */
1956	for (i = 1; i < TC_BITMASK + 1; i++) {
1957		int q = netdev_get_prio_tc_map(dev, i);
1958
1959		tc = &dev->tc_to_txq[q];
1960		if (tc->offset + tc->count > txq) {
1961			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1962				i, q);
1963			netdev_set_prio_tc_map(dev, i, 0);
1964		}
1965	}
1966}
1967
1968int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1969{
1970	if (dev->num_tc) {
1971		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972		int i;
1973
 
1974		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1975			if ((txq - tc->offset) < tc->count)
1976				return i;
1977		}
1978
 
1979		return -1;
1980	}
1981
1982	return 0;
1983}
 
1984
1985#ifdef CONFIG_XPS
 
 
1986static DEFINE_MUTEX(xps_map_mutex);
1987#define xmap_dereference(P)		\
1988	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1989
1990static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1991			     int tci, u16 index)
1992{
1993	struct xps_map *map = NULL;
1994	int pos;
1995
1996	if (dev_maps)
1997		map = xmap_dereference(dev_maps->cpu_map[tci]);
1998	if (!map)
1999		return false;
2000
2001	for (pos = map->len; pos--;) {
2002		if (map->queues[pos] != index)
2003			continue;
2004
2005		if (map->len > 1) {
2006			map->queues[pos] = map->queues[--map->len];
2007			break;
2008		}
2009
2010		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
 
 
2011		kfree_rcu(map, rcu);
2012		return false;
2013	}
2014
2015	return true;
2016}
2017
2018static bool remove_xps_queue_cpu(struct net_device *dev,
2019				 struct xps_dev_maps *dev_maps,
2020				 int cpu, u16 offset, u16 count)
2021{
2022	int num_tc = dev->num_tc ? : 1;
2023	bool active = false;
2024	int tci;
2025
2026	for (tci = cpu * num_tc; num_tc--; tci++) {
2027		int i, j;
2028
2029		for (i = count, j = offset; i--; j++) {
2030			if (!remove_xps_queue(dev_maps, cpu, j))
2031				break;
2032		}
2033
2034		active |= i < 0;
2035	}
2036
2037	return active;
2038}
2039
2040static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2041				   u16 count)
 
 
 
 
 
 
 
 
 
 
 
 
 
2042{
2043	struct xps_dev_maps *dev_maps;
2044	int cpu, i;
2045	bool active = false;
 
2046
2047	mutex_lock(&xps_map_mutex);
2048	dev_maps = xmap_dereference(dev->xps_maps);
2049
2050	if (!dev_maps)
2051		goto out_no_maps;
2052
2053	for_each_possible_cpu(cpu)
2054		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2055					       offset, count);
2056
2057	if (!active) {
2058		RCU_INIT_POINTER(dev->xps_maps, NULL);
2059		kfree_rcu(dev_maps, rcu);
 
 
2060	}
 
 
 
 
 
 
 
 
 
 
2061
2062	for (i = offset + (count - 1); count--; i--)
2063		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2064					     NUMA_NO_NODE);
 
2065
2066out_no_maps:
2067	mutex_unlock(&xps_map_mutex);
 
2068}
2069
2070static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2071{
2072	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2073}
2074
2075static struct xps_map *expand_xps_map(struct xps_map *map,
2076				      int cpu, u16 index)
2077{
2078	struct xps_map *new_map;
2079	int alloc_len = XPS_MIN_MAP_ALLOC;
2080	int i, pos;
2081
2082	for (pos = 0; map && pos < map->len; pos++) {
2083		if (map->queues[pos] != index)
2084			continue;
2085		return map;
2086	}
2087
2088	/* Need to add queue to this CPU's existing map */
2089	if (map) {
2090		if (pos < map->alloc_len)
2091			return map;
2092
2093		alloc_len = map->alloc_len * 2;
2094	}
2095
2096	/* Need to allocate new map to store queue on this CPU's map */
2097	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2098			       cpu_to_node(cpu));
 
 
 
 
 
2099	if (!new_map)
2100		return NULL;
2101
2102	for (i = 0; i < pos; i++)
2103		new_map->queues[i] = map->queues[i];
2104	new_map->alloc_len = alloc_len;
2105	new_map->len = pos;
2106
2107	return new_map;
2108}
2109
2110int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2111			u16 index)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2112{
2113	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2114	int i, cpu, tci, numa_node_id = -2;
 
 
2115	int maps_sz, num_tc = 1, tc = 0;
2116	struct xps_map *map, *new_map;
2117	bool active = false;
 
 
2118
2119	if (dev->num_tc) {
 
2120		num_tc = dev->num_tc;
 
 
 
 
 
 
2121		tc = netdev_txq_to_tc(dev, index);
2122		if (tc < 0)
2123			return -EINVAL;
2124	}
2125
2126	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
 
 
 
 
 
 
 
 
 
 
 
 
2127	if (maps_sz < L1_CACHE_BYTES)
2128		maps_sz = L1_CACHE_BYTES;
2129
2130	mutex_lock(&xps_map_mutex);
2131
2132	dev_maps = xmap_dereference(dev->xps_maps);
 
 
 
 
 
2133
2134	/* allocate memory for queue storage */
2135	for_each_cpu_and(cpu, cpu_online_mask, mask) {
2136		if (!new_dev_maps)
 
2137			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2138		if (!new_dev_maps) {
2139			mutex_unlock(&xps_map_mutex);
2140			return -ENOMEM;
 
 
 
 
2141		}
2142
2143		tci = cpu * num_tc + tc;
2144		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2145				 NULL;
2146
2147		map = expand_xps_map(map, cpu, index);
2148		if (!map)
2149			goto error;
2150
2151		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2152	}
2153
2154	if (!new_dev_maps)
2155		goto out_no_new_maps;
2156
2157	for_each_possible_cpu(cpu) {
2158		/* copy maps belonging to foreign traffic classes */
2159		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2160			/* fill in the new device map from the old device map */
2161			map = xmap_dereference(dev_maps->cpu_map[tci]);
2162			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163		}
2164
2165		/* We need to explicitly update tci as prevous loop
2166		 * could break out early if dev_maps is NULL.
2167		 */
2168		tci = cpu * num_tc + tc;
 
 
 
 
2169
2170		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2171			/* add queue to CPU maps */
2172			int pos = 0;
2173
2174			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2175			while ((pos < map->len) && (map->queues[pos] != index))
2176				pos++;
2177
2178			if (pos == map->len)
2179				map->queues[map->len++] = index;
2180#ifdef CONFIG_NUMA
2181			if (numa_node_id == -2)
2182				numa_node_id = cpu_to_node(cpu);
2183			else if (numa_node_id != cpu_to_node(cpu))
2184				numa_node_id = -1;
 
 
2185#endif
2186		} else if (dev_maps) {
2187			/* fill in the new device map from the old device map */
2188			map = xmap_dereference(dev_maps->cpu_map[tci]);
2189			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2190		}
2191
2192		/* copy maps belonging to foreign traffic classes */
2193		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2194			/* fill in the new device map from the old device map */
2195			map = xmap_dereference(dev_maps->cpu_map[tci]);
2196			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2197		}
2198	}
2199
2200	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2201
2202	/* Cleanup old maps */
2203	if (!dev_maps)
2204		goto out_no_old_maps;
2205
2206	for_each_possible_cpu(cpu) {
2207		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2208			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2209			map = xmap_dereference(dev_maps->cpu_map[tci]);
2210			if (map && map != new_map)
2211				kfree_rcu(map, rcu);
 
 
 
 
 
 
 
 
2212		}
2213	}
2214
2215	kfree_rcu(dev_maps, rcu);
2216
2217out_no_old_maps:
2218	dev_maps = new_dev_maps;
2219	active = true;
2220
2221out_no_new_maps:
2222	/* update Tx queue numa node */
2223	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2224				     (numa_node_id >= 0) ? numa_node_id :
2225				     NUMA_NO_NODE);
 
2226
2227	if (!dev_maps)
2228		goto out_no_maps;
2229
2230	/* removes queue from unused CPUs */
2231	for_each_possible_cpu(cpu) {
2232		for (i = tc, tci = cpu * num_tc; i--; tci++)
2233			active |= remove_xps_queue(dev_maps, tci, index);
2234		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2235			active |= remove_xps_queue(dev_maps, tci, index);
2236		for (i = num_tc - tc, tci++; --i; tci++)
2237			active |= remove_xps_queue(dev_maps, tci, index);
 
 
 
 
 
 
2238	}
2239
 
 
 
2240	/* free map if not active */
2241	if (!active) {
2242		RCU_INIT_POINTER(dev->xps_maps, NULL);
2243		kfree_rcu(dev_maps, rcu);
2244	}
2245
2246out_no_maps:
2247	mutex_unlock(&xps_map_mutex);
2248
2249	return 0;
2250error:
2251	/* remove any maps that we added */
2252	for_each_possible_cpu(cpu) {
2253		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2254			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2255			map = dev_maps ?
2256			      xmap_dereference(dev_maps->cpu_map[tci]) :
2257			      NULL;
2258			if (new_map && new_map != map)
2259				kfree(new_map);
2260		}
2261	}
2262
2263	mutex_unlock(&xps_map_mutex);
2264
2265	kfree(new_dev_maps);
2266	return -ENOMEM;
2267}
 
 
 
 
 
 
 
 
 
 
 
 
 
2268EXPORT_SYMBOL(netif_set_xps_queue);
2269
2270#endif
 
 
 
 
 
 
 
 
 
 
 
2271void netdev_reset_tc(struct net_device *dev)
2272{
2273#ifdef CONFIG_XPS
2274	netif_reset_xps_queues_gt(dev, 0);
2275#endif
 
 
 
2276	dev->num_tc = 0;
2277	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2278	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2279}
2280EXPORT_SYMBOL(netdev_reset_tc);
2281
2282int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2283{
2284	if (tc >= dev->num_tc)
2285		return -EINVAL;
2286
2287#ifdef CONFIG_XPS
2288	netif_reset_xps_queues(dev, offset, count);
2289#endif
2290	dev->tc_to_txq[tc].count = count;
2291	dev->tc_to_txq[tc].offset = offset;
2292	return 0;
2293}
2294EXPORT_SYMBOL(netdev_set_tc_queue);
2295
2296int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2297{
2298	if (num_tc > TC_MAX_QUEUE)
2299		return -EINVAL;
2300
2301#ifdef CONFIG_XPS
2302	netif_reset_xps_queues_gt(dev, 0);
2303#endif
 
 
2304	dev->num_tc = num_tc;
2305	return 0;
2306}
2307EXPORT_SYMBOL(netdev_set_num_tc);
2308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2309/*
2310 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2311 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2312 */
2313int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2314{
 
2315	int rc;
2316
 
 
2317	if (txq < 1 || txq > dev->num_tx_queues)
2318		return -EINVAL;
2319
2320	if (dev->reg_state == NETREG_REGISTERED ||
2321	    dev->reg_state == NETREG_UNREGISTERING) {
2322		ASSERT_RTNL();
2323
2324		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2325						  txq);
2326		if (rc)
2327			return rc;
2328
2329		if (dev->num_tc)
2330			netif_setup_tc(dev, txq);
2331
2332		if (txq < dev->real_num_tx_queues) {
 
 
 
 
 
2333			qdisc_reset_all_tx_gt(dev, txq);
2334#ifdef CONFIG_XPS
2335			netif_reset_xps_queues_gt(dev, txq);
2336#endif
2337		}
 
 
2338	}
2339
2340	dev->real_num_tx_queues = txq;
2341	return 0;
2342}
2343EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2344
2345#ifdef CONFIG_SYSFS
2346/**
2347 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2348 *	@dev: Network device
2349 *	@rxq: Actual number of RX queues
2350 *
2351 *	This must be called either with the rtnl_lock held or before
2352 *	registration of the net device.  Returns 0 on success, or a
2353 *	negative error code.  If called before registration, it always
2354 *	succeeds.
2355 */
2356int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2357{
2358	int rc;
2359
2360	if (rxq < 1 || rxq > dev->num_rx_queues)
2361		return -EINVAL;
2362
2363	if (dev->reg_state == NETREG_REGISTERED) {
2364		ASSERT_RTNL();
2365
2366		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2367						  rxq);
2368		if (rc)
2369			return rc;
2370	}
2371
2372	dev->real_num_rx_queues = rxq;
2373	return 0;
2374}
2375EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2376#endif
2377
2378/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2379 * netif_get_num_default_rss_queues - default number of RSS queues
2380 *
2381 * This routine should set an upper limit on the number of RSS queues
2382 * used by default by multiqueue devices.
2383 */
2384int netif_get_num_default_rss_queues(void)
2385{
2386	return is_kdump_kernel() ?
2387		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 
 
 
 
 
 
 
 
 
 
 
 
2388}
2389EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2390
2391static void __netif_reschedule(struct Qdisc *q)
2392{
2393	struct softnet_data *sd;
2394	unsigned long flags;
2395
2396	local_irq_save(flags);
2397	sd = this_cpu_ptr(&softnet_data);
2398	q->next_sched = NULL;
2399	*sd->output_queue_tailp = q;
2400	sd->output_queue_tailp = &q->next_sched;
2401	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2402	local_irq_restore(flags);
2403}
2404
2405void __netif_schedule(struct Qdisc *q)
2406{
2407	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2408		__netif_reschedule(q);
2409}
2410EXPORT_SYMBOL(__netif_schedule);
2411
2412struct dev_kfree_skb_cb {
2413	enum skb_free_reason reason;
2414};
2415
2416static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2417{
2418	return (struct dev_kfree_skb_cb *)skb->cb;
2419}
2420
2421void netif_schedule_queue(struct netdev_queue *txq)
2422{
2423	rcu_read_lock();
2424	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2425		struct Qdisc *q = rcu_dereference(txq->qdisc);
2426
2427		__netif_schedule(q);
2428	}
2429	rcu_read_unlock();
2430}
2431EXPORT_SYMBOL(netif_schedule_queue);
2432
2433/**
2434 *	netif_wake_subqueue - allow sending packets on subqueue
2435 *	@dev: network device
2436 *	@queue_index: sub queue index
2437 *
2438 * Resume individual transmit queue of a device with multiple transmit queues.
2439 */
2440void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2441{
2442	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2443
2444	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2445		struct Qdisc *q;
2446
2447		rcu_read_lock();
2448		q = rcu_dereference(txq->qdisc);
2449		__netif_schedule(q);
2450		rcu_read_unlock();
2451	}
2452}
2453EXPORT_SYMBOL(netif_wake_subqueue);
2454
2455void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2456{
2457	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2458		struct Qdisc *q;
2459
2460		rcu_read_lock();
2461		q = rcu_dereference(dev_queue->qdisc);
2462		__netif_schedule(q);
2463		rcu_read_unlock();
2464	}
2465}
2466EXPORT_SYMBOL(netif_tx_wake_queue);
2467
2468void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2469{
2470	unsigned long flags;
2471
2472	if (likely(atomic_read(&skb->users) == 1)) {
 
 
 
2473		smp_rmb();
2474		atomic_set(&skb->users, 0);
2475	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2476		return;
2477	}
2478	get_kfree_skb_cb(skb)->reason = reason;
2479	local_irq_save(flags);
2480	skb->next = __this_cpu_read(softnet_data.completion_queue);
2481	__this_cpu_write(softnet_data.completion_queue, skb);
2482	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2483	local_irq_restore(flags);
2484}
2485EXPORT_SYMBOL(__dev_kfree_skb_irq);
2486
2487void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2488{
2489	if (in_irq() || irqs_disabled())
2490		__dev_kfree_skb_irq(skb, reason);
2491	else
2492		dev_kfree_skb(skb);
2493}
2494EXPORT_SYMBOL(__dev_kfree_skb_any);
2495
2496
2497/**
2498 * netif_device_detach - mark device as removed
2499 * @dev: network device
2500 *
2501 * Mark device as removed from system and therefore no longer available.
2502 */
2503void netif_device_detach(struct net_device *dev)
2504{
2505	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2506	    netif_running(dev)) {
2507		netif_tx_stop_all_queues(dev);
2508	}
2509}
2510EXPORT_SYMBOL(netif_device_detach);
2511
2512/**
2513 * netif_device_attach - mark device as attached
2514 * @dev: network device
2515 *
2516 * Mark device as attached from system and restart if needed.
2517 */
2518void netif_device_attach(struct net_device *dev)
2519{
2520	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2521	    netif_running(dev)) {
2522		netif_tx_wake_all_queues(dev);
2523		__netdev_watchdog_up(dev);
2524	}
2525}
2526EXPORT_SYMBOL(netif_device_attach);
2527
2528/*
2529 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2530 * to be used as a distribution range.
2531 */
2532u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2533		  unsigned int num_tx_queues)
 
2534{
2535	u32 hash;
2536	u16 qoffset = 0;
2537	u16 qcount = num_tx_queues;
 
 
 
 
 
 
 
 
 
 
 
 
 
2538
2539	if (skb_rx_queue_recorded(skb)) {
 
2540		hash = skb_get_rx_queue(skb);
2541		while (unlikely(hash >= num_tx_queues))
2542			hash -= num_tx_queues;
2543		return hash;
2544	}
2545
2546	if (dev->num_tc) {
2547		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2548		qoffset = dev->tc_to_txq[tc].offset;
2549		qcount = dev->tc_to_txq[tc].count;
2550	}
2551
2552	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2553}
2554EXPORT_SYMBOL(__skb_tx_hash);
2555
2556static void skb_warn_bad_offload(const struct sk_buff *skb)
2557{
2558	static const netdev_features_t null_features;
2559	struct net_device *dev = skb->dev;
2560	const char *name = "";
2561
2562	if (!net_ratelimit())
2563		return;
2564
2565	if (dev) {
2566		if (dev->dev.parent)
2567			name = dev_driver_string(dev->dev.parent);
2568		else
2569			name = netdev_name(dev);
2570	}
2571	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2572	     "gso_type=%d ip_summed=%d\n",
2573	     name, dev ? &dev->features : &null_features,
2574	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2575	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2576	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2577}
2578
2579/*
2580 * Invalidate hardware checksum when packet is to be mangled, and
2581 * complete checksum manually on outgoing path.
2582 */
2583int skb_checksum_help(struct sk_buff *skb)
2584{
2585	__wsum csum;
2586	int ret = 0, offset;
2587
2588	if (skb->ip_summed == CHECKSUM_COMPLETE)
2589		goto out_set_summed;
2590
2591	if (unlikely(skb_shinfo(skb)->gso_size)) {
2592		skb_warn_bad_offload(skb);
2593		return -EINVAL;
2594	}
2595
2596	/* Before computing a checksum, we should make sure no frag could
2597	 * be modified by an external entity : checksum could be wrong.
2598	 */
2599	if (skb_has_shared_frag(skb)) {
2600		ret = __skb_linearize(skb);
2601		if (ret)
2602			goto out;
2603	}
2604
2605	offset = skb_checksum_start_offset(skb);
2606	BUG_ON(offset >= skb_headlen(skb));
 
 
 
 
 
 
2607	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2608
2609	offset += skb->csum_offset;
2610	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2611
2612	if (skb_cloned(skb) &&
2613	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2614		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2615		if (ret)
2616			goto out;
2617	}
 
 
 
2618
2619	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2620out_set_summed:
2621	skb->ip_summed = CHECKSUM_NONE;
2622out:
2623	return ret;
2624}
2625EXPORT_SYMBOL(skb_checksum_help);
2626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629	__be16 type = skb->protocol;
2630
2631	/* Tunnel gso handlers can set protocol to ethernet. */
2632	if (type == htons(ETH_P_TEB)) {
2633		struct ethhdr *eth;
2634
2635		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636			return 0;
2637
2638		eth = (struct ethhdr *)skb_mac_header(skb);
2639		type = eth->h_proto;
2640	}
2641
2642	return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 *	skb_mac_gso_segment - mac layer segmentation handler.
2647 *	@skb: buffer to segment
2648 *	@features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651				    netdev_features_t features)
2652{
2653	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654	struct packet_offload *ptype;
2655	int vlan_depth = skb->mac_len;
2656	__be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658	if (unlikely(!type))
2659		return ERR_PTR(-EINVAL);
2660
2661	__skb_pull(skb, vlan_depth);
2662
2663	rcu_read_lock();
2664	list_for_each_entry_rcu(ptype, &offload_base, list) {
2665		if (ptype->type == type && ptype->callbacks.gso_segment) {
2666			segs = ptype->callbacks.gso_segment(skb, features);
2667			break;
2668		}
2669	}
2670	rcu_read_unlock();
2671
2672	__skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674	return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683	if (tx_path)
2684		return skb->ip_summed != CHECKSUM_PARTIAL;
2685	else
2686		return skb->ip_summed == CHECKSUM_NONE;
2687}
2688
2689/**
2690 *	__skb_gso_segment - Perform segmentation on skb.
2691 *	@skb: buffer to segment
2692 *	@features: features for the output path (see dev->features)
2693 *	@tx_path: whether it is called in TX path
2694 *
2695 *	This function segments the given skb and returns a list of segments.
2696 *
2697 *	It may return NULL if the skb requires no segmentation.  This is
2698 *	only possible when GSO is used for verifying header integrity.
2699 *
2700 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703				  netdev_features_t features, bool tx_path)
2704{
2705	if (unlikely(skb_needs_check(skb, tx_path))) {
2706		int err;
2707
2708		skb_warn_bad_offload(skb);
2709
2710		err = skb_cow_head(skb, 0);
2711		if (err < 0)
2712			return ERR_PTR(err);
2713	}
2714
2715	/* Only report GSO partial support if it will enable us to
2716	 * support segmentation on this frame without needing additional
2717	 * work.
2718	 */
2719	if (features & NETIF_F_GSO_PARTIAL) {
2720		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721		struct net_device *dev = skb->dev;
2722
2723		partial_features |= dev->features & dev->gso_partial_features;
2724		if (!skb_gso_ok(skb, features | partial_features))
2725			features &= ~NETIF_F_GSO_PARTIAL;
2726	}
2727
2728	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732	SKB_GSO_CB(skb)->encap_level = 0;
2733
2734	skb_reset_mac_header(skb);
2735	skb_reset_mac_len(skb);
2736
2737	return skb_mac_gso_segment(skb, features);
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
2740
/* Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2752
/* Report whether @skb has a page fragment that @dev cannot DMA from.
 *
 * Returns 1 when a fragment lives in high memory and the device lacks
 * NETIF_F_HIGHDMA, or (on PCI_DMA_BUS_IS_PHYS platforms) when a fragment's
 * page lies beyond the parent device's DMA mask or the parent has no DMA
 * mask at all.  Returns 0 otherwise, and always when CONFIG_HIGHMEM is off.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		idx = 0;
		while (idx < skb_shinfo(skb)->nr_frags) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
			idx++;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		idx = 0;
		while (idx < skb_shinfo(skb)->nr_frags) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
			dma_addr_t phys = page_to_phys(skb_frag_page(frag));

			if (!parent->dma_mask)
				return 1;
			/* The last byte of the page must be addressable too. */
			if (phys + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
			idx++;
		}
	}
#endif
	return 0;
}
2785
2786/* If MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of standard features for the netdev.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791					   netdev_features_t features,
2792					   __be16 type)
2793{
2794	if (eth_p_mpls(type))
2795		features &= skb->dev->mpls_features;
2796
2797	return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801					   netdev_features_t features,
2802					   __be16 type)
2803{
2804	return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809	netdev_features_t features)
2810{
2811	int tmp;
2812	__be16 type;
2813
2814	type = skb_network_protocol(skb, &tmp);
2815	features = net_mpls_features(skb, features, type);
2816
2817	if (skb->ip_summed != CHECKSUM_NONE &&
2818	    !can_checksum_protocol(features, type)) {
2819		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820	}
2821	if (illegal_highdma(skb->dev, skb))
2822		features &= ~NETIF_F_SG;
2823
2824	return features;
2825}
2826
2827netdev_features_t passthru_features_check(struct sk_buff *skb,
2828					  struct net_device *dev,
2829					  netdev_features_t features)
2830{
2831	return features;
2832}
2833EXPORT_SYMBOL(passthru_features_check);
2834
2835static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836					     struct net_device *dev,
2837					     netdev_features_t features)
2838{
2839	return vlan_features_check(skb, features);
2840}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843					    struct net_device *dev,
2844					    netdev_features_t features)
2845{
2846	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848	if (gso_segs > dev->gso_max_segs)
2849		return features & ~NETIF_F_GSO_MASK;
2850
 
 
 
 
 
 
 
 
2851	/* Support for GSO partial features requires software
2852	 * intervention before we can actually process the packets
2853	 * so we need to strip support for any partial features now
2854	 * and we can pull them back in after we have partially
2855	 * segmented the frame.
2856	 */
2857	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858		features &= ~dev->gso_partial_features;
2859
2860	/* Make sure to clear the IPv4 ID mangling feature if the
2861	 * IPv4 header has the potential to be fragmented.
2862	 */
2863	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864		struct iphdr *iph = skb->encapsulation ?
2865				    inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867		if (!(iph->frag_off & htons(IP_DF)))
2868			features &= ~NETIF_F_TSO_MANGLEID;
2869	}
2870
2871	return features;
2872}
2873
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876	struct net_device *dev = skb->dev;
2877	netdev_features_t features = dev->features;
2878
2879	if (skb_is_gso(skb))
2880		features = gso_features_check(skb, dev, features);
2881
2882	/* If encapsulation offload request, verify we are testing
2883	 * hardware encapsulation features instead of standard
2884	 * features for the netdev
2885	 */
2886	if (skb->encapsulation)
2887		features &= dev->hw_enc_features;
2888
2889	if (skb_vlan_tagged(skb))
2890		features = netdev_intersect_features(features,
2891						     dev->vlan_features |
2892						     NETIF_F_HW_VLAN_CTAG_TX |
2893						     NETIF_F_HW_VLAN_STAG_TX);
2894
2895	if (dev->netdev_ops->ndo_features_check)
2896		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897								features);
2898	else
2899		features &= dflt_features_check(skb, dev, features);
2900
2901	return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906		    struct netdev_queue *txq, bool more)
2907{
2908	unsigned int len;
2909	int rc;
2910
2911	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912		dev_queue_xmit_nit(skb, dev);
2913
2914	len = skb->len;
2915	trace_net_dev_start_xmit(skb, dev);
2916	rc = netdev_start_xmit(skb, dev, txq, more);
2917	trace_net_dev_xmit(skb, rc, dev, len);
2918
2919	return rc;
2920}
2921
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923				    struct netdev_queue *txq, int *ret)
2924{
2925	struct sk_buff *skb = first;
2926	int rc = NETDEV_TX_OK;
2927
2928	while (skb) {
2929		struct sk_buff *next = skb->next;
2930
2931		skb->next = NULL;
2932		rc = xmit_one(skb, dev, txq, next != NULL);
2933		if (unlikely(!dev_xmit_complete(rc))) {
2934			skb->next = next;
2935			goto out;
2936		}
2937
2938		skb = next;
2939		if (netif_xmit_stopped(txq) && skb) {
2940			rc = NETDEV_TX_BUSY;
2941			break;
2942		}
2943	}
2944
2945out:
2946	*ret = rc;
2947	return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951					  netdev_features_t features)
2952{
2953	if (skb_vlan_tag_present(skb) &&
2954	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2955		skb = __vlan_hwaccel_push_inside(skb);
2956	return skb;
2957}
2958
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2960{
2961	netdev_features_t features;
2962
2963	features = netif_skb_features(skb);
2964	skb = validate_xmit_vlan(skb, features);
2965	if (unlikely(!skb))
2966		goto out_null;
2967
 
 
 
 
2968	if (netif_needs_gso(skb, features)) {
2969		struct sk_buff *segs;
2970
2971		segs = skb_gso_segment(skb, features);
2972		if (IS_ERR(segs)) {
2973			goto out_kfree_skb;
2974		} else if (segs) {
2975			consume_skb(skb);
2976			skb = segs;
2977		}
2978	} else {
2979		if (skb_needs_linearize(skb, features) &&
2980		    __skb_linearize(skb))
2981			goto out_kfree_skb;
2982
2983		/* If packet is not checksummed and device does not
2984		 * support checksumming for this protocol, complete
2985		 * checksumming here.
2986		 */
2987		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988			if (skb->encapsulation)
2989				skb_set_inner_transport_header(skb,
2990							       skb_checksum_start_offset(skb));
2991			else
2992				skb_set_transport_header(skb,
2993							 skb_checksum_start_offset(skb));
2994			if (!(features & NETIF_F_CSUM_MASK) &&
2995			    skb_checksum_help(skb))
2996				goto out_kfree_skb;
2997		}
2998	}
2999
 
 
3000	return skb;
3001
3002out_kfree_skb:
3003	kfree_skb(skb);
3004out_null:
3005	atomic_long_inc(&dev->tx_dropped);
3006	return NULL;
3007}
3008
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
3011	struct sk_buff *next, *head = NULL, *tail;
3012
3013	for (; skb != NULL; skb = next) {
3014		next = skb->next;
3015		skb->next = NULL;
3016
3017		/* in case skb wont be segmented, point to itself */
3018		skb->prev = skb;
3019
3020		skb = validate_xmit_skb(skb, dev);
3021		if (!skb)
3022			continue;
3023
3024		if (!head)
3025			head = skb;
3026		else
3027			tail->next = skb;
3028		/* If skb was segmented, skb->prev points to
3029		 * the last segment. If not, it still contains skb.
3030		 */
3031		tail = skb->prev;
3032	}
3033	return head;
3034}
3035EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3036
3037static void qdisc_pkt_len_init(struct sk_buff *skb)
3038{
3039	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3040
3041	qdisc_skb_cb(skb)->pkt_len = skb->len;
3042
3043	/* To get more precise estimation of bytes sent on wire,
3044	 * we add to pkt_len the headers size of all segments
3045	 */
3046	if (shinfo->gso_size)  {
 
3047		unsigned int hdr_len;
3048		u16 gso_segs = shinfo->gso_segs;
3049
3050		/* mac layer + network layer */
3051		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3052
3053		/* + transport layer */
3054		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3055			hdr_len += tcp_hdrlen(skb);
3056		else
3057			hdr_len += sizeof(struct udphdr);
 
 
 
 
 
 
 
 
 
 
 
3058
3059		if (shinfo->gso_type & SKB_GSO_DODGY)
3060			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3061						shinfo->gso_size);
3062
3063		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3064	}
3065}
3066
 
 
 
 
 
 
 
 
 
 
 
 
/* Send @skb through qdisc @q of queue @txq on @dev.
 *
 * Under the qdisc root lock one of three things happens:
 *  - the qdisc is deactivated: the skb is dropped (NET_XMIT_DROP);
 *  - the qdisc allows bypass, is empty and not running: the skb is
 *    transmitted directly via sch_direct_xmit() (NET_XMIT_SUCCESS);
 *  - otherwise the skb is enqueued and the qdisc is run if nobody else
 *    is running it; the enqueue result masked with NET_XMIT_MASK is
 *    returned.
 *
 * Skbs collected on the local to_free list are freed only after the root
 * lock has been released.
 */
3067static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3068				 struct net_device *dev,
3069				 struct netdev_queue *txq)
3070{
3071	spinlock_t *root_lock = qdisc_lock(q);
3072	struct sk_buff *to_free = NULL;
3073	bool contended;
3074	int rc;
3075
3076	qdisc_calculate_pkt_len(skb, q);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3077	/*
3078	 * Heuristic to force contended enqueues to serialize on a
3079	 * separate lock before trying to get qdisc main lock.
3080	 * This permits qdisc->running owner to get the lock more
3081	 * often and dequeue packets faster.
 
 
 
 
3082	 */
3083	contended = qdisc_is_running(q);
3084	if (unlikely(contended))
3085		spin_lock(&q->busylock);
3086
3087	spin_lock(root_lock);
3088	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3089		__qdisc_drop(skb, &to_free);
3090		rc = NET_XMIT_DROP;
3091	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3092		   qdisc_run_begin(q)) {
3093		/*
3094		 * This is a work-conserving queue; there are no old skbs
3095		 * waiting to be sent out; and the qdisc is not running -
3096		 * xmit the skb directly.
3097		 */
3098
3099		qdisc_bstats_update(q, skb);
3100
		/* busylock is dropped before running the qdisc and
		 * "contended" cleared so it is not unlocked again below.
		 */
3101		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3102			if (unlikely(contended)) {
3103				spin_unlock(&q->busylock);
3104				contended = false;
3105			}
3106			__qdisc_run(q);
3107		} else
3108			qdisc_run_end(q);
3109
 
3110		rc = NET_XMIT_SUCCESS;
3111	} else {
3112		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
 
 
3113		if (qdisc_run_begin(q)) {
3114			if (unlikely(contended)) {
3115				spin_unlock(&q->busylock);
3116				contended = false;
3117			}
3118			__qdisc_run(q);
 
3119		}
3120	}
3121	spin_unlock(root_lock);
	/* Free dropped skbs outside of the root lock. */
3122	if (unlikely(to_free))
3123		kfree_skb_list(to_free);
 
	/* Still set only if busylock was taken and not released above. */
3124	if (unlikely(contended))
3125		spin_unlock(&q->busylock);
3126	return rc;
3127}
3128
3129#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3130static void skb_update_prio(struct sk_buff *skb)
3131{
3132	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
 
 
 
 
 
 
 
 
 
 
 
3133
3134	if (!skb->priority && skb->sk && map) {
3135		unsigned int prioidx =
3136			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3137
3138		if (prioidx < map->priomap_len)
3139			skb->priority = map->priomap[prioidx];
3140	}
3141}
3142#else
3143#define skb_update_prio(skb)
3144#endif
3145
/* Per-CPU transmit nesting counter: __dev_queue_xmit() increments it
 * around dev_hard_start_xmit() on the queue-less path and refuses to go
 * deeper once it exceeds XMIT_RECURSION_LIMIT.
 */
3146DEFINE_PER_CPU(int, xmit_recursion);
3147EXPORT_SYMBOL(xmit_recursion);
3148
3149/**
3150 *	dev_loopback_xmit - loop back @skb
3151 *	@net: network namespace this loopback is happening in
3152 *	@sk:  sk needed to be a netfilter okfn
3153 *	@skb: buffer to transmit
3154 */
3155int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3156{
3157	skb_reset_mac_header(skb);
3158	__skb_pull(skb, skb_network_offset(skb));
3159	skb->pkt_type = PACKET_LOOPBACK;
3160	skb->ip_summed = CHECKSUM_UNNECESSARY;
3161	WARN_ON(!skb_dst(skb));
 
3162	skb_dst_force(skb);
3163	netif_rx_ni(skb);
3164	return 0;
3165}
3166EXPORT_SYMBOL(dev_loopback_xmit);
3167
#ifdef CONFIG_NET_EGRESS
/* Run the egress classifier chain of @dev on @skb.
 *
 * Returns the skb when transmission should continue (no classifier
 * attached, or the verdict lets the packet pass).  Returns NULL when the
 * classifier consumed the skb; *@ret then holds the transmit status to
 * report: NET_XMIT_DROP for a shot packet, NET_XMIT_SUCCESS for a stolen,
 * queued or redirected one.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *filter = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result res;
	int verdict;

	if (!filter)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(filter->q, skb);

	verdict = tc_classify(skb, filter, &res, false);
	switch (verdict) {
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filter->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		return skb;
	default:
		return skb;
	}
}
#endif /* CONFIG_NET_EGRESS */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3210
/* Look up the Tx queue that XPS assigns to @skb on @dev.
 *
 * The map is indexed by the sending CPU (skb->sender_cpu - 1), combined
 * with the traffic class of skb->priority when the device has traffic
 * classes.  A map with several queues is spread over by the skb flow hash.
 * Returns the queue index, or -1 when there is no usable mapping, the
 * mapped queue is beyond real_num_tx_queues, or CONFIG_XPS is disabled.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	unsigned int tci;
	int txq = -1;

	rcu_read_lock();
	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	tci = skb->sender_cpu - 1;
	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	cpu_map = rcu_dereference(maps->cpu_map[tci]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1)
		txq = cpu_map->queues[0];
	else
		txq = cpu_map->queues[reciprocal_scale(skb_get_hash(skb),
						       cpu_map->len)];

	/* Ignore a mapping that points past the active queues. */
	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;
unlock:
	rcu_read_unlock();

	return txq;
#else
	return -1;
#endif
}
3246
3247static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3248{
3249	struct sock *sk = skb->sk;
3250	int queue_index = sk_tx_queue_get(sk);
3251
 
 
3252	if (queue_index < 0 || skb->ooo_okay ||
3253	    queue_index >= dev->real_num_tx_queues) {
3254		int new_index = get_xps_queue(dev, skb);
 
3255		if (new_index < 0)
3256			new_index = skb_tx_hash(dev, skb);
3257
3258		if (queue_index != new_index && sk &&
3259		    sk_fullsock(sk) &&
3260		    rcu_access_pointer(sk->sk_dst_cache))
3261			sk_tx_queue_set(sk, new_index);
3262
3263		queue_index = new_index;
3264	}
3265
3266	return queue_index;
3267}
 
3268
3269struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3270				    struct sk_buff *skb,
3271				    void *accel_priv)
3272{
3273	int queue_index = 0;
3274
3275#ifdef CONFIG_XPS
3276	u32 sender_cpu = skb->sender_cpu - 1;
3277
3278	if (sender_cpu >= (u32)NR_CPUS)
3279		skb->sender_cpu = raw_smp_processor_id() + 1;
3280#endif
3281
3282	if (dev->real_num_tx_queues != 1) {
3283		const struct net_device_ops *ops = dev->netdev_ops;
 
3284		if (ops->ndo_select_queue)
3285			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3286							    __netdev_pick_tx);
3287		else
3288			queue_index = __netdev_pick_tx(dev, skb);
3289
3290		if (!accel_priv)
3291			queue_index = netdev_cap_txqueue(dev, queue_index);
3292	}
3293
3294	skb_set_queue_mapping(skb, queue_index);
3295	return netdev_get_tx_queue(dev, queue_index);
3296}
3297
3298/**
3299 *	__dev_queue_xmit - transmit a buffer
3300 *	@skb: buffer to transmit
3301 *	@accel_priv: private data used for L2 forwarding offload
3302 *
3303 *	Queue a buffer for transmission to a network device. The caller must
3304 *	have set the device and priority and built the buffer before calling
3305 *	this function. The function can be called from an interrupt.
3306 *
3307 *	A negative errno code is returned on a failure. A success does not
3308 *	guarantee the frame will be transmitted as it may be dropped due
3309 *	to congestion or traffic shaping.
3310 *
3311 * -----------------------------------------------------------------------------------
3312 *      I notice this method can also return errors from the queue disciplines,
3313 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3314 *      be positive.
3315 *
3316 *      Regardless of the return value, the skb is consumed, so it is currently
3317 *      difficult to retry a send to this method.  (You can bump the ref count
3318 *      before sending to hold a reference for retry if you are careful.)
3319 *
3320 *      When calling this method, interrupts MUST be enabled.  This is because
3321 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3322 *          --BLG
3323 */
3324static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3325{
3326	struct net_device *dev = skb->dev;
3327	struct netdev_queue *txq;
3328	struct Qdisc *q;
	/* Returned unchanged if a step below bails out without setting rc. */
3329	int rc = -ENOMEM;
 
3330
3331	skb_reset_mac_header(skb);
 
3332
3333	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3334		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3335
3336	/* Disable soft irqs for various locks below. Also
3337	 * stops preemption for RCU.
3338	 */
3339	rcu_read_lock_bh();
3340
3341	skb_update_prio(skb);
3342
3343	qdisc_pkt_len_init(skb);
3344#ifdef CONFIG_NET_CLS_ACT
3345	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3346# ifdef CONFIG_NET_EGRESS
3347	if (static_key_false(&egress_needed)) {
 
 
 
 
 
 
 
 
		/* A NULL return means the egress hook consumed the skb
		 * and stored the status to report in rc.
		 */
3348		skb = sch_handle_egress(skb, &rc, dev);
3349		if (!skb)
3350			goto out;
 
 
 
 
3351	}
3352# endif
3353#endif
3354	/* If device/qdisc don't need skb->dst, release it right now while
3355	 * its hot in this cpu cache.
3356	 */
3357	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3358		skb_dst_drop(skb);
3359	else
3360		skb_dst_force(skb);
3361
3362	txq = netdev_pick_tx(dev, skb, accel_priv);
 
 
3363	q = rcu_dereference_bh(txq->qdisc);
3364
3365	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue handler takes the skb from here. */
3366	if (q->enqueue) {
3367		rc = __dev_xmit_skb(skb, q, dev, txq);
3368		goto out;
3369	}
3370
3371	/* The device has no queue. Common case for software devices:
3372	   loopback, all the sorts of tunnels...
3373
3374	   Really, it is unlikely that netif_tx_lock protection is necessary
3375	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3376	   counters.)
3377	   However, it is possible, that they rely on protection
3378	   made by us here.
3379
3380	   Check this and shot the lock. It is not prone from deadlocks.
3381	   Either shot noqueue qdisc, it is even simpler 8)
3382	 */
3383	if (dev->flags & IFF_UP) {
3384		int cpu = smp_processor_id(); /* ok because BHs are off */
3385
		/* Owning the xmit lock already means we re-entered on this CPU. */
3386		if (txq->xmit_lock_owner != cpu) {
3387			if (unlikely(__this_cpu_read(xmit_recursion) >
3388				     XMIT_RECURSION_LIMIT))
 
 
3389				goto recursion_alert;
3390
			/* On failure validate_xmit_skb() has freed the skb and
			 * counted the drop; rc keeps its earlier value.
			 */
3391			skb = validate_xmit_skb(skb, dev);
3392			if (!skb)
3393				goto out;
3394
3395			HARD_TX_LOCK(dev, txq, cpu);
3396
3397			if (!netif_xmit_stopped(txq)) {
3398				__this_cpu_inc(xmit_recursion);
3399				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3400				__this_cpu_dec(xmit_recursion);
3401				if (dev_xmit_complete(rc)) {
3402					HARD_TX_UNLOCK(dev, txq);
3403					goto out;
3404				}
3405			}
3406			HARD_TX_UNLOCK(dev, txq);
3407			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3408					     dev->name);
3409		} else {
3410			/* Recursion is detected! It is possible,
3411			 * unfortunately
3412			 */
3413recursion_alert:
3414			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3415					     dev->name);
3416		}
3417	}
3418
	/* Failure path: device down, transmit not completed, or recursion.
	 * Count the drop and free whatever is left of the skb list.
	 */
3419	rc = -ENETDOWN;
3420	rcu_read_unlock_bh();
3421
3422	atomic_long_inc(&dev->tx_dropped);
3423	kfree_skb_list(skb);
3424	return rc;
3425out:
3426	rcu_read_unlock_bh();
3427	return rc;
3428}
 
3429
3430int dev_queue_xmit(struct sk_buff *skb)
3431{
3432	return __dev_queue_xmit(skb, NULL);
3433}
3434EXPORT_SYMBOL(dev_queue_xmit);
 
 
 
 
 
 
 
 
 
 
3435
/* Queue @skb for transmission on skb->dev, forwarding @accel_priv (the
 * private data used for L2 forwarding offload) to __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int status = __dev_queue_xmit(skb, accel_priv);

	return status;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3441
 
3442
3443/*=======================================================================
3444			Receiver routines
3445  =======================================================================*/
 
 
 
3446
/* Receive-path tunables.
 * netdev_max_backlog: skb_flow_limit() only starts limiting once the queue
 * length reaches half of this value.
 * NOTE(review): the consumers of netdev_tstamp_prequeue, netdev_budget and
 * weight_p are outside this section - confirm their meaning there.
 */
3447int netdev_max_backlog __read_mostly = 1000;
3448EXPORT_SYMBOL(netdev_max_backlog);
 
 
 
 
 
 
3449
3450int netdev_tstamp_prequeue __read_mostly = 1;
3451int netdev_budget __read_mostly = 300;
3452int weight_p __read_mostly = 64;            /* old backlog weight */
 
 
 
 
 
3453
3454/* Called with irq disabled */
3455static inline void ____napi_schedule(struct softnet_data *sd,
3456				     struct napi_struct *napi)
3457{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3458	list_add_tail(&napi->poll_list, &sd->poll_list);
3459	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 
 
 
 
3460}
3461
3462#ifdef CONFIG_RPS
3463
3464/* One global table that all flow-based protocols share. */
3465struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3466EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask selecting the CPU bits of a sock flow table entry; get_rps_cpu()
 * compares the remaining bits of the entry against the flow hash.
 */
3467u32 rps_cpu_mask __read_mostly;
3468EXPORT_SYMBOL(rps_cpu_mask);
3469
/* NOTE(review): presumably static keys enabling the RPS / RFS paths -
 * their users are outside this section, confirm there.
 */
3470struct static_key rps_needed __read_mostly;
3471EXPORT_SYMBOL(rps_needed);
3472struct static_key rfs_needed __read_mostly;
3473EXPORT_SYMBOL(rfs_needed);
3474
/* Move the flow described by @rflow to @next_cpu.
 *
 * With CONFIG_RFS_ACCEL, and when the device supports it, the driver is
 * additionally asked through ndo_rx_flow_steer() to steer the flow to the
 * hardware queue associated with @next_cpu; on success the flow entry of
 * that queue's table replaces @rflow.  For a valid @next_cpu the entry's
 * last_qtail is reset to that CPU's current input_queue_head.
 * Returns the flow entry that is now in use, with its cpu set to @next_cpu.
 */
3475static struct rps_dev_flow *
3476set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477	    struct rps_dev_flow *rflow, u16 next_cpu)
3478{
3479	if (next_cpu < nr_cpu_ids) {
3480#ifdef CONFIG_RFS_ACCEL
3481		struct netdev_rx_queue *rxqueue;
3482		struct rps_dev_flow_table *flow_table;
3483		struct rps_dev_flow *old_rflow;
3484		u32 flow_id;
3485		u16 rxq_index;
3486		int rc;
3487
3488		/* Should we steer this flow to a different hardware queue? */
3489		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490		    !(dev->features & NETIF_F_NTUPLE))
3491			goto out;
3492		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the queue mapped to next_cpu. */
3493		if (rxq_index == skb_get_rx_queue(skb))
3494			goto out;
3495
3496		rxqueue = dev->_rx + rxq_index;
3497		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498		if (!flow_table)
3499			goto out;
3500		flow_id = skb_get_hash(skb) & flow_table->mask;
3501		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502							rxq_index, flow_id);
3503		if (rc < 0)
3504			goto out;
		/* Switch to the new queue's flow entry and store the driver's
		 * return value as its filter id; clear the old entry's filter
		 * if it carries the same id.
		 */
3505		old_rflow = rflow;
3506		rflow = &flow_table->flows[flow_id];
3507		rflow->filter = rc;
3508		if (old_rflow->filter == rflow->filter)
3509			old_rflow->filter = RPS_NO_FILTER;
3510	out:
3511#endif
3512		rflow->last_qtail =
3513			per_cpu(softnet_data, next_cpu).input_queue_head;
3514	}
3515
3516	rflow->cpu = next_cpu;
3517	return rflow;
3518}
3519
3520/*
3521 * get_rps_cpu is called from netif_receive_skb and returns the target
3522 * CPU from the RPS map of the receiving queue for a given skb.
3523 * rcu_read_lock must be held on entry.
3524 */
3525static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526		       struct rps_dev_flow **rflowp)
3527{
3528	const struct rps_sock_flow_table *sock_flow_table;
3529	struct netdev_rx_queue *rxqueue = dev->_rx;
3530	struct rps_dev_flow_table *flow_table;
3531	struct rps_map *map;
	/* -1 is returned when no target CPU could be selected. */
3532	int cpu = -1;
3533	u32 tcpu;
3534	u32 hash;
3535
	/* Use the Rx queue the packet was recorded on, else queue 0. */
3536	if (skb_rx_queue_recorded(skb)) {
3537		u16 index = skb_get_rx_queue(skb);
3538
3539		if (unlikely(index >= dev->real_num_rx_queues)) {
3540			WARN_ONCE(dev->real_num_rx_queues > 1,
3541				  "%s received packet on queue %u, but number "
3542				  "of RX queues is %u\n",
3543				  dev->name, index, dev->real_num_rx_queues);
3544			goto done;
3545		}
3546		rxqueue += index;
3547	}
3548
3549	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552	map = rcu_dereference(rxqueue->rps_map);
3553	if (!flow_table && !map)
3554		goto done;
3555
3556	skb_reset_network_header(skb);
3557	hash = skb_get_hash(skb);
	/* A zero hash gives nothing to steer by. */
3558	if (!hash)
3559		goto done;
3560
3561	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562	if (flow_table && sock_flow_table) {
3563		struct rps_dev_flow *rflow;
3564		u32 next_cpu;
3565		u32 ident;
3566
3567		/* First check into global flow table if there is a match */
3568		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* The non-CPU bits of the entry must equal those of the hash. */
3569		if ((ident ^ hash) & ~rps_cpu_mask)
 
 
3570			goto try_rps;
3571
3572		next_cpu = ident & rps_cpu_mask;
3573
3574		/* OK, now we know there is a match,
3575		 * we can look at the local (per receive queue) flow table
3576		 */
3577		rflow = &flow_table->flows[hash & flow_table->mask];
3578		tcpu = rflow->cpu;
3579
3580		/*
3581		 * If the desired CPU (where last recvmsg was done) is
3582		 * different from current CPU (one in the rx-queue flow
3583		 * table entry), switch if one of the following holds:
3584		 *   - Current CPU is unset (>= nr_cpu_ids).
3585		 *   - Current CPU is offline.
3586		 *   - The current CPU's queue tail has advanced beyond the
3587		 *     last packet that was enqueued using this table entry.
3588		 *     This guarantees that all previous packets for the flow
3589		 *     have been dequeued, thus preserving in order delivery.
3590		 */
3591		if (unlikely(tcpu != next_cpu) &&
3592		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594		      rflow->last_qtail)) >= 0)) {
3595			tcpu = next_cpu;
3596			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597		}
3598
		/* Hand the flow entry back to the caller only when the flow
		 * table produced a usable CPU.
		 */
3599		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600			*rflowp = rflow;
3601			cpu = tcpu;
3602			goto done;
3603		}
3604	}
3605
3606try_rps:
3607
	/* Fall back to the static RPS map: scale the hash into the CPU list. */
3608	if (map) {
3609		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610		if (cpu_online(tcpu)) {
3611			cpu = tcpu;
3612			goto done;
3613		}
3614	}
3615
3616done:
3617	return cpu;
3618}
3619
3620#ifdef CONFIG_RFS_ACCEL
3621
3622/**
3623 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624 * @dev: Device on which the filter was set
3625 * @rxq_index: RX queue index
3626 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628 *
3629 * Drivers that implement ndo_rx_flow_steer() should periodically call
3630 * this function for each installed filter and remove the filters for
3631 * which it returns %true.
3632 */
3633bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634			 u32 flow_id, u16 filter_id)
3635{
3636	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637	struct rps_dev_flow_table *flow_table;
3638	struct rps_dev_flow *rflow;
3639	bool expire = true;
3640	unsigned int cpu;
3641
3642	rcu_read_lock();
3643	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644	if (flow_table && flow_id <= flow_table->mask) {
3645		rflow = &flow_table->flows[flow_id];
3646		cpu = ACCESS_ONCE(rflow->cpu);
3647		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649			   rflow->last_qtail) <
3650		     (int)(10 * flow_table->mask)))
3651			expire = false;
3652	}
3653	rcu_read_unlock();
3654	return expire;
3655}
3656EXPORT_SYMBOL(rps_may_expire_flow);
3657
3658#endif /* CONFIG_RFS_ACCEL */
3659
3660/* Called from hardirq (IPI) context */
3661static void rps_trigger_softirq(void *data)
3662{
3663	struct softnet_data *sd = data;
3664
3665	____napi_schedule(sd, &sd->backlog);
3666	sd->received_rps++;
3667}
3668
3669#endif /* CONFIG_RPS */
3670
 
 
 
 
 
 
 
 
 
/*
 * Check whether @sd belongs to another CPU.
 * If it does, link it at the head of this CPU's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.
 * If it is our own softnet_data (or RPS is compiled out), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3691
3692#ifdef CONFIG_NET_FLOW_LIMIT
3693int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694#endif
3695
3696static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697{
3698#ifdef CONFIG_NET_FLOW_LIMIT
3699	struct sd_flow_limit *fl;
3700	struct softnet_data *sd;
3701	unsigned int old_flow, new_flow;
3702
3703	if (qlen < (netdev_max_backlog >> 1))
3704		return false;
3705
3706	sd = this_cpu_ptr(&softnet_data);
3707
3708	rcu_read_lock();
3709	fl = rcu_dereference(sd->flow_limit);
3710	if (fl) {
3711		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712		old_flow = fl->history[fl->history_head];
3713		fl->history[fl->history_head] = new_flow;
3714
3715		fl->history_head++;
3716		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718		if (likely(fl->buckets[old_flow]))
3719			fl->buckets[old_flow]--;
3720
3721		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722			fl->count++;
3723			rcu_read_unlock();
3724			return true;
3725		}
3726	}
3727	rcu_read_unlock();
3728#endif
3729	return false;
3730}
3731
3732/*
3733 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3734 * queue (may be a remote CPU queue).
3735 */
3736static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737			      unsigned int *qtail)
3738{
 
3739	struct softnet_data *sd;
3740	unsigned long flags;
3741	unsigned int qlen;
3742
 
3743	sd = &per_cpu(softnet_data, cpu);
3744
3745	local_irq_save(flags);
3746
3747	rps_lock(sd);
3748	if (!netif_running(skb->dev))
3749		goto drop;
3750	qlen = skb_queue_len(&sd->input_pkt_queue);
3751	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 
3752		if (qlen) {
3753enqueue:
3754			__skb_queue_tail(&sd->input_pkt_queue, skb);
3755			input_queue_tail_incr_save(sd, qtail);
3756			rps_unlock(sd);
3757			local_irq_restore(flags);
3758			return NET_RX_SUCCESS;
3759		}
3760
3761		/* Schedule NAPI for backlog device
3762		 * We can use non atomic operation since we own the queue lock
3763		 */
3764		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765			if (!rps_ipi_queued(sd))
3766				____napi_schedule(sd, &sd->backlog);
3767		}
3768		goto enqueue;
3769	}
 
3770
3771drop:
3772	sd->dropped++;
3773	rps_unlock(sd);
 
 
 
 
 
 
 
 
 
 
3774
3775	local_irq_restore(flags);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3776
3777	atomic_long_inc(&skb->dev->rx_dropped);
3778	kfree_skb(skb);
3779	return NET_RX_DROP;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3780}
 
3781
3782static int netif_rx_internal(struct sk_buff *skb)
3783{
3784	int ret;
3785
3786	net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788	trace_netif_rx(skb);
 
3789#ifdef CONFIG_RPS
3790	if (static_key_false(&rps_needed)) {
3791		struct rps_dev_flow voidflow, *rflow = &voidflow;
3792		int cpu;
3793
3794		preempt_disable();
3795		rcu_read_lock();
3796
3797		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798		if (cpu < 0)
3799			cpu = smp_processor_id();
3800
3801		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803		rcu_read_unlock();
3804		preempt_enable();
3805	} else
3806#endif
3807	{
3808		unsigned int qtail;
3809		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810		put_cpu();
3811	}
3812	return ret;
3813}
3814
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet handed over by a device driver and queues it for
 *	processing by the upper (protocol) levels. The call itself always
 *	succeeds; the buffer may still be dropped later, either for
 *	congestion control or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int rc;

	trace_netif_rx_entry(skb);

	rc = netif_rx_internal(skb);
	return rc;
}
EXPORT_SYMBOL(netif_rx);
3837
/*
 * Variant of netif_rx() that, with preemption disabled, queues the skb
 * and then runs any softirq that is pending on the local CPU.
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	rc = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
3853
3854static __latent_entropy void net_tx_action(struct softirq_action *h)
3855{
3856	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858	if (sd->completion_queue) {
3859		struct sk_buff *clist;
3860
3861		local_irq_disable();
3862		clist = sd->completion_queue;
3863		sd->completion_queue = NULL;
3864		local_irq_enable();
3865
3866		while (clist) {
3867			struct sk_buff *skb = clist;
 
3868			clist = clist->next;
3869
3870			WARN_ON(atomic_read(&skb->users));
3871			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872				trace_consume_skb(skb);
3873			else
3874				trace_kfree_skb(skb, net_tx_action);
 
3875
3876			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877				__kfree_skb(skb);
3878			else
3879				__kfree_skb_defer(skb);
 
3880		}
3881
3882		__kfree_skb_flush();
3883	}
3884
3885	if (sd->output_queue) {
3886		struct Qdisc *head;
3887
3888		local_irq_disable();
3889		head = sd->output_queue;
3890		sd->output_queue = NULL;
3891		sd->output_queue_tailp = &sd->output_queue;
3892		local_irq_enable();
3893
 
 
3894		while (head) {
3895			struct Qdisc *q = head;
3896			spinlock_t *root_lock;
3897
3898			head = head->next_sched;
3899
3900			root_lock = qdisc_lock(q);
3901			spin_lock(root_lock);
3902			/* We need to make sure head->next_sched is read
3903			 * before clearing __QDISC_STATE_SCHED
3904			 */
3905			smp_mb__before_atomic();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3906			clear_bit(__QDISC_STATE_SCHED, &q->state);
3907			qdisc_run(q);
3908			spin_unlock(root_lock);
 
3909		}
 
 
3910	}
 
 
3911}
3912
3913#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3914/* This hook is defined here for ATM LANE */
3915int (*br_fdb_test_addr_hook)(struct net_device *dev,
3916			     unsigned char *addr) __read_mostly;
3917EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3918#endif
3919
/*
 * Run the ingress classifier chain of skb->dev on @skb.
 *
 * Returns @skb when processing should continue, or NULL when the
 * classifier verdict consumed it (dropped, stolen/queued or redirected).
 * A packet type pending in *@pt_prev is delivered first, with the result
 * stored in *@ret. Without CONFIG_NET_CLS_ACT this returns @skb as is.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *filters = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result res;
	int verdict;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!filters)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(filters->q, skb);

	verdict = tc_classify(skb, filters, &res, false);
	switch (verdict) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filters->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3971
3972/**
3973 *	netdev_is_rx_handler_busy - check if receive handler is registered
3974 *	@dev: device to check
3975 *
3976 *	Check if a receive handler is already registered for a given device.
3977 *	Return true if there one.
3978 *
3979 *	The caller must hold the rtnl_mutex.
3980 */
3981bool netdev_is_rx_handler_busy(struct net_device *dev)
3982{
3983	ASSERT_RTNL();
3984	return dev && rtnl_dereference(dev->rx_handler);
3985}
3986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3987
3988/**
3989 *	netdev_rx_handler_register - register receive handler
3990 *	@dev: device to register a handler for
3991 *	@rx_handler: receive handler to register
3992 *	@rx_handler_data: data pointer that is used by rx handler
3993 *
3994 *	Register a receive handler for a device. This handler will then be
3995 *	called from __netif_receive_skb. A negative errno code is returned
3996 *	on a failure.
3997 *
3998 *	The caller must hold the rtnl_mutex.
3999 *
4000 *	For a general description of rx_handler, see enum rx_handler_result.
4001 */
4002int netdev_rx_handler_register(struct net_device *dev,
4003			       rx_handler_func_t *rx_handler,
4004			       void *rx_handler_data)
4005{
4006	ASSERT_RTNL();
 
4007
4008	if (dev->rx_handler)
4009		return -EBUSY;
4010
4011	/* Note: rx_handler_data must be set before rx_handler */
4012	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4013	rcu_assign_pointer(dev->rx_handler, rx_handler);
4014
4015	return 0;
4016}
4017EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4018
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device. The handler pointer is
 *	cleared first; the handler data pointer is cleared only after
 *	synchronize_net() has returned.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4040
4041/*
4042 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4043 * the special handling of PFMEMALLOC skbs.
4044 */
4045static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4046{
4047	switch (skb->protocol) {
4048	case htons(ETH_P_ARP):
4049	case htons(ETH_P_IP):
4050	case htons(ETH_P_IPV6):
4051	case htons(ETH_P_8021Q):
4052	case htons(ETH_P_8021AD):
4053		return true;
4054	default:
4055		return false;
4056	}
4057}
4058
/*
 * Run the netfilter ingress hook on @skb if one is active.
 * A packet type pending in *@pt_prev is delivered first (result stored
 * in *@ret). Returns the hook's return value, or 0 when no hook is
 * active or CONFIG_NETFILTER_INGRESS is not set.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	verdict = nf_hook_ingress(skb);
	rcu_read_unlock();
	return verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4079
4080static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 
4081{
4082	struct packet_type *ptype, *pt_prev;
4083	rx_handler_func_t *rx_handler;
 
4084	struct net_device *orig_dev;
4085	bool deliver_exact = false;
4086	int ret = NET_RX_DROP;
4087	__be16 type;
4088
4089	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4090
4091	trace_netif_receive_skb(skb);
4092
4093	orig_dev = skb->dev;
4094
4095	skb_reset_network_header(skb);
4096	if (!skb_transport_header_was_set(skb))
4097		skb_reset_transport_header(skb);
4098	skb_reset_mac_len(skb);
4099
4100	pt_prev = NULL;
4101
4102another_round:
4103	skb->skb_iif = skb->dev->ifindex;
4104
4105	__this_cpu_inc(softnet_data.processed);
4106
4107	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4108	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 
 
 
 
 
 
 
 
 
 
 
 
 
4109		skb = skb_vlan_untag(skb);
4110		if (unlikely(!skb))
4111			goto out;
4112	}
4113
4114#ifdef CONFIG_NET_CLS_ACT
4115	if (skb->tc_verd & TC_NCLS) {
4116		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4117		goto ncls;
4118	}
4119#endif
4120
4121	if (pfmemalloc)
4122		goto skip_taps;
4123
4124	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4125		if (pt_prev)
4126			ret = deliver_skb(skb, pt_prev, orig_dev);
4127		pt_prev = ptype;
4128	}
4129
4130	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4131		if (pt_prev)
4132			ret = deliver_skb(skb, pt_prev, orig_dev);
4133		pt_prev = ptype;
4134	}
4135
4136skip_taps:
4137#ifdef CONFIG_NET_INGRESS
4138	if (static_key_false(&ingress_needed)) {
4139		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 
 
 
 
 
 
4140		if (!skb)
4141			goto out;
4142
 
4143		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4144			goto out;
4145	}
4146#endif
4147#ifdef CONFIG_NET_CLS_ACT
4148	skb->tc_verd = 0;
4149ncls:
4150#endif
4151	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4152		goto drop;
4153
4154	if (skb_vlan_tag_present(skb)) {
4155		if (pt_prev) {
4156			ret = deliver_skb(skb, pt_prev, orig_dev);
4157			pt_prev = NULL;
4158		}
4159		if (vlan_do_receive(&skb))
4160			goto another_round;
4161		else if (unlikely(!skb))
4162			goto out;
4163	}
4164
4165	rx_handler = rcu_dereference(skb->dev->rx_handler);
4166	if (rx_handler) {
4167		if (pt_prev) {
4168			ret = deliver_skb(skb, pt_prev, orig_dev);
4169			pt_prev = NULL;
4170		}
4171		switch (rx_handler(&skb)) {
4172		case RX_HANDLER_CONSUMED:
4173			ret = NET_RX_SUCCESS;
4174			goto out;
4175		case RX_HANDLER_ANOTHER:
4176			goto another_round;
4177		case RX_HANDLER_EXACT:
4178			deliver_exact = true;
 
4179		case RX_HANDLER_PASS:
4180			break;
4181		default:
4182			BUG();
4183		}
4184	}
4185
4186	if (unlikely(skb_vlan_tag_present(skb))) {
4187		if (skb_vlan_tag_get_id(skb))
 
 
 
 
4188			skb->pkt_type = PACKET_OTHERHOST;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4189		/* Note: we might in the future use prio bits
4190		 * and set skb->priority like in vlan_do_receive()
4191		 * For the time being, just ignore Priority Code Point
4192		 */
4193		skb->vlan_tci = 0;
4194	}
4195
4196	type = skb->protocol;
4197
4198	/* deliver only exact match when indicated */
4199	if (likely(!deliver_exact)) {
4200		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4201				       &ptype_base[ntohs(type) &
4202						   PTYPE_HASH_MASK]);
4203	}
4204
4205	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4206			       &orig_dev->ptype_specific);
4207
4208	if (unlikely(skb->dev != orig_dev)) {
4209		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4210				       &skb->dev->ptype_specific);
4211	}
4212
4213	if (pt_prev) {
4214		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4215			goto drop;
4216		else
4217			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4218	} else {
4219drop:
4220		if (!deliver_exact)
4221			atomic_long_inc(&skb->dev->rx_dropped);
4222		else
4223			atomic_long_inc(&skb->dev->rx_nohandler);
4224		kfree_skb(skb);
4225		/* Jamal, now you will not able to escape explaining
4226		 * me how you were going to use this. :-)
4227		 */
4228		ret = NET_RX_DROP;
4229	}
4230
4231out:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4232	return ret;
4233}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4234
4235static int __netif_receive_skb(struct sk_buff *skb)
4236{
4237	int ret;
4238
4239	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4240		unsigned long pflags = current->flags;
4241
4242		/*
4243		 * PFMEMALLOC skbs are special, they should
4244		 * - be delivered to SOCK_MEMALLOC sockets only
4245		 * - stay away from userspace
4246		 * - have bounded memory usage
4247		 *
4248		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4249		 * context down to all allocation sites.
4250		 */
4251		current->flags |= PF_MEMALLOC;
4252		ret = __netif_receive_skb_core(skb, true);
4253		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4254	} else
4255		ret = __netif_receive_skb_core(skb, false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4256
4257	return ret;
4258}
4259
4260static int netif_receive_skb_internal(struct sk_buff *skb)
4261{
4262	int ret;
4263
4264	net_timestamp_check(netdev_tstamp_prequeue, skb);
4265
4266	if (skb_defer_rx_timestamp(skb))
4267		return NET_RX_SUCCESS;
4268
4269	rcu_read_lock();
4270
4271#ifdef CONFIG_RPS
4272	if (static_key_false(&rps_needed)) {
4273		struct rps_dev_flow voidflow, *rflow = &voidflow;
4274		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4275
4276		if (cpu >= 0) {
4277			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4278			rcu_read_unlock();
4279			return ret;
4280		}
4281	}
4282#endif
4283	ret = __netif_receive_skb(skb);
4284	rcu_read_unlock();
4285	return ret;
4286}
4287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	The call always succeeds; the buffer may be dropped during
 *	processing, for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);

	rc = netif_receive_skb_internal(skb);
	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
4310
4311DEFINE_PER_CPU(struct work_struct, flush_works);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4312
4313/* Network device is going away, flush any packets still pending */
4314static void flush_backlog(struct work_struct *work)
4315{
4316	struct sk_buff *skb, *tmp;
4317	struct softnet_data *sd;
4318
4319	local_bh_disable();
4320	sd = this_cpu_ptr(&softnet_data);
4321
4322	local_irq_disable();
4323	rps_lock(sd);
4324	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4325		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4326			__skb_unlink(skb, &sd->input_pkt_queue);
4327			kfree_skb(skb);
4328			input_queue_head_incr(sd);
4329		}
4330	}
4331	rps_unlock(sd);
4332	local_irq_enable();
4333
4334	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4335		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4336			__skb_unlink(skb, &sd->process_queue);
4337			kfree_skb(skb);
4338			input_queue_head_incr(sd);
4339		}
4340	}
4341	local_bh_enable();
4342}
4343
4344static void flush_all_backlogs(void)
4345{
4346	unsigned int cpu;
 
 
4347
4348	get_online_cpus();
4349
4350	for_each_online_cpu(cpu)
4351		queue_work_on(cpu, system_highpri_wq,
4352			      per_cpu_ptr(&flush_works, cpu));
 
 
 
4353
4354	for_each_online_cpu(cpu)
4355		flush_work(per_cpu_ptr(&flush_works, cpu));
4356
4357	put_online_cpus();
 
 
 
 
4358}
4359
4360static int napi_gro_complete(struct sk_buff *skb)
4361{
4362	struct packet_offload *ptype;
4363	__be16 type = skb->protocol;
4364	struct list_head *head = &offload_base;
4365	int err = -ENOENT;
4366
4367	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4368
4369	if (NAPI_GRO_CB(skb)->count == 1) {
4370		skb_shinfo(skb)->gso_size = 0;
4371		goto out;
4372	}
4373
4374	rcu_read_lock();
4375	list_for_each_entry_rcu(ptype, head, list) {
4376		if (ptype->type != type || !ptype->callbacks.gro_complete)
4377			continue;
4378
4379		err = ptype->callbacks.gro_complete(skb, 0);
4380		break;
4381	}
4382	rcu_read_unlock();
4383
4384	if (err) {
4385		WARN_ON(&ptype->list == head);
4386		kfree_skb(skb);
4387		return NET_RX_SUCCESS;
4388	}
4389
4390out:
4391	return netif_receive_skb_internal(skb);
4392}
4393
4394/* napi->gro_list contains packets ordered by age.
4395 * youngest packets at the head of it.
4396 * Complete skbs in reverse order to reduce latencies.
4397 */
4398void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4399{
4400	struct sk_buff *skb, *prev = NULL;
4401
4402	/* scan list and build reverse chain */
4403	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4404		skb->prev = prev;
4405		prev = skb;
4406	}
4407
4408	for (skb = prev; skb; skb = prev) {
4409		skb->next = NULL;
4410
4411		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4412			return;
4413
4414		prev = skb->prev;
4415		napi_gro_complete(skb);
4416		napi->gro_count--;
4417	}
4418
4419	napi->gro_list = NULL;
4420}
4421EXPORT_SYMBOL(napi_gro_flush);
4422
4423static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4424{
4425	struct sk_buff *p;
4426	unsigned int maclen = skb->dev->hard_header_len;
4427	u32 hash = skb_get_hash_raw(skb);
4428
4429	for (p = napi->gro_list; p; p = p->next) {
4430		unsigned long diffs;
4431
4432		NAPI_GRO_CB(p)->flush = 0;
4433
4434		if (hash != skb_get_hash_raw(p)) {
4435			NAPI_GRO_CB(p)->same_flow = 0;
4436			continue;
4437		}
4438
4439		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4440		diffs |= p->vlan_tci ^ skb->vlan_tci;
4441		diffs |= skb_metadata_dst_cmp(p, skb);
4442		if (maclen == ETH_HLEN)
4443			diffs |= compare_ether_header(skb_mac_header(p),
4444						      skb_mac_header(skb));
4445		else if (!diffs)
4446			diffs = memcmp(skb_mac_header(p),
4447				       skb_mac_header(skb),
4448				       maclen);
4449		NAPI_GRO_CB(p)->same_flow = !diffs;
4450	}
4451}
4452
4453static void skb_gro_reset_offset(struct sk_buff *skb)
4454{
4455	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4456	const skb_frag_t *frag0 = &pinfo->frags[0];
4457
4458	NAPI_GRO_CB(skb)->data_offset = 0;
4459	NAPI_GRO_CB(skb)->frag0 = NULL;
4460	NAPI_GRO_CB(skb)->frag0_len = 0;
4461
4462	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4463	    pinfo->nr_frags &&
4464	    !PageHighMem(skb_frag_page(frag0))) {
4465		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4466		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4467						    skb_frag_size(frag0),
4468						    skb->end - skb->tail);
4469	}
4470}
4471
4472static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4473{
4474	struct skb_shared_info *pinfo = skb_shinfo(skb);
4475
4476	BUG_ON(skb->end - skb->tail < grow);
4477
4478	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4479
4480	skb->data_len -= grow;
4481	skb->tail += grow;
4482
4483	pinfo->frags[0].page_offset += grow;
4484	skb_frag_size_sub(&pinfo->frags[0], grow);
4485
4486	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4487		skb_frag_unref(skb, 0);
4488		memmove(pinfo->frags, pinfo->frags + 1,
4489			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4490	}
4491}
4492
4493static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4494{
4495	struct sk_buff **pp = NULL;
4496	struct packet_offload *ptype;
4497	__be16 type = skb->protocol;
4498	struct list_head *head = &offload_base;
4499	int same_flow;
4500	enum gro_result ret;
4501	int grow;
4502
4503	if (!(skb->dev->features & NETIF_F_GRO))
4504		goto normal;
4505
4506	if (skb->csum_bad)
4507		goto normal;
4508
4509	gro_list_prepare(napi, skb);
4510
4511	rcu_read_lock();
4512	list_for_each_entry_rcu(ptype, head, list) {
4513		if (ptype->type != type || !ptype->callbacks.gro_receive)
4514			continue;
4515
4516		skb_set_network_header(skb, skb_gro_offset(skb));
4517		skb_reset_mac_len(skb);
4518		NAPI_GRO_CB(skb)->same_flow = 0;
4519		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4520		NAPI_GRO_CB(skb)->free = 0;
4521		NAPI_GRO_CB(skb)->encap_mark = 0;
4522		NAPI_GRO_CB(skb)->recursion_counter = 0;
4523		NAPI_GRO_CB(skb)->is_fou = 0;
4524		NAPI_GRO_CB(skb)->is_atomic = 1;
4525		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4526
4527		/* Setup for GRO checksum validation */
4528		switch (skb->ip_summed) {
4529		case CHECKSUM_COMPLETE:
4530			NAPI_GRO_CB(skb)->csum = skb->csum;
4531			NAPI_GRO_CB(skb)->csum_valid = 1;
4532			NAPI_GRO_CB(skb)->csum_cnt = 0;
4533			break;
4534		case CHECKSUM_UNNECESSARY:
4535			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4536			NAPI_GRO_CB(skb)->csum_valid = 0;
4537			break;
4538		default:
4539			NAPI_GRO_CB(skb)->csum_cnt = 0;
4540			NAPI_GRO_CB(skb)->csum_valid = 0;
4541		}
4542
4543		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4544		break;
4545	}
4546	rcu_read_unlock();
4547
4548	if (&ptype->list == head)
4549		goto normal;
4550
4551	same_flow = NAPI_GRO_CB(skb)->same_flow;
4552	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4553
4554	if (pp) {
4555		struct sk_buff *nskb = *pp;
4556
4557		*pp = nskb->next;
4558		nskb->next = NULL;
4559		napi_gro_complete(nskb);
4560		napi->gro_count--;
4561	}
4562
4563	if (same_flow)
4564		goto ok;
4565
4566	if (NAPI_GRO_CB(skb)->flush)
4567		goto normal;
4568
4569	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4570		struct sk_buff *nskb = napi->gro_list;
4571
4572		/* locate the end of the list to select the 'oldest' flow */
4573		while (nskb->next) {
4574			pp = &nskb->next;
4575			nskb = *pp;
4576		}
4577		*pp = NULL;
4578		nskb->next = NULL;
4579		napi_gro_complete(nskb);
4580	} else {
4581		napi->gro_count++;
4582	}
4583	NAPI_GRO_CB(skb)->count = 1;
4584	NAPI_GRO_CB(skb)->age = jiffies;
4585	NAPI_GRO_CB(skb)->last = skb;
4586	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4587	skb->next = napi->gro_list;
4588	napi->gro_list = skb;
4589	ret = GRO_HELD;
4590
4591pull:
4592	grow = skb_gro_offset(skb) - skb_headlen(skb);
4593	if (grow > 0)
4594		gro_pull_from_frag0(skb, grow);
4595ok:
4596	return ret;
4597
4598normal:
4599	ret = GRO_NORMAL;
4600	goto pull;
4601}
4602
4603struct packet_offload *gro_find_receive_by_type(__be16 type)
4604{
4605	struct list_head *offload_head = &offload_base;
4606	struct packet_offload *ptype;
4607
4608	list_for_each_entry_rcu(ptype, offload_head, list) {
4609		if (ptype->type != type || !ptype->callbacks.gro_receive)
4610			continue;
4611		return ptype;
4612	}
4613	return NULL;
4614}
4615EXPORT_SYMBOL(gro_find_receive_by_type);
4616
4617struct packet_offload *gro_find_complete_by_type(__be16 type)
4618{
4619	struct list_head *offload_head = &offload_base;
4620	struct packet_offload *ptype;
4621
4622	list_for_each_entry_rcu(ptype, offload_head, list) {
4623		if (ptype->type != type || !ptype->callbacks.gro_complete)
4624			continue;
4625		return ptype;
4626	}
4627	return NULL;
4628}
4629EXPORT_SYMBOL(gro_find_complete_by_type);
4630
4631static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4632{
4633	switch (ret) {
4634	case GRO_NORMAL:
4635		if (netif_receive_skb_internal(skb))
4636			ret = GRO_DROP;
4637		break;
4638
4639	case GRO_DROP:
4640		kfree_skb(skb);
4641		break;
4642
4643	case GRO_MERGED_FREE:
4644		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4645			skb_dst_drop(skb);
4646			kmem_cache_free(skbuff_head_cache, skb);
4647		} else {
4648			__kfree_skb(skb);
4649		}
4650		break;
4651
4652	case GRO_HELD:
4653	case GRO_MERGED:
4654		break;
4655	}
4656
4657	return ret;
4658}
4659
4660gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4661{
4662	skb_mark_napi_id(skb, napi);
4663	trace_napi_gro_receive_entry(skb);
4664
4665	skb_gro_reset_offset(skb);
4666
4667	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4668}
4669EXPORT_SYMBOL(napi_gro_receive);
4670
4671static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4672{
4673	if (unlikely(skb->pfmemalloc)) {
4674		consume_skb(skb);
4675		return;
4676	}
4677	__skb_pull(skb, skb_headlen(skb));
4678	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4679	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4680	skb->vlan_tci = 0;
4681	skb->dev = napi->dev;
4682	skb->skb_iif = 0;
4683	skb->encapsulation = 0;
4684	skb_shinfo(skb)->gso_type = 0;
4685	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4686
4687	napi->skb = skb;
4688}
4689
4690struct sk_buff *napi_get_frags(struct napi_struct *napi)
4691{
4692	struct sk_buff *skb = napi->skb;
4693
4694	if (!skb) {
4695		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4696		if (skb) {
4697			napi->skb = skb;
4698			skb_mark_napi_id(skb, napi);
4699		}
4700	}
4701	return skb;
4702}
4703EXPORT_SYMBOL(napi_get_frags);
4704
4705static gro_result_t napi_frags_finish(struct napi_struct *napi,
4706				      struct sk_buff *skb,
4707				      gro_result_t ret)
4708{
4709	switch (ret) {
4710	case GRO_NORMAL:
4711	case GRO_HELD:
4712		__skb_push(skb, ETH_HLEN);
4713		skb->protocol = eth_type_trans(skb, skb->dev);
4714		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4715			ret = GRO_DROP;
4716		break;
4717
4718	case GRO_DROP:
4719	case GRO_MERGED_FREE:
4720		napi_reuse_skb(napi, skb);
4721		break;
4722
4723	case GRO_MERGED:
4724		break;
4725	}
4726
4727	return ret;
4728}
4729
4730/* Upper GRO stack assumes network header starts at gro_offset=0
4731 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4732 * We copy ethernet header into skb->data to have a common layout.
4733 */
4734static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4735{
4736	struct sk_buff *skb = napi->skb;
4737	const struct ethhdr *eth;
4738	unsigned int hlen = sizeof(*eth);
4739
4740	napi->skb = NULL;
4741
4742	skb_reset_mac_header(skb);
4743	skb_gro_reset_offset(skb);
4744
4745	eth = skb_gro_header_fast(skb, 0);
4746	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4747		eth = skb_gro_header_slow(skb, hlen, 0);
4748		if (unlikely(!eth)) {
4749			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4750					     __func__, napi->dev->name);
4751			napi_reuse_skb(napi, skb);
4752			return NULL;
4753		}
4754	} else {
4755		gro_pull_from_frag0(skb, hlen);
4756		NAPI_GRO_CB(skb)->frag0 += hlen;
4757		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4758	}
4759	__skb_pull(skb, hlen);
4760
4761	/*
4762	 * This works because the only protocols we care about don't require
4763	 * special handling.
4764	 * We'll fix it up properly in napi_frags_finish()
4765	 */
4766	skb->protocol = eth->h_proto;
 
4767
4768	return skb;
4769}
4770
4771gro_result_t napi_gro_frags(struct napi_struct *napi)
4772{
4773	struct sk_buff *skb = napi_frags_skb(napi);
4774
4775	if (!skb)
4776		return GRO_DROP;
4777
4778	trace_napi_gro_frags_entry(skb);
4779
4780	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4781}
4782EXPORT_SYMBOL(napi_gro_frags);
4783
4784/* Compute the checksum from gro_offset and return the folded value
4785 * after adding in any pseudo checksum.
4786 */
4787__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4788{
4789	__wsum wsum;
4790	__sum16 sum;
4791
4792	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4793
4794	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4795	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4796	if (likely(!sum)) {
4797		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4798		    !skb->csum_complete_sw)
4799			netdev_rx_csum_fault(skb->dev);
4800	}
4801
4802	NAPI_GRO_CB(skb)->csum = wsum;
4803	NAPI_GRO_CB(skb)->csum_valid = 1;
4804
4805	return sum;
4806}
4807EXPORT_SYMBOL(__skb_gro_checksum_complete);
4808
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	/* Detach the pending list before irqs are turned back on. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (remsd) {
		struct softnet_data *cur = remsd;

		/* Fetch the successor before the IPI is issued. */
		remsd = cur->rps_ipi_next;

		if (cpu_online(cur->cpu))
			smp_call_function_single_async(cur->cpu, &cur->csd);
	}
#else
	local_irq_enable();
#endif
}
4836
4837static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4838{
4839#ifdef CONFIG_RPS
4840	return sd->rps_ipi_list != NULL;
4841#else
4842	return false;
4843#endif
4844}
4845
4846static int process_backlog(struct napi_struct *napi, int quota)
4847{
4848	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4849	bool again = true;
4850	int work = 0;
4851
4852	/* Check if we have pending ipi, its better to send them now,
4853	 * not waiting net_rx_action() end.
4854	 */
4855	if (sd_has_rps_ipi_waiting(sd)) {
4856		local_irq_disable();
4857		net_rps_action_and_irq_enable(sd);
4858	}
4859
4860	napi->weight = weight_p;
4861	while (again) {
4862		struct sk_buff *skb;
4863
4864		while ((skb = __skb_dequeue(&sd->process_queue))) {
4865			rcu_read_lock();
4866			__netif_receive_skb(skb);
4867			rcu_read_unlock();
4868			input_queue_head_incr(sd);
4869			if (++work >= quota)
4870				return work;
4871
4872		}
4873
4874		local_irq_disable();
4875		rps_lock(sd);
4876		if (skb_queue_empty(&sd->input_pkt_queue)) {
4877			/*
4878			 * Inline a custom version of __napi_complete().
4879			 * only current cpu owns and manipulates this napi,
4880			 * and NAPI_STATE_SCHED is the only possible flag set
4881			 * on backlog.
4882			 * We can use a plain write instead of clear_bit(),
4883			 * and we dont need an smp_mb() memory barrier.
4884			 */
4885			napi->state = 0;
4886			again = false;
4887		} else {
4888			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4889						   &sd->process_queue);
4890		}
4891		rps_unlock(sd);
4892		local_irq_enable();
4893	}
4894
4895	return work;
4896}
4897
4898/**
4899 * __napi_schedule - schedule for receive
4900 * @n: entry to schedule
4901 *
4902 * The entry's receive function will be scheduled to run.
4903 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4904 */
4905void __napi_schedule(struct napi_struct *n)
4906{
4907	unsigned long flags;
4908
4909	local_irq_save(flags);
4910	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4911	local_irq_restore(flags);
4912}
4913EXPORT_SYMBOL(__napi_schedule);
4914
4915/**
4916 *	napi_schedule_prep - check if napi can be scheduled
4917 *	@n: napi context
4918 *
4919 * Test if NAPI routine is already running, and if not mark
4920 * it as running.  This is used as a condition variable
4921 * insure only one NAPI poll instance runs.  We also make
4922 * sure there is no pending NAPI disable.
4923 */
4924bool napi_schedule_prep(struct napi_struct *n)
4925{
4926	unsigned long val, new;
4927
4928	do {
4929		val = READ_ONCE(n->state);
4930		if (unlikely(val & NAPIF_STATE_DISABLE))
4931			return false;
4932		new = val | NAPIF_STATE_SCHED;
4933
4934		/* Sets STATE_MISSED bit if STATE_SCHED was already set
4935		 * This was suggested by Alexander Duyck, as compiler
4936		 * emits better code than :
4937		 * if (val & NAPIF_STATE_SCHED)
4938		 *     new |= NAPIF_STATE_MISSED;
4939		 */
4940		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4941						   NAPIF_STATE_MISSED;
4942	} while (cmpxchg(&n->state, val, new) != val);
4943
4944	return !(val & NAPIF_STATE_SCHED);
4945}
4946EXPORT_SYMBOL(napi_schedule_prep);
4947
4948/**
4949 * __napi_schedule_irqoff - schedule for receive
4950 * @n: entry to schedule
4951 *
4952 * Variant of __napi_schedule() assuming hard irqs are masked
 
 
 
 
4953 */
4954void __napi_schedule_irqoff(struct napi_struct *n)
4955{
4956	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 
 
 
4957}
4958EXPORT_SYMBOL(__napi_schedule_irqoff);
4959
4960bool __napi_complete(struct napi_struct *n)
4961{
4962	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4963
4964	/* Some drivers call us directly, instead of calling
4965	 * napi_complete_done().
4966	 */
4967	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4968		return false;
4969
4970	list_del_init(&n->poll_list);
4971	smp_mb__before_atomic();
4972	clear_bit(NAPI_STATE_SCHED, &n->state);
4973	return true;
4974}
4975EXPORT_SYMBOL(__napi_complete);
4976
4977bool napi_complete_done(struct napi_struct *n, int work_done)
4978{
4979	unsigned long flags, val, new;
 
4980
4981	/*
4982	 * 1) Don't let napi dequeue from the cpu poll list
4983	 *    just in case its running on a different cpu.
4984	 * 2) If we are busy polling, do nothing here, we have
4985	 *    the guarantee we will be called later.
4986	 */
4987	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4988				 NAPIF_STATE_IN_BUSY_POLL)))
4989		return false;
4990
4991	if (n->gro_list) {
4992		unsigned long timeout = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4993
4994		if (work_done)
4995			timeout = n->dev->gro_flush_timeout;
4996
4997		if (timeout)
4998			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4999				      HRTIMER_MODE_REL_PINNED);
5000		else
5001			napi_gro_flush(n, false);
5002	}
5003	if (unlikely(!list_empty(&n->poll_list))) {
5004		/* If n->poll_list is not empty, we need to mask irqs */
5005		local_irq_save(flags);
5006		list_del_init(&n->poll_list);
5007		local_irq_restore(flags);
5008	}
 
5009
 
5010	do {
5011		val = READ_ONCE(n->state);
5012
5013		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5014
5015		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
 
 
5016
5017		/* If STATE_MISSED was set, leave STATE_SCHED set,
5018		 * because we will call napi->poll() one more time.
5019		 * This C code was suggested by Alexander Duyck to help gcc.
5020		 */
5021		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5022						    NAPIF_STATE_SCHED;
5023	} while (cmpxchg(&n->state, val, new) != val);
5024
5025	if (unlikely(val & NAPIF_STATE_MISSED)) {
5026		__napi_schedule(n);
5027		return false;
5028	}
5029
5030	return true;
 
 
 
5031}
5032EXPORT_SYMBOL(napi_complete_done);
5033
5034/* must be called under rcu_read_lock(), as we dont take a reference */
5035static struct napi_struct *napi_by_id(unsigned int napi_id)
5036{
5037	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5038	struct napi_struct *napi;
5039
5040	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5041		if (napi->napi_id == napi_id)
5042			return napi;
5043
5044	return NULL;
5045}
5046
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5047#if defined(CONFIG_NET_RX_BUSY_POLL)
5048
5049#define BUSY_POLL_BUDGET 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5050
5051static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 
5052{
 
 
5053	int rc;
5054
5055	/* Busy polling means there is a high chance device driver hard irq
5056	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5057	 * set in napi_schedule_prep().
5058	 * Since we are about to call napi->poll() once more, we can safely
5059	 * clear NAPI_STATE_MISSED.
5060	 *
5061	 * Note: x86 could use a single "lock and ..." instruction
5062	 * to perform these two clear_bit()
5063	 */
5064	clear_bit(NAPI_STATE_MISSED, &napi->state);
5065	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5066
5067	local_bh_disable();
5068
 
 
 
 
 
 
 
 
 
5069	/* All we really want here is to re-enable device interrupts.
5070	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5071	 */
5072	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 
 
 
 
 
5073	netpoll_poll_unlock(have_poll_lock);
5074	if (rc == BUSY_POLL_BUDGET)
5075		__napi_schedule(napi);
5076	local_bh_enable();
5077	if (local_softirq_pending())
5078		do_softirq();
5079}
5080
5081bool sk_busy_loop(struct sock *sk, int nonblock)
 
 
5082{
5083	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5084	int (*napi_poll)(struct napi_struct *napi, int budget);
5085	int (*busy_poll)(struct napi_struct *dev);
5086	void *have_poll_lock = NULL;
5087	struct napi_struct *napi;
5088	int rc;
 
5089
5090restart:
5091	rc = false;
5092	napi_poll = NULL;
5093
5094	rcu_read_lock();
5095
5096	napi = napi_by_id(sk->sk_napi_id);
5097	if (!napi)
5098		goto out;
5099
5100	/* Note: ndo_busy_poll method is optional in linux-4.5 */
5101	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
 
5102
5103	preempt_disable();
5104	for (;;) {
5105		rc = 0;
5106		local_bh_disable();
5107		if (busy_poll) {
5108			rc = busy_poll(napi);
5109			goto count;
5110		}
5111		if (!napi_poll) {
5112			unsigned long val = READ_ONCE(napi->state);
5113
5114			/* If multiple threads are competing for this napi,
5115			 * we avoid dirtying napi->state as much as we can.
5116			 */
5117			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5118				   NAPIF_STATE_IN_BUSY_POLL))
 
 
5119				goto count;
 
5120			if (cmpxchg(&napi->state, val,
5121				    val | NAPIF_STATE_IN_BUSY_POLL |
5122					  NAPIF_STATE_SCHED) != val)
 
 
5123				goto count;
 
5124			have_poll_lock = netpoll_poll_lock(napi);
5125			napi_poll = napi->poll;
5126		}
5127		rc = napi_poll(napi, BUSY_POLL_BUDGET);
5128		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
 
5129count:
5130		if (rc > 0)
5131			__NET_ADD_STATS(sock_net(sk),
5132					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
 
5133		local_bh_enable();
5134
5135		if (rc == LL_FLUSH_FAILED)
5136			break; /* permanent failure */
5137
5138		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5139		    busy_loop_timeout(end_time))
5140			break;
5141
5142		if (unlikely(need_resched())) {
 
 
5143			if (napi_poll)
5144				busy_poll_stop(napi, have_poll_lock);
5145			preempt_enable();
 
5146			rcu_read_unlock();
5147			cond_resched();
5148			rc = !skb_queue_empty(&sk->sk_receive_queue);
5149			if (rc || busy_loop_timeout(end_time))
5150				return rc;
5151			goto restart;
5152		}
5153		cpu_relax();
5154	}
5155	if (napi_poll)
5156		busy_poll_stop(napi, have_poll_lock);
5157	preempt_enable();
5158	rc = !skb_queue_empty(&sk->sk_receive_queue);
5159out:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5160	rcu_read_unlock();
5161	return rc;
5162}
5163EXPORT_SYMBOL(sk_busy_loop);
5164
5165#endif /* CONFIG_NET_RX_BUSY_POLL */
5166
5167static void napi_hash_add(struct napi_struct *napi)
5168{
5169	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5170	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5171		return;
5172
5173	spin_lock(&napi_hash_lock);
5174
5175	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5176	do {
5177		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5178			napi_gen_id = NR_CPUS + 1;
5179	} while (napi_by_id(napi_gen_id));
5180	napi->napi_id = napi_gen_id;
5181
5182	hlist_add_head_rcu(&napi->napi_hash_node,
5183			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5184
5185	spin_unlock(&napi_hash_lock);
5186}
5187
5188/* Warning : caller is responsible to make sure rcu grace period
5189 * is respected before freeing memory containing @napi
5190 */
5191bool napi_hash_del(struct napi_struct *napi)
5192{
5193	bool rcu_sync_needed = false;
5194
5195	spin_lock(&napi_hash_lock);
5196
5197	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5198		rcu_sync_needed = true;
5199		hlist_del_rcu(&napi->napi_hash_node);
5200	}
5201	spin_unlock(&napi_hash_lock);
5202	return rcu_sync_needed;
5203}
5204EXPORT_SYMBOL_GPL(napi_hash_del);
5205
5206static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5207{
5208	struct napi_struct *napi;
5209
5210	napi = container_of(timer, struct napi_struct, timer);
5211
5212	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
5213	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5214	 */
5215	if (napi->gro_list && !napi_disable_pending(napi) &&
5216	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 
5217		__napi_schedule_irqoff(napi);
 
5218
5219	return HRTIMER_NORESTART;
5220}
5221
5222void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5223		    int (*poll)(struct napi_struct *, int), int weight)
 
 
 
 
 
 
 
 
 
 
5224{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5225	INIT_LIST_HEAD(&napi->poll_list);
 
5226	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5227	napi->timer.function = napi_watchdog;
5228	napi->gro_count = 0;
5229	napi->gro_list = NULL;
5230	napi->skb = NULL;
 
 
5231	napi->poll = poll;
5232	if (weight > NAPI_POLL_WEIGHT)
5233		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5234			    weight, dev->name);
5235	napi->weight = weight;
5236	list_add(&napi->dev_list, &dev->napi_list);
5237	napi->dev = dev;
5238#ifdef CONFIG_NETPOLL
5239	napi->poll_owner = -1;
5240#endif
 
5241	set_bit(NAPI_STATE_SCHED, &napi->state);
 
 
5242	napi_hash_add(napi);
 
 
 
 
 
 
 
 
5243}
5244EXPORT_SYMBOL(netif_napi_add);
5245
5246void napi_disable(struct napi_struct *n)
5247{
 
 
5248	might_sleep();
5249	set_bit(NAPI_STATE_DISABLE, &n->state);
5250
5251	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5252		msleep(1);
5253	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5254		msleep(1);
 
 
 
 
 
 
5255
5256	hrtimer_cancel(&n->timer);
5257
5258	clear_bit(NAPI_STATE_DISABLE, &n->state);
5259}
5260EXPORT_SYMBOL(napi_disable);
5261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5262/* Must be called in process context */
5263void netif_napi_del(struct napi_struct *napi)
5264{
5265	might_sleep();
5266	if (napi_hash_del(napi))
5267		synchronize_net();
5268	list_del_init(&napi->dev_list);
 
5269	napi_free_frags(napi);
5270
5271	kfree_skb_list(napi->gro_list);
5272	napi->gro_list = NULL;
5273	napi->gro_count = 0;
 
 
 
 
5274}
5275EXPORT_SYMBOL(netif_napi_del);
5276
5277static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5278{
5279	void *have;
5280	int work, weight;
5281
5282	list_del_init(&n->poll_list);
5283
5284	have = netpoll_poll_lock(n);
5285
5286	weight = n->weight;
5287
5288	/* This NAPI_STATE_SCHED test is for avoiding a race
5289	 * with netpoll's poll_napi().  Only the entity which
5290	 * obtains the lock and sees NAPI_STATE_SCHED set will
5291	 * actually make the ->poll() call.  Therefore we avoid
5292	 * accidentally calling ->poll() when NAPI is not scheduled.
5293	 */
5294	work = 0;
5295	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5296		work = n->poll(n, weight);
5297		trace_napi_poll(n, work, weight);
 
 
5298	}
5299
5300	WARN_ON_ONCE(work > weight);
 
 
5301
5302	if (likely(work < weight))
5303		goto out_unlock;
5304
5305	/* Drivers must not modify the NAPI state if they
5306	 * consume the entire weight.  In such cases this code
5307	 * still "owns" the NAPI instance and therefore can
5308	 * move the instance around on the list at-will.
5309	 */
5310	if (unlikely(napi_disable_pending(n))) {
5311		napi_complete(n);
5312		goto out_unlock;
 
 
 
 
 
 
 
 
 
 
 
 
 
5313	}
5314
5315	if (n->gro_list) {
5316		/* flush too old packets
5317		 * If HZ < 1000, flush all packets.
5318		 */
5319		napi_gro_flush(n, HZ >= 1000);
5320	}
5321
 
 
5322	/* Some drivers may have called napi_schedule
5323	 * prior to exhausting their budget.
5324	 */
5325	if (unlikely(!list_empty(&n->poll_list))) {
5326		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5327			     n->dev ? n->dev->name : "backlog");
5328		goto out_unlock;
5329	}
5330
5331	list_add_tail(&n->poll_list, repoll);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5332
5333out_unlock:
5334	netpoll_poll_unlock(have);
5335
5336	return work;
5337}
5338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5339static __latent_entropy void net_rx_action(struct softirq_action *h)
5340{
5341	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5342	unsigned long time_limit = jiffies + 2;
5343	int budget = netdev_budget;
 
5344	LIST_HEAD(list);
5345	LIST_HEAD(repoll);
5346
 
 
5347	local_irq_disable();
5348	list_splice_init(&sd->poll_list, &list);
5349	local_irq_enable();
5350
5351	for (;;) {
5352		struct napi_struct *n;
5353
 
 
5354		if (list_empty(&list)) {
5355			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5356				goto out;
 
 
 
 
 
 
 
 
 
 
5357			break;
5358		}
5359
5360		n = list_first_entry(&list, struct napi_struct, poll_list);
5361		budget -= napi_poll(n, &repoll);
5362
5363		/* If softirq window is exhausted then punt.
5364		 * Allow this to run for 2 jiffies since which will allow
5365		 * an average latency of 1.5/HZ.
5366		 */
5367		if (unlikely(budget <= 0 ||
5368			     time_after_eq(jiffies, time_limit))) {
5369			sd->time_squeeze++;
5370			break;
5371		}
5372	}
5373
5374	local_irq_disable();
5375
5376	list_splice_tail_init(&sd->poll_list, &list);
5377	list_splice_tail(&repoll, &list);
5378	list_splice(&list, &sd->poll_list);
5379	if (!list_empty(&sd->poll_list))
5380		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 
5381
5382	net_rps_action_and_irq_enable(sd);
5383out:
5384	__kfree_skb_flush();
5385}
5386
5387struct netdev_adjacent {
5388	struct net_device *dev;
 
5389
5390	/* upper master flag, there can only be one master device per list */
5391	bool master;
5392
 
 
 
5393	/* counter for the number of times this device was added to us */
5394	u16 ref_nr;
5395
5396	/* private field for the users */
5397	void *private;
5398
5399	struct list_head list;
5400	struct rcu_head rcu;
5401};
5402
5403static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5404						 struct list_head *adj_list)
5405{
5406	struct netdev_adjacent *adj;
5407
5408	list_for_each_entry(adj, adj_list, list) {
5409		if (adj->dev == adj_dev)
5410			return adj;
5411	}
5412	return NULL;
5413}
5414
/* Walker callback: non-zero when @upper_dev is the device passed in @data. */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	struct net_device *wanted = data;

	if (upper_dev == wanted)
		return 1;

	return 0;
}
5421
5422/**
5423 * netdev_has_upper_dev - Check if device is linked to an upper device
5424 * @dev: device
5425 * @upper_dev: upper device to check
5426 *
5427 * Find out if a device is linked to specified upper device and return true
5428 * in case it is. Note that this checks only immediate upper device,
5429 * not through a complete stack of devices. The caller must hold the RTNL lock.
5430 */
5431bool netdev_has_upper_dev(struct net_device *dev,
5432			  struct net_device *upper_dev)
5433{
 
 
 
 
5434	ASSERT_RTNL();
5435
5436	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5437					     upper_dev);
5438}
5439EXPORT_SYMBOL(netdev_has_upper_dev);
5440
5441/**
5442 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5443 * @dev: device
5444 * @upper_dev: upper device to check
5445 *
5446 * Find out if a device is linked to specified upper device and return true
5447 * in case it is. Note that this checks the entire upper device chain.
5448 * The caller must hold rcu lock.
5449 */
5450
5451bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5452				  struct net_device *upper_dev)
5453{
5454	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5455					       upper_dev);
 
 
 
 
5456}
5457EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5458
5459/**
5460 * netdev_has_any_upper_dev - Check if device is linked to some device
5461 * @dev: device
5462 *
5463 * Find out if a device is linked to an upper device and return true in case
5464 * it is. The caller must hold the RTNL lock.
5465 */
5466static bool netdev_has_any_upper_dev(struct net_device *dev)
5467{
5468	ASSERT_RTNL();
5469
5470	return !list_empty(&dev->adj_list.upper);
5471}
 
5472
5473/**
5474 * netdev_master_upper_dev_get - Get master upper device
5475 * @dev: device
5476 *
5477 * Find a master upper device and return pointer to it or NULL in case
5478 * it's not there. The caller must hold the RTNL lock.
5479 */
5480struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5481{
5482	struct netdev_adjacent *upper;
5483
5484	ASSERT_RTNL();
5485
5486	if (list_empty(&dev->adj_list.upper))
5487		return NULL;
5488
5489	upper = list_first_entry(&dev->adj_list.upper,
5490				 struct netdev_adjacent, list);
5491	if (likely(upper->master))
5492		return upper->dev;
5493	return NULL;
5494}
5495EXPORT_SYMBOL(netdev_master_upper_dev_get);
5496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5497/**
5498 * netdev_has_any_lower_dev - Check if device is linked to some device
5499 * @dev: device
5500 *
5501 * Find out if a device is linked to a lower device and return true in case
5502 * it is. The caller must hold the RTNL lock.
5503 */
5504static bool netdev_has_any_lower_dev(struct net_device *dev)
5505{
5506	ASSERT_RTNL();
5507
5508	return !list_empty(&dev->adj_list.lower);
5509}
5510
5511void *netdev_adjacent_get_private(struct list_head *adj_list)
5512{
5513	struct netdev_adjacent *adj;
5514
5515	adj = list_entry(adj_list, struct netdev_adjacent, list);
5516
5517	return adj->private;
5518}
5519EXPORT_SYMBOL(netdev_adjacent_get_private);
5520
5521/**
5522 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5523 * @dev: device
5524 * @iter: list_head ** of the current position
5525 *
5526 * Gets the next device from the dev's upper list, starting from iter
5527 * position. The caller must hold RCU read lock.
5528 */
5529struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5530						 struct list_head **iter)
5531{
5532	struct netdev_adjacent *upper;
5533
5534	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5535
5536	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5537
5538	if (&upper->list == &dev->adj_list.upper)
5539		return NULL;
5540
5541	*iter = &upper->list;
5542
5543	return upper->dev;
5544}
5545EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5546
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5547static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5548						    struct list_head **iter)
5549{
5550	struct netdev_adjacent *upper;
5551
5552	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5553
5554	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5555
5556	if (&upper->list == &dev->adj_list.upper)
5557		return NULL;
5558
5559	*iter = &upper->list;
5560
5561	return upper->dev;
5562}
5563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5564int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5565				  int (*fn)(struct net_device *dev,
5566					    void *data),
5567				  void *data)
5568{
5569	struct net_device *udev;
5570	struct list_head *iter;
5571	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5572
5573	for (iter = &dev->adj_list.upper,
5574	     udev = netdev_next_upper_dev_rcu(dev, &iter);
5575	     udev;
5576	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5577		/* first is the upper device itself */
5578		ret = fn(udev, data);
5579		if (ret)
5580			return ret;
5581
5582		/* then look at all of its upper devices */
5583		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5584		if (ret)
5585			return ret;
5586	}
5587
5588	return 0;
5589}
5590EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5592/**
5593 * netdev_lower_get_next_private - Get the next ->private from the
5594 *				   lower neighbour list
5595 * @dev: device
5596 * @iter: list_head ** of the current position
5597 *
5598 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5599 * list, starting from iter position. The caller must hold either hold the
5600 * RTNL lock or its own locking that guarantees that the neighbour lower
5601 * list will remain unchanged.
5602 */
5603void *netdev_lower_get_next_private(struct net_device *dev,
5604				    struct list_head **iter)
5605{
5606	struct netdev_adjacent *lower;
5607
5608	lower = list_entry(*iter, struct netdev_adjacent, list);
5609
5610	if (&lower->list == &dev->adj_list.lower)
5611		return NULL;
5612
5613	*iter = lower->list.next;
5614
5615	return lower->private;
5616}
5617EXPORT_SYMBOL(netdev_lower_get_next_private);
5618
5619/**
5620 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5621 *				       lower neighbour list, RCU
5622 *				       variant
5623 * @dev: device
5624 * @iter: list_head ** of the current position
5625 *
5626 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5627 * list, starting from iter position. The caller must hold RCU read lock.
5628 */
5629void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5630					struct list_head **iter)
5631{
5632	struct netdev_adjacent *lower;
5633
5634	WARN_ON_ONCE(!rcu_read_lock_held());
5635
5636	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5637
5638	if (&lower->list == &dev->adj_list.lower)
5639		return NULL;
5640
5641	*iter = &lower->list;
5642
5643	return lower->private;
5644}
5645EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5646
5647/**
5648 * netdev_lower_get_next - Get the next device from the lower neighbour
5649 *                         list
5650 * @dev: device
5651 * @iter: list_head ** of the current position
5652 *
5653 * Gets the next netdev_adjacent from the dev's lower neighbour
5654 * list, starting from iter position. The caller must hold RTNL lock or
5655 * its own locking that guarantees that the neighbour lower
5656 * list will remain unchanged.
5657 */
5658void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5659{
5660	struct netdev_adjacent *lower;
5661
5662	lower = list_entry(*iter, struct netdev_adjacent, list);
5663
5664	if (&lower->list == &dev->adj_list.lower)
5665		return NULL;
5666
5667	*iter = lower->list.next;
5668
5669	return lower->dev;
5670}
5671EXPORT_SYMBOL(netdev_lower_get_next);
5672
5673static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5674						struct list_head **iter)
5675{
5676	struct netdev_adjacent *lower;
5677
5678	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5679
5680	if (&lower->list == &dev->adj_list.lower)
5681		return NULL;
5682
5683	*iter = &lower->list;
5684
5685	return lower->dev;
5686}
5687
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5688int netdev_walk_all_lower_dev(struct net_device *dev,
5689			      int (*fn)(struct net_device *dev,
5690					void *data),
5691			      void *data)
5692{
5693	struct net_device *ldev;
5694	struct list_head *iter;
5695	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5696
5697	for (iter = &dev->adj_list.lower,
5698	     ldev = netdev_next_lower_dev(dev, &iter);
5699	     ldev;
5700	     ldev = netdev_next_lower_dev(dev, &iter)) {
5701		/* first is the lower device itself */
5702		ret = fn(ldev, data);
5703		if (ret)
5704			return ret;
5705
5706		/* then look at all of its lower devices */
5707		ret = netdev_walk_all_lower_dev(ldev, fn, data);
5708		if (ret)
5709			return ret;
5710	}
5711
5712	return 0;
5713}
5714EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5715
5716static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5717						    struct list_head **iter)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5718{
5719	struct netdev_adjacent *lower;
5720
5721	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5722	if (&lower->list == &dev->adj_list.lower)
5723		return NULL;
5724
5725	*iter = &lower->list;
5726
5727	return lower->dev;
5728}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5729
5730int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5731				  int (*fn)(struct net_device *dev,
5732					    void *data),
5733				  void *data)
5734{
5735	struct net_device *ldev;
5736	struct list_head *iter;
5737	int ret;
 
5738
5739	for (iter = &dev->adj_list.lower,
5740	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
5741	     ldev;
5742	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5743		/* first is the lower device itself */
5744		ret = fn(ldev, data);
5745		if (ret)
5746			return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5747
5748		/* then look at all of its lower devices */
5749		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5750		if (ret)
5751			return ret;
5752	}
5753
5754	return 0;
5755}
5756EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5757
5758/**
5759 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5760 *				       lower neighbour list, RCU
5761 *				       variant
5762 * @dev: device
5763 *
5764 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5765 * list. The caller must hold RCU read lock.
5766 */
5767void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5768{
5769	struct netdev_adjacent *lower;
5770
5771	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5772			struct netdev_adjacent, list);
5773	if (lower)
5774		return lower->private;
5775	return NULL;
5776}
5777EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5778
5779/**
5780 * netdev_master_upper_dev_get_rcu - Get master upper device
5781 * @dev: device
5782 *
5783 * Find a master upper device and return pointer to it or NULL in case
5784 * it's not there. The caller must hold the RCU read lock.
5785 */
5786struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5787{
5788	struct netdev_adjacent *upper;
5789
5790	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5791				       struct netdev_adjacent, list);
5792	if (upper && likely(upper->master))
5793		return upper->dev;
5794	return NULL;
5795}
5796EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5797
5798static int netdev_adjacent_sysfs_add(struct net_device *dev,
5799			      struct net_device *adj_dev,
5800			      struct list_head *dev_list)
5801{
5802	char linkname[IFNAMSIZ+7];
 
5803	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5804		"upper_%s" : "lower_%s", adj_dev->name);
5805	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5806				 linkname);
5807}
5808static void netdev_adjacent_sysfs_del(struct net_device *dev,
5809			       char *name,
5810			       struct list_head *dev_list)
5811{
5812	char linkname[IFNAMSIZ+7];
 
5813	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5814		"upper_%s" : "lower_%s", name);
5815	sysfs_remove_link(&(dev->dev.kobj), linkname);
5816}
5817
5818static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5819						 struct net_device *adj_dev,
5820						 struct list_head *dev_list)
5821{
5822	return (dev_list == &dev->adj_list.upper ||
5823		dev_list == &dev->adj_list.lower) &&
5824		net_eq(dev_net(dev), dev_net(adj_dev));
5825}
5826
5827static int __netdev_adjacent_dev_insert(struct net_device *dev,
5828					struct net_device *adj_dev,
5829					struct list_head *dev_list,
5830					void *private, bool master)
5831{
5832	struct netdev_adjacent *adj;
5833	int ret;
5834
5835	adj = __netdev_find_adj(adj_dev, dev_list);
5836
5837	if (adj) {
5838		adj->ref_nr += 1;
5839		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5840			 dev->name, adj_dev->name, adj->ref_nr);
5841
5842		return 0;
5843	}
5844
5845	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5846	if (!adj)
5847		return -ENOMEM;
5848
5849	adj->dev = adj_dev;
5850	adj->master = master;
5851	adj->ref_nr = 1;
5852	adj->private = private;
5853	dev_hold(adj_dev);
 
5854
5855	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5856		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5857
5858	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5859		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5860		if (ret)
5861			goto free_adj;
5862	}
5863
5864	/* Ensure that master link is always the first item in list. */
5865	if (master) {
5866		ret = sysfs_create_link(&(dev->dev.kobj),
5867					&(adj_dev->dev.kobj), "master");
5868		if (ret)
5869			goto remove_symlinks;
5870
5871		list_add_rcu(&adj->list, dev_list);
5872	} else {
5873		list_add_tail_rcu(&adj->list, dev_list);
5874	}
5875
5876	return 0;
5877
5878remove_symlinks:
5879	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5880		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5881free_adj:
 
5882	kfree(adj);
5883	dev_put(adj_dev);
5884
5885	return ret;
5886}
5887
5888static void __netdev_adjacent_dev_remove(struct net_device *dev,
5889					 struct net_device *adj_dev,
5890					 u16 ref_nr,
5891					 struct list_head *dev_list)
5892{
5893	struct netdev_adjacent *adj;
5894
5895	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5896		 dev->name, adj_dev->name, ref_nr);
5897
5898	adj = __netdev_find_adj(adj_dev, dev_list);
5899
5900	if (!adj) {
5901		pr_err("Adjacency does not exist for device %s from %s\n",
5902		       dev->name, adj_dev->name);
5903		WARN_ON(1);
5904		return;
5905	}
5906
5907	if (adj->ref_nr > ref_nr) {
5908		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5909			 dev->name, adj_dev->name, ref_nr,
5910			 adj->ref_nr - ref_nr);
5911		adj->ref_nr -= ref_nr;
5912		return;
5913	}
5914
5915	if (adj->master)
5916		sysfs_remove_link(&(dev->dev.kobj), "master");
5917
5918	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5919		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5920
5921	list_del_rcu(&adj->list);
5922	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5923		 adj_dev->name, dev->name, adj_dev->name);
5924	dev_put(adj_dev);
5925	kfree_rcu(adj, rcu);
5926}
5927
5928static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5929					    struct net_device *upper_dev,
5930					    struct list_head *up_list,
5931					    struct list_head *down_list,
5932					    void *private, bool master)
5933{
5934	int ret;
5935
5936	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5937					   private, master);
5938	if (ret)
5939		return ret;
5940
5941	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5942					   private, false);
5943	if (ret) {
5944		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5945		return ret;
5946	}
5947
5948	return 0;
5949}
5950
5951static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5952					       struct net_device *upper_dev,
5953					       u16 ref_nr,
5954					       struct list_head *up_list,
5955					       struct list_head *down_list)
5956{
5957	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5958	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5959}
5960
5961static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5962						struct net_device *upper_dev,
5963						void *private, bool master)
5964{
5965	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5966						&dev->adj_list.upper,
5967						&upper_dev->adj_list.lower,
5968						private, master);
5969}
5970
5971static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5972						   struct net_device *upper_dev)
5973{
5974	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5975					   &dev->adj_list.upper,
5976					   &upper_dev->adj_list.lower);
5977}
5978
5979static int __netdev_upper_dev_link(struct net_device *dev,
5980				   struct net_device *upper_dev, bool master,
5981				   void *upper_priv, void *upper_info)
5982{
5983	struct netdev_notifier_changeupper_info changeupper_info;
 
 
 
 
 
 
 
 
 
 
 
 
5984	int ret = 0;
5985
5986	ASSERT_RTNL();
5987
5988	if (dev == upper_dev)
5989		return -EBUSY;
5990
5991	/* To prevent loops, check if dev is not upper device to upper_dev. */
5992	if (netdev_has_upper_dev(upper_dev, dev))
5993		return -EBUSY;
5994
5995	if (netdev_has_upper_dev(dev, upper_dev))
5996		return -EEXIST;
5997
5998	if (master && netdev_master_upper_dev_get(dev))
5999		return -EBUSY;
6000
6001	changeupper_info.upper_dev = upper_dev;
6002	changeupper_info.master = master;
6003	changeupper_info.linking = true;
6004	changeupper_info.upper_info = upper_info;
 
6005
6006	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6007					    &changeupper_info.info);
6008	ret = notifier_to_errno(ret);
6009	if (ret)
6010		return ret;
6011
6012	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6013						   master);
6014	if (ret)
6015		return ret;
6016
6017	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6018					    &changeupper_info.info);
6019	ret = notifier_to_errno(ret);
6020	if (ret)
6021		goto rollback;
6022
 
 
 
 
 
 
 
6023	return 0;
6024
6025rollback:
6026	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6027
6028	return ret;
6029}
6030
6031/**
6032 * netdev_upper_dev_link - Add a link to the upper device
6033 * @dev: device
6034 * @upper_dev: new upper device
 
6035 *
6036 * Adds a link to device which is upper to this one. The caller must hold
6037 * the RTNL lock. On a failure a negative errno code is returned.
6038 * On success the reference counts are adjusted and the function
6039 * returns zero.
6040 */
6041int netdev_upper_dev_link(struct net_device *dev,
6042			  struct net_device *upper_dev)
 
6043{
6044	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
 
 
 
 
 
 
6045}
6046EXPORT_SYMBOL(netdev_upper_dev_link);
6047
6048/**
6049 * netdev_master_upper_dev_link - Add a master link to the upper device
6050 * @dev: device
6051 * @upper_dev: new upper device
6052 * @upper_priv: upper device private
6053 * @upper_info: upper info to be passed down via notifier
 
6054 *
6055 * Adds a link to device which is upper to this one. In this case, only
6056 * one master upper device can be linked, although other non-master devices
6057 * might be linked as well. The caller must hold the RTNL lock.
6058 * On a failure a negative errno code is returned. On success the reference
6059 * counts are adjusted and the function returns zero.
6060 */
6061int netdev_master_upper_dev_link(struct net_device *dev,
6062				 struct net_device *upper_dev,
6063				 void *upper_priv, void *upper_info)
 
6064{
 
 
 
 
 
6065	return __netdev_upper_dev_link(dev, upper_dev, true,
6066				       upper_priv, upper_info);
6067}
6068EXPORT_SYMBOL(netdev_master_upper_dev_link);
6069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6070/**
6071 * netdev_upper_dev_unlink - Removes a link to upper device
6072 * @dev: device
6073 * @upper_dev: new upper device
6074 *
6075 * Removes a link to device which is upper to this one. The caller must hold
6076 * the RTNL lock.
6077 */
6078void netdev_upper_dev_unlink(struct net_device *dev,
6079			     struct net_device *upper_dev)
6080{
6081	struct netdev_notifier_changeupper_info changeupper_info;
6082	ASSERT_RTNL();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6083
6084	changeupper_info.upper_dev = upper_dev;
6085	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6086	changeupper_info.linking = false;
 
 
 
 
 
6087
6088	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6089				      &changeupper_info.info);
6090
6091	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
6092
6093	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6094				      &changeupper_info.info);
6095}
6096EXPORT_SYMBOL(netdev_upper_dev_unlink);
6097
6098/**
6099 * netdev_bonding_info_change - Dispatch event about slave change
6100 * @dev: device
6101 * @bonding_info: info to dispatch
6102 *
6103 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6104 * The caller must hold the RTNL lock.
6105 */
6106void netdev_bonding_info_change(struct net_device *dev,
6107				struct netdev_bonding_info *bonding_info)
6108{
6109	struct netdev_notifier_bonding_info	info;
 
 
6110
6111	memcpy(&info.bonding_info, bonding_info,
6112	       sizeof(struct netdev_bonding_info));
6113	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6114				      &info.info);
6115}
6116EXPORT_SYMBOL(netdev_bonding_info_change);
6117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6118static void netdev_adjacent_add_links(struct net_device *dev)
6119{
6120	struct netdev_adjacent *iter;
6121
6122	struct net *net = dev_net(dev);
6123
6124	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6125		if (!net_eq(net, dev_net(iter->dev)))
6126			continue;
6127		netdev_adjacent_sysfs_add(iter->dev, dev,
6128					  &iter->dev->adj_list.lower);
6129		netdev_adjacent_sysfs_add(dev, iter->dev,
6130					  &dev->adj_list.upper);
6131	}
6132
6133	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6134		if (!net_eq(net, dev_net(iter->dev)))
6135			continue;
6136		netdev_adjacent_sysfs_add(iter->dev, dev,
6137					  &iter->dev->adj_list.upper);
6138		netdev_adjacent_sysfs_add(dev, iter->dev,
6139					  &dev->adj_list.lower);
6140	}
6141}
6142
6143static void netdev_adjacent_del_links(struct net_device *dev)
6144{
6145	struct netdev_adjacent *iter;
6146
6147	struct net *net = dev_net(dev);
6148
6149	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6150		if (!net_eq(net, dev_net(iter->dev)))
6151			continue;
6152		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6153					  &iter->dev->adj_list.lower);
6154		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6155					  &dev->adj_list.upper);
6156	}
6157
6158	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6159		if (!net_eq(net, dev_net(iter->dev)))
6160			continue;
6161		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6162					  &iter->dev->adj_list.upper);
6163		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6164					  &dev->adj_list.lower);
6165	}
6166}
6167
6168void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6169{
6170	struct netdev_adjacent *iter;
6171
6172	struct net *net = dev_net(dev);
6173
6174	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6175		if (!net_eq(net, dev_net(iter->dev)))
6176			continue;
6177		netdev_adjacent_sysfs_del(iter->dev, oldname,
6178					  &iter->dev->adj_list.lower);
6179		netdev_adjacent_sysfs_add(iter->dev, dev,
6180					  &iter->dev->adj_list.lower);
6181	}
6182
6183	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6184		if (!net_eq(net, dev_net(iter->dev)))
6185			continue;
6186		netdev_adjacent_sysfs_del(iter->dev, oldname,
6187					  &iter->dev->adj_list.upper);
6188		netdev_adjacent_sysfs_add(iter->dev, dev,
6189					  &iter->dev->adj_list.upper);
6190	}
6191}
6192
6193void *netdev_lower_dev_get_private(struct net_device *dev,
6194				   struct net_device *lower_dev)
6195{
6196	struct netdev_adjacent *lower;
6197
6198	if (!lower_dev)
6199		return NULL;
6200	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6201	if (!lower)
6202		return NULL;
6203
6204	return lower->private;
6205}
6206EXPORT_SYMBOL(netdev_lower_dev_get_private);
6207
6208
6209int dev_get_nest_level(struct net_device *dev)
6210{
6211	struct net_device *lower = NULL;
6212	struct list_head *iter;
6213	int max_nest = -1;
6214	int nest;
6215
6216	ASSERT_RTNL();
6217
6218	netdev_for_each_lower_dev(dev, lower, iter) {
6219		nest = dev_get_nest_level(lower);
6220		if (max_nest < nest)
6221			max_nest = nest;
6222	}
6223
6224	return max_nest + 1;
6225}
6226EXPORT_SYMBOL(dev_get_nest_level);
6227
6228/**
6229 * netdev_lower_change - Dispatch event about lower device state change
6230 * @lower_dev: device
6231 * @lower_state_info: state to dispatch
6232 *
6233 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6234 * The caller must hold the RTNL lock.
6235 */
6236void netdev_lower_state_changed(struct net_device *lower_dev,
6237				void *lower_state_info)
6238{
6239	struct netdev_notifier_changelowerstate_info changelowerstate_info;
 
 
6240
6241	ASSERT_RTNL();
6242	changelowerstate_info.lower_state_info = lower_state_info;
6243	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6244				      &changelowerstate_info.info);
6245}
6246EXPORT_SYMBOL(netdev_lower_state_changed);
6247
6248int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6249					   struct neighbour *n)
6250{
6251	struct net_device *lower_dev, *stop_dev;
6252	struct list_head *iter;
6253	int err;
6254
6255	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6256		if (!lower_dev->netdev_ops->ndo_neigh_construct)
6257			continue;
6258		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6259		if (err) {
6260			stop_dev = lower_dev;
6261			goto rollback;
6262		}
6263	}
6264	return 0;
6265
6266rollback:
6267	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6268		if (lower_dev == stop_dev)
6269			break;
6270		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6271			continue;
6272		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6273	}
6274	return err;
6275}
6276EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6277
6278void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6279					  struct neighbour *n)
6280{
6281	struct net_device *lower_dev;
6282	struct list_head *iter;
6283
6284	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6285		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6286			continue;
6287		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6288	}
6289}
6290EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6291
6292static void dev_change_rx_flags(struct net_device *dev, int flags)
6293{
6294	const struct net_device_ops *ops = dev->netdev_ops;
6295
6296	if (ops->ndo_change_rx_flags)
6297		ops->ndo_change_rx_flags(dev, flags);
6298}
6299
6300static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6301{
6302	unsigned int old_flags = dev->flags;
6303	kuid_t uid;
6304	kgid_t gid;
6305
6306	ASSERT_RTNL();
6307
6308	dev->flags |= IFF_PROMISC;
6309	dev->promiscuity += inc;
6310	if (dev->promiscuity == 0) {
6311		/*
6312		 * Avoid overflow.
6313		 * If inc causes overflow, untouch promisc and return error.
6314		 */
6315		if (inc < 0)
6316			dev->flags &= ~IFF_PROMISC;
6317		else {
6318			dev->promiscuity -= inc;
6319			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6320				dev->name);
6321			return -EOVERFLOW;
6322		}
6323	}
6324	if (dev->flags != old_flags) {
6325		pr_info("device %s %s promiscuous mode\n",
6326			dev->name,
6327			dev->flags & IFF_PROMISC ? "entered" : "left");
6328		if (audit_enabled) {
6329			current_uid_gid(&uid, &gid);
6330			audit_log(current->audit_context, GFP_ATOMIC,
6331				AUDIT_ANOM_PROMISCUOUS,
6332				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6333				dev->name, (dev->flags & IFF_PROMISC),
6334				(old_flags & IFF_PROMISC),
6335				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6336				from_kuid(&init_user_ns, uid),
6337				from_kgid(&init_user_ns, gid),
6338				audit_get_sessionid(current));
6339		}
6340
6341		dev_change_rx_flags(dev, IFF_PROMISC);
6342	}
6343	if (notify)
6344		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6345	return 0;
6346}
6347
6348/**
6349 *	dev_set_promiscuity	- update promiscuity count on a device
6350 *	@dev: device
6351 *	@inc: modifier
6352 *
6353 *	Add or remove promiscuity from a device. While the count in the device
6354 *	remains above zero the interface remains promiscuous. Once it hits zero
6355 *	the device reverts back to normal filtering operation. A negative inc
6356 *	value is used to drop promiscuity on the device.
6357 *	Return 0 if successful or a negative errno code on error.
6358 */
6359int dev_set_promiscuity(struct net_device *dev, int inc)
6360{
6361	unsigned int old_flags = dev->flags;
6362	int err;
6363
6364	err = __dev_set_promiscuity(dev, inc, true);
6365	if (err < 0)
6366		return err;
6367	if (dev->flags != old_flags)
6368		dev_set_rx_mode(dev);
6369	return err;
6370}
6371EXPORT_SYMBOL(dev_set_promiscuity);
6372
6373static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6374{
6375	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6376
6377	ASSERT_RTNL();
6378
6379	dev->flags |= IFF_ALLMULTI;
6380	dev->allmulti += inc;
6381	if (dev->allmulti == 0) {
6382		/*
6383		 * Avoid overflow.
6384		 * If inc causes overflow, untouch allmulti and return error.
6385		 */
6386		if (inc < 0)
6387			dev->flags &= ~IFF_ALLMULTI;
6388		else {
6389			dev->allmulti -= inc;
6390			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6391				dev->name);
6392			return -EOVERFLOW;
6393		}
6394	}
6395	if (dev->flags ^ old_flags) {
 
 
6396		dev_change_rx_flags(dev, IFF_ALLMULTI);
6397		dev_set_rx_mode(dev);
6398		if (notify)
6399			__dev_notify_flags(dev, old_flags,
6400					   dev->gflags ^ old_gflags);
6401	}
6402	return 0;
6403}
6404
6405/**
6406 *	dev_set_allmulti	- update allmulti count on a device
6407 *	@dev: device
6408 *	@inc: modifier
6409 *
6410 *	Add or remove reception of all multicast frames to a device. While the
6411 *	count in the device remains above zero the interface remains listening
6412 *	to all interfaces. Once it hits zero the device reverts back to normal
6413 *	filtering operation. A negative @inc value is used to drop the counter
6414 *	when releasing a resource needing all multicasts.
6415 *	Return 0 if successful or a negative errno code on error.
6416 */
6417
6418int dev_set_allmulti(struct net_device *dev, int inc)
6419{
6420	return __dev_set_allmulti(dev, inc, true);
6421}
6422EXPORT_SYMBOL(dev_set_allmulti);
6423
6424/*
6425 *	Upload unicast and multicast address lists to device and
6426 *	configure RX filtering. When the device doesn't support unicast
6427 *	filtering it is put in promiscuous mode while unicast addresses
6428 *	are present.
6429 */
6430void __dev_set_rx_mode(struct net_device *dev)
6431{
6432	const struct net_device_ops *ops = dev->netdev_ops;
6433
6434	/* dev_open will call this function so the list will stay sane. */
6435	if (!(dev->flags&IFF_UP))
6436		return;
6437
6438	if (!netif_device_present(dev))
6439		return;
6440
6441	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6442		/* Unicast addresses changes may only happen under the rtnl,
6443		 * therefore calling __dev_set_promiscuity here is safe.
6444		 */
6445		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6446			__dev_set_promiscuity(dev, 1, false);
6447			dev->uc_promisc = true;
6448		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6449			__dev_set_promiscuity(dev, -1, false);
6450			dev->uc_promisc = false;
6451		}
6452	}
6453
6454	if (ops->ndo_set_rx_mode)
6455		ops->ndo_set_rx_mode(dev);
6456}
6457
/* Run __dev_set_rx_mode() on @dev with the address list lock held. */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6464
6465/**
6466 *	dev_get_flags - get flags reported to userspace
6467 *	@dev: device
6468 *
6469 *	Get the combination of flag bits exported through APIs to userspace.
6470 */
6471unsigned int dev_get_flags(const struct net_device *dev)
6472{
6473	unsigned int flags;
6474
6475	flags = (dev->flags & ~(IFF_PROMISC |
6476				IFF_ALLMULTI |
6477				IFF_RUNNING |
6478				IFF_LOWER_UP |
6479				IFF_DORMANT)) |
6480		(dev->gflags & (IFF_PROMISC |
6481				IFF_ALLMULTI));
6482
6483	if (netif_running(dev)) {
6484		if (netif_oper_up(dev))
6485			flags |= IFF_RUNNING;
6486		if (netif_carrier_ok(dev))
6487			flags |= IFF_LOWER_UP;
6488		if (netif_dormant(dev))
6489			flags |= IFF_DORMANT;
6490	}
6491
6492	return flags;
6493}
6494EXPORT_SYMBOL(dev_get_flags);
6495
6496int __dev_change_flags(struct net_device *dev, unsigned int flags)
 
6497{
6498	unsigned int old_flags = dev->flags;
6499	int ret;
6500
6501	ASSERT_RTNL();
6502
6503	/*
6504	 *	Set the flags on our device.
6505	 */
6506
6507	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6508			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6509			       IFF_AUTOMEDIA)) |
6510		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6511				    IFF_ALLMULTI));
6512
6513	/*
6514	 *	Load in the correct multicast list now the flags have changed.
6515	 */
6516
6517	if ((old_flags ^ flags) & IFF_MULTICAST)
6518		dev_change_rx_flags(dev, IFF_MULTICAST);
6519
6520	dev_set_rx_mode(dev);
6521
6522	/*
6523	 *	Have we downed the interface. We handle IFF_UP ourselves
6524	 *	according to user attempts to set it, rather than blindly
6525	 *	setting it.
6526	 */
6527
6528	ret = 0;
6529	if ((old_flags ^ flags) & IFF_UP)
6530		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
 
 
 
 
6531
6532	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6533		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6534		unsigned int old_flags = dev->flags;
6535
6536		dev->gflags ^= IFF_PROMISC;
6537
6538		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6539			if (dev->flags != old_flags)
6540				dev_set_rx_mode(dev);
6541	}
6542
6543	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6544	   is important. Some (broken) drivers set IFF_PROMISC, when
6545	   IFF_ALLMULTI is requested not asking us and not reporting.
6546	 */
6547	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6548		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6549
6550		dev->gflags ^= IFF_ALLMULTI;
6551		__dev_set_allmulti(dev, inc, false);
6552	}
6553
6554	return ret;
6555}
6556
6557void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6558			unsigned int gchanges)
 
6559{
6560	unsigned int changes = dev->flags ^ old_flags;
6561
6562	if (gchanges)
6563		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6564
6565	if (changes & IFF_UP) {
6566		if (dev->flags & IFF_UP)
6567			call_netdevice_notifiers(NETDEV_UP, dev);
6568		else
6569			call_netdevice_notifiers(NETDEV_DOWN, dev);
6570	}
6571
6572	if (dev->flags & IFF_UP &&
6573	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6574		struct netdev_notifier_change_info change_info;
 
 
 
 
 
6575
6576		change_info.flags_changed = changes;
6577		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6578					      &change_info.info);
6579	}
6580}
6581
6582/**
6583 *	dev_change_flags - change device settings
6584 *	@dev: device
6585 *	@flags: device state flags
 
6586 *
6587 *	Change settings on device based state flags. The flags are
6588 *	in the userspace exported format.
6589 */
6590int dev_change_flags(struct net_device *dev, unsigned int flags)
 
6591{
6592	int ret;
6593	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6594
6595	ret = __dev_change_flags(dev, flags);
6596	if (ret < 0)
6597		return ret;
6598
6599	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6600	__dev_notify_flags(dev, old_flags, changes);
6601	return ret;
6602}
6603EXPORT_SYMBOL(dev_change_flags);
6604
6605static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6606{
6607	const struct net_device_ops *ops = dev->netdev_ops;
6608
6609	if (ops->ndo_change_mtu)
6610		return ops->ndo_change_mtu(dev, new_mtu);
6611
6612	dev->mtu = new_mtu;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6613	return 0;
6614}
6615
6616/**
6617 *	dev_set_mtu - Change maximum transfer unit
6618 *	@dev: device
6619 *	@new_mtu: new transfer unit
 
6620 *
6621 *	Change the maximum transfer size of the network device.
6622 */
6623int dev_set_mtu(struct net_device *dev, int new_mtu)
 
6624{
6625	int err, orig_mtu;
6626
6627	if (new_mtu == dev->mtu)
6628		return 0;
6629
6630	/* MTU must be positive, and in range */
6631	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6632		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6633				    dev->name, new_mtu, dev->min_mtu);
6634		return -EINVAL;
6635	}
6636
6637	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6638		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6639				    dev->name, new_mtu, dev->max_mtu);
6640		return -EINVAL;
6641	}
6642
6643	if (!netif_device_present(dev))
6644		return -ENODEV;
6645
6646	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6647	err = notifier_to_errno(err);
6648	if (err)
6649		return err;
6650
6651	orig_mtu = dev->mtu;
6652	err = __dev_set_mtu(dev, new_mtu);
6653
6654	if (!err) {
6655		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
6656		err = notifier_to_errno(err);
6657		if (err) {
6658			/* setting mtu back and notifying everyone again,
6659			 * so that they have a chance to revert changes.
6660			 */
6661			__dev_set_mtu(dev, orig_mtu);
6662			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
6663		}
6664	}
6665	return err;
6666}
 
 
 
 
 
 
 
 
 
 
 
 
6667EXPORT_SYMBOL(dev_set_mtu);
6668
6669/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6670 *	dev_set_group - Change group this device belongs to
6671 *	@dev: device
6672 *	@new_group: group this device should belong to
6673 */
6674void dev_set_group(struct net_device *dev, int new_group)
6675{
6676	dev->group = new_group;
6677}
6678EXPORT_SYMBOL(dev_set_group);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6679
6680/**
6681 *	dev_set_mac_address - Change Media Access Control Address
6682 *	@dev: device
6683 *	@sa: new address
 
6684 *
6685 *	Change the hardware (MAC) address of the device
6686 */
6687int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 
6688{
6689	const struct net_device_ops *ops = dev->netdev_ops;
6690	int err;
6691
6692	if (!ops->ndo_set_mac_address)
6693		return -EOPNOTSUPP;
6694	if (sa->sa_family != dev->type)
6695		return -EINVAL;
6696	if (!netif_device_present(dev))
6697		return -ENODEV;
6698	err = ops->ndo_set_mac_address(dev, sa);
6699	if (err)
6700		return err;
 
 
 
 
 
6701	dev->addr_assign_type = NET_ADDR_SET;
6702	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6703	add_device_randomness(dev->dev_addr, dev->addr_len);
6704	return 0;
6705}
6706EXPORT_SYMBOL(dev_set_mac_address);
6707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6708/**
6709 *	dev_change_carrier - Change device carrier
6710 *	@dev: device
6711 *	@new_carrier: new value
6712 *
6713 *	Change device carrier
6714 */
6715int dev_change_carrier(struct net_device *dev, bool new_carrier)
6716{
6717	const struct net_device_ops *ops = dev->netdev_ops;
6718
6719	if (!ops->ndo_change_carrier)
6720		return -EOPNOTSUPP;
6721	if (!netif_device_present(dev))
6722		return -ENODEV;
6723	return ops->ndo_change_carrier(dev, new_carrier);
6724}
6725EXPORT_SYMBOL(dev_change_carrier);
6726
6727/**
6728 *	dev_get_phys_port_id - Get device physical port ID
6729 *	@dev: device
6730 *	@ppid: port ID
6731 *
6732 *	Get device physical port ID
6733 */
6734int dev_get_phys_port_id(struct net_device *dev,
6735			 struct netdev_phys_item_id *ppid)
6736{
6737	const struct net_device_ops *ops = dev->netdev_ops;
6738
6739	if (!ops->ndo_get_phys_port_id)
6740		return -EOPNOTSUPP;
6741	return ops->ndo_get_phys_port_id(dev, ppid);
6742}
6743EXPORT_SYMBOL(dev_get_phys_port_id);
6744
6745/**
6746 *	dev_get_phys_port_name - Get device physical port name
6747 *	@dev: device
6748 *	@name: port name
6749 *	@len: limit of bytes to copy to name
6750 *
6751 *	Get device physical port name
6752 */
6753int dev_get_phys_port_name(struct net_device *dev,
6754			   char *name, size_t len)
6755{
6756	const struct net_device_ops *ops = dev->netdev_ops;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6757
6758	if (!ops->ndo_get_phys_port_name)
6759		return -EOPNOTSUPP;
6760	return ops->ndo_get_phys_port_name(dev, name, len);
 
 
6761}
6762EXPORT_SYMBOL(dev_get_phys_port_name);
6763
6764/**
6765 *	dev_change_proto_down - update protocol port state information
 
6766 *	@dev: device
6767 *	@proto_down: new value
6768 *
6769 *	This info can be used by switch drivers to set the phys state of the
6770 *	port.
6771 */
6772int dev_change_proto_down(struct net_device *dev, bool proto_down)
6773{
6774	const struct net_device_ops *ops = dev->netdev_ops;
6775
6776	if (!ops->ndo_change_proto_down)
6777		return -EOPNOTSUPP;
6778	if (!netif_device_present(dev))
6779		return -ENODEV;
6780	return ops->ndo_change_proto_down(dev, proto_down);
 
 
 
 
 
6781}
6782EXPORT_SYMBOL(dev_change_proto_down);
6783
6784/**
6785 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 
6786 *	@dev: device
6787 *	@fd: new program fd or negative value to clear
6788 *	@flags: xdp-related flags
6789 *
6790 *	Set or clear a bpf program for a device
6791 */
6792int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6793{
6794	const struct net_device_ops *ops = dev->netdev_ops;
6795	struct bpf_prog *prog = NULL;
6796	struct netdev_xdp xdp;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6797	int err;
6798
6799	ASSERT_RTNL();
6800
6801	if (!ops->ndo_xdp)
6802		return -EOPNOTSUPP;
6803	if (fd >= 0) {
6804		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6805			memset(&xdp, 0, sizeof(xdp));
6806			xdp.command = XDP_QUERY_PROG;
6807
6808			err = ops->ndo_xdp(dev, &xdp);
6809			if (err < 0)
6810				return err;
6811			if (xdp.prog_attached)
6812				return -EBUSY;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6813		}
6814
6815		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6816		if (IS_ERR(prog))
6817			return PTR_ERR(prog);
6818	}
6819
6820	memset(&xdp, 0, sizeof(xdp));
6821	xdp.command = XDP_SETUP_PROG;
6822	xdp.prog = prog;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6823
6824	err = ops->ndo_xdp(dev, &xdp);
6825	if (err < 0 && prog)
6826		bpf_prog_put(prog);
6827
6828	return err;
 
 
 
6829}
6830EXPORT_SYMBOL(dev_change_xdp_fd);
6831
6832/**
6833 *	dev_new_index	-	allocate an ifindex
6834 *	@net: the applicable net namespace
6835 *
6836 *	Returns a suitable unique value for a new device interface
6837 *	number.  The caller must hold the rtnl semaphore or the
6838 *	dev_base_lock to be sure it remains unique.
6839 */
6840static int dev_new_index(struct net *net)
6841{
6842	int ifindex = net->ifindex;
6843	for (;;) {
6844		if (++ifindex <= 0)
6845			ifindex = 1;
6846		if (!__dev_get_by_index(net, ifindex))
6847			return net->ifindex = ifindex;
 
 
 
 
6848	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6849}
6850
6851/* Delayed registration/unregisteration */
6852static LIST_HEAD(net_todo_list);
6853DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 
 
 
 
 
 
 
 
 
 
6854
6855static void net_set_todo(struct net_device *dev)
 
6856{
6857	list_add_tail(&dev->todo_list, &net_todo_list);
6858	dev_net(dev)->dev_unreg_count++;
 
 
 
 
 
 
 
 
6859}
6860
6861static void rollback_registered_many(struct list_head *head)
 
6862{
6863	struct net_device *dev, *tmp;
6864	LIST_HEAD(close_head);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6865
6866	BUG_ON(dev_boot_phase);
6867	ASSERT_RTNL();
 
 
 
 
 
 
6868
6869	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6870		/* Some devices call without registering
6871		 * for initialization unwind. Remove those
6872		 * devices and proceed with the remaining.
6873		 */
6874		if (dev->reg_state == NETREG_UNINITIALIZED) {
6875			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6876				 dev->name, dev);
6877
6878			WARN_ON(1);
6879			list_del(&dev->unreg_list);
6880			continue;
6881		}
6882		dev->dismantle = true;
6883		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6884	}
6885
6886	/* If device is running, close it first. */
6887	list_for_each_entry(dev, head, unreg_list)
6888		list_add_tail(&dev->close_list, &close_head);
6889	dev_close_many(&close_head, true);
 
 
 
 
 
6890
6891	list_for_each_entry(dev, head, unreg_list) {
6892		/* And unlink it from device chain. */
6893		unlist_netdevice(dev);
6894
6895		dev->reg_state = NETREG_UNREGISTERING;
 
 
 
 
6896	}
6897	flush_all_backlogs();
6898
6899	synchronize_net();
 
 
 
6900
6901	list_for_each_entry(dev, head, unreg_list) {
6902		struct sk_buff *skb = NULL;
6903
6904		/* Shutdown queueing discipline. */
6905		dev_shutdown(dev);
 
 
6906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6907
6908		/* Notify protocols, that we are about to destroy
6909		   this device. They should clean all the things.
6910		*/
6911		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6912
6913		if (!dev->rtnl_link_ops ||
6914		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6915			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6916						     GFP_KERNEL);
 
 
6917
6918		/*
6919		 *	Flush the unicast and multicast chains
6920		 */
6921		dev_uc_flush(dev);
6922		dev_mc_flush(dev);
 
 
 
 
6923
6924		if (dev->netdev_ops->ndo_uninit)
6925			dev->netdev_ops->ndo_uninit(dev);
6926
6927		if (skb)
6928			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 
 
 
 
 
6929
6930		/* Notifier chain MUST detach us all upper devices. */
6931		WARN_ON(netdev_has_any_upper_dev(dev));
6932		WARN_ON(netdev_has_any_lower_dev(dev));
 
 
 
 
 
 
 
 
 
 
 
6933
6934		/* Remove entries from kobject tree */
6935		netdev_unregister_kobject(dev);
6936#ifdef CONFIG_XPS
6937		/* Remove XPS queueing entries */
6938		netif_reset_xps_queues_gt(dev, 0);
6939#endif
6940	}
6941
6942	synchronize_net();
 
 
 
 
 
 
6943
6944	list_for_each_entry(dev, head, unreg_list)
6945		dev_put(dev);
6946}
6947
6948static void rollback_registered(struct net_device *dev)
6949{
6950	LIST_HEAD(single);
 
 
 
 
 
 
 
6951
6952	list_add(&dev->unreg_list, &single);
6953	rollback_registered_many(&single);
6954	list_del(&single);
6955}
6956
6957static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6958	struct net_device *upper, netdev_features_t features)
6959{
6960	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6961	netdev_features_t feature;
6962	int feature_bit;
6963
6964	for_each_netdev_feature(&upper_disables, feature_bit) {
6965		feature = __NETIF_F_BIT(feature_bit);
6966		if (!(upper->wanted_features & feature)
6967		    && (features & feature)) {
6968			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6969				   &feature, upper->name);
6970			features &= ~feature;
6971		}
6972	}
6973
6974	return features;
6975}
6976
6977static void netdev_sync_lower_features(struct net_device *upper,
6978	struct net_device *lower, netdev_features_t features)
6979{
6980	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6981	netdev_features_t feature;
6982	int feature_bit;
6983
6984	for_each_netdev_feature(&upper_disables, feature_bit) {
6985		feature = __NETIF_F_BIT(feature_bit);
6986		if (!(features & feature) && (lower->features & feature)) {
6987			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6988				   &feature, lower->name);
6989			lower->wanted_features &= ~feature;
6990			netdev_update_features(lower);
6991
6992			if (unlikely(lower->features & feature))
6993				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6994					    &feature, lower->name);
 
 
6995		}
6996	}
6997}
6998
6999static netdev_features_t netdev_fix_features(struct net_device *dev,
7000	netdev_features_t features)
7001{
7002	/* Fix illegal checksum combinations */
7003	if ((features & NETIF_F_HW_CSUM) &&
7004	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7005		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7006		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7007	}
7008
7009	/* TSO requires that SG is present as well. */
7010	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7011		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7012		features &= ~NETIF_F_ALL_TSO;
7013	}
7014
7015	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7016					!(features & NETIF_F_IP_CSUM)) {
7017		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7018		features &= ~NETIF_F_TSO;
7019		features &= ~NETIF_F_TSO_ECN;
7020	}
7021
7022	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7023					 !(features & NETIF_F_IPV6_CSUM)) {
7024		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7025		features &= ~NETIF_F_TSO6;
7026	}
7027
7028	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7029	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7030		features &= ~NETIF_F_TSO_MANGLEID;
7031
7032	/* TSO ECN requires that TSO is present as well. */
7033	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7034		features &= ~NETIF_F_TSO_ECN;
7035
7036	/* Software GSO depends on SG. */
7037	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7038		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7039		features &= ~NETIF_F_GSO;
7040	}
7041
7042	/* UFO needs SG and checksumming */
7043	if (features & NETIF_F_UFO) {
7044		/* maybe split UFO into V4 and V6? */
7045		if (!(features & NETIF_F_HW_CSUM) &&
7046		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
7047		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
7048			netdev_dbg(dev,
7049				"Dropping NETIF_F_UFO since no checksum offload features.\n");
7050			features &= ~NETIF_F_UFO;
7051		}
7052
7053		if (!(features & NETIF_F_SG)) {
7054			netdev_dbg(dev,
7055				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
7056			features &= ~NETIF_F_UFO;
7057		}
7058	}
7059
7060	/* GSO partial features require GSO partial be set */
7061	if ((features & dev->gso_partial_features) &&
7062	    !(features & NETIF_F_GSO_PARTIAL)) {
7063		netdev_dbg(dev,
7064			   "Dropping partially supported GSO features since no GSO partial.\n");
7065		features &= ~dev->gso_partial_features;
7066	}
7067
7068#ifdef CONFIG_NET_RX_BUSY_POLL
7069	if (dev->netdev_ops->ndo_busy_poll)
7070		features |= NETIF_F_BUSY_POLL;
7071	else
7072#endif
7073		features &= ~NETIF_F_BUSY_POLL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7074
7075	return features;
7076}
7077
7078int __netdev_update_features(struct net_device *dev)
7079{
7080	struct net_device *upper, *lower;
7081	netdev_features_t features;
7082	struct list_head *iter;
7083	int err = -1;
7084
7085	ASSERT_RTNL();
7086
7087	features = netdev_get_wanted_features(dev);
7088
7089	if (dev->netdev_ops->ndo_fix_features)
7090		features = dev->netdev_ops->ndo_fix_features(dev, features);
7091
7092	/* driver might be less strict about feature dependencies */
7093	features = netdev_fix_features(dev, features);
7094
7095	/* some features can't be enabled if they're off an an upper device */
7096	netdev_for_each_upper_dev_rcu(dev, upper, iter)
7097		features = netdev_sync_upper_features(dev, upper, features);
7098
7099	if (dev->features == features)
7100		goto sync_lower;
7101
7102	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7103		&dev->features, &features);
7104
7105	if (dev->netdev_ops->ndo_set_features)
7106		err = dev->netdev_ops->ndo_set_features(dev, features);
7107	else
7108		err = 0;
7109
7110	if (unlikely(err < 0)) {
7111		netdev_err(dev,
7112			"set_features() failed (%d); wanted %pNF, left %pNF\n",
7113			err, &features, &dev->features);
7114		/* return non-0 since some features might have changed and
7115		 * it's better to fire a spurious notification than miss it
7116		 */
7117		return -1;
7118	}
7119
7120sync_lower:
7121	/* some features must be disabled on lower devices when disabled
7122	 * on an upper device (think: bonding master or bridge)
7123	 */
7124	netdev_for_each_lower_dev(dev, lower, iter)
7125		netdev_sync_lower_features(dev, lower, features);
7126
7127	if (!err)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7128		dev->features = features;
 
7129
7130	return err < 0 ? 0 : 1;
7131}
7132
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and, if the recomputation reports
 *	a change, send the features-changed notification.  Call this after
 *	driver or hardware dependent conditions that influence the
 *	features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7147
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and send the features-changed
 *	notification unconditionally, even if nothing changed.  Use this
 *	instead of netdev_update_features() when dev->vlan_features might
 *	have changed too, so that the change is propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* result ignored on purpose: the notification is unconditional */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7164
7165/**
7166 *	netif_stacked_transfer_operstate -	transfer operstate
7167 *	@rootdev: the root or lower level device to transfer state from
7168 *	@dev: the device to transfer operstate to
7169 *
7170 *	Transfer operational state from root to device. This is normally
7171 *	called when a stacking relationship exists between the root
7172 *	device and the device(a leaf device).
7173 */
7174void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7175					struct net_device *dev)
7176{
7177	if (rootdev->operstate == IF_OPER_DORMANT)
7178		netif_dormant_on(dev);
7179	else
7180		netif_dormant_off(dev);
7181
7182	if (netif_carrier_ok(rootdev)) {
7183		if (!netif_carrier_ok(dev))
7184			netif_carrier_on(dev);
7185	} else {
7186		if (netif_carrier_ok(dev))
7187			netif_carrier_off(dev);
7188	}
 
 
7189}
7190EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7191
#ifdef CONFIG_SYSFS
/* Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * trying kzalloc() first and vzalloc() as fallback, store it in dev->_rx
 * and point every entry back at @dev.
 *
 * Returns 0 on success or -ENOMEM.  A device with zero RX queues is a bug.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	const size_t bytes = nqueues * sizeof(*queues);
	unsigned int idx;

	BUG_ON(nqueues < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
7214
7215static void netdev_init_one_queue(struct net_device *dev,
7216				  struct netdev_queue *queue, void *_unused)
7217{
7218	/* Initialize queue lock */
7219	spin_lock_init(&queue->_xmit_lock);
7220	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7221	queue->xmit_lock_owner = -1;
7222	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7223	queue->dev = dev;
7224#ifdef CONFIG_BQL
7225	dql_init(&queue->dql, HZ);
7226#endif
7227}
7228
7229static void netif_free_tx_queues(struct net_device *dev)
7230{
7231	kvfree(dev->_tx);
7232}
7233
7234static int netif_alloc_netdev_queues(struct net_device *dev)
7235{
7236	unsigned int count = dev->num_tx_queues;
7237	struct netdev_queue *tx;
7238	size_t sz = count * sizeof(*tx);
7239
7240	if (count < 1 || count > 0xffff)
7241		return -EINVAL;
7242
7243	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7244	if (!tx) {
7245		tx = vzalloc(sz);
7246		if (!tx)
7247			return -ENOMEM;
7248	}
7249	dev->_tx = tx;
7250
7251	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7252	spin_lock_init(&dev->tx_global_lock);
7253
7254	return 0;
7255}
7256
7257void netif_tx_stop_all_queues(struct net_device *dev)
7258{
7259	unsigned int i;
7260
7261	for (i = 0; i < dev->num_tx_queues; i++) {
7262		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
7263		netif_tx_stop_queue(txq);
7264	}
7265}
7266EXPORT_SYMBOL(netif_tx_stop_all_queues);
7267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7268/**
7269 *	register_netdevice	- register a network device
7270 *	@dev: device to register
7271 *
7272 *	Take a completed network device structure and add it to the kernel
7273 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7274 *	chain. 0 is returned on success. A negative errno code is returned
7275 *	on a failure to set up the device, or if the name is a duplicate.
7276 *
7277 *	Callers must hold the rtnl semaphore. You may want
7278 *	register_netdev() instead of this.
7279 *
7280 *	BUGS:
7281 *	The locking appears insufficient to guarantee two parallel registers
7282 *	will not get the same name.
7283 */
7284
7285int register_netdevice(struct net_device *dev)
7286{
7287	int ret;
7288	struct net *net = dev_net(dev);
7289
 
 
7290	BUG_ON(dev_boot_phase);
7291	ASSERT_RTNL();
7292
7293	might_sleep();
7294
7295	/* When net_device's are persistent, this will be fatal. */
7296	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7297	BUG_ON(!net);
7298
 
 
 
 
7299	spin_lock_init(&dev->addr_list_lock);
7300	netdev_set_addr_lockdep_class(dev);
7301
7302	ret = dev_get_valid_name(net, dev, dev->name);
7303	if (ret < 0)
7304		goto out;
7305
 
 
 
 
 
7306	/* Init, if this function is available */
7307	if (dev->netdev_ops->ndo_init) {
7308		ret = dev->netdev_ops->ndo_init(dev);
7309		if (ret) {
7310			if (ret > 0)
7311				ret = -EIO;
7312			goto out;
7313		}
7314	}
7315
7316	if (((dev->hw_features | dev->features) &
7317	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7318	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7319	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7320		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7321		ret = -EINVAL;
7322		goto err_uninit;
7323	}
7324
7325	ret = -EBUSY;
7326	if (!dev->ifindex)
7327		dev->ifindex = dev_new_index(net);
7328	else if (__dev_get_by_index(net, dev->ifindex))
7329		goto err_uninit;
7330
 
 
 
 
 
7331	/* Transfer changeable features to wanted_features and enable
7332	 * software offloads (GSO and GRO).
7333	 */
7334	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7335	dev->features |= NETIF_F_SOFT_FEATURES;
 
 
 
 
 
 
7336	dev->wanted_features = dev->features & dev->hw_features;
7337
7338	if (!(dev->flags & IFF_LOOPBACK))
7339		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7340
7341	/* If IPv4 TCP segmentation offload is supported we should also
7342	 * allow the device to enable segmenting the frame with the option
7343	 * of ignoring a static IP ID value.  This doesn't enable the
7344	 * feature itself but allows the user to enable it later.
7345	 */
7346	if (dev->hw_features & NETIF_F_TSO)
7347		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7348	if (dev->vlan_features & NETIF_F_TSO)
7349		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7350	if (dev->mpls_features & NETIF_F_TSO)
7351		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7352	if (dev->hw_enc_features & NETIF_F_TSO)
7353		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7354
7355	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7356	 */
7357	dev->vlan_features |= NETIF_F_HIGHDMA;
7358
7359	/* Make NETIF_F_SG inheritable to tunnel devices.
7360	 */
7361	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7362
7363	/* Make NETIF_F_SG inheritable to MPLS.
7364	 */
7365	dev->mpls_features |= NETIF_F_SG;
7366
7367	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7368	ret = notifier_to_errno(ret);
7369	if (ret)
7370		goto err_uninit;
7371
7372	ret = netdev_register_kobject(dev);
 
 
 
7373	if (ret)
7374		goto err_uninit;
7375	dev->reg_state = NETREG_REGISTERED;
7376
7377	__netdev_update_features(dev);
7378
7379	/*
7380	 *	Default initial state at registry is that the
7381	 *	device is present.
7382	 */
7383
7384	set_bit(__LINK_STATE_PRESENT, &dev->state);
7385
7386	linkwatch_init_dev(dev);
7387
7388	dev_init_scheduler(dev);
7389	dev_hold(dev);
 
7390	list_netdevice(dev);
 
7391	add_device_randomness(dev->dev_addr, dev->addr_len);
7392
7393	/* If the device has permanent device address, driver should
7394	 * set dev_addr and also addr_assign_type should be set to
7395	 * NET_ADDR_PERM (default value).
7396	 */
7397	if (dev->addr_assign_type == NET_ADDR_PERM)
7398		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7399
7400	/* Notify protocols, that a new device appeared. */
7401	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7402	ret = notifier_to_errno(ret);
7403	if (ret) {
7404		rollback_registered(dev);
7405		dev->reg_state = NETREG_UNREGISTERED;
 
 
7406	}
7407	/*
7408	 *	Prevent userspace races by waiting until the network
7409	 *	device is fully setup before sending notifications.
7410	 */
7411	if (!dev->rtnl_link_ops ||
7412	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7413		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7414
7415out:
7416	return ret;
7417
 
 
 
 
 
 
7418err_uninit:
7419	if (dev->netdev_ops->ndo_uninit)
7420		dev->netdev_ops->ndo_uninit(dev);
 
 
 
 
7421	goto out;
7422}
7423EXPORT_SYMBOL(register_netdevice);
7424
7425/**
7426 *	init_dummy_netdev	- init a dummy network device for NAPI
7427 *	@dev: device to init
7428 *
7429 *	This takes a network device structure and initialize the minimum
7430 *	amount of fields so it can be used to schedule NAPI polls without
7431 *	registering a full blown interface. This is to be used by drivers
7432 *	that need to tie several hardware interfaces to a single NAPI
7433 *	poll scheduler due to HW limitations.
7434 */
7435int init_dummy_netdev(struct net_device *dev)
7436{
7437	/* Clear everything. Note we don't initialize spinlocks
7438	 * are they aren't supposed to be taken by any of the
7439	 * NAPI code and this dummy netdev is supposed to be
7440	 * only ever used for NAPI polls
7441	 */
7442	memset(dev, 0, sizeof(struct net_device));
7443
7444	/* make sure we BUG if trying to hit standard
7445	 * register/unregister code path
7446	 */
7447	dev->reg_state = NETREG_DUMMY;
7448
7449	/* NAPI wants this */
7450	INIT_LIST_HEAD(&dev->napi_list);
7451
7452	/* a dummy interface is started by default */
7453	set_bit(__LINK_STATE_PRESENT, &dev->state);
7454	set_bit(__LINK_STATE_START, &dev->state);
7455
 
 
 
7456	/* Note : We dont allocate pcpu_refcnt for dummy devices,
7457	 * because users of this 'device' dont need to change
7458	 * its refcount.
7459	 */
7460
7461	return 0;
7462}
7463EXPORT_SYMBOL_GPL(init_dummy_netdev);
7464
7465
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *	a format string passed as the name to alloc_netdev is expanded
 *	during registration.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
7489
7490int netdev_refcnt_read(const struct net_device *dev)
7491{
 
7492	int i, refcnt = 0;
7493
7494	for_each_possible_cpu(i)
7495		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7496	return refcnt;
 
 
 
7497}
7498EXPORT_SYMBOL(netdev_refcnt_read);
7499
 
 
 
 
7500/**
7501 * netdev_wait_allrefs - wait until all references are gone.
7502 * @dev: target net_device
7503 *
7504 * This is called when unregistering network devices.
7505 *
7506 * Any protocol or device that holds a reference should register
7507 * for netdevice notification, and cleanup and put back the
7508 * reference if they receive an UNREGISTER event.
7509 * We can get stuck here if buggy protocols don't correctly
7510 * call dev_put.
7511 */
7512static void netdev_wait_allrefs(struct net_device *dev)
7513{
7514	unsigned long rebroadcast_time, warning_time;
7515	int refcnt;
 
7516
7517	linkwatch_forget_dev(dev);
7518
7519	rebroadcast_time = warning_time = jiffies;
7520	refcnt = netdev_refcnt_read(dev);
 
7521
7522	while (refcnt != 0) {
7523		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7524			rtnl_lock();
7525
7526			/* Rebroadcast unregister notification */
7527			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
7528
7529			__rtnl_unlock();
7530			rcu_barrier();
7531			rtnl_lock();
7532
7533			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7534			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7535				     &dev->state)) {
7536				/* We must not have linkwatch events
7537				 * pending on unregister. If this
7538				 * happens, we simply run the queue
7539				 * unscheduled, resulting in a noop
7540				 * for this device.
7541				 */
7542				linkwatch_run_queue();
7543			}
 
7544
7545			__rtnl_unlock();
7546
7547			rebroadcast_time = jiffies;
7548		}
7549
7550		msleep(250);
 
 
 
 
 
 
 
7551
7552		refcnt = netdev_refcnt_read(dev);
 
 
 
 
 
 
 
 
 
 
7553
7554		if (time_after(jiffies, warning_time + 10 * HZ)) {
7555			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7556				 dev->name, refcnt);
7557			warning_time = jiffies;
7558		}
7559	}
7560}
7561
7562/* The sequence is:
7563 *
7564 *	rtnl_lock();
7565 *	...
7566 *	register_netdevice(x1);
7567 *	register_netdevice(x2);
7568 *	...
7569 *	unregister_netdevice(y1);
7570 *	unregister_netdevice(y2);
7571 *      ...
7572 *	rtnl_unlock();
7573 *	free_netdev(y1);
7574 *	free_netdev(y2);
7575 *
7576 * We are invoked by rtnl_unlock().
7577 * This allows us to deal with problems:
7578 * 1) We can delete sysfs objects which invoke hotplug
7579 *    without deadlocking with linkwatch via keventd.
7580 * 2) Since we run with the RTNL semaphore not held, we can sleep
7581 *    safely in order to wait for the netdev refcnt to drop to zero.
7582 *
7583 * We must not return until all unregister events added during
7584 * the interval the lock was held have been completed.
7585 */
7586void netdev_run_todo(void)
7587{
 
7588	struct list_head list;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7589
7590	/* Snapshot list, allow later requests */
7591	list_replace_init(&net_todo_list, &list);
7592
7593	__rtnl_unlock();
7594
7595
7596	/* Wait for rcu callbacks to finish before next phase */
7597	if (!list_empty(&list))
7598		rcu_barrier();
7599
7600	while (!list_empty(&list)) {
7601		struct net_device *dev
7602			= list_first_entry(&list, struct net_device, todo_list);
7603		list_del(&dev->todo_list);
7604
7605		rtnl_lock();
7606		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7607		__rtnl_unlock();
7608
7609		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7610			pr_err("network todo '%s' but state %d\n",
7611			       dev->name, dev->reg_state);
7612			dump_stack();
7613			continue;
7614		}
7615
7616		dev->reg_state = NETREG_UNREGISTERED;
 
 
7617
7618		netdev_wait_allrefs(dev);
 
 
 
7619
7620		/* paranoia */
7621		BUG_ON(netdev_refcnt_read(dev));
7622		BUG_ON(!list_empty(&dev->ptype_all));
7623		BUG_ON(!list_empty(&dev->ptype_specific));
7624		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7625		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7626		WARN_ON(dev->dn_ptr);
7627
7628		if (dev->destructor)
7629			dev->destructor(dev);
 
 
 
7630
7631		/* Report a network device has been unregistered */
7632		rtnl_lock();
7633		dev_net(dev)->dev_unreg_count--;
7634		__rtnl_unlock();
7635		wake_up(&netdev_unregistering_wq);
7636
7637		/* Free network device */
7638		kobject_put(&dev->dev.kobj);
7639	}
 
 
7640}
7641
7642/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7643 * all the same fields in the same order as net_device_stats, with only
7644 * the type differing, but rtnl_link_stats64 may have additional fields
7645 * at the end for newer counters.
7646 */
7647void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7648			     const struct net_device_stats *netdev_stats)
7649{
7650#if BITS_PER_LONG == 64
7651	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7652	memcpy(stats64, netdev_stats, sizeof(*stats64));
7653	/* zero out counters that only exist in rtnl_link_stats64 */
7654	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7655	       sizeof(*stats64) - sizeof(*netdev_stats));
7656#else
7657	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7658	const unsigned long *src = (const unsigned long *)netdev_stats;
7659	u64 *dst = (u64 *)stats64;
7660
7661	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7662	for (i = 0; i < n; i++)
7663		dst[i] = src[i];
7664	/* zero out counters that only exist in rtnl_link_stats64 */
7665	memset((char *)stats64 + n * sizeof(u64), 0,
7666	       sizeof(*stats64) - n * sizeof(u64));
7667#endif
7668}
7669EXPORT_SYMBOL(netdev_stats_to_stats64);
7670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7671/**
7672 *	dev_get_stats	- get network device statistics
7673 *	@dev: device to get statistics from
7674 *	@storage: place to store stats
7675 *
7676 *	Get network statistics from device. Return @storage.
7677 *	The device driver may provide its own method by setting
7678 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7679 *	otherwise the internal statistics structure is used.
7680 */
7681struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7682					struct rtnl_link_stats64 *storage)
7683{
7684	const struct net_device_ops *ops = dev->netdev_ops;
 
7685
7686	if (ops->ndo_get_stats64) {
7687		memset(storage, 0, sizeof(*storage));
7688		ops->ndo_get_stats64(dev, storage);
7689	} else if (ops->ndo_get_stats) {
7690		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
 
 
7691	} else {
7692		netdev_stats_to_stats64(storage, &dev->stats);
7693	}
7694	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7695	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7696	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
 
 
 
 
 
 
 
 
 
 
 
 
7697	return storage;
7698}
7699EXPORT_SYMBOL(dev_get_stats);
7700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialized with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.  Without
 * the option the current value (possibly NULL) is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
7718
7719static const struct ethtool_ops default_ethtool_ops;
7720
7721void netdev_set_default_ethtool_ops(struct net_device *dev,
7722				    const struct ethtool_ops *ops)
7723{
7724	if (dev->ethtool_ops == &default_ethtool_ops)
7725		dev->ethtool_ops = ops;
7726}
7727EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7729void netdev_freemem(struct net_device *dev)
7730{
7731	char *addr = (char *)dev - dev->padded;
7732
7733	kvfree(addr);
7734}
7735
7736/**
7737 *	alloc_netdev_mqs - allocate network device
7738 *	@sizeof_priv:		size of private data to allocate space for
7739 *	@name:			device name format string
7740 *	@name_assign_type: 	origin of device name
7741 *	@setup:			callback to initialize device
7742 *	@txqs:			the number of TX subqueues to allocate
7743 *	@rxqs:			the number of RX subqueues to allocate
7744 *
7745 *	Allocates a struct net_device with private data area for driver use
7746 *	and performs basic initialization.  Also allocates subqueue structs
7747 *	for each queue on the device.
7748 */
7749struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7750		unsigned char name_assign_type,
7751		void (*setup)(struct net_device *),
7752		unsigned int txqs, unsigned int rxqs)
7753{
7754	struct net_device *dev;
7755	size_t alloc_size;
7756	struct net_device *p;
7757
7758	BUG_ON(strlen(name) >= sizeof(dev->name));
7759
7760	if (txqs < 1) {
7761		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7762		return NULL;
7763	}
7764
7765#ifdef CONFIG_SYSFS
7766	if (rxqs < 1) {
7767		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7768		return NULL;
7769	}
7770#endif
7771
7772	alloc_size = sizeof(struct net_device);
7773	if (sizeof_priv) {
7774		/* ensure 32-byte alignment of private area */
7775		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7776		alloc_size += sizeof_priv;
7777	}
7778	/* ensure 32-byte alignment of whole construct */
7779	alloc_size += NETDEV_ALIGN - 1;
7780
7781	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7782	if (!p)
7783		p = vzalloc(alloc_size);
7784	if (!p)
7785		return NULL;
7786
7787	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7788	dev->padded = (char *)dev - (char *)p;
7789
 
 
7790	dev->pcpu_refcnt = alloc_percpu(int);
7791	if (!dev->pcpu_refcnt)
7792		goto free_dev;
 
 
 
 
7793
7794	if (dev_addr_init(dev))
7795		goto free_pcpu;
7796
7797	dev_mc_init(dev);
7798	dev_uc_init(dev);
7799
7800	dev_net_set(dev, &init_net);
7801
7802	dev->gso_max_size = GSO_MAX_SIZE;
 
7803	dev->gso_max_segs = GSO_MAX_SEGS;
 
 
 
 
 
 
 
 
 
 
 
7804
7805	INIT_LIST_HEAD(&dev->napi_list);
7806	INIT_LIST_HEAD(&dev->unreg_list);
7807	INIT_LIST_HEAD(&dev->close_list);
7808	INIT_LIST_HEAD(&dev->link_watch_list);
7809	INIT_LIST_HEAD(&dev->adj_list.upper);
7810	INIT_LIST_HEAD(&dev->adj_list.lower);
7811	INIT_LIST_HEAD(&dev->ptype_all);
7812	INIT_LIST_HEAD(&dev->ptype_specific);
 
7813#ifdef CONFIG_NET_SCHED
7814	hash_init(dev->qdisc_hash);
7815#endif
7816	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7817	setup(dev);
7818
7819	if (!dev->tx_queue_len) {
7820		dev->priv_flags |= IFF_NO_QUEUE;
7821		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7822	}
7823
7824	dev->num_tx_queues = txqs;
7825	dev->real_num_tx_queues = txqs;
7826	if (netif_alloc_netdev_queues(dev))
7827		goto free_all;
7828
7829#ifdef CONFIG_SYSFS
7830	dev->num_rx_queues = rxqs;
7831	dev->real_num_rx_queues = rxqs;
7832	if (netif_alloc_rx_queues(dev))
7833		goto free_all;
7834#endif
7835
7836	strcpy(dev->name, name);
7837	dev->name_assign_type = name_assign_type;
7838	dev->group = INIT_NETDEV_GROUP;
7839	if (!dev->ethtool_ops)
7840		dev->ethtool_ops = &default_ethtool_ops;
7841
7842	nf_hook_ingress_init(dev);
7843
7844	return dev;
7845
7846free_all:
7847	free_netdev(dev);
7848	return NULL;
7849
7850free_pcpu:
 
7851	free_percpu(dev->pcpu_refcnt);
7852free_dev:
 
7853	netdev_freemem(dev);
7854	return NULL;
7855}
7856EXPORT_SYMBOL(alloc_netdev_mqs);
7857
7858/**
7859 *	free_netdev - free network device
7860 *	@dev: device
7861 *
7862 *	This function does the last stage of destroying an allocated device
7863 * 	interface. The reference to the device object is released.
7864 *	If this is the last reference then it will be freed.
7865 *	Must be called in process context.
7866 */
7867void free_netdev(struct net_device *dev)
7868{
7869	struct napi_struct *p, *n;
7870
7871	might_sleep();
 
 
 
 
 
 
 
 
 
 
 
7872	netif_free_tx_queues(dev);
7873#ifdef CONFIG_SYSFS
7874	kvfree(dev->_rx);
7875#endif
7876
7877	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7878
7879	/* Flush device addresses */
7880	dev_addr_flush(dev);
7881
7882	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7883		netif_napi_del(p);
7884
 
 
7885	free_percpu(dev->pcpu_refcnt);
7886	dev->pcpu_refcnt = NULL;
 
 
 
 
 
7887
7888	/*  Compatibility with error handling in drivers */
7889	if (dev->reg_state == NETREG_UNINITIALIZED) {
7890		netdev_freemem(dev);
7891		return;
7892	}
7893
7894	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7895	dev->reg_state = NETREG_RELEASED;
7896
7897	/* will free via device release */
7898	put_device(&dev->dev);
7899}
7900EXPORT_SYMBOL(free_netdev);
7901
/**
 *	synchronize_net - wait for in-flight packet receive processing
 *
 *	Waits until packets currently being received have been handled.
 *	Packets that start being received afterwards are not blocked.
 *	An expedited RCU grace period is used when the rtnl lock is held,
 *	a normal one otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7917
7918/**
7919 *	unregister_netdevice_queue - remove device from the kernel
7920 *	@dev: device
7921 *	@head: list
7922 *
7923 *	This function shuts down a device interface and removes it
7924 *	from the kernel tables.
7925 *	If head not NULL, device is queued to be unregistered later.
7926 *
7927 *	Callers must hold the rtnl semaphore.  You may want
7928 *	unregister_netdev() instead of this.
7929 */
7930
7931void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7932{
7933	ASSERT_RTNL();
7934
7935	if (head) {
7936		list_move_tail(&dev->unreg_list, head);
7937	} else {
7938		rollback_registered(dev);
7939		/* Finish processing unregister after unlock */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7940		net_set_todo(dev);
 
7941	}
 
 
 
7942}
7943EXPORT_SYMBOL(unregister_netdevice_queue);
7944
7945/**
7946 *	unregister_netdevice_many - unregister many devices
7947 *	@head: list of devices
7948 *
7949 *  Note: As most callers use a stack allocated list_head,
7950 *  we force a list_del() to make sure stack wont be corrupted later.
7951 */
7952void unregister_netdevice_many(struct list_head *head)
7953{
7954	struct net_device *dev;
7955
7956	if (!list_empty(head)) {
7957		rollback_registered_many(head);
7958		list_for_each_entry(dev, head, unreg_list)
7959			net_set_todo(dev);
7960		list_del(head);
7961	}
7962}
7963EXPORT_SYMBOL(unregister_netdevice_many);
7964
/**
 *	unregister_netdev - remove a device from the kernel
 *	@dev: device to remove
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper around unregister_netdevice() that acquires
 *	and releases the rtnl semaphore itself.  Prefer this over calling
 *	unregister_netdevice() directly unless rtnl is already held.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7983
7984/**
7985 *	dev_change_net_namespace - move device to different nethost namespace
7986 *	@dev: device
7987 *	@net: network namespace
7988 *	@pat: If not NULL name pattern to try if the current device name
7989 *	      is already taken in the destination network namespace.
 
 
7990 *
7991 *	This function shuts down a device interface and moves it
7992 *	to a new network namespace. On success 0 is returned, on
7993 *	a failure a netagive errno code is returned.
7994 *
7995 *	Callers must hold the rtnl semaphore.
7996 */
7997
7998int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 
7999{
8000	int err;
 
 
 
8001
8002	ASSERT_RTNL();
8003
8004	/* Don't allow namespace local devices to be moved. */
8005	err = -EINVAL;
8006	if (dev->features & NETIF_F_NETNS_LOCAL)
8007		goto out;
8008
8009	/* Ensure the device has been registrered */
8010	if (dev->reg_state != NETREG_REGISTERED)
8011		goto out;
8012
8013	/* Get out if there is nothing todo */
8014	err = 0;
8015	if (net_eq(dev_net(dev), net))
8016		goto out;
8017
8018	/* Pick the destination device name, and ensure
8019	 * we can use it in the destination network namespace.
8020	 */
8021	err = -EEXIST;
8022	if (__dev_get_by_name(net, dev->name)) {
8023		/* We get here if we can't use the current device name */
8024		if (!pat)
8025			goto out;
8026		if (dev_get_valid_name(net, dev, pat) < 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8027			goto out;
 
8028	}
8029
8030	/*
8031	 * And now a mini version of register_netdevice unregister_netdevice.
8032	 */
8033
8034	/* If device is running close it first. */
8035	dev_close(dev);
8036
8037	/* And unlink it from device chain */
8038	err = -ENODEV;
8039	unlist_netdevice(dev);
8040
8041	synchronize_net();
8042
8043	/* Shutdown queueing discipline. */
8044	dev_shutdown(dev);
8045
8046	/* Notify protocols, that we are about to destroy
8047	   this device. They should clean all the things.
8048
8049	   Note that dev->reg_state stays at NETREG_REGISTERED.
8050	   This is wanted because this way 8021q and macvlan know
8051	   the device is just moving and can keep their slaves up.
8052	*/
8053	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8054	rcu_barrier();
8055	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8056	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
 
 
 
8057
8058	/*
8059	 *	Flush the unicast and multicast chains
8060	 */
8061	dev_uc_flush(dev);
8062	dev_mc_flush(dev);
8063
8064	/* Send a netdev-removed uevent to the old namespace */
8065	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8066	netdev_adjacent_del_links(dev);
8067
 
 
 
8068	/* Actually switch the network namespace */
8069	dev_net_set(dev, net);
 
 
 
 
8070
8071	/* If there is an ifindex conflict assign a new one */
8072	if (__dev_get_by_index(net, dev->ifindex))
8073		dev->ifindex = dev_new_index(net);
 
 
8074
8075	/* Send a netdev-add uevent to the new namespace */
8076	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8077	netdev_adjacent_add_links(dev);
8078
8079	/* Fixup kobjects */
8080	err = device_rename(&dev->dev, dev->name);
 
 
8081	WARN_ON(err);
8082
8083	/* Add the device back in the hashes */
8084	list_netdevice(dev);
8085
8086	/* Notify protocols, that a new device appeared. */
8087	call_netdevice_notifiers(NETDEV_REGISTER, dev);
8088
8089	/*
8090	 *	Prevent userspace races by waiting until the network
8091	 *	device is fully setup before sending notifications.
8092	 */
8093	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8094
8095	synchronize_net();
8096	err = 0;
8097out:
8098	return err;
8099}
8100EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8101
8102static int dev_cpu_dead(unsigned int oldcpu)
8103{
8104	struct sk_buff **list_skb;
8105	struct sk_buff *skb;
8106	unsigned int cpu;
8107	struct softnet_data *sd, *oldsd;
8108
8109	local_irq_disable();
8110	cpu = smp_processor_id();
8111	sd = &per_cpu(softnet_data, cpu);
8112	oldsd = &per_cpu(softnet_data, oldcpu);
8113
8114	/* Find end of our completion_queue. */
8115	list_skb = &sd->completion_queue;
8116	while (*list_skb)
8117		list_skb = &(*list_skb)->next;
8118	/* Append completion queue from offline CPU. */
8119	*list_skb = oldsd->completion_queue;
8120	oldsd->completion_queue = NULL;
8121
8122	/* Append output queue from offline CPU. */
8123	if (oldsd->output_queue) {
8124		*sd->output_queue_tailp = oldsd->output_queue;
8125		sd->output_queue_tailp = oldsd->output_queue_tailp;
8126		oldsd->output_queue = NULL;
8127		oldsd->output_queue_tailp = &oldsd->output_queue;
8128	}
8129	/* Append NAPI poll list from offline CPU, with one exception :
8130	 * process_backlog() must be called by cpu owning percpu backlog.
8131	 * We properly handle process_queue & input_pkt_queue later.
8132	 */
8133	while (!list_empty(&oldsd->poll_list)) {
8134		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8135							    struct napi_struct,
8136							    poll_list);
8137
8138		list_del_init(&napi->poll_list);
8139		if (napi->poll == process_backlog)
8140			napi->state = 0;
8141		else
8142			____napi_schedule(sd, napi);
8143	}
8144
8145	raise_softirq_irqoff(NET_TX_SOFTIRQ);
8146	local_irq_enable();
8147
 
 
 
 
 
 
 
8148	/* Process offline CPU's input_pkt_queue */
8149	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8150		netif_rx_ni(skb);
8151		input_queue_head_incr(oldsd);
8152	}
8153	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8154		netif_rx_ni(skb);
8155		input_queue_head_incr(oldsd);
8156	}
8157
8158	return 0;
8159}
8160
8161/**
8162 *	netdev_increment_features - increment feature set by one
8163 *	@all: current feature set
8164 *	@one: new feature set
8165 *	@mask: mask feature set
8166 *
8167 *	Computes a new feature set after adding a device with feature set
8168 *	@one to the master device with current feature set @all.  Will not
8169 *	enable anything that is off in @mask. Returns the new feature set.
8170 */
8171netdev_features_t netdev_increment_features(netdev_features_t all,
8172	netdev_features_t one, netdev_features_t mask)
8173{
8174	if (mask & NETIF_F_HW_CSUM)
8175		mask |= NETIF_F_CSUM_MASK;
8176	mask |= NETIF_F_VLAN_CHALLENGED;
8177
8178	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8179	all &= one | ~NETIF_F_ALL_FOR_ALL;
8180
8181	/* If one device supports hw checksumming, set for all. */
8182	if (all & NETIF_F_HW_CSUM)
8183		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8184
8185	return all;
8186}
8187EXPORT_SYMBOL(netdev_increment_features);
8188
8189static struct hlist_head * __net_init netdev_create_hash(void)
8190{
8191	int i;
8192	struct hlist_head *hash;
8193
8194	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8195	if (hash != NULL)
8196		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8197			INIT_HLIST_HEAD(&hash[i]);
8198
8199	return hash;
8200}
8201
8202/* Initialize per network namespace state */
8203static int __net_init netdev_init(struct net *net)
8204{
8205	if (net != &init_net)
8206		INIT_LIST_HEAD(&net->dev_base_head);
 
 
8207
8208	net->dev_name_head = netdev_create_hash();
8209	if (net->dev_name_head == NULL)
8210		goto err_name;
8211
8212	net->dev_index_head = netdev_create_hash();
8213	if (net->dev_index_head == NULL)
8214		goto err_idx;
8215
 
 
 
 
8216	return 0;
8217
8218err_idx:
8219	kfree(net->dev_name_head);
8220err_name:
8221	return -ENOMEM;
8222}
8223
8224/**
8225 *	netdev_drivername - network driver for the device
8226 *	@dev: network device
8227 *
8228 *	Determine network driver for device.
8229 */
8230const char *netdev_drivername(const struct net_device *dev)
8231{
8232	const struct device_driver *driver;
8233	const struct device *parent;
8234	const char *empty = "";
8235
8236	parent = dev->dev.parent;
8237	if (!parent)
8238		return empty;
8239
8240	driver = parent->driver;
8241	if (driver && driver->name)
8242		return driver->name;
8243	return empty;
8244}
8245
8246static void __netdev_printk(const char *level, const struct net_device *dev,
8247			    struct va_format *vaf)
8248{
8249	if (dev && dev->dev.parent) {
8250		dev_printk_emit(level[1] - '0',
8251				dev->dev.parent,
8252				"%s %s %s%s: %pV",
8253				dev_driver_string(dev->dev.parent),
8254				dev_name(dev->dev.parent),
8255				netdev_name(dev), netdev_reg_state(dev),
8256				vaf);
8257	} else if (dev) {
8258		printk("%s%s%s: %pV",
8259		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8260	} else {
8261		printk("%s(NULL net_device): %pV", level, vaf);
8262	}
8263}
8264
8265void netdev_printk(const char *level, const struct net_device *dev,
8266		   const char *format, ...)
8267{
8268	struct va_format vaf;
8269	va_list args;
8270
8271	va_start(args, format);
8272
8273	vaf.fmt = format;
8274	vaf.va = &args;
8275
8276	__netdev_printk(level, dev, &vaf);
8277
8278	va_end(args);
8279}
8280EXPORT_SYMBOL(netdev_printk);
8281
8282#define define_netdev_printk_level(func, level)			\
8283void func(const struct net_device *dev, const char *fmt, ...)	\
8284{								\
8285	struct va_format vaf;					\
8286	va_list args;						\
8287								\
8288	va_start(args, fmt);					\
8289								\
8290	vaf.fmt = fmt;						\
8291	vaf.va = &args;						\
8292								\
8293	__netdev_printk(level, dev, &vaf);			\
8294								\
8295	va_end(args);						\
8296}								\
8297EXPORT_SYMBOL(func);
8298
8299define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8300define_netdev_printk_level(netdev_alert, KERN_ALERT);
8301define_netdev_printk_level(netdev_crit, KERN_CRIT);
8302define_netdev_printk_level(netdev_err, KERN_ERR);
8303define_netdev_printk_level(netdev_warn, KERN_WARNING);
8304define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8305define_netdev_printk_level(netdev_info, KERN_INFO);
8306
8307static void __net_exit netdev_exit(struct net *net)
8308{
8309	kfree(net->dev_name_head);
8310	kfree(net->dev_index_head);
 
 
 
8311}
8312
8313static struct pernet_operations __net_initdata netdev_net_ops = {
8314	.init = netdev_init,
8315	.exit = netdev_exit,
8316};
8317
8318static void __net_exit default_device_exit(struct net *net)
8319{
 
8320	struct net_device *dev, *aux;
8321	/*
8322	 * Push all migratable network devices back to the
8323	 * initial network namespace
8324	 */
8325	rtnl_lock();
8326	for_each_netdev_safe(net, dev, aux) {
8327		int err;
8328		char fb_name[IFNAMSIZ];
8329
8330		/* Ignore unmoveable devices (i.e. loopback) */
8331		if (dev->features & NETIF_F_NETNS_LOCAL)
8332			continue;
8333
8334		/* Leave virtual devices for the generic cleanup */
8335		if (dev->rtnl_link_ops)
8336			continue;
8337
8338		/* Push remaining network devices to init_net */
8339		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 
 
 
 
 
 
 
8340		err = dev_change_net_namespace(dev, &init_net, fb_name);
8341		if (err) {
8342			pr_emerg("%s: failed to move %s to init_net: %d\n",
8343				 __func__, dev->name, err);
8344			BUG();
8345		}
8346	}
8347	rtnl_unlock();
8348}
8349
8350static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8351{
8352	/* Return with the rtnl_lock held when there are no network
8353	 * devices unregistering in any network namespace in net_list.
8354	 */
8355	struct net *net;
8356	bool unregistering;
8357	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8358
8359	add_wait_queue(&netdev_unregistering_wq, &wait);
8360	for (;;) {
8361		unregistering = false;
8362		rtnl_lock();
8363		list_for_each_entry(net, net_list, exit_list) {
8364			if (net->dev_unreg_count > 0) {
8365				unregistering = true;
8366				break;
8367			}
8368		}
8369		if (!unregistering)
8370			break;
8371		__rtnl_unlock();
8372
8373		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8374	}
8375	remove_wait_queue(&netdev_unregistering_wq, &wait);
8376}
8377
8378static void __net_exit default_device_exit_batch(struct list_head *net_list)
8379{
8380	/* At exit all network devices most be removed from a network
8381	 * namespace.  Do this in the reverse order of registration.
8382	 * Do this across as many network namespaces as possible to
8383	 * improve batching efficiency.
8384	 */
8385	struct net_device *dev;
8386	struct net *net;
8387	LIST_HEAD(dev_kill_list);
8388
8389	/* To prevent network device cleanup code from dereferencing
8390	 * loopback devices or network devices that have been freed
8391	 * wait here for all pending unregistrations to complete,
8392	 * before unregistring the loopback device and allowing the
8393	 * network namespace be freed.
8394	 *
8395	 * The netdev todo list containing all network devices
8396	 * unregistrations that happen in default_device_exit_batch
8397	 * will run in the rtnl_unlock() at the end of
8398	 * default_device_exit_batch.
8399	 */
8400	rtnl_lock_unregistering(net_list);
8401	list_for_each_entry(net, net_list, exit_list) {
8402		for_each_netdev_reverse(net, dev) {
8403			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8404				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8405			else
8406				unregister_netdevice_queue(dev, &dev_kill_list);
8407		}
8408	}
8409	unregister_netdevice_many(&dev_kill_list);
8410	rtnl_unlock();
8411}
8412
8413static struct pernet_operations __net_initdata default_device_ops = {
8414	.exit = default_device_exit,
8415	.exit_batch = default_device_exit_batch,
8416};
8417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8418/*
8419 *	Initialize the DEV module. At boot time this walks the device list and
8420 *	unhooks any devices that fail to initialise (normally hardware not
8421 *	present) and leaves us with a valid list of present and active devices.
8422 *
8423 */
8424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8425/*
8426 *       This is called single threaded during boot, so no need
8427 *       to take the rtnl semaphore.
8428 */
8429static int __init net_dev_init(void)
8430{
8431	int i, rc = -ENOMEM;
8432
8433	BUG_ON(!dev_boot_phase);
8434
 
 
8435	if (dev_proc_init())
8436		goto out;
8437
8438	if (netdev_kobject_init())
8439		goto out;
8440
8441	INIT_LIST_HEAD(&ptype_all);
8442	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8443		INIT_LIST_HEAD(&ptype_base[i]);
8444
8445	INIT_LIST_HEAD(&offload_base);
8446
8447	if (register_pernet_subsys(&netdev_net_ops))
8448		goto out;
8449
8450	/*
8451	 *	Initialise the packet receive queues.
8452	 */
8453
8454	for_each_possible_cpu(i) {
8455		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8456		struct softnet_data *sd = &per_cpu(softnet_data, i);
8457
8458		INIT_WORK(flush, flush_backlog);
8459
8460		skb_queue_head_init(&sd->input_pkt_queue);
8461		skb_queue_head_init(&sd->process_queue);
 
 
 
8462		INIT_LIST_HEAD(&sd->poll_list);
8463		sd->output_queue_tailp = &sd->output_queue;
8464#ifdef CONFIG_RPS
8465		sd->csd.func = rps_trigger_softirq;
8466		sd->csd.info = sd;
8467		sd->cpu = i;
8468#endif
 
 
8469
 
8470		sd->backlog.poll = process_backlog;
8471		sd->backlog.weight = weight_p;
 
 
 
8472	}
8473
8474	dev_boot_phase = 0;
8475
8476	/* The loopback device is special if any other network devices
8477	 * is present in a network namespace the loopback device must
8478	 * be present. Since we now dynamically allocate and free the
8479	 * loopback device ensure this invariant is maintained by
8480	 * keeping the loopback device as the first device on the
8481	 * list of network devices.  Ensuring the loopback devices
8482	 * is the first device that appears and the last network device
8483	 * that disappears.
8484	 */
8485	if (register_pernet_device(&loopback_net_ops))
8486		goto out;
8487
8488	if (register_pernet_device(&default_device_ops))
8489		goto out;
8490
8491	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8492	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8493
8494	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8495				       NULL, dev_cpu_dead);
8496	WARN_ON(rc < 0);
8497	dst_subsys_init();
8498	rc = 0;
8499out:
 
 
 
 
 
 
 
 
 
 
 
 
 
8500	return rc;
8501}
8502
8503subsys_initcall(net_dev_init);