Linux Audio

Check our new training course

Loading...
v6.13.7
    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
 
 
 
 
 
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitmap.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/isolation.h>
   81#include <linux/sched/mm.h>
   82#include <linux/smpboot.h>
   83#include <linux/mutex.h>
   84#include <linux/rwsem.h>
   85#include <linux/string.h>
   86#include <linux/mm.h>
   87#include <linux/socket.h>
   88#include <linux/sockios.h>
   89#include <linux/errno.h>
   90#include <linux/interrupt.h>
   91#include <linux/if_ether.h>
   92#include <linux/netdevice.h>
   93#include <linux/etherdevice.h>
   94#include <linux/ethtool.h>
 
   95#include <linux/skbuff.h>
   96#include <linux/kthread.h>
   97#include <linux/bpf.h>
   98#include <linux/bpf_trace.h>
   99#include <net/net_namespace.h>
  100#include <net/sock.h>
  101#include <net/busy_poll.h>
  102#include <linux/rtnetlink.h>
  103#include <linux/stat.h>
  104#include <net/dsa.h>
  105#include <net/dst.h>
  106#include <net/dst_metadata.h>
  107#include <net/gro.h>
  108#include <net/pkt_sched.h>
  109#include <net/pkt_cls.h>
  110#include <net/checksum.h>
  111#include <net/xfrm.h>
  112#include <net/tcx.h>
  113#include <linux/highmem.h>
  114#include <linux/init.h>
  115#include <linux/module.h>
  116#include <linux/netpoll.h>
  117#include <linux/rcupdate.h>
  118#include <linux/delay.h>
  119#include <net/iw_handler.h>
  120#include <asm/current.h>
  121#include <linux/audit.h>
  122#include <linux/dmaengine.h>
  123#include <linux/err.h>
  124#include <linux/ctype.h>
  125#include <linux/if_arp.h>
  126#include <linux/if_vlan.h>
  127#include <linux/ip.h>
  128#include <net/ip.h>
  129#include <net/mpls.h>
  130#include <linux/ipv6.h>
  131#include <linux/in.h>
  132#include <linux/jhash.h>
  133#include <linux/random.h>
  134#include <trace/events/napi.h>
  135#include <trace/events/net.h>
  136#include <trace/events/skb.h>
  137#include <trace/events/qdisc.h>
  138#include <trace/events/xdp.h>
  139#include <linux/inetdevice.h>
  140#include <linux/cpu_rmap.h>
  141#include <linux/static_key.h>
  142#include <linux/hashtable.h>
  143#include <linux/vmalloc.h>
  144#include <linux/if_macvlan.h>
  145#include <linux/errqueue.h>
  146#include <linux/hrtimer.h>
  147#include <linux/netfilter_netdev.h>
  148#include <linux/crash_dump.h>
  149#include <linux/sctp.h>
  150#include <net/udp_tunnel.h>
  151#include <linux/net_namespace.h>
  152#include <linux/indirect_call_wrapper.h>
  153#include <net/devlink.h>
  154#include <linux/pm_runtime.h>
  155#include <linux/prandom.h>
  156#include <linux/once_lite.h>
  157#include <net/netdev_rx_queue.h>
  158#include <net/page_pool/types.h>
  159#include <net/page_pool/helpers.h>
  160#include <net/rps.h>
  161#include <linux/phy_link_topology.h>
  162
  163#include "dev.h"
  164#include "devmem.h"
  165#include "net-sysfs.h"
  166
 
 
 
 
 
 
  167static DEFINE_SPINLOCK(ptype_lock);
 
  168struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 
 
  169
  170static int netif_rx_internal(struct sk_buff *skb);
  171static int call_netdevice_notifiers_extack(unsigned long val,
  172					   struct net_device *dev,
  173					   struct netlink_ext_ack *extack);
  174
  175static DEFINE_MUTEX(ifalias_mutex);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  176
  177/* protects napi_hash addition/deletion and napi_gen_id */
  178static DEFINE_SPINLOCK(napi_hash_lock);
  179
  180static unsigned int napi_gen_id = NR_CPUS;
  181static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  182
  183static DECLARE_RWSEM(devnet_rename_sem);
  184
  185static inline void dev_base_seq_inc(struct net *net)
  186{
  187	unsigned int val = net->dev_base_seq + 1;
  188
  189	WRITE_ONCE(net->dev_base_seq, val ?: 1);
  190}
  191
  192static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  193{
  194	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  195
  196	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  197}
  198
  199static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  200{
  201	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  202}
  203
  204#ifndef CONFIG_PREEMPT_RT
  205
  206static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
  207
  208static int __init setup_backlog_napi_threads(char *arg)
  209{
  210	static_branch_enable(&use_backlog_threads_key);
  211	return 0;
  212}
  213early_param("thread_backlog_napi", setup_backlog_napi_threads);
  214
  215static bool use_backlog_threads(void)
  216{
  217	return static_branch_unlikely(&use_backlog_threads_key);
  218}
  219
  220#else
  221
  222static bool use_backlog_threads(void)
  223{
  224	return true;
  225}
  226
  227#endif
  228
  229static inline void backlog_lock_irq_save(struct softnet_data *sd,
  230					 unsigned long *flags)
  231{
  232	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
  233		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
  234	else
  235		local_irq_save(*flags);
  236}
  237
  238static inline void backlog_lock_irq_disable(struct softnet_data *sd)
  239{
  240	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
  241		spin_lock_irq(&sd->input_pkt_queue.lock);
  242	else
  243		local_irq_disable();
  244}
  245
  246static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
  247					      unsigned long *flags)
  248{
  249	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
  250		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
  251	else
  252		local_irq_restore(*flags);
  253}
  254
  255static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
  256{
  257	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
  258		spin_unlock_irq(&sd->input_pkt_queue.lock);
  259	else
  260		local_irq_enable();
  261}
  262
  263static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  264						       const char *name)
  265{
  266	struct netdev_name_node *name_node;
  267
  268	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  269	if (!name_node)
  270		return NULL;
  271	INIT_HLIST_NODE(&name_node->hlist);
  272	name_node->dev = dev;
  273	name_node->name = name;
  274	return name_node;
  275}
  276
  277static struct netdev_name_node *
  278netdev_name_node_head_alloc(struct net_device *dev)
  279{
  280	struct netdev_name_node *name_node;
  281
  282	name_node = netdev_name_node_alloc(dev, dev->name);
  283	if (!name_node)
  284		return NULL;
  285	INIT_LIST_HEAD(&name_node->list);
  286	return name_node;
  287}
  288
  289static void netdev_name_node_free(struct netdev_name_node *name_node)
  290{
  291	kfree(name_node);
  292}
  293
  294static void netdev_name_node_add(struct net *net,
  295				 struct netdev_name_node *name_node)
  296{
  297	hlist_add_head_rcu(&name_node->hlist,
  298			   dev_name_hash(net, name_node->name));
  299}
  300
  301static void netdev_name_node_del(struct netdev_name_node *name_node)
  302{
  303	hlist_del_rcu(&name_node->hlist);
  304}
  305
  306static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  307							const char *name)
  308{
  309	struct hlist_head *head = dev_name_hash(net, name);
  310	struct netdev_name_node *name_node;
  311
  312	hlist_for_each_entry(name_node, head, hlist)
  313		if (!strcmp(name_node->name, name))
  314			return name_node;
  315	return NULL;
  316}
  317
  318static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  319							    const char *name)
  320{
  321	struct hlist_head *head = dev_name_hash(net, name);
  322	struct netdev_name_node *name_node;
  323
  324	hlist_for_each_entry_rcu(name_node, head, hlist)
  325		if (!strcmp(name_node->name, name))
  326			return name_node;
  327	return NULL;
  328}
  329
  330bool netdev_name_in_use(struct net *net, const char *name)
  331{
  332	return netdev_name_node_lookup(net, name);
  333}
  334EXPORT_SYMBOL(netdev_name_in_use);
  335
  336int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  337{
  338	struct netdev_name_node *name_node;
  339	struct net *net = dev_net(dev);
  340
  341	name_node = netdev_name_node_lookup(net, name);
  342	if (name_node)
  343		return -EEXIST;
  344	name_node = netdev_name_node_alloc(dev, name);
  345	if (!name_node)
  346		return -ENOMEM;
  347	netdev_name_node_add(net, name_node);
  348	/* The node that holds dev->name acts as a head of per-device list. */
  349	list_add_tail_rcu(&name_node->list, &dev->name_node->list);
  350
  351	return 0;
  352}
  353
  354static void netdev_name_node_alt_free(struct rcu_head *head)
  355{
  356	struct netdev_name_node *name_node =
  357		container_of(head, struct netdev_name_node, rcu);
  358
  359	kfree(name_node->name);
  360	netdev_name_node_free(name_node);
  361}
  362
  363static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  364{
  365	netdev_name_node_del(name_node);
  366	list_del(&name_node->list);
  367	call_rcu(&name_node->rcu, netdev_name_node_alt_free);
  368}
  369
  370int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  371{
  372	struct netdev_name_node *name_node;
  373	struct net *net = dev_net(dev);
  374
  375	name_node = netdev_name_node_lookup(net, name);
  376	if (!name_node)
  377		return -ENOENT;
  378	/* lookup might have found our primary name or a name belonging
  379	 * to another device.
  380	 */
  381	if (name_node == dev->name_node || name_node->dev != dev)
  382		return -EINVAL;
  383
  384	__netdev_name_node_alt_destroy(name_node);
  385	return 0;
  386}
  387
  388static void netdev_name_node_alt_flush(struct net_device *dev)
  389{
  390	struct netdev_name_node *name_node, *tmp;
  391
  392	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
  393		list_del(&name_node->list);
  394		netdev_name_node_alt_free(&name_node->rcu);
  395	}
  396}
  397
  398/* Device list insertion */
  399static void list_netdevice(struct net_device *dev)
  400{
  401	struct netdev_name_node *name_node;
  402	struct net *net = dev_net(dev);
  403
  404	ASSERT_RTNL();
  405
 
  406	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  407	netdev_name_node_add(net, dev->name_node);
  408	hlist_add_head_rcu(&dev->index_hlist,
  409			   dev_index_hash(net, dev->ifindex));
  410
  411	netdev_for_each_altname(dev, name_node)
  412		netdev_name_node_add(net, name_node);
  413
  414	/* We reserved the ifindex, this can't fail */
  415	WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
  416
  417	dev_base_seq_inc(net);
  418}
  419
  420/* Device list removal
  421 * caller must respect a RCU grace period before freeing/reusing dev
  422 */
  423static void unlist_netdevice(struct net_device *dev)
  424{
  425	struct netdev_name_node *name_node;
  426	struct net *net = dev_net(dev);
  427
  428	ASSERT_RTNL();
  429
  430	xa_erase(&net->dev_by_index, dev->ifindex);
  431
  432	netdev_for_each_altname(dev, name_node)
  433		netdev_name_node_del(name_node);
  434
  435	/* Unlink dev from the device chain */
 
  436	list_del_rcu(&dev->dev_list);
  437	netdev_name_node_del(dev->name_node);
  438	hlist_del_rcu(&dev->index_hlist);
 
  439
  440	dev_base_seq_inc(dev_net(dev));
  441}
  442
  443/*
  444 *	Our notifier list
  445 */
  446
  447static RAW_NOTIFIER_HEAD(netdev_chain);
  448
  449/*
  450 *	Device drivers call our routines to queue packets here. We empty the
  451 *	queue in the local softnet handler.
  452 */
  453
  454DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
  455	.process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
  456};
  457EXPORT_PER_CPU_SYMBOL(softnet_data);
  458
  459/* Page_pool has a lockless array/stack to alloc/recycle pages.
  460 * PP consumers must pay attention to run APIs in the appropriate context
  461 * (e.g. NAPI context).
  462 */
  463static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
  464
  465#ifdef CONFIG_LOCKDEP
  466/*
  467 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  468 * according to dev->type
  469 */
  470static const unsigned short netdev_lock_type[] = {
  471	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  472	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  473	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  474	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  475	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  476	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  477	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  478	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  479	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  480	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  481	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  482	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  483	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  484	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  485	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  486
  487static const char *const netdev_lock_name[] = {
  488	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  489	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  490	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  491	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  492	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  493	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  494	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  495	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  496	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  497	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  498	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  499	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  500	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  501	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  502	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  503
  504static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  505static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  506
  507static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  508{
  509	int i;
  510
  511	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  512		if (netdev_lock_type[i] == dev_type)
  513			return i;
  514	/* the last key is used by default */
  515	return ARRAY_SIZE(netdev_lock_type) - 1;
  516}
  517
  518static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  519						 unsigned short dev_type)
  520{
  521	int i;
  522
  523	i = netdev_lock_pos(dev_type);
  524	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  525				   netdev_lock_name[i]);
  526}
  527
  528static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  529{
  530	int i;
  531
  532	i = netdev_lock_pos(dev->type);
  533	lockdep_set_class_and_name(&dev->addr_list_lock,
  534				   &netdev_addr_lock_key[i],
  535				   netdev_lock_name[i]);
  536}
  537#else
  538static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  539						 unsigned short dev_type)
  540{
  541}
  542
  543static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  544{
  545}
  546#endif
  547
  548/*******************************************************************************
  549 *
  550 *		Protocol management and registration routines
  551 *
  552 *******************************************************************************/
  553
 
 
 
  554
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in the hash buckets, and checking of protocol handlers
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets is
 *	first on the list, it cannot tell that the packet
 *	is cloned and should be copied-on-write, so it will
 *	modify it and subsequent readers will get a broken packet.
 *							--ANK (980803)
 */
  570
  571static inline struct list_head *ptype_head(const struct packet_type *pt)
  572{
  573	if (pt->type == htons(ETH_P_ALL))
  574		return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
  575	else
  576		return pt->dev ? &pt->dev->ptype_specific :
  577				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  578}
  579
  580/**
  581 *	dev_add_pack - add packet handler
  582 *	@pt: packet type declaration
  583 *
  584 *	Add a protocol handler to the networking stack. The passed &packet_type
  585 *	is linked into kernel lists and may not be freed until it has been
  586 *	removed from the kernel lists.
  587 *
  588 *	This call does not sleep therefore it can not
  589 *	guarantee all CPU's that are in middle of receiving packets
  590 *	will see the new packet type (until the next received packet).
  591 */
  592
  593void dev_add_pack(struct packet_type *pt)
  594{
  595	struct list_head *head = ptype_head(pt);
  596
  597	spin_lock(&ptype_lock);
  598	list_add_rcu(&pt->list, head);
  599	spin_unlock(&ptype_lock);
  600}
  601EXPORT_SYMBOL(dev_add_pack);
  602
  603/**
  604 *	__dev_remove_pack	 - remove packet handler
  605 *	@pt: packet type declaration
  606 *
  607 *	Remove a protocol handler that was previously added to the kernel
  608 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  609 *	from the kernel lists and can be freed or reused once this function
  610 *	returns.
  611 *
  612 *      The packet type might still be in use by receivers
  613 *	and must not be freed until after all the CPU's have gone
  614 *	through a quiescent state.
  615 */
  616void __dev_remove_pack(struct packet_type *pt)
  617{
  618	struct list_head *head = ptype_head(pt);
  619	struct packet_type *pt1;
  620
  621	spin_lock(&ptype_lock);
  622
  623	list_for_each_entry(pt1, head, list) {
  624		if (pt == pt1) {
  625			list_del_rcu(&pt->list);
  626			goto out;
  627		}
  628	}
  629
  630	pr_warn("dev_remove_pack: %p not found\n", pt);
  631out:
  632	spin_unlock(&ptype_lock);
  633}
  634EXPORT_SYMBOL(__dev_remove_pack);
  635
  636/**
  637 *	dev_remove_pack	 - remove packet handler
  638 *	@pt: packet type declaration
  639 *
  640 *	Remove a protocol handler that was previously added to the kernel
  641 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  642 *	from the kernel lists and can be freed or reused once this function
  643 *	returns.
  644 *
  645 *	This call sleeps to guarantee that no CPU is looking at the packet
  646 *	type after return.
  647 */
  648void dev_remove_pack(struct packet_type *pt)
  649{
  650	__dev_remove_pack(pt);
  651
  652	synchronize_net();
  653}
  654EXPORT_SYMBOL(dev_remove_pack);
  655
  656
  657/*******************************************************************************
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  658 *
  659 *			    Device Interface Subroutines
 
 
 
  660 *
  661 *******************************************************************************/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  662
  663/**
  664 *	dev_get_iflink	- get 'iflink' value of a interface
  665 *	@dev: targeted interface
  666 *
  667 *	Indicates the ifindex the interface is linked to.
  668 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  669 */
  670
  671int dev_get_iflink(const struct net_device *dev)
  672{
  673	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  674		return dev->netdev_ops->ndo_get_iflink(dev);
  675
  676	return READ_ONCE(dev->ifindex);
  677}
  678EXPORT_SYMBOL(dev_get_iflink);
  679
  680/**
  681 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  682 *	@dev: targeted interface
  683 *	@skb: The packet.
  684 *
  685 *	For better visibility of tunnel traffic OVS needs to retrieve
  686 *	egress tunnel information for a packet. Following API allows
  687 *	user to get this info.
  688 */
  689int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  690{
  691	struct ip_tunnel_info *info;
  692
  693	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  694		return -EINVAL;
  695
  696	info = skb_tunnel_info_unclone(skb);
  697	if (!info)
  698		return -ENOMEM;
  699	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  700		return -EINVAL;
  701
  702	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  703}
  704EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  705
  706static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
  707{
  708	int k = stack->num_paths++;
  709
  710	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
  711		return NULL;
  712
  713	return &stack->path[k];
  714}
  715
  716int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
  717			  struct net_device_path_stack *stack)
  718{
  719	const struct net_device *last_dev;
  720	struct net_device_path_ctx ctx = {
  721		.dev	= dev,
  722	};
  723	struct net_device_path *path;
  724	int ret = 0;
  725
  726	memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
  727	stack->num_paths = 0;
  728	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
  729		last_dev = ctx.dev;
  730		path = dev_fwd_path(stack);
  731		if (!path)
  732			return -1;
  733
  734		memset(path, 0, sizeof(struct net_device_path));
  735		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
  736		if (ret < 0)
  737			return -1;
  738
  739		if (WARN_ON_ONCE(last_dev == ctx.dev))
  740			return -1;
  741	}
  742
  743	if (!ctx.dev)
  744		return ret;
  745
  746	path = dev_fwd_path(stack);
  747	if (!path)
  748		return -1;
  749	path->type = DEV_PATH_ETHERNET;
  750	path->dev = ctx.dev;
  751
  752	return ret;
  753}
  754EXPORT_SYMBOL_GPL(dev_fill_forward_path);
  755
  756/* must be called under rcu_read_lock(), as we dont take a reference */
  757static struct napi_struct *napi_by_id(unsigned int napi_id)
  758{
  759	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
  760	struct napi_struct *napi;
  761
  762	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
  763		if (napi->napi_id == napi_id)
  764			return napi;
  765
  766	return NULL;
  767}
  768
  769/* must be called under rcu_read_lock(), as we dont take a reference */
  770struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id)
  771{
  772	struct napi_struct *napi;
  773
  774	napi = napi_by_id(napi_id);
  775	if (!napi)
  776		return NULL;
  777
  778	if (WARN_ON_ONCE(!napi->dev))
  779		return NULL;
  780	if (!net_eq(net, dev_net(napi->dev)))
  781		return NULL;
  782
  783	return napi;
  784}
  785
  786/**
  787 *	__dev_get_by_name	- find a device by its name
  788 *	@net: the applicable net namespace
  789 *	@name: name to find
  790 *
  791 *	Find an interface by name. Must be called under RTNL semaphore.
  792 *	If the name is found a pointer to the device is returned.
  793 *	If the name is not found then %NULL is returned. The
  794 *	reference counters are not incremented so the caller must be
  795 *	careful with locks.
  796 */
  797
  798struct net_device *__dev_get_by_name(struct net *net, const char *name)
  799{
  800	struct netdev_name_node *node_name;
 
 
 
 
 
  801
  802	node_name = netdev_name_node_lookup(net, name);
  803	return node_name ? node_name->dev : NULL;
  804}
  805EXPORT_SYMBOL(__dev_get_by_name);
  806
  807/**
  808 * dev_get_by_name_rcu	- find a device by its name
  809 * @net: the applicable net namespace
  810 * @name: name to find
  811 *
  812 * Find an interface by name.
  813 * If the name is found a pointer to the device is returned.
  814 * If the name is not found then %NULL is returned.
  815 * The reference counters are not incremented so the caller must be
  816 * careful with locks. The caller must hold RCU lock.
  817 */
  818
  819struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  820{
  821	struct netdev_name_node *node_name;
  822
  823	node_name = netdev_name_node_lookup_rcu(net, name);
  824	return node_name ? node_name->dev : NULL;
  825}
  826EXPORT_SYMBOL(dev_get_by_name_rcu);
  827
  828/* Deprecated for new users, call netdev_get_by_name() instead */
  829struct net_device *dev_get_by_name(struct net *net, const char *name)
  830{
  831	struct net_device *dev;
 
  832
  833	rcu_read_lock();
  834	dev = dev_get_by_name_rcu(net, name);
  835	dev_hold(dev);
  836	rcu_read_unlock();
  837	return dev;
  838}
  839EXPORT_SYMBOL(dev_get_by_name);
  840
  841/**
  842 *	netdev_get_by_name() - find a device by its name
  843 *	@net: the applicable net namespace
  844 *	@name: name to find
  845 *	@tracker: tracking object for the acquired reference
  846 *	@gfp: allocation flags for the tracker
  847 *
  848 *	Find an interface by name. This can be called from any
  849 *	context and does its own locking. The returned handle has
  850 *	the usage count incremented and the caller must use netdev_put() to
  851 *	release it when it is no longer needed. %NULL is returned if no
  852 *	matching device is found.
  853 */
  854struct net_device *netdev_get_by_name(struct net *net, const char *name,
  855				      netdevice_tracker *tracker, gfp_t gfp)
  856{
  857	struct net_device *dev;
  858
  859	dev = dev_get_by_name(net, name);
 
  860	if (dev)
  861		netdev_tracker_alloc(dev, tracker, gfp);
 
  862	return dev;
  863}
  864EXPORT_SYMBOL(netdev_get_by_name);
  865
  866/**
  867 *	__dev_get_by_index - find a device by its ifindex
  868 *	@net: the applicable net namespace
  869 *	@ifindex: index of device
  870 *
  871 *	Search for an interface by index. Returns %NULL if the device
  872 *	is not found or a pointer to the device. The device has not
  873 *	had its reference counter increased so the caller must be careful
  874 *	about locking. The caller must hold the RTNL semaphore.
 
  875 */
  876
  877struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  878{
  879	struct net_device *dev;
  880	struct hlist_head *head = dev_index_hash(net, ifindex);
  881
  882	hlist_for_each_entry(dev, head, index_hlist)
  883		if (dev->ifindex == ifindex)
  884			return dev;
  885
  886	return NULL;
  887}
  888EXPORT_SYMBOL(__dev_get_by_index);
  889
  890/**
  891 *	dev_get_by_index_rcu - find a device by its ifindex
  892 *	@net: the applicable net namespace
  893 *	@ifindex: index of device
  894 *
  895 *	Search for an interface by index. Returns %NULL if the device
  896 *	is not found or a pointer to the device. The device has not
  897 *	had its reference counter increased so the caller must be careful
  898 *	about locking. The caller must hold RCU lock.
  899 */
  900
  901struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  902{
  903	struct net_device *dev;
  904	struct hlist_head *head = dev_index_hash(net, ifindex);
  905
  906	hlist_for_each_entry_rcu(dev, head, index_hlist)
  907		if (dev->ifindex == ifindex)
  908			return dev;
  909
  910	return NULL;
  911}
  912EXPORT_SYMBOL(dev_get_by_index_rcu);
  913
  914/* Deprecated for new users, call netdev_get_by_index() instead */
  915struct net_device *dev_get_by_index(struct net *net, int ifindex)
  916{
  917	struct net_device *dev;
  918
  919	rcu_read_lock();
  920	dev = dev_get_by_index_rcu(net, ifindex);
  921	dev_hold(dev);
  922	rcu_read_unlock();
  923	return dev;
  924}
  925EXPORT_SYMBOL(dev_get_by_index);
  926
  927/**
  928 *	netdev_get_by_index() - find a device by its ifindex
  929 *	@net: the applicable net namespace
  930 *	@ifindex: index of device
  931 *	@tracker: tracking object for the acquired reference
  932 *	@gfp: allocation flags for the tracker
  933 *
  934 *	Search for an interface by index. Returns NULL if the device
  935 *	is not found or a pointer to the device. The device returned has
  936 *	had a reference added and the pointer is safe until the user calls
  937 *	netdev_put() to indicate they have finished with it.
  938 */
  939struct net_device *netdev_get_by_index(struct net *net, int ifindex,
  940				       netdevice_tracker *tracker, gfp_t gfp)
  941{
  942	struct net_device *dev;
  943
  944	dev = dev_get_by_index(net, ifindex);
 
  945	if (dev)
  946		netdev_tracker_alloc(dev, tracker, gfp);
 
  947	return dev;
  948}
  949EXPORT_SYMBOL(netdev_get_by_index);
  950
  951/**
  952 *	dev_get_by_napi_id - find a device by napi_id
  953 *	@napi_id: ID of the NAPI struct
  954 *
  955 *	Search for an interface by NAPI ID. Returns %NULL if the device
  956 *	is not found or a pointer to the device. The device has not had
  957 *	its reference counter increased so the caller must be careful
  958 *	about locking. The caller must hold RCU lock.
  959 */
  960
  961struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  962{
  963	struct napi_struct *napi;
  964
  965	WARN_ON_ONCE(!rcu_read_lock_held());
  966
  967	if (napi_id < MIN_NAPI_ID)
  968		return NULL;
  969
  970	napi = napi_by_id(napi_id);
  971
  972	return napi ? napi->dev : NULL;
  973}
  974EXPORT_SYMBOL(dev_get_by_napi_id);
  975
  976static DEFINE_SEQLOCK(netdev_rename_lock);
  977
  978void netdev_copy_name(struct net_device *dev, char *name)
  979{
  980	unsigned int seq;
  981
  982	do {
  983		seq = read_seqbegin(&netdev_rename_lock);
  984		strscpy(name, dev->name, IFNAMSIZ);
  985	} while (read_seqretry(&netdev_rename_lock, seq));
  986}
  987
  988/**
  989 *	netdev_get_name - get a netdevice name, knowing its ifindex.
  990 *	@net: network namespace
  991 *	@name: a pointer to the buffer where the name will be stored.
  992 *	@ifindex: the ifindex of the interface to get the name from.
 
 
 
 
  993 */
  994int netdev_get_name(struct net *net, char *name, int ifindex)
  995{
  996	struct net_device *dev;
  997	int ret;
  998
 
 
  999	rcu_read_lock();
 1000
 1001	dev = dev_get_by_index_rcu(net, ifindex);
 1002	if (!dev) {
 1003		ret = -ENODEV;
 1004		goto out;
 1005	}
 1006
 1007	netdev_copy_name(dev, name);
 1008
 1009	ret = 0;
 1010out:
 1011	rcu_read_unlock();
 1012	return ret;
 1013}
 
 
 1014
 1015static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
 1016			 const char *ha)
 1017{
 1018	return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
 1019}
 1020
 1021/**
 1022 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 1023 *	@net: the applicable net namespace
 1024 *	@type: media type of device
 1025 *	@ha: hardware address
 1026 *
 1027 *	Search for an interface by MAC address. Returns NULL if the device
 1028 *	is not found or a pointer to the device.
 1029 *	The caller must hold RCU.
 1030 *	The returned device has not had its ref count increased
 1031 *	and the caller must therefore be careful about locking
 1032 *
 1033 */
 1034
 1035struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 1036				       const char *ha)
 1037{
 1038	struct net_device *dev;
 1039
 1040	for_each_netdev_rcu(net, dev)
 1041		if (dev_addr_cmp(dev, type, ha))
 
 1042			return dev;
 1043
 1044	return NULL;
 1045}
 1046EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 1047
 1048/**
 1049 * dev_getbyhwaddr() - find a device by its hardware address
 1050 * @net: the applicable net namespace
 1051 * @type: media type of device
 1052 * @ha: hardware address
 1053 *
 1054 * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
 1055 * rtnl_lock.
 1056 *
 1057 * Context: rtnl_lock() must be held.
 1058 * Return: pointer to the net_device, or NULL if not found
 1059 */
 1060struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
 1061				   const char *ha)
 1062{
 1063	struct net_device *dev;
 1064
 1065	ASSERT_RTNL();
 1066	for_each_netdev(net, dev)
 1067		if (dev_addr_cmp(dev, type, ha))
 1068			return dev;
 1069
 1070	return NULL;
 1071}
 1072EXPORT_SYMBOL(dev_getbyhwaddr);
 1073
 1074struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1075{
 1076	struct net_device *dev, *ret = NULL;
 1077
 1078	rcu_read_lock();
 1079	for_each_netdev_rcu(net, dev)
 1080		if (dev->type == type) {
 1081			dev_hold(dev);
 1082			ret = dev;
 1083			break;
 1084		}
 1085	rcu_read_unlock();
 1086	return ret;
 1087}
 1088EXPORT_SYMBOL(dev_getfirstbyhwtype);
 1089
 1090/**
 1091 *	__dev_get_by_flags - find any device with given flags
 1092 *	@net: the applicable net namespace
 1093 *	@if_flags: IFF_* values
 1094 *	@mask: bitmask of bits in if_flags to check
 1095 *
 1096 *	Search for any interface with the given flags. Returns NULL if a device
 1097 *	is not found or a pointer to the device. Must be called inside
 1098 *	rtnl_lock(), and result refcount is unchanged.
 1099 */
 1100
 1101struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1102				      unsigned short mask)
 1103{
 1104	struct net_device *dev, *ret;
 1105
 1106	ASSERT_RTNL();
 1107
 1108	ret = NULL;
 1109	for_each_netdev(net, dev) {
 1110		if (((dev->flags ^ if_flags) & mask) == 0) {
 1111			ret = dev;
 1112			break;
 1113		}
 1114	}
 1115	return ret;
 1116}
 1117EXPORT_SYMBOL(__dev_get_by_flags);
 1118
 1119/**
 1120 *	dev_valid_name - check if name is okay for network device
 1121 *	@name: name string
 1122 *
 1123 *	Network device names need to be valid file names to
 1124 *	allow sysfs to work.  We also disallow any kind of
 1125 *	whitespace.
 1126 */
 1127bool dev_valid_name(const char *name)
 1128{
 1129	if (*name == '\0')
 1130		return false;
 1131	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1132		return false;
 1133	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1134		return false;
 1135
 1136	while (*name) {
 1137		if (*name == '/' || *name == ':' || isspace(*name))
 1138			return false;
 1139		name++;
 1140	}
 1141	return true;
 1142}
 1143EXPORT_SYMBOL(dev_valid_name);
 1144
 1145/**
 1146 *	__dev_alloc_name - allocate a name for a device
 1147 *	@net: network namespace to allocate the device name in
 1148 *	@name: name format string
 1149 *	@res: result name string
 1150 *
 1151 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1152 *	id. It scans list of devices to build up a free map, then chooses
 1153 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1154 *	while allocating the name and adding the device in order to avoid
 1155 *	duplicates.
 1156 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1157 *	Returns the number of the unit assigned or a negative errno code.
 1158 */
 1159
 1160static int __dev_alloc_name(struct net *net, const char *name, char *res)
 1161{
 1162	int i = 0;
 1163	const char *p;
 1164	const int max_netdevices = 8*PAGE_SIZE;
 1165	unsigned long *inuse;
 1166	struct net_device *d;
 1167	char buf[IFNAMSIZ];
 1168
 1169	/* Verify the string as this thing may have come from the user.
 1170	 * There must be one "%d" and no other "%" characters.
 1171	 */
 1172	p = strchr(name, '%');
 1173	if (!p || p[1] != 'd' || strchr(p + 2, '%'))
 1174		return -EINVAL;
 1175
 1176	/* Use one page as a bit array of possible slots */
 1177	inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
 1178	if (!inuse)
 1179		return -ENOMEM;
 
 
 
 
 
 1180
 1181	for_each_netdev(net, d) {
 1182		struct netdev_name_node *name_node;
 
 
 1183
 1184		netdev_for_each_altname(d, name_node) {
 1185			if (!sscanf(name_node->name, name, &i))
 1186				continue;
 1187			if (i < 0 || i >= max_netdevices)
 1188				continue;
 1189
 1190			/* avoid cases where sscanf is not exact inverse of printf */
 1191			snprintf(buf, IFNAMSIZ, name, i);
 1192			if (!strncmp(buf, name_node->name, IFNAMSIZ))
 1193				__set_bit(i, inuse);
 1194		}
 1195		if (!sscanf(d->name, name, &i))
 1196			continue;
 1197		if (i < 0 || i >= max_netdevices)
 1198			continue;
 1199
 1200		/* avoid cases where sscanf is not exact inverse of printf */
 1201		snprintf(buf, IFNAMSIZ, name, i);
 1202		if (!strncmp(buf, d->name, IFNAMSIZ))
 1203			__set_bit(i, inuse);
 1204	}
 1205
 1206	i = find_first_zero_bit(inuse, max_netdevices);
 1207	bitmap_free(inuse);
 1208	if (i == max_netdevices)
 1209		return -ENFILE;
 1210
 1211	/* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
 1212	strscpy(buf, name, IFNAMSIZ);
 1213	snprintf(res, IFNAMSIZ, buf, i);
 1214	return i;
 1215}
 1216
 1217/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
 1218static int dev_prep_valid_name(struct net *net, struct net_device *dev,
 1219			       const char *want_name, char *out_name,
 1220			       int dup_errno)
 1221{
 1222	if (!dev_valid_name(want_name))
 1223		return -EINVAL;
 1224
 1225	if (strchr(want_name, '%'))
 1226		return __dev_alloc_name(net, want_name, out_name);
 1227
 1228	if (netdev_name_in_use(net, want_name))
 1229		return -dup_errno;
 1230	if (out_name != want_name)
 1231		strscpy(out_name, want_name, IFNAMSIZ);
 1232	return 0;
 1233}
 1234
 1235/**
 1236 *	dev_alloc_name - allocate a name for a device
 1237 *	@dev: device
 1238 *	@name: name format string
 1239 *
 1240 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1241 *	id. It scans list of devices to build up a free map, then chooses
 1242 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1243 *	while allocating the name and adding the device in order to avoid
 1244 *	duplicates.
 1245 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1246 *	Returns the number of the unit assigned or a negative errno code.
 1247 */
 1248
 1249int dev_alloc_name(struct net_device *dev, const char *name)
 1250{
 1251	return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
 
 
 
 
 
 
 
 
 
 1252}
 1253EXPORT_SYMBOL(dev_alloc_name);
 1254
 1255static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1256			      const char *name)
 
 1257{
 
 1258	int ret;
 1259
 1260	ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
 1261	return ret < 0 ? ret : 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 1262}
 1263
 1264/**
 1265 *	dev_change_name - change name of a device
 1266 *	@dev: device
 1267 *	@newname: name (or format string) must be at least IFNAMSIZ
 1268 *
 1269 *	Change name of a device, can pass format strings "eth%d".
 1270 *	for wildcarding.
 1271 */
 1272int dev_change_name(struct net_device *dev, const char *newname)
 1273{
 1274	unsigned char old_assign_type;
 1275	char oldname[IFNAMSIZ];
 1276	int err = 0;
 1277	int ret;
 1278	struct net *net;
 1279
 1280	ASSERT_RTNL();
 1281	BUG_ON(!dev_net(dev));
 1282
 1283	net = dev_net(dev);
 
 
 1284
 1285	down_write(&devnet_rename_sem);
 1286
 1287	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1288		up_write(&devnet_rename_sem);
 1289		return 0;
 1290	}
 1291
 1292	memcpy(oldname, dev->name, IFNAMSIZ);
 1293
 1294	write_seqlock_bh(&netdev_rename_lock);
 1295	err = dev_get_valid_name(net, dev, newname);
 1296	write_sequnlock_bh(&netdev_rename_lock);
 1297
 1298	if (err < 0) {
 1299		up_write(&devnet_rename_sem);
 1300		return err;
 1301	}
 1302
 1303	if (oldname[0] && !strchr(oldname, '%'))
 1304		netdev_info(dev, "renamed from %s%s\n", oldname,
 1305			    dev->flags & IFF_UP ? " (while UP)" : "");
 1306
 1307	old_assign_type = dev->name_assign_type;
 1308	WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
 1309
 1310rollback:
 1311	ret = device_rename(&dev->dev, dev->name);
 1312	if (ret) {
 1313		write_seqlock_bh(&netdev_rename_lock);
 1314		memcpy(dev->name, oldname, IFNAMSIZ);
 1315		write_sequnlock_bh(&netdev_rename_lock);
 1316		WRITE_ONCE(dev->name_assign_type, old_assign_type);
 1317		up_write(&devnet_rename_sem);
 1318		return ret;
 1319	}
 1320
 1321	up_write(&devnet_rename_sem);
 1322
 1323	netdev_adjacent_rename_links(dev, oldname);
 1324
 1325	netdev_name_node_del(dev->name_node);
 1326
 1327	synchronize_net();
 1328
 1329	netdev_name_node_add(net, dev->name_node);
 
 
 
 
 1330
 1331	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1332	ret = notifier_to_errno(ret);
 1333
 1334	if (ret) {
 1335		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1336		if (err >= 0) {
 1337			err = ret;
 1338			down_write(&devnet_rename_sem);
 1339			write_seqlock_bh(&netdev_rename_lock);
 1340			memcpy(dev->name, oldname, IFNAMSIZ);
 1341			write_sequnlock_bh(&netdev_rename_lock);
 1342			memcpy(oldname, newname, IFNAMSIZ);
 1343			WRITE_ONCE(dev->name_assign_type, old_assign_type);
 1344			old_assign_type = NET_NAME_RENAMED;
 1345			goto rollback;
 1346		} else {
 1347			netdev_err(dev, "name change rollback failed: %d\n",
 1348				   ret);
 1349		}
 1350	}
 1351
 1352	return err;
 1353}
 1354
 1355/**
 1356 *	dev_set_alias - change ifalias of a device
 1357 *	@dev: device
 1358 *	@alias: name up to IFALIASZ
 1359 *	@len: limit of bytes to copy from info
 1360 *
 1361 *	Set ifalias for a device,
 1362 */
 1363int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1364{
 1365	struct dev_ifalias *new_alias = NULL;
 
 
 1366
 1367	if (len >= IFALIASZ)
 1368		return -EINVAL;
 1369
 1370	if (len) {
 1371		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1372		if (!new_alias)
 1373			return -ENOMEM;
 1374
 1375		memcpy(new_alias->ifalias, alias, len);
 1376		new_alias->ifalias[len] = 0;
 1377	}
 1378
 1379	mutex_lock(&ifalias_mutex);
 1380	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1381					mutex_is_locked(&ifalias_mutex));
 1382	mutex_unlock(&ifalias_mutex);
 1383
 1384	if (new_alias)
 1385		kfree_rcu(new_alias, rcuhead);
 1386
 
 1387	return len;
 1388}
 1389EXPORT_SYMBOL(dev_set_alias);
 1390
 1391/**
 1392 *	dev_get_alias - get ifalias of a device
 1393 *	@dev: device
 1394 *	@name: buffer to store name of ifalias
 1395 *	@len: size of buffer
 1396 *
 1397 *	get ifalias for a device.  Caller must make sure dev cannot go
 1398 *	away,  e.g. rcu read lock or own a reference count to device.
 1399 */
 1400int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1401{
 1402	const struct dev_ifalias *alias;
 1403	int ret = 0;
 1404
 1405	rcu_read_lock();
 1406	alias = rcu_dereference(dev->ifalias);
 1407	if (alias)
 1408		ret = snprintf(name, len, "%s", alias->ifalias);
 1409	rcu_read_unlock();
 1410
 1411	return ret;
 1412}
 1413
 1414/**
 1415 *	netdev_features_change - device changes features
 1416 *	@dev: device to cause notification
 1417 *
 1418 *	Called to indicate a device has changed features.
 1419 */
 1420void netdev_features_change(struct net_device *dev)
 1421{
 1422	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1423}
 1424EXPORT_SYMBOL(netdev_features_change);
 1425
 1426/**
 1427 *	netdev_state_change - device changes state
 1428 *	@dev: device to cause notification
 1429 *
 1430 *	Called to indicate a device has changed state. This function calls
 1431 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1432 *	to the routing socket.
 1433 */
 1434void netdev_state_change(struct net_device *dev)
 1435{
 1436	if (dev->flags & IFF_UP) {
 1437		struct netdev_notifier_change_info change_info = {
 1438			.info.dev = dev,
 1439		};
 1440
 1441		call_netdevice_notifiers_info(NETDEV_CHANGE,
 
 1442					      &change_info.info);
 1443		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
 1444	}
 1445}
 1446EXPORT_SYMBOL(netdev_state_change);
 1447
 1448/**
 1449 * __netdev_notify_peers - notify network peers about existence of @dev,
 1450 * to be called when rtnl lock is already held.
 1451 * @dev: network device
 1452 *
 1453 * Generate traffic such that interested network peers are aware of
 1454 * @dev, such as by generating a gratuitous ARP. This may be used when
 1455 * a device wants to inform the rest of the network about some sort of
 1456 * reconfiguration such as a failover event or virtual machine
 1457 * migration.
 1458 */
 1459void __netdev_notify_peers(struct net_device *dev)
 1460{
 1461	ASSERT_RTNL();
 1462	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1463	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1464}
 1465EXPORT_SYMBOL(__netdev_notify_peers);
 1466
 1467/**
 1468 * netdev_notify_peers - notify network peers about existence of @dev
 1469 * @dev: network device
 1470 *
 1471 * Generate traffic such that interested network peers are aware of
 1472 * @dev, such as by generating a gratuitous ARP. This may be used when
 1473 * a device wants to inform the rest of the network about some sort of
 1474 * reconfiguration such as a failover event or virtual machine
 1475 * migration.
 1476 */
 1477void netdev_notify_peers(struct net_device *dev)
 1478{
 1479	rtnl_lock();
 1480	__netdev_notify_peers(dev);
 1481	rtnl_unlock();
 1482}
 1483EXPORT_SYMBOL(netdev_notify_peers);
 1484
 1485static int napi_threaded_poll(void *data);
 1486
 1487static int napi_kthread_create(struct napi_struct *n)
 1488{
 1489	int err = 0;
 1490
 1491	/* Create and wake up the kthread once to put it in
 1492	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
 1493	 * warning and work with loadavg.
 1494	 */
 1495	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
 1496				n->dev->name, n->napi_id);
 1497	if (IS_ERR(n->thread)) {
 1498		err = PTR_ERR(n->thread);
 1499		pr_err("kthread_run failed with err %d\n", err);
 1500		n->thread = NULL;
 1501	}
 1502
 1503	return err;
 1504}
 1505
 1506static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1507{
 1508	const struct net_device_ops *ops = dev->netdev_ops;
 1509	int ret;
 1510
 1511	ASSERT_RTNL();
 1512	dev_addr_check(dev);
 1513
 1514	if (!netif_device_present(dev)) {
 1515		/* may be detached because parent is runtime-suspended */
 1516		if (dev->dev.parent)
 1517			pm_runtime_resume(dev->dev.parent);
 1518		if (!netif_device_present(dev))
 1519			return -ENODEV;
 1520	}
 1521
 1522	/* Block netpoll from trying to do any rx path servicing.
 1523	 * If we don't do this there is a chance ndo_poll_controller
 1524	 * or ndo_poll may be running while we open the device
 1525	 */
 1526	netpoll_poll_disable(dev);
 1527
 1528	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1529	ret = notifier_to_errno(ret);
 1530	if (ret)
 1531		return ret;
 1532
 1533	set_bit(__LINK_STATE_START, &dev->state);
 1534
 1535	if (ops->ndo_validate_addr)
 1536		ret = ops->ndo_validate_addr(dev);
 1537
 1538	if (!ret && ops->ndo_open)
 1539		ret = ops->ndo_open(dev);
 1540
 1541	netpoll_poll_enable(dev);
 1542
 1543	if (ret)
 1544		clear_bit(__LINK_STATE_START, &dev->state);
 1545	else {
 1546		dev->flags |= IFF_UP;
 1547		dev_set_rx_mode(dev);
 1548		dev_activate(dev);
 1549		add_device_randomness(dev->dev_addr, dev->addr_len);
 1550	}
 1551
 1552	return ret;
 1553}
 1554
 1555/**
 1556 *	dev_open	- prepare an interface for use.
 1557 *	@dev: device to open
 1558 *	@extack: netlink extended ack
 1559 *
 1560 *	Takes a device from down to up state. The device's private open
 1561 *	function is invoked and then the multicast lists are loaded. Finally
 1562 *	the device is moved into the up state and a %NETDEV_UP message is
 1563 *	sent to the netdev notifier chain.
 1564 *
 1565 *	Calling this function on an active interface is a nop. On a failure
 1566 *	a negative errno code is returned.
 1567 */
 1568int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1569{
 1570	int ret;
 1571
 1572	if (dev->flags & IFF_UP)
 1573		return 0;
 1574
 1575	ret = __dev_open(dev, extack);
 1576	if (ret < 0)
 1577		return ret;
 1578
 1579	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 1580	call_netdevice_notifiers(NETDEV_UP, dev);
 1581
 1582	return ret;
 1583}
 1584EXPORT_SYMBOL(dev_open);
 1585
 1586static void __dev_close_many(struct list_head *head)
 1587{
 1588	struct net_device *dev;
 1589
 1590	ASSERT_RTNL();
 1591	might_sleep();
 1592
 1593	list_for_each_entry(dev, head, close_list) {
 1594		/* Temporarily disable netpoll until the interface is down */
 1595		netpoll_poll_disable(dev);
 1596
 1597		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1598
 1599		clear_bit(__LINK_STATE_START, &dev->state);
 1600
 1601		/* Synchronize to scheduled poll. We cannot touch poll list, it
 1602		 * can be even on different cpu. So just clear netif_running().
 1603		 *
 1604		 * dev->stop() will invoke napi_disable() on all of it's
 1605		 * napi_struct instances on this device.
 1606		 */
 1607		smp_mb__after_atomic(); /* Commit netif_running(). */
 1608	}
 1609
 1610	dev_deactivate_many(head);
 1611
 1612	list_for_each_entry(dev, head, close_list) {
 1613		const struct net_device_ops *ops = dev->netdev_ops;
 1614
 1615		/*
 1616		 *	Call the device specific close. This cannot fail.
 1617		 *	Only if device is UP
 1618		 *
 1619		 *	We allow it to be called even after a DETACH hot-plug
 1620		 *	event.
 1621		 */
 1622		if (ops->ndo_stop)
 1623			ops->ndo_stop(dev);
 1624
 1625		dev->flags &= ~IFF_UP;
 1626		netpoll_poll_enable(dev);
 1627	}
 
 
 1628}
 1629
 1630static void __dev_close(struct net_device *dev)
 1631{
 
 1632	LIST_HEAD(single);
 1633
 1634	list_add(&dev->close_list, &single);
 1635	__dev_close_many(&single);
 1636	list_del(&single);
 
 
 1637}
 1638
 1639void dev_close_many(struct list_head *head, bool unlink)
 1640{
 1641	struct net_device *dev, *tmp;
 1642
 1643	/* Remove the devices that don't need to be closed */
 1644	list_for_each_entry_safe(dev, tmp, head, close_list)
 1645		if (!(dev->flags & IFF_UP))
 1646			list_del_init(&dev->close_list);
 1647
 1648	__dev_close_many(head);
 1649
 1650	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1651		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
 1652		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1653		if (unlink)
 1654			list_del_init(&dev->close_list);
 1655	}
 
 
 1656}
 1657EXPORT_SYMBOL(dev_close_many);
 1658
 1659/**
 1660 *	dev_close - shutdown an interface.
 1661 *	@dev: device to shutdown
 1662 *
 1663 *	This function moves an active device into down state. A
 1664 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1665 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1666 *	chain.
 1667 */
 1668void dev_close(struct net_device *dev)
 1669{
 1670	if (dev->flags & IFF_UP) {
 1671		LIST_HEAD(single);
 1672
 1673		list_add(&dev->close_list, &single);
 1674		dev_close_many(&single, true);
 1675		list_del(&single);
 1676	}
 
 1677}
 1678EXPORT_SYMBOL(dev_close);
 1679
 1680
 1681/**
 1682 *	dev_disable_lro - disable Large Receive Offload on a device
 1683 *	@dev: device
 1684 *
 1685 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1686 *	called under RTNL.  This is needed if received packets may be
 1687 *	forwarded to another interface.
 1688 */
 1689void dev_disable_lro(struct net_device *dev)
 1690{
 1691	struct net_device *lower_dev;
 1692	struct list_head *iter;
 1693
 1694	dev->wanted_features &= ~NETIF_F_LRO;
 1695	netdev_update_features(dev);
 1696
 1697	if (unlikely(dev->features & NETIF_F_LRO))
 1698		netdev_WARN(dev, "failed to disable LRO!\n");
 1699
 1700	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1701		dev_disable_lro(lower_dev);
 1702}
 1703EXPORT_SYMBOL(dev_disable_lro);
 1704
 1705/**
 1706 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1707 *	@dev: device
 1708 *
 1709 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1710 *	called under RTNL.  This is needed if Generic XDP is installed on
 1711 *	the device.
 1712 */
 1713static void dev_disable_gro_hw(struct net_device *dev)
 1714{
 1715	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1716	netdev_update_features(dev);
 1717
 1718	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1719		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1720}
 1721
 1722const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1723{
 1724#define N(val) 						\
 1725	case NETDEV_##val:				\
 1726		return "NETDEV_" __stringify(val);
 1727	switch (cmd) {
 1728	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1729	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1730	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1731	N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
 1732	N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
 1733	N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
 1734	N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1735	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1736	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1737	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 1738	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
 1739	N(XDP_FEAT_CHANGE)
 1740	}
 1741#undef N
 1742	return "UNKNOWN_NETDEV_EVENT";
 1743}
 1744EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1745
 1746static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1747				   struct net_device *dev)
 1748{
 1749	struct netdev_notifier_info info = {
 1750		.dev = dev,
 1751	};
 1752
 
 1753	return nb->notifier_call(nb, val, &info);
 1754}
 1755
 1756static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1757					     struct net_device *dev)
 1758{
 1759	int err;
 1760
 1761	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1762	err = notifier_to_errno(err);
 1763	if (err)
 1764		return err;
 1765
 1766	if (!(dev->flags & IFF_UP))
 1767		return 0;
 1768
 1769	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1770	return 0;
 1771}
 1772
 1773static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1774						struct net_device *dev)
 1775{
 1776	if (dev->flags & IFF_UP) {
 1777		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1778					dev);
 1779		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1780	}
 1781	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1782}
 1783
 1784static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1785						 struct net *net)
 1786{
 1787	struct net_device *dev;
 1788	int err;
 1789
 1790	for_each_netdev(net, dev) {
 1791		err = call_netdevice_register_notifiers(nb, dev);
 1792		if (err)
 1793			goto rollback;
 1794	}
 1795	return 0;
 1796
 1797rollback:
 1798	for_each_netdev_continue_reverse(net, dev)
 1799		call_netdevice_unregister_notifiers(nb, dev);
 1800	return err;
 1801}
 1802
 1803static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1804						    struct net *net)
 1805{
 1806	struct net_device *dev;
 1807
 1808	for_each_netdev(net, dev)
 1809		call_netdevice_unregister_notifiers(nb, dev);
 1810}
 1811
 1812static int dev_boot_phase = 1;
 1813
 1814/**
 1815 * register_netdevice_notifier - register a network notifier block
 1816 * @nb: notifier
 1817 *
 1818 * Register a notifier to be called when network device events occur.
 1819 * The notifier passed is linked into the kernel structures and must
 1820 * not be reused until it has been unregistered. A negative errno code
 1821 * is returned on a failure.
 1822 *
 1823 * When registered all registration and up events are replayed
 1824 * to the new notifier to allow device to have a race free
 1825 * view of the network device list.
 1826 */
 1827
 1828int register_netdevice_notifier(struct notifier_block *nb)
 1829{
 
 
 1830	struct net *net;
 1831	int err;
 1832
 1833	/* Close race with setup_net() and cleanup_net() */
 1834	down_write(&pernet_ops_rwsem);
 1835	rtnl_lock();
 1836	err = raw_notifier_chain_register(&netdev_chain, nb);
 1837	if (err)
 1838		goto unlock;
 1839	if (dev_boot_phase)
 1840		goto unlock;
 1841	for_each_net(net) {
 1842		err = call_netdevice_register_net_notifiers(nb, net);
 1843		if (err)
 1844			goto rollback;
 
 
 
 
 
 
 
 
 1845	}
 1846
 1847unlock:
 1848	rtnl_unlock();
 1849	up_write(&pernet_ops_rwsem);
 1850	return err;
 1851
 1852rollback:
 1853	for_each_net_continue_reverse(net)
 1854		call_netdevice_unregister_net_notifiers(nb, net);
 
 
 
 
 
 
 
 
 
 
 
 
 1855
 
 1856	raw_notifier_chain_unregister(&netdev_chain, nb);
 1857	goto unlock;
 1858}
 1859EXPORT_SYMBOL(register_netdevice_notifier);
 1860
 1861/**
 1862 * unregister_netdevice_notifier - unregister a network notifier block
 1863 * @nb: notifier
 1864 *
 1865 * Unregister a notifier previously registered by
 1866 * register_netdevice_notifier(). The notifier is unlinked into the
 1867 * kernel structures and may then be reused. A negative errno code
 1868 * is returned on a failure.
 1869 *
 1870 * After unregistering unregister and down device events are synthesized
 1871 * for all devices on the device list to the removed notifier to remove
 1872 * the need for special case cleanup code.
 1873 */
 1874
 1875int unregister_netdevice_notifier(struct notifier_block *nb)
 1876{
 
 1877	struct net *net;
 1878	int err;
 1879
 1880	/* Close race with setup_net() and cleanup_net() */
 1881	down_write(&pernet_ops_rwsem);
 1882	rtnl_lock();
 1883	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1884	if (err)
 1885		goto unlock;
 1886
 1887	for_each_net(net)
 1888		call_netdevice_unregister_net_notifiers(nb, net);
 1889
 
 
 
 
 
 
 
 1890unlock:
 1891	rtnl_unlock();
 1892	up_write(&pernet_ops_rwsem);
 1893	return err;
 1894}
 1895EXPORT_SYMBOL(unregister_netdevice_notifier);
 1896
 1897static int __register_netdevice_notifier_net(struct net *net,
 1898					     struct notifier_block *nb,
 1899					     bool ignore_call_fail)
 1900{
 1901	int err;
 1902
 1903	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1904	if (err)
 1905		return err;
 1906	if (dev_boot_phase)
 1907		return 0;
 1908
 1909	err = call_netdevice_register_net_notifiers(nb, net);
 1910	if (err && !ignore_call_fail)
 1911		goto chain_unregister;
 1912
 1913	return 0;
 1914
 1915chain_unregister:
 1916	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1917	return err;
 1918}
 1919
 1920static int __unregister_netdevice_notifier_net(struct net *net,
 1921					       struct notifier_block *nb)
 1922{
 1923	int err;
 1924
 1925	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1926	if (err)
 1927		return err;
 1928
 1929	call_netdevice_unregister_net_notifiers(nb, net);
 1930	return 0;
 1931}
 1932
 1933/**
 1934 * register_netdevice_notifier_net - register a per-netns network notifier block
 1935 * @net: network namespace
 1936 * @nb: notifier
 1937 *
 1938 * Register a notifier to be called when network device events occur.
 1939 * The notifier passed is linked into the kernel structures and must
 1940 * not be reused until it has been unregistered. A negative errno code
 1941 * is returned on a failure.
 1942 *
 1943 * When registered all registration and up events are replayed
 1944 * to the new notifier to allow device to have a race free
 1945 * view of the network device list.
 1946 */
 1947
 1948int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1949{
 1950	int err;
 1951
 1952	rtnl_lock();
 1953	err = __register_netdevice_notifier_net(net, nb, false);
 1954	rtnl_unlock();
 1955	return err;
 1956}
 1957EXPORT_SYMBOL(register_netdevice_notifier_net);
 1958
 1959/**
 1960 * unregister_netdevice_notifier_net - unregister a per-netns
 1961 *                                     network notifier block
 1962 * @net: network namespace
 1963 * @nb: notifier
 1964 *
 1965 * Unregister a notifier previously registered by
 1966 * register_netdevice_notifier_net(). The notifier is unlinked from the
 1967 * kernel structures and may then be reused. A negative errno code
 1968 * is returned on a failure.
 1969 *
 1970 * After unregistering unregister and down device events are synthesized
 1971 * for all devices on the device list to the removed notifier to remove
 1972 * the need for special case cleanup code.
 1973 */
 1974
 1975int unregister_netdevice_notifier_net(struct net *net,
 1976				      struct notifier_block *nb)
 1977{
 1978	int err;
 1979
 1980	rtnl_lock();
 1981	err = __unregister_netdevice_notifier_net(net, nb);
 1982	rtnl_unlock();
 1983	return err;
 1984}
 1985EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1986
 1987static void __move_netdevice_notifier_net(struct net *src_net,
 1988					  struct net *dst_net,
 1989					  struct notifier_block *nb)
 1990{
 1991	__unregister_netdevice_notifier_net(src_net, nb);
 1992	__register_netdevice_notifier_net(dst_net, nb, true);
 1993}
 1994
 1995int register_netdevice_notifier_dev_net(struct net_device *dev,
 1996					struct notifier_block *nb,
 1997					struct netdev_net_notifier *nn)
 1998{
 1999	int err;
 2000
 2001	rtnl_lock();
 2002	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 2003	if (!err) {
 2004		nn->nb = nb;
 2005		list_add(&nn->list, &dev->net_notifier_list);
 2006	}
 2007	rtnl_unlock();
 2008	return err;
 2009}
 2010EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 2011
 2012int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 2013					  struct notifier_block *nb,
 2014					  struct netdev_net_notifier *nn)
 2015{
 2016	int err;
 2017
 2018	rtnl_lock();
 2019	list_del(&nn->list);
 2020	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 2021	rtnl_unlock();
 2022	return err;
 2023}
 2024EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 2025
 2026static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 2027					     struct net *net)
 2028{
 2029	struct netdev_net_notifier *nn;
 2030
 2031	list_for_each_entry(nn, &dev->net_notifier_list, list)
 2032		__move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
 2033}
 2034
 2035/**
 2036 *	call_netdevice_notifiers_info - call all network notifier blocks
 2037 *	@val: value passed unmodified to notifier function
 
 2038 *	@info: notifier information data
 2039 *
 2040 *	Call all network notifier blocks.  Parameters and return value
 2041 *	are as for raw_notifier_call_chain().
 2042 */
 2043
 2044int call_netdevice_notifiers_info(unsigned long val,
 2045				  struct netdev_notifier_info *info)
 
 2046{
 2047	struct net *net = dev_net(info->dev);
 2048	int ret;
 2049
 2050	ASSERT_RTNL();
 2051
 2052	/* Run per-netns notifier block chain first, then run the global one.
 2053	 * Hopefully, one day, the global one is going to be removed after
 2054	 * all notifier block registrators get converted to be per-netns.
 2055	 */
 2056	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 2057	if (ret & NOTIFY_STOP_MASK)
 2058		return ret;
 2059	return raw_notifier_call_chain(&netdev_chain, val, info);
 2060}
 2061
 2062/**
 2063 *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 2064 *	                                       for and rollback on error
 2065 *	@val_up: value passed unmodified to notifier function
 2066 *	@val_down: value passed unmodified to the notifier function when
 2067 *	           recovering from an error on @val_up
 2068 *	@info: notifier information data
 2069 *
 2070 *	Call all per-netns network notifier blocks, but not notifier blocks on
 2071 *	the global notifier chain. Parameters and return value are as for
 2072 *	raw_notifier_call_chain_robust().
 2073 */
 2074
 2075static int
 2076call_netdevice_notifiers_info_robust(unsigned long val_up,
 2077				     unsigned long val_down,
 2078				     struct netdev_notifier_info *info)
 2079{
 2080	struct net *net = dev_net(info->dev);
 2081
 2082	ASSERT_RTNL();
 2083
 2084	return raw_notifier_call_chain_robust(&net->netdev_chain,
 2085					      val_up, val_down, info);
 2086}
 2087
 2088static int call_netdevice_notifiers_extack(unsigned long val,
 2089					   struct net_device *dev,
 2090					   struct netlink_ext_ack *extack)
 2091{
 2092	struct netdev_notifier_info info = {
 2093		.dev = dev,
 2094		.extack = extack,
 2095	};
 2096
 2097	return call_netdevice_notifiers_info(val, &info);
 2098}
 2099
 2100/**
 2101 *	call_netdevice_notifiers - call all network notifier blocks
 2102 *      @val: value passed unmodified to notifier function
 2103 *      @dev: net_device pointer passed unmodified to notifier function
 2104 *
 2105 *	Call all network notifier blocks.  Parameters and return value
 2106 *	are as for raw_notifier_call_chain().
 2107 */
 2108
 2109int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 2110{
 2111	return call_netdevice_notifiers_extack(val, dev, NULL);
 2112}
 2113EXPORT_SYMBOL(call_netdevice_notifiers);
 2114
 2115/**
 2116 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2117 *	@val: value passed unmodified to notifier function
 2118 *	@dev: net_device pointer passed unmodified to notifier function
 2119 *	@arg: additional u32 argument passed to the notifier function
 2120 *
 2121 *	Call all network notifier blocks.  Parameters and return value
 2122 *	are as for raw_notifier_call_chain().
 2123 */
 2124static int call_netdevice_notifiers_mtu(unsigned long val,
 2125					struct net_device *dev, u32 arg)
 2126{
 2127	struct netdev_notifier_info_ext info = {
 2128		.info.dev = dev,
 2129		.ext.mtu = arg,
 2130	};
 2131
 2132	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2133
 2134	return call_netdevice_notifiers_info(val, &info.info);
 2135}
 
 2136
 2137#ifdef CONFIG_NET_INGRESS
 2138static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 2139
 2140void net_inc_ingress_queue(void)
 2141{
 2142	static_branch_inc(&ingress_needed_key);
 2143}
 2144EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 2145
 2146void net_dec_ingress_queue(void)
 2147{
 2148	static_branch_dec(&ingress_needed_key);
 2149}
 2150EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 2151#endif
 2152
 2153#ifdef CONFIG_NET_EGRESS
 2154static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 2155
 2156void net_inc_egress_queue(void)
 2157{
 2158	static_branch_inc(&egress_needed_key);
 2159}
 2160EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 2161
 2162void net_dec_egress_queue(void)
 2163{
 2164	static_branch_dec(&egress_needed_key);
 2165}
 2166EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 2167#endif
 2168
 2169#ifdef CONFIG_NET_CLS_ACT
 2170DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
 2171EXPORT_SYMBOL(tcf_sw_enabled_key);
 2172#endif
 2173
 2174DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2175EXPORT_SYMBOL(netstamp_needed_key);
 2176#ifdef CONFIG_JUMP_LABEL
 2177static atomic_t netstamp_needed_deferred;
 2178static atomic_t netstamp_wanted;
 2179static void netstamp_clear(struct work_struct *work)
 2180{
 2181	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2182	int wanted;
 2183
 2184	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2185	if (wanted > 0)
 2186		static_branch_enable(&netstamp_needed_key);
 2187	else
 2188		static_branch_disable(&netstamp_needed_key);
 2189}
 2190static DECLARE_WORK(netstamp_work, netstamp_clear);
 2191#endif
 2192
 2193void net_enable_timestamp(void)
 2194{
 2195#ifdef CONFIG_JUMP_LABEL
 2196	int wanted = atomic_read(&netstamp_wanted);
 2197
 2198	while (wanted > 0) {
 2199		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
 
 
 
 2200			return;
 2201	}
 2202	atomic_inc(&netstamp_needed_deferred);
 2203	schedule_work(&netstamp_work);
 2204#else
 2205	static_branch_inc(&netstamp_needed_key);
 2206#endif
 2207}
 2208EXPORT_SYMBOL(net_enable_timestamp);
 2209
 2210void net_disable_timestamp(void)
 2211{
 2212#ifdef CONFIG_JUMP_LABEL
 2213	int wanted = atomic_read(&netstamp_wanted);
 2214
 2215	while (wanted > 1) {
 2216		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
 
 
 
 2217			return;
 2218	}
 2219	atomic_dec(&netstamp_needed_deferred);
 2220	schedule_work(&netstamp_work);
 2221#else
 2222	static_branch_dec(&netstamp_needed_key);
 2223#endif
 2224}
 2225EXPORT_SYMBOL(net_disable_timestamp);
 2226
 2227static inline void net_timestamp_set(struct sk_buff *skb)
 2228{
 2229	skb->tstamp = 0;
 2230	skb->tstamp_type = SKB_CLOCK_REALTIME;
 2231	if (static_branch_unlikely(&netstamp_needed_key))
 2232		skb->tstamp = ktime_get_real();
 2233}
 2234
 2235#define net_timestamp_check(COND, SKB)				\
 2236	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2237		if ((COND) && !(SKB)->tstamp)			\
 2238			(SKB)->tstamp = ktime_get_real();	\
 2239	}							\
 2240
 2241bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 2242{
 2243	return __is_skb_forwardable(dev, skb, true);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2244}
 2245EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2246
 2247static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
 2248			      bool check_mtu)
 2249{
 2250	int ret = ____dev_forward_skb(dev, skb, check_mtu);
 2251
 2252	if (likely(!ret)) {
 2253		skb->protocol = eth_type_trans(skb, dev);
 2254		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2255	}
 2256
 2257	return ret;
 2258}
 2259
 2260int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2261{
 2262	return __dev_forward_skb2(dev, skb, true);
 2263}
 2264EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2265
 2266/**
 2267 * dev_forward_skb - loopback an skb to another netif
 2268 *
 2269 * @dev: destination network device
 2270 * @skb: buffer to forward
 2271 *
 2272 * return values:
 2273 *	NET_RX_SUCCESS	(no congestion)
 2274 *	NET_RX_DROP     (packet was dropped, but freed)
 2275 *
 2276 * dev_forward_skb can be used for injecting an skb from the
 2277 * start_xmit function of one device into the receive queue
 2278 * of another device.
 2279 *
 2280 * The receiving device may be in another namespace, so
 2281 * we have to clear all information in the skb that could
 2282 * impact namespace isolation.
 2283 */
 2284int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2285{
 2286	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 2287}
 2288EXPORT_SYMBOL_GPL(dev_forward_skb);
 2289
 2290int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
 2291{
 2292	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
 2293}
 2294
 2295static inline int deliver_skb(struct sk_buff *skb,
 2296			      struct packet_type *pt_prev,
 2297			      struct net_device *orig_dev)
 2298{
 2299	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2300		return -ENOMEM;
 2301	refcount_inc(&skb->users);
 2302	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2303}
 2304
 2305static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2306					  struct packet_type **pt,
 2307					  struct net_device *orig_dev,
 2308					  __be16 type,
 2309					  struct list_head *ptype_list)
 2310{
 2311	struct packet_type *ptype, *pt_prev = *pt;
 2312
 2313	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2314		if (ptype->type != type)
 2315			continue;
 2316		if (pt_prev)
 2317			deliver_skb(skb, pt_prev, orig_dev);
 2318		pt_prev = ptype;
 2319	}
 2320	*pt = pt_prev;
 2321}
 2322
 2323static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2324{
 2325	if (!ptype->af_packet_priv || !skb->sk)
 2326		return false;
 2327
 2328	if (ptype->id_match)
 2329		return ptype->id_match(ptype, skb->sk);
 2330	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2331		return true;
 2332
 2333	return false;
 2334}
 2335
 2336/**
 2337 * dev_nit_active - return true if any network interface taps are in use
 2338 *
 2339 * @dev: network device to check for the presence of taps
 2340 */
 2341bool dev_nit_active(struct net_device *dev)
 2342{
 2343	return !list_empty(&net_hotdata.ptype_all) ||
 2344	       !list_empty(&dev->ptype_all);
 2345}
 2346EXPORT_SYMBOL_GPL(dev_nit_active);
 2347
 2348/*
 2349 *	Support routine. Sends outgoing frames to any network
 2350 *	taps currently in use.
 2351 */
 2352
 2353void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 2354{
 2355	struct list_head *ptype_list = &net_hotdata.ptype_all;
 2356	struct packet_type *ptype, *pt_prev = NULL;
 2357	struct sk_buff *skb2 = NULL;
 
 
 2358
 2359	rcu_read_lock();
 2360again:
 2361	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2362		if (READ_ONCE(ptype->ignore_outgoing))
 2363			continue;
 2364
 2365		/* Never send packets back to the socket
 2366		 * they originated from - MvS (miquels@drinkel.ow.org)
 2367		 */
 2368		if (skb_loop_sk(ptype, skb))
 2369			continue;
 2370
 2371		if (pt_prev) {
 2372			deliver_skb(skb2, pt_prev, skb->dev);
 2373			pt_prev = ptype;
 2374			continue;
 2375		}
 2376
 2377		/* need to clone skb, done only once */
 2378		skb2 = skb_clone(skb, GFP_ATOMIC);
 2379		if (!skb2)
 2380			goto out_unlock;
 2381
 2382		net_timestamp_set(skb2);
 2383
 2384		/* skb->nh should be correctly
 2385		 * set by sender, so that the second statement is
 2386		 * just protection against buggy protocols.
 2387		 */
 2388		skb_reset_mac_header(skb2);
 2389
 2390		if (skb_network_header(skb2) < skb2->data ||
 2391		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 2392			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 2393					     ntohs(skb2->protocol),
 2394					     dev->name);
 2395			skb_reset_network_header(skb2);
 2396		}
 2397
 2398		skb2->transport_header = skb2->network_header;
 2399		skb2->pkt_type = PACKET_OUTGOING;
 2400		pt_prev = ptype;
 2401	}
 2402
 2403	if (ptype_list == &net_hotdata.ptype_all) {
 2404		ptype_list = &dev->ptype_all;
 2405		goto again;
 2406	}
 2407out_unlock:
 2408	if (pt_prev) {
 2409		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 2410			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 2411		else
 2412			kfree_skb(skb2);
 2413	}
 2414	rcu_read_unlock();
 2415}
 2416EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2417
 2418/**
 2419 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2420 * @dev: Network device
 2421 * @txq: number of queues available
 2422 *
 2423 * If real_num_tx_queues is changed the tc mappings may no longer be
 2424 * valid. To resolve this verify the tc mapping remains valid and if
 2425 * not NULL the mapping. With no priorities mapping to this
 2426 * offset/count pair it will no longer be used. In the worst case TC0
 2427 * is invalid nothing can be done so disable priority mappings. If is
 2428 * expected that drivers will fix this mapping if they can before
 2429 * calling netif_set_real_num_tx_queues.
 2430 */
 2431static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2432{
 2433	int i;
 2434	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2435
 2436	/* If TC0 is invalidated disable TC mapping */
 2437	if (tc->offset + tc->count > txq) {
 2438		netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2439		dev->num_tc = 0;
 2440		return;
 2441	}
 2442
 2443	/* Invalidated prio to tc mappings set to TC0 */
 2444	for (i = 1; i < TC_BITMASK + 1; i++) {
 2445		int q = netdev_get_prio_tc_map(dev, i);
 2446
 2447		tc = &dev->tc_to_txq[q];
 2448		if (tc->offset + tc->count > txq) {
 2449			netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2450				    i, q);
 2451			netdev_set_prio_tc_map(dev, i, 0);
 2452		}
 2453	}
 2454}
 2455
 2456int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2457{
 2458	if (dev->num_tc) {
 2459		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2460		int i;
 2461
 2462		/* walk through the TCs and see if it falls into any of them */
 2463		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2464			if ((txq - tc->offset) < tc->count)
 2465				return i;
 2466		}
 2467
 2468		/* didn't find it, just return -1 to indicate no match */
 2469		return -1;
 2470	}
 2471
 2472	return 0;
 2473}
 2474EXPORT_SYMBOL(netdev_txq_to_tc);
 2475
 2476#ifdef CONFIG_XPS
 2477static struct static_key xps_needed __read_mostly;
 2478static struct static_key xps_rxqs_needed __read_mostly;
 2479static DEFINE_MUTEX(xps_map_mutex);
 2480#define xmap_dereference(P)		\
 2481	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2482
 2483static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2484			     struct xps_dev_maps *old_maps, int tci, u16 index)
 2485{
 2486	struct xps_map *map = NULL;
 2487	int pos;
 2488
 2489	map = xmap_dereference(dev_maps->attr_map[tci]);
 
 2490	if (!map)
 2491		return false;
 2492
 2493	for (pos = map->len; pos--;) {
 2494		if (map->queues[pos] != index)
 2495			continue;
 2496
 2497		if (map->len > 1) {
 2498			map->queues[pos] = map->queues[--map->len];
 2499			break;
 2500		}
 2501
 2502		if (old_maps)
 2503			RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
 2504		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2505		kfree_rcu(map, rcu);
 2506		return false;
 2507	}
 2508
 2509	return true;
 2510}
 2511
 2512static bool remove_xps_queue_cpu(struct net_device *dev,
 2513				 struct xps_dev_maps *dev_maps,
 2514				 int cpu, u16 offset, u16 count)
 2515{
 2516	int num_tc = dev_maps->num_tc;
 2517	bool active = false;
 2518	int tci;
 2519
 2520	for (tci = cpu * num_tc; num_tc--; tci++) {
 2521		int i, j;
 2522
 2523		for (i = count, j = offset; i--; j++) {
 2524			if (!remove_xps_queue(dev_maps, NULL, tci, j))
 2525				break;
 2526		}
 2527
 2528		active |= i < 0;
 2529	}
 2530
 2531	return active;
 2532}
 2533
 2534static void reset_xps_maps(struct net_device *dev,
 2535			   struct xps_dev_maps *dev_maps,
 2536			   enum xps_map_type type)
 2537{
 2538	static_key_slow_dec_cpuslocked(&xps_needed);
 2539	if (type == XPS_RXQS)
 2540		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2541
 2542	RCU_INIT_POINTER(dev->xps_maps[type], NULL);
 2543
 2544	kfree_rcu(dev_maps, rcu);
 2545}
 2546
 2547static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
 2548			   u16 offset, u16 count)
 2549{
 2550	struct xps_dev_maps *dev_maps;
 
 2551	bool active = false;
 2552	int i, j;
 2553
 2554	dev_maps = xmap_dereference(dev->xps_maps[type]);
 
 
 2555	if (!dev_maps)
 2556		return;
 2557
 2558	for (j = 0; j < dev_maps->nr_ids; j++)
 2559		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
 2560	if (!active)
 2561		reset_xps_maps(dev, dev_maps, type);
 2562
 2563	if (type == XPS_CPUS) {
 2564		for (i = offset + (count - 1); count--; i--)
 2565			netdev_queue_numa_node_write(
 2566				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
 2567	}
 2568}
 2569
 2570static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2571				   u16 count)
 2572{
 2573	if (!static_key_false(&xps_needed))
 2574		return;
 2575
 2576	cpus_read_lock();
 2577	mutex_lock(&xps_map_mutex);
 2578
 2579	if (static_key_false(&xps_rxqs_needed))
 2580		clean_xps_maps(dev, XPS_RXQS, offset, count);
 2581
 2582	clean_xps_maps(dev, XPS_CPUS, offset, count);
 2583
 
 2584	mutex_unlock(&xps_map_mutex);
 2585	cpus_read_unlock();
 2586}
 2587
 2588static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2589{
 2590	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2591}
 2592
 2593static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2594				      u16 index, bool is_rxqs_map)
 2595{
 2596	struct xps_map *new_map;
 2597	int alloc_len = XPS_MIN_MAP_ALLOC;
 2598	int i, pos;
 2599
 2600	for (pos = 0; map && pos < map->len; pos++) {
 2601		if (map->queues[pos] != index)
 2602			continue;
 2603		return map;
 2604	}
 2605
 2606	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2607	if (map) {
 2608		if (pos < map->alloc_len)
 2609			return map;
 2610
 2611		alloc_len = map->alloc_len * 2;
 2612	}
 2613
 2614	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2615	 *  map
 2616	 */
 2617	if (is_rxqs_map)
 2618		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2619	else
 2620		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2621				       cpu_to_node(attr_index));
 2622	if (!new_map)
 2623		return NULL;
 2624
 2625	for (i = 0; i < pos; i++)
 2626		new_map->queues[i] = map->queues[i];
 2627	new_map->alloc_len = alloc_len;
 2628	new_map->len = pos;
 2629
 2630	return new_map;
 2631}
 2632
 2633/* Copy xps maps at a given index */
 2634static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
 2635			      struct xps_dev_maps *new_dev_maps, int index,
 2636			      int tc, bool skip_tc)
 2637{
 2638	int i, tci = index * dev_maps->num_tc;
 2639	struct xps_map *map;
 2640
 2641	/* copy maps belonging to foreign traffic classes */
 2642	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 2643		if (i == tc && skip_tc)
 2644			continue;
 2645
 2646		/* fill in the new device map from the old device map */
 2647		map = xmap_dereference(dev_maps->attr_map[tci]);
 2648		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2649	}
 2650}
 2651
 2652/* Must be called under cpus_read_lock */
 2653int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 2654			  u16 index, enum xps_map_type type)
 2655{
 2656	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
 2657	const unsigned long *online_mask = NULL;
 2658	bool active = false, copy = false;
 2659	int i, j, tci, numa_node_id = -2;
 2660	int maps_sz, num_tc = 1, tc = 0;
 2661	struct xps_map *map, *new_map;
 2662	unsigned int nr_ids;
 2663
 2664	WARN_ON_ONCE(index >= dev->num_tx_queues);
 2665
 2666	if (dev->num_tc) {
 2667		/* Do not allow XPS on subordinate device directly */
 2668		num_tc = dev->num_tc;
 2669		if (num_tc < 0)
 2670			return -EINVAL;
 2671
 2672		/* If queue belongs to subordinate dev use its map */
 2673		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 2674
 2675		tc = netdev_txq_to_tc(dev, index);
 2676		if (tc < 0)
 2677			return -EINVAL;
 2678	}
 2679
 2680	mutex_lock(&xps_map_mutex);
 2681
 2682	dev_maps = xmap_dereference(dev->xps_maps[type]);
 2683	if (type == XPS_RXQS) {
 2684		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 2685		nr_ids = dev->num_rx_queues;
 2686	} else {
 2687		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 2688		if (num_possible_cpus() > 1)
 2689			online_mask = cpumask_bits(cpu_online_mask);
 2690		nr_ids = nr_cpu_ids;
 2691	}
 2692
 2693	if (maps_sz < L1_CACHE_BYTES)
 2694		maps_sz = L1_CACHE_BYTES;
 2695
 2696	/* The old dev_maps could be larger or smaller than the one we're
 2697	 * setting up now, as dev->num_tc or nr_ids could have been updated in
 2698	 * between. We could try to be smart, but let's be safe instead and only
 2699	 * copy foreign traffic classes if the two map sizes match.
 2700	 */
 2701	if (dev_maps &&
 2702	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
 2703		copy = true;
 2704
 2705	/* allocate memory for queue storage */
 2706	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
 2707	     j < nr_ids;) {
 2708		if (!new_dev_maps) {
 2709			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 2710			if (!new_dev_maps) {
 2711				mutex_unlock(&xps_map_mutex);
 2712				return -ENOMEM;
 2713			}
 2714
 2715			new_dev_maps->nr_ids = nr_ids;
 2716			new_dev_maps->num_tc = num_tc;
 2717		}
 2718
 2719		tci = j * num_tc + tc;
 2720		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
 
 2721
 2722		map = expand_xps_map(map, j, index, type == XPS_RXQS);
 2723		if (!map)
 2724			goto error;
 2725
 2726		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2727	}
 2728
 2729	if (!new_dev_maps)
 2730		goto out_no_new_maps;
 2731
 2732	if (!dev_maps) {
 2733		/* Increment static keys at most once per type */
 2734		static_key_slow_inc_cpuslocked(&xps_needed);
 2735		if (type == XPS_RXQS)
 2736			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 2737	}
 
 2738
 2739	for (j = 0; j < nr_ids; j++) {
 2740		bool skip_tc = false;
 2741
 2742		tci = j * num_tc + tc;
 2743		if (netif_attr_test_mask(j, mask, nr_ids) &&
 2744		    netif_attr_test_online(j, online_mask, nr_ids)) {
 2745			/* add tx-queue to CPU/rx-queue maps */
 2746			int pos = 0;
 2747
 2748			skip_tc = true;
 
 
 2749
 2750			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2751			while ((pos < map->len) && (map->queues[pos] != index))
 2752				pos++;
 2753
 2754			if (pos == map->len)
 2755				map->queues[map->len++] = index;
 2756#ifdef CONFIG_NUMA
 2757			if (type == XPS_CPUS) {
 2758				if (numa_node_id == -2)
 2759					numa_node_id = cpu_to_node(j);
 2760				else if (numa_node_id != cpu_to_node(j))
 2761					numa_node_id = -1;
 2762			}
 2763#endif
 
 
 
 
 2764		}
 2765
 2766		if (copy)
 2767			xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
 2768					  skip_tc);
 
 
 
 2769	}
 2770
 2771	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
 2772
 2773	/* Cleanup old maps */
 2774	if (!dev_maps)
 2775		goto out_no_old_maps;
 2776
 2777	for (j = 0; j < dev_maps->nr_ids; j++) {
 2778		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
 2779			map = xmap_dereference(dev_maps->attr_map[tci]);
 2780			if (!map)
 2781				continue;
 2782
 2783			if (copy) {
 2784				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2785				if (map == new_map)
 2786					continue;
 2787			}
 2788
 2789			RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2790			kfree_rcu(map, rcu);
 2791		}
 2792	}
 2793
 2794	old_dev_maps = dev_maps;
 2795
 2796out_no_old_maps:
 2797	dev_maps = new_dev_maps;
 2798	active = true;
 2799
 2800out_no_new_maps:
 2801	if (type == XPS_CPUS)
 2802		/* update Tx queue numa node */
 2803		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
 2804					     (numa_node_id >= 0) ?
 2805					     numa_node_id : NUMA_NO_NODE);
 2806
 2807	if (!dev_maps)
 2808		goto out_no_maps;
 2809
 2810	/* removes tx-queue from unused CPUs/rx-queues */
 2811	for (j = 0; j < dev_maps->nr_ids; j++) {
 2812		tci = j * dev_maps->num_tc;
 2813
 2814		for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 2815			if (i == tc &&
 2816			    netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
 2817			    netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
 2818				continue;
 2819
 2820			active |= remove_xps_queue(dev_maps,
 2821						   copy ? old_dev_maps : NULL,
 2822						   tci, index);
 2823		}
 2824	}
 2825
 2826	if (old_dev_maps)
 2827		kfree_rcu(old_dev_maps, rcu);
 2828
 2829	/* free map if not active */
 2830	if (!active)
 2831		reset_xps_maps(dev, dev_maps, type);
 
 
 2832
 2833out_no_maps:
 2834	mutex_unlock(&xps_map_mutex);
 2835
 2836	return 0;
 2837error:
 2838	/* remove any maps that we added */
 2839	for (j = 0; j < nr_ids; j++) {
 2840		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2841			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2842			map = copy ?
 2843			      xmap_dereference(dev_maps->attr_map[tci]) :
 2844			      NULL;
 2845			if (new_map && new_map != map)
 2846				kfree(new_map);
 2847		}
 2848	}
 2849
 2850	mutex_unlock(&xps_map_mutex);
 2851
 2852	kfree(new_dev_maps);
 2853	return -ENOMEM;
 2854}
 2855EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2856
 2857int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2858			u16 index)
 2859{
 2860	int ret;
 2861
 2862	cpus_read_lock();
 2863	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
 2864	cpus_read_unlock();
 2865
 2866	return ret;
 2867}
 2868EXPORT_SYMBOL(netif_set_xps_queue);
 2869
 2870#endif
 2871static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2872{
 2873	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2874
 2875	/* Unbind any subordinate channels */
 2876	while (txq-- != &dev->_tx[0]) {
 2877		if (txq->sb_dev)
 2878			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2879	}
 2880}
 2881
 2882void netdev_reset_tc(struct net_device *dev)
 2883{
 2884#ifdef CONFIG_XPS
 2885	netif_reset_xps_queues_gt(dev, 0);
 2886#endif
 2887	netdev_unbind_all_sb_channels(dev);
 2888
 2889	/* Reset TC configuration of device */
 2890	dev->num_tc = 0;
 2891	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2892	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2893}
 2894EXPORT_SYMBOL(netdev_reset_tc);
 2895
 2896int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2897{
 2898	if (tc >= dev->num_tc)
 2899		return -EINVAL;
 2900
 2901#ifdef CONFIG_XPS
 2902	netif_reset_xps_queues(dev, offset, count);
 2903#endif
 2904	dev->tc_to_txq[tc].count = count;
 2905	dev->tc_to_txq[tc].offset = offset;
 2906	return 0;
 2907}
 2908EXPORT_SYMBOL(netdev_set_tc_queue);
 2909
 2910int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2911{
 2912	if (num_tc > TC_MAX_QUEUE)
 2913		return -EINVAL;
 2914
 2915#ifdef CONFIG_XPS
 2916	netif_reset_xps_queues_gt(dev, 0);
 2917#endif
 2918	netdev_unbind_all_sb_channels(dev);
 2919
 2920	dev->num_tc = num_tc;
 2921	return 0;
 2922}
 2923EXPORT_SYMBOL(netdev_set_num_tc);
 2924
 2925void netdev_unbind_sb_channel(struct net_device *dev,
 2926			      struct net_device *sb_dev)
 2927{
 2928	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2929
 2930#ifdef CONFIG_XPS
 2931	netif_reset_xps_queues_gt(sb_dev, 0);
 2932#endif
 2933	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2934	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2935
 2936	while (txq-- != &dev->_tx[0]) {
 2937		if (txq->sb_dev == sb_dev)
 2938			txq->sb_dev = NULL;
 2939	}
 2940}
 2941EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2942
 2943int netdev_bind_sb_channel_queue(struct net_device *dev,
 2944				 struct net_device *sb_dev,
 2945				 u8 tc, u16 count, u16 offset)
 2946{
 2947	/* Make certain the sb_dev and dev are already configured */
 2948	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2949		return -EINVAL;
 2950
 2951	/* We cannot hand out queues we don't have */
 2952	if ((offset + count) > dev->real_num_tx_queues)
 2953		return -EINVAL;
 2954
 2955	/* Record the mapping */
 2956	sb_dev->tc_to_txq[tc].count = count;
 2957	sb_dev->tc_to_txq[tc].offset = offset;
 2958
 2959	/* Provide a way for Tx queue to find the tc_to_txq map or
 2960	 * XPS map for itself.
 2961	 */
 2962	while (count--)
 2963		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2964
 2965	return 0;
 2966}
 2967EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2968
 2969int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2970{
 2971	/* Do not use a multiqueue device to represent a subordinate channel */
 2972	if (netif_is_multiqueue(dev))
 2973		return -ENODEV;
 2974
 2975	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2976	 * Channel 0 is meant to be "native" mode and used only to represent
 2977	 * the main root device. We allow writing 0 to reset the device back
 2978	 * to normal mode after being used as a subordinate channel.
 2979	 */
 2980	if (channel > S16_MAX)
 2981		return -EINVAL;
 2982
 2983	dev->num_tc = -channel;
 2984
 2985	return 0;
 2986}
 2987EXPORT_SYMBOL(netdev_set_sb_channel);
 2988
 2989/*
 2990 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2991 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2992 */
 2993int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2994{
 2995	bool disabling;
 2996	int rc;
 2997
 2998	disabling = txq < dev->real_num_tx_queues;
 2999
 3000	if (txq < 1 || txq > dev->num_tx_queues)
 3001		return -EINVAL;
 3002
 3003	if (dev->reg_state == NETREG_REGISTERED ||
 3004	    dev->reg_state == NETREG_UNREGISTERING) {
 3005		ASSERT_RTNL();
 3006
 3007		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 3008						  txq);
 3009		if (rc)
 3010			return rc;
 3011
 3012		if (dev->num_tc)
 3013			netif_setup_tc(dev, txq);
 3014
 3015		net_shaper_set_real_num_tx_queues(dev, txq);
 3016
 3017		dev_qdisc_change_real_num_tx(dev, txq);
 3018
 3019		dev->real_num_tx_queues = txq;
 3020
 3021		if (disabling) {
 3022			synchronize_net();
 3023			qdisc_reset_all_tx_gt(dev, txq);
 3024#ifdef CONFIG_XPS
 3025			netif_reset_xps_queues_gt(dev, txq);
 3026#endif
 3027		}
 3028	} else {
 3029		dev->real_num_tx_queues = txq;
 3030	}
 3031
 
 3032	return 0;
 3033}
 3034EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 3035
 3036#ifdef CONFIG_SYSFS
 3037/**
 3038 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 3039 *	@dev: Network device
 3040 *	@rxq: Actual number of RX queues
 3041 *
 3042 *	This must be called either with the rtnl_lock held or before
 3043 *	registration of the net device.  Returns 0 on success, or a
 3044 *	negative error code.  If called before registration, it always
 3045 *	succeeds.
 3046 */
 3047int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 3048{
 3049	int rc;
 3050
 3051	if (rxq < 1 || rxq > dev->num_rx_queues)
 3052		return -EINVAL;
 3053
 3054	if (dev->reg_state == NETREG_REGISTERED) {
 3055		ASSERT_RTNL();
 3056
 3057		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 3058						  rxq);
 3059		if (rc)
 3060			return rc;
 3061	}
 3062
 3063	dev->real_num_rx_queues = rxq;
 3064	return 0;
 3065}
 3066EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 3067#endif
 3068
 3069/**
 3070 *	netif_set_real_num_queues - set actual number of RX and TX queues used
 3071 *	@dev: Network device
 3072 *	@txq: Actual number of TX queues
 3073 *	@rxq: Actual number of RX queues
 3074 *
 3075 *	Set the real number of both TX and RX queues.
 3076 *	Does nothing if the number of queues is already correct.
 3077 */
 3078int netif_set_real_num_queues(struct net_device *dev,
 3079			      unsigned int txq, unsigned int rxq)
 3080{
 3081	unsigned int old_rxq = dev->real_num_rx_queues;
 3082	int err;
 3083
 3084	if (txq < 1 || txq > dev->num_tx_queues ||
 3085	    rxq < 1 || rxq > dev->num_rx_queues)
 3086		return -EINVAL;
 3087
 3088	/* Start from increases, so the error path only does decreases -
 3089	 * decreases can't fail.
 3090	 */
 3091	if (rxq > dev->real_num_rx_queues) {
 3092		err = netif_set_real_num_rx_queues(dev, rxq);
 3093		if (err)
 3094			return err;
 3095	}
 3096	if (txq > dev->real_num_tx_queues) {
 3097		err = netif_set_real_num_tx_queues(dev, txq);
 3098		if (err)
 3099			goto undo_rx;
 3100	}
 3101	if (rxq < dev->real_num_rx_queues)
 3102		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
 3103	if (txq < dev->real_num_tx_queues)
 3104		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
 3105
 3106	return 0;
 3107undo_rx:
 3108	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
 3109	return err;
 3110}
 3111EXPORT_SYMBOL(netif_set_real_num_queues);
 3112
 3113/**
 3114 * netif_set_tso_max_size() - set the max size of TSO frames supported
 3115 * @dev:	netdev to update
 3116 * @size:	max skb->len of a TSO frame
 3117 *
 3118 * Set the limit on the size of TSO super-frames the device can handle.
 3119 * Unless explicitly set the stack will assume the value of
 3120 * %GSO_LEGACY_MAX_SIZE.
 3121 */
 3122void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
 3123{
 3124	dev->tso_max_size = min(GSO_MAX_SIZE, size);
 3125	if (size < READ_ONCE(dev->gso_max_size))
 3126		netif_set_gso_max_size(dev, size);
 3127	if (size < READ_ONCE(dev->gso_ipv4_max_size))
 3128		netif_set_gso_ipv4_max_size(dev, size);
 3129}
 3130EXPORT_SYMBOL(netif_set_tso_max_size);
 3131
 3132/**
 3133 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
 3134 * @dev:	netdev to update
 3135 * @segs:	max number of TCP segments
 3136 *
 3137 * Set the limit on the number of TCP segments the device can generate from
 3138 * a single TSO super-frame.
 3139 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
 3140 */
 3141void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
 3142{
 3143	dev->tso_max_segs = segs;
 3144	if (segs < READ_ONCE(dev->gso_max_segs))
 3145		netif_set_gso_max_segs(dev, segs);
 3146}
 3147EXPORT_SYMBOL(netif_set_tso_max_segs);
 3148
 3149/**
 3150 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
 3151 * @to:		netdev to update
 3152 * @from:	netdev from which to copy the limits
 3153 */
 3154void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
 3155{
 3156	netif_set_tso_max_size(to, from->tso_max_size);
 3157	netif_set_tso_max_segs(to, from->tso_max_segs);
 3158}
 3159EXPORT_SYMBOL(netif_inherit_tso_max);
 3160
 3161/**
 3162 * netif_get_num_default_rss_queues - default number of RSS queues
 3163 *
 3164 * Default value is the number of physical cores if there are only 1 or 2, or
 3165 * divided by 2 if there are more.
 3166 */
 3167int netif_get_num_default_rss_queues(void)
 3168{
 3169	cpumask_var_t cpus;
 3170	int cpu, count = 0;
 3171
 3172	if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
 3173		return 1;
 3174
 3175	cpumask_copy(cpus, cpu_online_mask);
 3176	for_each_cpu(cpu, cpus) {
 3177		++count;
 3178		cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
 3179	}
 3180	free_cpumask_var(cpus);
 3181
 3182	return count > 2 ? DIV_ROUND_UP(count, 2) : count;
 3183}
 3184EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3185
 3186static void __netif_reschedule(struct Qdisc *q)
 3187{
 3188	struct softnet_data *sd;
 3189	unsigned long flags;
 3190
 3191	local_irq_save(flags);
 3192	sd = this_cpu_ptr(&softnet_data);
 3193	q->next_sched = NULL;
 3194	*sd->output_queue_tailp = q;
 3195	sd->output_queue_tailp = &q->next_sched;
 3196	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3197	local_irq_restore(flags);
 3198}
 3199
 3200void __netif_schedule(struct Qdisc *q)
 3201{
 3202	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3203		__netif_reschedule(q);
 3204}
 3205EXPORT_SYMBOL(__netif_schedule);
 3206
 3207struct dev_kfree_skb_cb {
 3208	enum skb_drop_reason reason;
 3209};
 3210
 3211static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3212{
 3213	return (struct dev_kfree_skb_cb *)skb->cb;
 3214}
 3215
 3216void netif_schedule_queue(struct netdev_queue *txq)
 3217{
 3218	rcu_read_lock();
 3219	if (!netif_xmit_stopped(txq)) {
 3220		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3221
 3222		__netif_schedule(q);
 3223	}
 3224	rcu_read_unlock();
 3225}
 3226EXPORT_SYMBOL(netif_schedule_queue);
 3227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3228void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3229{
 3230	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3231		struct Qdisc *q;
 3232
 3233		rcu_read_lock();
 3234		q = rcu_dereference(dev_queue->qdisc);
 3235		__netif_schedule(q);
 3236		rcu_read_unlock();
 3237	}
 3238}
 3239EXPORT_SYMBOL(netif_tx_wake_queue);
 3240
 3241void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 3242{
 3243	unsigned long flags;
 3244
 3245	if (unlikely(!skb))
 3246		return;
 3247
 3248	if (likely(refcount_read(&skb->users) == 1)) {
 3249		smp_rmb();
 3250		refcount_set(&skb->users, 0);
 3251	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 3252		return;
 3253	}
 3254	get_kfree_skb_cb(skb)->reason = reason;
 3255	local_irq_save(flags);
 3256	skb->next = __this_cpu_read(softnet_data.completion_queue);
 3257	__this_cpu_write(softnet_data.completion_queue, skb);
 3258	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3259	local_irq_restore(flags);
 3260}
 3261EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
 3262
 3263void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 3264{
 3265	if (in_hardirq() || irqs_disabled())
 3266		dev_kfree_skb_irq_reason(skb, reason);
 3267	else
 3268		kfree_skb_reason(skb, reason);
 3269}
 3270EXPORT_SYMBOL(dev_kfree_skb_any_reason);
 3271
 3272
 3273/**
 3274 * netif_device_detach - mark device as removed
 3275 * @dev: network device
 3276 *
 3277 * Mark device as removed from system and therefore no longer available.
 3278 */
 3279void netif_device_detach(struct net_device *dev)
 3280{
 3281	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3282	    netif_running(dev)) {
 3283		netif_tx_stop_all_queues(dev);
 3284	}
 3285}
 3286EXPORT_SYMBOL(netif_device_detach);
 3287
 3288/**
 3289 * netif_device_attach - mark device as attached
 3290 * @dev: network device
 3291 *
 3292 * Mark device as attached from system and restart if needed.
 3293 */
 3294void netif_device_attach(struct net_device *dev)
 3295{
 3296	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3297	    netif_running(dev)) {
 3298		netif_tx_wake_all_queues(dev);
 3299		__netdev_watchdog_up(dev);
 3300	}
 3301}
 3302EXPORT_SYMBOL(netif_device_attach);
 3303
 3304/*
 3305 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3306 * to be used as a distribution range.
 3307 */
 3308static u16 skb_tx_hash(const struct net_device *dev,
 3309		       const struct net_device *sb_dev,
 3310		       struct sk_buff *skb)
 3311{
 3312	u32 hash;
 3313	u16 qoffset = 0;
 3314	u16 qcount = dev->real_num_tx_queues;
 3315
 3316	if (dev->num_tc) {
 3317		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3318
 3319		qoffset = sb_dev->tc_to_txq[tc].offset;
 3320		qcount = sb_dev->tc_to_txq[tc].count;
 3321		if (unlikely(!qcount)) {
 3322			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
 3323					     sb_dev->name, qoffset, tc);
 3324			qoffset = 0;
 3325			qcount = dev->real_num_tx_queues;
 3326		}
 3327	}
 3328
 3329	if (skb_rx_queue_recorded(skb)) {
 3330		DEBUG_NET_WARN_ON_ONCE(qcount == 0);
 3331		hash = skb_get_rx_queue(skb);
 3332		if (hash >= qoffset)
 3333			hash -= qoffset;
 3334		while (unlikely(hash >= qcount))
 3335			hash -= qcount;
 3336		return hash + qoffset;
 
 
 
 
 3337	}
 3338
 3339	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3340}
 
 3341
 3342void skb_warn_bad_offload(const struct sk_buff *skb)
 3343{
 3344	static const netdev_features_t null_features;
 3345	struct net_device *dev = skb->dev;
 3346	const char *name = "";
 3347
 3348	if (!net_ratelimit())
 3349		return;
 3350
 3351	if (dev) {
 3352		if (dev->dev.parent)
 3353			name = dev_driver_string(dev->dev.parent);
 3354		else
 3355			name = netdev_name(dev);
 3356	}
 3357	skb_dump(KERN_WARNING, skb, false);
 3358	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3359	     name, dev ? &dev->features : &null_features,
 3360	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 
 
 3361}
 3362
 3363/*
 3364 * Invalidate hardware checksum when packet is to be mangled, and
 3365 * complete checksum manually on outgoing path.
 3366 */
 3367int skb_checksum_help(struct sk_buff *skb)
 3368{
 3369	__wsum csum;
 3370	int ret = 0, offset;
 3371
 3372	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3373		goto out_set_summed;
 3374
 3375	if (unlikely(skb_is_gso(skb))) {
 3376		skb_warn_bad_offload(skb);
 3377		return -EINVAL;
 3378	}
 3379
 3380	if (!skb_frags_readable(skb)) {
 3381		return -EFAULT;
 3382	}
 3383
 3384	/* Before computing a checksum, we should make sure no frag could
 3385	 * be modified by an external entity : checksum could be wrong.
 3386	 */
 3387	if (skb_has_shared_frag(skb)) {
 3388		ret = __skb_linearize(skb);
 3389		if (ret)
 3390			goto out;
 3391	}
 3392
 3393	offset = skb_checksum_start_offset(skb);
 3394	ret = -EINVAL;
 3395	if (unlikely(offset >= skb_headlen(skb))) {
 3396		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
 3397		WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
 3398			  offset, skb_headlen(skb));
 3399		goto out;
 3400	}
 3401	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3402
 3403	offset += skb->csum_offset;
 3404	if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
 3405		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
 3406		WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
 3407			  offset + sizeof(__sum16), skb_headlen(skb));
 3408		goto out;
 
 
 3409	}
 3410	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3411	if (ret)
 3412		goto out;
 3413
 3414	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3415out_set_summed:
 3416	skb->ip_summed = CHECKSUM_NONE;
 3417out:
 3418	return ret;
 3419}
 3420EXPORT_SYMBOL(skb_checksum_help);
 3421
 3422int skb_crc32c_csum_help(struct sk_buff *skb)
 3423{
 3424	__le32 crc32c_csum;
 3425	int ret = 0, offset, start;
 3426
 3427	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3428		goto out;
 3429
 3430	if (unlikely(skb_is_gso(skb)))
 3431		goto out;
 3432
 3433	/* Before computing a checksum, we should make sure no frag could
 3434	 * be modified by an external entity : checksum could be wrong.
 3435	 */
 3436	if (unlikely(skb_has_shared_frag(skb))) {
 3437		ret = __skb_linearize(skb);
 3438		if (ret)
 3439			goto out;
 3440	}
 3441	start = skb_checksum_start_offset(skb);
 3442	offset = start + offsetof(struct sctphdr, checksum);
 3443	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3444		ret = -EINVAL;
 3445		goto out;
 3446	}
 3447
 3448	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3449	if (ret)
 3450		goto out;
 3451
 3452	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3453						  skb->len - start, ~(__u32)0,
 3454						  crc32c_csum_stub));
 3455	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3456	skb_reset_csum_not_inet(skb);
 3457out:
 3458	return ret;
 3459}
 3460EXPORT_SYMBOL(skb_crc32c_csum_help);
 3461
 3462__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3463{
 3464	__be16 type = skb->protocol;
 3465
 3466	/* Tunnel gso handlers can set protocol to ethernet. */
 3467	if (type == htons(ETH_P_TEB)) {
 3468		struct ethhdr *eth;
 3469
 3470		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3471			return 0;
 3472
 3473		eth = (struct ethhdr *)skb->data;
 3474		type = eth->h_proto;
 3475	}
 3476
 3477	return vlan_get_protocol_and_depth(skb, type, depth);
 3478}
 3479
 
 
 
 
 
 
 
 
 
 
 
 
 3480
 3481/* Take action when hardware reception checksum errors are detected. */
 3482#ifdef CONFIG_BUG
 3483static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3484{
 3485	netdev_err(dev, "hw csum failure\n");
 3486	skb_dump(KERN_ERR, skb, true);
 3487	dump_stack();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3488}
 
 3489
 3490void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 
 
 3491{
 3492	DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
 
 
 
 3493}
 3494EXPORT_SYMBOL(netdev_rx_csum_fault);
 3495#endif
 3496
 3497/* XXX: check that highmem exists at all on the given machine. */
 
 
 
 
 3498static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3499{
 3500#ifdef CONFIG_HIGHMEM
 3501	int i;
 3502
 3503	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3504		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3505			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3506			struct page *page = skb_frag_page(frag);
 
 
 
 
 
 
 3507
 3508			if (page && PageHighMem(page))
 
 
 
 
 
 3509				return 1;
 3510		}
 3511	}
 3512#endif
 3513	return 0;
 3514}
 3515
 3516/* If MPLS offload request, verify we are testing hardware MPLS features
 3517 * instead of standard features for the netdev.
 3518 */
 3519#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3520static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3521					   netdev_features_t features,
 3522					   __be16 type)
 3523{
 3524	if (eth_p_mpls(type))
 3525		features &= skb->dev->mpls_features;
 3526
 3527	return features;
 3528}
 3529#else
 3530static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3531					   netdev_features_t features,
 3532					   __be16 type)
 3533{
 3534	return features;
 3535}
 3536#endif
 3537
 3538static netdev_features_t harmonize_features(struct sk_buff *skb,
 3539	netdev_features_t features)
 3540{
 
 3541	__be16 type;
 3542
 3543	type = skb_network_protocol(skb, NULL);
 3544	features = net_mpls_features(skb, features, type);
 3545
 3546	if (skb->ip_summed != CHECKSUM_NONE &&
 3547	    !can_checksum_protocol(features, type)) {
 3548		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3549	}
 3550	if (illegal_highdma(skb->dev, skb))
 3551		features &= ~NETIF_F_SG;
 3552
 3553	return features;
 3554}
 3555
 3556netdev_features_t passthru_features_check(struct sk_buff *skb,
 3557					  struct net_device *dev,
 3558					  netdev_features_t features)
 3559{
 3560	return features;
 3561}
 3562EXPORT_SYMBOL(passthru_features_check);
 3563
 3564static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3565					     struct net_device *dev,
 3566					     netdev_features_t features)
 3567{
 3568	return vlan_features_check(skb, features);
 3569}
 3570
 3571static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3572					    struct net_device *dev,
 3573					    netdev_features_t features)
 3574{
 3575	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3576
 3577	if (gso_segs > READ_ONCE(dev->gso_max_segs))
 3578		return features & ~NETIF_F_GSO_MASK;
 3579
 3580	if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
 3581		return features & ~NETIF_F_GSO_MASK;
 3582
 3583	if (!skb_shinfo(skb)->gso_type) {
 3584		skb_warn_bad_offload(skb);
 3585		return features & ~NETIF_F_GSO_MASK;
 3586	}
 3587
 3588	/* Support for GSO partial features requires software
 3589	 * intervention before we can actually process the packets
 3590	 * so we need to strip support for any partial features now
 3591	 * and we can pull them back in after we have partially
 3592	 * segmented the frame.
 3593	 */
 3594	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3595		features &= ~dev->gso_partial_features;
 3596
 3597	/* Make sure to clear the IPv4 ID mangling feature if the
 3598	 * IPv4 header has the potential to be fragmented.
 3599	 */
 3600	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3601		struct iphdr *iph = skb->encapsulation ?
 3602				    inner_ip_hdr(skb) : ip_hdr(skb);
 3603
 3604		if (!(iph->frag_off & htons(IP_DF)))
 3605			features &= ~NETIF_F_TSO_MANGLEID;
 3606	}
 3607
 3608	return features;
 3609}
 3610
 3611netdev_features_t netif_skb_features(struct sk_buff *skb)
 3612{
 3613	struct net_device *dev = skb->dev;
 3614	netdev_features_t features = dev->features;
 3615
 3616	if (skb_is_gso(skb))
 3617		features = gso_features_check(skb, dev, features);
 3618
 3619	/* If encapsulation offload request, verify we are testing
 3620	 * hardware encapsulation features instead of standard
 3621	 * features for the netdev
 3622	 */
 3623	if (skb->encapsulation)
 3624		features &= dev->hw_enc_features;
 3625
 3626	if (skb_vlan_tagged(skb))
 3627		features = netdev_intersect_features(features,
 3628						     dev->vlan_features |
 3629						     NETIF_F_HW_VLAN_CTAG_TX |
 3630						     NETIF_F_HW_VLAN_STAG_TX);
 3631
 3632	if (dev->netdev_ops->ndo_features_check)
 3633		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3634								features);
 3635	else
 3636		features &= dflt_features_check(skb, dev, features);
 3637
 3638	return harmonize_features(skb, features);
 3639}
 3640EXPORT_SYMBOL(netif_skb_features);
 3641
 3642static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3643		    struct netdev_queue *txq, bool more)
 3644{
 3645	unsigned int len;
 3646	int rc;
 3647
 3648	if (dev_nit_active(dev))
 3649		dev_queue_xmit_nit(skb, dev);
 3650
 3651	len = skb->len;
 3652	trace_net_dev_start_xmit(skb, dev);
 3653	rc = netdev_start_xmit(skb, dev, txq, more);
 3654	trace_net_dev_xmit(skb, rc, dev, len);
 3655
 3656	return rc;
 3657}
 3658
 3659struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3660				    struct netdev_queue *txq, int *ret)
 3661{
 3662	struct sk_buff *skb = first;
 3663	int rc = NETDEV_TX_OK;
 3664
 3665	while (skb) {
 3666		struct sk_buff *next = skb->next;
 3667
 3668		skb_mark_not_on_list(skb);
 3669		rc = xmit_one(skb, dev, txq, next != NULL);
 3670		if (unlikely(!dev_xmit_complete(rc))) {
 3671			skb->next = next;
 3672			goto out;
 3673		}
 3674
 3675		skb = next;
 3676		if (netif_tx_queue_stopped(txq) && skb) {
 3677			rc = NETDEV_TX_BUSY;
 3678			break;
 3679		}
 3680	}
 3681
 3682out:
 3683	*ret = rc;
 3684	return skb;
 3685}
 3686
 3687static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3688					  netdev_features_t features)
 3689{
 3690	if (skb_vlan_tag_present(skb) &&
 3691	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3692		skb = __vlan_hwaccel_push_inside(skb);
 3693	return skb;
 3694}
 3695
 3696int skb_csum_hwoffload_help(struct sk_buff *skb,
 3697			    const netdev_features_t features)
 3698{
 3699	if (unlikely(skb_csum_is_sctp(skb)))
 3700		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3701			skb_crc32c_csum_help(skb);
 3702
 3703	if (features & NETIF_F_HW_CSUM)
 3704		return 0;
 3705
 3706	if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
 3707		if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
 3708		    skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
 3709		    !ipv6_has_hopopt_jumbo(skb))
 3710			goto sw_checksum;
 3711
 3712		switch (skb->csum_offset) {
 3713		case offsetof(struct tcphdr, check):
 3714		case offsetof(struct udphdr, check):
 3715			return 0;
 3716		}
 3717	}
 3718
 3719sw_checksum:
 3720	return skb_checksum_help(skb);
 3721}
 3722EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3723
 3724static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3725{
 3726	netdev_features_t features;
 3727
 3728	features = netif_skb_features(skb);
 3729	skb = validate_xmit_vlan(skb, features);
 3730	if (unlikely(!skb))
 3731		goto out_null;
 3732
 3733	skb = sk_validate_xmit_skb(skb, dev);
 3734	if (unlikely(!skb))
 3735		goto out_null;
 3736
 3737	if (netif_needs_gso(skb, features)) {
 3738		struct sk_buff *segs;
 3739
 3740		segs = skb_gso_segment(skb, features);
 3741		if (IS_ERR(segs)) {
 3742			goto out_kfree_skb;
 3743		} else if (segs) {
 3744			consume_skb(skb);
 3745			skb = segs;
 3746		}
 3747	} else {
 3748		if (skb_needs_linearize(skb, features) &&
 3749		    __skb_linearize(skb))
 3750			goto out_kfree_skb;
 3751
 3752		/* If packet is not checksummed and device does not
 3753		 * support checksumming for this protocol, complete
 3754		 * checksumming here.
 3755		 */
 3756		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3757			if (skb->encapsulation)
 3758				skb_set_inner_transport_header(skb,
 3759							       skb_checksum_start_offset(skb));
 3760			else
 3761				skb_set_transport_header(skb,
 3762							 skb_checksum_start_offset(skb));
 3763			if (skb_csum_hwoffload_help(skb, features))
 
 3764				goto out_kfree_skb;
 3765		}
 3766	}
 3767
 3768	skb = validate_xmit_xfrm(skb, features, again);
 3769
 3770	return skb;
 3771
 3772out_kfree_skb:
 3773	kfree_skb(skb);
 3774out_null:
 3775	dev_core_stats_tx_dropped_inc(dev);
 3776	return NULL;
 3777}
 3778
 3779struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3780{
 3781	struct sk_buff *next, *head = NULL, *tail;
 3782
 3783	for (; skb != NULL; skb = next) {
 3784		next = skb->next;
 3785		skb_mark_not_on_list(skb);
 3786
 3787		/* in case skb won't be segmented, point to itself */
 3788		skb->prev = skb;
 3789
 3790		skb = validate_xmit_skb(skb, dev, again);
 3791		if (!skb)
 3792			continue;
 3793
 3794		if (!head)
 3795			head = skb;
 3796		else
 3797			tail->next = skb;
 3798		/* If skb was segmented, skb->prev points to
 3799		 * the last segment. If not, it still contains skb.
 3800		 */
 3801		tail = skb->prev;
 3802	}
 3803	return head;
 3804}
 3805EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3806
 3807static void qdisc_pkt_len_init(struct sk_buff *skb)
 3808{
 3809	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3810
 3811	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3812
 3813	/* To get more precise estimation of bytes sent on wire,
 3814	 * we add to pkt_len the headers size of all segments
 3815	 */
 3816	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3817		u16 gso_segs = shinfo->gso_segs;
 3818		unsigned int hdr_len;
 
 3819
 3820		/* mac layer + network layer */
 3821		hdr_len = skb_transport_offset(skb);
 3822
 3823		/* + transport layer */
 3824		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3825			const struct tcphdr *th;
 3826			struct tcphdr _tcphdr;
 3827
 3828			th = skb_header_pointer(skb, hdr_len,
 3829						sizeof(_tcphdr), &_tcphdr);
 3830			if (likely(th))
 3831				hdr_len += __tcp_hdrlen(th);
 3832		} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
 3833			struct udphdr _udphdr;
 3834
 3835			if (skb_header_pointer(skb, hdr_len,
 3836					       sizeof(_udphdr), &_udphdr))
 3837				hdr_len += sizeof(struct udphdr);
 3838		}
 3839
 3840		if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
 3841			int payload = skb->len - hdr_len;
 3842
 3843			/* Malicious packet. */
 3844			if (payload <= 0)
 3845				return;
 3846			gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
 3847		}
 3848		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3849	}
 3850}
 3851
 3852static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
 3853			     struct sk_buff **to_free,
 3854			     struct netdev_queue *txq)
 3855{
 3856	int rc;
 3857
 3858	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
 3859	if (rc == NET_XMIT_SUCCESS)
 3860		trace_qdisc_enqueue(q, txq, skb);
 3861	return rc;
 3862}
 3863
 3864static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 3865				 struct net_device *dev,
 3866				 struct netdev_queue *txq)
 3867{
 3868	spinlock_t *root_lock = qdisc_lock(q);
 3869	struct sk_buff *to_free = NULL;
 3870	bool contended;
 3871	int rc;
 3872
 3873	qdisc_calculate_pkt_len(skb, q);
 3874
 3875	tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
 3876
 3877	if (q->flags & TCQ_F_NOLOCK) {
 3878		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
 3879		    qdisc_run_begin(q)) {
 3880			/* Retest nolock_qdisc_is_empty() within the protection
 3881			 * of q->seqlock to protect from racing with requeuing.
 3882			 */
 3883			if (unlikely(!nolock_qdisc_is_empty(q))) {
 3884				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 3885				__qdisc_run(q);
 3886				qdisc_run_end(q);
 3887
 3888				goto no_lock_out;
 3889			}
 3890
 3891			qdisc_bstats_cpu_update(q, skb);
 3892			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
 3893			    !nolock_qdisc_is_empty(q))
 3894				__qdisc_run(q);
 3895
 3896			qdisc_run_end(q);
 3897			return NET_XMIT_SUCCESS;
 3898		}
 3899
 3900		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 3901		qdisc_run(q);
 3902
 3903no_lock_out:
 3904		if (unlikely(to_free))
 3905			kfree_skb_list_reason(to_free,
 3906					      tcf_get_drop_reason(to_free));
 3907		return rc;
 3908	}
 3909
 3910	if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
 3911		kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
 3912		return NET_XMIT_DROP;
 3913	}
 3914	/*
 3915	 * Heuristic to force contended enqueues to serialize on a
 3916	 * separate lock before trying to get qdisc main lock.
 3917	 * This permits qdisc->running owner to get the lock more
 3918	 * often and dequeue packets faster.
 3919	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
 3920	 * and then other tasks will only enqueue packets. The packets will be
 3921	 * sent after the qdisc owner is scheduled again. To prevent this
 3922	 * scenario the task always serialize on the lock.
 3923	 */
 3924	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
 3925	if (unlikely(contended))
 3926		spin_lock(&q->busylock);
 3927
 3928	spin_lock(root_lock);
 3929	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 3930		__qdisc_drop(skb, &to_free);
 3931		rc = NET_XMIT_DROP;
 3932	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
 3933		   qdisc_run_begin(q)) {
 3934		/*
 3935		 * This is a work-conserving queue; there are no old skbs
 3936		 * waiting to be sent out; and the qdisc is not running -
 3937		 * xmit the skb directly.
 3938		 */
 3939
 3940		qdisc_bstats_update(q, skb);
 3941
 3942		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
 3943			if (unlikely(contended)) {
 3944				spin_unlock(&q->busylock);
 3945				contended = false;
 3946			}
 3947			__qdisc_run(q);
 3948		}
 
 3949
 3950		qdisc_run_end(q);
 3951		rc = NET_XMIT_SUCCESS;
 3952	} else {
 3953		WRITE_ONCE(q->owner, smp_processor_id());
 3954		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 3955		WRITE_ONCE(q->owner, -1);
 3956		if (qdisc_run_begin(q)) {
 3957			if (unlikely(contended)) {
 3958				spin_unlock(&q->busylock);
 3959				contended = false;
 3960			}
 3961			__qdisc_run(q);
 3962			qdisc_run_end(q);
 3963		}
 3964	}
 3965	spin_unlock(root_lock);
 3966	if (unlikely(to_free))
 3967		kfree_skb_list_reason(to_free,
 3968				      tcf_get_drop_reason(to_free));
 3969	if (unlikely(contended))
 3970		spin_unlock(&q->busylock);
 3971	return rc;
 3972}
 3973
 3974#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3975static void skb_update_prio(struct sk_buff *skb)
 3976{
 3977	const struct netprio_map *map;
 3978	const struct sock *sk;
 3979	unsigned int prioidx;
 3980
 3981	if (skb->priority)
 3982		return;
 3983	map = rcu_dereference_bh(skb->dev->priomap);
 3984	if (!map)
 3985		return;
 3986	sk = skb_to_full_sk(skb);
 3987	if (!sk)
 3988		return;
 3989
 3990	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3991
 3992	if (prioidx < map->priomap_len)
 3993		skb->priority = map->priomap[prioidx];
 3994}
 3995#else
 3996#define skb_update_prio(skb)
 3997#endif
 3998
 
 
 
 3999/**
 4000 *	dev_loopback_xmit - loop back @skb
 4001 *	@net: network namespace this loopback is happening in
 4002 *	@sk:  sk needed to be a netfilter okfn
 4003 *	@skb: buffer to transmit
 4004 */
 4005int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 4006{
 4007	skb_reset_mac_header(skb);
 4008	__skb_pull(skb, skb_network_offset(skb));
 4009	skb->pkt_type = PACKET_LOOPBACK;
 4010	if (skb->ip_summed == CHECKSUM_NONE)
 4011		skb->ip_summed = CHECKSUM_UNNECESSARY;
 4012	DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
 4013	skb_dst_force(skb);
 4014	netif_rx(skb);
 4015	return 0;
 4016}
 4017EXPORT_SYMBOL(dev_loopback_xmit);
 4018
 4019#ifdef CONFIG_NET_EGRESS
 4020static struct netdev_queue *
 4021netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
 4022{
 4023	int qm = skb_get_queue_mapping(skb);
 4024
 4025	return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
 4026}
 4027
 4028#ifndef CONFIG_PREEMPT_RT
 4029static bool netdev_xmit_txqueue_skipped(void)
 4030{
 4031	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
 4032}
 4033
 4034void netdev_xmit_skip_txqueue(bool skip)
 4035{
 4036	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
 4037}
 4038EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
 4039
 4040#else
 4041static bool netdev_xmit_txqueue_skipped(void)
 4042{
 4043	return current->net_xmit.skip_txqueue;
 4044}
 4045
 4046void netdev_xmit_skip_txqueue(bool skip)
 4047{
 4048	current->net_xmit.skip_txqueue = skip;
 4049}
 4050EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
 4051#endif
 4052#endif /* CONFIG_NET_EGRESS */
 4053
 4054#ifdef CONFIG_NET_XGRESS
 4055static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
 4056		  enum skb_drop_reason *drop_reason)
 4057{
 4058	int ret = TC_ACT_UNSPEC;
 4059#ifdef CONFIG_NET_CLS_ACT
 4060	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
 4061	struct tcf_result res;
 4062
 4063	if (!miniq)
 4064		return ret;
 4065
 4066	/* Global bypass */
 4067	if (!static_branch_likely(&tcf_sw_enabled_key))
 4068		return ret;
 4069
 4070	/* Block-wise bypass */
 4071	if (tcf_block_bypass_sw(miniq->block))
 4072		return ret;
 
 4073
 4074	tc_skb_cb(skb)->mru = 0;
 4075	tc_skb_cb(skb)->post_ct = false;
 4076	tcf_set_drop_reason(skb, *drop_reason);
 4077
 4078	mini_qdisc_bstats_cpu_update(miniq, skb);
 4079	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
 4080	/* Only tcf related quirks below. */
 4081	switch (ret) {
 4082	case TC_ACT_SHOT:
 4083		*drop_reason = tcf_get_drop_reason(skb);
 4084		mini_qdisc_qstats_cpu_drop(miniq);
 4085		break;
 4086	case TC_ACT_OK:
 4087	case TC_ACT_RECLASSIFY:
 4088		skb->tc_index = TC_H_MIN(res.classid);
 4089		break;
 4090	}
 4091#endif /* CONFIG_NET_CLS_ACT */
 4092	return ret;
 4093}
 4094
 4095static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
 4096
 4097void tcx_inc(void)
 4098{
 4099	static_branch_inc(&tcx_needed_key);
 4100}
 4101
 4102void tcx_dec(void)
 4103{
 4104	static_branch_dec(&tcx_needed_key);
 4105}
 4106
 4107static __always_inline enum tcx_action_base
 4108tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
 4109	const bool needs_mac)
 4110{
 4111	const struct bpf_mprog_fp *fp;
 4112	const struct bpf_prog *prog;
 4113	int ret = TCX_NEXT;
 4114
 4115	if (needs_mac)
 4116		__skb_push(skb, skb->mac_len);
 4117	bpf_mprog_foreach_prog(entry, fp, prog) {
 4118		bpf_compute_data_pointers(skb);
 4119		ret = bpf_prog_run(prog, skb);
 4120		if (ret != TCX_NEXT)
 4121			break;
 4122	}
 4123	if (needs_mac)
 4124		__skb_pull(skb, skb->mac_len);
 4125	return tcx_action_code(skb, ret);
 4126}
 4127
 4128static __always_inline struct sk_buff *
 4129sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4130		   struct net_device *orig_dev, bool *another)
 4131{
 4132	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
 4133	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
 4134	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 4135	int sch_ret;
 4136
 4137	if (!entry)
 4138		return skb;
 4139
 4140	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 4141	if (*pt_prev) {
 4142		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4143		*pt_prev = NULL;
 4144	}
 4145
 4146	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4147	tcx_set_ingress(skb, true);
 4148
 4149	if (static_branch_unlikely(&tcx_needed_key)) {
 4150		sch_ret = tcx_run(entry, skb, true);
 4151		if (sch_ret != TC_ACT_UNSPEC)
 4152			goto ingress_verdict;
 4153	}
 4154	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
 4155ingress_verdict:
 4156	switch (sch_ret) {
 4157	case TC_ACT_REDIRECT:
 4158		/* skb_mac_header check was done by BPF, so we can safely
 4159		 * push the L2 header back before redirecting to another
 4160		 * netdev.
 4161		 */
 4162		__skb_push(skb, skb->mac_len);
 4163		if (skb_do_redirect(skb) == -EAGAIN) {
 4164			__skb_pull(skb, skb->mac_len);
 4165			*another = true;
 4166			break;
 4167		}
 4168		*ret = NET_RX_SUCCESS;
 4169		bpf_net_ctx_clear(bpf_net_ctx);
 4170		return NULL;
 4171	case TC_ACT_SHOT:
 4172		kfree_skb_reason(skb, drop_reason);
 4173		*ret = NET_RX_DROP;
 4174		bpf_net_ctx_clear(bpf_net_ctx);
 4175		return NULL;
 4176	/* used by tc_run */
 4177	case TC_ACT_STOLEN:
 4178	case TC_ACT_QUEUED:
 4179	case TC_ACT_TRAP:
 4180		consume_skb(skb);
 4181		fallthrough;
 4182	case TC_ACT_CONSUMED:
 4183		*ret = NET_RX_SUCCESS;
 4184		bpf_net_ctx_clear(bpf_net_ctx);
 4185		return NULL;
 4186	}
 4187	bpf_net_ctx_clear(bpf_net_ctx);
 4188
 4189	return skb;
 4190}
 4191
 4192static __always_inline struct sk_buff *
 4193sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 4194{
 4195	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
 4196	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
 4197	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 4198	int sch_ret;
 4199
 4200	if (!entry)
 4201		return skb;
 4202
 4203	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 4204
 4205	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
 4206	 * already set by the caller.
 4207	 */
 4208	if (static_branch_unlikely(&tcx_needed_key)) {
 4209		sch_ret = tcx_run(entry, skb, false);
 4210		if (sch_ret != TC_ACT_UNSPEC)
 4211			goto egress_verdict;
 4212	}
 4213	sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
 4214egress_verdict:
 4215	switch (sch_ret) {
 4216	case TC_ACT_REDIRECT:
 4217		/* No need to push/pop skb's mac_header here on egress! */
 4218		skb_do_redirect(skb);
 4219		*ret = NET_XMIT_SUCCESS;
 4220		bpf_net_ctx_clear(bpf_net_ctx);
 4221		return NULL;
 4222	case TC_ACT_SHOT:
 4223		kfree_skb_reason(skb, drop_reason);
 4224		*ret = NET_XMIT_DROP;
 4225		bpf_net_ctx_clear(bpf_net_ctx);
 4226		return NULL;
 4227	/* used by tc_run */
 4228	case TC_ACT_STOLEN:
 4229	case TC_ACT_QUEUED:
 4230	case TC_ACT_TRAP:
 4231		consume_skb(skb);
 4232		fallthrough;
 4233	case TC_ACT_CONSUMED:
 4234		*ret = NET_XMIT_SUCCESS;
 4235		bpf_net_ctx_clear(bpf_net_ctx);
 4236		return NULL;
 
 
 4237	}
 4238	bpf_net_ctx_clear(bpf_net_ctx);
 4239
 4240	return skb;
 4241}
 4242#else
 4243static __always_inline struct sk_buff *
 4244sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4245		   struct net_device *orig_dev, bool *another)
 4246{
 4247	return skb;
 4248}
 4249
 4250static __always_inline struct sk_buff *
 4251sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 4252{
 4253	return skb;
 4254}
 4255#endif /* CONFIG_NET_XGRESS */
 4256
 4257#ifdef CONFIG_XPS
 4258static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 4259			       struct xps_dev_maps *dev_maps, unsigned int tci)
 4260{
 4261	int tc = netdev_get_prio_tc_map(dev, skb->priority);
 4262	struct xps_map *map;
 4263	int queue_index = -1;
 4264
 4265	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
 4266		return queue_index;
 4267
 4268	tci *= dev_maps->num_tc;
 4269	tci += tc;
 4270
 4271	map = rcu_dereference(dev_maps->attr_map[tci]);
 4272	if (map) {
 4273		if (map->len == 1)
 4274			queue_index = map->queues[0];
 4275		else
 4276			queue_index = map->queues[reciprocal_scale(
 4277						skb_get_hash(skb), map->len)];
 4278		if (unlikely(queue_index >= dev->real_num_tx_queues))
 4279			queue_index = -1;
 4280	}
 4281	return queue_index;
 4282}
 4283#endif
 4284
 4285static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 4286			 struct sk_buff *skb)
 4287{
 4288#ifdef CONFIG_XPS
 4289	struct xps_dev_maps *dev_maps;
 4290	struct sock *sk = skb->sk;
 4291	int queue_index = -1;
 4292
 4293	if (!static_key_false(&xps_needed))
 4294		return -1;
 4295
 4296	rcu_read_lock();
 4297	if (!static_key_false(&xps_rxqs_needed))
 4298		goto get_cpus_map;
 4299
 4300	dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
 4301	if (dev_maps) {
 4302		int tci = sk_rx_queue_get(sk);
 4303
 4304		if (tci >= 0)
 4305			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 4306							  tci);
 4307	}
 4308
 4309get_cpus_map:
 4310	if (queue_index < 0) {
 4311		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
 4312		if (dev_maps) {
 4313			unsigned int tci = skb->sender_cpu - 1;
 4314
 4315			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 4316							  tci);
 
 
 
 
 
 
 
 4317		}
 4318	}
 4319	rcu_read_unlock();
 4320
 4321	return queue_index;
 4322#else
 4323	return -1;
 4324#endif
 4325}
 4326
 4327u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 4328		     struct net_device *sb_dev)
 4329{
 4330	return 0;
 4331}
 4332EXPORT_SYMBOL(dev_pick_tx_zero);
 4333
 4334u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 4335		     struct net_device *sb_dev)
 4336{
 4337	struct sock *sk = skb->sk;
 4338	int queue_index = sk_tx_queue_get(sk);
 4339
 4340	sb_dev = sb_dev ? : dev;
 4341
 4342	if (queue_index < 0 || skb->ooo_okay ||
 4343	    queue_index >= dev->real_num_tx_queues) {
 4344		int new_index = get_xps_queue(dev, sb_dev, skb);
 4345
 4346		if (new_index < 0)
 4347			new_index = skb_tx_hash(dev, sb_dev, skb);
 4348
 4349		if (queue_index != new_index && sk &&
 4350		    sk_fullsock(sk) &&
 4351		    rcu_access_pointer(sk->sk_dst_cache))
 4352			sk_tx_queue_set(sk, new_index);
 4353
 4354		queue_index = new_index;
 4355	}
 4356
 4357	return queue_index;
 4358}
 4359EXPORT_SYMBOL(netdev_pick_tx);
 4360
 4361struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4362					 struct sk_buff *skb,
 4363					 struct net_device *sb_dev)
 4364{
 4365	int queue_index = 0;
 4366
 4367#ifdef CONFIG_XPS
 4368	u32 sender_cpu = skb->sender_cpu - 1;
 4369
 4370	if (sender_cpu >= (u32)NR_CPUS)
 4371		skb->sender_cpu = raw_smp_processor_id() + 1;
 4372#endif
 4373
 4374	if (dev->real_num_tx_queues != 1) {
 4375		const struct net_device_ops *ops = dev->netdev_ops;
 4376
 4377		if (ops->ndo_select_queue)
 4378			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 
 4379		else
 4380			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4381
 4382		queue_index = netdev_cap_txqueue(dev, queue_index);
 
 4383	}
 4384
 4385	skb_set_queue_mapping(skb, queue_index);
 4386	return netdev_get_tx_queue(dev, queue_index);
 4387}
 4388
/**
 * __dev_queue_xmit() - transmit a buffer
 * @skb:	buffer to transmit
 * @sb_dev:	subordinate device used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * When calling this method, interrupts MUST be enabled. This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method. (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * Return:
 * * 0				- buffer successfully transmitted
 * * positive qdisc return code	- NET_XMIT_DROP etc.
 * * negative errno		- other errors (-ENETDOWN when the skb is
 *				  dropped on the queue-less device path)
 */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq = NULL;
	struct Qdisc *q;
	int rc = -ENOMEM;
	bool again = false;

	skb_reset_mac_header(skb);
	skb_assert_len(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
	tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
	if (static_branch_unlikely(&egress_needed_key)) {
		if (nf_hook_egress_active()) {
			skb = nf_hook_egress(skb, &rc, dev);
			if (!skb)
				goto out;
		}

		/* Clear the flag so that only a request made while the
		 * egress hook below runs is seen afterwards.
		 */
		netdev_xmit_skip_txqueue(false);

		nf_skip_egress(skb, true);
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
		nf_skip_egress(skb, false);

		/* The egress hook asked to skip queue selection: use the
		 * queue mapping already stored in the skb.
		 */
		if (netdev_xmit_txqueue_skipped())
			txq = netdev_tx_queue_mapping(dev, skb);
	}
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	if (!txq)
		txq = netdev_core_pick_tx(dev, skb, sb_dev);

	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here (e.g. loopback and IP tunnels are clean, ignoring statistics
	 * counters). However, it is possible that they rely on the
	 * protection provided by us here.
	 *
	 * So check the lock owner and take the lock; this is not prone to
	 * deadlocks. The alternative would be to drop the noqueue qdisc,
	 * which is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Other cpus might concurrently change txq->xmit_lock_owner
		 * to -1 or to their cpu id, but not to our id.
		 */
		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
			if (dev_xmit_recursion())
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev, &again);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				dev_xmit_recursion_inc();
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				dev_xmit_recursion_dec();
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Drop path: device down, transmit not completed, or recursion. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	dev_core_stats_tx_dropped_inc(dev);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(__dev_queue_xmit);
 4533
 4534int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4535{
 4536	struct net_device *dev = skb->dev;
 4537	struct sk_buff *orig_skb = skb;
 4538	struct netdev_queue *txq;
 4539	int ret = NETDEV_TX_BUSY;
 4540	bool again = false;
 4541
 4542	if (unlikely(!netif_running(dev) ||
 4543		     !netif_carrier_ok(dev)))
 4544		goto drop;
 4545
 4546	skb = validate_xmit_skb_list(skb, dev, &again);
 4547	if (skb != orig_skb)
 4548		goto drop;
 4549
 4550	skb_set_queue_mapping(skb, queue_id);
 4551	txq = skb_get_tx_queue(dev, skb);
 
 
 
 4552
 4553	local_bh_disable();
 4554
 4555	dev_xmit_recursion_inc();
 4556	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4557	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4558		ret = netdev_start_xmit(skb, dev, txq, false);
 4559	HARD_TX_UNLOCK(dev, txq);
 4560	dev_xmit_recursion_dec();
 4561
 4562	local_bh_enable();
 4563	return ret;
 4564drop:
 4565	dev_core_stats_tx_dropped_inc(dev);
 4566	kfree_skb_list(skb);
 4567	return NET_XMIT_DROP;
 4568}
 4569EXPORT_SYMBOL(__dev_direct_xmit);
 4570
 4571/*************************************************************************
 4572 *			Receiver routines
 4573 *************************************************************************/
 4574static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
 4575
 4576int weight_p __read_mostly = 64;           /* old backlog weight */
 4577int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 4578int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 4579
 4580/* Called with irq disabled */
 4581static inline void ____napi_schedule(struct softnet_data *sd,
 4582				     struct napi_struct *napi)
 4583{
 4584	struct task_struct *thread;
 4585
 4586	lockdep_assert_irqs_disabled();
 4587
 4588	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 4589		/* Paired with smp_mb__before_atomic() in
 4590		 * napi_enable()/dev_set_threaded().
 4591		 * Use READ_ONCE() to guarantee a complete
 4592		 * read on napi->thread. Only call
 4593		 * wake_up_process() when it's not NULL.
 4594		 */
 4595		thread = READ_ONCE(napi->thread);
 4596		if (thread) {
 4597			if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
 4598				goto use_local_napi;
 4599
 4600			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 4601			wake_up_process(thread);
 4602			return;
 4603		}
 4604	}
 4605
 4606use_local_napi:
 4607	list_add_tail(&napi->poll_list, &sd->poll_list);
 4608	WRITE_ONCE(napi->list_owner, smp_processor_id());
 4609	/* If not called from net_rx_action()
 4610	 * we have to raise NET_RX_SOFTIRQ.
 4611	 */
 4612	if (!sd->in_net_rx_action)
 4613		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4614}
 4615
 4616#ifdef CONFIG_RPS
 4617
 4618struct static_key_false rps_needed __read_mostly;
 
 
 
 
 
 
 4619EXPORT_SYMBOL(rps_needed);
 4620struct static_key_false rfs_needed __read_mostly;
 4621EXPORT_SYMBOL(rfs_needed);
 4622
 4623static struct rps_dev_flow *
 4624set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4625	    struct rps_dev_flow *rflow, u16 next_cpu)
 4626{
 4627	if (next_cpu < nr_cpu_ids) {
 4628		u32 head;
 4629#ifdef CONFIG_RFS_ACCEL
 4630		struct netdev_rx_queue *rxqueue;
 4631		struct rps_dev_flow_table *flow_table;
 4632		struct rps_dev_flow *old_rflow;
 4633		u16 rxq_index;
 4634		u32 flow_id;
 
 4635		int rc;
 4636
 4637		/* Should we steer this flow to a different hardware queue? */
 4638		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 4639		    !(dev->features & NETIF_F_NTUPLE))
 4640			goto out;
 4641		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 4642		if (rxq_index == skb_get_rx_queue(skb))
 4643			goto out;
 4644
 4645		rxqueue = dev->_rx + rxq_index;
 4646		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4647		if (!flow_table)
 4648			goto out;
 4649		flow_id = skb_get_hash(skb) & flow_table->mask;
 4650		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 4651							rxq_index, flow_id);
 4652		if (rc < 0)
 4653			goto out;
 4654		old_rflow = rflow;
 4655		rflow = &flow_table->flows[flow_id];
 4656		WRITE_ONCE(rflow->filter, rc);
 4657		if (old_rflow->filter == rc)
 4658			WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
 4659	out:
 4660#endif
 4661		head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
 4662		rps_input_queue_tail_save(&rflow->last_qtail, head);
 4663	}
 4664
 4665	WRITE_ONCE(rflow->cpu, next_cpu);
 4666	return rflow;
 4667}
 4668
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns the selected CPU, or -1 when none was selected (invalid recorded
 * RX queue, neither a flow table nor an RPS map on the queue, a zero hash,
 * or no online candidate CPU). *rflowp is written only when the CPU comes
 * from the per-queue flow table (RFS path).
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	/* Locate the RX queue the skb was recorded on; default is queue 0. */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match.
		 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
		 */
		ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
		/* The bits outside rps_cpu_mask must match the hash. */
		if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & net_hotdata.rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	/* Fall back to the RPS map: scale the hash onto the map's CPUs. */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
 4770
 4771#ifdef CONFIG_RFS_ACCEL
 4772
 4773/**
 4774 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4775 * @dev: Device on which the filter was set
 4776 * @rxq_index: RX queue index
 4777 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4778 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4779 *
 4780 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4781 * this function for each installed filter and remove the filters for
 4782 * which it returns %true.
 4783 */
 4784bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4785			 u32 flow_id, u16 filter_id)
 4786{
 4787	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4788	struct rps_dev_flow_table *flow_table;
 4789	struct rps_dev_flow *rflow;
 4790	bool expire = true;
 4791	unsigned int cpu;
 4792
 4793	rcu_read_lock();
 4794	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4795	if (flow_table && flow_id <= flow_table->mask) {
 4796		rflow = &flow_table->flows[flow_id];
 4797		cpu = READ_ONCE(rflow->cpu);
 4798		if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
 4799		    ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
 4800			   READ_ONCE(rflow->last_qtail)) <
 4801		     (int)(10 * flow_table->mask)))
 4802			expire = false;
 4803	}
 4804	rcu_read_unlock();
 4805	return expire;
 4806}
 4807EXPORT_SYMBOL(rps_may_expire_flow);
 4808
 4809#endif /* CONFIG_RFS_ACCEL */
 4810
 4811/* Called from hardirq (IPI) context */
 4812static void rps_trigger_softirq(void *data)
 4813{
 4814	struct softnet_data *sd = data;
 4815
 4816	____napi_schedule(sd, &sd->backlog);
 4817	sd->received_rps++;
 4818}
 4819
 4820#endif /* CONFIG_RPS */
 4821
 4822/* Called from hardirq (IPI) context */
 4823static void trigger_rx_softirq(void *data)
 4824{
 4825	struct softnet_data *sd = data;
 4826
 4827	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4828	smp_store_release(&sd->defer_ipi_scheduled, 0);
 4829}
 4830
 4831/*
 4832 * After we queued a packet into sd->input_pkt_queue,
 4833 * we need to make sure this queue is serviced soon.
 4834 *
 4835 * - If this is another cpu queue, link it to our rps_ipi_list,
 4836 *   and make sure we will process rps_ipi_list from net_rx_action().
 4837 *
 4838 * - If this is our own queue, NAPI schedule our backlog.
 4839 *   Note that this also raises NET_RX_SOFTIRQ.
 4840 */
 4841static void napi_schedule_rps(struct softnet_data *sd)
 4842{
 
 4843	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4844
 4845#ifdef CONFIG_RPS
 4846	if (sd != mysd) {
 4847		if (use_backlog_threads()) {
 4848			__napi_schedule_irqoff(&sd->backlog);
 4849			return;
 4850		}
 4851
 4852		sd->rps_ipi_next = mysd->rps_ipi_list;
 4853		mysd->rps_ipi_list = sd;
 4854
 4855		/* If not called from net_rx_action() or napi_threaded_poll()
 4856		 * we have to raise NET_RX_SOFTIRQ.
 4857		 */
 4858		if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
 4859			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4860		return;
 4861	}
 4862#endif /* CONFIG_RPS */
 4863	__napi_schedule_irqoff(&mysd->backlog);
 4864}
 4865
 4866void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
 4867{
 4868	unsigned long flags;
 4869
 4870	if (use_backlog_threads()) {
 4871		backlog_lock_irq_save(sd, &flags);
 4872
 4873		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
 4874			__napi_schedule_irqoff(&sd->backlog);
 4875
 4876		backlog_unlock_irq_restore(sd, &flags);
 4877
 4878	} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
 4879		smp_call_function_single_async(cpu, &sd->defer_csd);
 4880	}
 4881}
 4882
 4883#ifdef CONFIG_NET_FLOW_LIMIT
 4884int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 4885#endif
 4886
 4887static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4888{
 4889#ifdef CONFIG_NET_FLOW_LIMIT
 4890	struct sd_flow_limit *fl;
 4891	struct softnet_data *sd;
 4892	unsigned int old_flow, new_flow;
 4893
 4894	if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
 4895		return false;
 4896
 4897	sd = this_cpu_ptr(&softnet_data);
 4898
 4899	rcu_read_lock();
 4900	fl = rcu_dereference(sd->flow_limit);
 4901	if (fl) {
 4902		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4903		old_flow = fl->history[fl->history_head];
 4904		fl->history[fl->history_head] = new_flow;
 4905
 4906		fl->history_head++;
 4907		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4908
 4909		if (likely(fl->buckets[old_flow]))
 4910			fl->buckets[old_flow]--;
 4911
 4912		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4913			fl->count++;
 4914			rcu_read_unlock();
 4915			return true;
 4916		}
 4917	}
 4918	rcu_read_unlock();
 4919#endif
 4920	return false;
 4921}
 4922
 4923/*
 4924 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4925 * queue (may be a remote CPU queue).
 4926 */
 4927static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4928			      unsigned int *qtail)
 4929{
 4930	enum skb_drop_reason reason;
 4931	struct softnet_data *sd;
 4932	unsigned long flags;
 4933	unsigned int qlen;
 4934	int max_backlog;
 4935	u32 tail;
 4936
 4937	reason = SKB_DROP_REASON_DEV_READY;
 4938	if (!netif_running(skb->dev))
 4939		goto bad_dev;
 4940
 4941	reason = SKB_DROP_REASON_CPU_BACKLOG;
 4942	sd = &per_cpu(softnet_data, cpu);
 4943
 4944	qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
 4945	max_backlog = READ_ONCE(net_hotdata.max_backlog);
 4946	if (unlikely(qlen > max_backlog))
 4947		goto cpu_backlog_drop;
 4948	backlog_lock_irq_save(sd, &flags);
 4949	qlen = skb_queue_len(&sd->input_pkt_queue);
 4950	if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
 4951		if (!qlen) {
 4952			/* Schedule NAPI for backlog device. We can use
 4953			 * non atomic operation as we own the queue lock.
 4954			 */
 4955			if (!__test_and_set_bit(NAPI_STATE_SCHED,
 4956						&sd->backlog.state))
 4957				napi_schedule_rps(sd);
 4958		}
 4959		__skb_queue_tail(&sd->input_pkt_queue, skb);
 4960		tail = rps_input_queue_tail_incr(sd);
 4961		backlog_unlock_irq_restore(sd, &flags);
 4962
 4963		/* save the tail outside of the critical section */
 4964		rps_input_queue_tail_save(qtail, tail);
 4965		return NET_RX_SUCCESS;
 4966	}
 4967
 4968	backlog_unlock_irq_restore(sd, &flags);
 4969
 4970cpu_backlog_drop:
 4971	atomic_inc(&sd->dropped);
 4972bad_dev:
 4973	dev_core_stats_rx_dropped_inc(skb->dev);
 4974	kfree_skb_reason(skb, reason);
 4975	return NET_RX_DROP;
 4976}
 4977
 4978static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4979{
 4980	struct net_device *dev = skb->dev;
 4981	struct netdev_rx_queue *rxqueue;
 4982
 4983	rxqueue = dev->_rx;
 4984
 4985	if (skb_rx_queue_recorded(skb)) {
 4986		u16 index = skb_get_rx_queue(skb);
 4987
 4988		if (unlikely(index >= dev->real_num_rx_queues)) {
 4989			WARN_ONCE(dev->real_num_rx_queues > 1,
 4990				  "%s received packet on queue %u, but number "
 4991				  "of RX queues is %u\n",
 4992				  dev->name, index, dev->real_num_rx_queues);
 4993
 4994			return rxqueue; /* Return first rxqueue */
 
 
 
 
 
 
 
 
 
 
 
 4995		}
 4996		rxqueue += index;
 4997	}
 4998	return rxqueue;
 4999}
 5000
/* Run @xdp_prog on @skb through the generic (skb based) XDP path.
 *
 * @xdp is initialised from the skb so that the program sees the packet
 * starting at the MAC header. After the program ran, head/tail
 * adjustments, fragment length and Ethernet header changes made by the
 * program are propagated back into the skb.
 *
 * Returns the XDP action chosen by the program. The skb is not freed or
 * redirected here; see the comment above the final switch.
 */
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
			     struct bpf_prog *xdp_prog)
{
	void *orig_data, *orig_data_end, *hard_start;
	struct netdev_rx_queue *rxqueue;
	bool orig_bcast, orig_host;
	u32 mac_len, frame_sz;
	__be16 orig_eth_type;
	struct ethhdr *eth;
	u32 metalen, act;
	int off;

	/* The XDP program wants to see the packet starting at the MAC
	 * header.
	 */
	mac_len = skb->data - skb_mac_header(skb);
	hard_start = skb->data - skb_headroom(skb);

	/* SKB "head" area always have tailroom for skb_shared_info */
	frame_sz = (void *)skb_end_pointer(skb) - hard_start;
	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	rxqueue = netif_get_rxqueue(skb);
	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
			 skb_headlen(skb) + mac_len, true);
	if (skb_is_nonlinear(skb)) {
		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
		xdp_buff_set_frags_flag(xdp);
	} else {
		xdp_buff_clear_frags_flag(xdp);
	}

	/* Snapshot the state that is compared after the program ran. */
	orig_data_end = xdp->data_end;
	orig_data = xdp->data;
	eth = (struct ethhdr *)xdp->data;
	orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
	orig_eth_type = eth->h_proto;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	/* check if bpf_xdp_adjust_head was used */
	off = xdp->data - orig_data;
	if (off) {
		if (off > 0)
			__skb_pull(skb, off);
		else if (off < 0)
			__skb_push(skb, -off);

		skb->mac_header += off;
		skb_reset_network_header(skb);
	}

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0) {
		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
		skb->len += off; /* positive on grow, negative on shrink */
	}

	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
	 */
	if (xdp_buff_has_frags(xdp))
		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
	else
		skb->data_len = 0;

	/* check if XDP changed eth hdr such SKB needs update */
	eth = (struct ethhdr *)xdp->data;
	if ((orig_eth_type != eth->h_proto) ||
	    (orig_host != ether_addr_equal_64bits(eth->h_dest,
						  skb->dev->dev_addr)) ||
	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
		/* Re-run eth_type_trans() on the rewritten header to
		 * refresh skb->protocol.
		 */
		__skb_push(skb, ETH_HLEN);
		skb->pkt_type = PACKET_HOST;
		skb->protocol = eth_type_trans(skb, skb->dev);
	}

	/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
	 * before calling us again on redirect path. We do not call do_redirect
	 * as we leave that up to the caller.
	 *
	 * Caller is responsible for managing lifetime of skb (i.e. calling
	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
	 */
	switch (act) {
	case XDP_REDIRECT:
	case XDP_TX:
		__skb_push(skb, mac_len);
		break;
	case XDP_PASS:
		metalen = xdp->data - xdp->data_meta;
		if (metalen)
			skb_metadata_set(skb, metalen);
		break;
	}

	return act;
}
 5102
 5103static int
 5104netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
 5105{
 5106	struct sk_buff *skb = *pskb;
 5107	int err, hroom, troom;
 5108
 5109	if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
 5110		return 0;
 5111
 5112	/* In case we have to go down the path and also linearize,
 5113	 * then lets do the pskb_expand_head() work just once here.
 5114	 */
 5115	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 5116	troom = skb->tail + skb->data_len - skb->end;
 5117	err = pskb_expand_head(skb,
 5118			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 5119			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
 5120	if (err)
 5121		return err;
 5122
 5123	return skb_linearize(skb);
 5124}
 5125
 5126static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 5127				     struct xdp_buff *xdp,
 5128				     struct bpf_prog *xdp_prog)
 5129{
 5130	struct sk_buff *skb = *pskb;
 5131	u32 mac_len, act = XDP_DROP;
 5132
 5133	/* Reinjected packets coming from act_mirred or similar should
 5134	 * not get XDP generic processing.
 5135	 */
 5136	if (skb_is_redirected(skb))
 5137		return XDP_PASS;
 5138
 5139	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
 5140	 * bytes. This is the guarantee that also native XDP provides,
 5141	 * thus we need to do it here as well.
 5142	 */
 5143	mac_len = skb->data - skb_mac_header(skb);
 5144	__skb_push(skb, mac_len);
 5145
 5146	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 5147	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 5148		if (netif_skb_check_for_xdp(pskb, xdp_prog))
 5149			goto do_drop;
 5150	}
 5151
 5152	__skb_pull(*pskb, mac_len);
 5153
 5154	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
 5155	switch (act) {
 5156	case XDP_REDIRECT:
 5157	case XDP_TX:
 5158	case XDP_PASS:
 5159		break;
 5160	default:
 5161		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
 5162		fallthrough;
 5163	case XDP_ABORTED:
 5164		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
 5165		fallthrough;
 5166	case XDP_DROP:
 5167	do_drop:
 5168		kfree_skb(*pskb);
 5169		break;
 5170	}
 5171
 5172	return act;
 5173}
 5174
 5175/* When doing generic XDP we have to bypass the qdisc layer and the
 5176 * network taps in order to match in-driver-XDP behavior. This also means
 5177 * that XDP packets are able to starve other packets going through a qdisc,
 5178 * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
 5179 * queues, so they do not have this starvation issue.
 5180 */
 5181void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 5182{
 5183	struct net_device *dev = skb->dev;
 5184	struct netdev_queue *txq;
 5185	bool free_skb = true;
 5186	int cpu, rc;
 5187
 5188	txq = netdev_core_pick_tx(dev, skb, NULL);
 5189	cpu = smp_processor_id();
 5190	HARD_TX_LOCK(dev, txq, cpu);
 5191	if (!netif_xmit_frozen_or_drv_stopped(txq)) {
 5192		rc = netdev_start_xmit(skb, dev, txq, 0);
 5193		if (dev_xmit_complete(rc))
 5194			free_skb = false;
 5195	}
 5196	HARD_TX_UNLOCK(dev, txq);
 5197	if (free_skb) {
 5198		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 5199		dev_core_stats_tx_dropped_inc(dev);
 5200		kfree_skb(skb);
 5201	}
 5202}
 5203
 5204static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 5205
 5206int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
 5207{
 5208	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 5209
 5210	if (xdp_prog) {
 5211		struct xdp_buff xdp;
 5212		u32 act;
 5213		int err;
 5214
 5215		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 5216		act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
 5217		if (act != XDP_PASS) {
 5218			switch (act) {
 5219			case XDP_REDIRECT:
 5220				err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
 5221							      &xdp, xdp_prog);
 5222				if (err)
 5223					goto out_redir;
 5224				break;
 5225			case XDP_TX:
 5226				generic_xdp_tx(*pskb, xdp_prog);
 5227				break;
 5228			}
 5229			bpf_net_ctx_clear(bpf_net_ctx);
 5230			return XDP_DROP;
 5231		}
 5232		bpf_net_ctx_clear(bpf_net_ctx);
 5233	}
 5234	return XDP_PASS;
 5235out_redir:
 5236	bpf_net_ctx_clear(bpf_net_ctx);
 5237	kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
 5238	return XDP_DROP;
 5239}
 5240EXPORT_SYMBOL_GPL(do_xdp_generic);
 5241
 5242static int netif_rx_internal(struct sk_buff *skb)
 5243{
 5244	int ret;
 5245
 5246	net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
 5247
 5248	trace_netif_rx(skb);
 5249
 5250#ifdef CONFIG_RPS
 5251	if (static_branch_unlikely(&rps_needed)) {
 5252		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5253		int cpu;
 5254
 
 5255		rcu_read_lock();
 5256
 5257		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5258		if (cpu < 0)
 5259			cpu = smp_processor_id();
 5260
 5261		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5262
 5263		rcu_read_unlock();
 
 5264	} else
 5265#endif
 5266	{
 5267		unsigned int qtail;
 5268
 5269		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
 5270	}
 5271	return ret;
 5272}
 5273
 5274/**
 5275 *	__netif_rx	-	Slightly optimized version of netif_rx
 5276 *	@skb: buffer to post
 5277 *
 5278 *	This behaves as netif_rx except that it does not disable bottom halves.
 5279 *	As a result this function may only be invoked from the interrupt context
 5280 *	(either hard or soft interrupt).
 5281 */
 5282int __netif_rx(struct sk_buff *skb)
 5283{
 5284	int ret;
 5285
 5286	lockdep_assert_once(hardirq_count() | softirq_count());
 5287
 5288	trace_netif_rx_entry(skb);
 5289	ret = netif_rx_internal(skb);
 5290	trace_netif_rx_exit(ret);
 5291	return ret;
 5292}
 5293EXPORT_SYMBOL(__netif_rx);
 5294
 5295/**
 5296 *	netif_rx	-	post buffer to the network code
 5297 *	@skb: buffer to post
 5298 *
 5299 *	This function receives a packet from a device driver and queues it for
 5300 *	the upper (protocol) levels to process via the backlog NAPI device. It
 5301 *	always succeeds. The buffer may be dropped during processing for
 5302 *	congestion control or by the protocol layers.
 5303 *	The network buffer is passed via the backlog NAPI device. Modern NIC
 5304 *	driver should use NAPI and GRO.
 5305 *	This function can used from interrupt and from process context. The
 5306 *	caller from process context must not disable interrupts before invoking
 5307 *	this function.
 5308 *
 5309 *	return values:
 5310 *	NET_RX_SUCCESS	(no congestion)
 5311 *	NET_RX_DROP     (packet was dropped)
 5312 *
 5313 */
 
 5314int netif_rx(struct sk_buff *skb)
 5315{
 5316	bool need_bh_off = !(hardirq_count() | softirq_count());
 5317	int ret;
 5318
 5319	if (need_bh_off)
 5320		local_bh_disable();
 5321	trace_netif_rx_entry(skb);
 5322	ret = netif_rx_internal(skb);
 5323	trace_netif_rx_exit(ret);
 5324	if (need_bh_off)
 5325		local_bh_enable();
 5326	return ret;
 5327}
 5328EXPORT_SYMBOL(netif_rx);
 5329
 5330static __latent_entropy void net_tx_action(void)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5331{
 5332	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 5333
 5334	if (sd->completion_queue) {
 5335		struct sk_buff *clist;
 5336
 5337		local_irq_disable();
 5338		clist = sd->completion_queue;
 5339		sd->completion_queue = NULL;
 5340		local_irq_enable();
 5341
 5342		while (clist) {
 5343			struct sk_buff *skb = clist;
 5344
 5345			clist = clist->next;
 5346
 5347			WARN_ON(refcount_read(&skb->users));
 5348			if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
 5349				trace_consume_skb(skb, net_tx_action);
 5350			else
 5351				trace_kfree_skb(skb, net_tx_action,
 5352						get_kfree_skb_cb(skb)->reason, NULL);
 5353
 5354			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 5355				__kfree_skb(skb);
 5356			else
 5357				__napi_kfree_skb(skb,
 5358						 get_kfree_skb_cb(skb)->reason);
 5359		}
 
 
 5360	}
 5361
 5362	if (sd->output_queue) {
 5363		struct Qdisc *head;
 5364
 5365		local_irq_disable();
 5366		head = sd->output_queue;
 5367		sd->output_queue = NULL;
 5368		sd->output_queue_tailp = &sd->output_queue;
 5369		local_irq_enable();
 5370
 5371		rcu_read_lock();
 5372
 5373		while (head) {
 5374			struct Qdisc *q = head;
 5375			spinlock_t *root_lock = NULL;
 5376
 5377			head = head->next_sched;
 5378
 
 
 5379			/* We need to make sure head->next_sched is read
 5380			 * before clearing __QDISC_STATE_SCHED
 5381			 */
 5382			smp_mb__before_atomic();
 5383
 5384			if (!(q->flags & TCQ_F_NOLOCK)) {
 5385				root_lock = qdisc_lock(q);
 5386				spin_lock(root_lock);
 5387			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
 5388						     &q->state))) {
 5389				/* There is a synchronize_net() between
 5390				 * STATE_DEACTIVATED flag being set and
 5391				 * qdisc_reset()/some_qdisc_is_busy() in
 5392				 * dev_deactivate(), so we can safely bail out
 5393				 * early here to avoid data race between
 5394				 * qdisc_deactivate() and some_qdisc_is_busy()
 5395				 * for lockless qdisc.
 5396				 */
 5397				clear_bit(__QDISC_STATE_SCHED, &q->state);
 5398				continue;
 5399			}
 5400
 5401			clear_bit(__QDISC_STATE_SCHED, &q->state);
 5402			qdisc_run(q);
 5403			if (root_lock)
 5404				spin_unlock(root_lock);
 5405		}
 5406
 5407		rcu_read_unlock();
 5408	}
 5409
 5410	xfrm_dev_backlog(sd);
 5411}
 5412
 5413#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
 5414/* This hook is defined here for ATM LANE */
 5415int (*br_fdb_test_addr_hook)(struct net_device *dev,
 5416			     unsigned char *addr) __read_mostly;
 5417EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 5418#endif
 5419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5420/**
 5421 *	netdev_is_rx_handler_busy - check if receive handler is registered
 5422 *	@dev: device to check
 5423 *
 5424 *	Check if a receive handler is already registered for a given device.
 5425 *	Return true if there one.
 5426 *
 5427 *	The caller must hold the rtnl_mutex.
 5428 */
 5429bool netdev_is_rx_handler_busy(struct net_device *dev)
 5430{
 5431	ASSERT_RTNL();
 5432	return dev && rtnl_dereference(dev->rx_handler);
 5433}
 5434EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 5435
 5436/**
 5437 *	netdev_rx_handler_register - register receive handler
 5438 *	@dev: device to register a handler for
 5439 *	@rx_handler: receive handler to register
 5440 *	@rx_handler_data: data pointer that is used by rx handler
 5441 *
 5442 *	Register a receive handler for a device. This handler will then be
 5443 *	called from __netif_receive_skb. A negative errno code is returned
 5444 *	on a failure.
 5445 *
 5446 *	The caller must hold the rtnl_mutex.
 5447 *
 5448 *	For a general description of rx_handler, see enum rx_handler_result.
 5449 */
 5450int netdev_rx_handler_register(struct net_device *dev,
 5451			       rx_handler_func_t *rx_handler,
 5452			       void *rx_handler_data)
 5453{
 5454	if (netdev_is_rx_handler_busy(dev))
 5455		return -EBUSY;
 5456
 5457	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5458		return -EINVAL;
 5459
 5460	/* Note: rx_handler_data must be set before rx_handler */
 5461	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5462	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5463
 5464	return 0;
 5465}
 5466EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5467
 5468/**
 5469 *	netdev_rx_handler_unregister - unregister receive handler
 5470 *	@dev: device to unregister a handler from
 5471 *
 5472 *	Unregister a receive handler from a device.
 5473 *
 5474 *	The caller must hold the rtnl_mutex.
 5475 */
 5476void netdev_rx_handler_unregister(struct net_device *dev)
 5477{
 5478
 5479	ASSERT_RTNL();
 5480	RCU_INIT_POINTER(dev->rx_handler, NULL);
 5481	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
 5482	 * section has a guarantee to see a non NULL rx_handler_data
 5483	 * as well.
 5484	 */
 5485	synchronize_net();
 5486	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 5487}
 5488EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5489
 5490/*
 5491 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5492 * the special handling of PFMEMALLOC skbs.
 5493 */
 5494static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5495{
 5496	switch (skb->protocol) {
 5497	case htons(ETH_P_ARP):
 5498	case htons(ETH_P_IP):
 5499	case htons(ETH_P_IPV6):
 5500	case htons(ETH_P_8021Q):
 5501	case htons(ETH_P_8021AD):
 5502		return true;
 5503	default:
 5504		return false;
 5505	}
 5506}
 5507
 5508static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5509			     int *ret, struct net_device *orig_dev)
 5510{
 
 5511	if (nf_hook_ingress_active(skb)) {
 5512		int ingress_retval;
 5513
 5514		if (*pt_prev) {
 5515			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5516			*pt_prev = NULL;
 5517		}
 5518
 5519		rcu_read_lock();
 5520		ingress_retval = nf_hook_ingress(skb);
 5521		rcu_read_unlock();
 5522		return ingress_retval;
 5523	}
 
 5524	return 0;
 5525}
 5526
 5527static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5528				    struct packet_type **ppt_prev)
 5529{
 5530	struct packet_type *ptype, *pt_prev;
 5531	rx_handler_func_t *rx_handler;
 5532	struct sk_buff *skb = *pskb;
 5533	struct net_device *orig_dev;
 5534	bool deliver_exact = false;
 5535	int ret = NET_RX_DROP;
 5536	__be16 type;
 5537
 5538	net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
 5539
 5540	trace_netif_receive_skb(skb);
 5541
 5542	orig_dev = skb->dev;
 5543
 5544	skb_reset_network_header(skb);
 5545	if (!skb_transport_header_was_set(skb))
 5546		skb_reset_transport_header(skb);
 5547	skb_reset_mac_len(skb);
 5548
 5549	pt_prev = NULL;
 5550
 5551another_round:
 5552	skb->skb_iif = skb->dev->ifindex;
 5553
 5554	__this_cpu_inc(softnet_data.processed);
 5555
 5556	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5557		int ret2;
 5558
 5559		migrate_disable();
 5560		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
 5561				      &skb);
 5562		migrate_enable();
 5563
 5564		if (ret2 != XDP_PASS) {
 5565			ret = NET_RX_DROP;
 5566			goto out;
 5567		}
 5568	}
 5569
 5570	if (eth_type_vlan(skb->protocol)) {
 5571		skb = skb_vlan_untag(skb);
 5572		if (unlikely(!skb))
 5573			goto out;
 5574	}
 5575
 5576	if (skb_skip_tc_classify(skb))
 5577		goto skip_classify;
 
 
 
 
 5578
 5579	if (pfmemalloc)
 5580		goto skip_taps;
 5581
 5582	list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
 5583		if (pt_prev)
 5584			ret = deliver_skb(skb, pt_prev, orig_dev);
 5585		pt_prev = ptype;
 5586	}
 5587
 5588	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5589		if (pt_prev)
 5590			ret = deliver_skb(skb, pt_prev, orig_dev);
 5591		pt_prev = ptype;
 5592	}
 5593
 5594skip_taps:
 5595#ifdef CONFIG_NET_INGRESS
 5596	if (static_branch_unlikely(&ingress_needed_key)) {
 5597		bool another = false;
 5598
 5599		nf_skip_egress(skb, true);
 5600		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
 5601					 &another);
 5602		if (another)
 5603			goto another_round;
 5604		if (!skb)
 5605			goto out;
 5606
 5607		nf_skip_egress(skb, false);
 5608		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5609			goto out;
 5610	}
 5611#endif
 5612	skb_reset_redirect(skb);
 5613skip_classify:
 
 
 5614	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5615		goto drop;
 5616
 5617	if (skb_vlan_tag_present(skb)) {
 5618		if (pt_prev) {
 5619			ret = deliver_skb(skb, pt_prev, orig_dev);
 5620			pt_prev = NULL;
 5621		}
 5622		if (vlan_do_receive(&skb))
 5623			goto another_round;
 5624		else if (unlikely(!skb))
 5625			goto out;
 5626	}
 5627
 5628	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5629	if (rx_handler) {
 5630		if (pt_prev) {
 5631			ret = deliver_skb(skb, pt_prev, orig_dev);
 5632			pt_prev = NULL;
 5633		}
 5634		switch (rx_handler(&skb)) {
 5635		case RX_HANDLER_CONSUMED:
 5636			ret = NET_RX_SUCCESS;
 5637			goto out;
 5638		case RX_HANDLER_ANOTHER:
 5639			goto another_round;
 5640		case RX_HANDLER_EXACT:
 5641			deliver_exact = true;
 5642			break;
 5643		case RX_HANDLER_PASS:
 5644			break;
 5645		default:
 5646			BUG();
 5647		}
 5648	}
 5649
 5650	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
 5651check_vlan_id:
 5652		if (skb_vlan_tag_get_id(skb)) {
 5653			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 5654			 * find vlan device.
 5655			 */
 5656			skb->pkt_type = PACKET_OTHERHOST;
 5657		} else if (eth_type_vlan(skb->protocol)) {
 5658			/* Outer header is 802.1P with vlan 0, inner header is
 5659			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5660			 * not find vlan dev for vlan id 0.
 5661			 */
 5662			__vlan_hwaccel_clear_tag(skb);
 5663			skb = skb_vlan_untag(skb);
 5664			if (unlikely(!skb))
 5665				goto out;
 5666			if (vlan_do_receive(&skb))
 5667				/* After stripping off 802.1P header with vlan 0
 5668				 * vlan dev is found for inner header.
 5669				 */
 5670				goto another_round;
 5671			else if (unlikely(!skb))
 5672				goto out;
 5673			else
 5674				/* We have stripped outer 802.1P vlan 0 header.
 5675				 * But could not find vlan dev.
 5676				 * check again for vlan id to set OTHERHOST.
 5677				 */
 5678				goto check_vlan_id;
 5679		}
 5680		/* Note: we might in the future use prio bits
 5681		 * and set skb->priority like in vlan_do_receive()
 5682		 * For the time being, just ignore Priority Code Point
 5683		 */
 5684		__vlan_hwaccel_clear_tag(skb);
 5685	}
 5686
 5687	type = skb->protocol;
 5688
 5689	/* deliver only exact match when indicated */
 5690	if (likely(!deliver_exact)) {
 5691		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5692				       &ptype_base[ntohs(type) &
 5693						   PTYPE_HASH_MASK]);
 5694	}
 5695
 5696	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5697			       &orig_dev->ptype_specific);
 5698
 5699	if (unlikely(skb->dev != orig_dev)) {
 5700		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5701				       &skb->dev->ptype_specific);
 5702	}
 5703
 5704	if (pt_prev) {
 5705		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5706			goto drop;
 5707		*ppt_prev = pt_prev;
 
 5708	} else {
 5709drop:
 5710		if (!deliver_exact)
 5711			dev_core_stats_rx_dropped_inc(skb->dev);
 5712		else
 5713			dev_core_stats_rx_nohandler_inc(skb->dev);
 5714		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
 5715		/* Jamal, now you will not able to escape explaining
 5716		 * me how you were going to use this. :-)
 5717		 */
 5718		ret = NET_RX_DROP;
 5719	}
 5720
 5721out:
 5722	/* The invariant here is that if *ppt_prev is not NULL
 5723	 * then skb should also be non-NULL.
 5724	 *
 5725	 * Apparently *ppt_prev assignment above holds this invariant due to
 5726	 * skb dereferencing near it.
 5727	 */
 5728	*pskb = skb;
 5729	return ret;
 5730}
 5731
 5732static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5733{
 5734	struct net_device *orig_dev = skb->dev;
 5735	struct packet_type *pt_prev = NULL;
 5736	int ret;
 5737
 5738	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5739	if (pt_prev)
 5740		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5741					 skb->dev, pt_prev, orig_dev);
 5742	return ret;
 5743}
 5744
 5745/**
 5746 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5747 *	@skb: buffer to process
 5748 *
 5749 *	More direct receive version of netif_receive_skb().  It should
 5750 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5751 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5752 *
 5753 *	This function may only be called from softirq context and interrupts
 5754 *	should be enabled.
 5755 *
 5756 *	Return values (usually ignored):
 5757 *	NET_RX_SUCCESS: no congestion
 5758 *	NET_RX_DROP: packet was dropped
 5759 */
 5760int netif_receive_skb_core(struct sk_buff *skb)
 5761{
 5762	int ret;
 5763
 5764	rcu_read_lock();
 5765	ret = __netif_receive_skb_one_core(skb, false);
 5766	rcu_read_unlock();
 5767
 5768	return ret;
 5769}
 5770EXPORT_SYMBOL(netif_receive_skb_core);
 5771
 5772static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5773						  struct packet_type *pt_prev,
 5774						  struct net_device *orig_dev)
 5775{
 5776	struct sk_buff *skb, *next;
 5777
 5778	if (!pt_prev)
 5779		return;
 5780	if (list_empty(head))
 5781		return;
 5782	if (pt_prev->list_func != NULL)
 5783		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5784				   ip_list_rcv, head, pt_prev, orig_dev);
 5785	else
 5786		list_for_each_entry_safe(skb, next, head, list) {
 5787			skb_list_del_init(skb);
 5788			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5789		}
 5790}
 5791
 5792static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5793{
 5794	/* Fast-path assumptions:
 5795	 * - There is no RX handler.
 5796	 * - Only one packet_type matches.
 5797	 * If either of these fails, we will end up doing some per-packet
 5798	 * processing in-line, then handling the 'last ptype' for the whole
 5799	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5800	 * because the 'last ptype' must be constant across the sublist, and all
 5801	 * other ptypes are handled per-packet.
 5802	 */
 5803	/* Current (common) ptype of sublist */
 5804	struct packet_type *pt_curr = NULL;
 5805	/* Current (common) orig_dev of sublist */
 5806	struct net_device *od_curr = NULL;
 5807	struct sk_buff *skb, *next;
 5808	LIST_HEAD(sublist);
 5809
 5810	list_for_each_entry_safe(skb, next, head, list) {
 5811		struct net_device *orig_dev = skb->dev;
 5812		struct packet_type *pt_prev = NULL;
 5813
 5814		skb_list_del_init(skb);
 5815		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5816		if (!pt_prev)
 5817			continue;
 5818		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5819			/* dispatch old sublist */
 5820			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5821			/* start new sublist */
 5822			INIT_LIST_HEAD(&sublist);
 5823			pt_curr = pt_prev;
 5824			od_curr = orig_dev;
 5825		}
 5826		list_add_tail(&skb->list, &sublist);
 5827	}
 5828
 5829	/* dispatch final sublist */
 5830	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5831}
 5832
 5833static int __netif_receive_skb(struct sk_buff *skb)
 5834{
 5835	int ret;
 5836
 5837	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5838		unsigned int noreclaim_flag;
 5839
 5840		/*
 5841		 * PFMEMALLOC skbs are special, they should
 5842		 * - be delivered to SOCK_MEMALLOC sockets only
 5843		 * - stay away from userspace
 5844		 * - have bounded memory usage
 5845		 *
 5846		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5847		 * context down to all allocation sites.
 5848		 */
 5849		noreclaim_flag = memalloc_noreclaim_save();
 5850		ret = __netif_receive_skb_one_core(skb, true);
 5851		memalloc_noreclaim_restore(noreclaim_flag);
 5852	} else
 5853		ret = __netif_receive_skb_one_core(skb, false);
 5854
 5855	return ret;
 5856}
 5857
 5858static void __netif_receive_skb_list(struct list_head *head)
 5859{
 5860	unsigned long noreclaim_flag = 0;
 5861	struct sk_buff *skb, *next;
 5862	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5863
 5864	list_for_each_entry_safe(skb, next, head, list) {
 5865		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5866			struct list_head sublist;
 5867
 5868			/* Handle the previous sublist */
 5869			list_cut_before(&sublist, head, &skb->list);
 5870			if (!list_empty(&sublist))
 5871				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5872			pfmemalloc = !pfmemalloc;
 5873			/* See comments in __netif_receive_skb */
 5874			if (pfmemalloc)
 5875				noreclaim_flag = memalloc_noreclaim_save();
 5876			else
 5877				memalloc_noreclaim_restore(noreclaim_flag);
 5878		}
 5879	}
 5880	/* Handle the remaining sublist */
 5881	if (!list_empty(head))
 5882		__netif_receive_skb_list_core(head, pfmemalloc);
 5883	/* Restore pflags */
 5884	if (pfmemalloc)
 5885		memalloc_noreclaim_restore(noreclaim_flag);
 5886}
 5887
 5888static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5889{
 5890	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5891	struct bpf_prog *new = xdp->prog;
 5892	int ret = 0;
 5893
 5894	switch (xdp->command) {
 5895	case XDP_SETUP_PROG:
 5896		rcu_assign_pointer(dev->xdp_prog, new);
 5897		if (old)
 5898			bpf_prog_put(old);
 5899
 5900		if (old && !new) {
 5901			static_branch_dec(&generic_xdp_needed_key);
 5902		} else if (new && !old) {
 5903			static_branch_inc(&generic_xdp_needed_key);
 5904			dev_disable_lro(dev);
 5905			dev_disable_gro_hw(dev);
 5906		}
 5907		break;
 5908
 5909	default:
 5910		ret = -EINVAL;
 5911		break;
 5912	}
 5913
 5914	return ret;
 5915}
 5916
 5917static int netif_receive_skb_internal(struct sk_buff *skb)
 5918{
 5919	int ret;
 5920
 5921	net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
 5922
 5923	if (skb_defer_rx_timestamp(skb))
 5924		return NET_RX_SUCCESS;
 5925
 5926	rcu_read_lock();
 
 5927#ifdef CONFIG_RPS
 5928	if (static_branch_unlikely(&rps_needed)) {
 5929		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5930		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5931
 5932		if (cpu >= 0) {
 5933			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5934			rcu_read_unlock();
 5935			return ret;
 5936		}
 5937	}
 5938#endif
 5939	ret = __netif_receive_skb(skb);
 5940	rcu_read_unlock();
 5941	return ret;
 5942}
 5943
 5944void netif_receive_skb_list_internal(struct list_head *head)
 5945{
 5946	struct sk_buff *skb, *next;
 5947	LIST_HEAD(sublist);
 5948
 5949	list_for_each_entry_safe(skb, next, head, list) {
 5950		net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
 5951				    skb);
 5952		skb_list_del_init(skb);
 5953		if (!skb_defer_rx_timestamp(skb))
 5954			list_add_tail(&skb->list, &sublist);
 5955	}
 5956	list_splice_init(&sublist, head);
 5957
 5958	rcu_read_lock();
 5959#ifdef CONFIG_RPS
 5960	if (static_branch_unlikely(&rps_needed)) {
 5961		list_for_each_entry_safe(skb, next, head, list) {
 5962			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5963			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5964
 5965			if (cpu >= 0) {
 5966				/* Will be handled, remove from list */
 5967				skb_list_del_init(skb);
 5968				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5969			}
 5970		}
 5971	}
 5972#endif
 5973	__netif_receive_skb_list(head);
 5974	rcu_read_unlock();
 5975}
 5976
 5977/**
 5978 *	netif_receive_skb - process receive buffer from network
 5979 *	@skb: buffer to process
 5980 *
 5981 *	netif_receive_skb() is the main receive data processing function.
 5982 *	It always succeeds. The buffer may be dropped during processing
 5983 *	for congestion control or by the protocol layers.
 5984 *
 5985 *	This function may only be called from softirq context and interrupts
 5986 *	should be enabled.
 5987 *
 5988 *	Return values (usually ignored):
 5989 *	NET_RX_SUCCESS: no congestion
 5990 *	NET_RX_DROP: packet was dropped
 5991 */
 5992int netif_receive_skb(struct sk_buff *skb)
 5993{
 5994	int ret;
 5995
 5996	trace_netif_receive_skb_entry(skb);
 5997
 5998	ret = netif_receive_skb_internal(skb);
 5999	trace_netif_receive_skb_exit(ret);
 6000
 6001	return ret;
 6002}
 6003EXPORT_SYMBOL(netif_receive_skb);
 6004
 6005/**
 6006 *	netif_receive_skb_list - process many receive buffers from network
 6007 *	@head: list of skbs to process.
 6008 *
 6009 *	Since return value of netif_receive_skb() is normally ignored, and
 6010 *	wouldn't be meaningful for a list, this function returns void.
 6011 *
 6012 *	This function may only be called from softirq context and interrupts
 6013 *	should be enabled.
 6014 */
 6015void netif_receive_skb_list(struct list_head *head)
 6016{
 6017	struct sk_buff *skb;
 6018
 6019	if (list_empty(head))
 6020		return;
 6021	if (trace_netif_receive_skb_list_entry_enabled()) {
 6022		list_for_each_entry(skb, head, list)
 6023			trace_netif_receive_skb_list_entry(skb);
 6024	}
 6025	netif_receive_skb_list_internal(head);
 6026	trace_netif_receive_skb_list_exit(0);
 6027}
 6028EXPORT_SYMBOL(netif_receive_skb_list);
 6029
 6030static DEFINE_PER_CPU(struct work_struct, flush_works);
 6031
 6032/* Network device is going away, flush any packets still pending */
 6033static void flush_backlog(struct work_struct *work)
 6034{
 6035	struct sk_buff *skb, *tmp;
 6036	struct softnet_data *sd;
 6037
 6038	local_bh_disable();
 6039	sd = this_cpu_ptr(&softnet_data);
 6040
 6041	backlog_lock_irq_disable(sd);
 
 6042	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 6043		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 6044			__skb_unlink(skb, &sd->input_pkt_queue);
 6045			dev_kfree_skb_irq(skb);
 6046			rps_input_queue_head_incr(sd);
 6047		}
 6048	}
 6049	backlog_unlock_irq_enable(sd);
 
 6050
 6051	local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
 6052	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 6053		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 6054			__skb_unlink(skb, &sd->process_queue);
 6055			kfree_skb(skb);
 6056			rps_input_queue_head_incr(sd);
 6057		}
 6058	}
 6059	local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
 6060	local_bh_enable();
 6061}
 6062
 6063static bool flush_required(int cpu)
 6064{
 6065#if IS_ENABLED(CONFIG_RPS)
 6066	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 6067	bool do_flush;
 6068
 6069	backlog_lock_irq_disable(sd);
 6070
 6071	/* as insertion into process_queue happens with the rps lock held,
 6072	 * process_queue access may race only with dequeue
 6073	 */
 6074	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
 6075		   !skb_queue_empty_lockless(&sd->process_queue);
 6076	backlog_unlock_irq_enable(sd);
 6077
 6078	return do_flush;
 6079#endif
 6080	/* without RPS we can't safely check input_pkt_queue: during a
 6081	 * concurrent remote skb_queue_splice() we can detect as empty both
 6082	 * input_pkt_queue and process_queue even if the latter could end-up
 6083	 * containing a lot of packets.
 6084	 */
 6085	return true;
 6086}
 6087
 6088static void flush_all_backlogs(void)
 6089{
 6090	static cpumask_t flush_cpus;
 6091	unsigned int cpu;
 
 
 6092
 6093	/* since we are under rtnl lock protection we can use static data
 6094	 * for the cpumask and avoid allocating on stack the possibly
 6095	 * large mask
 6096	 */
 6097	ASSERT_RTNL();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6098
 6099	cpus_read_lock();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6100
 6101	cpumask_clear(&flush_cpus);
 6102	for_each_online_cpu(cpu) {
 6103		if (flush_required(cpu)) {
 6104			queue_work_on(cpu, system_highpri_wq,
 6105				      per_cpu_ptr(&flush_works, cpu));
 6106			cpumask_set_cpu(cpu, &flush_cpus);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6107		}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6108	}
 6109
 6110	/* we can have in flight packet[s] on the cpus we are not flushing,
 6111	 * synchronize_net() in unregister_netdevice_many() will take care of
 6112	 * them
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6113	 */
 6114	for_each_cpu(cpu, &flush_cpus)
 6115		flush_work(per_cpu_ptr(&flush_works, cpu));
 6116
 6117	cpus_read_unlock();
 6118}
 6119
 6120static void net_rps_send_ipi(struct softnet_data *remsd)
 6121{
 6122#ifdef CONFIG_RPS
 6123	while (remsd) {
 6124		struct softnet_data *next = remsd->rps_ipi_next;
 
 6125
 6126		if (cpu_online(remsd->cpu))
 6127			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 6128		remsd = next;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6129	}
 6130#endif
 
 
 
 
 6131}
 
 6132
 6133/*
 6134 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 6135 * Note: called with local irq disabled, but exits with local irq enabled.
 6136 */
 6137static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 6138{
 6139#ifdef CONFIG_RPS
 6140	struct softnet_data *remsd = sd->rps_ipi_list;
 6141
 6142	if (!use_backlog_threads() && remsd) {
 6143		sd->rps_ipi_list = NULL;
 6144
 6145		local_irq_enable();
 6146
 6147		/* Send pending IPI's to kick RPS processing on remote cpus. */
 6148		net_rps_send_ipi(remsd);
 
 
 
 
 
 
 
 6149	} else
 6150#endif
 6151		local_irq_enable();
 6152}
 6153
 6154static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 6155{
 6156#ifdef CONFIG_RPS
 6157	return !use_backlog_threads() && sd->rps_ipi_list;
 6158#else
 6159	return false;
 6160#endif
 6161}
 6162
 6163static int process_backlog(struct napi_struct *napi, int quota)
 6164{
 6165	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 6166	bool again = true;
 6167	int work = 0;
 6168
 6169	/* Check if we have pending ipi, its better to send them now,
 6170	 * not waiting net_rx_action() end.
 6171	 */
 6172	if (sd_has_rps_ipi_waiting(sd)) {
 6173		local_irq_disable();
 6174		net_rps_action_and_irq_enable(sd);
 6175	}
 6176
 6177	napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
 6178	while (again) {
 6179		struct sk_buff *skb;
 6180
 6181		local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
 6182		while ((skb = __skb_dequeue(&sd->process_queue))) {
 6183			local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
 6184			rcu_read_lock();
 6185			__netif_receive_skb(skb);
 6186			rcu_read_unlock();
 6187			if (++work >= quota) {
 6188				rps_input_queue_head_add(sd, work);
 6189				return work;
 6190			}
 6191
 6192			local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
 6193		}
 6194		local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
 6195
 6196		backlog_lock_irq_disable(sd);
 
 6197		if (skb_queue_empty(&sd->input_pkt_queue)) {
 6198			/*
 6199			 * Inline a custom version of __napi_complete().
 6200			 * only current cpu owns and manipulates this napi,
 6201			 * and NAPI_STATE_SCHED is the only possible flag set
 6202			 * on backlog.
 6203			 * We can use a plain write instead of clear_bit(),
 6204			 * and we dont need an smp_mb() memory barrier.
 6205			 */
 6206			napi->state &= NAPIF_STATE_THREADED;
 6207			again = false;
 6208		} else {
 6209			local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
 6210			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 6211						   &sd->process_queue);
 6212			local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
 6213		}
 6214		backlog_unlock_irq_enable(sd);
 
 6215	}
 6216
 6217	if (work)
 6218		rps_input_queue_head_add(sd, work);
 6219	return work;
 6220}
 6221
 6222/**
 6223 * __napi_schedule - schedule for receive
 6224 * @n: entry to schedule
 6225 *
 6226 * The entry's receive function will be scheduled to run.
 6227 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6228 */
 6229void __napi_schedule(struct napi_struct *n)
 6230{
 6231	unsigned long flags;
 6232
 6233	local_irq_save(flags);
 6234	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6235	local_irq_restore(flags);
 6236}
 6237EXPORT_SYMBOL(__napi_schedule);
 6238
 6239/**
 6240 *	napi_schedule_prep - check if napi can be scheduled
 6241 *	@n: napi context
 6242 *
 6243 * Test if NAPI routine is already running, and if not mark
 6244 * it as running.  This is used as a condition variable to
 6245 * insure only one NAPI poll instance runs.  We also make
 6246 * sure there is no pending NAPI disable.
 6247 */
 6248bool napi_schedule_prep(struct napi_struct *n)
 6249{
 6250	unsigned long new, val = READ_ONCE(n->state);
 6251
 6252	do {
 
 6253		if (unlikely(val & NAPIF_STATE_DISABLE))
 6254			return false;
 6255		new = val | NAPIF_STATE_SCHED;
 6256
 6257		/* Sets STATE_MISSED bit if STATE_SCHED was already set
 6258		 * This was suggested by Alexander Duyck, as compiler
 6259		 * emits better code than :
 6260		 * if (val & NAPIF_STATE_SCHED)
 6261		 *     new |= NAPIF_STATE_MISSED;
 6262		 */
 6263		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 6264						   NAPIF_STATE_MISSED;
 6265	} while (!try_cmpxchg(&n->state, &val, new));
 6266
 6267	return !(val & NAPIF_STATE_SCHED);
 6268}
 6269EXPORT_SYMBOL(napi_schedule_prep);
 6270
 6271/**
 6272 * __napi_schedule_irqoff - schedule for receive
 6273 * @n: entry to schedule
 6274 *
 6275 * Variant of __napi_schedule() assuming hard irqs are masked.
 6276 *
 6277 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
 6278 * because the interrupt disabled assumption might not be true
 6279 * due to force-threaded interrupts and spinlock substitution.
 6280 */
 6281void __napi_schedule_irqoff(struct napi_struct *n)
 6282{
 6283	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6284		____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6285	else
 6286		__napi_schedule(n);
 6287}
 6288EXPORT_SYMBOL(__napi_schedule_irqoff);
 6289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6290bool napi_complete_done(struct napi_struct *n, int work_done)
 6291{
 6292	unsigned long flags, val, new, timeout = 0;
 6293	bool ret = true;
 6294
 6295	/*
 6296	 * 1) Don't let napi dequeue from the cpu poll list
 6297	 *    just in case its running on a different cpu.
 6298	 * 2) If we are busy polling, do nothing here, we have
 6299	 *    the guarantee we will be called later.
 6300	 */
 6301	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 6302				 NAPIF_STATE_IN_BUSY_POLL)))
 6303		return false;
 6304
 6305	if (work_done) {
 6306		if (n->gro_bitmask)
 6307			timeout = napi_get_gro_flush_timeout(n);
 6308		n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
 6309	}
 6310	if (n->defer_hard_irqs_count > 0) {
 6311		n->defer_hard_irqs_count--;
 6312		timeout = napi_get_gro_flush_timeout(n);
 6313		if (timeout)
 6314			ret = false;
 6315	}
 6316	if (n->gro_bitmask) {
 6317		/* When the NAPI instance uses a timeout and keeps postponing
 6318		 * it, we need to bound somehow the time packets are kept in
 6319		 * the GRO layer
 6320		 */
 6321		napi_gro_flush(n, !!timeout);
 6322	}
 6323
 6324	gro_normal_list(n);
 
 6325
 
 
 
 
 
 
 6326	if (unlikely(!list_empty(&n->poll_list))) {
 6327		/* If n->poll_list is not empty, we need to mask irqs */
 6328		local_irq_save(flags);
 6329		list_del_init(&n->poll_list);
 6330		local_irq_restore(flags);
 6331	}
 6332	WRITE_ONCE(n->list_owner, -1);
 6333
 6334	val = READ_ONCE(n->state);
 6335	do {
 
 
 6336		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 6337
 6338		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
 6339			      NAPIF_STATE_SCHED_THREADED |
 6340			      NAPIF_STATE_PREFER_BUSY_POLL);
 6341
 6342		/* If STATE_MISSED was set, leave STATE_SCHED set,
 6343		 * because we will call napi->poll() one more time.
 6344		 * This C code was suggested by Alexander Duyck to help gcc.
 6345		 */
 6346		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
 6347						    NAPIF_STATE_SCHED;
 6348	} while (!try_cmpxchg(&n->state, &val, new));
 6349
 6350	if (unlikely(val & NAPIF_STATE_MISSED)) {
 6351		__napi_schedule(n);
 6352		return false;
 6353	}
 6354
 6355	if (timeout)
 6356		hrtimer_start(&n->timer, ns_to_ktime(timeout),
 6357			      HRTIMER_MODE_REL_PINNED);
 6358	return ret;
 6359}
 6360EXPORT_SYMBOL(napi_complete_done);
 6361
 6362static void skb_defer_free_flush(struct softnet_data *sd)
 
 6363{
 6364	struct sk_buff *skb, *next;
 6365
 6366	/* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
 6367	if (!READ_ONCE(sd->defer_list))
 6368		return;
 6369
 6370	spin_lock(&sd->defer_lock);
 6371	skb = sd->defer_list;
 6372	sd->defer_list = NULL;
 6373	sd->defer_count = 0;
 6374	spin_unlock(&sd->defer_lock);
 6375
 6376	while (skb != NULL) {
 6377		next = skb->next;
 6378		napi_consume_skb(skb, 1);
 6379		skb = next;
 6380	}
 6381}
 6382
 6383#if defined(CONFIG_NET_RX_BUSY_POLL)
 6384
 6385static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 6386{
 6387	if (!skip_schedule) {
 6388		gro_normal_list(napi);
 6389		__napi_schedule(napi);
 6390		return;
 6391	}
 6392
 6393	if (napi->gro_bitmask) {
 6394		/* flush too old packets
 6395		 * If HZ < 1000, flush all packets.
 6396		 */
 6397		napi_gro_flush(napi, HZ >= 1000);
 6398	}
 6399
 6400	gro_normal_list(napi);
 6401	clear_bit(NAPI_STATE_SCHED, &napi->state);
 6402}
 6403
 6404enum {
 6405	NAPI_F_PREFER_BUSY_POLL	= 1,
 6406	NAPI_F_END_ON_RESCHED	= 2,
 6407};
 6408
 6409static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 6410			   unsigned flags, u16 budget)
 6411{
 6412	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 6413	bool skip_schedule = false;
 6414	unsigned long timeout;
 6415	int rc;
 6416
 6417	/* Busy polling means there is a high chance device driver hard irq
 6418	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6419	 * set in napi_schedule_prep().
 6420	 * Since we are about to call napi->poll() once more, we can safely
 6421	 * clear NAPI_STATE_MISSED.
 6422	 *
 6423	 * Note: x86 could use a single "lock and ..." instruction
 6424	 * to perform these two clear_bit()
 6425	 */
 6426	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6427	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6428
 6429	local_bh_disable();
 6430	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 6431
 6432	if (flags & NAPI_F_PREFER_BUSY_POLL) {
 6433		napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
 6434		timeout = napi_get_gro_flush_timeout(napi);
 6435		if (napi->defer_hard_irqs_count && timeout) {
 6436			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
 6437			skip_schedule = true;
 6438		}
 6439	}
 6440
 6441	/* All we really want here is to re-enable device interrupts.
 6442	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6443	 */
 6444	rc = napi->poll(napi, budget);
 6445	/* We can't gro_normal_list() here, because napi->poll() might have
 6446	 * rearmed the napi (napi_complete_done()) in which case it could
 6447	 * already be running on another CPU.
 6448	 */
 6449	trace_napi_poll(napi, rc, budget);
 6450	netpoll_poll_unlock(have_poll_lock);
 6451	if (rc == budget)
 6452		__busy_poll_stop(napi, skip_schedule);
 6453	bpf_net_ctx_clear(bpf_net_ctx);
 6454	local_bh_enable();
 
 
 6455}
 6456
 6457static void __napi_busy_loop(unsigned int napi_id,
 6458		      bool (*loop_end)(void *, unsigned long),
 6459		      void *loop_end_arg, unsigned flags, u16 budget)
 6460{
 6461	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6462	int (*napi_poll)(struct napi_struct *napi, int budget);
 6463	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 6464	void *have_poll_lock = NULL;
 6465	struct napi_struct *napi;
 6466
 6467	WARN_ON_ONCE(!rcu_read_lock_held());
 6468
 6469restart:
 
 6470	napi_poll = NULL;
 6471
 6472	napi = napi_by_id(napi_id);
 
 
 6473	if (!napi)
 6474		return;
 6475
 6476	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6477		preempt_disable();
 6478	for (;;) {
 6479		int work = 0;
 6480
 
 
 
 6481		local_bh_disable();
 6482		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 
 
 
 6483		if (!napi_poll) {
 6484			unsigned long val = READ_ONCE(napi->state);
 6485
 6486			/* If multiple threads are competing for this napi,
 6487			 * we avoid dirtying napi->state as much as we can.
 6488			 */
 6489			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6490				   NAPIF_STATE_IN_BUSY_POLL)) {
 6491				if (flags & NAPI_F_PREFER_BUSY_POLL)
 6492					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6493				goto count;
 6494			}
 6495			if (cmpxchg(&napi->state, val,
 6496				    val | NAPIF_STATE_IN_BUSY_POLL |
 6497					  NAPIF_STATE_SCHED) != val) {
 6498				if (flags & NAPI_F_PREFER_BUSY_POLL)
 6499					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6500				goto count;
 6501			}
 6502			have_poll_lock = netpoll_poll_lock(napi);
 6503			napi_poll = napi->poll;
 6504		}
 6505		work = napi_poll(napi, budget);
 6506		trace_napi_poll(napi, work, budget);
 6507		gro_normal_list(napi);
 6508count:
 6509		if (work > 0)
 6510			__NET_ADD_STATS(dev_net(napi->dev),
 6511					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6512		skb_defer_free_flush(this_cpu_ptr(&softnet_data));
 6513		bpf_net_ctx_clear(bpf_net_ctx);
 6514		local_bh_enable();
 6515
 6516		if (!loop_end || loop_end(loop_end_arg, start_time))
 
 
 
 
 6517			break;
 6518
 6519		if (unlikely(need_resched())) {
 6520			if (flags & NAPI_F_END_ON_RESCHED)
 6521				break;
 6522			if (napi_poll)
 6523				busy_poll_stop(napi, have_poll_lock, flags, budget);
 6524			if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6525				preempt_enable();
 6526			rcu_read_unlock();
 6527			cond_resched();
 6528			rcu_read_lock();
 6529			if (loop_end(loop_end_arg, start_time))
 6530				return;
 6531			goto restart;
 6532		}
 6533		cpu_relax();
 6534	}
 6535	if (napi_poll)
 6536		busy_poll_stop(napi, have_poll_lock, flags, budget);
 6537	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 6538		preempt_enable();
 6539}
 6540
 6541void napi_busy_loop_rcu(unsigned int napi_id,
 6542			bool (*loop_end)(void *, unsigned long),
 6543			void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 6544{
 6545	unsigned flags = NAPI_F_END_ON_RESCHED;
 6546
 6547	if (prefer_busy_poll)
 6548		flags |= NAPI_F_PREFER_BUSY_POLL;
 6549
 6550	__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
 6551}
 6552
 6553void napi_busy_loop(unsigned int napi_id,
 6554		    bool (*loop_end)(void *, unsigned long),
 6555		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 6556{
 6557	unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
 6558
 6559	rcu_read_lock();
 6560	__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
 6561	rcu_read_unlock();
 6562}
 6563EXPORT_SYMBOL(napi_busy_loop);
 6564
 6565void napi_suspend_irqs(unsigned int napi_id)
 6566{
 6567	struct napi_struct *napi;
 6568
 6569	rcu_read_lock();
 6570	napi = napi_by_id(napi_id);
 6571	if (napi) {
 6572		unsigned long timeout = napi_get_irq_suspend_timeout(napi);
 6573
 6574		if (timeout)
 6575			hrtimer_start(&napi->timer, ns_to_ktime(timeout),
 6576				      HRTIMER_MODE_REL_PINNED);
 6577	}
 6578	rcu_read_unlock();
 6579}
 6580
 6581void napi_resume_irqs(unsigned int napi_id)
 6582{
 6583	struct napi_struct *napi;
 6584
 6585	rcu_read_lock();
 6586	napi = napi_by_id(napi_id);
 6587	if (napi) {
 6588		/* If irq_suspend_timeout is set to 0 between the call to
 6589		 * napi_suspend_irqs and now, the original value still
 6590		 * determines the safety timeout as intended and napi_watchdog
 6591		 * will resume irq processing.
 6592		 */
 6593		if (napi_get_irq_suspend_timeout(napi)) {
 6594			local_bh_disable();
 6595			napi_schedule(napi);
 6596			local_bh_enable();
 6597		}
 6598	}
 6599	rcu_read_unlock();
 
 6600}
 
 6601
 6602#endif /* CONFIG_NET_RX_BUSY_POLL */
 6603
 6604static void __napi_hash_add_with_id(struct napi_struct *napi,
 6605				    unsigned int napi_id)
 6606{
 6607	WRITE_ONCE(napi->napi_id, napi_id);
 6608	hlist_add_head_rcu(&napi->napi_hash_node,
 6609			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6610}
 6611
 6612static void napi_hash_add_with_id(struct napi_struct *napi,
 6613				  unsigned int napi_id)
 6614{
 6615	unsigned long flags;
 6616
 6617	spin_lock_irqsave(&napi_hash_lock, flags);
 6618	WARN_ON_ONCE(napi_by_id(napi_id));
 6619	__napi_hash_add_with_id(napi, napi_id);
 6620	spin_unlock_irqrestore(&napi_hash_lock, flags);
 6621}
 6622
 6623static void napi_hash_add(struct napi_struct *napi)
 6624{
 6625	unsigned long flags;
 6626
 6627	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
 6628		return;
 6629
 6630	spin_lock_irqsave(&napi_hash_lock, flags);
 6631
 6632	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6633	do {
 6634		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6635			napi_gen_id = MIN_NAPI_ID;
 6636	} while (napi_by_id(napi_gen_id));
 
 6637
 6638	__napi_hash_add_with_id(napi, napi_gen_id);
 
 6639
 6640	spin_unlock_irqrestore(&napi_hash_lock, flags);
 6641}
 6642
 6643/* Warning : caller is responsible to make sure rcu grace period
 6644 * is respected before freeing memory containing @napi
 6645 */
 6646static void napi_hash_del(struct napi_struct *napi)
 6647{
 6648	unsigned long flags;
 6649
 6650	spin_lock_irqsave(&napi_hash_lock, flags);
 6651
 6652	hlist_del_init_rcu(&napi->napi_hash_node);
 6653
 6654	spin_unlock_irqrestore(&napi_hash_lock, flags);
 
 
 
 6655}
 
 6656
 6657static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6658{
 6659	struct napi_struct *napi;
 6660
 6661	napi = container_of(timer, struct napi_struct, timer);
 6662
 6663	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6664	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6665	 */
 6666	if (!napi_disable_pending(napi) &&
 6667	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
 6668		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6669		__napi_schedule_irqoff(napi);
 6670	}
 6671
 6672	return HRTIMER_NORESTART;
 6673}
 6674
 6675static void init_gro_hash(struct napi_struct *napi)
 6676{
 6677	int i;
 6678
 6679	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6680		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6681		napi->gro_hash[i].count = 0;
 6682	}
 6683	napi->gro_bitmask = 0;
 6684}
 6685
 6686int dev_set_threaded(struct net_device *dev, bool threaded)
 6687{
 6688	struct napi_struct *napi;
 6689	int err = 0;
 6690
 6691	if (dev->threaded == threaded)
 6692		return 0;
 6693
 6694	if (threaded) {
 6695		list_for_each_entry(napi, &dev->napi_list, dev_list) {
 6696			if (!napi->thread) {
 6697				err = napi_kthread_create(napi);
 6698				if (err) {
 6699					threaded = false;
 6700					break;
 6701				}
 6702			}
 6703		}
 6704	}
 6705
 6706	WRITE_ONCE(dev->threaded, threaded);
 6707
 6708	/* Make sure kthread is created before THREADED bit
 6709	 * is set.
 6710	 */
 6711	smp_mb__before_atomic();
 6712
 6713	/* Setting/unsetting threaded mode on a napi might not immediately
 6714	 * take effect, if the current napi instance is actively being
 6715	 * polled. In this case, the switch between threaded mode and
 6716	 * softirq mode will happen in the next round of napi_schedule().
 6717	 * This should not cause hiccups/stalls to the live traffic.
 6718	 */
 6719	list_for_each_entry(napi, &dev->napi_list, dev_list)
 6720		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
 6721
 6722	return err;
 6723}
 6724EXPORT_SYMBOL(dev_set_threaded);
 6725
 6726/**
 6727 * netif_queue_set_napi - Associate queue with the napi
 6728 * @dev: device to which NAPI and queue belong
 6729 * @queue_index: Index of queue
 6730 * @type: queue type as RX or TX
 6731 * @napi: NAPI context, pass NULL to clear previously set NAPI
 6732 *
 6733 * Set queue with its corresponding napi context. This should be done after
 6734 * registering the NAPI handler for the queue-vector and the queues have been
 6735 * mapped to the corresponding interrupt vector.
 6736 */
 6737void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
 6738			  enum netdev_queue_type type, struct napi_struct *napi)
 6739{
 6740	struct netdev_rx_queue *rxq;
 6741	struct netdev_queue *txq;
 6742
 6743	if (WARN_ON_ONCE(napi && !napi->dev))
 6744		return;
 6745	if (dev->reg_state >= NETREG_REGISTERED)
 6746		ASSERT_RTNL();
 6747
 6748	switch (type) {
 6749	case NETDEV_QUEUE_TYPE_RX:
 6750		rxq = __netif_get_rx_queue(dev, queue_index);
 6751		rxq->napi = napi;
 6752		return;
 6753	case NETDEV_QUEUE_TYPE_TX:
 6754		txq = netdev_get_tx_queue(dev, queue_index);
 6755		txq->napi = napi;
 6756		return;
 6757	default:
 6758		return;
 6759	}
 6760}
 6761EXPORT_SYMBOL(netif_queue_set_napi);
 6762
 6763static void napi_restore_config(struct napi_struct *n)
 6764{
 6765	n->defer_hard_irqs = n->config->defer_hard_irqs;
 6766	n->gro_flush_timeout = n->config->gro_flush_timeout;
 6767	n->irq_suspend_timeout = n->config->irq_suspend_timeout;
 6768	/* a NAPI ID might be stored in the config, if so use it. if not, use
 6769	 * napi_hash_add to generate one for us. It will be saved to the config
 6770	 * in napi_disable.
 6771	 */
 6772	if (n->config->napi_id)
 6773		napi_hash_add_with_id(n, n->config->napi_id);
 6774	else
 6775		napi_hash_add(n);
 6776}
 6777
 6778static void napi_save_config(struct napi_struct *n)
 6779{
 6780	n->config->defer_hard_irqs = n->defer_hard_irqs;
 6781	n->config->gro_flush_timeout = n->gro_flush_timeout;
 6782	n->config->irq_suspend_timeout = n->irq_suspend_timeout;
 6783	n->config->napi_id = n->napi_id;
 6784	napi_hash_del(n);
 6785}
 6786
 6787void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 6788			   int (*poll)(struct napi_struct *, int), int weight)
 6789{
 6790	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
 6791		return;
 6792
 6793	INIT_LIST_HEAD(&napi->poll_list);
 6794	INIT_HLIST_NODE(&napi->napi_hash_node);
 6795	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6796	napi->timer.function = napi_watchdog;
 6797	init_gro_hash(napi);
 
 6798	napi->skb = NULL;
 6799	INIT_LIST_HEAD(&napi->rx_list);
 6800	napi->rx_count = 0;
 6801	napi->poll = poll;
 6802	if (weight > NAPI_POLL_WEIGHT)
 6803		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6804				weight);
 6805	napi->weight = weight;
 
 6806	napi->dev = dev;
 6807#ifdef CONFIG_NETPOLL
 6808	napi->poll_owner = -1;
 6809#endif
 6810	napi->list_owner = -1;
 6811	set_bit(NAPI_STATE_SCHED, &napi->state);
 6812	set_bit(NAPI_STATE_NPSVC, &napi->state);
 6813	list_add_rcu(&napi->dev_list, &dev->napi_list);
 6814
 6815	/* default settings from sysfs are applied to all NAPIs. any per-NAPI
 6816	 * configuration will be loaded in napi_enable
 6817	 */
 6818	napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
 6819	napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
 6820
 6821	napi_get_frags_check(napi);
 6822	/* Create kthread for this napi if dev->threaded is set.
 6823	 * Clear dev->threaded if kthread creation failed so that
 6824	 * threaded mode will not be enabled in napi_enable().
 6825	 */
 6826	if (dev->threaded && napi_kthread_create(napi))
 6827		dev->threaded = false;
 6828	netif_napi_set_irq(napi, -1);
 6829}
 6830EXPORT_SYMBOL(netif_napi_add_weight);
 6831
 6832void napi_disable(struct napi_struct *n)
 6833{
 6834	unsigned long val, new;
 6835
 6836	might_sleep();
 6837	set_bit(NAPI_STATE_DISABLE, &n->state);
 6838
 6839	val = READ_ONCE(n->state);
 6840	do {
 6841		while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
 6842			usleep_range(20, 200);
 6843			val = READ_ONCE(n->state);
 6844		}
 6845
 6846		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
 6847		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
 6848	} while (!try_cmpxchg(&n->state, &val, new));
 6849
 6850	hrtimer_cancel(&n->timer);
 6851
 6852	if (n->config)
 6853		napi_save_config(n);
 6854	else
 6855		napi_hash_del(n);
 6856
 6857	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6858}
 6859EXPORT_SYMBOL(napi_disable);
 6860
 6861/**
 6862 *	napi_enable - enable NAPI scheduling
 6863 *	@n: NAPI context
 6864 *
 6865 * Resume NAPI from being scheduled on this context.
 6866 * Must be paired with napi_disable.
 6867 */
 6868void napi_enable(struct napi_struct *n)
 6869{
 6870	unsigned long new, val = READ_ONCE(n->state);
 6871
 6872	if (n->config)
 6873		napi_restore_config(n);
 6874	else
 6875		napi_hash_add(n);
 6876
 6877	do {
 6878		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
 6879
 6880		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
 6881		if (n->dev->threaded && n->thread)
 6882			new |= NAPIF_STATE_THREADED;
 6883	} while (!try_cmpxchg(&n->state, &val, new));
 6884}
 6885EXPORT_SYMBOL(napi_enable);
 6886
 6887static void flush_gro_hash(struct napi_struct *napi)
 6888{
 6889	int i;
 6890
 6891	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6892		struct sk_buff *skb, *n;
 6893
 6894		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6895			kfree_skb(skb);
 6896		napi->gro_hash[i].count = 0;
 6897	}
 6898}
 6899
 6900/* Must be called in process context */
 6901void __netif_napi_del(struct napi_struct *napi)
 6902{
 6903	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 6904		return;
 6905
 6906	if (napi->config) {
 6907		napi->index = -1;
 6908		napi->config = NULL;
 6909	}
 6910
 6911	list_del_rcu(&napi->dev_list);
 6912	napi_free_frags(napi);
 6913
 6914	flush_gro_hash(napi);
 6915	napi->gro_bitmask = 0;
 6916
 6917	if (napi->thread) {
 6918		kthread_stop(napi->thread);
 6919		napi->thread = NULL;
 6920	}
 6921}
 6922EXPORT_SYMBOL(__netif_napi_del);
 6923
 6924static int __napi_poll(struct napi_struct *n, bool *repoll)
 6925{
 
 6926	int work, weight;
 6927
 
 
 
 
 6928	weight = n->weight;
 6929
 6930	/* This NAPI_STATE_SCHED test is for avoiding a race
 6931	 * with netpoll's poll_napi().  Only the entity which
 6932	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6933	 * actually make the ->poll() call.  Therefore we avoid
 6934	 * accidentally calling ->poll() when NAPI is not scheduled.
 6935	 */
 6936	work = 0;
 6937	if (napi_is_scheduled(n)) {
 6938		work = n->poll(n, weight);
 6939		trace_napi_poll(n, work, weight);
 6940
 6941		xdp_do_check_flushed(n);
 6942	}
 6943
 6944	if (unlikely(work > weight))
 6945		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
 6946				n->poll, work, weight);
 6947
 6948	if (likely(work < weight))
 6949		return work;
 6950
 6951	/* Drivers must not modify the NAPI state if they
 6952	 * consume the entire weight.  In such cases this code
 6953	 * still "owns" the NAPI instance and therefore can
 6954	 * move the instance around on the list at-will.
 6955	 */
 6956	if (unlikely(napi_disable_pending(n))) {
 6957		napi_complete(n);
 6958		return work;
 6959	}
 6960
 6961	/* The NAPI context has more processing work, but busy-polling
 6962	 * is preferred. Exit early.
 6963	 */
 6964	if (napi_prefer_busy_poll(n)) {
 6965		if (napi_complete_done(n, work)) {
 6966			/* If timeout is not set, we need to make sure
 6967			 * that the NAPI is re-scheduled.
 6968			 */
 6969			napi_schedule(n);
 6970		}
 6971		return work;
 6972	}
 6973
 6974	if (n->gro_bitmask) {
 6975		/* flush too old packets
 6976		 * If HZ < 1000, flush all packets.
 6977		 */
 6978		napi_gro_flush(n, HZ >= 1000);
 6979	}
 6980
 6981	gro_normal_list(n);
 6982
 6983	/* Some drivers may have called napi_schedule
 6984	 * prior to exhausting their budget.
 6985	 */
 6986	if (unlikely(!list_empty(&n->poll_list))) {
 6987		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6988			     n->dev ? n->dev->name : "backlog");
 6989		return work;
 6990	}
 6991
 6992	*repoll = true;
 6993
 6994	return work;
 6995}
 6996
 6997static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6998{
 6999	bool do_repoll = false;
 7000	void *have;
 7001	int work;
 7002
 7003	list_del_init(&n->poll_list);
 7004
 7005	have = netpoll_poll_lock(n);
 7006
 7007	work = __napi_poll(n, &do_repoll);
 7008
 7009	if (do_repoll)
 7010		list_add_tail(&n->poll_list, repoll);
 7011
 
 7012	netpoll_poll_unlock(have);
 7013
 7014	return work;
 7015}
 7016
 7017static int napi_thread_wait(struct napi_struct *napi)
 7018{
 7019	set_current_state(TASK_INTERRUPTIBLE);
 7020
 7021	while (!kthread_should_stop()) {
 7022		/* Testing SCHED_THREADED bit here to make sure the current
 7023		 * kthread owns this napi and could poll on this napi.
 7024		 * Testing SCHED bit is not enough because SCHED bit might be
 7025		 * set by some other busy poll thread or by napi_disable().
 7026		 */
 7027		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
 7028			WARN_ON(!list_empty(&napi->poll_list));
 7029			__set_current_state(TASK_RUNNING);
 7030			return 0;
 7031		}
 7032
 7033		schedule();
 7034		set_current_state(TASK_INTERRUPTIBLE);
 7035	}
 7036	__set_current_state(TASK_RUNNING);
 7037
 7038	return -1;
 7039}
 7040
 7041static void napi_threaded_poll_loop(struct napi_struct *napi)
 7042{
 7043	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 7044	struct softnet_data *sd;
 7045	unsigned long last_qs = jiffies;
 7046
 7047	for (;;) {
 7048		bool repoll = false;
 7049		void *have;
 7050
 7051		local_bh_disable();
 7052		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 7053
 7054		sd = this_cpu_ptr(&softnet_data);
 7055		sd->in_napi_threaded_poll = true;
 7056
 7057		have = netpoll_poll_lock(napi);
 7058		__napi_poll(napi, &repoll);
 7059		netpoll_poll_unlock(have);
 7060
 7061		sd->in_napi_threaded_poll = false;
 7062		barrier();
 7063
 7064		if (sd_has_rps_ipi_waiting(sd)) {
 7065			local_irq_disable();
 7066			net_rps_action_and_irq_enable(sd);
 7067		}
 7068		skb_defer_free_flush(sd);
 7069		bpf_net_ctx_clear(bpf_net_ctx);
 7070		local_bh_enable();
 7071
 7072		if (!repoll)
 7073			break;
 7074
 7075		rcu_softirq_qs_periodic(last_qs);
 7076		cond_resched();
 7077	}
 7078}
 7079
 7080static int napi_threaded_poll(void *data)
 7081{
 7082	struct napi_struct *napi = data;
 7083
 7084	while (!napi_thread_wait(napi))
 7085		napi_threaded_poll_loop(napi);
 7086
 7087	return 0;
 7088}
 7089
 7090static __latent_entropy void net_rx_action(void)
 7091{
 7092	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 7093	unsigned long time_limit = jiffies +
 7094		usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
 7095	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 7096	int budget = READ_ONCE(net_hotdata.netdev_budget);
 7097	LIST_HEAD(list);
 7098	LIST_HEAD(repoll);
 7099
 7100	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 7101start:
 7102	sd->in_net_rx_action = true;
 7103	local_irq_disable();
 7104	list_splice_init(&sd->poll_list, &list);
 7105	local_irq_enable();
 7106
 7107	for (;;) {
 7108		struct napi_struct *n;
 7109
 7110		skb_defer_free_flush(sd);
 7111
 7112		if (list_empty(&list)) {
 7113			if (list_empty(&repoll)) {
 7114				sd->in_net_rx_action = false;
 7115				barrier();
 7116				/* We need to check if ____napi_schedule()
 7117				 * had refilled poll_list while
 7118				 * sd->in_net_rx_action was true.
 7119				 */
 7120				if (!list_empty(&sd->poll_list))
 7121					goto start;
 7122				if (!sd_has_rps_ipi_waiting(sd))
 7123					goto end;
 7124			}
 7125			break;
 7126		}
 7127
 7128		n = list_first_entry(&list, struct napi_struct, poll_list);
 7129		budget -= napi_poll(n, &repoll);
 7130
 7131		/* If softirq window is exhausted then punt.
 7132		 * Allow this to run for 2 jiffies since which will allow
 7133		 * an average latency of 1.5/HZ.
 7134		 */
 7135		if (unlikely(budget <= 0 ||
 7136			     time_after_eq(jiffies, time_limit))) {
 7137			sd->time_squeeze++;
 7138			break;
 7139		}
 7140	}
 7141
 7142	local_irq_disable();
 7143
 7144	list_splice_tail_init(&sd->poll_list, &list);
 7145	list_splice_tail(&repoll, &list);
 7146	list_splice(&list, &sd->poll_list);
 7147	if (!list_empty(&sd->poll_list))
 7148		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 7149	else
 7150		sd->in_net_rx_action = false;
 7151
 7152	net_rps_action_and_irq_enable(sd);
 7153end:
 7154	bpf_net_ctx_clear(bpf_net_ctx);
 7155}
 7156
 7157struct netdev_adjacent {
 7158	struct net_device *dev;
 7159	netdevice_tracker dev_tracker;
 7160
 7161	/* upper master flag, there can only be one master device per list */
 7162	bool master;
 7163
 7164	/* lookup ignore flag */
 7165	bool ignore;
 7166
 7167	/* counter for the number of times this device was added to us */
 7168	u16 ref_nr;
 7169
 7170	/* private field for the users */
 7171	void *private;
 7172
 7173	struct list_head list;
 7174	struct rcu_head rcu;
 7175};
 7176
 7177static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 7178						 struct list_head *adj_list)
 7179{
 7180	struct netdev_adjacent *adj;
 7181
 7182	list_for_each_entry(adj, adj_list, list) {
 7183		if (adj->dev == adj_dev)
 7184			return adj;
 7185	}
 7186	return NULL;
 7187}
 7188
 7189static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 7190				    struct netdev_nested_priv *priv)
 7191{
 7192	struct net_device *dev = (struct net_device *)priv->data;
 7193
 7194	return upper_dev == dev;
 7195}
 7196
 7197/**
 7198 * netdev_has_upper_dev - Check if device is linked to an upper device
 7199 * @dev: device
 7200 * @upper_dev: upper device to check
 7201 *
 7202 * Find out if a device is linked to specified upper device and return true
 7203 * in case it is. Note that this checks only immediate upper device,
 7204 * not through a complete stack of devices. The caller must hold the RTNL lock.
 7205 */
 7206bool netdev_has_upper_dev(struct net_device *dev,
 7207			  struct net_device *upper_dev)
 7208{
 7209	struct netdev_nested_priv priv = {
 7210		.data = (void *)upper_dev,
 7211	};
 7212
 7213	ASSERT_RTNL();
 7214
 7215	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 7216					     &priv);
 7217}
 7218EXPORT_SYMBOL(netdev_has_upper_dev);
 7219
 7220/**
 7221 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 7222 * @dev: device
 7223 * @upper_dev: upper device to check
 7224 *
 7225 * Find out if a device is linked to specified upper device and return true
 7226 * in case it is. Note that this checks the entire upper device chain.
 7227 * The caller must hold rcu lock.
 7228 */
 7229
 7230bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 7231				  struct net_device *upper_dev)
 7232{
 7233	struct netdev_nested_priv priv = {
 7234		.data = (void *)upper_dev,
 7235	};
 7236
 7237	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 7238					       &priv);
 7239}
 7240EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 7241
 7242/**
 7243 * netdev_has_any_upper_dev - Check if device is linked to some device
 7244 * @dev: device
 7245 *
 7246 * Find out if a device is linked to an upper device and return true in case
 7247 * it is. The caller must hold the RTNL lock.
 7248 */
 7249bool netdev_has_any_upper_dev(struct net_device *dev)
 7250{
 7251	ASSERT_RTNL();
 7252
 7253	return !list_empty(&dev->adj_list.upper);
 7254}
 7255EXPORT_SYMBOL(netdev_has_any_upper_dev);
 7256
 7257/**
 7258 * netdev_master_upper_dev_get - Get master upper device
 7259 * @dev: device
 7260 *
 7261 * Find a master upper device and return pointer to it or NULL in case
 7262 * it's not there. The caller must hold the RTNL lock.
 7263 */
 7264struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 7265{
 7266	struct netdev_adjacent *upper;
 7267
 7268	ASSERT_RTNL();
 7269
 7270	if (list_empty(&dev->adj_list.upper))
 7271		return NULL;
 7272
 7273	upper = list_first_entry(&dev->adj_list.upper,
 7274				 struct netdev_adjacent, list);
 7275	if (likely(upper->master))
 7276		return upper->dev;
 7277	return NULL;
 7278}
 7279EXPORT_SYMBOL(netdev_master_upper_dev_get);
 7280
 7281static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 7282{
 7283	struct netdev_adjacent *upper;
 7284
 7285	ASSERT_RTNL();
 7286
 7287	if (list_empty(&dev->adj_list.upper))
 7288		return NULL;
 7289
 7290	upper = list_first_entry(&dev->adj_list.upper,
 7291				 struct netdev_adjacent, list);
 7292	if (likely(upper->master) && !upper->ignore)
 7293		return upper->dev;
 7294	return NULL;
 7295}
 7296
 7297/**
 7298 * netdev_has_any_lower_dev - Check if device is linked to some device
 7299 * @dev: device
 7300 *
 7301 * Find out if a device is linked to a lower device and return true in case
 7302 * it is. The caller must hold the RTNL lock.
 7303 */
 7304static bool netdev_has_any_lower_dev(struct net_device *dev)
 7305{
 7306	ASSERT_RTNL();
 7307
 7308	return !list_empty(&dev->adj_list.lower);
 7309}
 7310
 7311void *netdev_adjacent_get_private(struct list_head *adj_list)
 7312{
 7313	struct netdev_adjacent *adj;
 7314
 7315	adj = list_entry(adj_list, struct netdev_adjacent, list);
 7316
 7317	return adj->private;
 7318}
 7319EXPORT_SYMBOL(netdev_adjacent_get_private);
 7320
 7321/**
 7322 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 7323 * @dev: device
 7324 * @iter: list_head ** of the current position
 7325 *
 7326 * Gets the next device from the dev's upper list, starting from iter
 7327 * position. The caller must hold RCU read lock.
 7328 */
 7329struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 7330						 struct list_head **iter)
 7331{
 7332	struct netdev_adjacent *upper;
 7333
 7334	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7335
 7336	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7337
 7338	if (&upper->list == &dev->adj_list.upper)
 7339		return NULL;
 7340
 7341	*iter = &upper->list;
 7342
 7343	return upper->dev;
 7344}
 7345EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 7346
 7347static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 7348						  struct list_head **iter,
 7349						  bool *ignore)
 7350{
 7351	struct netdev_adjacent *upper;
 7352
 7353	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 7354
 7355	if (&upper->list == &dev->adj_list.upper)
 7356		return NULL;
 7357
 7358	*iter = &upper->list;
 7359	*ignore = upper->ignore;
 7360
 7361	return upper->dev;
 7362}
 7363
 7364static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 7365						    struct list_head **iter)
 7366{
 7367	struct netdev_adjacent *upper;
 7368
 7369	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7370
 7371	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7372
 7373	if (&upper->list == &dev->adj_list.upper)
 7374		return NULL;
 7375
 7376	*iter = &upper->list;
 7377
 7378	return upper->dev;
 7379}
 7380
 7381static int __netdev_walk_all_upper_dev(struct net_device *dev,
 7382				       int (*fn)(struct net_device *dev,
 7383					 struct netdev_nested_priv *priv),
 7384				       struct netdev_nested_priv *priv)
 7385{
 7386	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7387	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7388	int ret, cur = 0;
 7389	bool ignore;
 7390
 7391	now = dev;
 7392	iter = &dev->adj_list.upper;
 7393
 7394	while (1) {
 7395		if (now != dev) {
 7396			ret = fn(now, priv);
 7397			if (ret)
 7398				return ret;
 7399		}
 7400
 7401		next = NULL;
 7402		while (1) {
 7403			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7404			if (!udev)
 7405				break;
 7406			if (ignore)
 7407				continue;
 7408
 7409			next = udev;
 7410			niter = &udev->adj_list.upper;
 7411			dev_stack[cur] = now;
 7412			iter_stack[cur++] = iter;
 7413			break;
 7414		}
 7415
 7416		if (!next) {
 7417			if (!cur)
 7418				return 0;
 7419			next = dev_stack[--cur];
 7420			niter = iter_stack[cur];
 7421		}
 7422
 7423		now = next;
 7424		iter = niter;
 7425	}
 7426
 7427	return 0;
 7428}
 7429
 7430int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7431				  int (*fn)(struct net_device *dev,
 7432					    struct netdev_nested_priv *priv),
 7433				  struct netdev_nested_priv *priv)
 7434{
 7435	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7436	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7437	int ret, cur = 0;
 7438
 7439	now = dev;
 7440	iter = &dev->adj_list.upper;
 7441
 7442	while (1) {
 7443		if (now != dev) {
 7444			ret = fn(now, priv);
 7445			if (ret)
 7446				return ret;
 7447		}
 7448
 7449		next = NULL;
 7450		while (1) {
 7451			udev = netdev_next_upper_dev_rcu(now, &iter);
 7452			if (!udev)
 7453				break;
 7454
 7455			next = udev;
 7456			niter = &udev->adj_list.upper;
 7457			dev_stack[cur] = now;
 7458			iter_stack[cur++] = iter;
 7459			break;
 7460		}
 7461
 7462		if (!next) {
 7463			if (!cur)
 7464				return 0;
 7465			next = dev_stack[--cur];
 7466			niter = iter_stack[cur];
 7467		}
 
 
 7468
 7469		now = next;
 7470		iter = niter;
 
 
 7471	}
 7472
 7473	return 0;
 7474}
 7475EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 7476
 7477static bool __netdev_has_upper_dev(struct net_device *dev,
 7478				   struct net_device *upper_dev)
 7479{
 7480	struct netdev_nested_priv priv = {
 7481		.flags = 0,
 7482		.data = (void *)upper_dev,
 7483	};
 7484
 7485	ASSERT_RTNL();
 7486
 7487	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7488					   &priv);
 7489}
 7490
 7491/**
 7492 * netdev_lower_get_next_private - Get the next ->private from the
 7493 *				   lower neighbour list
 7494 * @dev: device
 7495 * @iter: list_head ** of the current position
 7496 *
 7497 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7498 * list, starting from iter position. The caller must hold either hold the
 7499 * RTNL lock or its own locking that guarantees that the neighbour lower
 7500 * list will remain unchanged.
 7501 */
 7502void *netdev_lower_get_next_private(struct net_device *dev,
 7503				    struct list_head **iter)
 7504{
 7505	struct netdev_adjacent *lower;
 7506
 7507	lower = list_entry(*iter, struct netdev_adjacent, list);
 7508
 7509	if (&lower->list == &dev->adj_list.lower)
 7510		return NULL;
 7511
 7512	*iter = lower->list.next;
 7513
 7514	return lower->private;
 7515}
 7516EXPORT_SYMBOL(netdev_lower_get_next_private);
 7517
 7518/**
 7519 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7520 *				       lower neighbour list, RCU
 7521 *				       variant
 7522 * @dev: device
 7523 * @iter: list_head ** of the current position
 7524 *
 7525 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7526 * list, starting from iter position. The caller must hold RCU read lock.
 7527 */
 7528void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7529					struct list_head **iter)
 7530{
 7531	struct netdev_adjacent *lower;
 7532
 7533	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
 7534
 7535	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7536
 7537	if (&lower->list == &dev->adj_list.lower)
 7538		return NULL;
 7539
 7540	*iter = &lower->list;
 7541
 7542	return lower->private;
 7543}
 7544EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7545
 7546/**
 7547 * netdev_lower_get_next - Get the next device from the lower neighbour
 7548 *                         list
 7549 * @dev: device
 7550 * @iter: list_head ** of the current position
 7551 *
 7552 * Gets the next netdev_adjacent from the dev's lower neighbour
 7553 * list, starting from iter position. The caller must hold RTNL lock or
 7554 * its own locking that guarantees that the neighbour lower
 7555 * list will remain unchanged.
 7556 */
 7557void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7558{
 7559	struct netdev_adjacent *lower;
 7560
 7561	lower = list_entry(*iter, struct netdev_adjacent, list);
 7562
 7563	if (&lower->list == &dev->adj_list.lower)
 7564		return NULL;
 7565
 7566	*iter = lower->list.next;
 7567
 7568	return lower->dev;
 7569}
 7570EXPORT_SYMBOL(netdev_lower_get_next);
 7571
 7572static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7573						struct list_head **iter)
 7574{
 7575	struct netdev_adjacent *lower;
 7576
 7577	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7578
 7579	if (&lower->list == &dev->adj_list.lower)
 7580		return NULL;
 7581
 7582	*iter = &lower->list;
 7583
 7584	return lower->dev;
 7585}
 7586
 7587static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7588						  struct list_head **iter,
 7589						  bool *ignore)
 7590{
 7591	struct netdev_adjacent *lower;
 7592
 7593	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7594
 7595	if (&lower->list == &dev->adj_list.lower)
 7596		return NULL;
 7597
 7598	*iter = &lower->list;
 7599	*ignore = lower->ignore;
 7600
 7601	return lower->dev;
 7602}
 7603
 7604int netdev_walk_all_lower_dev(struct net_device *dev,
 7605			      int (*fn)(struct net_device *dev,
 7606					struct netdev_nested_priv *priv),
 7607			      struct netdev_nested_priv *priv)
 7608{
 7609	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7610	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7611	int ret, cur = 0;
 7612
 7613	now = dev;
 7614	iter = &dev->adj_list.lower;
 7615
 7616	while (1) {
 7617		if (now != dev) {
 7618			ret = fn(now, priv);
 7619			if (ret)
 7620				return ret;
 7621		}
 7622
 7623		next = NULL;
 7624		while (1) {
 7625			ldev = netdev_next_lower_dev(now, &iter);
 7626			if (!ldev)
 7627				break;
 7628
 7629			next = ldev;
 7630			niter = &ldev->adj_list.lower;
 7631			dev_stack[cur] = now;
 7632			iter_stack[cur++] = iter;
 7633			break;
 7634		}
 7635
 7636		if (!next) {
 7637			if (!cur)
 7638				return 0;
 7639			next = dev_stack[--cur];
 7640			niter = iter_stack[cur];
 7641		}
 
 
 7642
 7643		now = next;
 7644		iter = niter;
 
 
 7645	}
 7646
 7647	return 0;
 7648}
 7649EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7650
 7651static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7652				       int (*fn)(struct net_device *dev,
 7653					 struct netdev_nested_priv *priv),
 7654				       struct netdev_nested_priv *priv)
 7655{
 7656	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7657	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7658	int ret, cur = 0;
 7659	bool ignore;
 7660
 7661	now = dev;
 7662	iter = &dev->adj_list.lower;
 7663
 7664	while (1) {
 7665		if (now != dev) {
 7666			ret = fn(now, priv);
 7667			if (ret)
 7668				return ret;
 7669		}
 7670
 7671		next = NULL;
 7672		while (1) {
 7673			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7674			if (!ldev)
 7675				break;
 7676			if (ignore)
 7677				continue;
 7678
 7679			next = ldev;
 7680			niter = &ldev->adj_list.lower;
 7681			dev_stack[cur] = now;
 7682			iter_stack[cur++] = iter;
 7683			break;
 7684		}
 7685
 7686		if (!next) {
 7687			if (!cur)
 7688				return 0;
 7689			next = dev_stack[--cur];
 7690			niter = iter_stack[cur];
 7691		}
 7692
 7693		now = next;
 7694		iter = niter;
 7695	}
 7696
 7697	return 0;
 7698}
 7699
 7700struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7701					     struct list_head **iter)
 7702{
 7703	struct netdev_adjacent *lower;
 7704
 7705	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7706	if (&lower->list == &dev->adj_list.lower)
 7707		return NULL;
 7708
 7709	*iter = &lower->list;
 7710
 7711	return lower->dev;
 7712}
 7713EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7714
 7715static u8 __netdev_upper_depth(struct net_device *dev)
 7716{
 7717	struct net_device *udev;
 7718	struct list_head *iter;
 7719	u8 max_depth = 0;
 7720	bool ignore;
 7721
 7722	for (iter = &dev->adj_list.upper,
 7723	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7724	     udev;
 7725	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7726		if (ignore)
 7727			continue;
 7728		if (max_depth < udev->upper_level)
 7729			max_depth = udev->upper_level;
 7730	}
 7731
 7732	return max_depth;
 7733}
 7734
 7735static u8 __netdev_lower_depth(struct net_device *dev)
 
 
 
 7736{
 7737	struct net_device *ldev;
 7738	struct list_head *iter;
 7739	u8 max_depth = 0;
 7740	bool ignore;
 7741
 7742	for (iter = &dev->adj_list.lower,
 7743	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7744	     ldev;
 7745	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7746		if (ignore)
 7747			continue;
 7748		if (max_depth < ldev->lower_level)
 7749			max_depth = ldev->lower_level;
 7750	}
 7751
 7752	return max_depth;
 7753}
 7754
 7755static int __netdev_update_upper_level(struct net_device *dev,
 7756				       struct netdev_nested_priv *__unused)
 7757{
 7758	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7759	return 0;
 7760}
 7761
 7762#ifdef CONFIG_LOCKDEP
 7763static LIST_HEAD(net_unlink_list);
 7764
 7765static void net_unlink_todo(struct net_device *dev)
 7766{
 7767	if (list_empty(&dev->unlink_list))
 7768		list_add_tail(&dev->unlink_list, &net_unlink_list);
 7769}
 7770#endif
 7771
 7772static int __netdev_update_lower_level(struct net_device *dev,
 7773				       struct netdev_nested_priv *priv)
 7774{
 7775	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7776
 7777#ifdef CONFIG_LOCKDEP
 7778	if (!priv)
 7779		return 0;
 7780
 7781	if (priv->flags & NESTED_SYNC_IMM)
 7782		dev->nested_level = dev->lower_level - 1;
 7783	if (priv->flags & NESTED_SYNC_TODO)
 7784		net_unlink_todo(dev);
 7785#endif
 7786	return 0;
 7787}
 7788
 7789int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7790				  int (*fn)(struct net_device *dev,
 7791					    struct netdev_nested_priv *priv),
 7792				  struct netdev_nested_priv *priv)
 7793{
 7794	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7795	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7796	int ret, cur = 0;
 7797
 7798	now = dev;
 7799	iter = &dev->adj_list.lower;
 7800
 7801	while (1) {
 7802		if (now != dev) {
 7803			ret = fn(now, priv);
 7804			if (ret)
 7805				return ret;
 7806		}
 7807
 7808		next = NULL;
 7809		while (1) {
 7810			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7811			if (!ldev)
 7812				break;
 7813
 7814			next = ldev;
 7815			niter = &ldev->adj_list.lower;
 7816			dev_stack[cur] = now;
 7817			iter_stack[cur++] = iter;
 7818			break;
 7819		}
 7820
 7821		if (!next) {
 7822			if (!cur)
 7823				return 0;
 7824			next = dev_stack[--cur];
 7825			niter = iter_stack[cur];
 7826		}
 7827
 7828		now = next;
 7829		iter = niter;
 
 
 7830	}
 7831
 7832	return 0;
 7833}
 7834EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7835
 7836/**
 7837 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7838 *				       lower neighbour list, RCU
 7839 *				       variant
 7840 * @dev: device
 7841 *
 7842 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7843 * list. The caller must hold RCU read lock.
 7844 */
 7845void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7846{
 7847	struct netdev_adjacent *lower;
 7848
 7849	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7850			struct netdev_adjacent, list);
 7851	if (lower)
 7852		return lower->private;
 7853	return NULL;
 7854}
 7855EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7856
 7857/**
 7858 * netdev_master_upper_dev_get_rcu - Get master upper device
 7859 * @dev: device
 7860 *
 7861 * Find a master upper device and return pointer to it or NULL in case
 7862 * it's not there. The caller must hold the RCU read lock.
 7863 */
 7864struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7865{
 7866	struct netdev_adjacent *upper;
 7867
 7868	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7869				       struct netdev_adjacent, list);
 7870	if (upper && likely(upper->master))
 7871		return upper->dev;
 7872	return NULL;
 7873}
 7874EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7875
 7876static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7877			      struct net_device *adj_dev,
 7878			      struct list_head *dev_list)
 7879{
 7880	char linkname[IFNAMSIZ+7];
 7881
 7882	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7883		"upper_%s" : "lower_%s", adj_dev->name);
 7884	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7885				 linkname);
 7886}
 7887static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7888			       char *name,
 7889			       struct list_head *dev_list)
 7890{
 7891	char linkname[IFNAMSIZ+7];
 7892
 7893	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7894		"upper_%s" : "lower_%s", name);
 7895	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7896}
 7897
 7898static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7899						 struct net_device *adj_dev,
 7900						 struct list_head *dev_list)
 7901{
 7902	return (dev_list == &dev->adj_list.upper ||
 7903		dev_list == &dev->adj_list.lower) &&
 7904		net_eq(dev_net(dev), dev_net(adj_dev));
 7905}
 7906
 7907static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7908					struct net_device *adj_dev,
 7909					struct list_head *dev_list,
 7910					void *private, bool master)
 7911{
 7912	struct netdev_adjacent *adj;
 7913	int ret;
 7914
 7915	adj = __netdev_find_adj(adj_dev, dev_list);
 7916
 7917	if (adj) {
 7918		adj->ref_nr += 1;
 7919		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7920			 dev->name, adj_dev->name, adj->ref_nr);
 7921
 7922		return 0;
 7923	}
 7924
 7925	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7926	if (!adj)
 7927		return -ENOMEM;
 7928
 7929	adj->dev = adj_dev;
 7930	adj->master = master;
 7931	adj->ref_nr = 1;
 7932	adj->private = private;
 7933	adj->ignore = false;
 7934	netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
 7935
 7936	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7937		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7938
 7939	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7940		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7941		if (ret)
 7942			goto free_adj;
 7943	}
 7944
 7945	/* Ensure that master link is always the first item in list. */
 7946	if (master) {
 7947		ret = sysfs_create_link(&(dev->dev.kobj),
 7948					&(adj_dev->dev.kobj), "master");
 7949		if (ret)
 7950			goto remove_symlinks;
 7951
 7952		list_add_rcu(&adj->list, dev_list);
 7953	} else {
 7954		list_add_tail_rcu(&adj->list, dev_list);
 7955	}
 7956
 7957	return 0;
 7958
 7959remove_symlinks:
 7960	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7961		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7962free_adj:
 7963	netdev_put(adj_dev, &adj->dev_tracker);
 7964	kfree(adj);
 
 7965
 7966	return ret;
 7967}
 7968
 7969static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7970					 struct net_device *adj_dev,
 7971					 u16 ref_nr,
 7972					 struct list_head *dev_list)
 7973{
 7974	struct netdev_adjacent *adj;
 7975
 7976	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7977		 dev->name, adj_dev->name, ref_nr);
 7978
 7979	adj = __netdev_find_adj(adj_dev, dev_list);
 7980
 7981	if (!adj) {
 7982		pr_err("Adjacency does not exist for device %s from %s\n",
 7983		       dev->name, adj_dev->name);
 7984		WARN_ON(1);
 7985		return;
 7986	}
 7987
 7988	if (adj->ref_nr > ref_nr) {
 7989		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7990			 dev->name, adj_dev->name, ref_nr,
 7991			 adj->ref_nr - ref_nr);
 7992		adj->ref_nr -= ref_nr;
 7993		return;
 7994	}
 7995
 7996	if (adj->master)
 7997		sysfs_remove_link(&(dev->dev.kobj), "master");
 7998
 7999	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 8000		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 8001
 8002	list_del_rcu(&adj->list);
 8003	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 8004		 adj_dev->name, dev->name, adj_dev->name);
 8005	netdev_put(adj_dev, &adj->dev_tracker);
 8006	kfree_rcu(adj, rcu);
 8007}
 8008
 8009static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 8010					    struct net_device *upper_dev,
 8011					    struct list_head *up_list,
 8012					    struct list_head *down_list,
 8013					    void *private, bool master)
 8014{
 8015	int ret;
 8016
 8017	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 8018					   private, master);
 8019	if (ret)
 8020		return ret;
 8021
 8022	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 8023					   private, false);
 8024	if (ret) {
 8025		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 8026		return ret;
 8027	}
 8028
 8029	return 0;
 8030}
 8031
 8032static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 8033					       struct net_device *upper_dev,
 8034					       u16 ref_nr,
 8035					       struct list_head *up_list,
 8036					       struct list_head *down_list)
 8037{
 8038	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 8039	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 8040}
 8041
 8042static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 8043						struct net_device *upper_dev,
 8044						void *private, bool master)
 8045{
 8046	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 8047						&dev->adj_list.upper,
 8048						&upper_dev->adj_list.lower,
 8049						private, master);
 8050}
 8051
 8052static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 8053						   struct net_device *upper_dev)
 8054{
 8055	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 8056					   &dev->adj_list.upper,
 8057					   &upper_dev->adj_list.lower);
 8058}
 8059
 8060static int __netdev_upper_dev_link(struct net_device *dev,
 8061				   struct net_device *upper_dev, bool master,
 8062				   void *upper_priv, void *upper_info,
 8063				   struct netdev_nested_priv *priv,
 8064				   struct netlink_ext_ack *extack)
 8065{
 8066	struct netdev_notifier_changeupper_info changeupper_info = {
 8067		.info = {
 8068			.dev = dev,
 8069			.extack = extack,
 8070		},
 8071		.upper_dev = upper_dev,
 8072		.master = master,
 8073		.linking = true,
 8074		.upper_info = upper_info,
 8075	};
 8076	struct net_device *master_dev;
 8077	int ret = 0;
 8078
 8079	ASSERT_RTNL();
 8080
 8081	if (dev == upper_dev)
 8082		return -EBUSY;
 8083
 8084	/* To prevent loops, check if dev is not upper device to upper_dev. */
 8085	if (__netdev_has_upper_dev(upper_dev, dev))
 8086		return -EBUSY;
 8087
 8088	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 8089		return -EMLINK;
 8090
 8091	if (!master) {
 8092		if (__netdev_has_upper_dev(dev, upper_dev))
 8093			return -EEXIST;
 8094	} else {
 8095		master_dev = __netdev_master_upper_dev_get(dev);
 8096		if (master_dev)
 8097			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 8098	}
 8099
 8100	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 8101					    &changeupper_info.info);
 8102	ret = notifier_to_errno(ret);
 8103	if (ret)
 8104		return ret;
 8105
 8106	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 8107						   master);
 8108	if (ret)
 8109		return ret;
 8110
 8111	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 8112					    &changeupper_info.info);
 8113	ret = notifier_to_errno(ret);
 8114	if (ret)
 8115		goto rollback;
 8116
 8117	__netdev_update_upper_level(dev, NULL);
 8118	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 8119
 8120	__netdev_update_lower_level(upper_dev, priv);
 8121	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 8122				    priv);
 8123
 8124	return 0;
 8125
 8126rollback:
 8127	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 8128
 8129	return ret;
 8130}
 8131
 8132/**
 8133 * netdev_upper_dev_link - Add a link to the upper device
 8134 * @dev: device
 8135 * @upper_dev: new upper device
 8136 * @extack: netlink extended ack
 8137 *
 8138 * Adds a link to device which is upper to this one. The caller must hold
 8139 * the RTNL lock. On a failure a negative errno code is returned.
 8140 * On success the reference counts are adjusted and the function
 8141 * returns zero.
 8142 */
 8143int netdev_upper_dev_link(struct net_device *dev,
 8144			  struct net_device *upper_dev,
 8145			  struct netlink_ext_ack *extack)
 8146{
 8147	struct netdev_nested_priv priv = {
 8148		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 8149		.data = NULL,
 8150	};
 8151
 8152	return __netdev_upper_dev_link(dev, upper_dev, false,
 8153				       NULL, NULL, &priv, extack);
 8154}
 8155EXPORT_SYMBOL(netdev_upper_dev_link);
 8156
 8157/**
 8158 * netdev_master_upper_dev_link - Add a master link to the upper device
 8159 * @dev: device
 8160 * @upper_dev: new upper device
 8161 * @upper_priv: upper device private
 8162 * @upper_info: upper info to be passed down via notifier
 8163 * @extack: netlink extended ack
 8164 *
 8165 * Adds a link to device which is upper to this one. In this case, only
 8166 * one master upper device can be linked, although other non-master devices
 8167 * might be linked as well. The caller must hold the RTNL lock.
 8168 * On a failure a negative errno code is returned. On success the reference
 8169 * counts are adjusted and the function returns zero.
 8170 */
 8171int netdev_master_upper_dev_link(struct net_device *dev,
 8172				 struct net_device *upper_dev,
 8173				 void *upper_priv, void *upper_info,
 8174				 struct netlink_ext_ack *extack)
 8175{
 8176	struct netdev_nested_priv priv = {
 8177		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 8178		.data = NULL,
 8179	};
 8180
 8181	return __netdev_upper_dev_link(dev, upper_dev, true,
 8182				       upper_priv, upper_info, &priv, extack);
 8183}
 8184EXPORT_SYMBOL(netdev_master_upper_dev_link);
 8185
 8186static void __netdev_upper_dev_unlink(struct net_device *dev,
 8187				      struct net_device *upper_dev,
 8188				      struct netdev_nested_priv *priv)
 8189{
 8190	struct netdev_notifier_changeupper_info changeupper_info = {
 8191		.info = {
 8192			.dev = dev,
 8193		},
 8194		.upper_dev = upper_dev,
 8195		.linking = false,
 8196	};
 8197
 8198	ASSERT_RTNL();
 8199
 8200	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 8201
 8202	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 8203				      &changeupper_info.info);
 8204
 8205	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 8206
 8207	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 8208				      &changeupper_info.info);
 8209
 8210	__netdev_update_upper_level(dev, NULL);
 8211	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 8212
 8213	__netdev_update_lower_level(upper_dev, priv);
 8214	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 8215				    priv);
 8216}
 8217
 8218/**
 8219 * netdev_upper_dev_unlink - Removes a link to upper device
 8220 * @dev: device
 8221 * @upper_dev: new upper device
 8222 *
 8223 * Removes a link to device which is upper to this one. The caller must hold
 8224 * the RTNL lock.
 8225 */
 8226void netdev_upper_dev_unlink(struct net_device *dev,
 8227			     struct net_device *upper_dev)
 8228{
 8229	struct netdev_nested_priv priv = {
 8230		.flags = NESTED_SYNC_TODO,
 8231		.data = NULL,
 8232	};
 8233
 8234	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 8235}
 8236EXPORT_SYMBOL(netdev_upper_dev_unlink);
 8237
 8238static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 8239				      struct net_device *lower_dev,
 8240				      bool val)
 8241{
 8242	struct netdev_adjacent *adj;
 8243
 8244	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 8245	if (adj)
 8246		adj->ignore = val;
 8247
 8248	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 8249	if (adj)
 8250		adj->ignore = val;
 8251}
 8252
 8253static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 8254					struct net_device *lower_dev)
 8255{
 8256	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 8257}
 8258
 8259static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 8260				       struct net_device *lower_dev)
 8261{
 8262	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 8263}
 8264
 8265int netdev_adjacent_change_prepare(struct net_device *old_dev,
 8266				   struct net_device *new_dev,
 8267				   struct net_device *dev,
 8268				   struct netlink_ext_ack *extack)
 8269{
 8270	struct netdev_nested_priv priv = {
 8271		.flags = 0,
 8272		.data = NULL,
 8273	};
 8274	int err;
 8275
 8276	if (!new_dev)
 8277		return 0;
 8278
 8279	if (old_dev && new_dev != old_dev)
 8280		netdev_adjacent_dev_disable(dev, old_dev);
 8281	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 8282				      extack);
 8283	if (err) {
 8284		if (old_dev && new_dev != old_dev)
 8285			netdev_adjacent_dev_enable(dev, old_dev);
 8286		return err;
 8287	}
 8288
 8289	return 0;
 8290}
 8291EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 8292
 8293void netdev_adjacent_change_commit(struct net_device *old_dev,
 8294				   struct net_device *new_dev,
 8295				   struct net_device *dev)
 8296{
 8297	struct netdev_nested_priv priv = {
 8298		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 8299		.data = NULL,
 8300	};
 8301
 8302	if (!new_dev || !old_dev)
 8303		return;
 8304
 8305	if (new_dev == old_dev)
 8306		return;
 8307
 8308	netdev_adjacent_dev_enable(dev, old_dev);
 8309	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 8310}
 8311EXPORT_SYMBOL(netdev_adjacent_change_commit);
 8312
 8313void netdev_adjacent_change_abort(struct net_device *old_dev,
 8314				  struct net_device *new_dev,
 8315				  struct net_device *dev)
 8316{
 8317	struct netdev_nested_priv priv = {
 8318		.flags = 0,
 8319		.data = NULL,
 8320	};
 8321
 8322	if (!new_dev)
 8323		return;
 8324
 8325	if (old_dev && new_dev != old_dev)
 8326		netdev_adjacent_dev_enable(dev, old_dev);
 8327
 8328	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 
 8329}
 8330EXPORT_SYMBOL(netdev_adjacent_change_abort);
 8331
 8332/**
 8333 * netdev_bonding_info_change - Dispatch event about slave change
 8334 * @dev: device
 8335 * @bonding_info: info to dispatch
 8336 *
 8337 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 8338 * The caller must hold the RTNL lock.
 8339 */
 8340void netdev_bonding_info_change(struct net_device *dev,
 8341				struct netdev_bonding_info *bonding_info)
 8342{
 8343	struct netdev_notifier_bonding_info info = {
 8344		.info.dev = dev,
 8345	};
 8346
 8347	memcpy(&info.bonding_info, bonding_info,
 8348	       sizeof(struct netdev_bonding_info));
 8349	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 8350				      &info.info);
 8351}
 8352EXPORT_SYMBOL(netdev_bonding_info_change);
 8353
 8354static int netdev_offload_xstats_enable_l3(struct net_device *dev,
 8355					   struct netlink_ext_ack *extack)
 8356{
 8357	struct netdev_notifier_offload_xstats_info info = {
 8358		.info.dev = dev,
 8359		.info.extack = extack,
 8360		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 8361	};
 8362	int err;
 8363	int rc;
 8364
 8365	dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
 8366					 GFP_KERNEL);
 8367	if (!dev->offload_xstats_l3)
 8368		return -ENOMEM;
 8369
 8370	rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
 8371						  NETDEV_OFFLOAD_XSTATS_DISABLE,
 8372						  &info.info);
 8373	err = notifier_to_errno(rc);
 8374	if (err)
 8375		goto free_stats;
 8376
 8377	return 0;
 8378
 8379free_stats:
 8380	kfree(dev->offload_xstats_l3);
 8381	dev->offload_xstats_l3 = NULL;
 8382	return err;
 8383}
 8384
 8385int netdev_offload_xstats_enable(struct net_device *dev,
 8386				 enum netdev_offload_xstats_type type,
 8387				 struct netlink_ext_ack *extack)
 8388{
 8389	ASSERT_RTNL();
 8390
 8391	if (netdev_offload_xstats_enabled(dev, type))
 8392		return -EALREADY;
 8393
 8394	switch (type) {
 8395	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8396		return netdev_offload_xstats_enable_l3(dev, extack);
 8397	}
 8398
 8399	WARN_ON(1);
 8400	return -EINVAL;
 8401}
 8402EXPORT_SYMBOL(netdev_offload_xstats_enable);
 8403
 8404static void netdev_offload_xstats_disable_l3(struct net_device *dev)
 8405{
 8406	struct netdev_notifier_offload_xstats_info info = {
 8407		.info.dev = dev,
 8408		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 8409	};
 8410
 8411	call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
 8412				      &info.info);
 8413	kfree(dev->offload_xstats_l3);
 8414	dev->offload_xstats_l3 = NULL;
 8415}
 8416
 8417int netdev_offload_xstats_disable(struct net_device *dev,
 8418				  enum netdev_offload_xstats_type type)
 8419{
 8420	ASSERT_RTNL();
 8421
 8422	if (!netdev_offload_xstats_enabled(dev, type))
 8423		return -EALREADY;
 8424
 8425	switch (type) {
 8426	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8427		netdev_offload_xstats_disable_l3(dev);
 8428		return 0;
 8429	}
 8430
 8431	WARN_ON(1);
 8432	return -EINVAL;
 8433}
 8434EXPORT_SYMBOL(netdev_offload_xstats_disable);
 8435
 8436static void netdev_offload_xstats_disable_all(struct net_device *dev)
 8437{
 8438	netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
 8439}
 8440
 8441static struct rtnl_hw_stats64 *
 8442netdev_offload_xstats_get_ptr(const struct net_device *dev,
 8443			      enum netdev_offload_xstats_type type)
 8444{
 8445	switch (type) {
 8446	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
 8447		return dev->offload_xstats_l3;
 8448	}
 8449
 8450	WARN_ON(1);
 8451	return NULL;
 8452}
 8453
 8454bool netdev_offload_xstats_enabled(const struct net_device *dev,
 8455				   enum netdev_offload_xstats_type type)
 8456{
 8457	ASSERT_RTNL();
 8458
 8459	return netdev_offload_xstats_get_ptr(dev, type);
 8460}
 8461EXPORT_SYMBOL(netdev_offload_xstats_enabled);
 8462
 8463struct netdev_notifier_offload_xstats_ru {
 8464	bool used;
 8465};
 8466
 8467struct netdev_notifier_offload_xstats_rd {
 8468	struct rtnl_hw_stats64 stats;
 8469	bool used;
 8470};
 8471
 8472static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
 8473				  const struct rtnl_hw_stats64 *src)
 8474{
 8475	dest->rx_packets	  += src->rx_packets;
 8476	dest->tx_packets	  += src->tx_packets;
 8477	dest->rx_bytes		  += src->rx_bytes;
 8478	dest->tx_bytes		  += src->tx_bytes;
 8479	dest->rx_errors		  += src->rx_errors;
 8480	dest->tx_errors		  += src->tx_errors;
 8481	dest->rx_dropped	  += src->rx_dropped;
 8482	dest->tx_dropped	  += src->tx_dropped;
 8483	dest->multicast		  += src->multicast;
 8484}
 8485
 8486static int netdev_offload_xstats_get_used(struct net_device *dev,
 8487					  enum netdev_offload_xstats_type type,
 8488					  bool *p_used,
 8489					  struct netlink_ext_ack *extack)
 8490{
 8491	struct netdev_notifier_offload_xstats_ru report_used = {};
 8492	struct netdev_notifier_offload_xstats_info info = {
 8493		.info.dev = dev,
 8494		.info.extack = extack,
 8495		.type = type,
 8496		.report_used = &report_used,
 8497	};
 8498	int rc;
 8499
 8500	WARN_ON(!netdev_offload_xstats_enabled(dev, type));
 8501	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 8502					   &info.info);
 8503	*p_used = report_used.used;
 8504	return notifier_to_errno(rc);
 8505}
 8506
 8507static int netdev_offload_xstats_get_stats(struct net_device *dev,
 8508					   enum netdev_offload_xstats_type type,
 8509					   struct rtnl_hw_stats64 *p_stats,
 8510					   bool *p_used,
 8511					   struct netlink_ext_ack *extack)
 8512{
 8513	struct netdev_notifier_offload_xstats_rd report_delta = {};
 8514	struct netdev_notifier_offload_xstats_info info = {
 8515		.info.dev = dev,
 8516		.info.extack = extack,
 8517		.type = type,
 8518		.report_delta = &report_delta,
 8519	};
 8520	struct rtnl_hw_stats64 *stats;
 8521	int rc;
 8522
 8523	stats = netdev_offload_xstats_get_ptr(dev, type);
 8524	if (WARN_ON(!stats))
 8525		return -EINVAL;
 8526
 8527	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 8528					   &info.info);
 8529
 8530	/* Cache whatever we got, even if there was an error, otherwise the
 8531	 * successful stats retrievals would get lost.
 8532	 */
 8533	netdev_hw_stats64_add(stats, &report_delta.stats);
 8534
 8535	if (p_stats)
 8536		*p_stats = *stats;
 8537	*p_used = report_delta.used;
 8538
 8539	return notifier_to_errno(rc);
 8540}
 8541
 8542int netdev_offload_xstats_get(struct net_device *dev,
 8543			      enum netdev_offload_xstats_type type,
 8544			      struct rtnl_hw_stats64 *p_stats, bool *p_used,
 8545			      struct netlink_ext_ack *extack)
 8546{
 8547	ASSERT_RTNL();
 8548
 8549	if (p_stats)
 8550		return netdev_offload_xstats_get_stats(dev, type, p_stats,
 8551						       p_used, extack);
 8552	else
 8553		return netdev_offload_xstats_get_used(dev, type, p_used,
 8554						      extack);
 8555}
 8556EXPORT_SYMBOL(netdev_offload_xstats_get);
 8557
 8558void
 8559netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
 8560				   const struct rtnl_hw_stats64 *stats)
 8561{
 8562	report_delta->used = true;
 8563	netdev_hw_stats64_add(&report_delta->stats, stats);
 8564}
 8565EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
 8566
 8567void
 8568netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
 8569{
 8570	report_used->used = true;
 8571}
 8572EXPORT_SYMBOL(netdev_offload_xstats_report_used);
 8573
 8574void netdev_offload_xstats_push_delta(struct net_device *dev,
 8575				      enum netdev_offload_xstats_type type,
 8576				      const struct rtnl_hw_stats64 *p_stats)
 8577{
 8578	struct rtnl_hw_stats64 *stats;
 8579
 8580	ASSERT_RTNL();
 8581
 8582	stats = netdev_offload_xstats_get_ptr(dev, type);
 8583	if (WARN_ON(!stats))
 8584		return;
 8585
 8586	netdev_hw_stats64_add(stats, p_stats);
 8587}
 8588EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
 8589
 8590/**
 8591 * netdev_get_xmit_slave - Get the xmit slave of master device
 8592 * @dev: device
 8593 * @skb: The packet
 8594 * @all_slaves: assume all the slaves are active
 8595 *
 8596 * The reference counters are not incremented so the caller must be
 8597 * careful with locks. The caller must hold RCU lock.
 8598 * %NULL is returned if no slave is found.
 8599 */
 8600
 8601struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 8602					 struct sk_buff *skb,
 8603					 bool all_slaves)
 8604{
 8605	const struct net_device_ops *ops = dev->netdev_ops;
 8606
 8607	if (!ops->ndo_get_xmit_slave)
 8608		return NULL;
 8609	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 8610}
 8611EXPORT_SYMBOL(netdev_get_xmit_slave);
 8612
 8613static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
 8614						  struct sock *sk)
 8615{
 8616	const struct net_device_ops *ops = dev->netdev_ops;
 8617
 8618	if (!ops->ndo_sk_get_lower_dev)
 8619		return NULL;
 8620	return ops->ndo_sk_get_lower_dev(dev, sk);
 8621}
 8622
 8623/**
 8624 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
 8625 * @dev: device
 8626 * @sk: the socket
 8627 *
 8628 * %NULL is returned if no lower device is found.
 8629 */
 8630
 8631struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
 8632					    struct sock *sk)
 8633{
 8634	struct net_device *lower;
 8635
 8636	lower = netdev_sk_get_lower_dev(dev, sk);
 8637	while (lower) {
 8638		dev = lower;
 8639		lower = netdev_sk_get_lower_dev(dev, sk);
 8640	}
 8641
 8642	return dev;
 8643}
 8644EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
 8645
 8646static void netdev_adjacent_add_links(struct net_device *dev)
 8647{
 8648	struct netdev_adjacent *iter;
 8649
 8650	struct net *net = dev_net(dev);
 8651
 8652	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8653		if (!net_eq(net, dev_net(iter->dev)))
 8654			continue;
 8655		netdev_adjacent_sysfs_add(iter->dev, dev,
 8656					  &iter->dev->adj_list.lower);
 8657		netdev_adjacent_sysfs_add(dev, iter->dev,
 8658					  &dev->adj_list.upper);
 8659	}
 8660
 8661	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8662		if (!net_eq(net, dev_net(iter->dev)))
 8663			continue;
 8664		netdev_adjacent_sysfs_add(iter->dev, dev,
 8665					  &iter->dev->adj_list.upper);
 8666		netdev_adjacent_sysfs_add(dev, iter->dev,
 8667					  &dev->adj_list.lower);
 8668	}
 8669}
 8670
 8671static void netdev_adjacent_del_links(struct net_device *dev)
 8672{
 8673	struct netdev_adjacent *iter;
 8674
 8675	struct net *net = dev_net(dev);
 8676
 8677	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8678		if (!net_eq(net, dev_net(iter->dev)))
 8679			continue;
 8680		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8681					  &iter->dev->adj_list.lower);
 8682		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8683					  &dev->adj_list.upper);
 8684	}
 8685
 8686	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8687		if (!net_eq(net, dev_net(iter->dev)))
 8688			continue;
 8689		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8690					  &iter->dev->adj_list.upper);
 8691		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8692					  &dev->adj_list.lower);
 8693	}
 8694}
 8695
 8696void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8697{
 8698	struct netdev_adjacent *iter;
 8699
 8700	struct net *net = dev_net(dev);
 8701
 8702	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8703		if (!net_eq(net, dev_net(iter->dev)))
 8704			continue;
 8705		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8706					  &iter->dev->adj_list.lower);
 8707		netdev_adjacent_sysfs_add(iter->dev, dev,
 8708					  &iter->dev->adj_list.lower);
 8709	}
 8710
 8711	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8712		if (!net_eq(net, dev_net(iter->dev)))
 8713			continue;
 8714		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8715					  &iter->dev->adj_list.upper);
 8716		netdev_adjacent_sysfs_add(iter->dev, dev,
 8717					  &iter->dev->adj_list.upper);
 8718	}
 8719}
 8720
 8721void *netdev_lower_dev_get_private(struct net_device *dev,
 8722				   struct net_device *lower_dev)
 8723{
 8724	struct netdev_adjacent *lower;
 8725
 8726	if (!lower_dev)
 8727		return NULL;
 8728	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8729	if (!lower)
 8730		return NULL;
 8731
 8732	return lower->private;
 8733}
 8734EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8735
 8736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8737/**
 8738 * netdev_lower_state_changed - Dispatch event about lower device state change
 8739 * @lower_dev: device
 8740 * @lower_state_info: state to dispatch
 8741 *
 8742 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8743 * The caller must hold the RTNL lock.
 8744 */
 8745void netdev_lower_state_changed(struct net_device *lower_dev,
 8746				void *lower_state_info)
 8747{
 8748	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8749		.info.dev = lower_dev,
 8750	};
 8751
 8752	ASSERT_RTNL();
 8753	changelowerstate_info.lower_state_info = lower_state_info;
 8754	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8755				      &changelowerstate_info.info);
 8756}
 8757EXPORT_SYMBOL(netdev_lower_state_changed);
 8758
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8759static void dev_change_rx_flags(struct net_device *dev, int flags)
 8760{
 8761	const struct net_device_ops *ops = dev->netdev_ops;
 8762
 8763	if (ops->ndo_change_rx_flags)
 8764		ops->ndo_change_rx_flags(dev, flags);
 8765}
 8766
 8767static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8768{
 8769	unsigned int old_flags = dev->flags;
 8770	unsigned int promiscuity, flags;
 8771	kuid_t uid;
 8772	kgid_t gid;
 8773
 8774	ASSERT_RTNL();
 8775
 8776	promiscuity = dev->promiscuity + inc;
 8777	if (promiscuity == 0) {
 
 8778		/*
 8779		 * Avoid overflow.
 8780		 * If inc causes overflow, untouch promisc and return error.
 8781		 */
 8782		if (unlikely(inc > 0)) {
 8783			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
 
 
 
 
 8784			return -EOVERFLOW;
 8785		}
 8786		flags = old_flags & ~IFF_PROMISC;
 8787	} else {
 8788		flags = old_flags | IFF_PROMISC;
 8789	}
 8790	WRITE_ONCE(dev->promiscuity, promiscuity);
 8791	if (flags != old_flags) {
 8792		WRITE_ONCE(dev->flags, flags);
 8793		netdev_info(dev, "%s promiscuous mode\n",
 8794			    dev->flags & IFF_PROMISC ? "entered" : "left");
 8795		if (audit_enabled) {
 8796			current_uid_gid(&uid, &gid);
 8797			audit_log(audit_context(), GFP_ATOMIC,
 8798				  AUDIT_ANOM_PROMISCUOUS,
 8799				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8800				  dev->name, (dev->flags & IFF_PROMISC),
 8801				  (old_flags & IFF_PROMISC),
 8802				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8803				  from_kuid(&init_user_ns, uid),
 8804				  from_kgid(&init_user_ns, gid),
 8805				  audit_get_sessionid(current));
 8806		}
 8807
 8808		dev_change_rx_flags(dev, IFF_PROMISC);
 8809	}
 8810	if (notify)
 8811		__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
 8812	return 0;
 8813}
 8814
 8815/**
 8816 *	dev_set_promiscuity	- update promiscuity count on a device
 8817 *	@dev: device
 8818 *	@inc: modifier
 8819 *
 8820 *	Add or remove promiscuity from a device. While the count in the device
 8821 *	remains above zero the interface remains promiscuous. Once it hits zero
 8822 *	the device reverts back to normal filtering operation. A negative inc
 8823 *	value is used to drop promiscuity on the device.
 8824 *	Return 0 if successful or a negative errno code on error.
 8825 */
 8826int dev_set_promiscuity(struct net_device *dev, int inc)
 8827{
 8828	unsigned int old_flags = dev->flags;
 8829	int err;
 8830
 8831	err = __dev_set_promiscuity(dev, inc, true);
 8832	if (err < 0)
 8833		return err;
 8834	if (dev->flags != old_flags)
 8835		dev_set_rx_mode(dev);
 8836	return err;
 8837}
 8838EXPORT_SYMBOL(dev_set_promiscuity);
 8839
 8840static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 8841{
 8842	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8843	unsigned int allmulti, flags;
 8844
 8845	ASSERT_RTNL();
 8846
 8847	allmulti = dev->allmulti + inc;
 8848	if (allmulti == 0) {
 
 8849		/*
 8850		 * Avoid overflow.
 8851		 * If inc causes overflow, untouch allmulti and return error.
 8852		 */
 8853		if (unlikely(inc > 0)) {
 8854			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
 
 
 
 
 8855			return -EOVERFLOW;
 8856		}
 8857		flags = old_flags & ~IFF_ALLMULTI;
 8858	} else {
 8859		flags = old_flags | IFF_ALLMULTI;
 8860	}
 8861	WRITE_ONCE(dev->allmulti, allmulti);
 8862	if (flags != old_flags) {
 8863		WRITE_ONCE(dev->flags, flags);
 8864		netdev_info(dev, "%s allmulticast mode\n",
 8865			    dev->flags & IFF_ALLMULTI ? "entered" : "left");
 8866		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8867		dev_set_rx_mode(dev);
 8868		if (notify)
 8869			__dev_notify_flags(dev, old_flags,
 8870					   dev->gflags ^ old_gflags, 0, NULL);
 8871	}
 8872	return 0;
 8873}
 8874
 8875/**
 8876 *	dev_set_allmulti	- update allmulti count on a device
 8877 *	@dev: device
 8878 *	@inc: modifier
 8879 *
 8880 *	Add or remove reception of all multicast frames to a device. While the
 8881 *	count in the device remains above zero the interface remains listening
 8882 *	to all interfaces. Once it hits zero the device reverts back to normal
 8883 *	filtering operation. A negative @inc value is used to drop the counter
 8884 *	when releasing a resource needing all multicasts.
 8885 *	Return 0 if successful or a negative errno code on error.
 8886 */
 8887
 8888int dev_set_allmulti(struct net_device *dev, int inc)
 8889{
 8890	return __dev_set_allmulti(dev, inc, true);
 8891}
 8892EXPORT_SYMBOL(dev_set_allmulti);
 8893
 8894/*
 8895 *	Upload unicast and multicast address lists to device and
 8896 *	configure RX filtering. When the device doesn't support unicast
 8897 *	filtering it is put in promiscuous mode while unicast addresses
 8898 *	are present.
 8899 */
 8900void __dev_set_rx_mode(struct net_device *dev)
 8901{
 8902	const struct net_device_ops *ops = dev->netdev_ops;
 8903
 8904	/* dev_open will call this function so the list will stay sane. */
 8905	if (!(dev->flags&IFF_UP))
 8906		return;
 8907
 8908	if (!netif_device_present(dev))
 8909		return;
 8910
 8911	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8912		/* Unicast addresses changes may only happen under the rtnl,
 8913		 * therefore calling __dev_set_promiscuity here is safe.
 8914		 */
 8915		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8916			__dev_set_promiscuity(dev, 1, false);
 8917			dev->uc_promisc = true;
 8918		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8919			__dev_set_promiscuity(dev, -1, false);
 8920			dev->uc_promisc = false;
 8921		}
 8922	}
 8923
 8924	if (ops->ndo_set_rx_mode)
 8925		ops->ndo_set_rx_mode(dev);
 8926}
 8927
 8928void dev_set_rx_mode(struct net_device *dev)
 8929{
 8930	netif_addr_lock_bh(dev);
 8931	__dev_set_rx_mode(dev);
 8932	netif_addr_unlock_bh(dev);
 8933}
 8934
 8935/**
 8936 *	dev_get_flags - get flags reported to userspace
 8937 *	@dev: device
 8938 *
 8939 *	Get the combination of flag bits exported through APIs to userspace.
 8940 */
 8941unsigned int dev_get_flags(const struct net_device *dev)
 8942{
 8943	unsigned int flags;
 8944
 8945	flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
 8946				IFF_ALLMULTI |
 8947				IFF_RUNNING |
 8948				IFF_LOWER_UP |
 8949				IFF_DORMANT)) |
 8950		(READ_ONCE(dev->gflags) & (IFF_PROMISC |
 8951				IFF_ALLMULTI));
 8952
 8953	if (netif_running(dev)) {
 8954		if (netif_oper_up(dev))
 8955			flags |= IFF_RUNNING;
 8956		if (netif_carrier_ok(dev))
 8957			flags |= IFF_LOWER_UP;
 8958		if (netif_dormant(dev))
 8959			flags |= IFF_DORMANT;
 8960	}
 8961
 8962	return flags;
 8963}
 8964EXPORT_SYMBOL(dev_get_flags);
 8965
 8966int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8967		       struct netlink_ext_ack *extack)
 8968{
 8969	unsigned int old_flags = dev->flags;
 8970	int ret;
 8971
 8972	ASSERT_RTNL();
 8973
 8974	/*
 8975	 *	Set the flags on our device.
 8976	 */
 8977
 8978	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8979			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8980			       IFF_AUTOMEDIA)) |
 8981		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8982				    IFF_ALLMULTI));
 8983
 8984	/*
 8985	 *	Load in the correct multicast list now the flags have changed.
 8986	 */
 8987
 8988	if ((old_flags ^ flags) & IFF_MULTICAST)
 8989		dev_change_rx_flags(dev, IFF_MULTICAST);
 8990
 8991	dev_set_rx_mode(dev);
 8992
 8993	/*
 8994	 *	Have we downed the interface. We handle IFF_UP ourselves
 8995	 *	according to user attempts to set it, rather than blindly
 8996	 *	setting it.
 8997	 */
 8998
 8999	ret = 0;
 9000	if ((old_flags ^ flags) & IFF_UP) {
 9001		if (old_flags & IFF_UP)
 9002			__dev_close(dev);
 9003		else
 9004			ret = __dev_open(dev, extack);
 9005	}
 9006
 9007	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 9008		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 9009		unsigned int old_flags = dev->flags;
 9010
 9011		dev->gflags ^= IFF_PROMISC;
 9012
 9013		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 9014			if (dev->flags != old_flags)
 9015				dev_set_rx_mode(dev);
 9016	}
 9017
 9018	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 9019	 * is important. Some (broken) drivers set IFF_PROMISC, when
 9020	 * IFF_ALLMULTI is requested not asking us and not reporting.
 9021	 */
 9022	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 9023		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 9024
 9025		dev->gflags ^= IFF_ALLMULTI;
 9026		__dev_set_allmulti(dev, inc, false);
 9027	}
 9028
 9029	return ret;
 9030}
 9031
 9032void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 9033			unsigned int gchanges, u32 portid,
 9034			const struct nlmsghdr *nlh)
 9035{
 9036	unsigned int changes = dev->flags ^ old_flags;
 9037
 9038	if (gchanges)
 9039		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
 9040
 9041	if (changes & IFF_UP) {
 9042		if (dev->flags & IFF_UP)
 9043			call_netdevice_notifiers(NETDEV_UP, dev);
 9044		else
 9045			call_netdevice_notifiers(NETDEV_DOWN, dev);
 9046	}
 9047
 9048	if (dev->flags & IFF_UP &&
 9049	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 9050		struct netdev_notifier_change_info change_info = {
 9051			.info = {
 9052				.dev = dev,
 9053			},
 9054			.flags_changed = changes,
 9055		};
 9056
 9057		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 
 
 9058	}
 9059}
 9060
 9061/**
 9062 *	dev_change_flags - change device settings
 9063 *	@dev: device
 9064 *	@flags: device state flags
 9065 *	@extack: netlink extended ack
 9066 *
 9067 *	Change settings on device based state flags. The flags are
 9068 *	in the userspace exported format.
 9069 */
 9070int dev_change_flags(struct net_device *dev, unsigned int flags,
 9071		     struct netlink_ext_ack *extack)
 9072{
 9073	int ret;
 9074	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 9075
 9076	ret = __dev_change_flags(dev, flags, extack);
 9077	if (ret < 0)
 9078		return ret;
 9079
 9080	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 9081	__dev_notify_flags(dev, old_flags, changes, 0, NULL);
 9082	return ret;
 9083}
 9084EXPORT_SYMBOL(dev_change_flags);
 9085
 9086int __dev_set_mtu(struct net_device *dev, int new_mtu)
 9087{
 9088	const struct net_device_ops *ops = dev->netdev_ops;
 9089
 9090	if (ops->ndo_change_mtu)
 9091		return ops->ndo_change_mtu(dev, new_mtu);
 9092
 9093	/* Pairs with all the lockless reads of dev->mtu in the stack */
 9094	WRITE_ONCE(dev->mtu, new_mtu);
 9095	return 0;
 9096}
 9097EXPORT_SYMBOL(__dev_set_mtu);
 9098
 9099int dev_validate_mtu(struct net_device *dev, int new_mtu,
 9100		     struct netlink_ext_ack *extack)
 9101{
 9102	/* MTU must be positive, and in range */
 9103	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 9104		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 9105		return -EINVAL;
 9106	}
 9107
 9108	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 9109		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 9110		return -EINVAL;
 9111	}
 9112	return 0;
 9113}
 9114
 9115/**
 9116 *	dev_set_mtu_ext - Change maximum transfer unit
 9117 *	@dev: device
 9118 *	@new_mtu: new transfer unit
 9119 *	@extack: netlink extended ack
 9120 *
 9121 *	Change the maximum transfer size of the network device.
 9122 */
 9123int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 9124		    struct netlink_ext_ack *extack)
 9125{
 9126	int err, orig_mtu;
 9127
 9128	if (new_mtu == dev->mtu)
 9129		return 0;
 9130
 9131	err = dev_validate_mtu(dev, new_mtu, extack);
 9132	if (err)
 9133		return err;
 
 
 
 
 
 
 
 
 
 9134
 9135	if (!netif_device_present(dev))
 9136		return -ENODEV;
 9137
 9138	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 9139	err = notifier_to_errno(err);
 9140	if (err)
 9141		return err;
 9142
 9143	orig_mtu = dev->mtu;
 9144	err = __dev_set_mtu(dev, new_mtu);
 9145
 9146	if (!err) {
 9147		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 9148						   orig_mtu);
 9149		err = notifier_to_errno(err);
 9150		if (err) {
 9151			/* setting mtu back and notifying everyone again,
 9152			 * so that they have a chance to revert changes.
 9153			 */
 9154			__dev_set_mtu(dev, orig_mtu);
 9155			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 9156						     new_mtu);
 9157		}
 9158	}
 9159	return err;
 9160}
 9161
 9162int dev_set_mtu(struct net_device *dev, int new_mtu)
 9163{
 9164	struct netlink_ext_ack extack;
 9165	int err;
 9166
 9167	memset(&extack, 0, sizeof(extack));
 9168	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 9169	if (err && extack._msg)
 9170		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 9171	return err;
 9172}
 9173EXPORT_SYMBOL(dev_set_mtu);
 9174
 9175/**
 9176 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 9177 *	@dev: device
 9178 *	@new_len: new tx queue length
 9179 */
 9180int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 9181{
 9182	unsigned int orig_len = dev->tx_queue_len;
 9183	int res;
 9184
 9185	if (new_len != (unsigned int)new_len)
 9186		return -ERANGE;
 9187
 9188	if (new_len != orig_len) {
 9189		WRITE_ONCE(dev->tx_queue_len, new_len);
 9190		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 9191		res = notifier_to_errno(res);
 9192		if (res)
 9193			goto err_rollback;
 9194		res = dev_qdisc_change_tx_queue_len(dev);
 9195		if (res)
 9196			goto err_rollback;
 9197	}
 9198
 9199	return 0;
 9200
 9201err_rollback:
 9202	netdev_err(dev, "refused to change device tx_queue_len\n");
 9203	WRITE_ONCE(dev->tx_queue_len, orig_len);
 9204	return res;
 9205}
 9206
 9207/**
 9208 *	dev_set_group - Change group this device belongs to
 9209 *	@dev: device
 9210 *	@new_group: group this device should belong to
 9211 */
 9212void dev_set_group(struct net_device *dev, int new_group)
 9213{
 9214	dev->group = new_group;
 9215}
 9216
 9217/**
 9218 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 9219 *	@dev: device
 9220 *	@addr: new address
 9221 *	@extack: netlink extended ack
 9222 */
 9223int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 9224			      struct netlink_ext_ack *extack)
 9225{
 9226	struct netdev_notifier_pre_changeaddr_info info = {
 9227		.info.dev = dev,
 9228		.info.extack = extack,
 9229		.dev_addr = addr,
 9230	};
 9231	int rc;
 9232
 9233	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 9234	return notifier_to_errno(rc);
 9235}
 9236EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 9237
 9238/**
 9239 *	dev_set_mac_address - Change Media Access Control Address
 9240 *	@dev: device
 9241 *	@sa: new address
 9242 *	@extack: netlink extended ack
 9243 *
 9244 *	Change the hardware (MAC) address of the device
 9245 */
 9246int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 9247			struct netlink_ext_ack *extack)
 9248{
 9249	const struct net_device_ops *ops = dev->netdev_ops;
 9250	int err;
 9251
 9252	if (!ops->ndo_set_mac_address)
 9253		return -EOPNOTSUPP;
 9254	if (sa->sa_family != dev->type)
 9255		return -EINVAL;
 9256	if (!netif_device_present(dev))
 9257		return -ENODEV;
 9258	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 9259	if (err)
 9260		return err;
 9261	if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
 9262		err = ops->ndo_set_mac_address(dev, sa);
 9263		if (err)
 9264			return err;
 9265	}
 9266	dev->addr_assign_type = NET_ADDR_SET;
 9267	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 9268	add_device_randomness(dev->dev_addr, dev->addr_len);
 9269	return 0;
 9270}
 9271EXPORT_SYMBOL(dev_set_mac_address);
 9272
 9273DECLARE_RWSEM(dev_addr_sem);
 9274
 9275int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 9276			     struct netlink_ext_ack *extack)
 9277{
 9278	int ret;
 9279
 9280	down_write(&dev_addr_sem);
 9281	ret = dev_set_mac_address(dev, sa, extack);
 9282	up_write(&dev_addr_sem);
 9283	return ret;
 9284}
 9285EXPORT_SYMBOL(dev_set_mac_address_user);
 9286
 9287int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 9288{
 9289	size_t size = sizeof(sa->sa_data_min);
 9290	struct net_device *dev;
 9291	int ret = 0;
 9292
 9293	down_read(&dev_addr_sem);
 9294	rcu_read_lock();
 9295
 9296	dev = dev_get_by_name_rcu(net, dev_name);
 9297	if (!dev) {
 9298		ret = -ENODEV;
 9299		goto unlock;
 9300	}
 9301	if (!dev->addr_len)
 9302		memset(sa->sa_data, 0, size);
 9303	else
 9304		memcpy(sa->sa_data, dev->dev_addr,
 9305		       min_t(size_t, size, dev->addr_len));
 9306	sa->sa_family = dev->type;
 9307
 9308unlock:
 9309	rcu_read_unlock();
 9310	up_read(&dev_addr_sem);
 9311	return ret;
 9312}
 9313EXPORT_SYMBOL(dev_get_mac_address);
 9314
 9315/**
 9316 *	dev_change_carrier - Change device carrier
 9317 *	@dev: device
 9318 *	@new_carrier: new value
 9319 *
 9320 *	Change device carrier
 9321 */
 9322int dev_change_carrier(struct net_device *dev, bool new_carrier)
 9323{
 9324	const struct net_device_ops *ops = dev->netdev_ops;
 9325
 9326	if (!ops->ndo_change_carrier)
 9327		return -EOPNOTSUPP;
 9328	if (!netif_device_present(dev))
 9329		return -ENODEV;
 9330	return ops->ndo_change_carrier(dev, new_carrier);
 9331}
 
 9332
 9333/**
 9334 *	dev_get_phys_port_id - Get device physical port ID
 9335 *	@dev: device
 9336 *	@ppid: port ID
 9337 *
 9338 *	Get device physical port ID
 9339 */
 9340int dev_get_phys_port_id(struct net_device *dev,
 9341			 struct netdev_phys_item_id *ppid)
 9342{
 9343	const struct net_device_ops *ops = dev->netdev_ops;
 9344
 9345	if (!ops->ndo_get_phys_port_id)
 9346		return -EOPNOTSUPP;
 9347	return ops->ndo_get_phys_port_id(dev, ppid);
 9348}
 
 9349
 9350/**
 9351 *	dev_get_phys_port_name - Get device physical port name
 9352 *	@dev: device
 9353 *	@name: port name
 9354 *	@len: limit of bytes to copy to name
 9355 *
 9356 *	Get device physical port name
 9357 */
 9358int dev_get_phys_port_name(struct net_device *dev,
 9359			   char *name, size_t len)
 9360{
 9361	const struct net_device_ops *ops = dev->netdev_ops;
 9362	int err;
 9363
 9364	if (ops->ndo_get_phys_port_name) {
 9365		err = ops->ndo_get_phys_port_name(dev, name, len);
 9366		if (err != -EOPNOTSUPP)
 9367			return err;
 9368	}
 9369	return devlink_compat_phys_port_name_get(dev, name, len);
 9370}
 9371
 9372/**
 9373 *	dev_get_port_parent_id - Get the device's port parent identifier
 9374 *	@dev: network device
 9375 *	@ppid: pointer to a storage for the port's parent identifier
 9376 *	@recurse: allow/disallow recursion to lower devices
 9377 *
 9378 *	Get the devices's port parent identifier
 9379 */
 9380int dev_get_port_parent_id(struct net_device *dev,
 9381			   struct netdev_phys_item_id *ppid,
 9382			   bool recurse)
 9383{
 9384	const struct net_device_ops *ops = dev->netdev_ops;
 9385	struct netdev_phys_item_id first = { };
 9386	struct net_device *lower_dev;
 9387	struct list_head *iter;
 9388	int err;
 9389
 9390	if (ops->ndo_get_port_parent_id) {
 9391		err = ops->ndo_get_port_parent_id(dev, ppid);
 9392		if (err != -EOPNOTSUPP)
 9393			return err;
 9394	}
 9395
 9396	err = devlink_compat_switch_id_get(dev, ppid);
 9397	if (!recurse || err != -EOPNOTSUPP)
 9398		return err;
 9399
 9400	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 9401		err = dev_get_port_parent_id(lower_dev, ppid, true);
 9402		if (err)
 9403			break;
 9404		if (!first.id_len)
 9405			first = *ppid;
 9406		else if (memcmp(&first, ppid, sizeof(*ppid)))
 9407			return -EOPNOTSUPP;
 9408	}
 9409
 9410	return err;
 9411}
 9412EXPORT_SYMBOL(dev_get_port_parent_id);
 9413
 9414/**
 9415 *	netdev_port_same_parent_id - Indicate if two network devices have
 9416 *	the same port parent identifier
 9417 *	@a: first network device
 9418 *	@b: second network device
 9419 */
 9420bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 9421{
 9422	struct netdev_phys_item_id a_id = { };
 9423	struct netdev_phys_item_id b_id = { };
 9424
 9425	if (dev_get_port_parent_id(a, &a_id, true) ||
 9426	    dev_get_port_parent_id(b, &b_id, true))
 9427		return false;
 9428
 9429	return netdev_phys_item_id_same(&a_id, &b_id);
 9430}
 9431EXPORT_SYMBOL(netdev_port_same_parent_id);
 9432
 9433/**
 9434 *	dev_change_proto_down - set carrier according to proto_down.
 9435 *
 9436 *	@dev: device
 9437 *	@proto_down: new value
 
 
 
 9438 */
 9439int dev_change_proto_down(struct net_device *dev, bool proto_down)
 9440{
 9441	if (!dev->change_proto_down)
 
 
 9442		return -EOPNOTSUPP;
 9443	if (!netif_device_present(dev))
 9444		return -ENODEV;
 9445	if (proto_down)
 9446		netif_carrier_off(dev);
 9447	else
 9448		netif_carrier_on(dev);
 9449	WRITE_ONCE(dev->proto_down, proto_down);
 9450	return 0;
 9451}
 
 9452
 9453/**
 9454 *	dev_change_proto_down_reason - proto down reason
 9455 *
 9456 *	@dev: device
 9457 *	@mask: proto down mask
 9458 *	@value: proto down value
 
 
 9459 */
 9460void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 9461				  u32 value)
 9462{
 9463	u32 proto_down_reason;
 9464	int b;
 9465
 9466	if (!mask) {
 9467		proto_down_reason = value;
 9468	} else {
 9469		proto_down_reason = dev->proto_down_reason;
 9470		for_each_set_bit(b, &mask, 32) {
 9471			if (value & (1 << b))
 9472				proto_down_reason |= BIT(b);
 9473			else
 9474				proto_down_reason &= ~BIT(b);
 9475		}
 9476	}
 9477	WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
 9478}
 9479
 /* XDP flavour of a BPF link: the generic bpf_link plus the attachment
  * target. The code in this file recovers it from a struct bpf_link
  * pointer with container_of().
  */
 9480struct bpf_xdp_link {
 9481	struct bpf_link link;
 9482	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
	/* XDP flags copied from attr->link_create.flags at link creation;
	 * fed to dev_xdp_mode() / dev_xdp_attach() to select the mode
	 */
 9483	int flags;
 9484};
 9485
 9486static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 9487{
 9488	if (flags & XDP_FLAGS_HW_MODE)
 9489		return XDP_MODE_HW;
 9490	if (flags & XDP_FLAGS_DRV_MODE)
 9491		return XDP_MODE_DRV;
 9492	if (flags & XDP_FLAGS_SKB_MODE)
 9493		return XDP_MODE_SKB;
 9494	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 9495}
 9496
 9497static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 9498{
 9499	switch (mode) {
 9500	case XDP_MODE_SKB:
 9501		return generic_xdp_install;
 9502	case XDP_MODE_DRV:
 9503	case XDP_MODE_HW:
 9504		return dev->netdev_ops->ndo_bpf;
 9505	default:
 9506		return NULL;
 9507	}
 9508}
 9509
 9510static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
 9511					 enum bpf_xdp_mode mode)
 9512{
 9513	return dev->xdp_state[mode].link;
 9514}
 9515
 9516static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 9517				     enum bpf_xdp_mode mode)
 9518{
 9519	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 9520
 9521	if (link)
 9522		return link->link.prog;
 9523	return dev->xdp_state[mode].prog;
 9524}
 9525
 9526u8 dev_xdp_prog_count(struct net_device *dev)
 9527{
 9528	u8 count = 0;
 9529	int i;
 9530
 9531	for (i = 0; i < __MAX_XDP_MODE; i++)
 9532		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
 9533			count++;
 9534	return count;
 9535}
 9536EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
 9537
 9538int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
 9539{
 9540	if (!dev->netdev_ops->ndo_bpf)
 9541		return -EOPNOTSUPP;
 
 
 
 
 
 
 
 
 
 
 
 9542
 9543	if (dev_get_min_mp_channel_count(dev)) {
 9544		NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
 9545		return -EBUSY;
 9546	}
 9547
 9548	return dev->netdev_ops->ndo_bpf(dev, bpf);
 9549}
 9550EXPORT_SYMBOL_GPL(dev_xdp_propagate);
 9551
 9552u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 9553{
 9554	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 9555
 9556	return prog ? prog->aux->id : 0;
 9557}
 9558
 9559static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
 9560			     struct bpf_xdp_link *link)
 9561{
 9562	dev->xdp_state[mode].link = link;
 9563	dev->xdp_state[mode].prog = NULL;
 9564}
 9565
 9566static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 9567			     struct bpf_prog *prog)
 9568{
 9569	dev->xdp_state[mode].link = NULL;
 9570	dev->xdp_state[mode].prog = prog;
 9571}
 9572
 9573static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 9574			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
 9575			   u32 flags, struct bpf_prog *prog)
 9576{
 9577	struct netdev_bpf xdp;
 9578	int err;
 9579
 9580	if (dev_get_min_mp_channel_count(dev)) {
 9581		NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
 9582		return -EBUSY;
 9583	}
 9584
 9585	memset(&xdp, 0, sizeof(xdp));
 9586	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 9587	xdp.extack = extack;
 9588	xdp.flags = flags;
 9589	xdp.prog = prog;
 9590
 9591	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
 9592	 * "moved" into driver), so they don't increment it on their own, but
 9593	 * they do decrement refcnt when program is detached or replaced.
 9594	 * Given net_device also owns link/prog, we need to bump refcnt here
 9595	 * to prevent drivers from underflowing it.
 9596	 */
 9597	if (prog)
 9598		bpf_prog_inc(prog);
 9599	err = bpf_op(dev, &xdp);
 9600	if (err) {
 9601		if (prog)
 9602			bpf_prog_put(prog);
 9603		return err;
 9604	}
 9605
 9606	if (mode != XDP_MODE_HW)
 9607		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
 9608
 9609	return 0;
 9610}
 
 9611
 9612static void dev_xdp_uninstall(struct net_device *dev)
 
 
 
 
 
 
 
 
 9613{
 9614	struct bpf_xdp_link *link;
 9615	struct bpf_prog *prog;
 9616	enum bpf_xdp_mode mode;
 9617	bpf_op_t bpf_op;
 9618
 9619	ASSERT_RTNL();
 9620
 9621	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
 9622		prog = dev_xdp_prog(dev, mode);
 9623		if (!prog)
 9624			continue;
 9625
 9626		bpf_op = dev_xdp_bpf_op(dev, mode);
 9627		if (!bpf_op)
 9628			continue;
 9629
 9630		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9631
 9632		/* auto-detach link from net device */
 9633		link = dev_xdp_link(dev, mode);
 9634		if (link)
 9635			link->dev = NULL;
 9636		else
 9637			bpf_prog_put(prog);
 9638
 9639		dev_xdp_set_link(dev, mode, NULL);
 9640	}
 9641}
 9642
 /*
  * dev_xdp_attach - validate and apply an XDP attachment change on @dev
  * @dev: target device
  * @extack: receives a human-readable reason for most rejections
  * @link: BPF link to attach, or NULL for a plain program attachment
  * @new_prog: program to attach; NULL is forwarded as-is to the install op
  * @old_prog: program the caller expects to be attached (XDP_FLAGS_REPLACE)
  * @flags: XDP_FLAGS_* bits; the mode bits select the slot via dev_xdp_mode()
  *
  * Caller must hold RTNL. All checks run before anything is changed. The
  * install callback is only invoked when the effective program differs
  * from the one currently in the slot. On success the slot records either
  * @link or @new_prog, and bpf_prog_put() is called on the program that
  * was there before.
  *
  * Returns 0 on success, -EINVAL / -EBUSY / -EEXIST / -EOPNOTSUPP for the
  * rejections below, or the error returned by dev_xdp_install().
  */
 9643static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
 9644			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
 9645			  struct bpf_prog *old_prog, u32 flags)
 9646{
 9647	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
 9648	struct bpf_prog *cur_prog;
 9649	struct net_device *upper;
 9650	struct list_head *iter;
 9651	enum bpf_xdp_mode mode;
 9652	bpf_op_t bpf_op;
 9653	int err;
 9654
 9655	ASSERT_RTNL();
 9656
 9657	/* either link or prog attachment, never both */
 9658	if (link && (new_prog || old_prog))
 9659		return -EINVAL;
 9660	/* link supports only XDP mode flags */
 9661	if (link && (flags & ~XDP_FLAGS_MODES)) {
 9662		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
 9663		return -EINVAL;
 9664	}
 9665	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
 9666	if (num_modes > 1) {
 9667		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
 9668		return -EINVAL;
 9669	}
 9670	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
 9671	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
 9672		NL_SET_ERR_MSG(extack,
 9673			       "More than one program loaded, unset mode is ambiguous");
 9674		return -EINVAL;
 9675	}
 9676	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
 9677	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
 9678		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
 9679		return -EINVAL;
 9680	}
 9681
 9682	mode = dev_xdp_mode(dev, flags);
 9683	/* can't replace attached link */
 9684	if (dev_xdp_link(dev, mode)) {
 9685		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
 9686		return -EBUSY;
 9687	}
 9688
 9689	/* don't allow if an upper device already has a program */
 9690	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
 9691		if (dev_xdp_prog_count(upper) > 0) {
 9692			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
 9693			return -EEXIST;
 9694		}
 9695	}
 9696
 9697	cur_prog = dev_xdp_prog(dev, mode);
 9698	/* can't replace attached prog with link */
 9699	if (link && cur_prog) {
 9700		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
 9701		return -EBUSY;
 9702	}
 9703	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
 9704		NL_SET_ERR_MSG(extack, "Active program does not match expected");
 9705		return -EEXIST;
 9706	}
 9707
 9708	/* put effective new program into new_prog */
 9709	if (link)
 9710		new_prog = link->link.prog;
 9711
	/* the checks in this branch only apply when a program is being attached */
 9712	if (new_prog) {
 9713		bool offload = mode == XDP_MODE_HW;
		/* SKB and DRV are each other's "other" mode; for HW this
		 * evaluates to SKB but is only consulted when !offload
		 */
 9714		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
 9715					       ? XDP_MODE_DRV : XDP_MODE_SKB;
 9716
 9717		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
 9718			NL_SET_ERR_MSG(extack, "XDP program already attached");
 9719			return -EBUSY;
 9720		}
 9721		if (!offload && dev_xdp_prog(dev, other_mode)) {
 9722			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
 9723			return -EEXIST;
 9724		}
 9725		if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
 9726			NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
 9727			return -EINVAL;
 9728		}
 9729		if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
 9730			NL_SET_ERR_MSG(extack, "Program bound to different device");
 9731			return -EINVAL;
 9732		}
 9733		if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
 9734			NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
 9735			return -EINVAL;
 9736		}
 9737		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 9738			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 9739			return -EINVAL;
 9740		}
 9741		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
 9742			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
 9743			return -EINVAL;
 9744		}
 9745	}
 9746
 9747	/* don't call drivers if the effective program didn't change */
 9748	if (new_prog != cur_prog) {
 9749		bpf_op = dev_xdp_bpf_op(dev, mode);
 9750		if (!bpf_op) {
 9751			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
 9752			return -EOPNOTSUPP;
 9753		}
 9754
 9755		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
 9756		if (err)
 9757			return err;
 9758	}
 9759
	/* record the new owner of the slot, then put the program it replaced */
 9760	if (link)
 9761		dev_xdp_set_link(dev, mode, link);
 9762	else
 9763		dev_xdp_set_prog(dev, mode, new_prog);
 9764	if (cur_prog)
 9765		bpf_prog_put(cur_prog);
 9766
 9767	return 0;
 9768}
 9769
 9770static int dev_xdp_attach_link(struct net_device *dev,
 9771			       struct netlink_ext_ack *extack,
 9772			       struct bpf_xdp_link *link)
 9773{
 9774	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
 
 9775}
 9776
 9777static int dev_xdp_detach_link(struct net_device *dev,
 9778			       struct netlink_ext_ack *extack,
 9779			       struct bpf_xdp_link *link)
 9780{
 9781	enum bpf_xdp_mode mode;
 9782	bpf_op_t bpf_op;
 9783
 
 9784	ASSERT_RTNL();
 9785
 9786	mode = dev_xdp_mode(dev, link->flags);
 9787	if (dev_xdp_link(dev, mode) != link)
 9788		return -EINVAL;
 9789
 9790	bpf_op = dev_xdp_bpf_op(dev, mode);
 9791	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9792	dev_xdp_set_link(dev, mode, NULL);
 9793	return 0;
 9794}
 9795
 9796static void bpf_xdp_link_release(struct bpf_link *link)
 9797{
 9798	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9799
 9800	rtnl_lock();
 9801
 9802	/* if racing with net_device's tear down, xdp_link->dev might be
 9803	 * already NULL, in which case link was already auto-detached
 9804	 */
 9805	if (xdp_link->dev) {
 9806		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
 9807		xdp_link->dev = NULL;
 9808	}
 9809
 9810	rtnl_unlock();
 9811}
 9812
 /* bpf_link_ops.detach: same work as release; always reports success. */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
	bpf_xdp_link_release(link);

	return 0;
}
 9818
 9819static void bpf_xdp_link_dealloc(struct bpf_link *link)
 9820{
 9821	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9822
 9823	kfree(xdp_link);
 9824}
 9825
 9826static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9827				     struct seq_file *seq)
 9828{
 9829	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9830	u32 ifindex = 0;
 9831
 9832	rtnl_lock();
 9833	if (xdp_link->dev)
 9834		ifindex = xdp_link->dev->ifindex;
 9835	rtnl_unlock();
 9836
 9837	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9838}
 9839
 9840static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9841				       struct bpf_link_info *info)
 9842{
 9843	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9844	u32 ifindex = 0;
 9845
 9846	rtnl_lock();
 9847	if (xdp_link->dev)
 9848		ifindex = xdp_link->dev->ifindex;
 9849	rtnl_unlock();
 9850
 9851	info->xdp.ifindex = ifindex;
 9852	return 0;
 9853}
 9854
 9855static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 9856			       struct bpf_prog *old_prog)
 9857{
 9858	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9859	enum bpf_xdp_mode mode;
 9860	bpf_op_t bpf_op;
 9861	int err = 0;
 9862
 9863	rtnl_lock();
 9864
 9865	/* link might have been auto-released already, so fail */
 9866	if (!xdp_link->dev) {
 9867		err = -ENOLINK;
 9868		goto out_unlock;
 9869	}
 9870
 9871	if (old_prog && link->prog != old_prog) {
 9872		err = -EPERM;
 9873		goto out_unlock;
 9874	}
 9875	old_prog = link->prog;
 9876	if (old_prog->type != new_prog->type ||
 9877	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
 9878		err = -EINVAL;
 9879		goto out_unlock;
 9880	}
 9881
 9882	if (old_prog == new_prog) {
 9883		/* no-op, don't disturb drivers */
 9884		bpf_prog_put(new_prog);
 9885		goto out_unlock;
 9886	}
 9887
 9888	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
 9889	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
 9890	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
 9891			      xdp_link->flags, new_prog);
 9892	if (err)
 9893		goto out_unlock;
 9894
 9895	old_prog = xchg(&link->prog, new_prog);
 9896	bpf_prog_put(old_prog);
 9897
 9898out_unlock:
 9899	rtnl_unlock();
 9900	return err;
 9901}
 9902
 /* Callback table for XDP links; handed to bpf_link_init() in
  * bpf_xdp_link_attach(). Note that .detach is a thin wrapper around the
  * same routine as .release.
  */
 9903static const struct bpf_link_ops bpf_xdp_link_lops = {
 9904	.release = bpf_xdp_link_release,
 9905	.dealloc = bpf_xdp_link_dealloc,
 9906	.detach = bpf_xdp_link_detach,
 9907	.show_fdinfo = bpf_xdp_link_show_fdinfo,
 9908	.fill_link_info = bpf_xdp_link_fill_link_info,
 9909	.update_prog = bpf_xdp_link_update,
 9910};
 9911
 /*
  * bpf_xdp_link_attach - create an XDP BPF link and attach it to a device
  * @attr: supplies link_create.target_ifindex and link_create.flags
  * @prog: program the new link is initialised with
  *
  * The device is looked up by ifindex in the calling task's network
  * namespace. Lookup, link allocation/priming and the attach itself all
  * run under RTNL. The device reference taken by dev_get_by_index() is
  * dropped on every exit path.
  *
  * Returns the value of bpf_link_settle() on success, -EINVAL when the
  * device does not exist, -ENOMEM when the link cannot be allocated, or
  * the error from bpf_link_prime() / dev_xdp_attach_link().
  */
 9912int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 9913{
 9914	struct net *net = current->nsproxy->net_ns;
 9915	struct bpf_link_primer link_primer;
 9916	struct netlink_ext_ack extack = {};
 9917	struct bpf_xdp_link *link;
 9918	struct net_device *dev;
 9919	int err, fd;
 9920
 9921	rtnl_lock();
 9922	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
 9923	if (!dev) {
 9924		rtnl_unlock();
 9925		return -EINVAL;
 9926	}
 9927
 9928	link = kzalloc(sizeof(*link), GFP_USER);
 9929	if (!link) {
 9930		err = -ENOMEM;
 9931		goto unlock;
 
 
 9932	}
 9933
 9934	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
 9935	link->dev = dev;
 9936	link->flags = attr->link_create.flags;
 9937
 9938	err = bpf_link_prime(&link->link, &link_primer);
 9939	if (err) {
		/* priming failed: the link is freed directly here */
 9940		kfree(link);
 9941		goto unlock;
 9942	}
 9943
 9944	err = dev_xdp_attach_link(dev, &extack, link);
 9945	rtnl_unlock();
 
 9946
 9947	if (err) {
		/* clear dev before cleanup so the release callback, if it
		 * runs, finds nothing to detach (see bpf_xdp_link_release())
		 */
 9948		link->dev = NULL;
 9949		bpf_link_cleanup(&link_primer);
 9950		trace_bpf_xdp_link_attach_failed(extack._msg);
 9951		goto out_put_dev;
 9952	}
 
 9953
 9954	fd = bpf_link_settle(&link_primer);
 9955	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
 9956	dev_put(dev);
 9957	return fd;
 9958
 9959unlock:
 9960	rtnl_unlock();
 9961
 9962out_put_dev:
 9963	dev_put(dev);
 9964	return err;
 9965}
 9966
 9967/**
 9968 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 9969 *	@dev: device
 9970 *	@extack: netlink extended ack
 9971 *	@fd: new program fd or negative value to clear
 9972 *	@expected_fd: old program fd that userspace expects to replace or clear
 9973 *	@flags: xdp-related flags
 9974 *
 9975 *	Set or clear a bpf program for a device
 9976 */
 9977int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 9978		      int fd, int expected_fd, u32 flags)
 9979{
 9980	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 9981	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 9982	int err;
 9983
 9984	ASSERT_RTNL();
 9985
 9986	if (fd >= 0) {
 9987		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 9988						 mode != XDP_MODE_SKB);
 9989		if (IS_ERR(new_prog))
 9990			return PTR_ERR(new_prog);
 9991	}
 9992
 9993	if (expected_fd >= 0) {
 9994		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
 9995						 mode != XDP_MODE_SKB);
 9996		if (IS_ERR(old_prog)) {
 9997			err = PTR_ERR(old_prog);
 9998			old_prog = NULL;
 9999			goto err_out;
10000		}
10001	}
10002
10003	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
10004
10005err_out:
10006	if (err && new_prog)
10007		bpf_prog_put(new_prog);
10008	if (old_prog)
10009		bpf_prog_put(old_prog);
10010	return err;
10011}
10012
10013u32 dev_get_min_mp_channel_count(const struct net_device *dev)
10014{
10015	int i;
 
10016
10017	ASSERT_RTNL();
 
 
 
 
10018
10019	for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
10020		if (dev->_rx[i].mp_params.mp_priv)
10021			/* The channel count is the idx plus 1. */
10022			return i + 1;
10023
10024	return 0;
10025}
10026
10027/**
10028 * dev_index_reserve() - allocate an ifindex in a namespace
10029 * @net: the applicable net namespace
10030 * @ifindex: requested ifindex, pass %0 to get one allocated
10031 *
10032 * Allocate a ifindex for a new device. Caller must either use the ifindex
10033 * to store the device (via list_netdevice()) or call dev_index_release()
10034 * to give the index up.
10035 *
10036 * Return: a suitable unique value for a new device interface number or -errno.
10037 */
10038static int dev_index_reserve(struct net *net, u32 ifindex)
10039{
10040	int err;
10041
10042	if (ifindex > INT_MAX) {
10043		DEBUG_NET_WARN_ON_ONCE(1);
10044		return -EINVAL;
 
 
 
10045	}
10046
10047	if (!ifindex)
10048		err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
10049				      xa_limit_31b, &net->ifindex, GFP_KERNEL);
10050	else
10051		err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
10052	if (err < 0)
10053		return err;
10054
10055	return ifindex;
 
10056}
10057
10058static void dev_index_release(struct net *net, int ifindex)
10059{
10060	/* Expect only unused indexes, unlist_netdevice() removes the used */
10061	WARN_ON(xa_erase(&net->dev_by_index, ifindex));
10062}
10063
10064/* Delayed registration/unregistration */
/* net_set_todo() appends devices to net_todo_list via dev->todo_list */
10065LIST_HEAD(net_todo_list);
10066DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
10067atomic_t dev_unreg_count = ATOMIC_INIT(0);
10068
10069static void net_set_todo(struct net_device *dev)
10070{
10071	list_add_tail(&dev->todo_list, &net_todo_list);
10072}
10073
10074static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
10075	struct net_device *upper, netdev_features_t features)
10076{
10077	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
10078	netdev_features_t feature;
10079	int feature_bit;
10080
10081	for_each_netdev_feature(upper_disables, feature_bit) {
10082		feature = __NETIF_F_BIT(feature_bit);
10083		if (!(upper->wanted_features & feature)
10084		    && (features & feature)) {
10085			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
10086				   &feature, upper->name);
10087			features &= ~feature;
10088		}
10089	}
10090
10091	return features;
10092}
10093
10094static void netdev_sync_lower_features(struct net_device *upper,
10095	struct net_device *lower, netdev_features_t features)
10096{
10097	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
10098	netdev_features_t feature;
10099	int feature_bit;
10100
10101	for_each_netdev_feature(upper_disables, feature_bit) {
10102		feature = __NETIF_F_BIT(feature_bit);
10103		if (!(features & feature) && (lower->features & feature)) {
10104			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
10105				   &feature, lower->name);
10106			lower->wanted_features &= ~feature;
10107			__netdev_update_features(lower);
10108
10109			if (unlikely(lower->features & feature))
10110				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
10111					    &feature, lower->name);
10112			else
10113				netdev_features_change(lower);
10114		}
10115	}
10116}
10117
10118static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
10119{
10120	netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
10121	bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
10122	bool hw_csum = features & NETIF_F_HW_CSUM;
10123
10124	return ip_csum || hw_csum;
10125}
10126
/*
 * netdev_fix_features - drop feature bits whose prerequisites are missing
 * @dev: device the set is computed for (used for log messages and
 *       dev->gso_partial_features)
 * @features: candidate feature set
 *
 * Applies a fixed, ordered list of dependency and conflict rules. Every
 * rule only clears bits; none sets one. Later rules see the result of
 * earlier ones, so the order matters.
 *
 * Returns the adjusted feature set.
 */
10127static netdev_features_t netdev_fix_features(struct net_device *dev,
10128	netdev_features_t features)
10129{
10130	/* Fix illegal checksum combinations */
10131	if ((features & NETIF_F_HW_CSUM) &&
10132	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
10133		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
10134		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
10135	}
10136
10137	/* TSO requires that SG is present as well. */
10138	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
10139		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
10140		features &= ~NETIF_F_ALL_TSO;
10141	}
10142
	/* TSO needs HW_CSUM or IP_CSUM; TSO_ECN is dropped along with it */
10143	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
10144					!(features & NETIF_F_IP_CSUM)) {
10145		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
10146		features &= ~NETIF_F_TSO;
10147		features &= ~NETIF_F_TSO_ECN;
10148	}
10149
	/* TSO6 needs HW_CSUM or IPV6_CSUM */
10150	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
10151					 !(features & NETIF_F_IPV6_CSUM)) {
10152		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
10153		features &= ~NETIF_F_TSO6;
10154	}
10155
10156	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
10157	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
10158		features &= ~NETIF_F_TSO_MANGLEID;
10159
10160	/* TSO ECN requires that TSO is present as well. */
10161	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
10162		features &= ~NETIF_F_TSO_ECN;
10163
10164	/* Software GSO depends on SG. */
10165	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
10166		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
10167		features &= ~NETIF_F_GSO;
10168	}
10169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10170	/* GSO partial features require GSO partial be set */
10171	if ((features & dev->gso_partial_features) &&
10172	    !(features & NETIF_F_GSO_PARTIAL)) {
10173		netdev_dbg(dev,
10174			   "Dropping partially supported GSO features since no GSO partial.\n");
10175		features &= ~dev->gso_partial_features;
10176	}
10177
10178	if (!(features & NETIF_F_RXCSUM)) {
10179		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
10180		 * successfully merged by hardware must also have the
10181		 * checksum verified by hardware.  If the user does not
10182		 * want to enable RXCSUM, logically, we should disable GRO_HW.
10183		 */
10184		if (features & NETIF_F_GRO_HW) {
10185			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
10186			features &= ~NETIF_F_GRO_HW;
10187		}
10188	}
10189
10190	/* LRO/HW-GRO features cannot be combined with RX-FCS */
10191	if (features & NETIF_F_RXFCS) {
10192		if (features & NETIF_F_LRO) {
10193			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
10194			features &= ~NETIF_F_LRO;
10195		}
10196
10197		if (features & NETIF_F_GRO_HW) {
10198			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
10199			features &= ~NETIF_F_GRO_HW;
10200		}
10201	}
10202
	/* when both HW-GRO and LRO are requested, HW-GRO is the one kept */
10203	if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
10204		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
10205		features &= ~NETIF_F_LRO;
10206	}
10207
10208	if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
10209		netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
10210		features &= ~NETIF_F_HW_TLS_TX;
10211	}
10212
10213	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
10214		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
10215		features &= ~NETIF_F_HW_TLS_RX;
10216	}
10217
10218	if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
10219		netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
10220		features &= ~NETIF_F_GSO_UDP_L4;
10221	}
10222
10223	return features;
10224}
10225
/*
 * __netdev_update_features - recompute and apply dev->features
 * @dev: device to update
 *
 * Caller must hold RTNL. The new set starts from
 * netdev_get_wanted_features(), is passed through the driver's
 * ndo_fix_features() (if any), netdev_fix_features() and the
 * restrictions of every upper device. If it differs from dev->features
 * the driver's ndo_set_features() is called. Lower devices are then
 * synchronised in every case that reaches the sync_lower label.
 *
 * Return: -1 if ndo_set_features() failed; 0 if the final value of err is
 * negative (the computed set equals dev->features, or a VLAN filter
 * re-sync result was OR-ed in as negative); 1 otherwise.
 * netdev_update_features() sends a change notification for any non-zero
 * return.
 */
10226int __netdev_update_features(struct net_device *dev)
10227{
10228	struct net_device *upper, *lower;
10229	netdev_features_t features;
10230	struct list_head *iter;
	/* stays -1 on the "nothing changed" path, which yields a 0 return */
10231	int err = -1;
10232
10233	ASSERT_RTNL();
10234
10235	features = netdev_get_wanted_features(dev);
10236
10237	if (dev->netdev_ops->ndo_fix_features)
10238		features = dev->netdev_ops->ndo_fix_features(dev, features);
10239
10240	/* driver might be less strict about feature dependencies */
10241	features = netdev_fix_features(dev, features);
10242
10243	/* some features can't be enabled if they're off on an upper device */
10244	netdev_for_each_upper_dev_rcu(dev, upper, iter)
10245		features = netdev_sync_upper_features(dev, upper, features);
10246
10247	if (dev->features == features)
10248		goto sync_lower;
10249
10250	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
10251		&dev->features, &features);
10252
10253	if (dev->netdev_ops->ndo_set_features)
10254		err = dev->netdev_ops->ndo_set_features(dev, features);
10255	else
10256		err = 0;
10257
10258	if (unlikely(err < 0)) {
10259		netdev_err(dev,
10260			"set_features() failed (%d); wanted %pNF, left %pNF\n",
10261			err, &features, &dev->features);
10262		/* return non-0 since some features might have changed and
10263		 * it's better to fire a spurious notification than miss it
10264		 */
10265		return -1;
10266	}
10267
10268sync_lower:
10269	/* some features must be disabled on lower devices when disabled
10270	 * on an upper device (think: bonding master or bridge)
10271	 */
10272	netdev_for_each_lower_dev(dev, lower, iter)
10273		netdev_sync_lower_features(dev, lower, features);
10274
	/* dev->features is only written here when err is exactly 0; a
	 * positive return from ndo_set_features() skips this block
	 */
10275	if (!err) {
10276		netdev_features_t diff = features ^ dev->features;
10277
10278		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
10279			/* udp_tunnel_{get,drop}_rx_info both need
10280			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
10281			 * device, or they won't do anything.
10282			 * Thus we need to update dev->features
10283			 * *before* calling udp_tunnel_get_rx_info,
10284			 * but *after* calling udp_tunnel_drop_rx_info.
10285			 */
10286			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
10287				dev->features = features;
10288				udp_tunnel_get_rx_info(dev);
10289			} else {
10290				udp_tunnel_drop_rx_info(dev);
10291			}
10292		}
10293
		/* same update-before-get / drop-before-update ordering as the
		 * UDP tunnel case above, for the two VLAN filter features
		 */
10294		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
10295			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
10296				dev->features = features;
10297				err |= vlan_get_rx_ctag_filter_info(dev);
10298			} else {
10299				vlan_drop_rx_ctag_filter_info(dev);
10300			}
10301		}
10302
10303		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
10304			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
10305				dev->features = features;
10306				err |= vlan_get_rx_stag_filter_info(dev);
10307			} else {
10308				vlan_drop_rx_stag_filter_info(dev);
10309			}
10310		}
10311
10312		dev->features = features;
10313	}
10314
10315	return err < 0 ? 0 : 1;
10316}
10317
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and notify listeners when
 *	__netdev_update_features() reports a change. Call this after driver
 *	or hardware dependent conditions that influence the features might
 *	have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
10332
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and always send a notification,
 *	whether or not anything changed. Use this rather than
 *	netdev_update_features() when dev->vlan_features might have changed
 *	as well, so that the change can reach stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* the result is deliberately ignored: notify unconditionally */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
10349
10350/**
10351 *	netif_stacked_transfer_operstate -	transfer operstate
10352 *	@rootdev: the root or lower level device to transfer state from
10353 *	@dev: the device to transfer operstate to
10354 *
10355 *	Transfer operational state from root to device. This is normally
10356 *	called when a stacking relationship exists between the root
10357 *	device and the device(a leaf device).
10358 */
10359void netif_stacked_transfer_operstate(const struct net_device *rootdev,
10360					struct net_device *dev)
10361{
10362	if (rootdev->operstate == IF_OPER_DORMANT)
10363		netif_dormant_on(dev);
10364	else
10365		netif_dormant_off(dev);
10366
10367	if (rootdev->operstate == IF_OPER_TESTING)
10368		netif_testing_on(dev);
10369	else
10370		netif_testing_off(dev);
10371
10372	if (netif_carrier_ok(rootdev))
10373		netif_carrier_on(dev);
10374	else
10375		netif_carrier_off(dev);
10376}
10377EXPORT_SYMBOL(netif_stacked_transfer_operstate);
10378
 
10379static int netif_alloc_rx_queues(struct net_device *dev)
10380{
10381	unsigned int i, count = dev->num_rx_queues;
10382	struct netdev_rx_queue *rx;
10383	size_t sz = count * sizeof(*rx);
10384	int err = 0;
10385
10386	BUG_ON(count < 1);
10387
10388	rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10389	if (!rx)
10390		return -ENOMEM;
10391
 
 
10392	dev->_rx = rx;
10393
10394	for (i = 0; i < count; i++) {
10395		rx[i].dev = dev;
10396
10397		/* XDP RX-queue setup */
10398		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
10399		if (err < 0)
10400			goto err_rxq_info;
10401	}
10402	return 0;
10403
10404err_rxq_info:
10405	/* Rollback successful reg's and free other resources */
10406	while (i--)
10407		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10408	kvfree(dev->_rx);
10409	dev->_rx = NULL;
10410	return err;
10411}
10412
10413static void netif_free_rx_queues(struct net_device *dev)
10414{
10415	unsigned int i, count = dev->num_rx_queues;
10416
10417	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
10418	if (!dev->_rx)
10419		return;
10420
10421	for (i = 0; i < count; i++)
10422		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10423
10424	kvfree(dev->_rx);
10425}
 
10426
10427static void netdev_init_one_queue(struct net_device *dev,
10428				  struct netdev_queue *queue, void *_unused)
10429{
10430	/* Initialize queue lock */
10431	spin_lock_init(&queue->_xmit_lock);
10432	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10433	queue->xmit_lock_owner = -1;
10434	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10435	queue->dev = dev;
10436#ifdef CONFIG_BQL
10437	dql_init(&queue->dql, HZ);
10438#endif
10439}
10440
10441static void netif_free_tx_queues(struct net_device *dev)
10442{
10443	kvfree(dev->_tx);
10444}
10445
10446static int netif_alloc_netdev_queues(struct net_device *dev)
10447{
10448	unsigned int count = dev->num_tx_queues;
10449	struct netdev_queue *tx;
10450	size_t sz = count * sizeof(*tx);
10451
10452	if (count < 1 || count > 0xffff)
10453		return -EINVAL;
10454
10455	tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10456	if (!tx)
10457		return -ENOMEM;
10458
 
 
10459	dev->_tx = tx;
10460
10461	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10462	spin_lock_init(&dev->tx_global_lock);
10463
10464	return 0;
10465}
10466
10467void netif_tx_stop_all_queues(struct net_device *dev)
10468{
10469	unsigned int i;
10470
10471	for (i = 0; i < dev->num_tx_queues; i++) {
10472		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10473
10474		netif_tx_stop_queue(txq);
10475	}
10476}
10477EXPORT_SYMBOL(netif_tx_stop_all_queues);
10478
10479static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10480{
10481	void __percpu *v;
10482
10483	/* Drivers implementing ndo_get_peer_dev must support tstat
10484	 * accounting, so that skb_do_redirect() can bump the dev's
10485	 * RX stats upon network namespace switch.
10486	 */
10487	if (dev->netdev_ops->ndo_get_peer_dev &&
10488	    dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10489		return -EOPNOTSUPP;
10490
10491	switch (dev->pcpu_stat_type) {
10492	case NETDEV_PCPU_STAT_NONE:
10493		return 0;
10494	case NETDEV_PCPU_STAT_LSTATS:
10495		v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10496		break;
10497	case NETDEV_PCPU_STAT_TSTATS:
10498		v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10499		break;
10500	case NETDEV_PCPU_STAT_DSTATS:
10501		v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10502		break;
10503	default:
10504		return -EINVAL;
10505	}
10506
10507	return v ? 0 : -ENOMEM;
10508}
10509
10510static void netdev_do_free_pcpu_stats(struct net_device *dev)
10511{
10512	switch (dev->pcpu_stat_type) {
10513	case NETDEV_PCPU_STAT_NONE:
10514		return;
10515	case NETDEV_PCPU_STAT_LSTATS:
10516		free_percpu(dev->lstats);
10517		break;
10518	case NETDEV_PCPU_STAT_TSTATS:
10519		free_percpu(dev->tstats);
10520		break;
10521	case NETDEV_PCPU_STAT_DSTATS:
10522		free_percpu(dev->dstats);
10523		break;
10524	}
10525}
10526
10527static void netdev_free_phy_link_topology(struct net_device *dev)
10528{
10529	struct phy_link_topology *topo = dev->link_topo;
10530
10531	if (IS_ENABLED(CONFIG_PHYLIB) && topo) {
10532		xa_destroy(&topo->phys);
10533		kfree(topo);
10534		dev->link_topo = NULL;
10535	}
10536}
10537
/**
 * register_netdevice() - register a network device
 * @dev: device to register
 *
 * Take a prepared network device structure and make it externally accessible.
 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 * Callers must hold the rtnl lock - you may want register_netdev()
 * instead of this.
 *
 * Return: 0 on success. On failure the error of the failing step is
 * returned; a positive ndo_init() result is reported as -EIO.
 */
int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
		     NETDEV_FEATURE_COUNT);
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	/* Nothing has been set up yet, so a plain return is sufficient */
	ret = ethtool_check_ops(dev->ethtool_ops);
	if (ret)
		return ret;

	/* rss ctx ID 0 is reserved for the default context, start from 1 */
	xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
	mutex_init(&dev->ethtool->rss_lock);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	/* Preset the error for the allocation that follows */
	ret = -ENOMEM;
	dev->name_node = netdev_name_node_head_alloc(dev);
	if (!dev->name_node)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* Positive driver return values are mapped to -EIO */
			if (ret > 0)
				ret = -EIO;
			goto err_free_name;
		}
	}

	/* Advertising CTAG filtering requires both add_vid and kill_vid ops */
	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	ret = netdev_do_alloc_pcpu_stats(dev);
	if (ret)
		goto err_uninit;

	/* A non-negative result is the ifindex to use for this device */
	ret = dev_index_reserve(net, dev->ifindex);
	if (ret < 0)
		goto err_free_pcpu;
	dev->ifindex = ret;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
	dev->features |= NETIF_F_SOFT_FEATURES;

	if (dev->udp_tunnel_nic_info) {
		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
	}

	dev->wanted_features = dev->features & dev->hw_features;

	/* Added after wanted_features is computed, so it is not part of it */
	if (!(dev->flags & IFF_LOOPBACK))
		dev->hw_features |= NETIF_F_NOCACHE_COPY;

	/* If IPv4 TCP segmentation offload is supported we should also
	 * allow the device to enable segmenting the frame with the option
	 * of ignoring a static IP ID value.  This doesn't enable the
	 * feature itself but allows the user to enable it later.
	 */
	if (dev->hw_features & NETIF_F_TSO)
		dev->hw_features |= NETIF_F_TSO_MANGLEID;
	if (dev->vlan_features & NETIF_F_TSO)
		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
	if (dev->mpls_features & NETIF_F_TSO)
		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
	if (dev->hw_enc_features & NETIF_F_TSO)
		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	/* A NETDEV_POST_INIT notifier may veto the registration */
	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_ifindex_release;

	ret = netdev_register_kobject(dev);

	/* reg_state is updated for both outcomes before the error check */
	WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);

	if (ret)
		goto err_uninit_notify;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);

	/* Take a tracked reference on the device before listing it */
	netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
	list_netdevice(dev);

	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* From this point the device is listed, so unwinding goes
		 * through unregister_netdevice_queue() instead of the error
		 * labels below.
		 */
		/* Expect explicit free_netdev() on failure */
		dev->needs_free_netdev = false;
		unregister_netdevice_queue(dev, NULL);
		goto out;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

out:
	return ret;

	/* Error unwinding: each label falls through into the cleanup of
	 * everything that was set up before the failing step.
	 */
err_uninit_notify:
	call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_ifindex_release:
	dev_index_release(net, dev->ifindex);
err_free_pcpu:
	netdev_do_free_pcpu_stats(dev);
err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	if (dev->priv_destructor)
		dev->priv_destructor(dev);
err_free_name:
	netdev_name_node_free(dev->name_node);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
10726
10727/* Initialize the core of a dummy net device.
10728 * This is useful if you are calling this function after alloc_netdev(),
10729 * since it does not memset the net_device fields.
 
 
 
 
 
 
10730 */
10731static void init_dummy_netdev_core(struct net_device *dev)
10732{
 
 
 
 
 
 
 
10733	/* make sure we BUG if trying to hit standard
10734	 * register/unregister code path
10735	 */
10736	dev->reg_state = NETREG_DUMMY;
10737
10738	/* NAPI wants this */
10739	INIT_LIST_HEAD(&dev->napi_list);
10740
10741	/* a dummy interface is started by default */
10742	set_bit(__LINK_STATE_PRESENT, &dev->state);
10743	set_bit(__LINK_STATE_START, &dev->state);
10744
10745	/* napi_busy_loop stats accounting wants this */
10746	dev_net_set(dev, &init_net);
10747
10748	/* Note : We dont allocate pcpu_refcnt for dummy devices,
10749	 * because users of this 'device' dont need to change
10750	 * its refcount.
10751	 */
10752}
10753
10754/**
10755 *	init_dummy_netdev	- init a dummy network device for NAPI
10756 *	@dev: device to init
10757 *
10758 *	This takes a network device structure and initializes the minimum
10759 *	amount of fields so it can be used to schedule NAPI polls without
10760 *	registering a full blown interface. This is to be used by drivers
10761 *	that need to tie several hardware interfaces to a single NAPI
10762 *	poll scheduler due to HW limitations.
10763 */
10764void init_dummy_netdev(struct net_device *dev)
10765{
10766	/* Clear everything. Note we don't initialize spinlocks
10767	 * as they aren't supposed to be taken by any of the
10768	 * NAPI code and this dummy netdev is supposed to be
10769	 * only ever used for NAPI polls
10770	 */
10771	memset(dev, 0, sizeof(struct net_device));
10772	init_dummy_netdev_core(dev);
10773}
10774EXPORT_SYMBOL_GPL(init_dummy_netdev);
10775
 
10776/**
10777 *	register_netdev	- register a network device
10778 *	@dev: device to register
10779 *
10780 *	Take a completed network device structure and add it to the kernel
10781 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10782 *	chain. 0 is returned on success. A negative errno code is returned
10783 *	on a failure to set up the device, or if the name is a duplicate.
10784 *
10785 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10786 *	and expands the device name if you passed a format string to
10787 *	alloc_netdev.
10788 */
10789int register_netdev(struct net_device *dev)
10790{
10791	int err;
10792
10793	if (rtnl_lock_killable())
10794		return -EINTR;
10795	err = register_netdevice(dev);
10796	rtnl_unlock();
10797	return err;
10798}
10799EXPORT_SYMBOL(register_netdev);
10800
10801int netdev_refcnt_read(const struct net_device *dev)
10802{
10803#ifdef CONFIG_PCPU_DEV_REFCNT
10804	int i, refcnt = 0;
10805
10806	for_each_possible_cpu(i)
10807		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10808	return refcnt;
10809#else
10810	return refcount_read(&dev->dev_refcnt);
10811#endif
10812}
10813EXPORT_SYMBOL(netdev_refcnt_read);
10814
/* Interval, in seconds, between the "waiting for ... to become free"
 * warnings printed by netdev_wait_allrefs_any().
 */
int netdev_unregister_timeout_secs __read_mostly = 10;

/* Bounds, in milliseconds, of the sleep between refcount polls in
 * netdev_wait_allrefs_any(); the sleep doubles from MIN up to MAX.
 */
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
 * netdev_wait_allrefs_any - wait until all references are gone.
 * @list: list of net_devices to wait on
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * Return: the first device found on @list whose netdev_refcnt_read()
 * equals 1. The function does not return until such a device exists.
 */
static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
	unsigned long rebroadcast_time, warning_time;
	struct net_device *dev;
	int wait = 0;

	rebroadcast_time = warning_time = jiffies;

	/* Fast path: a device may already be down to its last reference */
	list_for_each_entry(dev, list, todo_list)
		if (netdev_refcnt_read(dev) == 1)
			return dev;

	while (true) {
		/* At most once per second (HZ jiffies), re-send UNREGISTER */
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			list_for_each_entry(dev, list, todo_list)
				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* rcu_barrier() is run with the rtnl lock dropped */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			list_for_each_entry(dev, list, todo_list)
				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
					     &dev->state)) {
					/* We must not have linkwatch events
					 * pending on unregister. If this
					 * happens, we simply run the queue
					 * unscheduled, resulting in a noop
					 * for this device.
					 */
					linkwatch_run_queue();
					break;
				}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		rcu_barrier();

		/* First pass does not sleep; later passes sleep for 'wait'
		 * ms and double it, capped at WAIT_REFS_MAX_MSECS.
		 */
		if (!wait) {
			wait = WAIT_REFS_MIN_MSECS;
		} else {
			msleep(wait);
			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
		}

		list_for_each_entry(dev, list, todo_list)
			if (netdev_refcnt_read(dev) == 1)
				return dev;

		/* Every netdev_unregister_timeout_secs, report each device
		 * still on the list along with its reference tracker state.
		 */
		if (time_after(jiffies, warning_time +
			       READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
			list_for_each_entry(dev, list, todo_list) {
				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
					 dev->name, netdev_refcnt_read(dev));
				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
			}

			warning_time = jiffies;
		}
	}
}
10898
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * Note that this function itself drops the rtnl lock via __rtnl_unlock()
 * after taking a private snapshot of net_todo_list.
 */
void netdev_run_todo(void)
{
	struct net_device *dev, *tmp;
	struct list_head list;
	int cnt;
#ifdef CONFIG_LOCKDEP
	struct list_head unlink_list;

	/* Take over net_unlink_list and recompute nested_level from
	 * lower_level for every device on it.
	 */
	list_replace_init(&net_unlink_list, &unlink_list);

	while (!list_empty(&unlink_list)) {
		struct net_device *dev = list_first_entry(&unlink_list,
							  struct net_device,
							  unlink_list);
		list_del_init(&dev->unlink_list);
		dev->nested_level = dev->lower_level - 1;
	}
#endif

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	/* Phase 1: move every snapshotted device to NETREG_UNREGISTERED;
	 * devices in an unexpected state are warned about and dropped
	 * from the list.
	 */
	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			netdev_WARN(dev, "run_todo but not unregistering\n");
			list_del(&dev->todo_list);
			continue;
		}

		WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
		linkwatch_sync_dev(dev);
	}

	/* Phase 2: release devices one at a time, in whatever order their
	 * reference counts drop to 1.
	 */
	cnt = 0;
	while (!list_empty(&list)) {
		dev = netdev_wait_allrefs_any(&list);
		list_del(&dev->todo_list);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev) != 1);
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));

		netdev_do_free_pcpu_stats(dev);
		if (dev->priv_destructor)
			dev->priv_destructor(dev);
		if (dev->needs_free_netdev)
			free_netdev(dev);

		cnt++;

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
	/* Wake waiters once dev_unreg_count drops to zero */
	if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
		wake_up(&netdev_unregistering_wq);
}
10988
10989/* Collate per-cpu network dstats statistics
10990 *
10991 * Read per-cpu network statistics from dev->dstats and populate the related
10992 * fields in @s.
10993 */
10994static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
10995			     const struct pcpu_dstats __percpu *dstats)
10996{
10997	int cpu;
10998
10999	for_each_possible_cpu(cpu) {
11000		u64 rx_packets, rx_bytes, rx_drops;
11001		u64 tx_packets, tx_bytes, tx_drops;
11002		const struct pcpu_dstats *stats;
11003		unsigned int start;
11004
11005		stats = per_cpu_ptr(dstats, cpu);
11006		do {
11007			start = u64_stats_fetch_begin(&stats->syncp);
11008			rx_packets = u64_stats_read(&stats->rx_packets);
11009			rx_bytes   = u64_stats_read(&stats->rx_bytes);
11010			rx_drops   = u64_stats_read(&stats->rx_drops);
11011			tx_packets = u64_stats_read(&stats->tx_packets);
11012			tx_bytes   = u64_stats_read(&stats->tx_bytes);
11013			tx_drops   = u64_stats_read(&stats->tx_drops);
11014		} while (u64_stats_fetch_retry(&stats->syncp, start));
11015
11016		s->rx_packets += rx_packets;
11017		s->rx_bytes   += rx_bytes;
11018		s->rx_dropped += rx_drops;
11019		s->tx_packets += tx_packets;
11020		s->tx_bytes   += tx_bytes;
11021		s->tx_dropped += tx_drops;
11022	}
11023}
11024
11025/* ndo_get_stats64 implementation for dtstats-based accounting.
11026 *
11027 * Populate @s from dev->stats and dev->dstats. This is used internally by the
11028 * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
11029 */
11030static void dev_get_dstats64(const struct net_device *dev,
11031			     struct rtnl_link_stats64 *s)
11032{
11033	netdev_stats_to_stats64(s, &dev->stats);
11034	dev_fetch_dstats(s, dev->dstats);
11035}
11036
11037/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
11038 * all the same fields in the same order as net_device_stats, with only
11039 * the type differing, but rtnl_link_stats64 may have additional fields
11040 * at the end for newer counters.
11041 */
11042void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
11043			     const struct net_device_stats *netdev_stats)
11044{
11045	size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
11046	const atomic_long_t *src = (atomic_long_t *)netdev_stats;
 
 
 
 
 
 
 
11047	u64 *dst = (u64 *)stats64;
11048
11049	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
11050	for (i = 0; i < n; i++)
11051		dst[i] = (unsigned long)atomic_long_read(&src[i]);
11052	/* zero out counters that only exist in rtnl_link_stats64 */
11053	memset((char *)stats64 + n * sizeof(u64), 0,
11054	       sizeof(*stats64) - n * sizeof(u64));
 
11055}
11056EXPORT_SYMBOL(netdev_stats_to_stats64);
11057
11058static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
11059		struct net_device *dev)
11060{
11061	struct net_device_core_stats __percpu *p;
11062
11063	p = alloc_percpu_gfp(struct net_device_core_stats,
11064			     GFP_ATOMIC | __GFP_NOWARN);
11065
11066	if (p && cmpxchg(&dev->core_stats, NULL, p))
11067		free_percpu(p);
11068
11069	/* This READ_ONCE() pairs with the cmpxchg() above */
11070	return READ_ONCE(dev->core_stats);
11071}
11072
11073noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
11074{
11075	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
11076	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
11077	unsigned long __percpu *field;
11078
11079	if (unlikely(!p)) {
11080		p = netdev_core_stats_alloc(dev);
11081		if (!p)
11082			return;
11083	}
11084
11085	field = (unsigned long __percpu *)((void __percpu *)p + offset);
11086	this_cpu_inc(*field);
11087}
11088EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
11089
11090/**
11091 *	dev_get_stats	- get network device statistics
11092 *	@dev: device to get statistics from
11093 *	@storage: place to store stats
11094 *
11095 *	Get network statistics from device. Return @storage.
11096 *	The device driver may provide its own method by setting
11097 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
11098 *	otherwise the internal statistics structure is used.
11099 */
11100struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
11101					struct rtnl_link_stats64 *storage)
11102{
11103	const struct net_device_ops *ops = dev->netdev_ops;
11104	const struct net_device_core_stats __percpu *p;
11105
11106	if (ops->ndo_get_stats64) {
11107		memset(storage, 0, sizeof(*storage));
11108		ops->ndo_get_stats64(dev, storage);
11109	} else if (ops->ndo_get_stats) {
11110		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
11111	} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
11112		dev_get_tstats64(dev, storage);
11113	} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
11114		dev_get_dstats64(dev, storage);
11115	} else {
11116		netdev_stats_to_stats64(storage, &dev->stats);
11117	}
11118
11119	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
11120	p = READ_ONCE(dev->core_stats);
11121	if (p) {
11122		const struct net_device_core_stats *core_stats;
11123		int i;
11124
11125		for_each_possible_cpu(i) {
11126			core_stats = per_cpu_ptr(p, i);
11127			storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
11128			storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
11129			storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
11130			storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
11131		}
11132	}
11133	return storage;
11134}
11135EXPORT_SYMBOL(dev_get_stats);
11136
11137/**
11138 *	dev_fetch_sw_netstats - get per-cpu network device statistics
11139 *	@s: place to store stats
11140 *	@netstats: per-cpu network stats to read from
11141 *
11142 *	Read per-cpu network statistics and populate the related fields in @s.
11143 */
11144void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
11145			   const struct pcpu_sw_netstats __percpu *netstats)
11146{
11147	int cpu;
11148
11149	for_each_possible_cpu(cpu) {
11150		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
11151		const struct pcpu_sw_netstats *stats;
11152		unsigned int start;
11153
11154		stats = per_cpu_ptr(netstats, cpu);
11155		do {
11156			start = u64_stats_fetch_begin(&stats->syncp);
11157			rx_packets = u64_stats_read(&stats->rx_packets);
11158			rx_bytes   = u64_stats_read(&stats->rx_bytes);
11159			tx_packets = u64_stats_read(&stats->tx_packets);
11160			tx_bytes   = u64_stats_read(&stats->tx_bytes);
11161		} while (u64_stats_fetch_retry(&stats->syncp, start));
11162
11163		s->rx_packets += rx_packets;
11164		s->rx_bytes   += rx_bytes;
11165		s->tx_packets += tx_packets;
11166		s->tx_bytes   += tx_bytes;
11167	}
11168}
11169EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
11170
11171/**
11172 *	dev_get_tstats64 - ndo_get_stats64 implementation
11173 *	@dev: device to get statistics from
11174 *	@s: place to store stats
11175 *
11176 *	Populate @s from dev->stats and dev->tstats. Can be used as
11177 *	ndo_get_stats64() callback.
11178 */
11179void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
11180{
11181	netdev_stats_to_stats64(s, &dev->stats);
11182	dev_fetch_sw_netstats(s, dev->tstats);
11183}
11184EXPORT_SYMBOL_GPL(dev_get_tstats64);
11185
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the existing pointer is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			RCU_INIT_POINTER(ingress->qdisc_sleeping, &noop_qdisc);
			/* Publish only after the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
11203
11204static const struct ethtool_ops default_ethtool_ops;
11205
11206void netdev_set_default_ethtool_ops(struct net_device *dev,
11207				    const struct ethtool_ops *ops)
11208{
11209	if (dev->ethtool_ops == &default_ethtool_ops)
11210		dev->ethtool_ops = ops;
11211}
11212EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
11213
11214/**
11215 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
11216 * @dev: netdev to enable the IRQ coalescing on
11217 *
11218 * Sets a conservative default for SW IRQ coalescing. Users can use
11219 * sysfs attributes to override the default values.
11220 */
11221void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
11222{
11223	WARN_ON(dev->reg_state == NETREG_REGISTERED);
11224
11225	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
11226		netdev_set_gro_flush_timeout(dev, 20000);
11227		netdev_set_defer_hard_irqs(dev, 1);
11228	}
11229}
11230EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
11231
/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
 *
 * Return: the new net_device, or NULL if @txqs or @rxqs is zero or any
 * allocation fails.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t napi_config_sz;
	unsigned int maxqs;

	/* @name must fit into dev->name including the terminating NUL */
	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}

	/* napi_config below is sized for the larger of the two queue counts */
	maxqs = max(txqs, rxqs);

	/* The private area is allocated together with the net_device */
	dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
	if (!dev)
		return NULL;

	dev->priv_len = sizeof_priv;

	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
	/* The device starts out holding one reference in either config */
#ifdef CONFIG_PCPU_DEV_REFCNT
	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;
	__dev_hold(dev);
#else
	refcount_set(&dev->dev_refcnt, 1);
#endif

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	/* Defaults, set before the driver's setup() callback runs */
	dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
	dev->xdp_zc_max_segs = 1;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
	dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
	dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
	dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
	dev->tso_max_segs = TSO_MAX_SEGS;
	dev->upper_level = 1;
	dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
	dev->nested_level = 0;
	INIT_LIST_HEAD(&dev->unlink_list);
#endif

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif

	mutex_init(&dev->lock);

	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	/* A tx_queue_len still zero after setup() marks the device as
	 * IFF_NO_QUEUE; the length then falls back to the default.
	 */
	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
	}

	/* From here on, failures unwind through free_netdev() (free_all) */
	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
	dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
	if (!dev->ethtool)
		goto free_all;

	napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
	dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
	if (!dev->napi_config)
		goto free_all;

	strscpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	/* Keep ethtool_ops chosen by setup(); otherwise use the placeholder */
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_netdev_init(dev);

	return dev;

free_all:
	free_netdev(dev);
	return NULL;

	/* Early failures: only the refcount storage and dev itself exist */
free_pcpu:
#ifdef CONFIG_PCPU_DEV_REFCNT
	free_percpu(dev->pcpu_refcnt);
free_dev:
#endif
	kvfree(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
11372
11373/**
11374 * free_netdev - free network device
11375 * @dev: device
11376 *
11377 * This function does the last stage of destroying an allocated device
11378 * interface. The reference to the device object is released. If this
11379 * is the last reference then it will be freed.Must be called in process
11380 * context.
11381 */
11382void free_netdev(struct net_device *dev)
11383{
11384	struct napi_struct *p, *n;
11385
11386	might_sleep();
11387
11388	/* When called immediately after register_netdevice() failed the unwind
11389	 * handling may still be dismantling the device. Handle that case by
11390	 * deferring the free.
11391	 */
11392	if (dev->reg_state == NETREG_UNREGISTERING) {
11393		ASSERT_RTNL();
11394		dev->needs_free_netdev = true;
11395		return;
11396	}
11397
11398	mutex_destroy(&dev->lock);
11399
11400	kfree(dev->ethtool);
11401	netif_free_tx_queues(dev);
11402	netif_free_rx_queues(dev);
 
 
11403
11404	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
11405
11406	/* Flush device addresses */
11407	dev_addr_flush(dev);
11408
11409	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
11410		netif_napi_del(p);
11411
11412	kvfree(dev->napi_config);
11413
11414	ref_tracker_dir_exit(&dev->refcnt_tracker);
11415#ifdef CONFIG_PCPU_DEV_REFCNT
11416	free_percpu(dev->pcpu_refcnt);
11417	dev->pcpu_refcnt = NULL;
11418#endif
11419	free_percpu(dev->core_stats);
11420	dev->core_stats = NULL;
11421	free_percpu(dev->xdp_bulkq);
11422	dev->xdp_bulkq = NULL;
11423
11424	netdev_free_phy_link_topology(dev);
11425
11426	/*  Compatibility with error handling in drivers */
11427	if (dev->reg_state == NETREG_UNINITIALIZED ||
11428	    dev->reg_state == NETREG_DUMMY) {
11429		kvfree(dev);
11430		return;
11431	}
11432
11433	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
11434	WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
11435
11436	/* will free via device release */
11437	put_device(&dev->dev);
11438}
11439EXPORT_SYMBOL(free_netdev);
11440
11441/**
11442 * alloc_netdev_dummy - Allocate and initialize a dummy net device.
11443 * @sizeof_priv: size of private data to allocate space for
11444 *
11445 * Return: the allocated net_device on success, NULL otherwise
11446 */
11447struct net_device *alloc_netdev_dummy(int sizeof_priv)
11448{
11449	return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
11450			    init_dummy_netdev_core);
11451}
11452EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
11453
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used when rtnl_is_locked()
 *	reports the rtnl lock as held, the regular one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
11469
11470static void netdev_rss_contexts_free(struct net_device *dev)
11471{
11472	struct ethtool_rxfh_context *ctx;
11473	unsigned long context;
11474
11475	mutex_lock(&dev->ethtool->rss_lock);
11476	xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
11477		struct ethtool_rxfh_param rxfh;
11478
11479		rxfh.indir = ethtool_rxfh_context_indir(ctx);
11480		rxfh.key = ethtool_rxfh_context_key(ctx);
11481		rxfh.hfunc = ctx->hfunc;
11482		rxfh.input_xfrm = ctx->input_xfrm;
11483		rxfh.rss_context = context;
11484		rxfh.rss_delete = true;
11485
11486		xa_erase(&dev->ethtool->rss_ctx, context);
11487		if (dev->ethtool_ops->create_rxfh_context)
11488			dev->ethtool_ops->remove_rxfh_context(dev, ctx,
11489							      context, NULL);
11490		else
11491			dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
11492		kfree(ctx);
11493	}
11494	xa_destroy(&dev->ethtool->rss_ctx);
11495	mutex_unlock(&dev->ethtool->rss_lock);
11496}
11497
11498/**
11499 *	unregister_netdevice_queue - remove device from the kernel
11500 *	@dev: device
11501 *	@head: list
11502 *
11503 *	This function shuts down a device interface and removes it
11504 *	from the kernel tables.
11505 *	If head not NULL, device is queued to be unregistered later.
11506 *
11507 *	Callers must hold the rtnl semaphore.  You may want
11508 *	unregister_netdev() instead of this.
11509 */
11510
11511void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
11512{
11513	ASSERT_RTNL();
11514
11515	if (head) {
11516		list_move_tail(&dev->unreg_list, head);
11517	} else {
11518		LIST_HEAD(single);
11519
11520		list_add(&dev->unreg_list, &single);
11521		unregister_netdevice_many(&single);
11522	}
11523}
11524EXPORT_SYMBOL(unregister_netdevice_queue);
11525
11526void unregister_netdevice_many_notify(struct list_head *head,
11527				      u32 portid, const struct nlmsghdr *nlh)
11528{
11529	struct net_device *dev, *tmp;
11530	LIST_HEAD(close_head);
11531	int cnt = 0;
11532
11533	BUG_ON(dev_boot_phase);
11534	ASSERT_RTNL();
11535
11536	if (list_empty(head))
11537		return;
11538
11539	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11540		/* Some devices call without registering
11541		 * for initialization unwind. Remove those
11542		 * devices and proceed with the remaining.
11543		 */
11544		if (dev->reg_state == NETREG_UNINITIALIZED) {
11545			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11546				 dev->name, dev);
11547
11548			WARN_ON(1);
11549			list_del(&dev->unreg_list);
11550			continue;
11551		}
11552		dev->dismantle = true;
11553		BUG_ON(dev->reg_state != NETREG_REGISTERED);
11554	}
11555
11556	/* If device is running, close it first. */
11557	list_for_each_entry(dev, head, unreg_list)
11558		list_add_tail(&dev->close_list, &close_head);
11559	dev_close_many(&close_head, true);
11560
11561	list_for_each_entry(dev, head, unreg_list) {
11562		/* And unlink it from device chain. */
11563		unlist_netdevice(dev);
11564		WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
11565	}
11566	flush_all_backlogs();
11567
11568	synchronize_net();
11569
11570	list_for_each_entry(dev, head, unreg_list) {
11571		struct sk_buff *skb = NULL;
11572
11573		/* Shutdown queueing discipline. */
11574		dev_shutdown(dev);
11575		dev_tcx_uninstall(dev);
11576		dev_xdp_uninstall(dev);
11577		bpf_dev_bound_netdev_unregister(dev);
11578		dev_dmabuf_uninstall(dev);
11579
11580		netdev_offload_xstats_disable_all(dev);
11581
11582		/* Notify protocols, that we are about to destroy
11583		 * this device. They should clean all the things.
11584		 */
11585		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11586
11587		if (!dev->rtnl_link_ops ||
11588		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11589			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11590						     GFP_KERNEL, NULL, 0,
11591						     portid, nlh);
11592
11593		/*
11594		 *	Flush the unicast and multicast chains
11595		 */
11596		dev_uc_flush(dev);
11597		dev_mc_flush(dev);
11598
11599		netdev_name_node_alt_flush(dev);
11600		netdev_name_node_free(dev->name_node);
11601
11602		netdev_rss_contexts_free(dev);
11603
11604		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11605
11606		if (dev->netdev_ops->ndo_uninit)
11607			dev->netdev_ops->ndo_uninit(dev);
11608
11609		mutex_destroy(&dev->ethtool->rss_lock);
11610
11611		net_shaper_flush_netdev(dev);
11612
11613		if (skb)
11614			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11615
11616		/* Notifier chain MUST detach us all upper devices. */
11617		WARN_ON(netdev_has_any_upper_dev(dev));
11618		WARN_ON(netdev_has_any_lower_dev(dev));
11619
11620		/* Remove entries from kobject tree */
11621		netdev_unregister_kobject(dev);
11622#ifdef CONFIG_XPS
11623		/* Remove XPS queueing entries */
11624		netif_reset_xps_queues_gt(dev, 0);
11625#endif
11626	}
11627
11628	synchronize_net();
11629
11630	list_for_each_entry(dev, head, unreg_list) {
11631		netdev_put(dev, &dev->dev_registered_tracker);
11632		net_set_todo(dev);
11633		cnt++;
11634	}
11635	atomic_add(cnt, &dev_unreg_count);
11636
11637	list_del(head);
11638}
 
11639
11640/**
11641 *	unregister_netdevice_many - unregister many devices
11642 *	@head: list of devices
11643 *
11644 *  Note: As most callers use a stack allocated list_head,
11645 *  we force a list_del() to make sure stack won't be corrupted later.
11646 */
11647void unregister_netdevice_many(struct list_head *head)
11648{
11649	unregister_netdevice_many_notify(head, 0, NULL);
 
 
 
 
 
 
 
11650}
11651EXPORT_SYMBOL(unregister_netdevice_many);
11652
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This is unregister_netdevice() bracketed by rtnl_lock() and
 *	rtnl_unlock(), so the caller must not already hold the rtnl
 *	semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
11671
11672/**
11673 *	__dev_change_net_namespace - move device to different nethost namespace
11674 *	@dev: device
11675 *	@net: network namespace
11676 *	@pat: If not NULL name pattern to try if the current device name
11677 *	      is already taken in the destination network namespace.
11678 *	@new_ifindex: If not zero, specifies device index in the target
11679 *	              namespace.
11680 *
11681 *	This function shuts down a device interface and moves it
11682 *	to a new network namespace. On success 0 is returned, on
11683 *	a failure a netagive errno code is returned.
11684 *
11685 *	Callers must hold the rtnl semaphore.
11686 */
11687
11688int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11689			       const char *pat, int new_ifindex)
11690{
11691	struct netdev_name_node *name_node;
11692	struct net *net_old = dev_net(dev);
11693	char new_name[IFNAMSIZ] = {};
11694	int err, new_nsid;
11695
11696	ASSERT_RTNL();
11697
11698	/* Don't allow namespace local devices to be moved. */
11699	err = -EINVAL;
11700	if (dev->netns_local)
11701		goto out;
11702
11703	/* Ensure the device has been registered */
11704	if (dev->reg_state != NETREG_REGISTERED)
11705		goto out;
11706
11707	/* Get out if there is nothing todo */
11708	err = 0;
11709	if (net_eq(net_old, net))
11710		goto out;
11711
11712	/* Pick the destination device name, and ensure
11713	 * we can use it in the destination network namespace.
11714	 */
11715	err = -EEXIST;
11716	if (netdev_name_in_use(net, dev->name)) {
11717		/* We get here if we can't use the current device name */
11718		if (!pat)
11719			goto out;
11720		err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
11721		if (err < 0)
11722			goto out;
11723	}
11724	/* Check that none of the altnames conflicts. */
11725	err = -EEXIST;
11726	netdev_for_each_altname(dev, name_node)
11727		if (netdev_name_in_use(net, name_node->name))
11728			goto out;
11729
11730	/* Check that new_ifindex isn't used yet. */
11731	if (new_ifindex) {
11732		err = dev_index_reserve(net, new_ifindex);
11733		if (err < 0)
11734			goto out;
11735	} else {
11736		/* If there is an ifindex conflict assign a new one */
11737		err = dev_index_reserve(net, dev->ifindex);
11738		if (err == -EBUSY)
11739			err = dev_index_reserve(net, 0);
11740		if (err < 0)
11741			goto out;
11742		new_ifindex = err;
11743	}
11744
11745	/*
11746	 * And now a mini version of register_netdevice unregister_netdevice.
11747	 */
11748
11749	/* If device is running close it first. */
11750	dev_close(dev);
11751
11752	/* And unlink it from device chain */
 
11753	unlist_netdevice(dev);
11754
11755	synchronize_net();
11756
11757	/* Shutdown queueing discipline. */
11758	dev_shutdown(dev);
11759
11760	/* Notify protocols, that we are about to destroy
11761	 * this device. They should clean all the things.
11762	 *
11763	 * Note that dev->reg_state stays at NETREG_REGISTERED.
11764	 * This is wanted because this way 8021q and macvlan know
11765	 * the device is just moving and can keep their slaves up.
11766	 */
11767	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11768	rcu_barrier();
11769
11770	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11771
11772	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11773			    new_ifindex);
11774
11775	/*
11776	 *	Flush the unicast and multicast chains
11777	 */
11778	dev_uc_flush(dev);
11779	dev_mc_flush(dev);
11780
11781	/* Send a netdev-removed uevent to the old namespace */
11782	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11783	netdev_adjacent_del_links(dev);
11784
11785	/* Move per-net netdevice notifiers that are following the netdevice */
11786	move_netdevice_notifiers_dev_net(dev, net);
11787
11788	/* Actually switch the network namespace */
11789	dev_net_set(dev, net);
11790	dev->ifindex = new_ifindex;
11791
11792	if (new_name[0]) {
11793		/* Rename the netdev to prepared name */
11794		write_seqlock_bh(&netdev_rename_lock);
11795		strscpy(dev->name, new_name, IFNAMSIZ);
11796		write_sequnlock_bh(&netdev_rename_lock);
11797	}
11798
11799	/* Fixup kobjects */
11800	dev_set_uevent_suppress(&dev->dev, 1);
11801	err = device_rename(&dev->dev, dev->name);
11802	dev_set_uevent_suppress(&dev->dev, 0);
11803	WARN_ON(err);
11804
11805	/* Send a netdev-add uevent to the new namespace */
11806	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11807	netdev_adjacent_add_links(dev);
11808
11809	/* Adapt owner in case owning user namespace of target network
11810	 * namespace is different from the original one.
11811	 */
11812	err = netdev_change_owner(dev, net_old, net);
11813	WARN_ON(err);
11814
11815	/* Add the device back in the hashes */
11816	list_netdevice(dev);
11817
11818	/* Notify protocols, that a new device appeared. */
11819	call_netdevice_notifiers(NETDEV_REGISTER, dev);
11820
11821	/*
11822	 *	Prevent userspace races by waiting until the network
11823	 *	device is fully setup before sending notifications.
11824	 */
11825	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11826
11827	synchronize_net();
11828	err = 0;
11829out:
11830	return err;
11831}
11832EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11833
11834static int dev_cpu_dead(unsigned int oldcpu)
11835{
11836	struct sk_buff **list_skb;
11837	struct sk_buff *skb;
11838	unsigned int cpu;
11839	struct softnet_data *sd, *oldsd, *remsd = NULL;
11840
11841	local_irq_disable();
11842	cpu = smp_processor_id();
11843	sd = &per_cpu(softnet_data, cpu);
11844	oldsd = &per_cpu(softnet_data, oldcpu);
11845
11846	/* Find end of our completion_queue. */
11847	list_skb = &sd->completion_queue;
11848	while (*list_skb)
11849		list_skb = &(*list_skb)->next;
11850	/* Append completion queue from offline CPU. */
11851	*list_skb = oldsd->completion_queue;
11852	oldsd->completion_queue = NULL;
11853
11854	/* Append output queue from offline CPU. */
11855	if (oldsd->output_queue) {
11856		*sd->output_queue_tailp = oldsd->output_queue;
11857		sd->output_queue_tailp = oldsd->output_queue_tailp;
11858		oldsd->output_queue = NULL;
11859		oldsd->output_queue_tailp = &oldsd->output_queue;
11860	}
11861	/* Append NAPI poll list from offline CPU, with one exception :
11862	 * process_backlog() must be called by cpu owning percpu backlog.
11863	 * We properly handle process_queue & input_pkt_queue later.
11864	 */
11865	while (!list_empty(&oldsd->poll_list)) {
11866		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11867							    struct napi_struct,
11868							    poll_list);
11869
11870		list_del_init(&napi->poll_list);
11871		if (napi->poll == process_backlog)
11872			napi->state &= NAPIF_STATE_THREADED;
11873		else
11874			____napi_schedule(sd, napi);
11875	}
11876
11877	raise_softirq_irqoff(NET_TX_SOFTIRQ);
11878	local_irq_enable();
11879
11880	if (!use_backlog_threads()) {
11881#ifdef CONFIG_RPS
11882		remsd = oldsd->rps_ipi_list;
11883		oldsd->rps_ipi_list = NULL;
11884#endif
11885		/* send out pending IPI's on offline CPU */
11886		net_rps_send_ipi(remsd);
11887	}
11888
11889	/* Process offline CPU's input_pkt_queue */
11890	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11891		netif_rx(skb);
11892		rps_input_queue_head_incr(oldsd);
11893	}
11894	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11895		netif_rx(skb);
11896		rps_input_queue_head_incr(oldsd);
11897	}
11898
11899	return 0;
11900}
11901
11902/**
11903 *	netdev_increment_features - increment feature set by one
11904 *	@all: current feature set
11905 *	@one: new feature set
11906 *	@mask: mask feature set
11907 *
11908 *	Computes a new feature set after adding a device with feature set
11909 *	@one to the master device with current feature set @all.  Will not
11910 *	enable anything that is off in @mask. Returns the new feature set.
11911 */
11912netdev_features_t netdev_increment_features(netdev_features_t all,
11913	netdev_features_t one, netdev_features_t mask)
11914{
11915	if (mask & NETIF_F_HW_CSUM)
11916		mask |= NETIF_F_CSUM_MASK;
11917	mask |= NETIF_F_VLAN_CHALLENGED;
11918
11919	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11920	all &= one | ~NETIF_F_ALL_FOR_ALL;
11921
11922	/* If one device supports hw checksumming, set for all. */
11923	if (all & NETIF_F_HW_CSUM)
11924		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11925
11926	return all;
11927}
11928EXPORT_SYMBOL(netdev_increment_features);
11929
11930static struct hlist_head * __net_init netdev_create_hash(void)
11931{
11932	int i;
11933	struct hlist_head *hash;
11934
11935	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11936	if (hash != NULL)
11937		for (i = 0; i < NETDEV_HASHENTRIES; i++)
11938			INIT_HLIST_HEAD(&hash[i]);
11939
11940	return hash;
11941}
11942
11943/* Initialize per network namespace state */
11944static int __net_init netdev_init(struct net *net)
11945{
11946	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11947		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11948
11949	INIT_LIST_HEAD(&net->dev_base_head);
11950
11951	net->dev_name_head = netdev_create_hash();
11952	if (net->dev_name_head == NULL)
11953		goto err_name;
11954
11955	net->dev_index_head = netdev_create_hash();
11956	if (net->dev_index_head == NULL)
11957		goto err_idx;
11958
11959	xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11960
11961	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11962
11963	return 0;
11964
11965err_idx:
11966	kfree(net->dev_name_head);
11967err_name:
11968	return -ENOMEM;
11969}
11970
11971/**
11972 *	netdev_drivername - network driver for the device
11973 *	@dev: network device
11974 *
11975 *	Determine network driver for device.
11976 */
11977const char *netdev_drivername(const struct net_device *dev)
11978{
11979	const struct device_driver *driver;
11980	const struct device *parent;
11981	const char *empty = "";
11982
11983	parent = dev->dev.parent;
11984	if (!parent)
11985		return empty;
11986
11987	driver = parent->driver;
11988	if (driver && driver->name)
11989		return driver->name;
11990	return empty;
11991}
11992
11993static void __netdev_printk(const char *level, const struct net_device *dev,
11994			    struct va_format *vaf)
11995{
11996	if (dev && dev->dev.parent) {
11997		dev_printk_emit(level[1] - '0',
11998				dev->dev.parent,
11999				"%s %s %s%s: %pV",
12000				dev_driver_string(dev->dev.parent),
12001				dev_name(dev->dev.parent),
12002				netdev_name(dev), netdev_reg_state(dev),
12003				vaf);
12004	} else if (dev) {
12005		printk("%s%s%s: %pV",
12006		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
12007	} else {
12008		printk("%s(NULL net_device): %pV", level, vaf);
12009	}
12010}
12011
12012void netdev_printk(const char *level, const struct net_device *dev,
12013		   const char *format, ...)
12014{
12015	struct va_format vaf;
12016	va_list args;
12017
12018	va_start(args, format);
12019
12020	vaf.fmt = format;
12021	vaf.va = &args;
12022
12023	__netdev_printk(level, dev, &vaf);
12024
12025	va_end(args);
12026}
12027EXPORT_SYMBOL(netdev_printk);
12028
12029#define define_netdev_printk_level(func, level)			\
12030void func(const struct net_device *dev, const char *fmt, ...)	\
12031{								\
12032	struct va_format vaf;					\
12033	va_list args;						\
12034								\
12035	va_start(args, fmt);					\
12036								\
12037	vaf.fmt = fmt;						\
12038	vaf.va = &args;						\
12039								\
12040	__netdev_printk(level, dev, &vaf);			\
12041								\
12042	va_end(args);						\
12043}								\
12044EXPORT_SYMBOL(func);
12045
12046define_netdev_printk_level(netdev_emerg, KERN_EMERG);
12047define_netdev_printk_level(netdev_alert, KERN_ALERT);
12048define_netdev_printk_level(netdev_crit, KERN_CRIT);
12049define_netdev_printk_level(netdev_err, KERN_ERR);
12050define_netdev_printk_level(netdev_warn, KERN_WARNING);
12051define_netdev_printk_level(netdev_notice, KERN_NOTICE);
12052define_netdev_printk_level(netdev_info, KERN_INFO);
12053
12054static void __net_exit netdev_exit(struct net *net)
12055{
12056	kfree(net->dev_name_head);
12057	kfree(net->dev_index_head);
12058	xa_destroy(&net->dev_by_index);
12059	if (net != &init_net)
12060		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
12061}
12062
12063static struct pernet_operations __net_initdata netdev_net_ops = {
12064	.init = netdev_init,
12065	.exit = netdev_exit,
12066};
12067
12068static void __net_exit default_device_exit_net(struct net *net)
12069{
12070	struct netdev_name_node *name_node, *tmp;
12071	struct net_device *dev, *aux;
12072	/*
12073	 * Push all migratable network devices back to the
12074	 * initial network namespace
12075	 */
12076	ASSERT_RTNL();
12077	for_each_netdev_safe(net, dev, aux) {
12078		int err;
12079		char fb_name[IFNAMSIZ];
12080
12081		/* Ignore unmoveable devices (i.e. loopback) */
12082		if (dev->netns_local)
12083			continue;
12084
12085		/* Leave virtual devices for the generic cleanup */
12086		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
12087			continue;
12088
12089		/* Push remaining network devices to init_net */
12090		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
12091		if (netdev_name_in_use(&init_net, fb_name))
12092			snprintf(fb_name, IFNAMSIZ, "dev%%d");
12093
12094		netdev_for_each_altname_safe(dev, name_node, tmp)
12095			if (netdev_name_in_use(&init_net, name_node->name))
12096				__netdev_name_node_alt_destroy(name_node);
12097
12098		err = dev_change_net_namespace(dev, &init_net, fb_name);
12099		if (err) {
12100			pr_emerg("%s: failed to move %s to init_net: %d\n",
12101				 __func__, dev->name, err);
12102			BUG();
12103		}
12104	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12105}
12106
12107static void __net_exit default_device_exit_batch(struct list_head *net_list)
12108{
12109	/* At exit all network devices most be removed from a network
12110	 * namespace.  Do this in the reverse order of registration.
12111	 * Do this across as many network namespaces as possible to
12112	 * improve batching efficiency.
12113	 */
12114	struct net_device *dev;
12115	struct net *net;
12116	LIST_HEAD(dev_kill_list);
12117
12118	rtnl_lock();
12119	list_for_each_entry(net, net_list, exit_list) {
12120		default_device_exit_net(net);
12121		cond_resched();
12122	}
12123
 
 
 
 
 
 
12124	list_for_each_entry(net, net_list, exit_list) {
12125		for_each_netdev_reverse(net, dev) {
12126			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
12127				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
12128			else
12129				unregister_netdevice_queue(dev, &dev_kill_list);
12130		}
12131	}
12132	unregister_netdevice_many(&dev_kill_list);
12133	rtnl_unlock();
12134}
12135
12136static struct pernet_operations __net_initdata default_device_ops = {
 
12137	.exit_batch = default_device_exit_batch,
12138};
12139
12140static void __init net_dev_struct_check(void)
12141{
12142	/* TX read-mostly hotpath */
12143	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
12144	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
12145	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
12146	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
12147	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
12148	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
12149	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
12150	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
12151	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
12152	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
12153	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
12154	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
12155	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
12156#ifdef CONFIG_XPS
12157	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
12158#endif
12159#ifdef CONFIG_NETFILTER_EGRESS
12160	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
12161#endif
12162#ifdef CONFIG_NET_XGRESS
12163	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
12164#endif
12165	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
12166
12167	/* TXRX read-mostly hotpath */
12168	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
12169	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
12170	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
12171	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
12172	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
12173	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
12174	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
12175
12176	/* RX read-mostly hotpath */
12177	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
12178	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
12179	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
12180	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
12181	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
12182	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
12183	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
12184	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
12185	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
12186#ifdef CONFIG_NETPOLL
12187	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
12188#endif
12189#ifdef CONFIG_NET_XGRESS
12190	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
12191#endif
12192	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
12193}
12194
12195/*
12196 *	Initialize the DEV module. At boot time this walks the device list and
12197 *	unhooks any devices that fail to initialise (normally hardware not
12198 *	present) and leaves us with a valid list of present and active devices.
12199 *
12200 */
12201
12202/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
12203#define SYSTEM_PERCPU_PAGE_POOL_SIZE	((1 << 20) / PAGE_SIZE)
12204
12205static int net_page_pool_create(int cpuid)
12206{
12207#if IS_ENABLED(CONFIG_PAGE_POOL)
12208	struct page_pool_params page_pool_params = {
12209		.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
12210		.flags = PP_FLAG_SYSTEM_POOL,
12211		.nid = cpu_to_mem(cpuid),
12212	};
12213	struct page_pool *pp_ptr;
12214
12215	pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
12216	if (IS_ERR(pp_ptr))
12217		return -ENOMEM;
12218
12219	per_cpu(system_page_pool, cpuid) = pp_ptr;
12220#endif
12221	return 0;
12222}
12223
12224static int backlog_napi_should_run(unsigned int cpu)
12225{
12226	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
12227	struct napi_struct *napi = &sd->backlog;
12228
12229	return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
12230}
12231
12232static void run_backlog_napi(unsigned int cpu)
12233{
12234	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
12235
12236	napi_threaded_poll_loop(&sd->backlog);
12237}
12238
12239static void backlog_napi_setup(unsigned int cpu)
12240{
12241	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
12242	struct napi_struct *napi = &sd->backlog;
12243
12244	napi->thread = this_cpu_read(backlog_napi);
12245	set_bit(NAPI_STATE_THREADED, &napi->state);
12246}
12247
12248static struct smp_hotplug_thread backlog_threads = {
12249	.store			= &backlog_napi,
12250	.thread_should_run	= backlog_napi_should_run,
12251	.thread_fn		= run_backlog_napi,
12252	.thread_comm		= "backlog_napi/%u",
12253	.setup			= backlog_napi_setup,
12254};
12255
12256/*
12257 *       This is called single threaded during boot, so no need
12258 *       to take the rtnl semaphore.
12259 */
12260static int __init net_dev_init(void)
12261{
12262	int i, rc = -ENOMEM;
12263
12264	BUG_ON(!dev_boot_phase);
12265
12266	net_dev_struct_check();
12267
12268	if (dev_proc_init())
12269		goto out;
12270
12271	if (netdev_kobject_init())
12272		goto out;
12273
 
12274	for (i = 0; i < PTYPE_HASH_SIZE; i++)
12275		INIT_LIST_HEAD(&ptype_base[i]);
12276
 
 
12277	if (register_pernet_subsys(&netdev_net_ops))
12278		goto out;
12279
12280	/*
12281	 *	Initialise the packet receive queues.
12282	 */
12283
12284	for_each_possible_cpu(i) {
12285		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
12286		struct softnet_data *sd = &per_cpu(softnet_data, i);
12287
12288		INIT_WORK(flush, flush_backlog);
12289
12290		skb_queue_head_init(&sd->input_pkt_queue);
12291		skb_queue_head_init(&sd->process_queue);
12292#ifdef CONFIG_XFRM_OFFLOAD
12293		skb_queue_head_init(&sd->xfrm_backlog);
12294#endif
12295		INIT_LIST_HEAD(&sd->poll_list);
12296		sd->output_queue_tailp = &sd->output_queue;
12297#ifdef CONFIG_RPS
12298		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 
12299		sd->cpu = i;
12300#endif
12301		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
12302		spin_lock_init(&sd->defer_lock);
12303
12304		init_gro_hash(&sd->backlog);
12305		sd->backlog.poll = process_backlog;
12306		sd->backlog.weight = weight_p;
12307		INIT_LIST_HEAD(&sd->backlog.poll_list);
12308
12309		if (net_page_pool_create(i))
12310			goto out;
12311	}
12312	if (use_backlog_threads())
12313		smpboot_register_percpu_thread(&backlog_threads);
12314
12315	dev_boot_phase = 0;
12316
12317	/* The loopback device is special if any other network devices
12318	 * is present in a network namespace the loopback device must
12319	 * be present. Since we now dynamically allocate and free the
12320	 * loopback device ensure this invariant is maintained by
12321	 * keeping the loopback device as the first device on the
12322	 * list of network devices.  Ensuring the loopback devices
12323	 * is the first device that appears and the last network device
12324	 * that disappears.
12325	 */
12326	if (register_pernet_device(&loopback_net_ops))
12327		goto out;
12328
12329	if (register_pernet_device(&default_device_ops))
12330		goto out;
12331
12332	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
12333	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
12334
12335	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
12336				       NULL, dev_cpu_dead);
12337	WARN_ON(rc < 0);
 
12338	rc = 0;
12339
12340	/* avoid static key IPIs to isolated CPUs */
12341	if (housekeeping_enabled(HK_TYPE_MISC))
12342		net_enable_timestamp();
12343out:
12344	if (rc < 0) {
12345		for_each_possible_cpu(i) {
12346			struct page_pool *pp_ptr;
12347
12348			pp_ptr = per_cpu(system_page_pool, i);
12349			if (!pp_ptr)
12350				continue;
12351
12352			page_pool_destroy(pp_ptr);
12353			per_cpu(system_page_pool, i) = NULL;
12354		}
12355	}
12356
12357	return rc;
12358}
12359
12360subsys_initcall(net_dev_init);
v4.10.11
 
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
 
 
 
  84#include <linux/mutex.h>
 
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
 
  97#include <linux/bpf.h>
 
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <net/busy_poll.h>
 101#include <linux/rtnetlink.h>
 102#include <linux/stat.h>
 
 103#include <net/dst.h>
 104#include <net/dst_metadata.h>
 
 105#include <net/pkt_sched.h>
 
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/iw_handler.h>
 115#include <asm/current.h>
 116#include <linux/audit.h>
 117#include <linux/dmaengine.h>
 118#include <linux/err.h>
 119#include <linux/ctype.h>
 120#include <linux/if_arp.h>
 121#include <linux/if_vlan.h>
 122#include <linux/ip.h>
 123#include <net/ip.h>
 124#include <net/mpls.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/static_key.h>
 136#include <linux/hashtable.h>
 137#include <linux/vmalloc.h>
 138#include <linux/if_macvlan.h>
 139#include <linux/errqueue.h>
 140#include <linux/hrtimer.h>
 141#include <linux/netfilter_ingress.h>
 142#include <linux/crash_dump.h>
 
 
 
 
 
 
 
 
 
 
 
 
 
 143
 
 
 144#include "net-sysfs.h"
 145
 146/* Instead of increasing this, you should create a hash table. */
 147#define MAX_GRO_SKBS 8
 148
 149/* This should be increased if a protocol with a bigger head is added. */
 150#define GRO_MAX_HEAD (MAX_HEADER + 128)
 151
 152static DEFINE_SPINLOCK(ptype_lock);	/* held around ptype list add/remove in dev_add_pack()/__dev_remove_pack() */
 153static DEFINE_SPINLOCK(offload_lock);	/* held around offload_base updates in dev_add_offload()/__dev_remove_offload() */
 154struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;	/* handlers with no bound device, bucketed by ntohs(type) & PTYPE_HASH_MASK (see ptype_head()) */
 155struct list_head ptype_all __read_mostly;	/* Taps: ETH_P_ALL handlers with no bound device */
 156static struct list_head offload_base __read_mostly;	/* packet_offload entries; dev_add_offload() inserts in ascending priority-value order */
 157
 158static int netif_rx_internal(struct sk_buff *skb);
 159static int call_netdevice_notifiers_info(unsigned long val,
 160					 struct net_device *dev,
 161					 struct netdev_notifier_info *info);
 162
 163/*
 164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 165 * semaphore.
 166 *
 167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 168 *
 169 * Writers must hold the rtnl semaphore while they loop through the
 170 * dev_base_head list, and hold dev_base_lock for writing when they do the
 171 * actual updates.  This allows pure readers to access the list even
 172 * while a writer is preparing to update it.
 173 *
 174 * To put it another way, dev_base_lock is held for writing only to
 175 * protect against pure readers; the rtnl semaphore provides the
 176 * protection against other writers.
 177 *
 178 * See, for example usages, register_netdevice() and
 179 * unregister_netdevice(), which must be called with the rtnl
 180 * semaphore held.
 181 */
 182DEFINE_RWLOCK(dev_base_lock);
 183EXPORT_SYMBOL(dev_base_lock);
 184
 185/* protects napi_hash addition/deletion and napi_gen_id */
 186static DEFINE_SPINLOCK(napi_hash_lock);
 187
 188static unsigned int napi_gen_id = NR_CPUS;
 189static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 190
 191static seqcount_t devnet_rename_seq;	/* netdev_get_name() samples this and redoes its name copy if it changed meanwhile */
 192
 193static inline void dev_base_seq_inc(struct net *net)
 194{
 195	while (++net->dev_base_seq == 0);
 
 
 196}
 197
 198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 199{
 200	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 201
 202	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208}
 209
 210static inline void rps_lock(struct softnet_data *sd)
 
 
 
 
 
 
 
 
 
 
 
 211{
 212#ifdef CONFIG_RPS
 213	spin_lock(&sd->input_pkt_queue.lock);
 
 
 
 
 
 
 
 
 214#endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 215}
 216
 217static inline void rps_unlock(struct softnet_data *sd)
 218{
 219#ifdef CONFIG_RPS
 220	spin_unlock(&sd->input_pkt_queue.lock);
 221#endif
 
 
 
 222}
 223
 224/* Device list insertion */
 225static void list_netdevice(struct net_device *dev)
 226{
 
 227	struct net *net = dev_net(dev);
 228
 229	ASSERT_RTNL();
 230
 231	write_lock_bh(&dev_base_lock);
 232	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234	hlist_add_head_rcu(&dev->index_hlist,
 235			   dev_index_hash(net, dev->ifindex));
 236	write_unlock_bh(&dev_base_lock);
 
 
 
 
 
 237
 238	dev_base_seq_inc(net);
 239}
 240
 241/* Device list removal
 242 * caller must respect a RCU grace period before freeing/reusing dev
 243 */
 244static void unlist_netdevice(struct net_device *dev)
 245{
 
 
 
 246	ASSERT_RTNL();
 247
 
 
 
 
 
 248	/* Unlink dev from the device chain */
 249	write_lock_bh(&dev_base_lock);
 250	list_del_rcu(&dev->dev_list);
 251	hlist_del_rcu(&dev->name_hlist);
 252	hlist_del_rcu(&dev->index_hlist);
 253	write_unlock_bh(&dev_base_lock);
 254
 255	dev_base_seq_inc(dev_net(dev));
 256}
 257
 258/*
 259 *	Our notifier list
 260 */
 261
 262static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264/*
 265 *	Device drivers call our routines to queue packets here. We empty the
 266 *	queue in the local softnet handler.
 267 */
 268
 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 
 270EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 
 
 
 
 
 
 272#ifdef CONFIG_LOCKDEP
 273/*
 274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275 * according to dev->type
 276 */
 277static const unsigned short netdev_lock_type[] =
 278	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 291	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 292	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 293
 294static const char *const netdev_lock_name[] =
 295	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 308	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 309	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 310
 311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315{
 316	int i;
 317
 318	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319		if (netdev_lock_type[i] == dev_type)
 320			return i;
 321	/* the last key is used by default */
 322	return ARRAY_SIZE(netdev_lock_type) - 1;
 323}
 324
 325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326						 unsigned short dev_type)
 327{
 328	int i;
 329
 330	i = netdev_lock_pos(dev_type);
 331	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332				   netdev_lock_name[i]);
 333}
 334
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev->type);
 340	lockdep_set_class_and_name(&dev->addr_list_lock,
 341				   &netdev_addr_lock_key[i],
 342				   netdev_lock_name[i]);
 343}
 344#else
 345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 346						 unsigned short dev_type)
 347{
 348}
 
 349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 350{
 351}
 352#endif
 353
 354/*******************************************************************************
 
 
 
 
 355
 356		Protocol management and registration routines
 357
 358*******************************************************************************/
 359
 360/*
 361 *	Add a protocol ID to the list. Now that the input handler is
 362 *	smarter we can dispense with all the messy stuff that used to be
 363 *	here.
 364 *
 365 *	BEWARE!!! Protocol handlers, mangling input packets,
 366 *	MUST BE last in hash buckets and checking protocol handlers
 367 *	MUST start from promiscuous ptype_all chain in net_bh.
 368 *	It is true now, do not change it.
 369 *	Explanation follows: if protocol handler, mangling packet, will
 370 *	be the first on list, it is not able to sense, that packet
 371 *	is cloned and should be copied-on-write, so that it will
 372 *	change it and subsequent readers will get broken packet.
 373 *							--ANK (980803)
 374 */
 375
 376static inline struct list_head *ptype_head(const struct packet_type *pt)
 377{
 378	if (pt->type == htons(ETH_P_ALL))
 379		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 380	else
 381		return pt->dev ? &pt->dev->ptype_specific :
 382				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383}
 384
 385/**
 386 *	dev_add_pack - add packet handler
 387 *	@pt: packet type declaration
 388 *
 389 *	Add a protocol handler to the networking stack. The passed &packet_type
 390 *	is linked into kernel lists and may not be freed until it has been
 391 *	removed from the kernel lists.
 392 *
 393 *	This call does not sleep therefore it can not
 394 *	guarantee all CPU's that are in middle of receiving packets
 395 *	will see the new packet type (until the next received packet).
 396 */
 397
 398void dev_add_pack(struct packet_type *pt)
 399{
 400	struct list_head *head = ptype_head(pt);
 401
 402	spin_lock(&ptype_lock);
 403	list_add_rcu(&pt->list, head);
 404	spin_unlock(&ptype_lock);
 405}
 406EXPORT_SYMBOL(dev_add_pack);
 407
 408/**
 409 *	__dev_remove_pack	 - remove packet handler
 410 *	@pt: packet type declaration
 411 *
 412 *	Remove a protocol handler that was previously added to the kernel
 413 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414 *	from the kernel lists, but may not be freed or reused as soon as this
 415 *	function returns (see below).
 416 *
 417 *      The packet type might still be in use by receivers
 418 *	and must not be freed until after all the CPU's have gone
 419 *	through a quiescent state.
 420 */
 421void __dev_remove_pack(struct packet_type *pt)
 422{
 423	struct list_head *head = ptype_head(pt);
 424	struct packet_type *pt1;
 425
 426	spin_lock(&ptype_lock);
 427
 428	list_for_each_entry(pt1, head, list) {
 429		if (pt == pt1) {
 430			list_del_rcu(&pt->list);
 431			goto out;
 432		}
 433	}
 434
 435	pr_warn("dev_remove_pack: %p not found\n", pt);
 436out:
 437	spin_unlock(&ptype_lock);
 438}
 439EXPORT_SYMBOL(__dev_remove_pack);
 440
 441/**
 442 *	dev_remove_pack	 - remove packet handler
 443 *	@pt: packet type declaration
 444 *
 445 *	Remove a protocol handler that was previously added to the kernel
 446 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447 *	from the kernel lists and can be freed or reused once this function
 448 *	returns.
 449 *
 450 *	This call sleeps to guarantee that no CPU is looking at the packet
 451 *	type after return.
 452 */
 453void dev_remove_pack(struct packet_type *pt)
 454{
 455	__dev_remove_pack(pt);
 456
 457	synchronize_net();
 458}
 459EXPORT_SYMBOL(dev_remove_pack);
 460
 461
 462/**
 463 *	dev_add_offload - register offload handlers
 464 *	@po: protocol offload declaration
 465 *
 466 *	Add protocol offload handlers to the networking stack. The passed
 467 *	&proto_offload is linked into kernel lists and may not be freed until
 468 *	it has been removed from the kernel lists.
 469 *
 470 *	This call does not sleep therefore it can not
 471 *	guarantee all CPU's that are in middle of receiving packets
 472 *	will see the new offload handlers (until the next received packet).
 473 */
 474void dev_add_offload(struct packet_offload *po)
 475{
 476	struct packet_offload *elem;
 477
 478	spin_lock(&offload_lock);
 479	list_for_each_entry(elem, &offload_base, list) {
 480		if (po->priority < elem->priority)
 481			break;
 482	}
 483	list_add_rcu(&po->list, elem->list.prev);
 484	spin_unlock(&offload_lock);
 485}
 486EXPORT_SYMBOL(dev_add_offload);
 487
 488/**
 489 *	__dev_remove_offload	 - remove offload handler
 490 *	@po: packet offload declaration
 491 *
 492 *	Remove a protocol offload handler that was previously added to the
 493 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 494 *	is removed from the kernel lists and can be freed or reused once this
 495 *	function returns.
 496 *
 497 *      The packet type might still be in use by receivers
 498 *	and must not be freed until after all the CPU's have gone
 499 *	through a quiescent state.
 500 */
 501static void __dev_remove_offload(struct packet_offload *po)
 502{
 503	struct list_head *head = &offload_base;
 504	struct packet_offload *po1;
 505
 506	spin_lock(&offload_lock);
 507
 508	list_for_each_entry(po1, head, list) {
 509		if (po == po1) {
 510			list_del_rcu(&po->list);
 511			goto out;
 512		}
 513	}
 514
 515	pr_warn("dev_remove_offload: %p not found\n", po);
 516out:
 517	spin_unlock(&offload_lock);
 518}
 519
 520/**
 521 *	dev_remove_offload	 - remove packet offload handler
 522 *	@po: packet offload declaration
 523 *
 524 *	Remove a packet offload handler that was previously added to the kernel
 525 *	offload handlers by dev_add_offload(). The passed &offload_type is
 526 *	removed from the kernel lists and can be freed or reused once this
 527 *	function returns.
 528 *
 529 *	This call sleeps to guarantee that no CPU is looking at the packet
 530 *	type after return.
 531 */
 532void dev_remove_offload(struct packet_offload *po)
 533{
 534	__dev_remove_offload(po);
 535
 536	synchronize_net();
 537}
 538EXPORT_SYMBOL(dev_remove_offload);
 539
 540/******************************************************************************
 541
 542		      Device Boot-time Settings Routines
 543
 544*******************************************************************************/
 545
 546/* Boot time configuration table */
 547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 548
 549/**
 550 *	netdev_boot_setup_add	- add new setup entry
 551 *	@name: name of the device
 552 *	@map: configured settings for the device
 553 *
 554 *	Adds new setup entry to the dev_boot_setup list.  The function
 555 *	returns 0 on error and 1 on success.  This is a generic routine to
 556 *	all netdevices.
 557 */
 558static int netdev_boot_setup_add(char *name, struct ifmap *map)
 559{
 560	struct netdev_boot_setup *s;
 561	int i;
 562
 563	s = dev_boot_setup;
 564	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 566			memset(s[i].name, 0, sizeof(s[i].name));
 567			strlcpy(s[i].name, name, IFNAMSIZ);
 568			memcpy(&s[i].map, map, sizeof(s[i].map));
 569			break;
 570		}
 571	}
 572
 573	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 574}
 575
 576/**
 577 *	netdev_boot_setup_check	- check boot time settings
 578 *	@dev: the netdevice
 579 *
 580 * 	Check boot time settings for the device.
 581 *	The found settings are set for the device to be used
 582 *	later in the device probing.
 583 *	Returns 0 if no settings found, 1 if they are.
 584 */
 585int netdev_boot_setup_check(struct net_device *dev)
 586{
 587	struct netdev_boot_setup *s = dev_boot_setup;
 588	int i;
 589
 590	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 591		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 592		    !strcmp(dev->name, s[i].name)) {
 593			dev->irq 	= s[i].map.irq;
 594			dev->base_addr 	= s[i].map.base_addr;
 595			dev->mem_start 	= s[i].map.mem_start;
 596			dev->mem_end 	= s[i].map.mem_end;
 597			return 1;
 598		}
 599	}
 600	return 0;
 601}
 602EXPORT_SYMBOL(netdev_boot_setup_check);
 603
 604
 605/**
 606 *	netdev_boot_base	- get address from boot time settings
 607 *	@prefix: prefix for network device
 608 *	@unit: id for network device
 609 *
 610 * 	Check boot time settings for the base address of device.
 611 *	The found settings are set for the device to be used
 612 *	later in the device probing.
 613 *	Returns 0 if no settings found.
 614 */
 615unsigned long netdev_boot_base(const char *prefix, int unit)
 616{
 617	const struct netdev_boot_setup *s = dev_boot_setup;
 618	char name[IFNAMSIZ];
 619	int i;
 620
 621	sprintf(name, "%s%d", prefix, unit);
 622
 623	/*
 624	 * If device already registered then return base of 1
 625	 * to indicate not to probe for this interface
 626	 */
 627	if (__dev_get_by_name(&init_net, name))
 628		return 1;
 629
 630	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 631		if (!strcmp(name, s[i].name))
 632			return s[i].map.base_addr;
 633	return 0;
 634}
 635
 636/*
 637 * Saves at boot time configured settings for any netdevice.
 638 */
 639int __init netdev_boot_setup(char *str)
 640{
 641	int ints[5];
 642	struct ifmap map;
 643
 644	str = get_options(str, ARRAY_SIZE(ints), ints);
 645	if (!str || !*str)
 646		return 0;
 647
 648	/* Save settings */
 649	memset(&map, 0, sizeof(map));
 650	if (ints[0] > 0)
 651		map.irq = ints[1];
 652	if (ints[0] > 1)
 653		map.base_addr = ints[2];
 654	if (ints[0] > 2)
 655		map.mem_start = ints[3];
 656	if (ints[0] > 3)
 657		map.mem_end = ints[4];
 658
 659	/* Add new entry to the list */
 660	return netdev_boot_setup_add(str, &map);
 661}
 662
 663__setup("netdev=", netdev_boot_setup);
 664
 665/*******************************************************************************
 666
 667			    Device Interface Subroutines
 668
 669*******************************************************************************/
 670
 671/**
 672 *	dev_get_iflink	- get 'iflink' value of an interface
 673 *	@dev: targeted interface
 674 *
 675 *	Indicates the ifindex the interface is linked to.
 676 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 677 */
 678
 679int dev_get_iflink(const struct net_device *dev)
 680{
 681	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 682		return dev->netdev_ops->ndo_get_iflink(dev);
 683
 684	return dev->ifindex;
 685}
 686EXPORT_SYMBOL(dev_get_iflink);
 687
 688/**
 689 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 690 *	@dev: targeted interface
 691 *	@skb: The packet.
 692 *
 693 *	For better visibility of tunnel traffic OVS needs to retrieve
 694 *	egress tunnel information for a packet. Following API allows
 695 *	user to get this info.
 696 */
 697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 698{
 699	struct ip_tunnel_info *info;
 700
 701	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 702		return -EINVAL;
 703
 704	info = skb_tunnel_info_unclone(skb);
 705	if (!info)
 706		return -ENOMEM;
 707	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 708		return -EINVAL;
 709
 710	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 711}
 712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 714/**
 715 *	__dev_get_by_name	- find a device by its name
 716 *	@net: the applicable net namespace
 717 *	@name: name to find
 718 *
 719 *	Find an interface by name. Must be called under RTNL semaphore
 720 *	or @dev_base_lock. If the name is found a pointer to the device
 721 *	is returned. If the name is not found then %NULL is returned. The
 722 *	reference counters are not incremented so the caller must be
 723 *	careful with locks.
 724 */
 725
 726struct net_device *__dev_get_by_name(struct net *net, const char *name)
 727{
 728	struct net_device *dev;
 729	struct hlist_head *head = dev_name_hash(net, name);
 730
 731	hlist_for_each_entry(dev, head, name_hlist)
 732		if (!strncmp(dev->name, name, IFNAMSIZ))
 733			return dev;
 734
 735	return NULL;
 
 736}
 737EXPORT_SYMBOL(__dev_get_by_name);
 738
 739/**
 740 *	dev_get_by_name_rcu	- find a device by its name
 741 *	@net: the applicable net namespace
 742 *	@name: name to find
 743 *
 744 *	Find an interface by name.
 745 *	If the name is found a pointer to the device is returned.
 746 * 	If the name is not found then %NULL is returned.
 747 *	The reference counters are not incremented so the caller must be
 748 *	careful with locks. The caller must hold RCU lock.
 749 */
 750
 751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 752{
 
 
 
 
 
 
 
 
 
 
 753	struct net_device *dev;
 754	struct hlist_head *head = dev_name_hash(net, name);
 755
 756	hlist_for_each_entry_rcu(dev, head, name_hlist)
 757		if (!strncmp(dev->name, name, IFNAMSIZ))
 758			return dev;
 759
 760	return NULL;
 761}
 762EXPORT_SYMBOL(dev_get_by_name_rcu);
 763
 764/**
 765 *	dev_get_by_name		- find a device by its name
 766 *	@net: the applicable net namespace
 767 *	@name: name to find
 
 
 768 *
 769 *	Find an interface by name. This can be called from any
 770 *	context and does its own locking. The returned handle has
 771 *	the usage count incremented and the caller must use dev_put() to
 772 *	release it when it is no longer needed. %NULL is returned if no
 773 *	matching device is found.
 774 */
 775
 776struct net_device *dev_get_by_name(struct net *net, const char *name)
 777{
 778	struct net_device *dev;
 779
 780	rcu_read_lock();
 781	dev = dev_get_by_name_rcu(net, name);
 782	if (dev)
 783		dev_hold(dev);
 784	rcu_read_unlock();
 785	return dev;
 786}
 787EXPORT_SYMBOL(dev_get_by_name);
 788
 789/**
 790 *	__dev_get_by_index - find a device by its ifindex
 791 *	@net: the applicable net namespace
 792 *	@ifindex: index of device
 793 *
 794 *	Search for an interface by index. Returns %NULL if the device
 795 *	is not found or a pointer to the device. The device has not
 796 *	had its reference counter increased so the caller must be careful
 797 *	about locking. The caller must hold either the RTNL semaphore
 798 *	or @dev_base_lock.
 799 */
 800
 801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 802{
 803	struct net_device *dev;
 804	struct hlist_head *head = dev_index_hash(net, ifindex);
 805
 806	hlist_for_each_entry(dev, head, index_hlist)
 807		if (dev->ifindex == ifindex)
 808			return dev;
 809
 810	return NULL;
 811}
 812EXPORT_SYMBOL(__dev_get_by_index);
 813
 814/**
 815 *	dev_get_by_index_rcu - find a device by its ifindex
 816 *	@net: the applicable net namespace
 817 *	@ifindex: index of device
 818 *
 819 *	Search for an interface by index. Returns %NULL if the device
 820 *	is not found or a pointer to the device. The device has not
 821 *	had its reference counter increased so the caller must be careful
 822 *	about locking. The caller must hold RCU lock.
 823 */
 824
 825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 826{
 827	struct net_device *dev;
 828	struct hlist_head *head = dev_index_hash(net, ifindex);
 829
 830	hlist_for_each_entry_rcu(dev, head, index_hlist)
 831		if (dev->ifindex == ifindex)
 832			return dev;
 833
 834	return NULL;
 835}
 836EXPORT_SYMBOL(dev_get_by_index_rcu);
 837
 
 
 
 
 
 
 
 
 
 
 
 
 838
 839/**
 840 *	dev_get_by_index - find a device by its ifindex
 841 *	@net: the applicable net namespace
 842 *	@ifindex: index of device
 
 
 843 *
 844 *	Search for an interface by index. Returns NULL if the device
 845 *	is not found or a pointer to the device. The device returned has
 846 *	had a reference added and the pointer is safe until the user calls
 847 *	dev_put to indicate they have finished with it.
 848 */
 849
 850struct net_device *dev_get_by_index(struct net *net, int ifindex)
 851{
 852	struct net_device *dev;
 853
 854	rcu_read_lock();
 855	dev = dev_get_by_index_rcu(net, ifindex);
 856	if (dev)
 857		dev_hold(dev);
 858	rcu_read_unlock();
 859	return dev;
 860}
 861EXPORT_SYMBOL(dev_get_by_index);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 862
 863/**
 864 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 865 *	@net: network namespace
 866 *	@name: a pointer to the buffer where the name will be stored.
 867 *	@ifindex: the ifindex of the interface to get the name from.
 868 *
 869 *	The use of raw_seqcount_begin() and cond_resched() before
 870 *	retrying is required as we want to give the writers a chance
 871 *	to complete when CONFIG_PREEMPT is not set.
 872 */
 873int netdev_get_name(struct net *net, char *name, int ifindex)
 874{
 875	struct net_device *dev;
 876	unsigned int seq;
 877
 878retry:
 879	seq = raw_seqcount_begin(&devnet_rename_seq);
 880	rcu_read_lock();
 
 881	dev = dev_get_by_index_rcu(net, ifindex);
 882	if (!dev) {
 883		rcu_read_unlock();
 884		return -ENODEV;
 885	}
 886
 887	strcpy(name, dev->name);
 
 
 
 888	rcu_read_unlock();
 889	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 890		cond_resched();
 891		goto retry;
 892	}
 893
 894	return 0;
 
 
 
 895}
 896
 897/**
 898 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 899 *	@net: the applicable net namespace
 900 *	@type: media type of device
 901 *	@ha: hardware address
 902 *
 903 *	Search for an interface by MAC address. Returns NULL if the device
 904 *	is not found or a pointer to the device.
 905 *	The caller must hold RCU or RTNL.
 906 *	The returned device has not had its ref count increased
 907 *	and the caller must therefore be careful about locking
 908 *
 909 */
 910
 911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 912				       const char *ha)
 913{
 914	struct net_device *dev;
 915
 916	for_each_netdev_rcu(net, dev)
 917		if (dev->type == type &&
 918		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 919			return dev;
 920
 921	return NULL;
 922}
 923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 924
 925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 
 
 
 
 
 
 
 
 
 
 
 
 
 926{
 927	struct net_device *dev;
 928
 929	ASSERT_RTNL();
 930	for_each_netdev(net, dev)
 931		if (dev->type == type)
 932			return dev;
 933
 934	return NULL;
 935}
 936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 937
 938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 939{
 940	struct net_device *dev, *ret = NULL;
 941
 942	rcu_read_lock();
 943	for_each_netdev_rcu(net, dev)
 944		if (dev->type == type) {
 945			dev_hold(dev);
 946			ret = dev;
 947			break;
 948		}
 949	rcu_read_unlock();
 950	return ret;
 951}
 952EXPORT_SYMBOL(dev_getfirstbyhwtype);
 953
 954/**
 955 *	__dev_get_by_flags - find any device with given flags
 956 *	@net: the applicable net namespace
 957 *	@if_flags: IFF_* values
 958 *	@mask: bitmask of bits in if_flags to check
 959 *
 960 *	Search for any interface with the given flags. Returns NULL if a device
 961 *	is not found or a pointer to the device. Must be called inside
 962 *	rtnl_lock(), and result refcount is unchanged.
 963 */
 964
 965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 966				      unsigned short mask)
 967{
 968	struct net_device *dev, *ret;
 969
 970	ASSERT_RTNL();
 971
 972	ret = NULL;
 973	for_each_netdev(net, dev) {
 974		if (((dev->flags ^ if_flags) & mask) == 0) {
 975			ret = dev;
 976			break;
 977		}
 978	}
 979	return ret;
 980}
 981EXPORT_SYMBOL(__dev_get_by_flags);
 982
 983/**
 984 *	dev_valid_name - check if name is okay for network device
 985 *	@name: name string
 986 *
 987 *	Network device names need to be valid file names to
 988 *	to allow sysfs to work.  We also disallow any kind of
 989 *	whitespace.
 990 */
 991bool dev_valid_name(const char *name)
 992{
 993	if (*name == '\0')
 994		return false;
 995	if (strlen(name) >= IFNAMSIZ)
 996		return false;
 997	if (!strcmp(name, ".") || !strcmp(name, ".."))
 998		return false;
 999
1000	while (*name) {
1001		if (*name == '/' || *name == ':' || isspace(*name))
1002			return false;
1003		name++;
1004	}
1005	return true;
1006}
1007EXPORT_SYMBOL(dev_valid_name);
1008
1009/**
1010 *	__dev_alloc_name - allocate a name for a device
1011 *	@net: network namespace to allocate the device name in
1012 *	@name: name format string
1013 *	@buf:  scratch buffer and result name string
1014 *
1015 *	Passed a format string - eg "lt%d" it will try and find a suitable
1016 *	id. It scans list of devices to build up a free map, then chooses
1017 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018 *	while allocating the name and adding the device in order to avoid
1019 *	duplicates.
1020 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 *	Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025{
1026	int i = 0;
1027	const char *p;
1028	const int max_netdevices = 8*PAGE_SIZE;
1029	unsigned long *inuse;
1030	struct net_device *d;
 
 
 
 
 
 
 
 
1031
1032	p = strnchr(name, IFNAMSIZ-1, '%');
1033	if (p) {
1034		/*
1035		 * Verify the string as this thing may have come from
1036		 * the user.  There must be either one "%d" and no other "%"
1037		 * characters.
1038		 */
1039		if (p[1] != 'd' || strchr(p + 2, '%'))
1040			return -EINVAL;
1041
1042		/* Use one page as a bit array of possible slots */
1043		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044		if (!inuse)
1045			return -ENOMEM;
1046
1047		for_each_netdev(net, d) {
1048			if (!sscanf(d->name, name, &i))
1049				continue;
1050			if (i < 0 || i >= max_netdevices)
1051				continue;
1052
1053			/*  avoid cases where sscanf is not exact inverse of printf */
1054			snprintf(buf, IFNAMSIZ, name, i);
1055			if (!strncmp(buf, d->name, IFNAMSIZ))
1056				set_bit(i, inuse);
1057		}
 
 
 
 
1058
1059		i = find_first_zero_bit(inuse, max_netdevices);
1060		free_page((unsigned long) inuse);
 
 
1061	}
1062
1063	if (buf != name)
1064		snprintf(buf, IFNAMSIZ, name, i);
1065	if (!__dev_get_by_name(net, buf))
1066		return i;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1067
1068	/* It is possible to run out of possible slots
1069	 * when the name is long and there isn't enough space left
1070	 * for the digits, or if all bits are used.
1071	 */
1072	return -ENFILE;
1073}
1074
1075/**
1076 *	dev_alloc_name - allocate a name for a device
1077 *	@dev: device
1078 *	@name: name format string
1079 *
1080 *	Passed a format string - eg "lt%d" it will try and find a suitable
1081 *	id. It scans list of devices to build up a free map, then chooses
1082 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083 *	while allocating the name and adding the device in order to avoid
1084 *	duplicates.
1085 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 *	Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091	char buf[IFNAMSIZ];
1092	struct net *net;
1093	int ret;
1094
1095	BUG_ON(!dev_net(dev));
1096	net = dev_net(dev);
1097	ret = __dev_alloc_name(net, name, buf);
1098	if (ret >= 0)
1099		strlcpy(dev->name, buf, IFNAMSIZ);
1100	return ret;
1101}
1102EXPORT_SYMBOL(dev_alloc_name);
1103
1104static int dev_alloc_name_ns(struct net *net,
1105			     struct net_device *dev,
1106			     const char *name)
1107{
1108	char buf[IFNAMSIZ];
1109	int ret;
1110
1111	ret = __dev_alloc_name(net, name, buf);
1112	if (ret >= 0)
1113		strlcpy(dev->name, buf, IFNAMSIZ);
1114	return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118			      struct net_device *dev,
1119			      const char *name)
1120{
1121	BUG_ON(!net);
1122
1123	if (!dev_valid_name(name))
1124		return -EINVAL;
1125
1126	if (strchr(name, '%'))
1127		return dev_alloc_name_ns(net, dev, name);
1128	else if (__dev_get_by_name(net, name))
1129		return -EEXIST;
1130	else if (dev->name != name)
1131		strlcpy(dev->name, name, IFNAMSIZ);
1132
1133	return 0;
1134}
1135
1136/**
1137 *	dev_change_name - change name of a device
1138 *	@dev: device
1139 *	@newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141 *	Change name of a device, can pass format strings "eth%d".
1142 *	for wildcarding.
1143 */
1144int dev_change_name(struct net_device *dev, const char *newname)
1145{
1146	unsigned char old_assign_type;
1147	char oldname[IFNAMSIZ];
1148	int err = 0;
1149	int ret;
1150	struct net *net;
1151
1152	ASSERT_RTNL();
1153	BUG_ON(!dev_net(dev));
1154
1155	net = dev_net(dev);
1156	if (dev->flags & IFF_UP)
1157		return -EBUSY;
1158
1159	write_seqcount_begin(&devnet_rename_seq);
1160
1161	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162		write_seqcount_end(&devnet_rename_seq);
1163		return 0;
1164	}
1165
1166	memcpy(oldname, dev->name, IFNAMSIZ);
1167
 
1168	err = dev_get_valid_name(net, dev, newname);
 
 
1169	if (err < 0) {
1170		write_seqcount_end(&devnet_rename_seq);
1171		return err;
1172	}
1173
1174	if (oldname[0] && !strchr(oldname, '%'))
1175		netdev_info(dev, "renamed from %s\n", oldname);
 
1176
1177	old_assign_type = dev->name_assign_type;
1178	dev->name_assign_type = NET_NAME_RENAMED;
1179
1180rollback:
1181	ret = device_rename(&dev->dev, dev->name);
1182	if (ret) {
 
1183		memcpy(dev->name, oldname, IFNAMSIZ);
1184		dev->name_assign_type = old_assign_type;
1185		write_seqcount_end(&devnet_rename_seq);
 
1186		return ret;
1187	}
1188
1189	write_seqcount_end(&devnet_rename_seq);
1190
1191	netdev_adjacent_rename_links(dev, oldname);
1192
1193	write_lock_bh(&dev_base_lock);
1194	hlist_del_rcu(&dev->name_hlist);
1195	write_unlock_bh(&dev_base_lock);
1196
1197	synchronize_rcu();
1198
1199	write_lock_bh(&dev_base_lock);
1200	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201	write_unlock_bh(&dev_base_lock);
1202
1203	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204	ret = notifier_to_errno(ret);
1205
1206	if (ret) {
1207		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208		if (err >= 0) {
1209			err = ret;
1210			write_seqcount_begin(&devnet_rename_seq);
 
1211			memcpy(dev->name, oldname, IFNAMSIZ);
 
1212			memcpy(oldname, newname, IFNAMSIZ);
1213			dev->name_assign_type = old_assign_type;
1214			old_assign_type = NET_NAME_RENAMED;
1215			goto rollback;
1216		} else {
1217			pr_err("%s: name change rollback failed: %d\n",
1218			       dev->name, ret);
1219		}
1220	}
1221
1222	return err;
1223}
1224
1225/**
1226 *	dev_set_alias - change ifalias of a device
1227 *	@dev: device
1228 *	@alias: name up to IFALIASZ
1229 *	@len: limit of bytes to copy from info
1230 *
1231 *	Set ifalias for a device,
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
1235	char *new_ifalias;
1236
1237	ASSERT_RTNL();
1238
1239	if (len >= IFALIASZ)
1240		return -EINVAL;
1241
1242	if (!len) {
1243		kfree(dev->ifalias);
1244		dev->ifalias = NULL;
1245		return 0;
 
 
 
1246	}
1247
1248	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249	if (!new_ifalias)
1250		return -ENOMEM;
1251	dev->ifalias = new_ifalias;
 
 
 
1252
1253	strlcpy(dev->ifalias, alias, len+1);
1254	return len;
1255}
 
1256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1257
1258/**
1259 *	netdev_features_change - device changes features
1260 *	@dev: device to cause notification
1261 *
1262 *	Called to indicate a device has changed features.
1263 */
1264void netdev_features_change(struct net_device *dev)
1265{
1266	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267}
1268EXPORT_SYMBOL(netdev_features_change);
1269
1270/**
1271 *	netdev_state_change - device changes state
1272 *	@dev: device to cause notification
1273 *
1274 *	Called to indicate a device has changed state. This function calls
1275 *	the notifier chains for netdev_chain and sends a NEWLINK message
1276 *	to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280	if (dev->flags & IFF_UP) {
1281		struct netdev_notifier_change_info change_info;
 
 
1282
1283		change_info.flags_changed = 0;
1284		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285					      &change_info.info);
1286		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287	}
1288}
1289EXPORT_SYMBOL(netdev_state_change);
1290
1291/**
1292 * 	netdev_notify_peers - notify network peers about existence of @dev
1293 * 	@dev: network device
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301void netdev_notify_peers(struct net_device *dev)
1302{
1303	rtnl_lock();
1304	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305	rtnl_unlock();
1306}
1307EXPORT_SYMBOL(netdev_notify_peers);
1308
1309static int __dev_open(struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1310{
1311	const struct net_device_ops *ops = dev->netdev_ops;
1312	int ret;
1313
1314	ASSERT_RTNL();
 
1315
1316	if (!netif_device_present(dev))
1317		return -ENODEV;
 
 
 
 
 
1318
1319	/* Block netpoll from trying to do any rx path servicing.
1320	 * If we don't do this there is a chance ndo_poll_controller
1321	 * or ndo_poll may be running while we open the device
1322	 */
1323	netpoll_poll_disable(dev);
1324
1325	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326	ret = notifier_to_errno(ret);
1327	if (ret)
1328		return ret;
1329
1330	set_bit(__LINK_STATE_START, &dev->state);
1331
1332	if (ops->ndo_validate_addr)
1333		ret = ops->ndo_validate_addr(dev);
1334
1335	if (!ret && ops->ndo_open)
1336		ret = ops->ndo_open(dev);
1337
1338	netpoll_poll_enable(dev);
1339
1340	if (ret)
1341		clear_bit(__LINK_STATE_START, &dev->state);
1342	else {
1343		dev->flags |= IFF_UP;
1344		dev_set_rx_mode(dev);
1345		dev_activate(dev);
1346		add_device_randomness(dev->dev_addr, dev->addr_len);
1347	}
1348
1349	return ret;
1350}
1351
1352/**
1353 *	dev_open	- prepare an interface for use.
1354 *	@dev:	device to open
 
1355 *
1356 *	Takes a device from down to up state. The device's private open
1357 *	function is invoked and then the multicast lists are loaded. Finally
1358 *	the device is moved into the up state and a %NETDEV_UP message is
1359 *	sent to the netdev notifier chain.
1360 *
1361 *	Calling this function on an active interface is a nop. On a failure
1362 *	a negative errno code is returned.
1363 */
1364int dev_open(struct net_device *dev)
1365{
1366	int ret;
1367
1368	if (dev->flags & IFF_UP)
1369		return 0;
1370
1371	ret = __dev_open(dev);
1372	if (ret < 0)
1373		return ret;
1374
1375	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376	call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378	return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
1381
1382static int __dev_close_many(struct list_head *head)
1383{
1384	struct net_device *dev;
1385
1386	ASSERT_RTNL();
1387	might_sleep();
1388
1389	list_for_each_entry(dev, head, close_list) {
1390		/* Temporarily disable netpoll until the interface is down */
1391		netpoll_poll_disable(dev);
1392
1393		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395		clear_bit(__LINK_STATE_START, &dev->state);
1396
1397		/* Synchronize to scheduled poll. We cannot touch poll list, it
1398		 * can be even on different cpu. So just clear netif_running().
1399		 *
1400		 * dev->stop() will invoke napi_disable() on all of it's
1401		 * napi_struct instances on this device.
1402		 */
1403		smp_mb__after_atomic(); /* Commit netif_running(). */
1404	}
1405
1406	dev_deactivate_many(head);
1407
1408	list_for_each_entry(dev, head, close_list) {
1409		const struct net_device_ops *ops = dev->netdev_ops;
1410
1411		/*
1412		 *	Call the device specific close. This cannot fail.
1413		 *	Only if device is UP
1414		 *
1415		 *	We allow it to be called even after a DETACH hot-plug
1416		 *	event.
1417		 */
1418		if (ops->ndo_stop)
1419			ops->ndo_stop(dev);
1420
1421		dev->flags &= ~IFF_UP;
1422		netpoll_poll_enable(dev);
1423	}
1424
1425	return 0;
1426}
1427
1428static int __dev_close(struct net_device *dev)
1429{
1430	int retval;
1431	LIST_HEAD(single);
1432
1433	list_add(&dev->close_list, &single);
1434	retval = __dev_close_many(&single);
1435	list_del(&single);
1436
1437	return retval;
1438}
1439
1440int dev_close_many(struct list_head *head, bool unlink)
1441{
1442	struct net_device *dev, *tmp;
1443
1444	/* Remove the devices that don't need to be closed */
1445	list_for_each_entry_safe(dev, tmp, head, close_list)
1446		if (!(dev->flags & IFF_UP))
1447			list_del_init(&dev->close_list);
1448
1449	__dev_close_many(head);
1450
1451	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454		if (unlink)
1455			list_del_init(&dev->close_list);
1456	}
1457
1458	return 0;
1459}
1460EXPORT_SYMBOL(dev_close_many);
1461
1462/**
1463 *	dev_close - shutdown an interface.
1464 *	@dev: device to shutdown
1465 *
1466 *	This function moves an active device into down state. A
1467 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 *	chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
1473	if (dev->flags & IFF_UP) {
1474		LIST_HEAD(single);
1475
1476		list_add(&dev->close_list, &single);
1477		dev_close_many(&single, true);
1478		list_del(&single);
1479	}
1480	return 0;
1481}
1482EXPORT_SYMBOL(dev_close);
1483
1484
1485/**
1486 *	dev_disable_lro - disable Large Receive Offload on a device
1487 *	@dev: device
1488 *
1489 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490 *	called under RTNL.  This is needed if received packets may be
1491 *	forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
1495	struct net_device *lower_dev;
1496	struct list_head *iter;
1497
1498	dev->wanted_features &= ~NETIF_F_LRO;
1499	netdev_update_features(dev);
1500
1501	if (unlikely(dev->features & NETIF_F_LRO))
1502		netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505		dev_disable_lro(lower_dev);
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1509static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510				   struct net_device *dev)
1511{
1512	struct netdev_notifier_info info;
 
 
1513
1514	netdev_notifier_info_init(&info, dev);
1515	return nb->notifier_call(nb, val, &info);
1516}
1517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1518static int dev_boot_phase = 1;
1519
1520/**
1521 *	register_netdevice_notifier - register a network notifier block
1522 *	@nb: notifier
1523 *
1524 *	Register a notifier to be called when network device events occur.
1525 *	The notifier passed is linked into the kernel structures and must
1526 *	not be reused until it has been unregistered. A negative errno code
1527 *	is returned on a failure.
1528 *
1529 * 	When registered all registration and up events are replayed
1530 *	to the new notifier to allow device to have a race free
1531 *	view of the network device list.
1532 */
1533
1534int register_netdevice_notifier(struct notifier_block *nb)
1535{
1536	struct net_device *dev;
1537	struct net_device *last;
1538	struct net *net;
1539	int err;
1540
 
 
1541	rtnl_lock();
1542	err = raw_notifier_chain_register(&netdev_chain, nb);
1543	if (err)
1544		goto unlock;
1545	if (dev_boot_phase)
1546		goto unlock;
1547	for_each_net(net) {
1548		for_each_netdev(net, dev) {
1549			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550			err = notifier_to_errno(err);
1551			if (err)
1552				goto rollback;
1553
1554			if (!(dev->flags & IFF_UP))
1555				continue;
1556
1557			call_netdevice_notifier(nb, NETDEV_UP, dev);
1558		}
1559	}
1560
1561unlock:
1562	rtnl_unlock();
 
1563	return err;
1564
1565rollback:
1566	last = dev;
1567	for_each_net(net) {
1568		for_each_netdev(net, dev) {
1569			if (dev == last)
1570				goto outroll;
1571
1572			if (dev->flags & IFF_UP) {
1573				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574							dev);
1575				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576			}
1577			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578		}
1579	}
1580
1581outroll:
1582	raw_notifier_chain_unregister(&netdev_chain, nb);
1583	goto unlock;
1584}
1585EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587/**
1588 *	unregister_netdevice_notifier - unregister a network notifier block
1589 *	@nb: notifier
1590 *
1591 *	Unregister a notifier previously registered by
1592 *	register_netdevice_notifier(). The notifier is unlinked into the
1593 *	kernel structures and may then be reused. A negative errno code
1594 *	is returned on a failure.
1595 *
1596 * 	After unregistering unregister and down device events are synthesized
1597 *	for all devices on the device list to the removed notifier to remove
1598 *	the need for special case cleanup code.
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
1603	struct net_device *dev;
1604	struct net *net;
1605	int err;
1606
 
 
1607	rtnl_lock();
1608	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609	if (err)
1610		goto unlock;
1611
1612	for_each_net(net) {
1613		for_each_netdev(net, dev) {
1614			if (dev->flags & IFF_UP) {
1615				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616							dev);
1617				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618			}
1619			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620		}
1621	}
1622unlock:
1623	rtnl_unlock();
 
1624	return err;
1625}
1626EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1628/**
1629 *	call_netdevice_notifiers_info - call all network notifier blocks
1630 *	@val: value passed unmodified to notifier function
1631 *	@dev: net_device pointer passed unmodified to notifier function
1632 *	@info: notifier information data
1633 *
1634 *	Call all network notifier blocks.  Parameters and return value
1635 *	are as for raw_notifier_call_chain().
1636 */
1637
1638static int call_netdevice_notifiers_info(unsigned long val,
1639					 struct net_device *dev,
1640					 struct netdev_notifier_info *info)
1641{
 
 
 
1642	ASSERT_RTNL();
1643	netdev_notifier_info_init(info, dev);
 
 
 
 
 
 
 
1644	return raw_notifier_call_chain(&netdev_chain, val, info);
1645}
1646
1647/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1648 *	call_netdevice_notifiers - call all network notifier blocks
1649 *      @val: value passed unmodified to notifier function
1650 *      @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 *	Call all network notifier blocks.  Parameters and return value
1653 *	are as for raw_notifier_call_chain().
1654 */
1655
1656int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657{
1658	struct netdev_notifier_info info;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1659
1660	return call_netdevice_notifiers_info(val, dev, &info);
 
 
1661}
1662EXPORT_SYMBOL(call_netdevice_notifiers);
1663
#ifdef CONFIG_NET_INGRESS
/* Static key counting the users that asked for ingress processing. */
static struct static_key ingress_needed __read_mostly;

/* Take one count on the ingress static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one count from the ingress static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif /* CONFIG_NET_INGRESS */
1679
#ifdef CONFIG_NET_EGRESS
/* Static key counting the users that asked for egress processing. */
static struct static_key egress_needed __read_mostly;

/* Take one count on the egress static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one count from the egress static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif /* CONFIG_NET_EGRESS */
1695
1696static struct static_key netstamp_needed __read_mostly;
1697#ifdef HAVE_JUMP_LABEL
 
 
 
 
 
 
1698static atomic_t netstamp_needed_deferred;
1699static atomic_t netstamp_wanted;
1700static void netstamp_clear(struct work_struct *work)
1701{
1702	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1703	int wanted;
1704
1705	wanted = atomic_add_return(deferred, &netstamp_wanted);
1706	if (wanted > 0)
1707		static_key_enable(&netstamp_needed);
1708	else
1709		static_key_disable(&netstamp_needed);
1710}
1711static DECLARE_WORK(netstamp_work, netstamp_clear);
1712#endif
1713
1714void net_enable_timestamp(void)
1715{
1716#ifdef HAVE_JUMP_LABEL
1717	int wanted;
1718
1719	while (1) {
1720		wanted = atomic_read(&netstamp_wanted);
1721		if (wanted <= 0)
1722			break;
1723		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1724			return;
1725	}
1726	atomic_inc(&netstamp_needed_deferred);
1727	schedule_work(&netstamp_work);
1728#else
1729	static_key_slow_inc(&netstamp_needed);
1730#endif
1731}
1732EXPORT_SYMBOL(net_enable_timestamp);
1733
1734void net_disable_timestamp(void)
1735{
1736#ifdef HAVE_JUMP_LABEL
1737	int wanted;
1738
1739	while (1) {
1740		wanted = atomic_read(&netstamp_wanted);
1741		if (wanted <= 1)
1742			break;
1743		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1744			return;
1745	}
1746	atomic_dec(&netstamp_needed_deferred);
1747	schedule_work(&netstamp_work);
1748#else
1749	static_key_slow_dec(&netstamp_needed);
1750#endif
1751}
1752EXPORT_SYMBOL(net_disable_timestamp);
1753
1754static inline void net_timestamp_set(struct sk_buff *skb)
1755{
1756	skb->tstamp = 0;
1757	if (static_key_false(&netstamp_needed))
1758		__net_timestamp(skb);
 
1759}
1760
/*
 * Stamp SKB only when timestamps are needed, COND holds and SKB has no
 * timestamp yet. COND is evaluated only when the static key is enabled.
 */
#define net_timestamp_check(COND, SKB)				\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp)			\
			__net_timestamp(SKB);			\
	}
1766
1767bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1768{
1769	unsigned int len;
1770
1771	if (!(dev->flags & IFF_UP))
1772		return false;
1773
1774	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1775	if (skb->len <= len)
1776		return true;
1777
1778	/* if TSO is enabled, we don't care about the length as the packet
1779	 * could be forwarded without being segmented before
1780	 */
1781	if (skb_is_gso(skb))
1782		return true;
1783
1784	return false;
1785}
1786EXPORT_SYMBOL_GPL(is_skb_forwardable);
1787
1788int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 
1789{
1790	int ret = ____dev_forward_skb(dev, skb);
1791
1792	if (likely(!ret)) {
1793		skb->protocol = eth_type_trans(skb, dev);
1794		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1795	}
1796
1797	return ret;
1798}
 
 
 
 
 
1799EXPORT_SYMBOL_GPL(__dev_forward_skb);
1800
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 *
 * A non-zero result from __dev_forward_skb() is returned as is;
 * otherwise the skb is handed to netif_rx_internal().
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = __dev_forward_skb(dev, skb);

	if (ret)
		return ret;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1824
 
 
 
 
 
1825static inline int deliver_skb(struct sk_buff *skb,
1826			      struct packet_type *pt_prev,
1827			      struct net_device *orig_dev)
1828{
1829	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1830		return -ENOMEM;
1831	atomic_inc(&skb->users);
1832	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1833}
1834
1835static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1836					  struct packet_type **pt,
1837					  struct net_device *orig_dev,
1838					  __be16 type,
1839					  struct list_head *ptype_list)
1840{
1841	struct packet_type *ptype, *pt_prev = *pt;
1842
1843	list_for_each_entry_rcu(ptype, ptype_list, list) {
1844		if (ptype->type != type)
1845			continue;
1846		if (pt_prev)
1847			deliver_skb(skb, pt_prev, orig_dev);
1848		pt_prev = ptype;
1849	}
1850	*pt = pt_prev;
1851}
1852
1853static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1854{
1855	if (!ptype->af_packet_priv || !skb->sk)
1856		return false;
1857
1858	if (ptype->id_match)
1859		return ptype->id_match(ptype, skb->sk);
1860	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1861		return true;
1862
1863	return false;
1864}
1865
 
 
 
 
 
 
 
 
 
 
 
 
1866/*
1867 *	Support routine. Sends outgoing frames to any network
1868 *	taps currently in use.
1869 */
1870
1871void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1872{
1873	struct packet_type *ptype;
 
1874	struct sk_buff *skb2 = NULL;
1875	struct packet_type *pt_prev = NULL;
1876	struct list_head *ptype_list = &ptype_all;
1877
1878	rcu_read_lock();
1879again:
1880	list_for_each_entry_rcu(ptype, ptype_list, list) {
 
 
 
1881		/* Never send packets back to the socket
1882		 * they originated from - MvS (miquels@drinkel.ow.org)
1883		 */
1884		if (skb_loop_sk(ptype, skb))
1885			continue;
1886
1887		if (pt_prev) {
1888			deliver_skb(skb2, pt_prev, skb->dev);
1889			pt_prev = ptype;
1890			continue;
1891		}
1892
1893		/* need to clone skb, done only once */
1894		skb2 = skb_clone(skb, GFP_ATOMIC);
1895		if (!skb2)
1896			goto out_unlock;
1897
1898		net_timestamp_set(skb2);
1899
1900		/* skb->nh should be correctly
1901		 * set by sender, so that the second statement is
1902		 * just protection against buggy protocols.
1903		 */
1904		skb_reset_mac_header(skb2);
1905
1906		if (skb_network_header(skb2) < skb2->data ||
1907		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1908			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1909					     ntohs(skb2->protocol),
1910					     dev->name);
1911			skb_reset_network_header(skb2);
1912		}
1913
1914		skb2->transport_header = skb2->network_header;
1915		skb2->pkt_type = PACKET_OUTGOING;
1916		pt_prev = ptype;
1917	}
1918
1919	if (ptype_list == &ptype_all) {
1920		ptype_list = &dev->ptype_all;
1921		goto again;
1922	}
1923out_unlock:
1924	if (pt_prev)
1925		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 
 
 
 
1926	rcu_read_unlock();
1927}
1928EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1929
1930/**
1931 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1932 * @dev: Network device
1933 * @txq: number of queues available
1934 *
1935 * If real_num_tx_queues is changed the tc mappings may no longer be
1936 * valid. To resolve this verify the tc mapping remains valid and if
1937 * not NULL the mapping. With no priorities mapping to this
1938 * offset/count pair it will no longer be used. In the worst case TC0
1939 * is invalid nothing can be done so disable priority mappings. If is
1940 * expected that drivers will fix this mapping if they can before
1941 * calling netif_set_real_num_tx_queues.
1942 */
1943static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944{
1945	int i;
1946	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1947
1948	/* If TC0 is invalidated disable TC mapping */
1949	if (tc->offset + tc->count > txq) {
1950		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1951		dev->num_tc = 0;
1952		return;
1953	}
1954
1955	/* Invalidated prio to tc mappings set to TC0 */
1956	for (i = 1; i < TC_BITMASK + 1; i++) {
1957		int q = netdev_get_prio_tc_map(dev, i);
1958
1959		tc = &dev->tc_to_txq[q];
1960		if (tc->offset + tc->count > txq) {
1961			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1962				i, q);
1963			netdev_set_prio_tc_map(dev, i, 0);
1964		}
1965	}
1966}
1967
1968int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1969{
1970	if (dev->num_tc) {
1971		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972		int i;
1973
 
1974		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1975			if ((txq - tc->offset) < tc->count)
1976				return i;
1977		}
1978
 
1979		return -1;
1980	}
1981
1982	return 0;
1983}
 
1984
1985#ifdef CONFIG_XPS
 
 
/* Held by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);

/* Update-side dereference of an XPS RCU pointer; lockdep checks the mutex. */
#define xmap_dereference(P) \
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1989
1990static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1991			     int tci, u16 index)
1992{
1993	struct xps_map *map = NULL;
1994	int pos;
1995
1996	if (dev_maps)
1997		map = xmap_dereference(dev_maps->cpu_map[tci]);
1998	if (!map)
1999		return false;
2000
2001	for (pos = map->len; pos--;) {
2002		if (map->queues[pos] != index)
2003			continue;
2004
2005		if (map->len > 1) {
2006			map->queues[pos] = map->queues[--map->len];
2007			break;
2008		}
2009
2010		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
 
 
2011		kfree_rcu(map, rcu);
2012		return false;
2013	}
2014
2015	return true;
2016}
2017
2018static bool remove_xps_queue_cpu(struct net_device *dev,
2019				 struct xps_dev_maps *dev_maps,
2020				 int cpu, u16 offset, u16 count)
2021{
2022	int num_tc = dev->num_tc ? : 1;
2023	bool active = false;
2024	int tci;
2025
2026	for (tci = cpu * num_tc; num_tc--; tci++) {
2027		int i, j;
2028
2029		for (i = count, j = offset; i--; j++) {
2030			if (!remove_xps_queue(dev_maps, cpu, j))
2031				break;
2032		}
2033
2034		active |= i < 0;
2035	}
2036
2037	return active;
2038}
2039
2040static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2041				   u16 count)
 
 
 
 
 
 
 
 
 
 
 
 
 
2042{
2043	struct xps_dev_maps *dev_maps;
2044	int cpu, i;
2045	bool active = false;
 
2046
2047	mutex_lock(&xps_map_mutex);
2048	dev_maps = xmap_dereference(dev->xps_maps);
2049
2050	if (!dev_maps)
2051		goto out_no_maps;
2052
2053	for_each_possible_cpu(cpu)
2054		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2055					       offset, count);
2056
2057	if (!active) {
2058		RCU_INIT_POINTER(dev->xps_maps, NULL);
2059		kfree_rcu(dev_maps, rcu);
 
 
2060	}
 
2061
2062	for (i = offset + (count - 1); count--; i--)
2063		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2064					     NUMA_NO_NODE);
 
 
 
 
 
 
 
 
 
 
2065
2066out_no_maps:
2067	mutex_unlock(&xps_map_mutex);
 
2068}
2069
2070static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2071{
2072	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2073}
2074
2075static struct xps_map *expand_xps_map(struct xps_map *map,
2076				      int cpu, u16 index)
2077{
2078	struct xps_map *new_map;
2079	int alloc_len = XPS_MIN_MAP_ALLOC;
2080	int i, pos;
2081
2082	for (pos = 0; map && pos < map->len; pos++) {
2083		if (map->queues[pos] != index)
2084			continue;
2085		return map;
2086	}
2087
2088	/* Need to add queue to this CPU's existing map */
2089	if (map) {
2090		if (pos < map->alloc_len)
2091			return map;
2092
2093		alloc_len = map->alloc_len * 2;
2094	}
2095
2096	/* Need to allocate new map to store queue on this CPU's map */
2097	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2098			       cpu_to_node(cpu));
 
 
 
 
 
2099	if (!new_map)
2100		return NULL;
2101
2102	for (i = 0; i < pos; i++)
2103		new_map->queues[i] = map->queues[i];
2104	new_map->alloc_len = alloc_len;
2105	new_map->len = pos;
2106
2107	return new_map;
2108}
2109
2110int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2111			u16 index)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2112{
2113	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2114	int i, cpu, tci, numa_node_id = -2;
 
 
2115	int maps_sz, num_tc = 1, tc = 0;
2116	struct xps_map *map, *new_map;
2117	bool active = false;
 
 
2118
2119	if (dev->num_tc) {
 
2120		num_tc = dev->num_tc;
 
 
 
 
 
 
2121		tc = netdev_txq_to_tc(dev, index);
2122		if (tc < 0)
2123			return -EINVAL;
2124	}
2125
2126	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
 
 
 
 
 
 
 
 
 
 
 
 
2127	if (maps_sz < L1_CACHE_BYTES)
2128		maps_sz = L1_CACHE_BYTES;
2129
2130	mutex_lock(&xps_map_mutex);
2131
2132	dev_maps = xmap_dereference(dev->xps_maps);
 
 
 
 
 
2133
2134	/* allocate memory for queue storage */
2135	for_each_cpu_and(cpu, cpu_online_mask, mask) {
2136		if (!new_dev_maps)
 
2137			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2138		if (!new_dev_maps) {
2139			mutex_unlock(&xps_map_mutex);
2140			return -ENOMEM;
 
 
 
 
2141		}
2142
2143		tci = cpu * num_tc + tc;
2144		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2145				 NULL;
2146
2147		map = expand_xps_map(map, cpu, index);
2148		if (!map)
2149			goto error;
2150
2151		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2152	}
2153
2154	if (!new_dev_maps)
2155		goto out_no_new_maps;
2156
2157	for_each_possible_cpu(cpu) {
2158		/* copy maps belonging to foreign traffic classes */
2159		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2160			/* fill in the new device map from the old device map */
2161			map = xmap_dereference(dev_maps->cpu_map[tci]);
2162			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163		}
2164
2165		/* We need to explicitly update tci as prevous loop
2166		 * could break out early if dev_maps is NULL.
2167		 */
2168		tci = cpu * num_tc + tc;
 
 
 
 
2169
2170		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2171			/* add queue to CPU maps */
2172			int pos = 0;
2173
2174			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2175			while ((pos < map->len) && (map->queues[pos] != index))
2176				pos++;
2177
2178			if (pos == map->len)
2179				map->queues[map->len++] = index;
2180#ifdef CONFIG_NUMA
2181			if (numa_node_id == -2)
2182				numa_node_id = cpu_to_node(cpu);
2183			else if (numa_node_id != cpu_to_node(cpu))
2184				numa_node_id = -1;
 
 
2185#endif
2186		} else if (dev_maps) {
2187			/* fill in the new device map from the old device map */
2188			map = xmap_dereference(dev_maps->cpu_map[tci]);
2189			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2190		}
2191
2192		/* copy maps belonging to foreign traffic classes */
2193		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2194			/* fill in the new device map from the old device map */
2195			map = xmap_dereference(dev_maps->cpu_map[tci]);
2196			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2197		}
2198	}
2199
2200	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2201
2202	/* Cleanup old maps */
2203	if (!dev_maps)
2204		goto out_no_old_maps;
2205
2206	for_each_possible_cpu(cpu) {
2207		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2208			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2209			map = xmap_dereference(dev_maps->cpu_map[tci]);
2210			if (map && map != new_map)
2211				kfree_rcu(map, rcu);
 
 
 
 
 
 
 
 
2212		}
2213	}
2214
2215	kfree_rcu(dev_maps, rcu);
2216
2217out_no_old_maps:
2218	dev_maps = new_dev_maps;
2219	active = true;
2220
2221out_no_new_maps:
2222	/* update Tx queue numa node */
2223	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2224				     (numa_node_id >= 0) ? numa_node_id :
2225				     NUMA_NO_NODE);
 
2226
2227	if (!dev_maps)
2228		goto out_no_maps;
2229
2230	/* removes queue from unused CPUs */
2231	for_each_possible_cpu(cpu) {
2232		for (i = tc, tci = cpu * num_tc; i--; tci++)
2233			active |= remove_xps_queue(dev_maps, tci, index);
2234		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2235			active |= remove_xps_queue(dev_maps, tci, index);
2236		for (i = num_tc - tc, tci++; --i; tci++)
2237			active |= remove_xps_queue(dev_maps, tci, index);
 
 
 
 
 
 
2238	}
2239
 
 
 
2240	/* free map if not active */
2241	if (!active) {
2242		RCU_INIT_POINTER(dev->xps_maps, NULL);
2243		kfree_rcu(dev_maps, rcu);
2244	}
2245
2246out_no_maps:
2247	mutex_unlock(&xps_map_mutex);
2248
2249	return 0;
2250error:
2251	/* remove any maps that we added */
2252	for_each_possible_cpu(cpu) {
2253		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2254			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2255			map = dev_maps ?
2256			      xmap_dereference(dev_maps->cpu_map[tci]) :
2257			      NULL;
2258			if (new_map && new_map != map)
2259				kfree(new_map);
2260		}
2261	}
2262
2263	mutex_unlock(&xps_map_mutex);
2264
2265	kfree(new_dev_maps);
2266	return -ENOMEM;
2267}
 
 
 
 
 
 
 
 
 
 
 
 
 
2268EXPORT_SYMBOL(netif_set_xps_queue);
2269
2270#endif
 
 
 
 
 
 
 
 
 
 
 
2271void netdev_reset_tc(struct net_device *dev)
2272{
2273#ifdef CONFIG_XPS
2274	netif_reset_xps_queues_gt(dev, 0);
2275#endif
 
 
 
2276	dev->num_tc = 0;
2277	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2278	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2279}
2280EXPORT_SYMBOL(netdev_reset_tc);
2281
2282int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2283{
2284	if (tc >= dev->num_tc)
2285		return -EINVAL;
2286
2287#ifdef CONFIG_XPS
2288	netif_reset_xps_queues(dev, offset, count);
2289#endif
2290	dev->tc_to_txq[tc].count = count;
2291	dev->tc_to_txq[tc].offset = offset;
2292	return 0;
2293}
2294EXPORT_SYMBOL(netdev_set_tc_queue);
2295
2296int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2297{
2298	if (num_tc > TC_MAX_QUEUE)
2299		return -EINVAL;
2300
2301#ifdef CONFIG_XPS
2302	netif_reset_xps_queues_gt(dev, 0);
2303#endif
 
 
2304	dev->num_tc = num_tc;
2305	return 0;
2306}
2307EXPORT_SYMBOL(netdev_set_num_tc);
2308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2309/*
2310 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2311 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2312 */
2313int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2314{
 
2315	int rc;
2316
 
 
2317	if (txq < 1 || txq > dev->num_tx_queues)
2318		return -EINVAL;
2319
2320	if (dev->reg_state == NETREG_REGISTERED ||
2321	    dev->reg_state == NETREG_UNREGISTERING) {
2322		ASSERT_RTNL();
2323
2324		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2325						  txq);
2326		if (rc)
2327			return rc;
2328
2329		if (dev->num_tc)
2330			netif_setup_tc(dev, txq);
2331
2332		if (txq < dev->real_num_tx_queues) {
 
 
 
 
 
 
 
2333			qdisc_reset_all_tx_gt(dev, txq);
2334#ifdef CONFIG_XPS
2335			netif_reset_xps_queues_gt(dev, txq);
2336#endif
2337		}
 
 
2338	}
2339
2340	dev->real_num_tx_queues = txq;
2341	return 0;
2342}
2343EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2344
2345#ifdef CONFIG_SYSFS
2346/**
2347 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2348 *	@dev: Network device
2349 *	@rxq: Actual number of RX queues
2350 *
2351 *	This must be called either with the rtnl_lock held or before
2352 *	registration of the net device.  Returns 0 on success, or a
2353 *	negative error code.  If called before registration, it always
2354 *	succeeds.
2355 */
2356int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2357{
2358	int rc;
2359
2360	if (rxq < 1 || rxq > dev->num_rx_queues)
2361		return -EINVAL;
2362
2363	if (dev->reg_state == NETREG_REGISTERED) {
2364		ASSERT_RTNL();
2365
2366		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2367						  rxq);
2368		if (rc)
2369			return rc;
2370	}
2371
2372	dev->real_num_rx_queues = rxq;
2373	return 0;
2374}
2375EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2376#endif
2377
2378/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2379 * netif_get_num_default_rss_queues - default number of RSS queues
2380 *
2381 * This routine should set an upper limit on the number of RSS queues
2382 * used by default by multiqueue devices.
2383 */
2384int netif_get_num_default_rss_queues(void)
2385{
2386	return is_kdump_kernel() ?
2387		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 
 
 
 
 
 
 
 
 
 
 
 
2388}
2389EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2390
2391static void __netif_reschedule(struct Qdisc *q)
2392{
2393	struct softnet_data *sd;
2394	unsigned long flags;
2395
2396	local_irq_save(flags);
2397	sd = this_cpu_ptr(&softnet_data);
2398	q->next_sched = NULL;
2399	*sd->output_queue_tailp = q;
2400	sd->output_queue_tailp = &q->next_sched;
2401	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2402	local_irq_restore(flags);
2403}
2404
2405void __netif_schedule(struct Qdisc *q)
2406{
2407	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2408		__netif_reschedule(q);
2409}
2410EXPORT_SYMBOL(__netif_schedule);
2411
2412struct dev_kfree_skb_cb {
2413	enum skb_free_reason reason;
2414};
2415
2416static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2417{
2418	return (struct dev_kfree_skb_cb *)skb->cb;
2419}
2420
2421void netif_schedule_queue(struct netdev_queue *txq)
2422{
2423	rcu_read_lock();
2424	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2425		struct Qdisc *q = rcu_dereference(txq->qdisc);
2426
2427		__netif_schedule(q);
2428	}
2429	rcu_read_unlock();
2430}
2431EXPORT_SYMBOL(netif_schedule_queue);
2432
2433/**
2434 *	netif_wake_subqueue - allow sending packets on subqueue
2435 *	@dev: network device
2436 *	@queue_index: sub queue index
2437 *
2438 * Resume individual transmit queue of a device with multiple transmit queues.
2439 */
2440void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2441{
2442	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2443
2444	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2445		struct Qdisc *q;
2446
2447		rcu_read_lock();
2448		q = rcu_dereference(txq->qdisc);
2449		__netif_schedule(q);
2450		rcu_read_unlock();
2451	}
2452}
2453EXPORT_SYMBOL(netif_wake_subqueue);
2454
2455void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2456{
2457	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2458		struct Qdisc *q;
2459
2460		rcu_read_lock();
2461		q = rcu_dereference(dev_queue->qdisc);
2462		__netif_schedule(q);
2463		rcu_read_unlock();
2464	}
2465}
2466EXPORT_SYMBOL(netif_tx_wake_queue);
2467
2468void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2469{
2470	unsigned long flags;
2471
2472	if (likely(atomic_read(&skb->users) == 1)) {
 
 
 
2473		smp_rmb();
2474		atomic_set(&skb->users, 0);
2475	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2476		return;
2477	}
2478	get_kfree_skb_cb(skb)->reason = reason;
2479	local_irq_save(flags);
2480	skb->next = __this_cpu_read(softnet_data.completion_queue);
2481	__this_cpu_write(softnet_data.completion_queue, skb);
2482	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2483	local_irq_restore(flags);
2484}
2485EXPORT_SYMBOL(__dev_kfree_skb_irq);
2486
2487void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2488{
2489	if (in_irq() || irqs_disabled())
2490		__dev_kfree_skb_irq(skb, reason);
2491	else
2492		dev_kfree_skb(skb);
2493}
2494EXPORT_SYMBOL(__dev_kfree_skb_any);
2495
2496
2497/**
2498 * netif_device_detach - mark device as removed
2499 * @dev: network device
2500 *
2501 * Mark device as removed from system and therefore no longer available.
2502 */
2503void netif_device_detach(struct net_device *dev)
2504{
2505	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2506	    netif_running(dev)) {
2507		netif_tx_stop_all_queues(dev);
2508	}
2509}
2510EXPORT_SYMBOL(netif_device_detach);
2511
2512/**
2513 * netif_device_attach - mark device as attached
2514 * @dev: network device
2515 *
2516 * Mark device as attached from system and restart if needed.
2517 */
2518void netif_device_attach(struct net_device *dev)
2519{
2520	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2521	    netif_running(dev)) {
2522		netif_tx_wake_all_queues(dev);
2523		__netdev_watchdog_up(dev);
2524	}
2525}
2526EXPORT_SYMBOL(netif_device_attach);
2527
2528/*
2529 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2530 * to be used as a distribution range.
2531 */
2532u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2533		  unsigned int num_tx_queues)
 
2534{
2535	u32 hash;
2536	u16 qoffset = 0;
2537	u16 qcount = num_tx_queues;
 
 
 
 
 
 
 
 
 
 
 
 
 
2538
2539	if (skb_rx_queue_recorded(skb)) {
 
2540		hash = skb_get_rx_queue(skb);
2541		while (unlikely(hash >= num_tx_queues))
2542			hash -= num_tx_queues;
2543		return hash;
2544	}
2545
2546	if (dev->num_tc) {
2547		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2548		qoffset = dev->tc_to_txq[tc].offset;
2549		qcount = dev->tc_to_txq[tc].count;
2550	}
2551
2552	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2553}
2554EXPORT_SYMBOL(__skb_tx_hash);
2555
2556static void skb_warn_bad_offload(const struct sk_buff *skb)
2557{
2558	static const netdev_features_t null_features;
2559	struct net_device *dev = skb->dev;
2560	const char *name = "";
2561
2562	if (!net_ratelimit())
2563		return;
2564
2565	if (dev) {
2566		if (dev->dev.parent)
2567			name = dev_driver_string(dev->dev.parent);
2568		else
2569			name = netdev_name(dev);
2570	}
2571	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2572	     "gso_type=%d ip_summed=%d\n",
2573	     name, dev ? &dev->features : &null_features,
2574	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2575	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2576	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2577}
2578
2579/*
2580 * Invalidate hardware checksum when packet is to be mangled, and
2581 * complete checksum manually on outgoing path.
2582 */
2583int skb_checksum_help(struct sk_buff *skb)
2584{
2585	__wsum csum;
2586	int ret = 0, offset;
2587
2588	if (skb->ip_summed == CHECKSUM_COMPLETE)
2589		goto out_set_summed;
2590
2591	if (unlikely(skb_shinfo(skb)->gso_size)) {
2592		skb_warn_bad_offload(skb);
2593		return -EINVAL;
2594	}
2595
 
 
 
 
2596	/* Before computing a checksum, we should make sure no frag could
2597	 * be modified by an external entity : checksum could be wrong.
2598	 */
2599	if (skb_has_shared_frag(skb)) {
2600		ret = __skb_linearize(skb);
2601		if (ret)
2602			goto out;
2603	}
2604
2605	offset = skb_checksum_start_offset(skb);
2606	BUG_ON(offset >= skb_headlen(skb));
 
 
 
 
 
 
2607	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2608
2609	offset += skb->csum_offset;
2610	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2611
2612	if (skb_cloned(skb) &&
2613	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2614		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2615		if (ret)
2616			goto out;
2617	}
 
 
 
2618
2619	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2620out_set_summed:
2621	skb->ip_summed = CHECKSUM_NONE;
2622out:
2623	return ret;
2624}
2625EXPORT_SYMBOL(skb_checksum_help);
2626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629	__be16 type = skb->protocol;
2630
2631	/* Tunnel gso handlers can set protocol to ethernet. */
2632	if (type == htons(ETH_P_TEB)) {
2633		struct ethhdr *eth;
2634
2635		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636			return 0;
2637
2638		eth = (struct ethhdr *)skb_mac_header(skb);
2639		type = eth->h_proto;
2640	}
2641
2642	return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 *	skb_mac_gso_segment - mac layer segmentation handler.
2647 *	@skb: buffer to segment
2648 *	@features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651				    netdev_features_t features)
2652{
2653	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654	struct packet_offload *ptype;
2655	int vlan_depth = skb->mac_len;
2656	__be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658	if (unlikely(!type))
2659		return ERR_PTR(-EINVAL);
2660
2661	__skb_pull(skb, vlan_depth);
2662
2663	rcu_read_lock();
2664	list_for_each_entry_rcu(ptype, &offload_base, list) {
2665		if (ptype->type == type && ptype->callbacks.gso_segment) {
2666			segs = ptype->callbacks.gso_segment(skb, features);
2667			break;
2668		}
2669	}
2670	rcu_read_unlock();
2671
2672	__skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674	return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683	if (tx_path)
2684		return skb->ip_summed != CHECKSUM_PARTIAL;
2685	else
2686		return skb->ip_summed == CHECKSUM_NONE;
2687}
2688
2689/**
2690 *	__skb_gso_segment - Perform segmentation on skb.
2691 *	@skb: buffer to segment
2692 *	@features: features for the output path (see dev->features)
2693 *	@tx_path: whether it is called in TX path
2694 *
2695 *	This function segments the given skb and returns a list of segments.
2696 *
2697 *	It may return NULL if the skb requires no segmentation.  This is
2698 *	only possible when GSO is used for verifying header integrity.
2699 *
2700 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703				  netdev_features_t features, bool tx_path)
2704{
2705	if (unlikely(skb_needs_check(skb, tx_path))) {
2706		int err;
2707
2708		skb_warn_bad_offload(skb);
2709
2710		err = skb_cow_head(skb, 0);
2711		if (err < 0)
2712			return ERR_PTR(err);
2713	}
2714
2715	/* Only report GSO partial support if it will enable us to
2716	 * support segmentation on this frame without needing additional
2717	 * work.
2718	 */
2719	if (features & NETIF_F_GSO_PARTIAL) {
2720		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721		struct net_device *dev = skb->dev;
2722
2723		partial_features |= dev->features & dev->gso_partial_features;
2724		if (!skb_gso_ok(skb, features | partial_features))
2725			features &= ~NETIF_F_GSO_PARTIAL;
2726	}
2727
2728	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732	SKB_GSO_CB(skb)->encap_level = 0;
2733
2734	skb_reset_mac_header(skb);
2735	skb_reset_mac_len(skb);
2736
2737	return skb_mac_gso_segment(skb, features);
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
2740
2741/* Take action when hardware reception checksum errors are detected. */
2742#ifdef CONFIG_BUG
2743void netdev_rx_csum_fault(struct net_device *dev)
2744{
2745	if (net_ratelimit()) {
2746		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2747		dump_stack();
2748	}
2749}
2750EXPORT_SYMBOL(netdev_rx_csum_fault);
2751#endif
2752
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a fragment of @skb cannot be DMA'd by @dev, 0 otherwise
 * (always 0 without CONFIG_HIGHMEM).
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int idx;

	/* Device cannot DMA from highmem: reject any highmem fragment. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;

		/* Reject fragments whose page ends beyond the DMA mask. */
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			skb_frag_t *frag = &shinfo->frags[idx];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!pdev->dma_mask)
				return 1;
			if (addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2785
2786/* If MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of standard features for the netdev.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791					   netdev_features_t features,
2792					   __be16 type)
2793{
2794	if (eth_p_mpls(type))
2795		features &= skb->dev->mpls_features;
2796
2797	return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801					   netdev_features_t features,
2802					   __be16 type)
2803{
2804	return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809	netdev_features_t features)
2810{
2811	int tmp;
2812	__be16 type;
2813
2814	type = skb_network_protocol(skb, &tmp);
2815	features = net_mpls_features(skb, features, type);
2816
2817	if (skb->ip_summed != CHECKSUM_NONE &&
2818	    !can_checksum_protocol(features, type)) {
2819		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820	}
2821	if (illegal_highdma(skb->dev, skb))
2822		features &= ~NETIF_F_SG;
2823
2824	return features;
2825}
2826
2827netdev_features_t passthru_features_check(struct sk_buff *skb,
2828					  struct net_device *dev,
2829					  netdev_features_t features)
2830{
2831	return features;
2832}
2833EXPORT_SYMBOL(passthru_features_check);
2834
2835static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836					     struct net_device *dev,
2837					     netdev_features_t features)
2838{
2839	return vlan_features_check(skb, features);
2840}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843					    struct net_device *dev,
2844					    netdev_features_t features)
2845{
2846	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848	if (gso_segs > dev->gso_max_segs)
 
 
 
 
 
 
 
2849		return features & ~NETIF_F_GSO_MASK;
 
2850
2851	/* Support for GSO partial features requires software
2852	 * intervention before we can actually process the packets
2853	 * so we need to strip support for any partial features now
2854	 * and we can pull them back in after we have partially
2855	 * segmented the frame.
2856	 */
2857	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858		features &= ~dev->gso_partial_features;
2859
2860	/* Make sure to clear the IPv4 ID mangling feature if the
2861	 * IPv4 header has the potential to be fragmented.
2862	 */
2863	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864		struct iphdr *iph = skb->encapsulation ?
2865				    inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867		if (!(iph->frag_off & htons(IP_DF)))
2868			features &= ~NETIF_F_TSO_MANGLEID;
2869	}
2870
2871	return features;
2872}
2873
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876	struct net_device *dev = skb->dev;
2877	netdev_features_t features = dev->features;
2878
2879	if (skb_is_gso(skb))
2880		features = gso_features_check(skb, dev, features);
2881
2882	/* If encapsulation offload request, verify we are testing
2883	 * hardware encapsulation features instead of standard
2884	 * features for the netdev
2885	 */
2886	if (skb->encapsulation)
2887		features &= dev->hw_enc_features;
2888
2889	if (skb_vlan_tagged(skb))
2890		features = netdev_intersect_features(features,
2891						     dev->vlan_features |
2892						     NETIF_F_HW_VLAN_CTAG_TX |
2893						     NETIF_F_HW_VLAN_STAG_TX);
2894
2895	if (dev->netdev_ops->ndo_features_check)
2896		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897								features);
2898	else
2899		features &= dflt_features_check(skb, dev, features);
2900
2901	return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906		    struct netdev_queue *txq, bool more)
2907{
2908	unsigned int len;
2909	int rc;
2910
2911	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912		dev_queue_xmit_nit(skb, dev);
2913
2914	len = skb->len;
2915	trace_net_dev_start_xmit(skb, dev);
2916	rc = netdev_start_xmit(skb, dev, txq, more);
2917	trace_net_dev_xmit(skb, rc, dev, len);
2918
2919	return rc;
2920}
2921
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923				    struct netdev_queue *txq, int *ret)
2924{
2925	struct sk_buff *skb = first;
2926	int rc = NETDEV_TX_OK;
2927
2928	while (skb) {
2929		struct sk_buff *next = skb->next;
2930
2931		skb->next = NULL;
2932		rc = xmit_one(skb, dev, txq, next != NULL);
2933		if (unlikely(!dev_xmit_complete(rc))) {
2934			skb->next = next;
2935			goto out;
2936		}
2937
2938		skb = next;
2939		if (netif_xmit_stopped(txq) && skb) {
2940			rc = NETDEV_TX_BUSY;
2941			break;
2942		}
2943	}
2944
2945out:
2946	*ret = rc;
2947	return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951					  netdev_features_t features)
2952{
2953	if (skb_vlan_tag_present(skb) &&
2954	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2955		skb = __vlan_hwaccel_push_inside(skb);
2956	return skb;
2957}
2958
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2960{
2961	netdev_features_t features;
2962
2963	features = netif_skb_features(skb);
2964	skb = validate_xmit_vlan(skb, features);
2965	if (unlikely(!skb))
2966		goto out_null;
2967
 
 
 
 
2968	if (netif_needs_gso(skb, features)) {
2969		struct sk_buff *segs;
2970
2971		segs = skb_gso_segment(skb, features);
2972		if (IS_ERR(segs)) {
2973			goto out_kfree_skb;
2974		} else if (segs) {
2975			consume_skb(skb);
2976			skb = segs;
2977		}
2978	} else {
2979		if (skb_needs_linearize(skb, features) &&
2980		    __skb_linearize(skb))
2981			goto out_kfree_skb;
2982
2983		/* If packet is not checksummed and device does not
2984		 * support checksumming for this protocol, complete
2985		 * checksumming here.
2986		 */
2987		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988			if (skb->encapsulation)
2989				skb_set_inner_transport_header(skb,
2990							       skb_checksum_start_offset(skb));
2991			else
2992				skb_set_transport_header(skb,
2993							 skb_checksum_start_offset(skb));
2994			if (!(features & NETIF_F_CSUM_MASK) &&
2995			    skb_checksum_help(skb))
2996				goto out_kfree_skb;
2997		}
2998	}
2999
 
 
3000	return skb;
3001
3002out_kfree_skb:
3003	kfree_skb(skb);
3004out_null:
3005	atomic_long_inc(&dev->tx_dropped);
3006	return NULL;
3007}
3008
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
3011	struct sk_buff *next, *head = NULL, *tail;
3012
3013	for (; skb != NULL; skb = next) {
3014		next = skb->next;
3015		skb->next = NULL;
3016
3017		/* in case skb wont be segmented, point to itself */
3018		skb->prev = skb;
3019
3020		skb = validate_xmit_skb(skb, dev);
3021		if (!skb)
3022			continue;
3023
3024		if (!head)
3025			head = skb;
3026		else
3027			tail->next = skb;
3028		/* If skb was segmented, skb->prev points to
3029		 * the last segment. If not, it still contains skb.
3030		 */
3031		tail = skb->prev;
3032	}
3033	return head;
3034}
3035EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3036
3037static void qdisc_pkt_len_init(struct sk_buff *skb)
3038{
3039	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3040
3041	qdisc_skb_cb(skb)->pkt_len = skb->len;
3042
3043	/* To get more precise estimation of bytes sent on wire,
3044	 * we add to pkt_len the headers size of all segments
3045	 */
3046	if (shinfo->gso_size)  {
 
3047		unsigned int hdr_len;
3048		u16 gso_segs = shinfo->gso_segs;
3049
3050		/* mac layer + network layer */
3051		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3052
3053		/* + transport layer */
3054		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3055			hdr_len += tcp_hdrlen(skb);
3056		else
3057			hdr_len += sizeof(struct udphdr);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3058
3059		if (shinfo->gso_type & SKB_GSO_DODGY)
3060			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3061						shinfo->gso_size);
 
 
3062
3063		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3064	}
 
 
3065}
3066
/*
 * __dev_xmit_skb - hand @skb to qdisc @q, transmitting directly when possible
 * @skb: packet to send
 * @q:   qdisc attached to @txq
 * @dev: egress device
 * @txq: transmit queue selected for @skb
 *
 * Takes the qdisc root lock and then does one of three things:
 *  - drops the packet (NET_XMIT_DROP) if the qdisc is deactivated;
 *  - bypasses the queue and transmits directly (NET_XMIT_SUCCESS) if the
 *    qdisc allows bypass, is empty and is not already running;
 *  - otherwise enqueues the packet and returns the enqueue verdict masked
 *    with NET_XMIT_MASK, running the qdisc if nobody else is.
 *
 * Packets collected on the to_free list are freed after the root lock has
 * been released.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		/* Defer the actual free until the root lock is dropped. */
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* Release busylock before the dequeue loop so
			 * other enqueuers are not held up behind it.
			 */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	/* Still holding busylock if none of the run paths released it. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
3128
3129#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3130static void skb_update_prio(struct sk_buff *skb)
3131{
3132	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
 
 
3133
3134	if (!skb->priority && skb->sk && map) {
3135		unsigned int prioidx =
3136			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
 
 
 
 
 
3137
3138		if (prioidx < map->priomap_len)
3139			skb->priority = map->priomap[prioidx];
3140	}
 
3141}
3142#else
3143#define skb_update_prio(skb)
3144#endif
3145
/* Per-CPU transmit nesting depth.  __dev_queue_xmit() increments it around
 * dev_hard_start_xmit() for queue-less devices and refuses to transmit once
 * it exceeds XMIT_RECURSION_LIMIT.
 */
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);
3148
/**
 *	dev_loopback_xmit - loop back @skb
 *	@net: network namespace this loopback is happening in
 *	@sk:  sk needed to be a netfilter okfn
 *	@skb: buffer to transmit
 *
 *	Turns an outgoing packet into a received one: the MAC header is
 *	reset, everything in front of the network header is pulled, the
 *	packet is marked PACKET_LOOPBACK with its checksum flagged as
 *	already verified, and it is fed to netif_rx_ni().
 *	@net and @sk are not referenced in the body.
 *
 *	Always returns 0.
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	/* A looped-back packet is expected to carry a dst entry. */
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
3167
#ifdef CONFIG_NET_EGRESS
/*
 * sch_handle_egress - run the egress classifier chain of @dev on @skb
 * @skb: packet being transmitted
 * @ret: set to the transmit verdict when the packet is taken away
 * @dev: egress device
 *
 * Returns @skb if transmission should continue (no classifier attached,
 * or the verdict lets the packet through), or NULL if the packet was
 * dropped, consumed or redirected; in that case *@ret holds the value
 * the caller should return.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result cl_res;

	if (!cl)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		/* Record the minor class id and keep transmitting. */
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		/* Dropped by policy: account it and free as a drop. */
		qdisc_qstats_cpu_drop(cl->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		/* Taken by an action: not an error, so consume. */
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3210
/*
 * get_xps_queue - look up a transmit queue for @skb in the device XPS maps
 * @dev: egress device
 * @skb: packet being transmitted; skb->sender_cpu must already be valid
 *
 * Returns a queue index below dev->real_num_tx_queues, or -1 when XPS is
 * compiled out, no map applies, or the mapped queue is out of range.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_queues;
	unsigned int slot;
	int picked = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	/* sender_cpu is stored off by one; with traffic classes each CPU
	 * owns num_tc consecutive slots, one per class.
	 */
	slot = skb->sender_cpu - 1;
	if (dev->num_tc) {
		slot *= dev->num_tc;
		slot += netdev_get_prio_tc_map(dev, skb->priority);
	}

	cpu_queues = rcu_dereference(maps->cpu_map[slot]);
	if (!cpu_queues)
		goto unlock;

	/* Several candidate queues: spread flows over them by hash. */
	if (cpu_queues->len == 1)
		picked = cpu_queues->queues[0];
	else
		picked = cpu_queues->queues[reciprocal_scale(skb_get_hash(skb),
							    cpu_queues->len)];

	if (unlikely(picked >= dev->real_num_tx_queues))
		picked = -1;

unlock:
	rcu_read_unlock();

	return picked;
#else
	return -1;
#endif
}
3246
3247static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 
 
 
 
 
 
 
 
3248{
3249	struct sock *sk = skb->sk;
3250	int queue_index = sk_tx_queue_get(sk);
3251
 
 
3252	if (queue_index < 0 || skb->ooo_okay ||
3253	    queue_index >= dev->real_num_tx_queues) {
3254		int new_index = get_xps_queue(dev, skb);
 
3255		if (new_index < 0)
3256			new_index = skb_tx_hash(dev, skb);
3257
3258		if (queue_index != new_index && sk &&
3259		    sk_fullsock(sk) &&
3260		    rcu_access_pointer(sk->sk_dst_cache))
3261			sk_tx_queue_set(sk, new_index);
3262
3263		queue_index = new_index;
3264	}
3265
3266	return queue_index;
3267}
 
3268
3269struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3270				    struct sk_buff *skb,
3271				    void *accel_priv)
3272{
3273	int queue_index = 0;
3274
3275#ifdef CONFIG_XPS
3276	u32 sender_cpu = skb->sender_cpu - 1;
3277
3278	if (sender_cpu >= (u32)NR_CPUS)
3279		skb->sender_cpu = raw_smp_processor_id() + 1;
3280#endif
3281
3282	if (dev->real_num_tx_queues != 1) {
3283		const struct net_device_ops *ops = dev->netdev_ops;
 
3284		if (ops->ndo_select_queue)
3285			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3286							    __netdev_pick_tx);
3287		else
3288			queue_index = __netdev_pick_tx(dev, skb);
3289
3290		if (!accel_priv)
3291			queue_index = netdev_cap_txqueue(dev, queue_index);
3292	}
3293
3294	skb_set_queue_mapping(skb, queue_index);
3295	return netdev_get_tx_queue(dev, queue_index);
3296}
3297
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      Note that this function can also return verdicts from the queue
 *      disciplines, including NET_XMIT_DROP, which is a positive value.
 *      So, errors can also be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	/* Report a "scheduled" timestamp if the sender asked for one. */
	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
# ifdef CONFIG_NET_EGRESS
	if (static_key_false(&egress_needed)) {
		/* NULL means the classifier took the packet; rc is set. */
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue hook owns the packet from here on. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. This is the common case for software
	 * devices: loopback, all sorts of tunnels...
	 *
	 * It is unlikely that netif_tx_lock protection is really necessary
	 * here (e.g. loopback and IP tunnels are clean, ignoring statistics
	 * counters).  However, such devices may rely on the protection
	 * provided here, so take the lock after checking that this CPU
	 * does not already own it; that check keeps it free of deadlocks.
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {
			if (unlikely(__this_cpu_read(xmit_recursion) >
				     XMIT_RECURSION_LIMIT))
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				/* Track nesting depth across the driver call. */
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Device down, transmit not completed, or recursion: drop. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
 
3429
/* Public transmit entry point: __dev_queue_xmit() with no accel_priv. */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
 
 
 
 
 
 
 
 
 
 
3435
/* Same as dev_queue_xmit(), but forwards @accel_priv to the queue
 * selection done inside __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3441
 
3442
3443/*=======================================================================
3444			Receiver routines
3445  =======================================================================*/
 
 
 
3446
/* Upper bound on the per-CPU input_pkt_queue length; enqueue_to_backlog()
 * drops packets once the queue is longer than this.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Passed to net_timestamp_check(): as-is before queueing in
 * netif_rx_internal(), negated in __netif_receive_skb_core().
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * per-softirq packet budget - confirm against net_rx_action().
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
 
 
 
 
 
3453
/* Called with irq disabled.
 *
 * Appends @napi to the poll list of @sd and raises NET_RX_SOFTIRQ.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
3461
3462#ifdef CONFIG_RPS
3463
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Bits of a sock flow table entry that hold the CPU number; the remaining
 * bits are compared against the flow hash in get_rps_cpu().
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static key tested by netif_rx_internal() before doing any RPS work. */
struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
/* NOTE(review): not tested in this part of the file; presumably the RFS
 * counterpart of rps_needed - confirm at its users.
 */
struct static_key rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
3474
/*
 * set_rps_cpu - move a flow table entry to @next_cpu
 * @dev:      receiving device
 * @skb:      packet of the flow
 * @rflow:    current per-queue flow entry
 * @next_cpu: CPU the flow should be processed on from now on
 *
 * For a valid @next_cpu, the entry's last_qtail is set to that CPU's
 * current input_queue_head.  With CONFIG_RFS_ACCEL the driver is also
 * asked to steer the flow to the hardware queue associated with
 * @next_cpu; on success the entry of that queue's flow table is used
 * (and returned) instead of @rflow.
 *
 * Returns the flow entry that now describes the flow.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		/* A non-negative return value is the new filter id. */
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* The old entry no longer owns this filter id. */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
3519
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns the chosen CPU, or -1 if no steering applies (invalid rx queue,
 * RPS/RFS not configured for the queue, zero flow hash, or no online
 * target).  *rflowp is only written when the CPU comes from the flow
 * tables (RFS), not when it comes from the static RPS map.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* Bits outside rps_cpu_mask must equal the hash bits. */
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	/* Fall back to the static RPS map, spreading flows by hash. */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
3619
#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 *
 * A filter is kept (%false) only while the flow entry still refers to
 * @filter_id, has a valid CPU, and that CPU's input queue head has moved
 * less than 10 * mask entries past the flow's last enqueue position.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *table;
	struct rps_dev_flow *entry;
	unsigned int owner;
	bool expire = true;
	int progress;

	rcu_read_lock();

	table = rcu_dereference(rxqueue->rps_flow_table);
	if (!table || flow_id > table->mask)
		goto unlock;

	entry = &table->flows[flow_id];
	/* Snapshot the CPU once; it may change underneath us. */
	owner = ACCESS_ONCE(entry->cpu);
	if (entry->filter != filter_id || owner >= nr_cpu_ids)
		goto unlock;

	progress = (int)(per_cpu(softnet_data, owner).input_queue_head -
			 entry->last_qtail);
	if (progress < (int)(10 * table->mask))
		expire = false;

unlock:
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */
3659
/* Called from hardirq (IPI) context.
 *
 * @data is the softnet_data whose backlog NAPI should be scheduled; the
 * received_rps counter of that structure is bumped as well.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
3668
3669#endif /* CONFIG_RPS */
3670
 
 
 
 
 
 
 
 
 
/*
 * rps_ipi_queued - defer waking a remote CPU's backlog
 * @sd: softnet_data the packet was queued on
 *
 * When @sd belongs to another CPU, it is pushed onto this CPU's
 * rps_ipi_list, NET_RX_SOFTIRQ is raised locally, and 1 is returned.
 * When @sd is our own, or RPS is compiled out, 0 is returned and the
 * caller has to schedule the backlog itself.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = this_cpu_ptr(&softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push onto the head of our singly linked IPI list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3691
#ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): not read in this part of the file; presumably the bucket
 * count used when a flow limit table is allocated - confirm at its users.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

/*
 * skb_flow_limit - decide whether @skb's flow is hogging the backlog
 * @skb:  packet about to be queued
 * @qlen: current length of the input queue
 *
 * Only active once the queue is at least half of netdev_max_backlog and a
 * flow limit table is installed on this CPU.  The table keeps a sliding
 * history of the last FLOW_LIMIT_HISTORY flow buckets; a packet is
 * rejected (true) when its bucket accounts for more than half of that
 * history.  Always false without CONFIG_NET_FLOW_LIMIT.
 */
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;
	struct softnet_data *sd;
	unsigned int old_flow, new_flow;

	if (qlen < (netdev_max_backlog >> 1))
		return false;

	sd = this_cpu_ptr(&softnet_data);

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl) {
		/* Replace the oldest history slot with this packet's bucket. */
		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
		old_flow = fl->history[fl->history_head];
		fl->history[fl->history_head] = new_flow;

		/* Advance the ring index with a power-of-two mask. */
		fl->history_head++;
		fl->history_head &= FLOW_LIMIT_HISTORY - 1;

		if (likely(fl->buckets[old_flow]))
			fl->buckets[old_flow]--;

		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
			fl->count++;
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
#endif
	return false;
}
3731
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * @skb:   packet to queue
 * @cpu:   CPU whose softnet_data backlog receives the packet
 * @qtail: passed to input_queue_tail_incr_save() when the packet is queued
 *
 * Returns NET_RX_SUCCESS if the packet was queued.  The packet is freed
 * and NET_RX_DROP returned if the device is not running, the queue is
 * longer than netdev_max_backlog, or the flow limiter rejects it.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		/* Non-empty queue: just append.  The empty-queue path
		 * below jumps back to the "enqueue" label.
		 */
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			/* Remote queues are handled through the IPI list. */
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
 
3781
/*
 * netif_rx_internal - common body of netif_rx() and netif_rx_ni()
 * @skb: received packet
 *
 * Queues @skb on a per-CPU backlog: the CPU picked by get_rps_cpu() when
 * RPS is enabled and yields one, otherwise the current CPU.  Returns the
 * verdict of enqueue_to_backlog().
 */
static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		/* voidflow absorbs the qtail update when get_rps_cpu()
		 * does not hand back a real flow entry.
		 */
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		/* qtail is required by the callee but unused here. */
		unsigned int qtail;
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
3814
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  The buffer is always
 *	consumed; it may be dropped during processing for congestion control
 *	or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
3837
/*
 * netif_rx_ni - netif_rx() variant that also runs pending softirqs
 * @skb: buffer to post
 *
 * Queues the packet like netif_rx() and then, with preemption still
 * disabled, runs do_softirq() if any softirq is pending.
 * Returns the verdict of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);
3853
/*
 * net_tx_action - transmit softirq handler
 * @h: softirq descriptor (not referenced in the body)
 *
 * Does two jobs for the current CPU:
 *  1. frees the skbs parked on sd->completion_queue;
 *  2. runs every qdisc parked on sd->output_queue.
 * Both lists are detached with interrupts disabled and then processed
 * with interrupts enabled.
 */
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list atomically w.r.t. interrupts. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			clist = clist->next;

			/* Only skbs with no remaining users belong here. */
			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);

			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
				__kfree_skb(skb);
			else
				__kfree_skb_defer(skb);
		}

		__kfree_skb_flush();
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the list and reset the tail pointer to empty. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			spin_lock(root_lock);
			/* We need to make sure head->next_sched is read
			 * before clearing __QDISC_STATE_SCHED
			 */
			smp_mb__before_atomic();
			clear_bit(__QDISC_STATE_SCHED, &q->state);
			qdisc_run(q);
			spin_unlock(root_lock);
		}
	}
}
3912
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It is only a function pointer; whoever provides the implementation
 * assigns it elsewhere.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3919
/*
 * sch_handle_ingress - run the ingress classifier chain on @skb
 * @skb:      received packet
 * @pt_prev:  pending packet handler; delivered to and cleared before
 *            classification so taps see the unmodified packet
 * @ret:      receives the result of that pending delivery
 * @orig_dev: device the packet originally arrived on
 *
 * Returns @skb if receive processing should continue, or NULL if the
 * classifier dropped, consumed or redirected the packet.  Without
 * CONFIG_NET_CLS_ACT the packet is returned untouched.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		/* Record the minor class id and keep processing. */
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3971
/**
 *	netdev_is_rx_handler_busy - check if receive handler is registered
 *	@dev: device to check
 *
 *	Check if a receive handler is already registered for a given device.
 *	Return true if there is one; a NULL @dev yields false.
 *
 *	The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3987
3988/**
3989 *	netdev_rx_handler_register - register receive handler
3990 *	@dev: device to register a handler for
3991 *	@rx_handler: receive handler to register
3992 *	@rx_handler_data: data pointer that is used by rx handler
3993 *
3994 *	Register a receive handler for a device. This handler will then be
3995 *	called from __netif_receive_skb. A negative errno code is returned
3996 *	on a failure.
3997 *
3998 *	The caller must hold the rtnl_mutex.
3999 *
4000 *	For a general description of rx_handler, see enum rx_handler_result.
4001 */
4002int netdev_rx_handler_register(struct net_device *dev,
4003			       rx_handler_func_t *rx_handler,
4004			       void *rx_handler_data)
4005{
4006	ASSERT_RTNL();
 
4007
4008	if (dev->rx_handler)
4009		return -EBUSY;
4010
4011	/* Note: rx_handler_data must be set before rx_handler */
4012	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4013	rcu_assign_pointer(dev->rx_handler, rx_handler);
4014
4015	return 0;
4016}
4017EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4018
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.  The handler pointer is
 *	cleared first and the data pointer only after synchronize_net().
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4040
4041/*
4042 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4043 * the special handling of PFMEMALLOC skbs.
4044 */
4045static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4046{
4047	switch (skb->protocol) {
4048	case htons(ETH_P_ARP):
4049	case htons(ETH_P_IP):
4050	case htons(ETH_P_IPV6):
4051	case htons(ETH_P_8021Q):
4052	case htons(ETH_P_8021AD):
4053		return true;
4054	default:
4055		return false;
4056	}
4057}
4058
/*
 * nf_ingress - run the netfilter ingress hook on @skb, if one is active
 * @skb:      received packet
 * @pt_prev:  pending packet handler; delivered to and cleared before the
 *            hook runs
 * @ret:      receives the result of that pending delivery
 * @orig_dev: device the packet originally arrived on
 *
 * Returns the hook's verdict, or 0 when no ingress hook is active or
 * CONFIG_NETFILTER_INGRESS is disabled.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	/* Flush the deferred handler before the hook sees the packet. */
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	verdict = nf_hook_ingress(skb);
	rcu_read_unlock();

	return verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4079
4080static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 
4081{
4082	struct packet_type *ptype, *pt_prev;
4083	rx_handler_func_t *rx_handler;
 
4084	struct net_device *orig_dev;
4085	bool deliver_exact = false;
4086	int ret = NET_RX_DROP;
4087	__be16 type;
4088
4089	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4090
4091	trace_netif_receive_skb(skb);
4092
4093	orig_dev = skb->dev;
4094
4095	skb_reset_network_header(skb);
4096	if (!skb_transport_header_was_set(skb))
4097		skb_reset_transport_header(skb);
4098	skb_reset_mac_len(skb);
4099
4100	pt_prev = NULL;
4101
4102another_round:
4103	skb->skb_iif = skb->dev->ifindex;
4104
4105	__this_cpu_inc(softnet_data.processed);
4106
4107	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4108	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 
 
 
 
 
 
 
 
 
 
 
 
 
4109		skb = skb_vlan_untag(skb);
4110		if (unlikely(!skb))
4111			goto out;
4112	}
4113
4114#ifdef CONFIG_NET_CLS_ACT
4115	if (skb->tc_verd & TC_NCLS) {
4116		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4117		goto ncls;
4118	}
4119#endif
4120
4121	if (pfmemalloc)
4122		goto skip_taps;
4123
4124	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4125		if (pt_prev)
4126			ret = deliver_skb(skb, pt_prev, orig_dev);
4127		pt_prev = ptype;
4128	}
4129
4130	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4131		if (pt_prev)
4132			ret = deliver_skb(skb, pt_prev, orig_dev);
4133		pt_prev = ptype;
4134	}
4135
4136skip_taps:
4137#ifdef CONFIG_NET_INGRESS
4138	if (static_key_false(&ingress_needed)) {
4139		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 
 
 
 
 
 
4140		if (!skb)
4141			goto out;
4142
 
4143		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4144			goto out;
4145	}
4146#endif
4147#ifdef CONFIG_NET_CLS_ACT
4148	skb->tc_verd = 0;
4149ncls:
4150#endif
4151	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4152		goto drop;
4153
4154	if (skb_vlan_tag_present(skb)) {
4155		if (pt_prev) {
4156			ret = deliver_skb(skb, pt_prev, orig_dev);
4157			pt_prev = NULL;
4158		}
4159		if (vlan_do_receive(&skb))
4160			goto another_round;
4161		else if (unlikely(!skb))
4162			goto out;
4163	}
4164
4165	rx_handler = rcu_dereference(skb->dev->rx_handler);
4166	if (rx_handler) {
4167		if (pt_prev) {
4168			ret = deliver_skb(skb, pt_prev, orig_dev);
4169			pt_prev = NULL;
4170		}
4171		switch (rx_handler(&skb)) {
4172		case RX_HANDLER_CONSUMED:
4173			ret = NET_RX_SUCCESS;
4174			goto out;
4175		case RX_HANDLER_ANOTHER:
4176			goto another_round;
4177		case RX_HANDLER_EXACT:
4178			deliver_exact = true;
 
4179		case RX_HANDLER_PASS:
4180			break;
4181		default:
4182			BUG();
4183		}
4184	}
4185
4186	if (unlikely(skb_vlan_tag_present(skb))) {
4187		if (skb_vlan_tag_get_id(skb))
 
 
 
 
4188			skb->pkt_type = PACKET_OTHERHOST;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4189		/* Note: we might in the future use prio bits
4190		 * and set skb->priority like in vlan_do_receive()
4191		 * For the time being, just ignore Priority Code Point
4192		 */
4193		skb->vlan_tci = 0;
4194	}
4195
4196	type = skb->protocol;
4197
4198	/* deliver only exact match when indicated */
4199	if (likely(!deliver_exact)) {
4200		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4201				       &ptype_base[ntohs(type) &
4202						   PTYPE_HASH_MASK]);
4203	}
4204
4205	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4206			       &orig_dev->ptype_specific);
4207
4208	if (unlikely(skb->dev != orig_dev)) {
4209		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4210				       &skb->dev->ptype_specific);
4211	}
4212
4213	if (pt_prev) {
4214		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4215			goto drop;
4216		else
4217			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4218	} else {
4219drop:
4220		if (!deliver_exact)
4221			atomic_long_inc(&skb->dev->rx_dropped);
4222		else
4223			atomic_long_inc(&skb->dev->rx_nohandler);
4224		kfree_skb(skb);
4225		/* Jamal, now you will not able to escape explaining
4226		 * me how you were going to use this. :-)
4227		 */
4228		ret = NET_RX_DROP;
4229	}
4230
4231out:
 
 
 
 
 
 
 
4232	return ret;
4233}
4234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4235static int __netif_receive_skb(struct sk_buff *skb)
4236{
4237	int ret;
4238
4239	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4240		unsigned long pflags = current->flags;
4241
4242		/*
4243		 * PFMEMALLOC skbs are special, they should
4244		 * - be delivered to SOCK_MEMALLOC sockets only
4245		 * - stay away from userspace
4246		 * - have bounded memory usage
4247		 *
4248		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4249		 * context down to all allocation sites.
4250		 */
4251		current->flags |= PF_MEMALLOC;
4252		ret = __netif_receive_skb_core(skb, true);
4253		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4254	} else
4255		ret = __netif_receive_skb_core(skb, false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4256
4257	return ret;
4258}
4259
4260static int netif_receive_skb_internal(struct sk_buff *skb)
4261{
4262	int ret;
4263
4264	net_timestamp_check(netdev_tstamp_prequeue, skb);
4265
4266	if (skb_defer_rx_timestamp(skb))
4267		return NET_RX_SUCCESS;
4268
4269	rcu_read_lock();
4270
4271#ifdef CONFIG_RPS
4272	if (static_key_false(&rps_needed)) {
4273		struct rps_dev_flow voidflow, *rflow = &voidflow;
4274		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4275
4276		if (cpu >= 0) {
4277			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4278			rcu_read_unlock();
4279			return ret;
4280		}
4281	}
4282#endif
4283	ret = __netif_receive_skb(skb);
4284	rcu_read_unlock();
4285	return ret;
4286}
4287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for received data. It always succeeds; the
 *	buffer may still be dropped during processing, either for
 *	congestion control or by the protocol layers.
 *
 *	Must only be called from softirq context, with interrupts
 *	enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
4310
4311DEFINE_PER_CPU(struct work_struct, flush_works);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4312
4313/* Network device is going away, flush any packets still pending */
4314static void flush_backlog(struct work_struct *work)
4315{
4316	struct sk_buff *skb, *tmp;
4317	struct softnet_data *sd;
4318
4319	local_bh_disable();
4320	sd = this_cpu_ptr(&softnet_data);
4321
4322	local_irq_disable();
4323	rps_lock(sd);
4324	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4325		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4326			__skb_unlink(skb, &sd->input_pkt_queue);
4327			kfree_skb(skb);
4328			input_queue_head_incr(sd);
4329		}
4330	}
4331	rps_unlock(sd);
4332	local_irq_enable();
4333
 
4334	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4335		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4336			__skb_unlink(skb, &sd->process_queue);
4337			kfree_skb(skb);
4338			input_queue_head_incr(sd);
4339		}
4340	}
 
4341	local_bh_enable();
4342}
4343
4344static void flush_all_backlogs(void)
4345{
4346	unsigned int cpu;
 
 
4347
4348	get_online_cpus();
4349
4350	for_each_online_cpu(cpu)
4351		queue_work_on(cpu, system_highpri_wq,
4352			      per_cpu_ptr(&flush_works, cpu));
 
 
 
4353
4354	for_each_online_cpu(cpu)
4355		flush_work(per_cpu_ptr(&flush_works, cpu));
4356
4357	put_online_cpus();
 
 
 
 
4358}
4359
4360static int napi_gro_complete(struct sk_buff *skb)
4361{
4362	struct packet_offload *ptype;
4363	__be16 type = skb->protocol;
4364	struct list_head *head = &offload_base;
4365	int err = -ENOENT;
4366
4367	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4368
4369	if (NAPI_GRO_CB(skb)->count == 1) {
4370		skb_shinfo(skb)->gso_size = 0;
4371		goto out;
4372	}
4373
4374	rcu_read_lock();
4375	list_for_each_entry_rcu(ptype, head, list) {
4376		if (ptype->type != type || !ptype->callbacks.gro_complete)
4377			continue;
4378
4379		err = ptype->callbacks.gro_complete(skb, 0);
4380		break;
4381	}
4382	rcu_read_unlock();
4383
4384	if (err) {
4385		WARN_ON(&ptype->list == head);
4386		kfree_skb(skb);
4387		return NET_RX_SUCCESS;
4388	}
4389
4390out:
4391	return netif_receive_skb_internal(skb);
4392}
4393
4394/* napi->gro_list contains packets ordered by age.
4395 * youngest packets at the head of it.
4396 * Complete skbs in reverse order to reduce latencies.
4397 */
4398void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4399{
4400	struct sk_buff *skb, *prev = NULL;
4401
4402	/* scan list and build reverse chain */
4403	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4404		skb->prev = prev;
4405		prev = skb;
4406	}
4407
4408	for (skb = prev; skb; skb = prev) {
4409		skb->next = NULL;
4410
4411		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4412			return;
4413
4414		prev = skb->prev;
4415		napi_gro_complete(skb);
4416		napi->gro_count--;
4417	}
4418
4419	napi->gro_list = NULL;
4420}
4421EXPORT_SYMBOL(napi_gro_flush);
4422
4423static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4424{
4425	struct sk_buff *p;
4426	unsigned int maclen = skb->dev->hard_header_len;
4427	u32 hash = skb_get_hash_raw(skb);
4428
4429	for (p = napi->gro_list; p; p = p->next) {
4430		unsigned long diffs;
4431
4432		NAPI_GRO_CB(p)->flush = 0;
4433
4434		if (hash != skb_get_hash_raw(p)) {
4435			NAPI_GRO_CB(p)->same_flow = 0;
4436			continue;
4437		}
4438
4439		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4440		diffs |= p->vlan_tci ^ skb->vlan_tci;
4441		diffs |= skb_metadata_dst_cmp(p, skb);
4442		if (maclen == ETH_HLEN)
4443			diffs |= compare_ether_header(skb_mac_header(p),
4444						      skb_mac_header(skb));
4445		else if (!diffs)
4446			diffs = memcmp(skb_mac_header(p),
4447				       skb_mac_header(skb),
4448				       maclen);
4449		NAPI_GRO_CB(p)->same_flow = !diffs;
4450	}
4451}
4452
4453static void skb_gro_reset_offset(struct sk_buff *skb)
4454{
4455	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4456	const skb_frag_t *frag0 = &pinfo->frags[0];
4457
4458	NAPI_GRO_CB(skb)->data_offset = 0;
4459	NAPI_GRO_CB(skb)->frag0 = NULL;
4460	NAPI_GRO_CB(skb)->frag0_len = 0;
4461
4462	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4463	    pinfo->nr_frags &&
4464	    !PageHighMem(skb_frag_page(frag0))) {
4465		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4466		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4467						    skb_frag_size(frag0),
4468						    skb->end - skb->tail);
4469	}
4470}
4471
4472static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4473{
4474	struct skb_shared_info *pinfo = skb_shinfo(skb);
4475
4476	BUG_ON(skb->end - skb->tail < grow);
4477
4478	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4479
4480	skb->data_len -= grow;
4481	skb->tail += grow;
4482
4483	pinfo->frags[0].page_offset += grow;
4484	skb_frag_size_sub(&pinfo->frags[0], grow);
4485
4486	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4487		skb_frag_unref(skb, 0);
4488		memmove(pinfo->frags, pinfo->frags + 1,
4489			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4490	}
4491}
4492
4493static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4494{
4495	struct sk_buff **pp = NULL;
4496	struct packet_offload *ptype;
4497	__be16 type = skb->protocol;
4498	struct list_head *head = &offload_base;
4499	int same_flow;
4500	enum gro_result ret;
4501	int grow;
4502
4503	if (!(skb->dev->features & NETIF_F_GRO))
4504		goto normal;
4505
4506	if (skb->csum_bad)
4507		goto normal;
4508
4509	gro_list_prepare(napi, skb);
4510
4511	rcu_read_lock();
4512	list_for_each_entry_rcu(ptype, head, list) {
4513		if (ptype->type != type || !ptype->callbacks.gro_receive)
4514			continue;
4515
4516		skb_set_network_header(skb, skb_gro_offset(skb));
4517		skb_reset_mac_len(skb);
4518		NAPI_GRO_CB(skb)->same_flow = 0;
4519		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4520		NAPI_GRO_CB(skb)->free = 0;
4521		NAPI_GRO_CB(skb)->encap_mark = 0;
4522		NAPI_GRO_CB(skb)->recursion_counter = 0;
4523		NAPI_GRO_CB(skb)->is_fou = 0;
4524		NAPI_GRO_CB(skb)->is_atomic = 1;
4525		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4526
4527		/* Setup for GRO checksum validation */
4528		switch (skb->ip_summed) {
4529		case CHECKSUM_COMPLETE:
4530			NAPI_GRO_CB(skb)->csum = skb->csum;
4531			NAPI_GRO_CB(skb)->csum_valid = 1;
4532			NAPI_GRO_CB(skb)->csum_cnt = 0;
4533			break;
4534		case CHECKSUM_UNNECESSARY:
4535			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4536			NAPI_GRO_CB(skb)->csum_valid = 0;
4537			break;
4538		default:
4539			NAPI_GRO_CB(skb)->csum_cnt = 0;
4540			NAPI_GRO_CB(skb)->csum_valid = 0;
4541		}
4542
4543		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4544		break;
4545	}
4546	rcu_read_unlock();
4547
4548	if (&ptype->list == head)
4549		goto normal;
4550
4551	same_flow = NAPI_GRO_CB(skb)->same_flow;
4552	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4553
4554	if (pp) {
4555		struct sk_buff *nskb = *pp;
4556
4557		*pp = nskb->next;
4558		nskb->next = NULL;
4559		napi_gro_complete(nskb);
4560		napi->gro_count--;
4561	}
4562
4563	if (same_flow)
4564		goto ok;
4565
4566	if (NAPI_GRO_CB(skb)->flush)
4567		goto normal;
4568
4569	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4570		struct sk_buff *nskb = napi->gro_list;
4571
4572		/* locate the end of the list to select the 'oldest' flow */
4573		while (nskb->next) {
4574			pp = &nskb->next;
4575			nskb = *pp;
4576		}
4577		*pp = NULL;
4578		nskb->next = NULL;
4579		napi_gro_complete(nskb);
4580	} else {
4581		napi->gro_count++;
4582	}
4583	NAPI_GRO_CB(skb)->count = 1;
4584	NAPI_GRO_CB(skb)->age = jiffies;
4585	NAPI_GRO_CB(skb)->last = skb;
4586	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4587	skb->next = napi->gro_list;
4588	napi->gro_list = skb;
4589	ret = GRO_HELD;
4590
4591pull:
4592	grow = skb_gro_offset(skb) - skb_headlen(skb);
4593	if (grow > 0)
4594		gro_pull_from_frag0(skb, grow);
4595ok:
4596	return ret;
4597
4598normal:
4599	ret = GRO_NORMAL;
4600	goto pull;
4601}
4602
4603struct packet_offload *gro_find_receive_by_type(__be16 type)
4604{
4605	struct list_head *offload_head = &offload_base;
4606	struct packet_offload *ptype;
4607
4608	list_for_each_entry_rcu(ptype, offload_head, list) {
4609		if (ptype->type != type || !ptype->callbacks.gro_receive)
4610			continue;
4611		return ptype;
4612	}
4613	return NULL;
4614}
4615EXPORT_SYMBOL(gro_find_receive_by_type);
4616
4617struct packet_offload *gro_find_complete_by_type(__be16 type)
4618{
4619	struct list_head *offload_head = &offload_base;
4620	struct packet_offload *ptype;
4621
4622	list_for_each_entry_rcu(ptype, offload_head, list) {
4623		if (ptype->type != type || !ptype->callbacks.gro_complete)
4624			continue;
4625		return ptype;
4626	}
4627	return NULL;
4628}
4629EXPORT_SYMBOL(gro_find_complete_by_type);
4630
4631static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4632{
4633	switch (ret) {
4634	case GRO_NORMAL:
4635		if (netif_receive_skb_internal(skb))
4636			ret = GRO_DROP;
4637		break;
4638
4639	case GRO_DROP:
4640		kfree_skb(skb);
4641		break;
4642
4643	case GRO_MERGED_FREE:
4644		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4645			skb_dst_drop(skb);
4646			kmem_cache_free(skbuff_head_cache, skb);
4647		} else {
4648			__kfree_skb(skb);
4649		}
4650		break;
4651
4652	case GRO_HELD:
4653	case GRO_MERGED:
4654		break;
4655	}
4656
4657	return ret;
4658}
4659
4660gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4661{
4662	skb_mark_napi_id(skb, napi);
4663	trace_napi_gro_receive_entry(skb);
4664
4665	skb_gro_reset_offset(skb);
4666
4667	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4668}
4669EXPORT_SYMBOL(napi_gro_receive);
4670
4671static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4672{
4673	if (unlikely(skb->pfmemalloc)) {
4674		consume_skb(skb);
4675		return;
4676	}
4677	__skb_pull(skb, skb_headlen(skb));
4678	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4679	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4680	skb->vlan_tci = 0;
4681	skb->dev = napi->dev;
4682	skb->skb_iif = 0;
4683	skb->encapsulation = 0;
4684	skb_shinfo(skb)->gso_type = 0;
4685	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4686
4687	napi->skb = skb;
4688}
4689
4690struct sk_buff *napi_get_frags(struct napi_struct *napi)
4691{
4692	struct sk_buff *skb = napi->skb;
4693
4694	if (!skb) {
4695		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4696		if (skb) {
4697			napi->skb = skb;
4698			skb_mark_napi_id(skb, napi);
4699		}
4700	}
4701	return skb;
4702}
4703EXPORT_SYMBOL(napi_get_frags);
4704
4705static gro_result_t napi_frags_finish(struct napi_struct *napi,
4706				      struct sk_buff *skb,
4707				      gro_result_t ret)
4708{
4709	switch (ret) {
4710	case GRO_NORMAL:
4711	case GRO_HELD:
4712		__skb_push(skb, ETH_HLEN);
4713		skb->protocol = eth_type_trans(skb, skb->dev);
4714		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4715			ret = GRO_DROP;
4716		break;
4717
4718	case GRO_DROP:
4719	case GRO_MERGED_FREE:
4720		napi_reuse_skb(napi, skb);
4721		break;
4722
4723	case GRO_MERGED:
4724		break;
4725	}
4726
4727	return ret;
4728}
4729
4730/* Upper GRO stack assumes network header starts at gro_offset=0
4731 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4732 * We copy ethernet header into skb->data to have a common layout.
4733 */
4734static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4735{
4736	struct sk_buff *skb = napi->skb;
4737	const struct ethhdr *eth;
4738	unsigned int hlen = sizeof(*eth);
4739
4740	napi->skb = NULL;
4741
4742	skb_reset_mac_header(skb);
4743	skb_gro_reset_offset(skb);
4744
4745	eth = skb_gro_header_fast(skb, 0);
4746	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4747		eth = skb_gro_header_slow(skb, hlen, 0);
4748		if (unlikely(!eth)) {
4749			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4750					     __func__, napi->dev->name);
4751			napi_reuse_skb(napi, skb);
4752			return NULL;
4753		}
4754	} else {
4755		gro_pull_from_frag0(skb, hlen);
4756		NAPI_GRO_CB(skb)->frag0 += hlen;
4757		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4758	}
4759	__skb_pull(skb, hlen);
4760
4761	/*
4762	 * This works because the only protocols we care about don't require
4763	 * special handling.
4764	 * We'll fix it up properly in napi_frags_finish()
4765	 */
4766	skb->protocol = eth->h_proto;
 
4767
4768	return skb;
4769}
4770
4771gro_result_t napi_gro_frags(struct napi_struct *napi)
4772{
4773	struct sk_buff *skb = napi_frags_skb(napi);
4774
4775	if (!skb)
4776		return GRO_DROP;
4777
4778	trace_napi_gro_frags_entry(skb);
4779
4780	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4781}
4782EXPORT_SYMBOL(napi_gro_frags);
4783
4784/* Compute the checksum from gro_offset and return the folded value
4785 * after adding in any pseudo checksum.
4786 */
4787__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4788{
4789	__wsum wsum;
4790	__sum16 sum;
4791
4792	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4793
4794	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4795	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4796	if (likely(!sum)) {
4797		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4798		    !skb->csum_complete_sw)
4799			netdev_rx_csum_fault(skb->dev);
4800	}
4801
4802	NAPI_GRO_CB(skb)->csum = wsum;
4803	NAPI_GRO_CB(skb)->csum_valid = 1;
4804
4805	return sum;
4806}
4807EXPORT_SYMBOL(__skb_gro_checksum_complete);
4808
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	/* Detach the list before interrupts are turned back on. */
	if (pending)
		sd->rps_ipi_list = NULL;
#endif
	local_irq_enable();

#ifdef CONFIG_RPS
	/* Send pending IPIs to kick RPS processing on remote cpus. */
	while (pending) {
		struct softnet_data *next = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
		pending = next;
	}
#endif
}
4836
4837static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4838{
4839#ifdef CONFIG_RPS
4840	return sd->rps_ipi_list != NULL;
4841#else
4842	return false;
4843#endif
4844}
4845
4846static int process_backlog(struct napi_struct *napi, int quota)
4847{
4848	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4849	bool again = true;
4850	int work = 0;
4851
4852	/* Check if we have pending ipi, its better to send them now,
4853	 * not waiting net_rx_action() end.
4854	 */
4855	if (sd_has_rps_ipi_waiting(sd)) {
4856		local_irq_disable();
4857		net_rps_action_and_irq_enable(sd);
4858	}
4859
4860	napi->weight = weight_p;
4861	while (again) {
4862		struct sk_buff *skb;
4863
 
4864		while ((skb = __skb_dequeue(&sd->process_queue))) {
 
4865			rcu_read_lock();
4866			__netif_receive_skb(skb);
4867			rcu_read_unlock();
4868			input_queue_head_incr(sd);
4869			if (++work >= quota)
4870				return work;
 
4871
 
4872		}
 
4873
4874		local_irq_disable();
4875		rps_lock(sd);
4876		if (skb_queue_empty(&sd->input_pkt_queue)) {
4877			/*
4878			 * Inline a custom version of __napi_complete().
4879			 * only current cpu owns and manipulates this napi,
4880			 * and NAPI_STATE_SCHED is the only possible flag set
4881			 * on backlog.
4882			 * We can use a plain write instead of clear_bit(),
4883			 * and we dont need an smp_mb() memory barrier.
4884			 */
4885			napi->state = 0;
4886			again = false;
4887		} else {
 
4888			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4889						   &sd->process_queue);
 
4890		}
4891		rps_unlock(sd);
4892		local_irq_enable();
4893	}
4894
 
 
4895	return work;
4896}
4897
4898/**
4899 * __napi_schedule - schedule for receive
4900 * @n: entry to schedule
4901 *
4902 * The entry's receive function will be scheduled to run.
4903 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4904 */
4905void __napi_schedule(struct napi_struct *n)
4906{
4907	unsigned long flags;
4908
4909	local_irq_save(flags);
4910	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4911	local_irq_restore(flags);
4912}
4913EXPORT_SYMBOL(__napi_schedule);
4914
4915/**
4916 *	napi_schedule_prep - check if napi can be scheduled
4917 *	@n: napi context
4918 *
4919 * Test if NAPI routine is already running, and if not mark
4920 * it as running.  This is used as a condition variable
4921 * insure only one NAPI poll instance runs.  We also make
4922 * sure there is no pending NAPI disable.
4923 */
4924bool napi_schedule_prep(struct napi_struct *n)
4925{
4926	unsigned long val, new;
4927
4928	do {
4929		val = READ_ONCE(n->state);
4930		if (unlikely(val & NAPIF_STATE_DISABLE))
4931			return false;
4932		new = val | NAPIF_STATE_SCHED;
4933
4934		/* Sets STATE_MISSED bit if STATE_SCHED was already set
4935		 * This was suggested by Alexander Duyck, as compiler
4936		 * emits better code than :
4937		 * if (val & NAPIF_STATE_SCHED)
4938		 *     new |= NAPIF_STATE_MISSED;
4939		 */
4940		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4941						   NAPIF_STATE_MISSED;
4942	} while (cmpxchg(&n->state, val, new) != val);
4943
4944	return !(val & NAPIF_STATE_SCHED);
4945}
4946EXPORT_SYMBOL(napi_schedule_prep);
4947
4948/**
4949 * __napi_schedule_irqoff - schedule for receive
4950 * @n: entry to schedule
4951 *
4952 * Variant of __napi_schedule() assuming hard irqs are masked
 
 
 
 
4953 */
4954void __napi_schedule_irqoff(struct napi_struct *n)
4955{
4956	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 
 
 
4957}
4958EXPORT_SYMBOL(__napi_schedule_irqoff);
4959
4960bool __napi_complete(struct napi_struct *n)
4961{
4962	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4963
4964	/* Some drivers call us directly, instead of calling
4965	 * napi_complete_done().
4966	 */
4967	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4968		return false;
4969
4970	list_del_init(&n->poll_list);
4971	smp_mb__before_atomic();
4972	clear_bit(NAPI_STATE_SCHED, &n->state);
4973	return true;
4974}
4975EXPORT_SYMBOL(__napi_complete);
4976
4977bool napi_complete_done(struct napi_struct *n, int work_done)
4978{
4979	unsigned long flags, val, new;
 
4980
4981	/*
4982	 * 1) Don't let napi dequeue from the cpu poll list
4983	 *    just in case its running on a different cpu.
4984	 * 2) If we are busy polling, do nothing here, we have
4985	 *    the guarantee we will be called later.
4986	 */
4987	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4988				 NAPIF_STATE_IN_BUSY_POLL)))
4989		return false;
4990
4991	if (n->gro_list) {
4992		unsigned long timeout = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4993
4994		if (work_done)
4995			timeout = n->dev->gro_flush_timeout;
4996
4997		if (timeout)
4998			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4999				      HRTIMER_MODE_REL_PINNED);
5000		else
5001			napi_gro_flush(n, false);
5002	}
5003	if (unlikely(!list_empty(&n->poll_list))) {
5004		/* If n->poll_list is not empty, we need to mask irqs */
5005		local_irq_save(flags);
5006		list_del_init(&n->poll_list);
5007		local_irq_restore(flags);
5008	}
 
5009
 
5010	do {
5011		val = READ_ONCE(n->state);
5012
5013		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5014
5015		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
 
 
5016
5017		/* If STATE_MISSED was set, leave STATE_SCHED set,
5018		 * because we will call napi->poll() one more time.
5019		 * This C code was suggested by Alexander Duyck to help gcc.
5020		 */
5021		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5022						    NAPIF_STATE_SCHED;
5023	} while (cmpxchg(&n->state, val, new) != val);
5024
5025	if (unlikely(val & NAPIF_STATE_MISSED)) {
5026		__napi_schedule(n);
5027		return false;
5028	}
5029
5030	return true;
 
 
 
5031}
5032EXPORT_SYMBOL(napi_complete_done);
5033
5034/* must be called under rcu_read_lock(), as we dont take a reference */
5035static struct napi_struct *napi_by_id(unsigned int napi_id)
5036{
5037	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5038	struct napi_struct *napi;
 
 
 
5039
5040	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5041		if (napi->napi_id == napi_id)
5042			return napi;
 
 
5043
5044	return NULL;
 
 
 
 
5045}
5046
5047#if defined(CONFIG_NET_RX_BUSY_POLL)
5048
5049#define BUSY_POLL_BUDGET 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5050
5051static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 
5052{
 
 
 
5053	int rc;
5054
5055	/* Busy polling means there is a high chance device driver hard irq
5056	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5057	 * set in napi_schedule_prep().
5058	 * Since we are about to call napi->poll() once more, we can safely
5059	 * clear NAPI_STATE_MISSED.
5060	 *
5061	 * Note: x86 could use a single "lock and ..." instruction
5062	 * to perform these two clear_bit()
5063	 */
5064	clear_bit(NAPI_STATE_MISSED, &napi->state);
5065	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5066
5067	local_bh_disable();
 
 
 
 
 
 
 
 
 
 
5068
5069	/* All we really want here is to re-enable device interrupts.
5070	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5071	 */
5072	rc = napi->poll(napi, BUSY_POLL_BUDGET);
 
 
 
 
 
5073	netpoll_poll_unlock(have_poll_lock);
5074	if (rc == BUSY_POLL_BUDGET)
5075		__napi_schedule(napi);
 
5076	local_bh_enable();
5077	if (local_softirq_pending())
5078		do_softirq();
5079}
5080
5081bool sk_busy_loop(struct sock *sk, int nonblock)
 
 
5082{
5083	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5084	int (*napi_poll)(struct napi_struct *napi, int budget);
5085	int (*busy_poll)(struct napi_struct *dev);
5086	void *have_poll_lock = NULL;
5087	struct napi_struct *napi;
5088	int rc;
 
5089
5090restart:
5091	rc = false;
5092	napi_poll = NULL;
5093
5094	rcu_read_lock();
5095
5096	napi = napi_by_id(sk->sk_napi_id);
5097	if (!napi)
5098		goto out;
5099
5100	/* Note: ndo_busy_poll method is optional in linux-4.5 */
5101	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
 
5102
5103	preempt_disable();
5104	for (;;) {
5105		rc = 0;
5106		local_bh_disable();
5107		if (busy_poll) {
5108			rc = busy_poll(napi);
5109			goto count;
5110		}
5111		if (!napi_poll) {
5112			unsigned long val = READ_ONCE(napi->state);
5113
5114			/* If multiple threads are competing for this napi,
5115			 * we avoid dirtying napi->state as much as we can.
5116			 */
5117			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5118				   NAPIF_STATE_IN_BUSY_POLL))
 
 
5119				goto count;
 
5120			if (cmpxchg(&napi->state, val,
5121				    val | NAPIF_STATE_IN_BUSY_POLL |
5122					  NAPIF_STATE_SCHED) != val)
 
 
5123				goto count;
 
5124			have_poll_lock = netpoll_poll_lock(napi);
5125			napi_poll = napi->poll;
5126		}
5127		rc = napi_poll(napi, BUSY_POLL_BUDGET);
5128		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
 
5129count:
5130		if (rc > 0)
5131			__NET_ADD_STATS(sock_net(sk),
5132					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
 
 
5133		local_bh_enable();
5134
5135		if (rc == LL_FLUSH_FAILED)
5136			break; /* permanent failure */
5137
5138		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5139		    busy_loop_timeout(end_time))
5140			break;
5141
5142		if (unlikely(need_resched())) {
 
 
5143			if (napi_poll)
5144				busy_poll_stop(napi, have_poll_lock);
5145			preempt_enable();
 
5146			rcu_read_unlock();
5147			cond_resched();
5148			rc = !skb_queue_empty(&sk->sk_receive_queue);
5149			if (rc || busy_loop_timeout(end_time))
5150				return rc;
5151			goto restart;
5152		}
5153		cpu_relax();
5154	}
5155	if (napi_poll)
5156		busy_poll_stop(napi, have_poll_lock);
5157	preempt_enable();
5158	rc = !skb_queue_empty(&sk->sk_receive_queue);
5159out:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5160	rcu_read_unlock();
5161	return rc;
5162}
5163EXPORT_SYMBOL(sk_busy_loop);
5164
5165#endif /* CONFIG_NET_RX_BUSY_POLL */
5166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5167static void napi_hash_add(struct napi_struct *napi)
5168{
5169	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5170	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
 
5171		return;
5172
5173	spin_lock(&napi_hash_lock);
5174
5175	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5176	do {
5177		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5178			napi_gen_id = NR_CPUS + 1;
5179	} while (napi_by_id(napi_gen_id));
5180	napi->napi_id = napi_gen_id;
5181
5182	hlist_add_head_rcu(&napi->napi_hash_node,
5183			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5184
5185	spin_unlock(&napi_hash_lock);
5186}
5187
5188/* Warning : caller is responsible to make sure rcu grace period
5189 * is respected before freeing memory containing @napi
5190 */
5191bool napi_hash_del(struct napi_struct *napi)
5192{
5193	bool rcu_sync_needed = false;
5194
5195	spin_lock(&napi_hash_lock);
5196
5197	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5198		rcu_sync_needed = true;
5199		hlist_del_rcu(&napi->napi_hash_node);
5200	}
5201	spin_unlock(&napi_hash_lock);
5202	return rcu_sync_needed;
5203}
5204EXPORT_SYMBOL_GPL(napi_hash_del);
5205
5206static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5207{
5208	struct napi_struct *napi;
5209
5210	napi = container_of(timer, struct napi_struct, timer);
5211
5212	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
5213	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5214	 */
5215	if (napi->gro_list && !napi_disable_pending(napi) &&
5216	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 
5217		__napi_schedule_irqoff(napi);
 
5218
5219	return HRTIMER_NORESTART;
5220}
5221
5222void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5223		    int (*poll)(struct napi_struct *, int), int weight)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5224{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5225	INIT_LIST_HEAD(&napi->poll_list);
 
5226	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5227	napi->timer.function = napi_watchdog;
5228	napi->gro_count = 0;
5229	napi->gro_list = NULL;
5230	napi->skb = NULL;
 
 
5231	napi->poll = poll;
5232	if (weight > NAPI_POLL_WEIGHT)
5233		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5234			    weight, dev->name);
5235	napi->weight = weight;
5236	list_add(&napi->dev_list, &dev->napi_list);
5237	napi->dev = dev;
5238#ifdef CONFIG_NETPOLL
5239	napi->poll_owner = -1;
5240#endif
 
5241	set_bit(NAPI_STATE_SCHED, &napi->state);
5242	napi_hash_add(napi);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5243}
5244EXPORT_SYMBOL(netif_napi_add);
5245
5246void napi_disable(struct napi_struct *n)
5247{
 
 
5248	might_sleep();
5249	set_bit(NAPI_STATE_DISABLE, &n->state);
5250
5251	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5252		msleep(1);
5253	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5254		msleep(1);
 
 
 
 
 
 
5255
5256	hrtimer_cancel(&n->timer);
5257
 
 
 
 
 
5258	clear_bit(NAPI_STATE_DISABLE, &n->state);
5259}
5260EXPORT_SYMBOL(napi_disable);
5261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5262/* Must be called in process context */
5263void netif_napi_del(struct napi_struct *napi)
5264{
5265	might_sleep();
5266	if (napi_hash_del(napi))
5267		synchronize_net();
5268	list_del_init(&napi->dev_list);
 
 
 
 
 
5269	napi_free_frags(napi);
5270
5271	kfree_skb_list(napi->gro_list);
5272	napi->gro_list = NULL;
5273	napi->gro_count = 0;
 
 
 
 
5274}
5275EXPORT_SYMBOL(netif_napi_del);
5276
5277static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5278{
5279	void *have;
5280	int work, weight;
5281
5282	list_del_init(&n->poll_list);
5283
5284	have = netpoll_poll_lock(n);
5285
5286	weight = n->weight;
5287
5288	/* This NAPI_STATE_SCHED test is for avoiding a race
5289	 * with netpoll's poll_napi().  Only the entity which
5290	 * obtains the lock and sees NAPI_STATE_SCHED set will
5291	 * actually make the ->poll() call.  Therefore we avoid
5292	 * accidentally calling ->poll() when NAPI is not scheduled.
5293	 */
5294	work = 0;
5295	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5296		work = n->poll(n, weight);
5297		trace_napi_poll(n, work, weight);
 
 
5298	}
5299
5300	WARN_ON_ONCE(work > weight);
 
 
5301
5302	if (likely(work < weight))
5303		goto out_unlock;
5304
5305	/* Drivers must not modify the NAPI state if they
5306	 * consume the entire weight.  In such cases this code
5307	 * still "owns" the NAPI instance and therefore can
5308	 * move the instance around on the list at-will.
5309	 */
5310	if (unlikely(napi_disable_pending(n))) {
5311		napi_complete(n);
5312		goto out_unlock;
 
 
 
 
 
 
 
 
 
 
 
 
 
5313	}
5314
5315	if (n->gro_list) {
5316		/* flush too old packets
5317		 * If HZ < 1000, flush all packets.
5318		 */
5319		napi_gro_flush(n, HZ >= 1000);
5320	}
5321
 
 
5322	/* Some drivers may have called napi_schedule
5323	 * prior to exhausting their budget.
5324	 */
5325	if (unlikely(!list_empty(&n->poll_list))) {
5326		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5327			     n->dev ? n->dev->name : "backlog");
5328		goto out_unlock;
5329	}
5330
5331	list_add_tail(&n->poll_list, repoll);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5332
5333out_unlock:
5334	netpoll_poll_unlock(have);
5335
5336	return work;
5337}
5338
5339static __latent_entropy void net_rx_action(struct softirq_action *h)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5340{
5341	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5342	unsigned long time_limit = jiffies + 2;
5343	int budget = netdev_budget;
 
 
5344	LIST_HEAD(list);
5345	LIST_HEAD(repoll);
5346
 
 
 
5347	local_irq_disable();
5348	list_splice_init(&sd->poll_list, &list);
5349	local_irq_enable();
5350
5351	for (;;) {
5352		struct napi_struct *n;
5353
 
 
5354		if (list_empty(&list)) {
5355			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5356				goto out;
 
 
 
 
 
 
 
 
 
 
5357			break;
5358		}
5359
5360		n = list_first_entry(&list, struct napi_struct, poll_list);
5361		budget -= napi_poll(n, &repoll);
5362
5363		/* If softirq window is exhausted then punt.
5364		 * Allow this to run for 2 jiffies since which will allow
5365		 * an average latency of 1.5/HZ.
5366		 */
5367		if (unlikely(budget <= 0 ||
5368			     time_after_eq(jiffies, time_limit))) {
5369			sd->time_squeeze++;
5370			break;
5371		}
5372	}
5373
5374	local_irq_disable();
5375
5376	list_splice_tail_init(&sd->poll_list, &list);
5377	list_splice_tail(&repoll, &list);
5378	list_splice(&list, &sd->poll_list);
5379	if (!list_empty(&sd->poll_list))
5380		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 
5381
5382	net_rps_action_and_irq_enable(sd);
5383out:
5384	__kfree_skb_flush();
5385}
5386
5387struct netdev_adjacent {
5388	struct net_device *dev;
 
5389
5390	/* upper master flag, there can only be one master device per list */
5391	bool master;
5392
 
 
 
5393	/* counter for the number of times this device was added to us */
5394	u16 ref_nr;
5395
5396	/* private field for the users */
5397	void *private;
5398
5399	struct list_head list;
5400	struct rcu_head rcu;
5401};
5402
5403static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5404						 struct list_head *adj_list)
5405{
5406	struct netdev_adjacent *adj;
5407
5408	list_for_each_entry(adj, adj_list, list) {
5409		if (adj->dev == adj_dev)
5410			return adj;
5411	}
5412	return NULL;
5413}
5414
/*
 * Walker callback: @data is the device being searched for.
 * Returns non-zero when @upper_dev is that device.
 */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	struct net_device *wanted = data;

	return wanted == upper_dev;
}
5421
5422/**
5423 * netdev_has_upper_dev - Check if device is linked to an upper device
5424 * @dev: device
5425 * @upper_dev: upper device to check
5426 *
5427 * Find out if a device is linked to specified upper device and return true
5428 * in case it is. Note that this checks only immediate upper device,
5429 * not through a complete stack of devices. The caller must hold the RTNL lock.
5430 */
5431bool netdev_has_upper_dev(struct net_device *dev,
5432			  struct net_device *upper_dev)
5433{
 
 
 
 
5434	ASSERT_RTNL();
5435
5436	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5437					     upper_dev);
5438}
5439EXPORT_SYMBOL(netdev_has_upper_dev);
5440
5441/**
5442 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5443 * @dev: device
5444 * @upper_dev: upper device to check
5445 *
5446 * Find out if a device is linked to specified upper device and return true
5447 * in case it is. Note that this checks the entire upper device chain.
5448 * The caller must hold rcu lock.
5449 */
5450
5451bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5452				  struct net_device *upper_dev)
5453{
5454	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5455					       upper_dev);
 
 
 
 
5456}
5457EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5458
5459/**
5460 * netdev_has_any_upper_dev - Check if device is linked to some device
5461 * @dev: device
5462 *
5463 * Find out if a device is linked to an upper device and return true in case
5464 * it is. The caller must hold the RTNL lock.
5465 */
5466static bool netdev_has_any_upper_dev(struct net_device *dev)
5467{
5468	ASSERT_RTNL();
5469
5470	return !list_empty(&dev->adj_list.upper);
5471}
 
5472
5473/**
5474 * netdev_master_upper_dev_get - Get master upper device
5475 * @dev: device
5476 *
5477 * Find a master upper device and return pointer to it or NULL in case
5478 * it's not there. The caller must hold the RTNL lock.
5479 */
5480struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5481{
5482	struct netdev_adjacent *upper;
5483
5484	ASSERT_RTNL();
5485
5486	if (list_empty(&dev->adj_list.upper))
5487		return NULL;
5488
5489	upper = list_first_entry(&dev->adj_list.upper,
5490				 struct netdev_adjacent, list);
5491	if (likely(upper->master))
5492		return upper->dev;
5493	return NULL;
5494}
5495EXPORT_SYMBOL(netdev_master_upper_dev_get);
5496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5497/**
5498 * netdev_has_any_lower_dev - Check if device is linked to some device
5499 * @dev: device
5500 *
5501 * Find out if a device is linked to a lower device and return true in case
5502 * it is. The caller must hold the RTNL lock.
5503 */
5504static bool netdev_has_any_lower_dev(struct net_device *dev)
5505{
5506	ASSERT_RTNL();
5507
5508	return !list_empty(&dev->adj_list.lower);
5509}
5510
5511void *netdev_adjacent_get_private(struct list_head *adj_list)
5512{
5513	struct netdev_adjacent *adj;
5514
5515	adj = list_entry(adj_list, struct netdev_adjacent, list);
5516
5517	return adj->private;
5518}
5519EXPORT_SYMBOL(netdev_adjacent_get_private);
5520
5521/**
5522 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5523 * @dev: device
5524 * @iter: list_head ** of the current position
5525 *
5526 * Gets the next device from the dev's upper list, starting from iter
5527 * position. The caller must hold RCU read lock.
5528 */
5529struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5530						 struct list_head **iter)
5531{
5532	struct netdev_adjacent *upper;
5533
5534	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5535
5536	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5537
5538	if (&upper->list == &dev->adj_list.upper)
5539		return NULL;
5540
5541	*iter = &upper->list;
5542
5543	return upper->dev;
5544}
5545EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5546
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5547static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5548						    struct list_head **iter)
5549{
5550	struct netdev_adjacent *upper;
5551
5552	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5553
5554	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5555
5556	if (&upper->list == &dev->adj_list.upper)
5557		return NULL;
5558
5559	*iter = &upper->list;
5560
5561	return upper->dev;
5562}
5563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5564int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5565				  int (*fn)(struct net_device *dev,
5566					    void *data),
5567				  void *data)
5568{
5569	struct net_device *udev;
5570	struct list_head *iter;
5571	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5572
5573	for (iter = &dev->adj_list.upper,
5574	     udev = netdev_next_upper_dev_rcu(dev, &iter);
5575	     udev;
5576	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5577		/* first is the upper device itself */
5578		ret = fn(udev, data);
5579		if (ret)
5580			return ret;
5581
5582		/* then look at all of its upper devices */
5583		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5584		if (ret)
5585			return ret;
5586	}
5587
5588	return 0;
5589}
5590EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5592/**
5593 * netdev_lower_get_next_private - Get the next ->private from the
5594 *				   lower neighbour list
5595 * @dev: device
5596 * @iter: list_head ** of the current position
5597 *
5598 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5599 * list, starting from iter position. The caller must hold either hold the
5600 * RTNL lock or its own locking that guarantees that the neighbour lower
5601 * list will remain unchanged.
5602 */
5603void *netdev_lower_get_next_private(struct net_device *dev,
5604				    struct list_head **iter)
5605{
5606	struct netdev_adjacent *lower;
5607
5608	lower = list_entry(*iter, struct netdev_adjacent, list);
5609
5610	if (&lower->list == &dev->adj_list.lower)
5611		return NULL;
5612
5613	*iter = lower->list.next;
5614
5615	return lower->private;
5616}
5617EXPORT_SYMBOL(netdev_lower_get_next_private);
5618
5619/**
5620 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5621 *				       lower neighbour list, RCU
5622 *				       variant
5623 * @dev: device
5624 * @iter: list_head ** of the current position
5625 *
5626 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5627 * list, starting from iter position. The caller must hold RCU read lock.
5628 */
5629void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5630					struct list_head **iter)
5631{
5632	struct netdev_adjacent *lower;
5633
5634	WARN_ON_ONCE(!rcu_read_lock_held());
5635
5636	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5637
5638	if (&lower->list == &dev->adj_list.lower)
5639		return NULL;
5640
5641	*iter = &lower->list;
5642
5643	return lower->private;
5644}
5645EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5646
5647/**
5648 * netdev_lower_get_next - Get the next device from the lower neighbour
5649 *                         list
5650 * @dev: device
5651 * @iter: list_head ** of the current position
5652 *
5653 * Gets the next netdev_adjacent from the dev's lower neighbour
5654 * list, starting from iter position. The caller must hold RTNL lock or
5655 * its own locking that guarantees that the neighbour lower
5656 * list will remain unchanged.
5657 */
5658void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5659{
5660	struct netdev_adjacent *lower;
5661
5662	lower = list_entry(*iter, struct netdev_adjacent, list);
5663
5664	if (&lower->list == &dev->adj_list.lower)
5665		return NULL;
5666
5667	*iter = lower->list.next;
5668
5669	return lower->dev;
5670}
5671EXPORT_SYMBOL(netdev_lower_get_next);
5672
5673static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5674						struct list_head **iter)
5675{
5676	struct netdev_adjacent *lower;
5677
5678	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5679
5680	if (&lower->list == &dev->adj_list.lower)
5681		return NULL;
5682
5683	*iter = &lower->list;
5684
5685	return lower->dev;
5686}
5687
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5688int netdev_walk_all_lower_dev(struct net_device *dev,
5689			      int (*fn)(struct net_device *dev,
5690					void *data),
5691			      void *data)
5692{
5693	struct net_device *ldev;
5694	struct list_head *iter;
5695	int ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5696
5697	for (iter = &dev->adj_list.lower,
5698	     ldev = netdev_next_lower_dev(dev, &iter);
5699	     ldev;
5700	     ldev = netdev_next_lower_dev(dev, &iter)) {
5701		/* first is the lower device itself */
5702		ret = fn(ldev, data);
5703		if (ret)
5704			return ret;
5705
5706		/* then look at all of its lower devices */
5707		ret = netdev_walk_all_lower_dev(ldev, fn, data);
5708		if (ret)
5709			return ret;
5710	}
5711
5712	return 0;
5713}
5714EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5715
5716static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5717						    struct list_head **iter)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5718{
5719	struct netdev_adjacent *lower;
5720
5721	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5722	if (&lower->list == &dev->adj_list.lower)
5723		return NULL;
5724
5725	*iter = &lower->list;
5726
5727	return lower->dev;
5728}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5729
5730int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5731				  int (*fn)(struct net_device *dev,
5732					    void *data),
5733				  void *data)
5734{
5735	struct net_device *ldev;
5736	struct list_head *iter;
5737	int ret;
 
5738
5739	for (iter = &dev->adj_list.lower,
5740	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
5741	     ldev;
5742	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5743		/* first is the lower device itself */
5744		ret = fn(ldev, data);
5745		if (ret)
5746			return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5747
5748		/* then look at all of its lower devices */
5749		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5750		if (ret)
5751			return ret;
5752	}
5753
5754	return 0;
5755}
5756EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5757
5758/**
5759 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5760 *				       lower neighbour list, RCU
5761 *				       variant
5762 * @dev: device
5763 *
5764 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5765 * list. The caller must hold RCU read lock.
5766 */
5767void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5768{
5769	struct netdev_adjacent *lower;
5770
5771	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5772			struct netdev_adjacent, list);
5773	if (lower)
5774		return lower->private;
5775	return NULL;
5776}
5777EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5778
5779/**
5780 * netdev_master_upper_dev_get_rcu - Get master upper device
5781 * @dev: device
5782 *
5783 * Find a master upper device and return pointer to it or NULL in case
5784 * it's not there. The caller must hold the RCU read lock.
5785 */
5786struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5787{
5788	struct netdev_adjacent *upper;
5789
5790	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5791				       struct netdev_adjacent, list);
5792	if (upper && likely(upper->master))
5793		return upper->dev;
5794	return NULL;
5795}
5796EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5797
5798static int netdev_adjacent_sysfs_add(struct net_device *dev,
5799			      struct net_device *adj_dev,
5800			      struct list_head *dev_list)
5801{
5802	char linkname[IFNAMSIZ+7];
 
5803	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5804		"upper_%s" : "lower_%s", adj_dev->name);
5805	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5806				 linkname);
5807}
5808static void netdev_adjacent_sysfs_del(struct net_device *dev,
5809			       char *name,
5810			       struct list_head *dev_list)
5811{
5812	char linkname[IFNAMSIZ+7];
 
5813	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5814		"upper_%s" : "lower_%s", name);
5815	sysfs_remove_link(&(dev->dev.kobj), linkname);
5816}
5817
5818static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5819						 struct net_device *adj_dev,
5820						 struct list_head *dev_list)
5821{
5822	return (dev_list == &dev->adj_list.upper ||
5823		dev_list == &dev->adj_list.lower) &&
5824		net_eq(dev_net(dev), dev_net(adj_dev));
5825}
5826
5827static int __netdev_adjacent_dev_insert(struct net_device *dev,
5828					struct net_device *adj_dev,
5829					struct list_head *dev_list,
5830					void *private, bool master)
5831{
5832	struct netdev_adjacent *adj;
5833	int ret;
5834
5835	adj = __netdev_find_adj(adj_dev, dev_list);
5836
5837	if (adj) {
5838		adj->ref_nr += 1;
5839		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5840			 dev->name, adj_dev->name, adj->ref_nr);
5841
5842		return 0;
5843	}
5844
5845	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5846	if (!adj)
5847		return -ENOMEM;
5848
5849	adj->dev = adj_dev;
5850	adj->master = master;
5851	adj->ref_nr = 1;
5852	adj->private = private;
5853	dev_hold(adj_dev);
 
5854
5855	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5856		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5857
5858	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5859		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5860		if (ret)
5861			goto free_adj;
5862	}
5863
5864	/* Ensure that master link is always the first item in list. */
5865	if (master) {
5866		ret = sysfs_create_link(&(dev->dev.kobj),
5867					&(adj_dev->dev.kobj), "master");
5868		if (ret)
5869			goto remove_symlinks;
5870
5871		list_add_rcu(&adj->list, dev_list);
5872	} else {
5873		list_add_tail_rcu(&adj->list, dev_list);
5874	}
5875
5876	return 0;
5877
5878remove_symlinks:
5879	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5880		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5881free_adj:
 
5882	kfree(adj);
5883	dev_put(adj_dev);
5884
5885	return ret;
5886}
5887
5888static void __netdev_adjacent_dev_remove(struct net_device *dev,
5889					 struct net_device *adj_dev,
5890					 u16 ref_nr,
5891					 struct list_head *dev_list)
5892{
5893	struct netdev_adjacent *adj;
5894
5895	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5896		 dev->name, adj_dev->name, ref_nr);
5897
5898	adj = __netdev_find_adj(adj_dev, dev_list);
5899
5900	if (!adj) {
5901		pr_err("Adjacency does not exist for device %s from %s\n",
5902		       dev->name, adj_dev->name);
5903		WARN_ON(1);
5904		return;
5905	}
5906
5907	if (adj->ref_nr > ref_nr) {
5908		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5909			 dev->name, adj_dev->name, ref_nr,
5910			 adj->ref_nr - ref_nr);
5911		adj->ref_nr -= ref_nr;
5912		return;
5913	}
5914
5915	if (adj->master)
5916		sysfs_remove_link(&(dev->dev.kobj), "master");
5917
5918	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5919		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5920
5921	list_del_rcu(&adj->list);
5922	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5923		 adj_dev->name, dev->name, adj_dev->name);
5924	dev_put(adj_dev);
5925	kfree_rcu(adj, rcu);
5926}
5927
5928static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5929					    struct net_device *upper_dev,
5930					    struct list_head *up_list,
5931					    struct list_head *down_list,
5932					    void *private, bool master)
5933{
5934	int ret;
5935
5936	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5937					   private, master);
5938	if (ret)
5939		return ret;
5940
5941	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5942					   private, false);
5943	if (ret) {
5944		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5945		return ret;
5946	}
5947
5948	return 0;
5949}
5950
5951static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5952					       struct net_device *upper_dev,
5953					       u16 ref_nr,
5954					       struct list_head *up_list,
5955					       struct list_head *down_list)
5956{
5957	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5958	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5959}
5960
5961static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5962						struct net_device *upper_dev,
5963						void *private, bool master)
5964{
5965	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5966						&dev->adj_list.upper,
5967						&upper_dev->adj_list.lower,
5968						private, master);
5969}
5970
5971static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5972						   struct net_device *upper_dev)
5973{
5974	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5975					   &dev->adj_list.upper,
5976					   &upper_dev->adj_list.lower);
5977}
5978
5979static int __netdev_upper_dev_link(struct net_device *dev,
5980				   struct net_device *upper_dev, bool master,
5981				   void *upper_priv, void *upper_info)
5982{
5983	struct netdev_notifier_changeupper_info changeupper_info;
 
 
 
 
 
 
 
 
 
 
 
 
5984	int ret = 0;
5985
5986	ASSERT_RTNL();
5987
5988	if (dev == upper_dev)
5989		return -EBUSY;
5990
5991	/* To prevent loops, check if dev is not upper device to upper_dev. */
5992	if (netdev_has_upper_dev(upper_dev, dev))
5993		return -EBUSY;
5994
5995	if (netdev_has_upper_dev(dev, upper_dev))
5996		return -EEXIST;
5997
5998	if (master && netdev_master_upper_dev_get(dev))
5999		return -EBUSY;
6000
6001	changeupper_info.upper_dev = upper_dev;
6002	changeupper_info.master = master;
6003	changeupper_info.linking = true;
6004	changeupper_info.upper_info = upper_info;
 
6005
6006	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6007					    &changeupper_info.info);
6008	ret = notifier_to_errno(ret);
6009	if (ret)
6010		return ret;
6011
6012	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6013						   master);
6014	if (ret)
6015		return ret;
6016
6017	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6018					    &changeupper_info.info);
6019	ret = notifier_to_errno(ret);
6020	if (ret)
6021		goto rollback;
6022
 
 
 
 
 
 
 
6023	return 0;
6024
6025rollback:
6026	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6027
6028	return ret;
6029}
6030
6031/**
6032 * netdev_upper_dev_link - Add a link to the upper device
6033 * @dev: device
6034 * @upper_dev: new upper device
 
6035 *
6036 * Adds a link to device which is upper to this one. The caller must hold
6037 * the RTNL lock. On a failure a negative errno code is returned.
6038 * On success the reference counts are adjusted and the function
6039 * returns zero.
6040 */
6041int netdev_upper_dev_link(struct net_device *dev,
6042			  struct net_device *upper_dev)
 
6043{
6044	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
 
 
 
 
 
 
6045}
6046EXPORT_SYMBOL(netdev_upper_dev_link);
6047
6048/**
6049 * netdev_master_upper_dev_link - Add a master link to the upper device
6050 * @dev: device
6051 * @upper_dev: new upper device
6052 * @upper_priv: upper device private
6053 * @upper_info: upper info to be passed down via notifier
 
6054 *
6055 * Adds a link to device which is upper to this one. In this case, only
6056 * one master upper device can be linked, although other non-master devices
6057 * might be linked as well. The caller must hold the RTNL lock.
6058 * On a failure a negative errno code is returned. On success the reference
6059 * counts are adjusted and the function returns zero.
6060 */
6061int netdev_master_upper_dev_link(struct net_device *dev,
6062				 struct net_device *upper_dev,
6063				 void *upper_priv, void *upper_info)
 
6064{
 
 
 
 
 
6065	return __netdev_upper_dev_link(dev, upper_dev, true,
6066				       upper_priv, upper_info);
6067}
6068EXPORT_SYMBOL(netdev_master_upper_dev_link);
6069
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6070/**
6071 * netdev_upper_dev_unlink - Removes a link to upper device
6072 * @dev: device
6073 * @upper_dev: new upper device
6074 *
6075 * Removes a link to device which is upper to this one. The caller must hold
6076 * the RTNL lock.
6077 */
6078void netdev_upper_dev_unlink(struct net_device *dev,
6079			     struct net_device *upper_dev)
6080{
6081	struct netdev_notifier_changeupper_info changeupper_info;
6082	ASSERT_RTNL();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6083
6084	changeupper_info.upper_dev = upper_dev;
6085	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6086	changeupper_info.linking = false;
 
 
 
 
 
6087
6088	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6089				      &changeupper_info.info);
6090
6091	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
6092
6093	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6094				      &changeupper_info.info);
6095}
6096EXPORT_SYMBOL(netdev_upper_dev_unlink);
6097
6098/**
6099 * netdev_bonding_info_change - Dispatch event about slave change
6100 * @dev: device
6101 * @bonding_info: info to dispatch
6102 *
6103 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6104 * The caller must hold the RTNL lock.
6105 */
6106void netdev_bonding_info_change(struct net_device *dev,
6107				struct netdev_bonding_info *bonding_info)
6108{
6109	struct netdev_notifier_bonding_info	info;
 
 
6110
6111	memcpy(&info.bonding_info, bonding_info,
6112	       sizeof(struct netdev_bonding_info));
6113	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6114				      &info.info);
6115}
6116EXPORT_SYMBOL(netdev_bonding_info_change);
6117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6118static void netdev_adjacent_add_links(struct net_device *dev)
6119{
6120	struct netdev_adjacent *iter;
6121
6122	struct net *net = dev_net(dev);
6123
6124	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6125		if (!net_eq(net, dev_net(iter->dev)))
6126			continue;
6127		netdev_adjacent_sysfs_add(iter->dev, dev,
6128					  &iter->dev->adj_list.lower);
6129		netdev_adjacent_sysfs_add(dev, iter->dev,
6130					  &dev->adj_list.upper);
6131	}
6132
6133	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6134		if (!net_eq(net, dev_net(iter->dev)))
6135			continue;
6136		netdev_adjacent_sysfs_add(iter->dev, dev,
6137					  &iter->dev->adj_list.upper);
6138		netdev_adjacent_sysfs_add(dev, iter->dev,
6139					  &dev->adj_list.lower);
6140	}
6141}
6142
6143static void netdev_adjacent_del_links(struct net_device *dev)
6144{
6145	struct netdev_adjacent *iter;
6146
6147	struct net *net = dev_net(dev);
6148
6149	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6150		if (!net_eq(net, dev_net(iter->dev)))
6151			continue;
6152		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6153					  &iter->dev->adj_list.lower);
6154		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6155					  &dev->adj_list.upper);
6156	}
6157
6158	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6159		if (!net_eq(net, dev_net(iter->dev)))
6160			continue;
6161		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6162					  &iter->dev->adj_list.upper);
6163		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6164					  &dev->adj_list.lower);
6165	}
6166}
6167
6168void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6169{
6170	struct netdev_adjacent *iter;
6171
6172	struct net *net = dev_net(dev);
6173
6174	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6175		if (!net_eq(net, dev_net(iter->dev)))
6176			continue;
6177		netdev_adjacent_sysfs_del(iter->dev, oldname,
6178					  &iter->dev->adj_list.lower);
6179		netdev_adjacent_sysfs_add(iter->dev, dev,
6180					  &iter->dev->adj_list.lower);
6181	}
6182
6183	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6184		if (!net_eq(net, dev_net(iter->dev)))
6185			continue;
6186		netdev_adjacent_sysfs_del(iter->dev, oldname,
6187					  &iter->dev->adj_list.upper);
6188		netdev_adjacent_sysfs_add(iter->dev, dev,
6189					  &iter->dev->adj_list.upper);
6190	}
6191}
6192
6193void *netdev_lower_dev_get_private(struct net_device *dev,
6194				   struct net_device *lower_dev)
6195{
6196	struct netdev_adjacent *lower;
6197
6198	if (!lower_dev)
6199		return NULL;
6200	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6201	if (!lower)
6202		return NULL;
6203
6204	return lower->private;
6205}
6206EXPORT_SYMBOL(netdev_lower_dev_get_private);
6207
6208
6209int dev_get_nest_level(struct net_device *dev)
6210{
6211	struct net_device *lower = NULL;
6212	struct list_head *iter;
6213	int max_nest = -1;
6214	int nest;
6215
6216	ASSERT_RTNL();
6217
6218	netdev_for_each_lower_dev(dev, lower, iter) {
6219		nest = dev_get_nest_level(lower);
6220		if (max_nest < nest)
6221			max_nest = nest;
6222	}
6223
6224	return max_nest + 1;
6225}
6226EXPORT_SYMBOL(dev_get_nest_level);
6227
6228/**
6229 * netdev_lower_change - Dispatch event about lower device state change
6230 * @lower_dev: device
6231 * @lower_state_info: state to dispatch
6232 *
6233 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6234 * The caller must hold the RTNL lock.
6235 */
6236void netdev_lower_state_changed(struct net_device *lower_dev,
6237				void *lower_state_info)
6238{
6239	struct netdev_notifier_changelowerstate_info changelowerstate_info;
 
 
6240
6241	ASSERT_RTNL();
6242	changelowerstate_info.lower_state_info = lower_state_info;
6243	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6244				      &changelowerstate_info.info);
6245}
6246EXPORT_SYMBOL(netdev_lower_state_changed);
6247
6248int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6249					   struct neighbour *n)
6250{
6251	struct net_device *lower_dev, *stop_dev;
6252	struct list_head *iter;
6253	int err;
6254
6255	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6256		if (!lower_dev->netdev_ops->ndo_neigh_construct)
6257			continue;
6258		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6259		if (err) {
6260			stop_dev = lower_dev;
6261			goto rollback;
6262		}
6263	}
6264	return 0;
6265
6266rollback:
6267	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6268		if (lower_dev == stop_dev)
6269			break;
6270		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6271			continue;
6272		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6273	}
6274	return err;
6275}
6276EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6277
6278void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6279					  struct neighbour *n)
6280{
6281	struct net_device *lower_dev;
6282	struct list_head *iter;
6283
6284	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6285		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6286			continue;
6287		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6288	}
6289}
6290EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6291
6292static void dev_change_rx_flags(struct net_device *dev, int flags)
6293{
6294	const struct net_device_ops *ops = dev->netdev_ops;
6295
6296	if (ops->ndo_change_rx_flags)
6297		ops->ndo_change_rx_flags(dev, flags);
6298}
6299
6300static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6301{
6302	unsigned int old_flags = dev->flags;
 
6303	kuid_t uid;
6304	kgid_t gid;
6305
6306	ASSERT_RTNL();
6307
6308	dev->flags |= IFF_PROMISC;
6309	dev->promiscuity += inc;
6310	if (dev->promiscuity == 0) {
6311		/*
6312		 * Avoid overflow.
6313		 * If inc causes overflow, untouch promisc and return error.
6314		 */
6315		if (inc < 0)
6316			dev->flags &= ~IFF_PROMISC;
6317		else {
6318			dev->promiscuity -= inc;
6319			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6320				dev->name);
6321			return -EOVERFLOW;
6322		}
 
 
 
6323	}
6324	if (dev->flags != old_flags) {
6325		pr_info("device %s %s promiscuous mode\n",
6326			dev->name,
6327			dev->flags & IFF_PROMISC ? "entered" : "left");
 
6328		if (audit_enabled) {
6329			current_uid_gid(&uid, &gid);
6330			audit_log(current->audit_context, GFP_ATOMIC,
6331				AUDIT_ANOM_PROMISCUOUS,
6332				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6333				dev->name, (dev->flags & IFF_PROMISC),
6334				(old_flags & IFF_PROMISC),
6335				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6336				from_kuid(&init_user_ns, uid),
6337				from_kgid(&init_user_ns, gid),
6338				audit_get_sessionid(current));
6339		}
6340
6341		dev_change_rx_flags(dev, IFF_PROMISC);
6342	}
6343	if (notify)
6344		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6345	return 0;
6346}
6347
6348/**
6349 *	dev_set_promiscuity	- update promiscuity count on a device
6350 *	@dev: device
6351 *	@inc: modifier
6352 *
6353 *	Add or remove promiscuity from a device. While the count in the device
6354 *	remains above zero the interface remains promiscuous. Once it hits zero
6355 *	the device reverts back to normal filtering operation. A negative inc
6356 *	value is used to drop promiscuity on the device.
6357 *	Return 0 if successful or a negative errno code on error.
6358 */
6359int dev_set_promiscuity(struct net_device *dev, int inc)
6360{
6361	unsigned int old_flags = dev->flags;
6362	int err;
6363
6364	err = __dev_set_promiscuity(dev, inc, true);
6365	if (err < 0)
6366		return err;
6367	if (dev->flags != old_flags)
6368		dev_set_rx_mode(dev);
6369	return err;
6370}
6371EXPORT_SYMBOL(dev_set_promiscuity);
6372
6373static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6374{
6375	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 
6376
6377	ASSERT_RTNL();
6378
6379	dev->flags |= IFF_ALLMULTI;
6380	dev->allmulti += inc;
6381	if (dev->allmulti == 0) {
6382		/*
6383		 * Avoid overflow.
6384		 * If inc causes overflow, untouch allmulti and return error.
6385		 */
6386		if (inc < 0)
6387			dev->flags &= ~IFF_ALLMULTI;
6388		else {
6389			dev->allmulti -= inc;
6390			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6391				dev->name);
6392			return -EOVERFLOW;
6393		}
 
 
 
6394	}
6395	if (dev->flags ^ old_flags) {
 
 
 
 
6396		dev_change_rx_flags(dev, IFF_ALLMULTI);
6397		dev_set_rx_mode(dev);
6398		if (notify)
6399			__dev_notify_flags(dev, old_flags,
6400					   dev->gflags ^ old_gflags);
6401	}
6402	return 0;
6403}
6404
6405/**
6406 *	dev_set_allmulti	- update allmulti count on a device
6407 *	@dev: device
6408 *	@inc: modifier
6409 *
6410 *	Add or remove reception of all multicast frames to a device. While the
6411 *	count in the device remains above zero the interface remains listening
6412 *	to all interfaces. Once it hits zero the device reverts back to normal
6413 *	filtering operation. A negative @inc value is used to drop the counter
6414 *	when releasing a resource needing all multicasts.
6415 *	Return 0 if successful or a negative errno code on error.
6416 */
6417
6418int dev_set_allmulti(struct net_device *dev, int inc)
6419{
6420	return __dev_set_allmulti(dev, inc, true);
6421}
6422EXPORT_SYMBOL(dev_set_allmulti);
6423
6424/*
6425 *	Upload unicast and multicast address lists to device and
6426 *	configure RX filtering. When the device doesn't support unicast
6427 *	filtering it is put in promiscuous mode while unicast addresses
6428 *	are present.
6429 */
6430void __dev_set_rx_mode(struct net_device *dev)
6431{
6432	const struct net_device_ops *ops = dev->netdev_ops;
6433
6434	/* dev_open will call this function so the list will stay sane. */
6435	if (!(dev->flags&IFF_UP))
6436		return;
6437
6438	if (!netif_device_present(dev))
6439		return;
6440
6441	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6442		/* Unicast addresses changes may only happen under the rtnl,
6443		 * therefore calling __dev_set_promiscuity here is safe.
6444		 */
6445		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6446			__dev_set_promiscuity(dev, 1, false);
6447			dev->uc_promisc = true;
6448		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6449			__dev_set_promiscuity(dev, -1, false);
6450			dev->uc_promisc = false;
6451		}
6452	}
6453
6454	if (ops->ndo_set_rx_mode)
6455		ops->ndo_set_rx_mode(dev);
6456}
6457
/*
 * Locked wrapper: runs __dev_set_rx_mode() between netif_addr_lock_bh()
 * and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6464
6465/**
6466 *	dev_get_flags - get flags reported to userspace
6467 *	@dev: device
6468 *
6469 *	Get the combination of flag bits exported through APIs to userspace.
6470 */
6471unsigned int dev_get_flags(const struct net_device *dev)
6472{
6473	unsigned int flags;
6474
6475	flags = (dev->flags & ~(IFF_PROMISC |
6476				IFF_ALLMULTI |
6477				IFF_RUNNING |
6478				IFF_LOWER_UP |
6479				IFF_DORMANT)) |
6480		(dev->gflags & (IFF_PROMISC |
6481				IFF_ALLMULTI));
6482
6483	if (netif_running(dev)) {
6484		if (netif_oper_up(dev))
6485			flags |= IFF_RUNNING;
6486		if (netif_carrier_ok(dev))
6487			flags |= IFF_LOWER_UP;
6488		if (netif_dormant(dev))
6489			flags |= IFF_DORMANT;
6490	}
6491
6492	return flags;
6493}
6494EXPORT_SYMBOL(dev_get_flags);
6495
6496int __dev_change_flags(struct net_device *dev, unsigned int flags)
 
6497{
6498	unsigned int old_flags = dev->flags;
6499	int ret;
6500
6501	ASSERT_RTNL();
6502
6503	/*
6504	 *	Set the flags on our device.
6505	 */
6506
6507	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6508			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6509			       IFF_AUTOMEDIA)) |
6510		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6511				    IFF_ALLMULTI));
6512
6513	/*
6514	 *	Load in the correct multicast list now the flags have changed.
6515	 */
6516
6517	if ((old_flags ^ flags) & IFF_MULTICAST)
6518		dev_change_rx_flags(dev, IFF_MULTICAST);
6519
6520	dev_set_rx_mode(dev);
6521
6522	/*
6523	 *	Have we downed the interface. We handle IFF_UP ourselves
6524	 *	according to user attempts to set it, rather than blindly
6525	 *	setting it.
6526	 */
6527
6528	ret = 0;
6529	if ((old_flags ^ flags) & IFF_UP)
6530		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
 
 
 
 
6531
6532	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6533		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6534		unsigned int old_flags = dev->flags;
6535
6536		dev->gflags ^= IFF_PROMISC;
6537
6538		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6539			if (dev->flags != old_flags)
6540				dev_set_rx_mode(dev);
6541	}
6542
6543	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6544	   is important. Some (broken) drivers set IFF_PROMISC, when
6545	   IFF_ALLMULTI is requested not asking us and not reporting.
6546	 */
6547	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6548		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6549
6550		dev->gflags ^= IFF_ALLMULTI;
6551		__dev_set_allmulti(dev, inc, false);
6552	}
6553
6554	return ret;
6555}
6556
6557void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6558			unsigned int gchanges)
 
6559{
6560	unsigned int changes = dev->flags ^ old_flags;
6561
6562	if (gchanges)
6563		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6564
6565	if (changes & IFF_UP) {
6566		if (dev->flags & IFF_UP)
6567			call_netdevice_notifiers(NETDEV_UP, dev);
6568		else
6569			call_netdevice_notifiers(NETDEV_DOWN, dev);
6570	}
6571
6572	if (dev->flags & IFF_UP &&
6573	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6574		struct netdev_notifier_change_info change_info;
 
 
 
 
 
6575
6576		change_info.flags_changed = changes;
6577		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6578					      &change_info.info);
6579	}
6580}
6581
6582/**
6583 *	dev_change_flags - change device settings
6584 *	@dev: device
6585 *	@flags: device state flags
 
6586 *
6587 *	Change settings on device based state flags. The flags are
6588 *	in the userspace exported format.
6589 */
6590int dev_change_flags(struct net_device *dev, unsigned int flags)
 
6591{
6592	int ret;
6593	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6594
6595	ret = __dev_change_flags(dev, flags);
6596	if (ret < 0)
6597		return ret;
6598
6599	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6600	__dev_notify_flags(dev, old_flags, changes);
6601	return ret;
6602}
6603EXPORT_SYMBOL(dev_change_flags);
6604
6605static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6606{
6607	const struct net_device_ops *ops = dev->netdev_ops;
6608
6609	if (ops->ndo_change_mtu)
6610		return ops->ndo_change_mtu(dev, new_mtu);
6611
6612	dev->mtu = new_mtu;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6613	return 0;
6614}
6615
6616/**
6617 *	dev_set_mtu - Change maximum transfer unit
6618 *	@dev: device
6619 *	@new_mtu: new transfer unit
 
6620 *
6621 *	Change the maximum transfer size of the network device.
6622 */
6623int dev_set_mtu(struct net_device *dev, int new_mtu)
 
6624{
6625	int err, orig_mtu;
6626
6627	if (new_mtu == dev->mtu)
6628		return 0;
6629
6630	/* MTU must be positive, and in range */
6631	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6632		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6633				    dev->name, new_mtu, dev->min_mtu);
6634		return -EINVAL;
6635	}
6636
6637	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6638		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6639				    dev->name, new_mtu, dev->max_mtu);
6640		return -EINVAL;
6641	}
6642
6643	if (!netif_device_present(dev))
6644		return -ENODEV;
6645
6646	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6647	err = notifier_to_errno(err);
6648	if (err)
6649		return err;
6650
6651	orig_mtu = dev->mtu;
6652	err = __dev_set_mtu(dev, new_mtu);
6653
6654	if (!err) {
6655		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
6656		err = notifier_to_errno(err);
6657		if (err) {
6658			/* setting mtu back and notifying everyone again,
6659			 * so that they have a chance to revert changes.
6660			 */
6661			__dev_set_mtu(dev, orig_mtu);
6662			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 
6663		}
6664	}
6665	return err;
6666}
 
 
 
 
 
 
 
 
 
 
 
 
6667EXPORT_SYMBOL(dev_set_mtu);
6668
6669/**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6670 *	dev_set_group - Change group this device belongs to
6671 *	@dev: device
6672 *	@new_group: group this device should belong to
6673 */
6674void dev_set_group(struct net_device *dev, int new_group)
6675{
6676	dev->group = new_group;
6677}
6678EXPORT_SYMBOL(dev_set_group);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6679
6680/**
6681 *	dev_set_mac_address - Change Media Access Control Address
6682 *	@dev: device
6683 *	@sa: new address
 
6684 *
6685 *	Change the hardware (MAC) address of the device
6686 */
6687int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 
6688{
6689	const struct net_device_ops *ops = dev->netdev_ops;
6690	int err;
6691
6692	if (!ops->ndo_set_mac_address)
6693		return -EOPNOTSUPP;
6694	if (sa->sa_family != dev->type)
6695		return -EINVAL;
6696	if (!netif_device_present(dev))
6697		return -ENODEV;
6698	err = ops->ndo_set_mac_address(dev, sa);
6699	if (err)
6700		return err;
 
 
 
 
 
6701	dev->addr_assign_type = NET_ADDR_SET;
6702	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6703	add_device_randomness(dev->dev_addr, dev->addr_len);
6704	return 0;
6705}
6706EXPORT_SYMBOL(dev_set_mac_address);
6707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6708/**
6709 *	dev_change_carrier - Change device carrier
6710 *	@dev: device
6711 *	@new_carrier: new value
6712 *
6713 *	Change device carrier
6714 */
6715int dev_change_carrier(struct net_device *dev, bool new_carrier)
6716{
6717	const struct net_device_ops *ops = dev->netdev_ops;
6718
6719	if (!ops->ndo_change_carrier)
6720		return -EOPNOTSUPP;
6721	if (!netif_device_present(dev))
6722		return -ENODEV;
6723	return ops->ndo_change_carrier(dev, new_carrier);
6724}
6725EXPORT_SYMBOL(dev_change_carrier);
6726
6727/**
6728 *	dev_get_phys_port_id - Get device physical port ID
6729 *	@dev: device
6730 *	@ppid: port ID
6731 *
6732 *	Get device physical port ID
6733 */
6734int dev_get_phys_port_id(struct net_device *dev,
6735			 struct netdev_phys_item_id *ppid)
6736{
6737	const struct net_device_ops *ops = dev->netdev_ops;
6738
6739	if (!ops->ndo_get_phys_port_id)
6740		return -EOPNOTSUPP;
6741	return ops->ndo_get_phys_port_id(dev, ppid);
6742}
6743EXPORT_SYMBOL(dev_get_phys_port_id);
6744
6745/**
6746 *	dev_get_phys_port_name - Get device physical port name
6747 *	@dev: device
6748 *	@name: port name
6749 *	@len: limit of bytes to copy to name
6750 *
6751 *	Get device physical port name
6752 */
6753int dev_get_phys_port_name(struct net_device *dev,
6754			   char *name, size_t len)
6755{
6756	const struct net_device_ops *ops = dev->netdev_ops;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6757
6758	if (!ops->ndo_get_phys_port_name)
6759		return -EOPNOTSUPP;
6760	return ops->ndo_get_phys_port_name(dev, name, len);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6761}
6762EXPORT_SYMBOL(dev_get_phys_port_name);
6763
6764/**
6765 *	dev_change_proto_down - update protocol port state information
 
6766 *	@dev: device
6767 *	@proto_down: new value
6768 *
6769 *	This info can be used by switch drivers to set the phys state of the
6770 *	port.
6771 */
6772int dev_change_proto_down(struct net_device *dev, bool proto_down)
6773{
6774	const struct net_device_ops *ops = dev->netdev_ops;
6775
6776	if (!ops->ndo_change_proto_down)
6777		return -EOPNOTSUPP;
6778	if (!netif_device_present(dev))
6779		return -ENODEV;
6780	return ops->ndo_change_proto_down(dev, proto_down);
 
 
 
 
 
6781}
6782EXPORT_SYMBOL(dev_change_proto_down);
6783
6784/**
6785 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 
6786 *	@dev: device
6787 *	@fd: new program fd or negative value to clear
6788 *	@flags: xdp-related flags
6789 *
6790 *	Set or clear a bpf program for a device
6791 */
6792int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
 
6793{
6794	const struct net_device_ops *ops = dev->netdev_ops;
6795	struct bpf_prog *prog = NULL;
6796	struct netdev_xdp xdp;
6797	int err;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6798
6799	ASSERT_RTNL();
 
 
 
 
 
6800
6801	if (!ops->ndo_xdp)
 
 
6802		return -EOPNOTSUPP;
6803	if (fd >= 0) {
6804		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6805			memset(&xdp, 0, sizeof(xdp));
6806			xdp.command = XDP_QUERY_PROG;
6807
6808			err = ops->ndo_xdp(dev, &xdp);
6809			if (err < 0)
6810				return err;
6811			if (xdp.prog_attached)
6812				return -EBUSY;
6813		}
6814
6815		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6816		if (IS_ERR(prog))
6817			return PTR_ERR(prog);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6818	}
6819
6820	memset(&xdp, 0, sizeof(xdp));
6821	xdp.command = XDP_SETUP_PROG;
 
 
6822	xdp.prog = prog;
6823
6824	err = ops->ndo_xdp(dev, &xdp);
6825	if (err < 0 && prog)
6826		bpf_prog_put(prog);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6827
6828	return err;
6829}
6830EXPORT_SYMBOL(dev_change_xdp_fd);
6831
6832/**
6833 *	dev_new_index	-	allocate an ifindex
6834 *	@net: the applicable net namespace
6835 *
6836 *	Returns a suitable unique value for a new device interface
6837 *	number.  The caller must hold the rtnl semaphore or the
6838 *	dev_base_lock to be sure it remains unique.
6839 */
6840static int dev_new_index(struct net *net)
6841{
6842	int ifindex = net->ifindex;
6843	for (;;) {
6844		if (++ifindex <= 0)
6845			ifindex = 1;
6846		if (!__dev_get_by_index(net, ifindex))
6847			return net->ifindex = ifindex;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6848	}
6849}
6850
6851/* Delayed registration/unregistration */
6852static LIST_HEAD(net_todo_list);
6853DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6854
6855static void net_set_todo(struct net_device *dev)
 
 
6856{
6857	list_add_tail(&dev->todo_list, &net_todo_list);
6858	dev_net(dev)->dev_unreg_count++;
6859}
6860
6861static void rollback_registered_many(struct list_head *head)
 
 
6862{
6863	struct net_device *dev, *tmp;
6864	LIST_HEAD(close_head);
6865
6866	BUG_ON(dev_boot_phase);
6867	ASSERT_RTNL();
6868
6869	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6870		/* Some devices call without registering
6871		 * for initialization unwind. Remove those
6872		 * devices and proceed with the remaining.
6873		 */
6874		if (dev->reg_state == NETREG_UNINITIALIZED) {
6875			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6876				 dev->name, dev);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6877
6878			WARN_ON(1);
6879			list_del(&dev->unreg_list);
6880			continue;
6881		}
6882		dev->dismantle = true;
6883		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6884	}
6885
6886	/* If device is running, close it first. */
6887	list_for_each_entry(dev, head, unreg_list)
6888		list_add_tail(&dev->close_list, &close_head);
6889	dev_close_many(&close_head, true);
 
 
 
 
 
6890
6891	list_for_each_entry(dev, head, unreg_list) {
6892		/* And unlink it from device chain. */
6893		unlist_netdevice(dev);
6894
6895		dev->reg_state = NETREG_UNREGISTERING;
 
 
 
 
6896	}
6897	flush_all_backlogs();
6898
6899	synchronize_net();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6900
6901	list_for_each_entry(dev, head, unreg_list) {
6902		struct sk_buff *skb = NULL;
 
 
 
 
6903
6904		/* Shutdown queueing discipline. */
6905		dev_shutdown(dev);
 
 
 
 
 
 
 
6906
 
6907
6908		/* Notify protocols, that we are about to destroy
6909		   this device. They should clean all the things.
6910		*/
6911		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
 
 
6912
6913		if (!dev->rtnl_link_ops ||
6914		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6915			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6916						     GFP_KERNEL);
6917
6918		/*
6919		 *	Flush the unicast and multicast chains
6920		 */
6921		dev_uc_flush(dev);
6922		dev_mc_flush(dev);
6923
6924		if (dev->netdev_ops->ndo_uninit)
6925			dev->netdev_ops->ndo_uninit(dev);
 
 
6926
6927		if (skb)
6928			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6929
6930		/* Notifier chain MUST detach us all upper devices. */
6931		WARN_ON(netdev_has_any_upper_dev(dev));
6932		WARN_ON(netdev_has_any_lower_dev(dev));
 
 
 
 
 
 
 
 
 
 
 
6933
6934		/* Remove entries from kobject tree */
6935		netdev_unregister_kobject(dev);
6936#ifdef CONFIG_XPS
6937		/* Remove XPS queueing entries */
6938		netif_reset_xps_queues_gt(dev, 0);
6939#endif
6940	}
6941
6942	synchronize_net();
 
 
 
 
 
 
6943
6944	list_for_each_entry(dev, head, unreg_list)
6945		dev_put(dev);
6946}
6947
6948static void rollback_registered(struct net_device *dev)
6949{
6950	LIST_HEAD(single);
 
 
 
 
 
 
 
6951
6952	list_add(&dev->unreg_list, &single);
6953	rollback_registered_many(&single);
6954	list_del(&single);
6955}
6956
6957static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6958	struct net_device *upper, netdev_features_t features)
6959{
6960	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6961	netdev_features_t feature;
6962	int feature_bit;
6963
6964	for_each_netdev_feature(&upper_disables, feature_bit) {
6965		feature = __NETIF_F_BIT(feature_bit);
6966		if (!(upper->wanted_features & feature)
6967		    && (features & feature)) {
6968			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6969				   &feature, upper->name);
6970			features &= ~feature;
6971		}
6972	}
6973
6974	return features;
6975}
6976
6977static void netdev_sync_lower_features(struct net_device *upper,
6978	struct net_device *lower, netdev_features_t features)
6979{
6980	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6981	netdev_features_t feature;
6982	int feature_bit;
6983
6984	for_each_netdev_feature(&upper_disables, feature_bit) {
6985		feature = __NETIF_F_BIT(feature_bit);
6986		if (!(features & feature) && (lower->features & feature)) {
6987			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6988				   &feature, lower->name);
6989			lower->wanted_features &= ~feature;
6990			netdev_update_features(lower);
6991
6992			if (unlikely(lower->features & feature))
6993				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6994					    &feature, lower->name);
 
 
6995		}
6996	}
6997}
6998
 
 
 
 
 
 
 
 
 
6999static netdev_features_t netdev_fix_features(struct net_device *dev,
7000	netdev_features_t features)
7001{
7002	/* Fix illegal checksum combinations */
7003	if ((features & NETIF_F_HW_CSUM) &&
7004	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7005		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7006		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7007	}
7008
7009	/* TSO requires that SG is present as well. */
7010	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7011		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7012		features &= ~NETIF_F_ALL_TSO;
7013	}
7014
7015	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7016					!(features & NETIF_F_IP_CSUM)) {
7017		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7018		features &= ~NETIF_F_TSO;
7019		features &= ~NETIF_F_TSO_ECN;
7020	}
7021
7022	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7023					 !(features & NETIF_F_IPV6_CSUM)) {
7024		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7025		features &= ~NETIF_F_TSO6;
7026	}
7027
7028	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7029	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7030		features &= ~NETIF_F_TSO_MANGLEID;
7031
7032	/* TSO ECN requires that TSO is present as well. */
7033	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7034		features &= ~NETIF_F_TSO_ECN;
7035
7036	/* Software GSO depends on SG. */
7037	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7038		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7039		features &= ~NETIF_F_GSO;
7040	}
7041
7042	/* UFO needs SG and checksumming */
7043	if (features & NETIF_F_UFO) {
7044		/* maybe split UFO into V4 and V6? */
7045		if (!(features & NETIF_F_HW_CSUM) &&
7046		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
7047		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
7048			netdev_dbg(dev,
7049				"Dropping NETIF_F_UFO since no checksum offload features.\n");
7050			features &= ~NETIF_F_UFO;
7051		}
7052
7053		if (!(features & NETIF_F_SG)) {
7054			netdev_dbg(dev,
7055				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
7056			features &= ~NETIF_F_UFO;
7057		}
7058	}
7059
7060	/* GSO partial features require GSO partial be set */
7061	if ((features & dev->gso_partial_features) &&
7062	    !(features & NETIF_F_GSO_PARTIAL)) {
7063		netdev_dbg(dev,
7064			   "Dropping partially supported GSO features since no GSO partial.\n");
7065		features &= ~dev->gso_partial_features;
7066	}
7067
7068#ifdef CONFIG_NET_RX_BUSY_POLL
7069	if (dev->netdev_ops->ndo_busy_poll)
7070		features |= NETIF_F_BUSY_POLL;
7071	else
7072#endif
7073		features &= ~NETIF_F_BUSY_POLL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7074
7075	return features;
7076}
7077
7078int __netdev_update_features(struct net_device *dev)
7079{
7080	struct net_device *upper, *lower;
7081	netdev_features_t features;
7082	struct list_head *iter;
7083	int err = -1;
7084
7085	ASSERT_RTNL();
7086
7087	features = netdev_get_wanted_features(dev);
7088
7089	if (dev->netdev_ops->ndo_fix_features)
7090		features = dev->netdev_ops->ndo_fix_features(dev, features);
7091
7092	/* driver might be less strict about feature dependencies */
7093	features = netdev_fix_features(dev, features);
7094
7095	/* some features can't be enabled if they're off an an upper device */
7096	netdev_for_each_upper_dev_rcu(dev, upper, iter)
7097		features = netdev_sync_upper_features(dev, upper, features);
7098
7099	if (dev->features == features)
7100		goto sync_lower;
7101
7102	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7103		&dev->features, &features);
7104
7105	if (dev->netdev_ops->ndo_set_features)
7106		err = dev->netdev_ops->ndo_set_features(dev, features);
7107	else
7108		err = 0;
7109
7110	if (unlikely(err < 0)) {
7111		netdev_err(dev,
7112			"set_features() failed (%d); wanted %pNF, left %pNF\n",
7113			err, &features, &dev->features);
7114		/* return non-0 since some features might have changed and
7115		 * it's better to fire a spurious notification than miss it
7116		 */
7117		return -1;
7118	}
7119
7120sync_lower:
7121	/* some features must be disabled on lower devices when disabled
7122	 * on an upper device (think: bonding master or bridge)
7123	 */
7124	netdev_for_each_lower_dev(dev, lower, iter)
7125		netdev_sync_lower_features(dev, lower, features);
7126
7127	if (!err)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7128		dev->features = features;
 
7129
7130	return err < 0 ? 0 : 1;
7131}
7132
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and call netdev_features_change()
 *	when __netdev_update_features() returns non-zero. Should be called
 *	after driver or hardware dependent conditions might have changed
 *	that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7147
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and call netdev_features_change()
 *	unconditionally, whatever __netdev_update_features() returned.
 *	Should be called instead of netdev_update_features() if also
 *	dev->vlan_features might have changed to allow the changes to be
 *	propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* result deliberately ignored: the notification is sent regardless */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7164
7165/**
7166 *	netif_stacked_transfer_operstate -	transfer operstate
7167 *	@rootdev: the root or lower level device to transfer state from
7168 *	@dev: the device to transfer operstate to
7169 *
7170 *	Transfer operational state from root to device. This is normally
7171 *	called when a stacking relationship exists between the root
7172 *	device and the device(a leaf device).
7173 */
7174void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7175					struct net_device *dev)
7176{
7177	if (rootdev->operstate == IF_OPER_DORMANT)
7178		netif_dormant_on(dev);
7179	else
7180		netif_dormant_off(dev);
7181
7182	if (netif_carrier_ok(rootdev)) {
7183		if (!netif_carrier_ok(dev))
7184			netif_carrier_on(dev);
7185	} else {
7186		if (netif_carrier_ok(dev))
7187			netif_carrier_off(dev);
7188	}
 
 
7189}
7190EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7191
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev. kzalloc() is
 * tried first with vzalloc() as the fallback. Returns 0 or -ENOMEM.
 * It is a BUG for the device to have no RX queues.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	const size_t bytes = nqueues * sizeof(*queues);
	unsigned int idx;

	BUG_ON(nqueues < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
7214
7215static void netdev_init_one_queue(struct net_device *dev,
7216				  struct netdev_queue *queue, void *_unused)
7217{
7218	/* Initialize queue lock */
7219	spin_lock_init(&queue->_xmit_lock);
7220	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7221	queue->xmit_lock_owner = -1;
7222	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7223	queue->dev = dev;
7224#ifdef CONFIG_BQL
7225	dql_init(&queue->dql, HZ);
7226#endif
7227}
7228
7229static void netif_free_tx_queues(struct net_device *dev)
7230{
7231	kvfree(dev->_tx);
7232}
7233
7234static int netif_alloc_netdev_queues(struct net_device *dev)
7235{
7236	unsigned int count = dev->num_tx_queues;
7237	struct netdev_queue *tx;
7238	size_t sz = count * sizeof(*tx);
7239
7240	if (count < 1 || count > 0xffff)
7241		return -EINVAL;
7242
7243	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7244	if (!tx) {
7245		tx = vzalloc(sz);
7246		if (!tx)
7247			return -ENOMEM;
7248	}
7249	dev->_tx = tx;
7250
7251	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7252	spin_lock_init(&dev->tx_global_lock);
7253
7254	return 0;
7255}
7256
7257void netif_tx_stop_all_queues(struct net_device *dev)
7258{
7259	unsigned int i;
7260
7261	for (i = 0; i < dev->num_tx_queues; i++) {
7262		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
7263		netif_tx_stop_queue(txq);
7264	}
7265}
7266EXPORT_SYMBOL(netif_tx_stop_all_queues);
7267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7268/**
7269 *	register_netdevice	- register a network device
7270 *	@dev: device to register
7271 *
7272 *	Take a completed network device structure and add it to the kernel
7273 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7274 *	chain. 0 is returned on success. A negative errno code is returned
7275 *	on a failure to set up the device, or if the name is a duplicate.
7276 *
7277 *	Callers must hold the rtnl semaphore. You may want
7278 *	register_netdev() instead of this.
7279 *
7280 *	BUGS:
7281 *	The locking appears insufficient to guarantee two parallel registers
7282 *	will not get the same name.
7283 */
7284
7285int register_netdevice(struct net_device *dev)
7286{
7287	int ret;
7288	struct net *net = dev_net(dev);
7289
 
 
7290	BUG_ON(dev_boot_phase);
7291	ASSERT_RTNL();
7292
7293	might_sleep();
7294
7295	/* When net_device's are persistent, this will be fatal. */
7296	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7297	BUG_ON(!net);
7298
 
 
 
 
 
 
 
 
7299	spin_lock_init(&dev->addr_list_lock);
7300	netdev_set_addr_lockdep_class(dev);
7301
7302	ret = dev_get_valid_name(net, dev, dev->name);
7303	if (ret < 0)
7304		goto out;
7305
 
 
 
 
 
7306	/* Init, if this function is available */
7307	if (dev->netdev_ops->ndo_init) {
7308		ret = dev->netdev_ops->ndo_init(dev);
7309		if (ret) {
7310			if (ret > 0)
7311				ret = -EIO;
7312			goto out;
7313		}
7314	}
7315
7316	if (((dev->hw_features | dev->features) &
7317	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7318	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7319	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7320		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7321		ret = -EINVAL;
7322		goto err_uninit;
7323	}
7324
7325	ret = -EBUSY;
7326	if (!dev->ifindex)
7327		dev->ifindex = dev_new_index(net);
7328	else if (__dev_get_by_index(net, dev->ifindex))
7329		goto err_uninit;
7330
 
 
 
 
 
7331	/* Transfer changeable features to wanted_features and enable
7332	 * software offloads (GSO and GRO).
7333	 */
7334	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7335	dev->features |= NETIF_F_SOFT_FEATURES;
 
 
 
 
 
 
7336	dev->wanted_features = dev->features & dev->hw_features;
7337
7338	if (!(dev->flags & IFF_LOOPBACK))
7339		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7340
7341	/* If IPv4 TCP segmentation offload is supported we should also
7342	 * allow the device to enable segmenting the frame with the option
7343	 * of ignoring a static IP ID value.  This doesn't enable the
7344	 * feature itself but allows the user to enable it later.
7345	 */
7346	if (dev->hw_features & NETIF_F_TSO)
7347		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7348	if (dev->vlan_features & NETIF_F_TSO)
7349		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7350	if (dev->mpls_features & NETIF_F_TSO)
7351		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7352	if (dev->hw_enc_features & NETIF_F_TSO)
7353		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7354
7355	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7356	 */
7357	dev->vlan_features |= NETIF_F_HIGHDMA;
7358
7359	/* Make NETIF_F_SG inheritable to tunnel devices.
7360	 */
7361	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7362
7363	/* Make NETIF_F_SG inheritable to MPLS.
7364	 */
7365	dev->mpls_features |= NETIF_F_SG;
7366
7367	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7368	ret = notifier_to_errno(ret);
7369	if (ret)
7370		goto err_uninit;
7371
7372	ret = netdev_register_kobject(dev);
 
 
 
7373	if (ret)
7374		goto err_uninit;
7375	dev->reg_state = NETREG_REGISTERED;
7376
7377	__netdev_update_features(dev);
7378
7379	/*
7380	 *	Default initial state at registry is that the
7381	 *	device is present.
7382	 */
7383
7384	set_bit(__LINK_STATE_PRESENT, &dev->state);
7385
7386	linkwatch_init_dev(dev);
7387
7388	dev_init_scheduler(dev);
7389	dev_hold(dev);
 
7390	list_netdevice(dev);
 
7391	add_device_randomness(dev->dev_addr, dev->addr_len);
7392
7393	/* If the device has permanent device address, driver should
7394	 * set dev_addr and also addr_assign_type should be set to
7395	 * NET_ADDR_PERM (default value).
7396	 */
7397	if (dev->addr_assign_type == NET_ADDR_PERM)
7398		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7399
7400	/* Notify protocols, that a new device appeared. */
7401	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7402	ret = notifier_to_errno(ret);
7403	if (ret) {
7404		rollback_registered(dev);
7405		dev->reg_state = NETREG_UNREGISTERED;
 
 
7406	}
7407	/*
7408	 *	Prevent userspace races by waiting until the network
7409	 *	device is fully setup before sending notifications.
7410	 */
7411	if (!dev->rtnl_link_ops ||
7412	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7413		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7414
7415out:
7416	return ret;
7417
 
 
 
 
 
 
7418err_uninit:
7419	if (dev->netdev_ops->ndo_uninit)
7420		dev->netdev_ops->ndo_uninit(dev);
 
 
 
 
7421	goto out;
7422}
7423EXPORT_SYMBOL(register_netdevice);
7424
7425/**
7426 *	init_dummy_netdev	- init a dummy network device for NAPI
7427 *	@dev: device to init
7428 *
7429 *	This takes a network device structure and initialize the minimum
7430 *	amount of fields so it can be used to schedule NAPI polls without
7431 *	registering a full blown interface. This is to be used by drivers
7432 *	that need to tie several hardware interfaces to a single NAPI
7433 *	poll scheduler due to HW limitations.
7434 */
7435int init_dummy_netdev(struct net_device *dev)
7436{
7437	/* Clear everything. Note we don't initialize spinlocks
7438	 * are they aren't supposed to be taken by any of the
7439	 * NAPI code and this dummy netdev is supposed to be
7440	 * only ever used for NAPI polls
7441	 */
7442	memset(dev, 0, sizeof(struct net_device));
7443
7444	/* make sure we BUG if trying to hit standard
7445	 * register/unregister code path
7446	 */
7447	dev->reg_state = NETREG_DUMMY;
7448
7449	/* NAPI wants this */
7450	INIT_LIST_HEAD(&dev->napi_list);
7451
7452	/* a dummy interface is started by default */
7453	set_bit(__LINK_STATE_PRESENT, &dev->state);
7454	set_bit(__LINK_STATE_START, &dev->state);
7455
 
 
 
7456	/* Note : We dont allocate pcpu_refcnt for dummy devices,
7457	 * because users of this 'device' dont need to change
7458	 * its refcount.
7459	 */
 
7460
7461	return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7462}
7463EXPORT_SYMBOL_GPL(init_dummy_netdev);
7464
7465
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Convenience wrapper that takes the rtnl semaphore, calls
 *	register_netdevice() and releases the semaphore again.
 *
 *	Returns whatever register_netdevice() returned: 0 on success, a
 *	negative errno code on failure.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
7489
7490int netdev_refcnt_read(const struct net_device *dev)
7491{
 
7492	int i, refcnt = 0;
7493
7494	for_each_possible_cpu(i)
7495		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7496	return refcnt;
 
 
 
7497}
7498EXPORT_SYMBOL(netdev_refcnt_read);
7499
 
 
 
 
7500/**
7501 * netdev_wait_allrefs - wait until all references are gone.
7502 * @dev: target net_device
7503 *
7504 * This is called when unregistering network devices.
7505 *
7506 * Any protocol or device that holds a reference should register
7507 * for netdevice notification, and cleanup and put back the
7508 * reference if they receive an UNREGISTER event.
7509 * We can get stuck here if buggy protocols don't correctly
7510 * call dev_put.
7511 */
7512static void netdev_wait_allrefs(struct net_device *dev)
7513{
7514	unsigned long rebroadcast_time, warning_time;
7515	int refcnt;
 
7516
7517	linkwatch_forget_dev(dev);
7518
7519	rebroadcast_time = warning_time = jiffies;
7520	refcnt = netdev_refcnt_read(dev);
 
7521
7522	while (refcnt != 0) {
7523		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7524			rtnl_lock();
7525
7526			/* Rebroadcast unregister notification */
7527			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
7528
7529			__rtnl_unlock();
7530			rcu_barrier();
7531			rtnl_lock();
7532
7533			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7534			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7535				     &dev->state)) {
7536				/* We must not have linkwatch events
7537				 * pending on unregister. If this
7538				 * happens, we simply run the queue
7539				 * unscheduled, resulting in a noop
7540				 * for this device.
7541				 */
7542				linkwatch_run_queue();
7543			}
 
7544
7545			__rtnl_unlock();
7546
7547			rebroadcast_time = jiffies;
7548		}
7549
7550		msleep(250);
7551
7552		refcnt = netdev_refcnt_read(dev);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7553
7554		if (time_after(jiffies, warning_time + 10 * HZ)) {
7555			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7556				 dev->name, refcnt);
7557			warning_time = jiffies;
7558		}
7559	}
7560}
7561
7562/* The sequence is:
7563 *
7564 *	rtnl_lock();
7565 *	...
7566 *	register_netdevice(x1);
7567 *	register_netdevice(x2);
7568 *	...
7569 *	unregister_netdevice(y1);
7570 *	unregister_netdevice(y2);
7571 *      ...
7572 *	rtnl_unlock();
7573 *	free_netdev(y1);
7574 *	free_netdev(y2);
7575 *
7576 * We are invoked by rtnl_unlock().
7577 * This allows us to deal with problems:
7578 * 1) We can delete sysfs objects which invoke hotplug
7579 *    without deadlocking with linkwatch via keventd.
7580 * 2) Since we run with the RTNL semaphore not held, we can sleep
7581 *    safely in order to wait for the netdev refcnt to drop to zero.
7582 *
7583 * We must not return until all unregister events added during
7584 * the interval the lock was held have been completed.
7585 */
7586void netdev_run_todo(void)
7587{
 
7588	struct list_head list;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7589
7590	/* Snapshot list, allow later requests */
7591	list_replace_init(&net_todo_list, &list);
7592
7593	__rtnl_unlock();
7594
7595
7596	/* Wait for rcu callbacks to finish before next phase */
7597	if (!list_empty(&list))
7598		rcu_barrier();
7599
7600	while (!list_empty(&list)) {
7601		struct net_device *dev
7602			= list_first_entry(&list, struct net_device, todo_list);
7603		list_del(&dev->todo_list);
7604
7605		rtnl_lock();
7606		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7607		__rtnl_unlock();
7608
7609		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7610			pr_err("network todo '%s' but state %d\n",
7611			       dev->name, dev->reg_state);
7612			dump_stack();
7613			continue;
7614		}
7615
7616		dev->reg_state = NETREG_UNREGISTERED;
 
 
7617
7618		netdev_wait_allrefs(dev);
 
 
 
7619
7620		/* paranoia */
7621		BUG_ON(netdev_refcnt_read(dev));
7622		BUG_ON(!list_empty(&dev->ptype_all));
7623		BUG_ON(!list_empty(&dev->ptype_specific));
7624		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7625		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7626		WARN_ON(dev->dn_ptr);
7627
7628		if (dev->destructor)
7629			dev->destructor(dev);
 
 
 
7630
7631		/* Report a network device has been unregistered */
7632		rtnl_lock();
7633		dev_net(dev)->dev_unreg_count--;
7634		__rtnl_unlock();
7635		wake_up(&netdev_unregistering_wq);
7636
7637		/* Free network device */
7638		kobject_put(&dev->dev.kobj);
7639	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7640}
7641
7642/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7643 * all the same fields in the same order as net_device_stats, with only
7644 * the type differing, but rtnl_link_stats64 may have additional fields
7645 * at the end for newer counters.
7646 */
7647void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7648			     const struct net_device_stats *netdev_stats)
7649{
7650#if BITS_PER_LONG == 64
7651	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7652	memcpy(stats64, netdev_stats, sizeof(*stats64));
7653	/* zero out counters that only exist in rtnl_link_stats64 */
7654	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7655	       sizeof(*stats64) - sizeof(*netdev_stats));
7656#else
7657	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7658	const unsigned long *src = (const unsigned long *)netdev_stats;
7659	u64 *dst = (u64 *)stats64;
7660
7661	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7662	for (i = 0; i < n; i++)
7663		dst[i] = src[i];
7664	/* zero out counters that only exist in rtnl_link_stats64 */
7665	memset((char *)stats64 + n * sizeof(u64), 0,
7666	       sizeof(*stats64) - n * sizeof(u64));
7667#endif
7668}
7669EXPORT_SYMBOL(netdev_stats_to_stats64);
7670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7671/**
7672 *	dev_get_stats	- get network device statistics
7673 *	@dev: device to get statistics from
7674 *	@storage: place to store stats
7675 *
7676 *	Get network statistics from device. Return @storage.
7677 *	The device driver may provide its own method by setting
7678 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7679 *	otherwise the internal statistics structure is used.
7680 */
7681struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7682					struct rtnl_link_stats64 *storage)
7683{
7684	const struct net_device_ops *ops = dev->netdev_ops;
 
7685
7686	if (ops->ndo_get_stats64) {
7687		memset(storage, 0, sizeof(*storage));
7688		ops->ndo_get_stats64(dev, storage);
7689	} else if (ops->ndo_get_stats) {
7690		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
 
 
 
 
7691	} else {
7692		netdev_stats_to_stats64(storage, &dev->stats);
7693	}
7694	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7695	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7696	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
 
 
 
 
 
 
 
 
 
 
 
 
7697	return storage;
7698}
7699EXPORT_SYMBOL(dev_get_stats);
7700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only after the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
7718
7719static const struct ethtool_ops default_ethtool_ops;
7720
7721void netdev_set_default_ethtool_ops(struct net_device *dev,
7722				    const struct ethtool_ops *ops)
7723{
7724	if (dev->ethtool_ops == &default_ethtool_ops)
7725		dev->ethtool_ops = ops;
7726}
7727EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7728
7729void netdev_freemem(struct net_device *dev)
 
 
 
 
 
 
 
7730{
7731	char *addr = (char *)dev - dev->padded;
7732
7733	kvfree(addr);
 
 
 
7734}
 
7735
7736/**
7737 *	alloc_netdev_mqs - allocate network device
7738 *	@sizeof_priv:		size of private data to allocate space for
7739 *	@name:			device name format string
7740 *	@name_assign_type: 	origin of device name
7741 *	@setup:			callback to initialize device
7742 *	@txqs:			the number of TX subqueues to allocate
7743 *	@rxqs:			the number of RX subqueues to allocate
7744 *
7745 *	Allocates a struct net_device with private data area for driver use
7746 *	and performs basic initialization.  Also allocates subqueue structs
7747 *	for each queue on the device.
7748 */
7749struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7750		unsigned char name_assign_type,
7751		void (*setup)(struct net_device *),
7752		unsigned int txqs, unsigned int rxqs)
7753{
7754	struct net_device *dev;
7755	size_t alloc_size;
7756	struct net_device *p;
7757
7758	BUG_ON(strlen(name) >= sizeof(dev->name));
7759
7760	if (txqs < 1) {
7761		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7762		return NULL;
7763	}
7764
7765#ifdef CONFIG_SYSFS
7766	if (rxqs < 1) {
7767		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7768		return NULL;
7769	}
7770#endif
7771
7772	alloc_size = sizeof(struct net_device);
7773	if (sizeof_priv) {
7774		/* ensure 32-byte alignment of private area */
7775		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7776		alloc_size += sizeof_priv;
7777	}
7778	/* ensure 32-byte alignment of whole construct */
7779	alloc_size += NETDEV_ALIGN - 1;
7780
7781	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7782	if (!p)
7783		p = vzalloc(alloc_size);
7784	if (!p)
7785		return NULL;
7786
7787	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7788	dev->padded = (char *)dev - (char *)p;
7789
 
 
7790	dev->pcpu_refcnt = alloc_percpu(int);
7791	if (!dev->pcpu_refcnt)
7792		goto free_dev;
 
 
 
 
7793
7794	if (dev_addr_init(dev))
7795		goto free_pcpu;
7796
7797	dev_mc_init(dev);
7798	dev_uc_init(dev);
7799
7800	dev_net_set(dev, &init_net);
7801
7802	dev->gso_max_size = GSO_MAX_SIZE;
 
7803	dev->gso_max_segs = GSO_MAX_SEGS;
 
 
 
 
 
 
 
 
 
 
 
7804
7805	INIT_LIST_HEAD(&dev->napi_list);
7806	INIT_LIST_HEAD(&dev->unreg_list);
7807	INIT_LIST_HEAD(&dev->close_list);
7808	INIT_LIST_HEAD(&dev->link_watch_list);
7809	INIT_LIST_HEAD(&dev->adj_list.upper);
7810	INIT_LIST_HEAD(&dev->adj_list.lower);
7811	INIT_LIST_HEAD(&dev->ptype_all);
7812	INIT_LIST_HEAD(&dev->ptype_specific);
 
7813#ifdef CONFIG_NET_SCHED
7814	hash_init(dev->qdisc_hash);
7815#endif
 
 
 
7816	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7817	setup(dev);
7818
7819	if (!dev->tx_queue_len) {
7820		dev->priv_flags |= IFF_NO_QUEUE;
7821		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7822	}
7823
7824	dev->num_tx_queues = txqs;
7825	dev->real_num_tx_queues = txqs;
7826	if (netif_alloc_netdev_queues(dev))
7827		goto free_all;
7828
7829#ifdef CONFIG_SYSFS
7830	dev->num_rx_queues = rxqs;
7831	dev->real_num_rx_queues = rxqs;
7832	if (netif_alloc_rx_queues(dev))
7833		goto free_all;
7834#endif
 
 
 
 
 
 
 
7835
7836	strcpy(dev->name, name);
7837	dev->name_assign_type = name_assign_type;
7838	dev->group = INIT_NETDEV_GROUP;
7839	if (!dev->ethtool_ops)
7840		dev->ethtool_ops = &default_ethtool_ops;
7841
7842	nf_hook_ingress_init(dev);
7843
7844	return dev;
7845
7846free_all:
7847	free_netdev(dev);
7848	return NULL;
7849
7850free_pcpu:
 
7851	free_percpu(dev->pcpu_refcnt);
7852free_dev:
7853	netdev_freemem(dev);
 
7854	return NULL;
7855}
7856EXPORT_SYMBOL(alloc_netdev_mqs);
7857
7858/**
7859 *	free_netdev - free network device
7860 *	@dev: device
7861 *
7862 *	This function does the last stage of destroying an allocated device
7863 * 	interface. The reference to the device object is released.
7864 *	If this is the last reference then it will be freed.
7865 *	Must be called in process context.
7866 */
7867void free_netdev(struct net_device *dev)
7868{
7869	struct napi_struct *p, *n;
7870
7871	might_sleep();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7872	netif_free_tx_queues(dev);
7873#ifdef CONFIG_SYSFS
7874	kvfree(dev->_rx);
7875#endif
7876
7877	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7878
7879	/* Flush device addresses */
7880	dev_addr_flush(dev);
7881
7882	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7883		netif_napi_del(p);
7884
 
 
 
 
7885	free_percpu(dev->pcpu_refcnt);
7886	dev->pcpu_refcnt = NULL;
 
 
 
 
 
 
 
7887
7888	/*  Compatibility with error handling in drivers */
7889	if (dev->reg_state == NETREG_UNINITIALIZED) {
7890		netdev_freemem(dev);
 
7891		return;
7892	}
7893
7894	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7895	dev->reg_state = NETREG_RELEASED;
7896
7897	/* will free via device release */
7898	put_device(&dev->dev);
7899}
7900EXPORT_SYMBOL(free_netdev);
7901
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are currently being received are done.
 *	Packets that start later are not held back.  The expedited RCU
 *	grace period is used when the RTNL is locked.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7917
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7918/**
7919 *	unregister_netdevice_queue - remove device from the kernel
7920 *	@dev: device
7921 *	@head: list
7922 *
7923 *	This function shuts down a device interface and removes it
7924 *	from the kernel tables.
7925 *	If head not NULL, device is queued to be unregistered later.
7926 *
7927 *	Callers must hold the rtnl semaphore.  You may want
7928 *	unregister_netdev() instead of this.
7929 */
7930
7931void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7932{
7933	ASSERT_RTNL();
7934
7935	if (head) {
7936		list_move_tail(&dev->unreg_list, head);
7937	} else {
7938		rollback_registered(dev);
7939		/* Finish processing unregister after unlock */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7940		net_set_todo(dev);
 
7941	}
 
 
 
7942}
7943EXPORT_SYMBOL(unregister_netdevice_queue);
7944
7945/**
7946 *	unregister_netdevice_many - unregister many devices
7947 *	@head: list of devices
7948 *
7949 *  Note: As most callers use a stack allocated list_head,
7950 *  we force a list_del() to make sure stack wont be corrupted later.
7951 */
7952void unregister_netdevice_many(struct list_head *head)
7953{
7954	struct net_device *dev;
7955
7956	if (!list_empty(head)) {
7957		rollback_registered_many(head);
7958		list_for_each_entry(dev, head, unreg_list)
7959			net_set_todo(dev);
7960		list_del(head);
7961	}
7962}
7963EXPORT_SYMBOL(unregister_netdevice_many);
7964
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Wrapper that calls unregister_netdevice() with the rtnl semaphore
 *	taken.  In general this is the function to use, rather than
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7983
7984/**
7985 *	dev_change_net_namespace - move device to different nethost namespace
7986 *	@dev: device
7987 *	@net: network namespace
7988 *	@pat: If not NULL name pattern to try if the current device name
7989 *	      is already taken in the destination network namespace.
 
 
7990 *
7991 *	This function shuts down a device interface and moves it
7992 *	to a new network namespace. On success 0 is returned, on
7993 *	a failure a netagive errno code is returned.
7994 *
7995 *	Callers must hold the rtnl semaphore.
7996 */
7997
7998int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 
7999{
8000	int err;
 
 
 
8001
8002	ASSERT_RTNL();
8003
8004	/* Don't allow namespace local devices to be moved. */
8005	err = -EINVAL;
8006	if (dev->features & NETIF_F_NETNS_LOCAL)
8007		goto out;
8008
8009	/* Ensure the device has been registrered */
8010	if (dev->reg_state != NETREG_REGISTERED)
8011		goto out;
8012
8013	/* Get out if there is nothing todo */
8014	err = 0;
8015	if (net_eq(dev_net(dev), net))
8016		goto out;
8017
8018	/* Pick the destination device name, and ensure
8019	 * we can use it in the destination network namespace.
8020	 */
8021	err = -EEXIST;
8022	if (__dev_get_by_name(net, dev->name)) {
8023		/* We get here if we can't use the current device name */
8024		if (!pat)
8025			goto out;
8026		if (dev_get_valid_name(net, dev, pat) < 0)
 
8027			goto out;
8028	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8029
8030	/*
8031	 * And now a mini version of register_netdevice unregister_netdevice.
8032	 */
8033
8034	/* If device is running close it first. */
8035	dev_close(dev);
8036
8037	/* And unlink it from device chain */
8038	err = -ENODEV;
8039	unlist_netdevice(dev);
8040
8041	synchronize_net();
8042
8043	/* Shutdown queueing discipline. */
8044	dev_shutdown(dev);
8045
8046	/* Notify protocols, that we are about to destroy
8047	   this device. They should clean all the things.
8048
8049	   Note that dev->reg_state stays at NETREG_REGISTERED.
8050	   This is wanted because this way 8021q and macvlan know
8051	   the device is just moving and can keep their slaves up.
8052	*/
8053	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8054	rcu_barrier();
8055	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8056	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
 
 
 
8057
8058	/*
8059	 *	Flush the unicast and multicast chains
8060	 */
8061	dev_uc_flush(dev);
8062	dev_mc_flush(dev);
8063
8064	/* Send a netdev-removed uevent to the old namespace */
8065	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8066	netdev_adjacent_del_links(dev);
8067
 
 
 
8068	/* Actually switch the network namespace */
8069	dev_net_set(dev, net);
 
8070
8071	/* If there is an ifindex conflict assign a new one */
8072	if (__dev_get_by_index(net, dev->ifindex))
8073		dev->ifindex = dev_new_index(net);
 
 
 
 
 
 
 
 
 
8074
8075	/* Send a netdev-add uevent to the new namespace */
8076	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8077	netdev_adjacent_add_links(dev);
8078
8079	/* Fixup kobjects */
8080	err = device_rename(&dev->dev, dev->name);
 
 
8081	WARN_ON(err);
8082
8083	/* Add the device back in the hashes */
8084	list_netdevice(dev);
8085
8086	/* Notify protocols, that a new device appeared. */
8087	call_netdevice_notifiers(NETDEV_REGISTER, dev);
8088
8089	/*
8090	 *	Prevent userspace races by waiting until the network
8091	 *	device is fully setup before sending notifications.
8092	 */
8093	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8094
8095	synchronize_net();
8096	err = 0;
8097out:
8098	return err;
8099}
8100EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8101
8102static int dev_cpu_dead(unsigned int oldcpu)
8103{
8104	struct sk_buff **list_skb;
8105	struct sk_buff *skb;
8106	unsigned int cpu;
8107	struct softnet_data *sd, *oldsd;
8108
8109	local_irq_disable();
8110	cpu = smp_processor_id();
8111	sd = &per_cpu(softnet_data, cpu);
8112	oldsd = &per_cpu(softnet_data, oldcpu);
8113
8114	/* Find end of our completion_queue. */
8115	list_skb = &sd->completion_queue;
8116	while (*list_skb)
8117		list_skb = &(*list_skb)->next;
8118	/* Append completion queue from offline CPU. */
8119	*list_skb = oldsd->completion_queue;
8120	oldsd->completion_queue = NULL;
8121
8122	/* Append output queue from offline CPU. */
8123	if (oldsd->output_queue) {
8124		*sd->output_queue_tailp = oldsd->output_queue;
8125		sd->output_queue_tailp = oldsd->output_queue_tailp;
8126		oldsd->output_queue = NULL;
8127		oldsd->output_queue_tailp = &oldsd->output_queue;
8128	}
8129	/* Append NAPI poll list from offline CPU, with one exception :
8130	 * process_backlog() must be called by cpu owning percpu backlog.
8131	 * We properly handle process_queue & input_pkt_queue later.
8132	 */
8133	while (!list_empty(&oldsd->poll_list)) {
8134		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8135							    struct napi_struct,
8136							    poll_list);
8137
8138		list_del_init(&napi->poll_list);
8139		if (napi->poll == process_backlog)
8140			napi->state = 0;
8141		else
8142			____napi_schedule(sd, napi);
8143	}
8144
8145	raise_softirq_irqoff(NET_TX_SOFTIRQ);
8146	local_irq_enable();
8147
 
 
 
 
 
 
 
 
 
8148	/* Process offline CPU's input_pkt_queue */
8149	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8150		netif_rx_ni(skb);
8151		input_queue_head_incr(oldsd);
8152	}
8153	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8154		netif_rx_ni(skb);
8155		input_queue_head_incr(oldsd);
8156	}
8157
8158	return 0;
8159}
8160
8161/**
8162 *	netdev_increment_features - increment feature set by one
8163 *	@all: current feature set
8164 *	@one: new feature set
8165 *	@mask: mask feature set
8166 *
8167 *	Computes a new feature set after adding a device with feature set
8168 *	@one to the master device with current feature set @all.  Will not
8169 *	enable anything that is off in @mask. Returns the new feature set.
8170 */
8171netdev_features_t netdev_increment_features(netdev_features_t all,
8172	netdev_features_t one, netdev_features_t mask)
8173{
8174	if (mask & NETIF_F_HW_CSUM)
8175		mask |= NETIF_F_CSUM_MASK;
8176	mask |= NETIF_F_VLAN_CHALLENGED;
8177
8178	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8179	all &= one | ~NETIF_F_ALL_FOR_ALL;
8180
8181	/* If one device supports hw checksumming, set for all. */
8182	if (all & NETIF_F_HW_CSUM)
8183		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8184
8185	return all;
8186}
8187EXPORT_SYMBOL(netdev_increment_features);
8188
8189static struct hlist_head * __net_init netdev_create_hash(void)
8190{
8191	int i;
8192	struct hlist_head *hash;
8193
8194	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8195	if (hash != NULL)
8196		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8197			INIT_HLIST_HEAD(&hash[i]);
8198
8199	return hash;
8200}
8201
8202/* Initialize per network namespace state */
8203static int __net_init netdev_init(struct net *net)
8204{
8205	if (net != &init_net)
8206		INIT_LIST_HEAD(&net->dev_base_head);
 
 
8207
8208	net->dev_name_head = netdev_create_hash();
8209	if (net->dev_name_head == NULL)
8210		goto err_name;
8211
8212	net->dev_index_head = netdev_create_hash();
8213	if (net->dev_index_head == NULL)
8214		goto err_idx;
8215
 
 
 
 
8216	return 0;
8217
8218err_idx:
8219	kfree(net->dev_name_head);
8220err_name:
8221	return -ENOMEM;
8222}
8223
8224/**
8225 *	netdev_drivername - network driver for the device
8226 *	@dev: network device
8227 *
8228 *	Determine network driver for device.
8229 */
8230const char *netdev_drivername(const struct net_device *dev)
8231{
8232	const struct device_driver *driver;
8233	const struct device *parent;
8234	const char *empty = "";
8235
8236	parent = dev->dev.parent;
8237	if (!parent)
8238		return empty;
8239
8240	driver = parent->driver;
8241	if (driver && driver->name)
8242		return driver->name;
8243	return empty;
8244}
8245
8246static void __netdev_printk(const char *level, const struct net_device *dev,
8247			    struct va_format *vaf)
8248{
8249	if (dev && dev->dev.parent) {
8250		dev_printk_emit(level[1] - '0',
8251				dev->dev.parent,
8252				"%s %s %s%s: %pV",
8253				dev_driver_string(dev->dev.parent),
8254				dev_name(dev->dev.parent),
8255				netdev_name(dev), netdev_reg_state(dev),
8256				vaf);
8257	} else if (dev) {
8258		printk("%s%s%s: %pV",
8259		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8260	} else {
8261		printk("%s(NULL net_device): %pV", level, vaf);
8262	}
8263}
8264
8265void netdev_printk(const char *level, const struct net_device *dev,
8266		   const char *format, ...)
8267{
8268	struct va_format vaf;
8269	va_list args;
8270
8271	va_start(args, format);
8272
8273	vaf.fmt = format;
8274	vaf.va = &args;
8275
8276	__netdev_printk(level, dev, &vaf);
8277
8278	va_end(args);
8279}
8280EXPORT_SYMBOL(netdev_printk);
8281
8282#define define_netdev_printk_level(func, level)			\
8283void func(const struct net_device *dev, const char *fmt, ...)	\
8284{								\
8285	struct va_format vaf;					\
8286	va_list args;						\
8287								\
8288	va_start(args, fmt);					\
8289								\
8290	vaf.fmt = fmt;						\
8291	vaf.va = &args;						\
8292								\
8293	__netdev_printk(level, dev, &vaf);			\
8294								\
8295	va_end(args);						\
8296}								\
8297EXPORT_SYMBOL(func);
8298
8299define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8300define_netdev_printk_level(netdev_alert, KERN_ALERT);
8301define_netdev_printk_level(netdev_crit, KERN_CRIT);
8302define_netdev_printk_level(netdev_err, KERN_ERR);
8303define_netdev_printk_level(netdev_warn, KERN_WARNING);
8304define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8305define_netdev_printk_level(netdev_info, KERN_INFO);
8306
/*
 * netdev_exit - per-namespace teardown hook (netdev_net_ops.exit)
 * @net: network namespace being torn down
 *
 * Frees the memory referenced by net->dev_name_head and
 * net->dev_index_head.
 * NOTE(review): presumably both were allocated by netdev_init(), the
 * matching .init hook -- confirm against that function.
 */
8307static void __net_exit netdev_exit(struct net *net)
8308{
8309	kfree(net->dev_name_head);
8310	kfree(net->dev_index_head);
8311}
8312
/*
 * Per-namespace hooks for the core device state: netdev_init() on
 * namespace setup, netdev_exit() on teardown.  Registered from
 * net_dev_init() with register_pernet_subsys().
 */
8313static struct pernet_operations __net_initdata netdev_net_ops = {
8314	.init = netdev_init,
8315	.exit = netdev_exit,
8316};
8317
8318static void __net_exit default_device_exit(struct net *net)
8319{
 
8320	struct net_device *dev, *aux;
8321	/*
8322	 * Push all migratable network devices back to the
8323	 * initial network namespace
8324	 */
8325	rtnl_lock();
8326	for_each_netdev_safe(net, dev, aux) {
8327		int err;
8328		char fb_name[IFNAMSIZ];
8329
8330		/* Ignore unmoveable devices (i.e. loopback) */
8331		if (dev->features & NETIF_F_NETNS_LOCAL)
8332			continue;
8333
8334		/* Leave virtual devices for the generic cleanup */
8335		if (dev->rtnl_link_ops)
8336			continue;
8337
8338		/* Push remaining network devices to init_net */
8339		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 
 
 
 
 
 
 
8340		err = dev_change_net_namespace(dev, &init_net, fb_name);
8341		if (err) {
8342			pr_emerg("%s: failed to move %s to init_net: %d\n",
8343				 __func__, dev->name, err);
8344			BUG();
8345		}
8346	}
8347	rtnl_unlock();
8348}
8349
8350static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8351{
8352	/* Return with the rtnl_lock held when there are no network
8353	 * devices unregistering in any network namespace in net_list.
8354	 */
8355	struct net *net;
8356	bool unregistering;
8357	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8358
8359	add_wait_queue(&netdev_unregistering_wq, &wait);
8360	for (;;) {
8361		unregistering = false;
8362		rtnl_lock();
8363		list_for_each_entry(net, net_list, exit_list) {
8364			if (net->dev_unreg_count > 0) {
8365				unregistering = true;
8366				break;
8367			}
8368		}
8369		if (!unregistering)
8370			break;
8371		__rtnl_unlock();
8372
8373		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8374	}
8375	remove_wait_queue(&netdev_unregistering_wq, &wait);
8376}
8377
8378static void __net_exit default_device_exit_batch(struct list_head *net_list)
8379{
8380	/* At exit all network devices most be removed from a network
8381	 * namespace.  Do this in the reverse order of registration.
8382	 * Do this across as many network namespaces as possible to
8383	 * improve batching efficiency.
8384	 */
8385	struct net_device *dev;
8386	struct net *net;
8387	LIST_HEAD(dev_kill_list);
8388
8389	/* To prevent network device cleanup code from dereferencing
8390	 * loopback devices or network devices that have been freed
8391	 * wait here for all pending unregistrations to complete,
8392	 * before unregistring the loopback device and allowing the
8393	 * network namespace be freed.
8394	 *
8395	 * The netdev todo list containing all network devices
8396	 * unregistrations that happen in default_device_exit_batch
8397	 * will run in the rtnl_unlock() at the end of
8398	 * default_device_exit_batch.
8399	 */
8400	rtnl_lock_unregistering(net_list);
8401	list_for_each_entry(net, net_list, exit_list) {
8402		for_each_netdev_reverse(net, dev) {
8403			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8404				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8405			else
8406				unregister_netdevice_queue(dev, &dev_kill_list);
8407		}
8408	}
8409	unregister_netdevice_many(&dev_kill_list);
8410	rtnl_unlock();
8411}
8412
/*
 * Per-namespace device teardown hooks: default_device_exit() as .exit and
 * default_device_exit_batch() as .exit_batch.  Registered from
 * net_dev_init() with register_pernet_device(), right after
 * loopback_net_ops.
 */
8413static struct pernet_operations __net_initdata default_device_ops = {
8414	.exit = default_device_exit,
8415	.exit_batch = default_device_exit_batch,
8416};
8417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8418/*
8419 *	Initialize the DEV module. At boot time this walks the device list and
8420 *	unhooks any devices that fail to initialise (normally hardware not
8421 *	present) and leaves us with a valid list of present and active devices.
8422 *
8423 */
8424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8425/*
8426 *       This is called single threaded during boot, so no need
8427 *       to take the rtnl semaphore.
8428 */
/*
 * net_dev_init - boot-time initialisation of the core network device layer
 *
 * In order: sets up the proc and kobject support, initialises the packet
 * type and offload lists, registers the per-namespace core state, prepares
 * every possible CPU's softnet_data, clears dev_boot_phase, registers the
 * loopback and default per-namespace device operations, opens the TX/RX
 * softirqs, installs the CPU-dead hotplug callback and initialises the dst
 * subsystem.  Run as a subsys_initcall.
 *
 * Return: 0 on success.  Every checked step that fails jumps to "out" with
 * rc still at its initial -ENOMEM; nothing that was already set up is
 * undone on that path.
 */
8429static int __init net_dev_init(void)
8430{
8431	int i, rc = -ENOMEM;
8432
	/* Must run while dev_boot_phase is still set; it is cleared below. */
8433	BUG_ON(!dev_boot_phase);
8434
8435	if (dev_proc_init())
8436		goto out;
8437
8438	if (netdev_kobject_init())
8439		goto out;
8440
	/* Empty packet-type lists: the catch-all list and each hash bucket. */
8441	INIT_LIST_HEAD(&ptype_all);
8442	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8443		INIT_LIST_HEAD(&ptype_base[i]);
8444
8445	INIT_LIST_HEAD(&offload_base);
8446
8447	if (register_pernet_subsys(&netdev_net_ops))
8448		goto out;
8449
8450	/*
8451	 *	Initialise the packet receive queues.
8452	 */
8453
8454	for_each_possible_cpu(i) {
8455		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8456		struct softnet_data *sd = &per_cpu(softnet_data, i);
8457
		/* Per-CPU work item that runs flush_backlog(). */
8458		INIT_WORK(flush, flush_backlog);
8459
8460		skb_queue_head_init(&sd->input_pkt_queue);
8461		skb_queue_head_init(&sd->process_queue);
8462		INIT_LIST_HEAD(&sd->poll_list);
		/* Empty output queue: the tail pointer refers to the head. */
8463		sd->output_queue_tailp = &sd->output_queue;
8464#ifdef CONFIG_RPS
8465		sd->csd.func = rps_trigger_softirq;
8466		sd->csd.info = sd;
8467		sd->cpu = i;
8468#endif
8469
		/* Backlog NAPI context: polled by process_backlog(). */
8470		sd->backlog.poll = process_backlog;
8471		sd->backlog.weight = weight_p;
8472	}
8473
8474	dev_boot_phase = 0;
8475
8476	/* The loopback device is special if any other network devices
8477	 * is present in a network namespace the loopback device must
8478	 * be present. Since we now dynamically allocate and free the
8479	 * loopback device ensure this invariant is maintained by
8480	 * keeping the loopback device as the first device on the
8481	 * list of network devices.  Ensuring the loopback devices
8482	 * is the first device that appears and the last network device
8483	 * that disappears.
8484	 */
8485	if (register_pernet_device(&loopback_net_ops))
8486		goto out;
8487
8488	if (register_pernet_device(&default_device_ops))
8489		goto out;
8490
8491	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8492	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8493
	/*
	 * A failure to install the CPU-dead callback only warns; rc is
	 * overwritten with 0 below, so it does not fail the initcall.
	 */
8494	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8495				       NULL, dev_cpu_dead);
8496	WARN_ON(rc < 0);
8497	dst_subsys_init();
8498	rc = 0;
8499out:
8500	return rc;
8501}
8502
8503subsys_initcall(net_dev_init);