1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * NET3 Protocol independent device support routines.
4 *
5 * Derived from the non IP parts of dev.c 1.0.19
6 * Authors: Ross Biro
7 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8 * Mark Evans, <evansmp@uhura.aston.ac.uk>
9 *
10 * Additional Authors:
11 * Florian la Roche <rzsfl@rz.uni-sb.de>
12 * Alan Cox <gw4pts@gw4pts.ampr.org>
13 * David Hinds <dahinds@users.sourceforge.net>
14 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15 * Adam Sulmicki <adam@cfar.umd.edu>
16 * Pekka Riikonen <priikone@poesidon.pspt.fi>
17 *
18 * Changes:
19 * D.J. Barrow : Fixed bug where dev->refcnt gets set
20 * to 2 if register_netdev gets called
21 * before net_dev_init & also removed a
22 * few lines of code in the process.
23 * Alan Cox : device private ioctl copies fields back.
24 * Alan Cox : Transmit queue code does relevant
25 * stunts to keep the queue safe.
26 * Alan Cox : Fixed double lock.
27 * Alan Cox : Fixed promisc NULL pointer trap
28 * ???????? : Support the full private ioctl range
29 * Alan Cox : Moved ioctl permission check into
30 * drivers
31 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
32 * Alan Cox : 100 backlog just doesn't cut it when
33 * you start doing multicast video 8)
34 * Alan Cox : Rewrote net_bh and list manager.
35 * Alan Cox : Fix ETH_P_ALL echoback lengths.
36 * Alan Cox : Took out transmit every packet pass
37 * Saved a few bytes in the ioctl handler
38 * Alan Cox : Network driver sets packet type before
39 * calling netif_rx. Saves a function
40 * call a packet.
41 * Alan Cox : Hashed net_bh()
42 * Richard Kooijman: Timestamp fixes.
43 * Alan Cox : Wrong field in SIOCGIFDSTADDR
44 * Alan Cox : Device lock protection.
45 * Alan Cox : Fixed nasty side effect of device close
46 * changes.
47 * Rudi Cilibrasi : Pass the right thing to
48 * set_mac_address()
49 * Dave Miller : 32bit quantity for the device lock to
50 * make it work out on a Sparc.
51 * Bjorn Ekwall : Added KERNELD hack.
52 * Alan Cox : Cleaned up the backlog initialise.
53 * Craig Metz : SIOCGIFCONF fix if space for under
54 * 1 device.
55 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
56 * is no device open function.
57 * Andi Kleen : Fix error reporting for SIOCGIFCONF
58 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
59 * Cyrus Durgin : Cleaned for KMOD
60 * Adam Sulmicki : Bug Fix : Network Device Unload
61 * A network device unload needs to purge
62 * the backlog queue.
63 * Paul Rusty Russell : SIOCSIFNAME
64 * Pekka Riikonen : Netdev boot-time settings code
65 * Andrew Morton : Make unregister_netdevice wait
66 * indefinitely on dev->refcnt
67 * J Hadi Salim : - Backlog queue sampling
68 * - netif_rx() feedback
69 */
70
71#include <linux/uaccess.h>
72#include <linux/bitmap.h>
73#include <linux/capability.h>
74#include <linux/cpu.h>
75#include <linux/types.h>
76#include <linux/kernel.h>
77#include <linux/hash.h>
78#include <linux/slab.h>
79#include <linux/sched.h>
80#include <linux/sched/mm.h>
81#include <linux/mutex.h>
82#include <linux/rwsem.h>
83#include <linux/string.h>
84#include <linux/mm.h>
85#include <linux/socket.h>
86#include <linux/sockios.h>
87#include <linux/errno.h>
88#include <linux/interrupt.h>
89#include <linux/if_ether.h>
90#include <linux/netdevice.h>
91#include <linux/etherdevice.h>
92#include <linux/ethtool.h>
93#include <linux/skbuff.h>
94#include <linux/kthread.h>
95#include <linux/bpf.h>
96#include <linux/bpf_trace.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <net/busy_poll.h>
100#include <linux/rtnetlink.h>
101#include <linux/stat.h>
102#include <net/dsa.h>
103#include <net/dst.h>
104#include <net/dst_metadata.h>
105#include <net/gro.h>
106#include <net/pkt_sched.h>
107#include <net/pkt_cls.h>
108#include <net/checksum.h>
109#include <net/xfrm.h>
110#include <net/tcx.h>
111#include <linux/highmem.h>
112#include <linux/init.h>
113#include <linux/module.h>
114#include <linux/netpoll.h>
115#include <linux/rcupdate.h>
116#include <linux/delay.h>
117#include <net/iw_handler.h>
118#include <asm/current.h>
119#include <linux/audit.h>
120#include <linux/dmaengine.h>
121#include <linux/err.h>
122#include <linux/ctype.h>
123#include <linux/if_arp.h>
124#include <linux/if_vlan.h>
125#include <linux/ip.h>
126#include <net/ip.h>
127#include <net/mpls.h>
128#include <linux/ipv6.h>
129#include <linux/in.h>
130#include <linux/jhash.h>
131#include <linux/random.h>
132#include <trace/events/napi.h>
133#include <trace/events/net.h>
134#include <trace/events/skb.h>
135#include <trace/events/qdisc.h>
136#include <trace/events/xdp.h>
137#include <linux/inetdevice.h>
138#include <linux/cpu_rmap.h>
139#include <linux/static_key.h>
140#include <linux/hashtable.h>
141#include <linux/vmalloc.h>
142#include <linux/if_macvlan.h>
143#include <linux/errqueue.h>
144#include <linux/hrtimer.h>
145#include <linux/netfilter_netdev.h>
146#include <linux/crash_dump.h>
147#include <linux/sctp.h>
148#include <net/udp_tunnel.h>
149#include <linux/net_namespace.h>
150#include <linux/indirect_call_wrapper.h>
151#include <net/devlink.h>
152#include <linux/pm_runtime.h>
153#include <linux/prandom.h>
154#include <linux/once_lite.h>
155#include <net/netdev_rx_queue.h>
156#include <net/page_pool/types.h>
157#include <net/page_pool/helpers.h>
158#include <net/rps.h>
159
160#include "dev.h"
161#include "net-sysfs.h"
162
163static DEFINE_SPINLOCK(ptype_lock);
164struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
165
166static int netif_rx_internal(struct sk_buff *skb);
167static int call_netdevice_notifiers_extack(unsigned long val,
168 struct net_device *dev,
169 struct netlink_ext_ack *extack);
170
171static DEFINE_MUTEX(ifalias_mutex);
172
173/* protects napi_hash addition/deletion and napi_gen_id */
174static DEFINE_SPINLOCK(napi_hash_lock);
175
176static unsigned int napi_gen_id = NR_CPUS;
177static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
178
179static DECLARE_RWSEM(devnet_rename_sem);
180
181static inline void dev_base_seq_inc(struct net *net)
182{
183 unsigned int val = net->dev_base_seq + 1;
184
185 WRITE_ONCE(net->dev_base_seq, val ?: 1);
186}
187
188static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
189{
190 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
191
192 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
193}
194
195static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
196{
197 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
198}
199
200static inline void rps_lock_irqsave(struct softnet_data *sd,
201 unsigned long *flags)
202{
203 if (IS_ENABLED(CONFIG_RPS))
204 spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
205 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
206 local_irq_save(*flags);
207}
208
209static inline void rps_lock_irq_disable(struct softnet_data *sd)
210{
211 if (IS_ENABLED(CONFIG_RPS))
212 spin_lock_irq(&sd->input_pkt_queue.lock);
213 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
214 local_irq_disable();
215}
216
217static inline void rps_unlock_irq_restore(struct softnet_data *sd,
218 unsigned long *flags)
219{
220 if (IS_ENABLED(CONFIG_RPS))
221 spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
222 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
223 local_irq_restore(*flags);
224}
225
226static inline void rps_unlock_irq_enable(struct softnet_data *sd)
227{
228 if (IS_ENABLED(CONFIG_RPS))
229 spin_unlock_irq(&sd->input_pkt_queue.lock);
230 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
231 local_irq_enable();
232}
233
234static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
235 const char *name)
236{
237 struct netdev_name_node *name_node;
238
239 name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
240 if (!name_node)
241 return NULL;
242 INIT_HLIST_NODE(&name_node->hlist);
243 name_node->dev = dev;
244 name_node->name = name;
245 return name_node;
246}
247
248static struct netdev_name_node *
249netdev_name_node_head_alloc(struct net_device *dev)
250{
251 struct netdev_name_node *name_node;
252
253 name_node = netdev_name_node_alloc(dev, dev->name);
254 if (!name_node)
255 return NULL;
256 INIT_LIST_HEAD(&name_node->list);
257 return name_node;
258}
259
260static void netdev_name_node_free(struct netdev_name_node *name_node)
261{
262 kfree(name_node);
263}
264
265static void netdev_name_node_add(struct net *net,
266 struct netdev_name_node *name_node)
267{
268 hlist_add_head_rcu(&name_node->hlist,
269 dev_name_hash(net, name_node->name));
270}
271
272static void netdev_name_node_del(struct netdev_name_node *name_node)
273{
274 hlist_del_rcu(&name_node->hlist);
275}
276
277static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
278 const char *name)
279{
280 struct hlist_head *head = dev_name_hash(net, name);
281 struct netdev_name_node *name_node;
282
283 hlist_for_each_entry(name_node, head, hlist)
284 if (!strcmp(name_node->name, name))
285 return name_node;
286 return NULL;
287}
288
289static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
290 const char *name)
291{
292 struct hlist_head *head = dev_name_hash(net, name);
293 struct netdev_name_node *name_node;
294
295 hlist_for_each_entry_rcu(name_node, head, hlist)
296 if (!strcmp(name_node->name, name))
297 return name_node;
298 return NULL;
299}
300
301bool netdev_name_in_use(struct net *net, const char *name)
302{
303 return netdev_name_node_lookup(net, name);
304}
305EXPORT_SYMBOL(netdev_name_in_use);
306
307int netdev_name_node_alt_create(struct net_device *dev, const char *name)
308{
309 struct netdev_name_node *name_node;
310 struct net *net = dev_net(dev);
311
312 name_node = netdev_name_node_lookup(net, name);
313 if (name_node)
314 return -EEXIST;
315 name_node = netdev_name_node_alloc(dev, name);
316 if (!name_node)
317 return -ENOMEM;
318 netdev_name_node_add(net, name_node);
319 /* The node that holds dev->name acts as a head of per-device list. */
320 list_add_tail_rcu(&name_node->list, &dev->name_node->list);
321
322 return 0;
323}
324
325static void netdev_name_node_alt_free(struct rcu_head *head)
326{
327 struct netdev_name_node *name_node =
328 container_of(head, struct netdev_name_node, rcu);
329
330 kfree(name_node->name);
331 netdev_name_node_free(name_node);
332}
333
334static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
335{
336 netdev_name_node_del(name_node);
337 list_del(&name_node->list);
338 call_rcu(&name_node->rcu, netdev_name_node_alt_free);
339}
340
341int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
342{
343 struct netdev_name_node *name_node;
344 struct net *net = dev_net(dev);
345
346 name_node = netdev_name_node_lookup(net, name);
347 if (!name_node)
348 return -ENOENT;
349 /* lookup might have found our primary name or a name belonging
350 * to another device.
351 */
352 if (name_node == dev->name_node || name_node->dev != dev)
353 return -EINVAL;
354
355 __netdev_name_node_alt_destroy(name_node);
356 return 0;
357}
358
359static void netdev_name_node_alt_flush(struct net_device *dev)
360{
361 struct netdev_name_node *name_node, *tmp;
362
363 list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
364 list_del(&name_node->list);
365 netdev_name_node_alt_free(&name_node->rcu);
366 }
367}
368
369/* Device list insertion */
370static void list_netdevice(struct net_device *dev)
371{
372 struct netdev_name_node *name_node;
373 struct net *net = dev_net(dev);
374
375 ASSERT_RTNL();
376
377 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
378 netdev_name_node_add(net, dev->name_node);
379 hlist_add_head_rcu(&dev->index_hlist,
380 dev_index_hash(net, dev->ifindex));
381
382 netdev_for_each_altname(dev, name_node)
383 netdev_name_node_add(net, name_node);
384
385 /* We reserved the ifindex, this can't fail */
386 WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
387
388 dev_base_seq_inc(net);
389}
390
391/* Device list removal
392 * caller must respect an RCU grace period before freeing/reusing dev
393 */
394static void unlist_netdevice(struct net_device *dev)
395{
396 struct netdev_name_node *name_node;
397 struct net *net = dev_net(dev);
398
399 ASSERT_RTNL();
400
401 xa_erase(&net->dev_by_index, dev->ifindex);
402
403 netdev_for_each_altname(dev, name_node)
404 netdev_name_node_del(name_node);
405
406 /* Unlink dev from the device chain */
407 list_del_rcu(&dev->dev_list);
408 netdev_name_node_del(dev->name_node);
409 hlist_del_rcu(&dev->index_hlist);
410
411 dev_base_seq_inc(dev_net(dev));
412}
413
414/*
415 * Our notifier list
416 */
417
418static RAW_NOTIFIER_HEAD(netdev_chain);
419
420/*
421 * Device drivers call our routines to queue packets here. We empty the
422 * queue in the local softnet handler.
423 */
424
425DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
426EXPORT_PER_CPU_SYMBOL(softnet_data);
427
428/* Page_pool has a lockless array/stack to alloc/recycle pages.
429 * PP consumers must pay attention to run APIs in the appropriate context
430 * (e.g. NAPI context).
431 */
432static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
433
434#ifdef CONFIG_LOCKDEP
435/*
436 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
437 * according to dev->type
438 */
439static const unsigned short netdev_lock_type[] = {
440 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
441 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
442 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
443 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
444 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
445 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
446 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
447 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
448 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
449 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
450 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
451 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
452 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
453 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
454 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
455
456static const char *const netdev_lock_name[] = {
457 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
458 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
459 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
460 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
461 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
462 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
463 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
464 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
465 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
466 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
467 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
468 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
469 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
470 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
471 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
472
473static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
474static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
475
476static inline unsigned short netdev_lock_pos(unsigned short dev_type)
477{
478 int i;
479
480 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
481 if (netdev_lock_type[i] == dev_type)
482 return i;
483 /* the last key is used by default */
484 return ARRAY_SIZE(netdev_lock_type) - 1;
485}
486
487static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
488 unsigned short dev_type)
489{
490 int i;
491
492 i = netdev_lock_pos(dev_type);
493 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
494 netdev_lock_name[i]);
495}
496
497static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
498{
499 int i;
500
501 i = netdev_lock_pos(dev->type);
502 lockdep_set_class_and_name(&dev->addr_list_lock,
503 &netdev_addr_lock_key[i],
504 netdev_lock_name[i]);
505}
506#else
507static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
508 unsigned short dev_type)
509{
510}
511
512static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
513{
514}
515#endif
516
517/*******************************************************************************
518 *
519 * Protocol management and registration routines
520 *
521 *******************************************************************************/
522
523
524/*
525 * Add a protocol ID to the list. Now that the input handler is
526 * smarter we can dispense with all the messy stuff that used to be
527 * here.
528 *
529 * BEWARE!!! Protocol handlers, mangling input packets,
530 * MUST BE last in hash buckets and checking protocol handlers
531 * MUST start from promiscuous ptype_all chain in net_bh.
532 * It is true now, do not change it.
533 * Explanation follows: if protocol handler, mangling packet, will
534 * be the first on list, it is not able to sense, that packet
535 * is cloned and should be copied-on-write, so that it will
536 * change it and subsequent readers will get broken packet.
537 * --ANK (980803)
538 */
539
540static inline struct list_head *ptype_head(const struct packet_type *pt)
541{
542 if (pt->type == htons(ETH_P_ALL))
543 return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
544 else
545 return pt->dev ? &pt->dev->ptype_specific :
546 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
547}
548
549/**
550 * dev_add_pack - add packet handler
551 * @pt: packet type declaration
552 *
553 * Add a protocol handler to the networking stack. The passed &packet_type
554 * is linked into kernel lists and may not be freed until it has been
555 * removed from the kernel lists.
556 *
557 * This call does not sleep, therefore it cannot guarantee that
558 * CPUs in the middle of receiving packets will see the new packet
559 * type (until the next received packet).
560 */
561
562void dev_add_pack(struct packet_type *pt)
563{
564 struct list_head *head = ptype_head(pt);
565
566 spin_lock(&ptype_lock);
567 list_add_rcu(&pt->list, head);
568 spin_unlock(&ptype_lock);
569}
570EXPORT_SYMBOL(dev_add_pack);
571
572/**
573 * __dev_remove_pack - remove packet handler
574 * @pt: packet type declaration
575 *
576 * Remove a protocol handler that was previously added to the kernel
577 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
578 * from the kernel lists and can be freed or reused once this function
579 * returns.
580 *
581 * The packet type might still be in use by receivers
582 * and must not be freed until after all the CPU's have gone
583 * through a quiescent state.
584 */
585void __dev_remove_pack(struct packet_type *pt)
586{
587 struct list_head *head = ptype_head(pt);
588 struct packet_type *pt1;
589
590 spin_lock(&ptype_lock);
591
592 list_for_each_entry(pt1, head, list) {
593 if (pt == pt1) {
594 list_del_rcu(&pt->list);
595 goto out;
596 }
597 }
598
599 pr_warn("dev_remove_pack: %p not found\n", pt);
600out:
601 spin_unlock(&ptype_lock);
602}
603EXPORT_SYMBOL(__dev_remove_pack);
604
605/**
606 * dev_remove_pack - remove packet handler
607 * @pt: packet type declaration
608 *
609 * Remove a protocol handler that was previously added to the kernel
610 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
611 * from the kernel lists and can be freed or reused once this function
612 * returns.
613 *
614 * This call sleeps to guarantee that no CPU is looking at the packet
615 * type after return.
616 */
617void dev_remove_pack(struct packet_type *pt)
618{
619 __dev_remove_pack(pt);
620
621 synchronize_net();
622}
623EXPORT_SYMBOL(dev_remove_pack);
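
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): registering and removing a packet handler.  The my_proto_* names
 * are hypothetical; ETH_P_802_EX1 is the local experimental ethertype.
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_proto_pt __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_802_EX1),
 *		.func	= my_proto_rcv,
 *	};
 *
 * Module init would call dev_add_pack(&my_proto_pt); module exit would
 * call dev_remove_pack(&my_proto_pt).
 */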
624
625
626/*******************************************************************************
627 *
628 * Device Interface Subroutines
629 *
630 *******************************************************************************/
631
632/**
633 * dev_get_iflink - get 'iflink' value of an interface
634 * @dev: targeted interface
635 *
636 * Indicates the ifindex the interface is linked to.
637 * Physical interfaces have the same 'ifindex' and 'iflink' values.
638 */
639
640int dev_get_iflink(const struct net_device *dev)
641{
642 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
643 return dev->netdev_ops->ndo_get_iflink(dev);
644
645 return READ_ONCE(dev->ifindex);
646}
647EXPORT_SYMBOL(dev_get_iflink);
648
649/**
650 * dev_fill_metadata_dst - Retrieve tunnel egress information.
651 * @dev: targeted interface
652 * @skb: The packet.
653 *
654 * For better visibility of tunnel traffic, OVS needs to retrieve
655 * egress tunnel information for a packet. The following API allows
656 * the caller to get this info.
657 */
658int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
659{
660 struct ip_tunnel_info *info;
661
662 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
663 return -EINVAL;
664
665 info = skb_tunnel_info_unclone(skb);
666 if (!info)
667 return -ENOMEM;
668 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
669 return -EINVAL;
670
671 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
672}
673EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
674
675static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
676{
677 int k = stack->num_paths++;
678
679 if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
680 return NULL;
681
682 return &stack->path[k];
683}
684
685int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
686 struct net_device_path_stack *stack)
687{
688 const struct net_device *last_dev;
689 struct net_device_path_ctx ctx = {
690 .dev = dev,
691 };
692 struct net_device_path *path;
693 int ret = 0;
694
695 memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
696 stack->num_paths = 0;
697 while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
698 last_dev = ctx.dev;
699 path = dev_fwd_path(stack);
700 if (!path)
701 return -1;
702
703 memset(path, 0, sizeof(struct net_device_path));
704 ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
705 if (ret < 0)
706 return -1;
707
708 if (WARN_ON_ONCE(last_dev == ctx.dev))
709 return -1;
710 }
711
712 if (!ctx.dev)
713 return ret;
714
715 path = dev_fwd_path(stack);
716 if (!path)
717 return -1;
718 path->type = DEV_PATH_ETHERNET;
719 path->dev = ctx.dev;
720
721 return ret;
722}
723EXPORT_SYMBOL_GPL(dev_fill_forward_path);
724
725/**
726 * __dev_get_by_name - find a device by its name
727 * @net: the applicable net namespace
728 * @name: name to find
729 *
730 * Find an interface by name. Must be called under RTNL semaphore.
731 * If the name is found a pointer to the device is returned.
732 * If the name is not found then %NULL is returned. The
733 * reference counters are not incremented so the caller must be
734 * careful with locks.
735 */
736
737struct net_device *__dev_get_by_name(struct net *net, const char *name)
738{
739 struct netdev_name_node *node_name;
740
741 node_name = netdev_name_node_lookup(net, name);
742 return node_name ? node_name->dev : NULL;
743}
744EXPORT_SYMBOL(__dev_get_by_name);
745
746/**
747 * dev_get_by_name_rcu - find a device by its name
748 * @net: the applicable net namespace
749 * @name: name to find
750 *
751 * Find an interface by name.
752 * If the name is found a pointer to the device is returned.
753 * If the name is not found then %NULL is returned.
754 * The reference counters are not incremented so the caller must be
755 * careful with locks. The caller must hold RCU lock.
756 */
757
758struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
759{
760 struct netdev_name_node *node_name;
761
762 node_name = netdev_name_node_lookup_rcu(net, name);
763 return node_name ? node_name->dev : NULL;
764}
765EXPORT_SYMBOL(dev_get_by_name_rcu);
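
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): an RCU-protected lookup.  The returned pointer is only valid
 * inside the RCU read-side critical section; "eth0" is a hypothetical
 * interface name.
 *
 *	struct net_device *dev;
 *	int mtu = 0;
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");
 *	if (dev)
 *		mtu = READ_ONCE(dev->mtu);
 *	rcu_read_unlock();
 */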
766
767/* Deprecated for new users, call netdev_get_by_name() instead */
768struct net_device *dev_get_by_name(struct net *net, const char *name)
769{
770 struct net_device *dev;
771
772 rcu_read_lock();
773 dev = dev_get_by_name_rcu(net, name);
774 dev_hold(dev);
775 rcu_read_unlock();
776 return dev;
777}
778EXPORT_SYMBOL(dev_get_by_name);
779
780/**
781 * netdev_get_by_name() - find a device by its name
782 * @net: the applicable net namespace
783 * @name: name to find
784 * @tracker: tracking object for the acquired reference
785 * @gfp: allocation flags for the tracker
786 *
787 * Find an interface by name. This can be called from any
788 * context and does its own locking. The returned handle has
789 * the usage count incremented and the caller must use netdev_put() to
790 * release it when it is no longer needed. %NULL is returned if no
791 * matching device is found.
792 */
793struct net_device *netdev_get_by_name(struct net *net, const char *name,
794 netdevice_tracker *tracker, gfp_t gfp)
795{
796 struct net_device *dev;
797
798 dev = dev_get_by_name(net, name);
799 if (dev)
800 netdev_tracker_alloc(dev, tracker, gfp);
801 return dev;
802}
803EXPORT_SYMBOL(netdev_get_by_name);
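
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): taking and releasing a tracked reference.  "eth0" is a
 * hypothetical interface name.
 *
 *	netdevice_tracker tracker;
 *	struct net_device *dev;
 *
 *	dev = netdev_get_by_name(&init_net, "eth0", &tracker, GFP_KERNEL);
 *	if (dev) {
 *		... use dev; the reference pins it until released ...
 *		netdev_put(dev, &tracker);
 *	}
 */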
804
805/**
806 * __dev_get_by_index - find a device by its ifindex
807 * @net: the applicable net namespace
808 * @ifindex: index of device
809 *
810 * Search for an interface by index. Returns %NULL if the device
811 * is not found or a pointer to the device. The device has not
812 * had its reference counter increased so the caller must be careful
813 * about locking. The caller must hold the RTNL semaphore.
814 */
815
816struct net_device *__dev_get_by_index(struct net *net, int ifindex)
817{
818 struct net_device *dev;
819 struct hlist_head *head = dev_index_hash(net, ifindex);
820
821 hlist_for_each_entry(dev, head, index_hlist)
822 if (dev->ifindex == ifindex)
823 return dev;
824
825 return NULL;
826}
827EXPORT_SYMBOL(__dev_get_by_index);
828
829/**
830 * dev_get_by_index_rcu - find a device by its ifindex
831 * @net: the applicable net namespace
832 * @ifindex: index of device
833 *
834 * Search for an interface by index. Returns %NULL if the device
835 * is not found or a pointer to the device. The device has not
836 * had its reference counter increased so the caller must be careful
837 * about locking. The caller must hold RCU lock.
838 */
839
840struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
841{
842 struct net_device *dev;
843 struct hlist_head *head = dev_index_hash(net, ifindex);
844
845 hlist_for_each_entry_rcu(dev, head, index_hlist)
846 if (dev->ifindex == ifindex)
847 return dev;
848
849 return NULL;
850}
851EXPORT_SYMBOL(dev_get_by_index_rcu);
852
853/* Deprecated for new users, call netdev_get_by_index() instead */
854struct net_device *dev_get_by_index(struct net *net, int ifindex)
855{
856 struct net_device *dev;
857
858 rcu_read_lock();
859 dev = dev_get_by_index_rcu(net, ifindex);
860 dev_hold(dev);
861 rcu_read_unlock();
862 return dev;
863}
864EXPORT_SYMBOL(dev_get_by_index);
865
866/**
867 * netdev_get_by_index() - find a device by its ifindex
868 * @net: the applicable net namespace
869 * @ifindex: index of device
870 * @tracker: tracking object for the acquired reference
871 * @gfp: allocation flags for the tracker
872 *
873 * Search for an interface by index. Returns NULL if the device
874 * is not found or a pointer to the device. The device returned has
875 * had a reference added and the pointer is safe until the user calls
876 * netdev_put() to indicate they have finished with it.
877 */
878struct net_device *netdev_get_by_index(struct net *net, int ifindex,
879 netdevice_tracker *tracker, gfp_t gfp)
880{
881 struct net_device *dev;
882
883 dev = dev_get_by_index(net, ifindex);
884 if (dev)
885 netdev_tracker_alloc(dev, tracker, gfp);
886 return dev;
887}
888EXPORT_SYMBOL(netdev_get_by_index);
889
890/**
891 * dev_get_by_napi_id - find a device by napi_id
892 * @napi_id: ID of the NAPI struct
893 *
894 * Search for an interface by NAPI ID. Returns %NULL if the device
895 * is not found or a pointer to the device. The device has not had
896 * its reference counter increased so the caller must be careful
897 * about locking. The caller must hold RCU lock.
898 */
899
900struct net_device *dev_get_by_napi_id(unsigned int napi_id)
901{
902 struct napi_struct *napi;
903
904 WARN_ON_ONCE(!rcu_read_lock_held());
905
906 if (napi_id < MIN_NAPI_ID)
907 return NULL;
908
909 napi = napi_by_id(napi_id);
910
911 return napi ? napi->dev : NULL;
912}
913EXPORT_SYMBOL(dev_get_by_napi_id);
914
915/**
916 * netdev_get_name - get a netdevice name, knowing its ifindex.
917 * @net: network namespace
918 * @name: a pointer to the buffer where the name will be stored.
919 * @ifindex: the ifindex of the interface to get the name from.
920 */
921int netdev_get_name(struct net *net, char *name, int ifindex)
922{
923 struct net_device *dev;
924 int ret;
925
926 down_read(&devnet_rename_sem);
927 rcu_read_lock();
928
929 dev = dev_get_by_index_rcu(net, ifindex);
930 if (!dev) {
931 ret = -ENODEV;
932 goto out;
933 }
934
935 strcpy(name, dev->name);
936
937 ret = 0;
938out:
939 rcu_read_unlock();
940 up_read(&devnet_rename_sem);
941 return ret;
942}
943
944/**
945 * dev_getbyhwaddr_rcu - find a device by its hardware address
946 * @net: the applicable net namespace
947 * @type: media type of device
948 * @ha: hardware address
949 *
950 * Search for an interface by MAC address. Returns NULL if the device
951 * is not found or a pointer to the device.
952 * The caller must hold RCU or RTNL.
953 * The returned device has not had its ref count increased
954 * and the caller must therefore be careful about locking
955 *
956 */
957
958struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
959 const char *ha)
960{
961 struct net_device *dev;
962
963 for_each_netdev_rcu(net, dev)
964 if (dev->type == type &&
965 !memcmp(dev->dev_addr, ha, dev->addr_len))
966 return dev;
967
968 return NULL;
969}
970EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
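
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): looking up a device by hardware address under RCU.  The address
 * below is a hypothetical locally administered MAC.
 *
 *	static const unsigned char mac[ETH_ALEN] = {
 *		0x02, 0x00, 0x00, 0x00, 0x00, 0x01
 *	};
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(&init_net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		netdev_info(dev, "found by hardware address\n");
 *	rcu_read_unlock();
 */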
971
972struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
973{
974 struct net_device *dev, *ret = NULL;
975
976 rcu_read_lock();
977 for_each_netdev_rcu(net, dev)
978 if (dev->type == type) {
979 dev_hold(dev);
980 ret = dev;
981 break;
982 }
983 rcu_read_unlock();
984 return ret;
985}
986EXPORT_SYMBOL(dev_getfirstbyhwtype);
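
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): the device returned by dev_getfirstbyhwtype() is held, so the
 * caller drops the reference with dev_put() when done.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 */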
987
988/**
989 * __dev_get_by_flags - find any device with given flags
990 * @net: the applicable net namespace
991 * @if_flags: IFF_* values
992 * @mask: bitmask of bits in if_flags to check
993 *
994 * Search for any interface with the given flags. Returns NULL if a device
995 * is not found or a pointer to the device. Must be called inside
996 * rtnl_lock(), and result refcount is unchanged.
997 */
998
999struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1000 unsigned short mask)
1001{
1002 struct net_device *dev, *ret;
1003
1004 ASSERT_RTNL();
1005
1006 ret = NULL;
1007 for_each_netdev(net, dev) {
1008 if (((dev->flags ^ if_flags) & mask) == 0) {
1009 ret = dev;
1010 break;
1011 }
1012 }
1013 return ret;
1014}
1015EXPORT_SYMBOL(__dev_get_by_flags);
1016
1017/**
1018 * dev_valid_name - check if name is okay for network device
1019 * @name: name string
1020 *
1021 * Network device names need to be valid file names to
1022 * allow sysfs to work. We also disallow any kind of
1023 * whitespace.
1024 */
1025bool dev_valid_name(const char *name)
1026{
1027 if (*name == '\0')
1028 return false;
1029 if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1030 return false;
1031 if (!strcmp(name, ".") || !strcmp(name, ".."))
1032 return false;
1033
1034 while (*name) {
1035 if (*name == '/' || *name == ':' || isspace(*name))
1036 return false;
1037 name++;
1038 }
1039 return true;
1040}
1041EXPORT_SYMBOL(dev_valid_name);
1042
1043/**
1044 * __dev_alloc_name - allocate a name for a device
1045 * @net: network namespace to allocate the device name in
1046 * @name: name format string
1047 * @res: result name string
1048 *
1049 * Passed a format string - eg "lt%d" it will try and find a suitable
1050 * id. It scans list of devices to build up a free map, then chooses
1051 * the first empty slot. The caller must hold the dev_base or rtnl lock
1052 * while allocating the name and adding the device in order to avoid
1053 * duplicates.
1054 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1055 * Returns the number of the unit assigned or a negative errno code.
1056 */
1057
1058static int __dev_alloc_name(struct net *net, const char *name, char *res)
1059{
1060 int i = 0;
1061 const char *p;
1062 const int max_netdevices = 8*PAGE_SIZE;
1063 unsigned long *inuse;
1064 struct net_device *d;
1065 char buf[IFNAMSIZ];
1066
1067 /* Verify the string as this thing may have come from the user.
1068 * There must be one "%d" and no other "%" characters.
1069 */
1070 p = strchr(name, '%');
1071 if (!p || p[1] != 'd' || strchr(p + 2, '%'))
1072 return -EINVAL;
1073
1074 /* Use one page as a bit array of possible slots */
1075 inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
1076 if (!inuse)
1077 return -ENOMEM;
1078
1079 for_each_netdev(net, d) {
1080 struct netdev_name_node *name_node;
1081
1082 netdev_for_each_altname(d, name_node) {
1083 if (!sscanf(name_node->name, name, &i))
1084 continue;
1085 if (i < 0 || i >= max_netdevices)
1086 continue;
1087
1088 /* avoid cases where sscanf is not exact inverse of printf */
1089 snprintf(buf, IFNAMSIZ, name, i);
1090 if (!strncmp(buf, name_node->name, IFNAMSIZ))
1091 __set_bit(i, inuse);
1092 }
1093 if (!sscanf(d->name, name, &i))
1094 continue;
1095 if (i < 0 || i >= max_netdevices)
1096 continue;
1097
1098 /* avoid cases where sscanf is not exact inverse of printf */
1099 snprintf(buf, IFNAMSIZ, name, i);
1100 if (!strncmp(buf, d->name, IFNAMSIZ))
1101 __set_bit(i, inuse);
1102 }
1103
1104 i = find_first_zero_bit(inuse, max_netdevices);
1105 bitmap_free(inuse);
1106 if (i == max_netdevices)
1107 return -ENFILE;
1108
1109 /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
1110 strscpy(buf, name, IFNAMSIZ);
1111 snprintf(res, IFNAMSIZ, buf, i);
1112 return i;
1113}
1114
1115/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
1116static int dev_prep_valid_name(struct net *net, struct net_device *dev,
1117 const char *want_name, char *out_name,
1118 int dup_errno)
1119{
1120 if (!dev_valid_name(want_name))
1121 return -EINVAL;
1122
1123 if (strchr(want_name, '%'))
1124 return __dev_alloc_name(net, want_name, out_name);
1125
1126 if (netdev_name_in_use(net, want_name))
1127 return -dup_errno;
1128 if (out_name != want_name)
1129 strscpy(out_name, want_name, IFNAMSIZ);
1130 return 0;
1131}
1132
1133/**
1134 * dev_alloc_name - allocate a name for a device
1135 * @dev: device
1136 * @name: name format string
1137 *
1138 * Passed a format string - eg "lt%d" it will try and find a suitable
1139 * id. It scans list of devices to build up a free map, then chooses
1140 * the first empty slot. The caller must hold the dev_base or rtnl lock
1141 * while allocating the name and adding the device in order to avoid
1142 * duplicates.
1143 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1144 * Returns the number of the unit assigned or a negative errno code.
1145 */
1146
1147int dev_alloc_name(struct net_device *dev, const char *name)
1148{
1149 return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
1150}
1151EXPORT_SYMBOL(dev_alloc_name);
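
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): a driver requesting a unit-numbered name during setup, under
 * RTNL.  The "mydev%d" prefix is hypothetical.
 *
 *	int unit;
 *
 *	unit = dev_alloc_name(dev, "mydev%d");
 *	if (unit < 0)
 *		return unit;
 *
 * After this, dev->name holds e.g. "mydev0" and unit is the assigned
 * unit number.
 */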
1152
1153static int dev_get_valid_name(struct net *net, struct net_device *dev,
1154 const char *name)
1155{
1156 int ret;
1157
1158 ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
1159 return ret < 0 ? ret : 0;
1160}
1161
1162/**
1163 * dev_change_name - change name of a device
1164 * @dev: device
1165 * @newname: name (or format string) must be at least IFNAMSIZ
1166 *
1167 * Change the name of a device. A format string such as "eth%d" may be
1168 * passed for wildcarding.
1169 */
1170int dev_change_name(struct net_device *dev, const char *newname)
1171{
1172 unsigned char old_assign_type;
1173 char oldname[IFNAMSIZ];
1174 int err = 0;
1175 int ret;
1176 struct net *net;
1177
1178 ASSERT_RTNL();
1179 BUG_ON(!dev_net(dev));
1180
1181 net = dev_net(dev);
1182
1183 down_write(&devnet_rename_sem);
1184
1185 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1186 up_write(&devnet_rename_sem);
1187 return 0;
1188 }
1189
1190 memcpy(oldname, dev->name, IFNAMSIZ);
1191
1192 err = dev_get_valid_name(net, dev, newname);
1193 if (err < 0) {
1194 up_write(&devnet_rename_sem);
1195 return err;
1196 }
1197
1198 if (oldname[0] && !strchr(oldname, '%'))
1199 netdev_info(dev, "renamed from %s%s\n", oldname,
1200 dev->flags & IFF_UP ? " (while UP)" : "");
1201
1202 old_assign_type = dev->name_assign_type;
1203 WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
1204
1205rollback:
1206 ret = device_rename(&dev->dev, dev->name);
1207 if (ret) {
1208 memcpy(dev->name, oldname, IFNAMSIZ);
1209 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1210 up_write(&devnet_rename_sem);
1211 return ret;
1212 }
1213
1214 up_write(&devnet_rename_sem);
1215
1216 netdev_adjacent_rename_links(dev, oldname);
1217
1218 netdev_name_node_del(dev->name_node);
1219
1220 synchronize_net();
1221
1222 netdev_name_node_add(net, dev->name_node);
1223
1224 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1225 ret = notifier_to_errno(ret);
1226
1227 if (ret) {
1228 /* err >= 0 after dev_alloc_name() or stores the first errno */
1229 if (err >= 0) {
1230 err = ret;
1231 down_write(&devnet_rename_sem);
1232 memcpy(dev->name, oldname, IFNAMSIZ);
1233 memcpy(oldname, newname, IFNAMSIZ);
1234 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1235 old_assign_type = NET_NAME_RENAMED;
1236 goto rollback;
1237 } else {
1238 netdev_err(dev, "name change rollback failed: %d\n",
1239 ret);
1240 }
1241 }
1242
1243 return err;
1244}
1245
1246/**
1247 * dev_set_alias - change ifalias of a device
1248 * @dev: device
1249 * @alias: name up to IFALIASZ
1250 * @len: limit of bytes to copy from info
1251 *
1252 * Set ifalias for a device.
1253 */
1254int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1255{
1256 struct dev_ifalias *new_alias = NULL;
1257
1258 if (len >= IFALIASZ)
1259 return -EINVAL;
1260
1261 if (len) {
1262 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1263 if (!new_alias)
1264 return -ENOMEM;
1265
1266 memcpy(new_alias->ifalias, alias, len);
1267 new_alias->ifalias[len] = 0;
1268 }
1269
1270 mutex_lock(&ifalias_mutex);
1271 new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1272 mutex_is_locked(&ifalias_mutex));
1273 mutex_unlock(&ifalias_mutex);
1274
1275 if (new_alias)
1276 kfree_rcu(new_alias, rcuhead);
1277
1278 return len;
1279}
1280EXPORT_SYMBOL(dev_set_alias);
1281
1282/**
1283 * dev_get_alias - get ifalias of a device
1284 * @dev: device
1285 * @name: buffer to store name of ifalias
1286 * @len: size of buffer
1287 *
1288 * get ifalias for a device. Caller must make sure dev cannot go
1289 * away, e.g. by holding the RCU read lock or a reference on the device.
1290 */
1291int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1292{
1293 const struct dev_ifalias *alias;
1294 int ret = 0;
1295
1296 rcu_read_lock();
1297 alias = rcu_dereference(dev->ifalias);
1298 if (alias)
1299 ret = snprintf(name, len, "%s", alias->ifalias);
1300 rcu_read_unlock();
1301
1302 return ret;
1303}
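
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): reading the alias while holding a reference to the device.
 *
 *	char buf[IFALIASZ];
 *
 *	if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
 *		netdev_info(dev, "alias: %s\n", buf);
 */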
1304
1305/**
1306 * netdev_features_change - device changes features
1307 * @dev: device to cause notification
1308 *
1309 * Called to indicate a device has changed features.
1310 */
1311void netdev_features_change(struct net_device *dev)
1312{
1313 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1314}
1315EXPORT_SYMBOL(netdev_features_change);
1316
1317/**
1318 * netdev_state_change - device changes state
1319 * @dev: device to cause notification
1320 *
1321 * Called to indicate a device has changed state. This function calls
1322 * the notifier chains for netdev_chain and sends a NEWLINK message
1323 * to the routing socket.
1324 */
1325void netdev_state_change(struct net_device *dev)
1326{
1327 if (dev->flags & IFF_UP) {
1328 struct netdev_notifier_change_info change_info = {
1329 .info.dev = dev,
1330 };
1331
1332 call_netdevice_notifiers_info(NETDEV_CHANGE,
1333 &change_info.info);
1334 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1335 }
1336}
1337EXPORT_SYMBOL(netdev_state_change);
1338
1339/**
1340 * __netdev_notify_peers - notify network peers about existence of @dev,
1341 * to be called when rtnl lock is already held.
1342 * @dev: network device
1343 *
1344 * Generate traffic such that interested network peers are aware of
1345 * @dev, such as by generating a gratuitous ARP. This may be used when
1346 * a device wants to inform the rest of the network about some sort of
1347 * reconfiguration such as a failover event or virtual machine
1348 * migration.
1349 */
1350void __netdev_notify_peers(struct net_device *dev)
1351{
1352 ASSERT_RTNL();
1353 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1354 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1355}
1356EXPORT_SYMBOL(__netdev_notify_peers);
1357
1358/**
1359 * netdev_notify_peers - notify network peers about existence of @dev
1360 * @dev: network device
1361 *
1362 * Generate traffic such that interested network peers are aware of
1363 * @dev, such as by generating a gratuitous ARP. This may be used when
1364 * a device wants to inform the rest of the network about some sort of
1365 * reconfiguration such as a failover event or virtual machine
1366 * migration.
1367 */
1368void netdev_notify_peers(struct net_device *dev)
1369{
1370 rtnl_lock();
1371 __netdev_notify_peers(dev);
1372 rtnl_unlock();
1373}
1374EXPORT_SYMBOL(netdev_notify_peers);
1375
1376static int napi_threaded_poll(void *data);
1377
1378static int napi_kthread_create(struct napi_struct *n)
1379{
1380 int err = 0;
1381
1382 /* Create and wake up the kthread once to put it in
1383 * TASK_INTERRUPTIBLE mode to avoid the blocked task
1384 * warning and work with loadavg.
1385 */
1386 n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1387 n->dev->name, n->napi_id);
1388 if (IS_ERR(n->thread)) {
1389 err = PTR_ERR(n->thread);
1390 pr_err("kthread_run failed with err %d\n", err);
1391 n->thread = NULL;
1392 }
1393
1394 return err;
1395}
1396
1397static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1398{
1399 const struct net_device_ops *ops = dev->netdev_ops;
1400 int ret;
1401
1402 ASSERT_RTNL();
1403 dev_addr_check(dev);
1404
1405 if (!netif_device_present(dev)) {
1406 /* may be detached because parent is runtime-suspended */
1407 if (dev->dev.parent)
1408 pm_runtime_resume(dev->dev.parent);
1409 if (!netif_device_present(dev))
1410 return -ENODEV;
1411 }
1412
1413 /* Block netpoll from trying to do any rx path servicing.
1414 * If we don't do this there is a chance ndo_poll_controller
1415 * or ndo_poll may be running while we open the device
1416 */
1417 netpoll_poll_disable(dev);
1418
1419 ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1420 ret = notifier_to_errno(ret);
1421 if (ret)
1422 return ret;
1423
1424 set_bit(__LINK_STATE_START, &dev->state);
1425
1426 if (ops->ndo_validate_addr)
1427 ret = ops->ndo_validate_addr(dev);
1428
1429 if (!ret && ops->ndo_open)
1430 ret = ops->ndo_open(dev);
1431
1432 netpoll_poll_enable(dev);
1433
1434 if (ret)
1435 clear_bit(__LINK_STATE_START, &dev->state);
1436 else {
1437 dev->flags |= IFF_UP;
1438 dev_set_rx_mode(dev);
1439 dev_activate(dev);
1440 add_device_randomness(dev->dev_addr, dev->addr_len);
1441 }
1442
1443 return ret;
1444}
1445
1446/**
1447 * dev_open - prepare an interface for use.
1448 * @dev: device to open
1449 * @extack: netlink extended ack
1450 *
1451 * Takes a device from down to up state. The device's private open
1452 * function is invoked and then the multicast lists are loaded. Finally
1453 * the device is moved into the up state and a %NETDEV_UP message is
1454 * sent to the netdev notifier chain.
1455 *
1456 * Calling this function on an active interface is a nop. On a failure
1457 * a negative errno code is returned.
1458 */
1459int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1460{
1461 int ret;
1462
1463 if (dev->flags & IFF_UP)
1464 return 0;
1465
1466 ret = __dev_open(dev, extack);
1467 if (ret < 0)
1468 return ret;
1469
1470 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1471 call_netdevice_notifiers(NETDEV_UP, dev);
1472
1473 return ret;
1474}
1475EXPORT_SYMBOL(dev_open);
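
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): bringing an interface up from kernel code.  dev_open() must be
 * called with the RTNL semaphore held; a NULL extack skips extended
 * error reporting.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_open(dev, NULL);
 *	rtnl_unlock();
 *	if (err)
 *		netdev_err(dev, "failed to open: %d\n", err);
 */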
1476
1477static void __dev_close_many(struct list_head *head)
1478{
1479 struct net_device *dev;
1480
1481 ASSERT_RTNL();
1482 might_sleep();
1483
1484 list_for_each_entry(dev, head, close_list) {
1485 /* Temporarily disable netpoll until the interface is down */
1486 netpoll_poll_disable(dev);
1487
1488 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1489
1490 clear_bit(__LINK_STATE_START, &dev->state);
1491
1492 /* Synchronize to scheduled poll. We cannot touch the poll list; it
1493 * may even be on a different CPU. So just clear netif_running().
1494 *
1495 * dev->stop() will invoke napi_disable() on all of its
1496 * napi_struct instances on this device.
1497 */
1498 smp_mb__after_atomic(); /* Commit netif_running(). */
1499 }
1500
1501 dev_deactivate_many(head);
1502
1503 list_for_each_entry(dev, head, close_list) {
1504 const struct net_device_ops *ops = dev->netdev_ops;
1505
1506 /*
1507 * Call the device specific close. This cannot fail.
1508 * Only if device is UP
1509 *
1510 * We allow it to be called even after a DETACH hot-plug
1511 * event.
1512 */
1513 if (ops->ndo_stop)
1514 ops->ndo_stop(dev);
1515
1516 dev->flags &= ~IFF_UP;
1517 netpoll_poll_enable(dev);
1518 }
1519}
1520
1521static void __dev_close(struct net_device *dev)
1522{
1523 LIST_HEAD(single);
1524
1525 list_add(&dev->close_list, &single);
1526 __dev_close_many(&single);
1527 list_del(&single);
1528}
1529
1530void dev_close_many(struct list_head *head, bool unlink)
1531{
1532 struct net_device *dev, *tmp;
1533
1534 /* Remove the devices that don't need to be closed */
1535 list_for_each_entry_safe(dev, tmp, head, close_list)
1536 if (!(dev->flags & IFF_UP))
1537 list_del_init(&dev->close_list);
1538
1539 __dev_close_many(head);
1540
1541 list_for_each_entry_safe(dev, tmp, head, close_list) {
1542 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1543 call_netdevice_notifiers(NETDEV_DOWN, dev);
1544 if (unlink)
1545 list_del_init(&dev->close_list);
1546 }
1547}
1548EXPORT_SYMBOL(dev_close_many);
1549
1550/**
1551 * dev_close - shutdown an interface.
1552 * @dev: device to shutdown
1553 *
1554 * This function moves an active device into down state. A
1555 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1556 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1557 * chain.
1558 */
1559void dev_close(struct net_device *dev)
1560{
1561 if (dev->flags & IFF_UP) {
1562 LIST_HEAD(single);
1563
1564 list_add(&dev->close_list, &single);
1565 dev_close_many(&single, true);
1566 list_del(&single);
1567 }
1568}
1569EXPORT_SYMBOL(dev_close);
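
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): the matching shutdown path, also under RTNL.
 *
 *	rtnl_lock();
 *	dev_close(dev);
 *	rtnl_unlock();
 */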
1570
1571
1572/**
1573 * dev_disable_lro - disable Large Receive Offload on a device
1574 * @dev: device
1575 *
1576 * Disable Large Receive Offload (LRO) on a net device. Must be
1577 * called under RTNL. This is needed if received packets may be
1578 * forwarded to another interface.
1579 */
1580void dev_disable_lro(struct net_device *dev)
1581{
1582 struct net_device *lower_dev;
1583 struct list_head *iter;
1584
1585 dev->wanted_features &= ~NETIF_F_LRO;
1586 netdev_update_features(dev);
1587
1588 if (unlikely(dev->features & NETIF_F_LRO))
1589 netdev_WARN(dev, "failed to disable LRO!\n");
1590
1591 netdev_for_each_lower_dev(dev, lower_dev, iter)
1592 dev_disable_lro(lower_dev);
1593}
1594EXPORT_SYMBOL(dev_disable_lro);
1595
1596/**
1597 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1598 * @dev: device
1599 *
1600 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1601 * called under RTNL. This is needed if Generic XDP is installed on
1602 * the device.
1603 */
1604static void dev_disable_gro_hw(struct net_device *dev)
1605{
1606 dev->wanted_features &= ~NETIF_F_GRO_HW;
1607 netdev_update_features(dev);
1608
1609 if (unlikely(dev->features & NETIF_F_GRO_HW))
1610 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1611}
1612
1613const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1614{
1615#define N(val) \
1616 case NETDEV_##val: \
1617 return "NETDEV_" __stringify(val);
1618 switch (cmd) {
1619 N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1620 N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1621 N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1622 N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1623 N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1624 N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1625 N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1626 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1627 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1628 N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1629 N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1630 N(XDP_FEAT_CHANGE)
1631 }
1632#undef N
1633 return "UNKNOWN_NETDEV_EVENT";
1634}
1635EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1636
1637static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1638 struct net_device *dev)
1639{
1640 struct netdev_notifier_info info = {
1641 .dev = dev,
1642 };
1643
1644 return nb->notifier_call(nb, val, &info);
1645}
1646
1647static int call_netdevice_register_notifiers(struct notifier_block *nb,
1648 struct net_device *dev)
1649{
1650 int err;
1651
1652 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1653 err = notifier_to_errno(err);
1654 if (err)
1655 return err;
1656
1657 if (!(dev->flags & IFF_UP))
1658 return 0;
1659
1660 call_netdevice_notifier(nb, NETDEV_UP, dev);
1661 return 0;
1662}
1663
1664static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1665 struct net_device *dev)
1666{
1667 if (dev->flags & IFF_UP) {
1668 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1669 dev);
1670 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1671 }
1672 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1673}
1674
1675static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1676 struct net *net)
1677{
1678 struct net_device *dev;
1679 int err;
1680
1681 for_each_netdev(net, dev) {
1682 err = call_netdevice_register_notifiers(nb, dev);
1683 if (err)
1684 goto rollback;
1685 }
1686 return 0;
1687
1688rollback:
1689 for_each_netdev_continue_reverse(net, dev)
1690 call_netdevice_unregister_notifiers(nb, dev);
1691 return err;
1692}
1693
1694static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1695 struct net *net)
1696{
1697 struct net_device *dev;
1698
1699 for_each_netdev(net, dev)
1700 call_netdevice_unregister_notifiers(nb, dev);
1701}
1702
1703static int dev_boot_phase = 1;
1704
1705/**
1706 * register_netdevice_notifier - register a network notifier block
1707 * @nb: notifier
1708 *
1709 * Register a notifier to be called when network device events occur.
1710 * The notifier passed is linked into the kernel structures and must
1711 * not be reused until it has been unregistered. A negative errno code
1712 * is returned on a failure.
1713 *
1714 * When registered, all registration and up events are replayed
1715 * to the new notifier to allow the caller to have a race-free
1716 * view of the network device list.
1717 */
1718
1719int register_netdevice_notifier(struct notifier_block *nb)
1720{
1721 struct net *net;
1722 int err;
1723
1724 /* Close race with setup_net() and cleanup_net() */
1725 down_write(&pernet_ops_rwsem);
1726 rtnl_lock();
1727 err = raw_notifier_chain_register(&netdev_chain, nb);
1728 if (err)
1729 goto unlock;
1730 if (dev_boot_phase)
1731 goto unlock;
1732 for_each_net(net) {
1733 err = call_netdevice_register_net_notifiers(nb, net);
1734 if (err)
1735 goto rollback;
1736 }
1737
1738unlock:
1739 rtnl_unlock();
1740 up_write(&pernet_ops_rwsem);
1741 return err;
1742
1743rollback:
1744 for_each_net_continue_reverse(net)
1745 call_netdevice_unregister_net_notifiers(nb, net);
1746
1747 raw_notifier_chain_unregister(&netdev_chain, nb);
1748 goto unlock;
1749}
1750EXPORT_SYMBOL(register_netdevice_notifier);
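
/*
 * Illustrative usage sketch (editor's example, not part of the upstream
 * file): a minimal notifier block reacting to device events.  The my_*
 * names are hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			netdev_info(dev, "came up\n");
 *			break;
 *		case NETDEV_GOING_DOWN:
 *			netdev_info(dev, "going down\n");
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 * Registration uses register_netdevice_notifier(&my_netdev_nb) and
 * teardown uses unregister_netdevice_notifier(&my_netdev_nb).
 */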
1751
1752/**
1753 * unregister_netdevice_notifier - unregister a network notifier block
1754 * @nb: notifier
1755 *
1756 * Unregister a notifier previously registered by
1757 * register_netdevice_notifier(). The notifier is unlinked from the
1758 * kernel structures and may then be reused. A negative errno code
1759 * is returned on a failure.
1760 *
1761 * After unregistering unregister and down device events are synthesized
1762 * for all devices on the device list to the removed notifier to remove
1763 * the need for special case cleanup code.
1764 */
1765
1766int unregister_netdevice_notifier(struct notifier_block *nb)
1767{
1768 struct net *net;
1769 int err;
1770
1771 /* Close race with setup_net() and cleanup_net() */
1772 down_write(&pernet_ops_rwsem);
1773 rtnl_lock();
1774 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1775 if (err)
1776 goto unlock;
1777
1778 for_each_net(net)
1779 call_netdevice_unregister_net_notifiers(nb, net);
1780
1781unlock:
1782 rtnl_unlock();
1783 up_write(&pernet_ops_rwsem);
1784 return err;
1785}
1786EXPORT_SYMBOL(unregister_netdevice_notifier);
1787
1788static int __register_netdevice_notifier_net(struct net *net,
1789 struct notifier_block *nb,
1790 bool ignore_call_fail)
1791{
1792 int err;
1793
1794 err = raw_notifier_chain_register(&net->netdev_chain, nb);
1795 if (err)
1796 return err;
1797 if (dev_boot_phase)
1798 return 0;
1799
1800 err = call_netdevice_register_net_notifiers(nb, net);
1801 if (err && !ignore_call_fail)
1802 goto chain_unregister;
1803
1804 return 0;
1805
1806chain_unregister:
1807 raw_notifier_chain_unregister(&net->netdev_chain, nb);
1808 return err;
1809}
1810
1811static int __unregister_netdevice_notifier_net(struct net *net,
1812 struct notifier_block *nb)
1813{
1814 int err;
1815
1816 err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1817 if (err)
1818 return err;
1819
1820 call_netdevice_unregister_net_notifiers(nb, net);
1821 return 0;
1822}
1823
1824/**
1825 * register_netdevice_notifier_net - register a per-netns network notifier block
1826 * @net: network namespace
1827 * @nb: notifier
1828 *
1829 * Register a notifier to be called when network device events occur.
1830 * The notifier passed is linked into the kernel structures and must
1831 * not be reused until it has been unregistered. A negative errno code
1832 * is returned on a failure.
1833 *
1834 * When registered, all registration and up events are replayed
1835 * to the new notifier to allow the caller to have a race-free
1836 * view of the network device list.
1837 */
1838
1839int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1840{
1841 int err;
1842
1843 rtnl_lock();
1844 err = __register_netdevice_notifier_net(net, nb, false);
1845 rtnl_unlock();
1846 return err;
1847}
1848EXPORT_SYMBOL(register_netdevice_notifier_net);
1849
1850/**
1851 * unregister_netdevice_notifier_net - unregister a per-netns
1852 * network notifier block
1853 * @net: network namespace
1854 * @nb: notifier
1855 *
1856 * Unregister a notifier previously registered by
1857 * register_netdevice_notifier_net(). The notifier is unlinked from the
1858 * kernel structures and may then be reused. A negative errno code
1859 * is returned on a failure.
1860 *
1861 * After unregistering unregister and down device events are synthesized
1862 * for all devices on the device list to the removed notifier to remove
1863 * the need for special case cleanup code.
1864 */
1865
1866int unregister_netdevice_notifier_net(struct net *net,
1867 struct notifier_block *nb)
1868{
1869 int err;
1870
1871 rtnl_lock();
1872 err = __unregister_netdevice_notifier_net(net, nb);
1873 rtnl_unlock();
1874 return err;
1875}
1876EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1877
1878static void __move_netdevice_notifier_net(struct net *src_net,
1879 struct net *dst_net,
1880 struct notifier_block *nb)
1881{
1882 __unregister_netdevice_notifier_net(src_net, nb);
1883 __register_netdevice_notifier_net(dst_net, nb, true);
1884}
1885
1886int register_netdevice_notifier_dev_net(struct net_device *dev,
1887 struct notifier_block *nb,
1888 struct netdev_net_notifier *nn)
1889{
1890 int err;
1891
1892 rtnl_lock();
1893 err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1894 if (!err) {
1895 nn->nb = nb;
1896 list_add(&nn->list, &dev->net_notifier_list);
1897 }
1898 rtnl_unlock();
1899 return err;
1900}
1901EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1902
1903int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1904 struct notifier_block *nb,
1905 struct netdev_net_notifier *nn)
1906{
1907 int err;
1908
1909 rtnl_lock();
1910 list_del(&nn->list);
1911 err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1912 rtnl_unlock();
1913 return err;
1914}
1915EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1916
1917static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1918 struct net *net)
1919{
1920 struct netdev_net_notifier *nn;
1921
1922 list_for_each_entry(nn, &dev->net_notifier_list, list)
1923 __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
1924}
1925
1926/**
1927 * call_netdevice_notifiers_info - call all network notifier blocks
1928 * @val: value passed unmodified to notifier function
1929 * @info: notifier information data
1930 *
1931 * Call all network notifier blocks. Parameters and return value
1932 * are as for raw_notifier_call_chain().
1933 */
1934
1935int call_netdevice_notifiers_info(unsigned long val,
1936 struct netdev_notifier_info *info)
1937{
1938 struct net *net = dev_net(info->dev);
1939 int ret;
1940
1941 ASSERT_RTNL();
1942
	/* Run the per-netns notifier block chain first, then run the global one.
	 * Hopefully, one day, the global one can be removed once all notifier
	 * block registrants have been converted to be per-netns.
	 */
1947 ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1948 if (ret & NOTIFY_STOP_MASK)
1949 return ret;
1950 return raw_notifier_call_chain(&netdev_chain, val, info);
1951}
1952
1953/**
 * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 *                                        and roll back on error
1956 * @val_up: value passed unmodified to notifier function
1957 * @val_down: value passed unmodified to the notifier function when
1958 * recovering from an error on @val_up
1959 * @info: notifier information data
1960 *
1961 * Call all per-netns network notifier blocks, but not notifier blocks on
1962 * the global notifier chain. Parameters and return value are as for
1963 * raw_notifier_call_chain_robust().
1964 */
1965
1966static int
1967call_netdevice_notifiers_info_robust(unsigned long val_up,
1968 unsigned long val_down,
1969 struct netdev_notifier_info *info)
1970{
1971 struct net *net = dev_net(info->dev);
1972
1973 ASSERT_RTNL();
1974
1975 return raw_notifier_call_chain_robust(&net->netdev_chain,
1976 val_up, val_down, info);
1977}
1978
1979static int call_netdevice_notifiers_extack(unsigned long val,
1980 struct net_device *dev,
1981 struct netlink_ext_ack *extack)
1982{
1983 struct netdev_notifier_info info = {
1984 .dev = dev,
1985 .extack = extack,
1986 };
1987
1988 return call_netdevice_notifiers_info(val, &info);
1989}
1990
1991/**
1992 * call_netdevice_notifiers - call all network notifier blocks
1993 * @val: value passed unmodified to notifier function
1994 * @dev: net_device pointer passed unmodified to notifier function
1995 *
1996 * Call all network notifier blocks. Parameters and return value
1997 * are as for raw_notifier_call_chain().
1998 */
1999
2000int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2001{
2002 return call_netdevice_notifiers_extack(val, dev, NULL);
2003}
2004EXPORT_SYMBOL(call_netdevice_notifiers);
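
/* Illustrative sketch of the receiving side: a notifier callback gets the
 * same struct netdev_notifier_info that is built for these calls and can
 * recover the device and any extack from it. The handler and the
 * my_precondition() check are hypothetical:
 *
 *	static int my_notifier(struct notifier_block *nb,
 *			       unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *		struct netlink_ext_ack *extack =
 *			netdev_notifier_info_to_extack(ptr);
 *
 *		if (event == NETDEV_PRE_UP && !my_precondition(dev)) {
 *			NL_SET_ERR_MSG(extack, "device cannot be brought up");
 *			return notifier_from_errno(-EBUSY);
 *		}
 *		return NOTIFY_DONE;
 *	}
 */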
2005
2006/**
2007 * call_netdevice_notifiers_mtu - call all network notifier blocks
2008 * @val: value passed unmodified to notifier function
2009 * @dev: net_device pointer passed unmodified to notifier function
2010 * @arg: additional u32 argument passed to the notifier function
2011 *
2012 * Call all network notifier blocks. Parameters and return value
2013 * are as for raw_notifier_call_chain().
2014 */
2015static int call_netdevice_notifiers_mtu(unsigned long val,
2016 struct net_device *dev, u32 arg)
2017{
2018 struct netdev_notifier_info_ext info = {
2019 .info.dev = dev,
2020 .ext.mtu = arg,
2021 };
2022
2023 BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2024
2025 return call_netdevice_notifiers_info(val, &info.info);
2026}
2027
2028#ifdef CONFIG_NET_INGRESS
2029static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2030
2031void net_inc_ingress_queue(void)
2032{
2033 static_branch_inc(&ingress_needed_key);
2034}
2035EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2036
2037void net_dec_ingress_queue(void)
2038{
2039 static_branch_dec(&ingress_needed_key);
2040}
2041EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2042#endif
2043
2044#ifdef CONFIG_NET_EGRESS
2045static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2046
2047void net_inc_egress_queue(void)
2048{
2049 static_branch_inc(&egress_needed_key);
2050}
2051EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2052
2053void net_dec_egress_queue(void)
2054{
2055 static_branch_dec(&egress_needed_key);
2056}
2057EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2058#endif
2059
2060DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2061EXPORT_SYMBOL(netstamp_needed_key);
2062#ifdef CONFIG_JUMP_LABEL
2063static atomic_t netstamp_needed_deferred;
2064static atomic_t netstamp_wanted;
2065static void netstamp_clear(struct work_struct *work)
2066{
2067 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2068 int wanted;
2069
2070 wanted = atomic_add_return(deferred, &netstamp_wanted);
2071 if (wanted > 0)
2072 static_branch_enable(&netstamp_needed_key);
2073 else
2074 static_branch_disable(&netstamp_needed_key);
2075}
2076static DECLARE_WORK(netstamp_work, netstamp_clear);
2077#endif
2078
2079void net_enable_timestamp(void)
2080{
2081#ifdef CONFIG_JUMP_LABEL
2082 int wanted = atomic_read(&netstamp_wanted);
2083
2084 while (wanted > 0) {
2085 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2086 return;
2087 }
2088 atomic_inc(&netstamp_needed_deferred);
2089 schedule_work(&netstamp_work);
2090#else
2091 static_branch_inc(&netstamp_needed_key);
2092#endif
2093}
2094EXPORT_SYMBOL(net_enable_timestamp);
2095
2096void net_disable_timestamp(void)
2097{
2098#ifdef CONFIG_JUMP_LABEL
2099 int wanted = atomic_read(&netstamp_wanted);
2100
2101 while (wanted > 1) {
2102 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2103 return;
2104 }
2105 atomic_dec(&netstamp_needed_deferred);
2106 schedule_work(&netstamp_work);
2107#else
2108 static_branch_dec(&netstamp_needed_key);
2109#endif
2110}
2111EXPORT_SYMBOL(net_disable_timestamp);
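
/* Illustrative only: a hypothetical user that needs every packet timestamped
 * for some period brackets that period with the two helpers, e.g.:
 *
 *	net_enable_timestamp();
 *	... run while timestamps are required ...
 *	net_disable_timestamp();
 *
 * The static key keeps net_timestamp_set()/net_timestamp_check() essentially
 * free when no such user exists, and the deferred work above lets the
 * helpers be called from contexts that cannot sleep.
 */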
2112
2113static inline void net_timestamp_set(struct sk_buff *skb)
2114{
2115 skb->tstamp = 0;
2116 skb->mono_delivery_time = 0;
2117 if (static_branch_unlikely(&netstamp_needed_key))
2118 skb->tstamp = ktime_get_real();
2119}
2120
2121#define net_timestamp_check(COND, SKB) \
2122 if (static_branch_unlikely(&netstamp_needed_key)) { \
2123 if ((COND) && !(SKB)->tstamp) \
2124 (SKB)->tstamp = ktime_get_real(); \
2125 } \
2126
2127bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2128{
2129 return __is_skb_forwardable(dev, skb, true);
2130}
2131EXPORT_SYMBOL_GPL(is_skb_forwardable);
2132
2133static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2134 bool check_mtu)
2135{
2136 int ret = ____dev_forward_skb(dev, skb, check_mtu);
2137
2138 if (likely(!ret)) {
2139 skb->protocol = eth_type_trans(skb, dev);
2140 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2141 }
2142
2143 return ret;
2144}
2145
2146int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2147{
2148 return __dev_forward_skb2(dev, skb, true);
2149}
2150EXPORT_SYMBOL_GPL(__dev_forward_skb);
2151
2152/**
2153 * dev_forward_skb - loopback an skb to another netif
2154 *
2155 * @dev: destination network device
2156 * @skb: buffer to forward
2157 *
2158 * return values:
2159 * NET_RX_SUCCESS (no congestion)
2160 * NET_RX_DROP (packet was dropped, but freed)
2161 *
2162 * dev_forward_skb can be used for injecting an skb from the
2163 * start_xmit function of one device into the receive queue
2164 * of another device.
2165 *
2166 * The receiving device may be in another namespace, so
2167 * we have to clear all information in the skb that could
2168 * impact namespace isolation.
2169 */
2170int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2171{
2172 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2173}
2174EXPORT_SYMBOL_GPL(dev_forward_skb);
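
/* Sketch of a typical caller (names hypothetical): a veth-like driver can
 * loop frames from its ndo_start_xmit into the peer device's receive path:
 *
 *	static netdev_tx_t my_pair_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_DROP)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 *
 * On NET_RX_DROP the skb has already been freed, so the caller must not
 * touch it again.
 */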
2175
2176int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2177{
2178 return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2179}
2180
2181static inline int deliver_skb(struct sk_buff *skb,
2182 struct packet_type *pt_prev,
2183 struct net_device *orig_dev)
2184{
2185 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2186 return -ENOMEM;
2187 refcount_inc(&skb->users);
2188 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2189}
2190
2191static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2192 struct packet_type **pt,
2193 struct net_device *orig_dev,
2194 __be16 type,
2195 struct list_head *ptype_list)
2196{
2197 struct packet_type *ptype, *pt_prev = *pt;
2198
2199 list_for_each_entry_rcu(ptype, ptype_list, list) {
2200 if (ptype->type != type)
2201 continue;
2202 if (pt_prev)
2203 deliver_skb(skb, pt_prev, orig_dev);
2204 pt_prev = ptype;
2205 }
2206 *pt = pt_prev;
2207}
2208
2209static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2210{
2211 if (!ptype->af_packet_priv || !skb->sk)
2212 return false;
2213
2214 if (ptype->id_match)
2215 return ptype->id_match(ptype, skb->sk);
2216 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2217 return true;
2218
2219 return false;
2220}
2221
2222/**
2223 * dev_nit_active - return true if any network interface taps are in use
2224 *
2225 * @dev: network device to check for the presence of taps
2226 */
2227bool dev_nit_active(struct net_device *dev)
2228{
2229 return !list_empty(&net_hotdata.ptype_all) ||
2230 !list_empty(&dev->ptype_all);
2231}
2232EXPORT_SYMBOL_GPL(dev_nit_active);
2233
2234/*
2235 * Support routine. Sends outgoing frames to any network
2236 * taps currently in use.
2237 */
2238
2239void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2240{
2241 struct list_head *ptype_list = &net_hotdata.ptype_all;
2242 struct packet_type *ptype, *pt_prev = NULL;
2243 struct sk_buff *skb2 = NULL;
2244
2245 rcu_read_lock();
2246again:
2247 list_for_each_entry_rcu(ptype, ptype_list, list) {
2248 if (READ_ONCE(ptype->ignore_outgoing))
2249 continue;
2250
2251 /* Never send packets back to the socket
2252 * they originated from - MvS (miquels@drinkel.ow.org)
2253 */
2254 if (skb_loop_sk(ptype, skb))
2255 continue;
2256
2257 if (pt_prev) {
2258 deliver_skb(skb2, pt_prev, skb->dev);
2259 pt_prev = ptype;
2260 continue;
2261 }
2262
2263 /* need to clone skb, done only once */
2264 skb2 = skb_clone(skb, GFP_ATOMIC);
2265 if (!skb2)
2266 goto out_unlock;
2267
2268 net_timestamp_set(skb2);
2269
		/* The network header should already be set correctly by the
		 * sender; the check below is just protection against buggy
		 * protocols.
		 */
2274 skb_reset_mac_header(skb2);
2275
2276 if (skb_network_header(skb2) < skb2->data ||
2277 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2278 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2279 ntohs(skb2->protocol),
2280 dev->name);
2281 skb_reset_network_header(skb2);
2282 }
2283
2284 skb2->transport_header = skb2->network_header;
2285 skb2->pkt_type = PACKET_OUTGOING;
2286 pt_prev = ptype;
2287 }
2288
2289 if (ptype_list == &net_hotdata.ptype_all) {
2290 ptype_list = &dev->ptype_all;
2291 goto again;
2292 }
2293out_unlock:
2294 if (pt_prev) {
2295 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2296 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2297 else
2298 kfree_skb(skb2);
2299 }
2300 rcu_read_unlock();
2301}
2302EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2303
2304/**
2305 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2306 * @dev: Network device
2307 * @txq: number of queues available
2308 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify that each tc mapping remains valid and,
 * if it does not, NULL the mapping. Once no priorities map to an
 * offset/count pair it will no longer be used. In the worst case, TC0
 * itself is invalid and nothing can be done, so priority mappings are
 * disabled entirely. It is expected that drivers will fix this mapping
 * if they can before calling netif_set_real_num_tx_queues.
2316 */
2317static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2318{
2319 int i;
2320 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2321
2322 /* If TC0 is invalidated disable TC mapping */
2323 if (tc->offset + tc->count > txq) {
2324 netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2325 dev->num_tc = 0;
2326 return;
2327 }
2328
2329 /* Invalidated prio to tc mappings set to TC0 */
2330 for (i = 1; i < TC_BITMASK + 1; i++) {
2331 int q = netdev_get_prio_tc_map(dev, i);
2332
2333 tc = &dev->tc_to_txq[q];
2334 if (tc->offset + tc->count > txq) {
2335 netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2336 i, q);
2337 netdev_set_prio_tc_map(dev, i, 0);
2338 }
2339 }
2340}
2341
2342int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2343{
2344 if (dev->num_tc) {
2345 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2346 int i;
2347
2348 /* walk through the TCs and see if it falls into any of them */
2349 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2350 if ((txq - tc->offset) < tc->count)
2351 return i;
2352 }
2353
2354 /* didn't find it, just return -1 to indicate no match */
2355 return -1;
2356 }
2357
2358 return 0;
2359}
2360EXPORT_SYMBOL(netdev_txq_to_tc);
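
/* Worked example: with dev->num_tc == 2 and tc_to_txq[] = { {.count = 4,
 * .offset = 0}, {.count = 4, .offset = 4} }, looking up txq 5 fails the
 * first entry (5 - 0 >= 4) but matches the second (5 - 4 < 4), so
 * netdev_txq_to_tc() returns 1.
 */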
2361
2362#ifdef CONFIG_XPS
2363static struct static_key xps_needed __read_mostly;
2364static struct static_key xps_rxqs_needed __read_mostly;
2365static DEFINE_MUTEX(xps_map_mutex);
2366#define xmap_dereference(P) \
2367 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2368
2369static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2370 struct xps_dev_maps *old_maps, int tci, u16 index)
2371{
2372 struct xps_map *map = NULL;
2373 int pos;
2374
2375 map = xmap_dereference(dev_maps->attr_map[tci]);
2376 if (!map)
2377 return false;
2378
2379 for (pos = map->len; pos--;) {
2380 if (map->queues[pos] != index)
2381 continue;
2382
2383 if (map->len > 1) {
2384 map->queues[pos] = map->queues[--map->len];
2385 break;
2386 }
2387
2388 if (old_maps)
2389 RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2390 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2391 kfree_rcu(map, rcu);
2392 return false;
2393 }
2394
2395 return true;
2396}
2397
2398static bool remove_xps_queue_cpu(struct net_device *dev,
2399 struct xps_dev_maps *dev_maps,
2400 int cpu, u16 offset, u16 count)
2401{
2402 int num_tc = dev_maps->num_tc;
2403 bool active = false;
2404 int tci;
2405
2406 for (tci = cpu * num_tc; num_tc--; tci++) {
2407 int i, j;
2408
2409 for (i = count, j = offset; i--; j++) {
2410 if (!remove_xps_queue(dev_maps, NULL, tci, j))
2411 break;
2412 }
2413
2414 active |= i < 0;
2415 }
2416
2417 return active;
2418}
2419
2420static void reset_xps_maps(struct net_device *dev,
2421 struct xps_dev_maps *dev_maps,
2422 enum xps_map_type type)
2423{
2424 static_key_slow_dec_cpuslocked(&xps_needed);
2425 if (type == XPS_RXQS)
2426 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2427
2428 RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2429
2430 kfree_rcu(dev_maps, rcu);
2431}
2432
2433static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2434 u16 offset, u16 count)
2435{
2436 struct xps_dev_maps *dev_maps;
2437 bool active = false;
2438 int i, j;
2439
2440 dev_maps = xmap_dereference(dev->xps_maps[type]);
2441 if (!dev_maps)
2442 return;
2443
2444 for (j = 0; j < dev_maps->nr_ids; j++)
2445 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2446 if (!active)
2447 reset_xps_maps(dev, dev_maps, type);
2448
2449 if (type == XPS_CPUS) {
2450 for (i = offset + (count - 1); count--; i--)
2451 netdev_queue_numa_node_write(
2452 netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2453 }
2454}
2455
2456static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2457 u16 count)
2458{
2459 if (!static_key_false(&xps_needed))
2460 return;
2461
2462 cpus_read_lock();
2463 mutex_lock(&xps_map_mutex);
2464
2465 if (static_key_false(&xps_rxqs_needed))
2466 clean_xps_maps(dev, XPS_RXQS, offset, count);
2467
2468 clean_xps_maps(dev, XPS_CPUS, offset, count);
2469
2470 mutex_unlock(&xps_map_mutex);
2471 cpus_read_unlock();
2472}
2473
2474static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2475{
2476 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2477}
2478
2479static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2480 u16 index, bool is_rxqs_map)
2481{
2482 struct xps_map *new_map;
2483 int alloc_len = XPS_MIN_MAP_ALLOC;
2484 int i, pos;
2485
2486 for (pos = 0; map && pos < map->len; pos++) {
2487 if (map->queues[pos] != index)
2488 continue;
2489 return map;
2490 }
2491
2492 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2493 if (map) {
2494 if (pos < map->alloc_len)
2495 return map;
2496
2497 alloc_len = map->alloc_len * 2;
2498 }
2499
2500 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2501 * map
2502 */
2503 if (is_rxqs_map)
2504 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2505 else
2506 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2507 cpu_to_node(attr_index));
2508 if (!new_map)
2509 return NULL;
2510
2511 for (i = 0; i < pos; i++)
2512 new_map->queues[i] = map->queues[i];
2513 new_map->alloc_len = alloc_len;
2514 new_map->len = pos;
2515
2516 return new_map;
2517}
2518
2519/* Copy xps maps at a given index */
2520static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2521 struct xps_dev_maps *new_dev_maps, int index,
2522 int tc, bool skip_tc)
2523{
2524 int i, tci = index * dev_maps->num_tc;
2525 struct xps_map *map;
2526
2527 /* copy maps belonging to foreign traffic classes */
2528 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2529 if (i == tc && skip_tc)
2530 continue;
2531
2532 /* fill in the new device map from the old device map */
2533 map = xmap_dereference(dev_maps->attr_map[tci]);
2534 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2535 }
2536}
2537
2538/* Must be called under cpus_read_lock */
2539int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2540 u16 index, enum xps_map_type type)
2541{
2542 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2543 const unsigned long *online_mask = NULL;
2544 bool active = false, copy = false;
2545 int i, j, tci, numa_node_id = -2;
2546 int maps_sz, num_tc = 1, tc = 0;
2547 struct xps_map *map, *new_map;
2548 unsigned int nr_ids;
2549
2550 WARN_ON_ONCE(index >= dev->num_tx_queues);
2551
2552 if (dev->num_tc) {
2553 /* Do not allow XPS on subordinate device directly */
2554 num_tc = dev->num_tc;
2555 if (num_tc < 0)
2556 return -EINVAL;
2557
2558 /* If queue belongs to subordinate dev use its map */
2559 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2560
2561 tc = netdev_txq_to_tc(dev, index);
2562 if (tc < 0)
2563 return -EINVAL;
2564 }
2565
2566 mutex_lock(&xps_map_mutex);
2567
2568 dev_maps = xmap_dereference(dev->xps_maps[type]);
2569 if (type == XPS_RXQS) {
2570 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2571 nr_ids = dev->num_rx_queues;
2572 } else {
2573 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2574 if (num_possible_cpus() > 1)
2575 online_mask = cpumask_bits(cpu_online_mask);
2576 nr_ids = nr_cpu_ids;
2577 }
2578
2579 if (maps_sz < L1_CACHE_BYTES)
2580 maps_sz = L1_CACHE_BYTES;
2581
2582 /* The old dev_maps could be larger or smaller than the one we're
2583 * setting up now, as dev->num_tc or nr_ids could have been updated in
2584 * between. We could try to be smart, but let's be safe instead and only
2585 * copy foreign traffic classes if the two map sizes match.
2586 */
2587 if (dev_maps &&
2588 dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2589 copy = true;
2590
2591 /* allocate memory for queue storage */
2592 for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2593 j < nr_ids;) {
2594 if (!new_dev_maps) {
2595 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2596 if (!new_dev_maps) {
2597 mutex_unlock(&xps_map_mutex);
2598 return -ENOMEM;
2599 }
2600
2601 new_dev_maps->nr_ids = nr_ids;
2602 new_dev_maps->num_tc = num_tc;
2603 }
2604
2605 tci = j * num_tc + tc;
2606 map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2607
2608 map = expand_xps_map(map, j, index, type == XPS_RXQS);
2609 if (!map)
2610 goto error;
2611
2612 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2613 }
2614
2615 if (!new_dev_maps)
2616 goto out_no_new_maps;
2617
2618 if (!dev_maps) {
2619 /* Increment static keys at most once per type */
2620 static_key_slow_inc_cpuslocked(&xps_needed);
2621 if (type == XPS_RXQS)
2622 static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2623 }
2624
2625 for (j = 0; j < nr_ids; j++) {
2626 bool skip_tc = false;
2627
2628 tci = j * num_tc + tc;
2629 if (netif_attr_test_mask(j, mask, nr_ids) &&
2630 netif_attr_test_online(j, online_mask, nr_ids)) {
2631 /* add tx-queue to CPU/rx-queue maps */
2632 int pos = 0;
2633
2634 skip_tc = true;
2635
2636 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2637 while ((pos < map->len) && (map->queues[pos] != index))
2638 pos++;
2639
2640 if (pos == map->len)
2641 map->queues[map->len++] = index;
2642#ifdef CONFIG_NUMA
2643 if (type == XPS_CPUS) {
2644 if (numa_node_id == -2)
2645 numa_node_id = cpu_to_node(j);
2646 else if (numa_node_id != cpu_to_node(j))
2647 numa_node_id = -1;
2648 }
2649#endif
2650 }
2651
2652 if (copy)
2653 xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2654 skip_tc);
2655 }
2656
2657 rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2658
2659 /* Cleanup old maps */
2660 if (!dev_maps)
2661 goto out_no_old_maps;
2662
2663 for (j = 0; j < dev_maps->nr_ids; j++) {
2664 for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2665 map = xmap_dereference(dev_maps->attr_map[tci]);
2666 if (!map)
2667 continue;
2668
2669 if (copy) {
2670 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2671 if (map == new_map)
2672 continue;
2673 }
2674
2675 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2676 kfree_rcu(map, rcu);
2677 }
2678 }
2679
2680 old_dev_maps = dev_maps;
2681
2682out_no_old_maps:
2683 dev_maps = new_dev_maps;
2684 active = true;
2685
2686out_no_new_maps:
2687 if (type == XPS_CPUS)
2688 /* update Tx queue numa node */
2689 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2690 (numa_node_id >= 0) ?
2691 numa_node_id : NUMA_NO_NODE);
2692
2693 if (!dev_maps)
2694 goto out_no_maps;
2695
2696 /* removes tx-queue from unused CPUs/rx-queues */
2697 for (j = 0; j < dev_maps->nr_ids; j++) {
2698 tci = j * dev_maps->num_tc;
2699
2700 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2701 if (i == tc &&
2702 netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2703 netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2704 continue;
2705
2706 active |= remove_xps_queue(dev_maps,
2707 copy ? old_dev_maps : NULL,
2708 tci, index);
2709 }
2710 }
2711
2712 if (old_dev_maps)
2713 kfree_rcu(old_dev_maps, rcu);
2714
2715 /* free map if not active */
2716 if (!active)
2717 reset_xps_maps(dev, dev_maps, type);
2718
2719out_no_maps:
2720 mutex_unlock(&xps_map_mutex);
2721
2722 return 0;
2723error:
2724 /* remove any maps that we added */
2725 for (j = 0; j < nr_ids; j++) {
2726 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2727 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2728 map = copy ?
2729 xmap_dereference(dev_maps->attr_map[tci]) :
2730 NULL;
2731 if (new_map && new_map != map)
2732 kfree(new_map);
2733 }
2734 }
2735
2736 mutex_unlock(&xps_map_mutex);
2737
2738 kfree(new_dev_maps);
2739 return -ENOMEM;
2740}
2741EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2742
2743int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2744 u16 index)
2745{
2746 int ret;
2747
2748 cpus_read_lock();
2749 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2750 cpus_read_unlock();
2751
2752 return ret;
2753}
2754EXPORT_SYMBOL(netif_set_xps_queue);
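
/* Sketch of typical driver usage (the queue/CPU pairing below is
 * hypothetical): pin each TX queue to the CPU that services its interrupt,
 * e.g. from the driver's open or IRQ-affinity update path:
 *
 *	cpumask_var_t mask;
 *
 *	if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpumask_set_cpu(queue_cpu, mask);
 *		netif_set_xps_queue(netdev, mask, queue_index);
 *		free_cpumask_var(mask);
 *	}
 */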
2755
2756#endif
2757static void netdev_unbind_all_sb_channels(struct net_device *dev)
2758{
2759 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2760
2761 /* Unbind any subordinate channels */
2762 while (txq-- != &dev->_tx[0]) {
2763 if (txq->sb_dev)
2764 netdev_unbind_sb_channel(dev, txq->sb_dev);
2765 }
2766}
2767
2768void netdev_reset_tc(struct net_device *dev)
2769{
2770#ifdef CONFIG_XPS
2771 netif_reset_xps_queues_gt(dev, 0);
2772#endif
2773 netdev_unbind_all_sb_channels(dev);
2774
2775 /* Reset TC configuration of device */
2776 dev->num_tc = 0;
2777 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2778 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2779}
2780EXPORT_SYMBOL(netdev_reset_tc);
2781
2782int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2783{
2784 if (tc >= dev->num_tc)
2785 return -EINVAL;
2786
2787#ifdef CONFIG_XPS
2788 netif_reset_xps_queues(dev, offset, count);
2789#endif
2790 dev->tc_to_txq[tc].count = count;
2791 dev->tc_to_txq[tc].offset = offset;
2792 return 0;
2793}
2794EXPORT_SYMBOL(netdev_set_tc_queue);
2795
2796int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2797{
2798 if (num_tc > TC_MAX_QUEUE)
2799 return -EINVAL;
2800
2801#ifdef CONFIG_XPS
2802 netif_reset_xps_queues_gt(dev, 0);
2803#endif
2804 netdev_unbind_all_sb_channels(dev);
2805
2806 dev->num_tc = num_tc;
2807 return 0;
2808}
2809EXPORT_SYMBOL(netdev_set_num_tc);
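
/* Illustrative mqprio-style configuration (all values made up): two traffic
 * classes over eight TX queues, with TC0 on queues 0-3 and TC1 on queues
 * 4-7, and priorities 0-3 mapped to TC0, 4-7 to TC1:
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);
 *	netdev_set_tc_queue(dev, 1, 4, 4);
 *	for (prio = 0; prio < 8; prio++)
 *		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
 */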
2810
2811void netdev_unbind_sb_channel(struct net_device *dev,
2812 struct net_device *sb_dev)
2813{
2814 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2815
2816#ifdef CONFIG_XPS
2817 netif_reset_xps_queues_gt(sb_dev, 0);
2818#endif
2819 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2820 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2821
2822 while (txq-- != &dev->_tx[0]) {
2823 if (txq->sb_dev == sb_dev)
2824 txq->sb_dev = NULL;
2825 }
2826}
2827EXPORT_SYMBOL(netdev_unbind_sb_channel);
2828
2829int netdev_bind_sb_channel_queue(struct net_device *dev,
2830 struct net_device *sb_dev,
2831 u8 tc, u16 count, u16 offset)
2832{
2833 /* Make certain the sb_dev and dev are already configured */
2834 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2835 return -EINVAL;
2836
2837 /* We cannot hand out queues we don't have */
2838 if ((offset + count) > dev->real_num_tx_queues)
2839 return -EINVAL;
2840
2841 /* Record the mapping */
2842 sb_dev->tc_to_txq[tc].count = count;
2843 sb_dev->tc_to_txq[tc].offset = offset;
2844
2845 /* Provide a way for Tx queue to find the tc_to_txq map or
2846 * XPS map for itself.
2847 */
2848 while (count--)
2849 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2850
2851 return 0;
2852}
2853EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2854
2855int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2856{
2857 /* Do not use a multiqueue device to represent a subordinate channel */
2858 if (netif_is_multiqueue(dev))
2859 return -ENODEV;
2860
2861 /* We allow channels 1 - 32767 to be used for subordinate channels.
2862 * Channel 0 is meant to be "native" mode and used only to represent
2863 * the main root device. We allow writing 0 to reset the device back
2864 * to normal mode after being used as a subordinate channel.
2865 */
2866 if (channel > S16_MAX)
2867 return -EINVAL;
2868
2869 dev->num_tc = -channel;
2870
2871 return 0;
2872}
2873EXPORT_SYMBOL(netdev_set_sb_channel);
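
/* Illustrative sketch of the subordinate channel API as used for
 * macvlan-style offloads (channel and queue numbers are made up, and the
 * lower device is assumed to already have its traffic classes configured):
 *
 *	netdev_set_sb_channel(macvlan_dev, chan);
 *	netdev_bind_sb_channel_queue(lower_dev, macvlan_dev, 0, 4, offset);
 *	...
 *	netdev_unbind_sb_channel(lower_dev, macvlan_dev);
 *	netdev_set_sb_channel(macvlan_dev, 0);
 */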
2874
2875/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than or equal to the new real_num_tx_queues, stale skbs on the
 * qdisc must be flushed.
2878 */
2879int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2880{
2881 bool disabling;
2882 int rc;
2883
2884 disabling = txq < dev->real_num_tx_queues;
2885
2886 if (txq < 1 || txq > dev->num_tx_queues)
2887 return -EINVAL;
2888
2889 if (dev->reg_state == NETREG_REGISTERED ||
2890 dev->reg_state == NETREG_UNREGISTERING) {
2891 ASSERT_RTNL();
2892
2893 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2894 txq);
2895 if (rc)
2896 return rc;
2897
2898 if (dev->num_tc)
2899 netif_setup_tc(dev, txq);
2900
2901 dev_qdisc_change_real_num_tx(dev, txq);
2902
2903 dev->real_num_tx_queues = txq;
2904
2905 if (disabling) {
2906 synchronize_net();
2907 qdisc_reset_all_tx_gt(dev, txq);
2908#ifdef CONFIG_XPS
2909 netif_reset_xps_queues_gt(dev, txq);
2910#endif
2911 }
2912 } else {
2913 dev->real_num_tx_queues = txq;
2914 }
2915
2916 return 0;
2917}
2918EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2919
2920#ifdef CONFIG_SYSFS
2921/**
2922 * netif_set_real_num_rx_queues - set actual number of RX queues used
2923 * @dev: Network device
2924 * @rxq: Actual number of RX queues
2925 *
2926 * This must be called either with the rtnl_lock held or before
2927 * registration of the net device. Returns 0 on success, or a
2928 * negative error code. If called before registration, it always
2929 * succeeds.
2930 */
2931int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2932{
2933 int rc;
2934
2935 if (rxq < 1 || rxq > dev->num_rx_queues)
2936 return -EINVAL;
2937
2938 if (dev->reg_state == NETREG_REGISTERED) {
2939 ASSERT_RTNL();
2940
2941 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2942 rxq);
2943 if (rc)
2944 return rc;
2945 }
2946
2947 dev->real_num_rx_queues = rxq;
2948 return 0;
2949}
2950EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2951#endif
2952
2953/**
2954 * netif_set_real_num_queues - set actual number of RX and TX queues used
2955 * @dev: Network device
2956 * @txq: Actual number of TX queues
2957 * @rxq: Actual number of RX queues
2958 *
2959 * Set the real number of both TX and RX queues.
2960 * Does nothing if the number of queues is already correct.
2961 */
2962int netif_set_real_num_queues(struct net_device *dev,
2963 unsigned int txq, unsigned int rxq)
2964{
2965 unsigned int old_rxq = dev->real_num_rx_queues;
2966 int err;
2967
2968 if (txq < 1 || txq > dev->num_tx_queues ||
2969 rxq < 1 || rxq > dev->num_rx_queues)
2970 return -EINVAL;
2971
2972 /* Start from increases, so the error path only does decreases -
2973 * decreases can't fail.
2974 */
2975 if (rxq > dev->real_num_rx_queues) {
2976 err = netif_set_real_num_rx_queues(dev, rxq);
2977 if (err)
2978 return err;
2979 }
2980 if (txq > dev->real_num_tx_queues) {
2981 err = netif_set_real_num_tx_queues(dev, txq);
2982 if (err)
2983 goto undo_rx;
2984 }
2985 if (rxq < dev->real_num_rx_queues)
2986 WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
2987 if (txq < dev->real_num_tx_queues)
2988 WARN_ON(netif_set_real_num_tx_queues(dev, txq));
2989
2990 return 0;
2991undo_rx:
2992 WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
2993 return err;
2994}
2995EXPORT_SYMBOL(netif_set_real_num_queues);
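
/* Sketch of a typical caller: an ethtool .set_channels implementation
 * resizing an already-registered device under RTNL, for a hypothetical
 * count of combined channels:
 *
 *	err = netif_set_real_num_queues(dev, count, count);
 *	if (err)
 *		return err;
 */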
2996
2997/**
2998 * netif_set_tso_max_size() - set the max size of TSO frames supported
2999 * @dev: netdev to update
3000 * @size: max skb->len of a TSO frame
3001 *
3002 * Set the limit on the size of TSO super-frames the device can handle.
3003 * Unless explicitly set the stack will assume the value of
3004 * %GSO_LEGACY_MAX_SIZE.
3005 */
3006void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3007{
3008 dev->tso_max_size = min(GSO_MAX_SIZE, size);
3009 if (size < READ_ONCE(dev->gso_max_size))
3010 netif_set_gso_max_size(dev, size);
3011 if (size < READ_ONCE(dev->gso_ipv4_max_size))
3012 netif_set_gso_ipv4_max_size(dev, size);
3013}
3014EXPORT_SYMBOL(netif_set_tso_max_size);
3015
3016/**
3017 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3018 * @dev: netdev to update
3019 * @segs: max number of TCP segments
3020 *
3021 * Set the limit on the number of TCP segments the device can generate from
3022 * a single TSO super-frame.
3023 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3024 */
3025void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3026{
3027 dev->tso_max_segs = segs;
3028 if (segs < READ_ONCE(dev->gso_max_segs))
3029 netif_set_gso_max_segs(dev, segs);
3030}
3031EXPORT_SYMBOL(netif_set_tso_max_segs);
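
/* Illustrative sketch: a driver whose DMA engine handles at most 256 kB per
 * TSO job and 32 descriptors per frame might advertise that from its probe
 * routine (the limits here are invented for the example):
 *
 *	netif_set_tso_max_size(netdev, 256 * 1024);
 *	netif_set_tso_max_segs(netdev, 32);
 */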
3032
3033/**
3034 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3035 * @to: netdev to update
3036 * @from: netdev from which to copy the limits
3037 */
3038void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3039{
3040 netif_set_tso_max_size(to, from->tso_max_size);
3041 netif_set_tso_max_segs(to, from->tso_max_segs);
3042}
3043EXPORT_SYMBOL(netif_inherit_tso_max);
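
/* Illustrative use: stacked devices (bonding/team/VLAN style) typically call
 * this when linking to a lower device so that the upper device never builds
 * TSO frames the lower one cannot handle:
 *
 *	netif_inherit_tso_max(upper_dev, lower_dev);
 */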
3044
3045/**
3046 * netif_get_num_default_rss_queues - default number of RSS queues
3047 *
 * The default is the number of physical cores if there are only 1 or 2,
 * or half the number of physical cores if there are more.
3050 */
3051int netif_get_num_default_rss_queues(void)
3052{
3053 cpumask_var_t cpus;
3054 int cpu, count = 0;
3055
3056 if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3057 return 1;
3058
3059 cpumask_copy(cpus, cpu_online_mask);
3060 for_each_cpu(cpu, cpus) {
3061 ++count;
3062 cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3063 }
3064 free_cpumask_var(cpus);
3065
3066 return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3067}
3068EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3069
3070static void __netif_reschedule(struct Qdisc *q)
3071{
3072 struct softnet_data *sd;
3073 unsigned long flags;
3074
3075 local_irq_save(flags);
3076 sd = this_cpu_ptr(&softnet_data);
3077 q->next_sched = NULL;
3078 *sd->output_queue_tailp = q;
3079 sd->output_queue_tailp = &q->next_sched;
3080 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3081 local_irq_restore(flags);
3082}
3083
3084void __netif_schedule(struct Qdisc *q)
3085{
3086 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3087 __netif_reschedule(q);
3088}
3089EXPORT_SYMBOL(__netif_schedule);
3090
3091struct dev_kfree_skb_cb {
3092 enum skb_drop_reason reason;
3093};
3094
3095static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3096{
3097 return (struct dev_kfree_skb_cb *)skb->cb;
3098}
3099
3100void netif_schedule_queue(struct netdev_queue *txq)
3101{
3102 rcu_read_lock();
3103 if (!netif_xmit_stopped(txq)) {
3104 struct Qdisc *q = rcu_dereference(txq->qdisc);
3105
3106 __netif_schedule(q);
3107 }
3108 rcu_read_unlock();
3109}
3110EXPORT_SYMBOL(netif_schedule_queue);
3111
3112void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3113{
3114 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3115 struct Qdisc *q;
3116
3117 rcu_read_lock();
3118 q = rcu_dereference(dev_queue->qdisc);
3119 __netif_schedule(q);
3120 rcu_read_unlock();
3121 }
3122}
3123EXPORT_SYMBOL(netif_tx_wake_queue);
3124
3125void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3126{
3127 unsigned long flags;
3128
3129 if (unlikely(!skb))
3130 return;
3131
3132 if (likely(refcount_read(&skb->users) == 1)) {
3133 smp_rmb();
3134 refcount_set(&skb->users, 0);
3135 } else if (likely(!refcount_dec_and_test(&skb->users))) {
3136 return;
3137 }
3138 get_kfree_skb_cb(skb)->reason = reason;
3139 local_irq_save(flags);
3140 skb->next = __this_cpu_read(softnet_data.completion_queue);
3141 __this_cpu_write(softnet_data.completion_queue, skb);
3142 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3143 local_irq_restore(flags);
3144}
3145EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3146
3147void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3148{
3149 if (in_hardirq() || irqs_disabled())
3150 dev_kfree_skb_irq_reason(skb, reason);
3151 else
3152 kfree_skb_reason(skb, reason);
3153}
3154EXPORT_SYMBOL(dev_kfree_skb_any_reason);
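
/* Illustrative: a driver's TX completion handler, which may run in hard IRQ
 * context, can rely on the _any variants to pick the right path (the ring
 * helpers here are hypothetical):
 *
 *	static void my_tx_clean(struct my_ring *ring)
 *	{
 *		struct sk_buff *skb;
 *
 *		while ((skb = my_ring_next_completed(ring)))
 *			dev_consume_skb_any(skb);
 *	}
 *
 * dev_consume_skb_any() and dev_kfree_skb_any() are thin wrappers around
 * dev_kfree_skb_any_reason() with SKB_CONSUMED and
 * SKB_DROP_REASON_NOT_SPECIFIED respectively.
 */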
3155
3156
3157/**
3158 * netif_device_detach - mark device as removed
3159 * @dev: network device
3160 *
3161 * Mark device as removed from system and therefore no longer available.
3162 */
3163void netif_device_detach(struct net_device *dev)
3164{
3165 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3166 netif_running(dev)) {
3167 netif_tx_stop_all_queues(dev);
3168 }
3169}
3170EXPORT_SYMBOL(netif_device_detach);
3171
3172/**
3173 * netif_device_attach - mark device as attached
3174 * @dev: network device
3175 *
3176 * Mark device as attached from system and restart if needed.
3177 */
3178void netif_device_attach(struct net_device *dev)
3179{
3180 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3181 netif_running(dev)) {
3182 netif_tx_wake_all_queues(dev);
3183 __netdev_watchdog_up(dev);
3184 }
3185}
3186EXPORT_SYMBOL(netif_device_attach);
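
/* Sketch of the usual suspend/resume pairing (driver callbacks and hardware
 * handling are hypothetical):
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		netif_device_detach(netdev);
 *		... quiesce the hardware ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		... re-initialize the hardware ...
 *		netif_device_attach(netdev);
 *		return 0;
 *	}
 */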
3187
3188/*
 * Returns a Tx hash based on the given packet descriptor and the number of
 * Tx queues to be used as a distribution range.
3191 */
3192static u16 skb_tx_hash(const struct net_device *dev,
3193 const struct net_device *sb_dev,
3194 struct sk_buff *skb)
3195{
3196 u32 hash;
3197 u16 qoffset = 0;
3198 u16 qcount = dev->real_num_tx_queues;
3199
3200 if (dev->num_tc) {
3201 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3202
3203 qoffset = sb_dev->tc_to_txq[tc].offset;
3204 qcount = sb_dev->tc_to_txq[tc].count;
3205 if (unlikely(!qcount)) {
3206 net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3207 sb_dev->name, qoffset, tc);
3208 qoffset = 0;
3209 qcount = dev->real_num_tx_queues;
3210 }
3211 }
3212
3213 if (skb_rx_queue_recorded(skb)) {
3214 DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3215 hash = skb_get_rx_queue(skb);
3216 if (hash >= qoffset)
3217 hash -= qoffset;
3218 while (unlikely(hash >= qcount))
3219 hash -= qcount;
3220 return hash + qoffset;
3221 }
3222
3223 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3224}
3225
3226void skb_warn_bad_offload(const struct sk_buff *skb)
3227{
3228 static const netdev_features_t null_features;
3229 struct net_device *dev = skb->dev;
3230 const char *name = "";
3231
3232 if (!net_ratelimit())
3233 return;
3234
3235 if (dev) {
3236 if (dev->dev.parent)
3237 name = dev_driver_string(dev->dev.parent);
3238 else
3239 name = netdev_name(dev);
3240 }
3241 skb_dump(KERN_WARNING, skb, false);
3242 WARN(1, "%s: caps=(%pNF, %pNF)\n",
3243 name, dev ? &dev->features : &null_features,
3244 skb->sk ? &skb->sk->sk_route_caps : &null_features);
3245}
3246
3247/*
3248 * Invalidate hardware checksum when packet is to be mangled, and
3249 * complete checksum manually on outgoing path.
3250 */
3251int skb_checksum_help(struct sk_buff *skb)
3252{
3253 __wsum csum;
3254 int ret = 0, offset;
3255
3256 if (skb->ip_summed == CHECKSUM_COMPLETE)
3257 goto out_set_summed;
3258
3259 if (unlikely(skb_is_gso(skb))) {
3260 skb_warn_bad_offload(skb);
3261 return -EINVAL;
3262 }
3263
	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity: the checksum could otherwise
	 * be wrong.
	 */
3267 if (skb_has_shared_frag(skb)) {
3268 ret = __skb_linearize(skb);
3269 if (ret)
3270 goto out;
3271 }
3272
3273 offset = skb_checksum_start_offset(skb);
3274 ret = -EINVAL;
3275 if (unlikely(offset >= skb_headlen(skb))) {
3276 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3277 WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3278 offset, skb_headlen(skb));
3279 goto out;
3280 }
3281 csum = skb_checksum(skb, offset, skb->len - offset, 0);
3282
3283 offset += skb->csum_offset;
3284 if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3285 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3286 WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3287 offset + sizeof(__sum16), skb_headlen(skb));
3288 goto out;
3289 }
3290 ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3291 if (ret)
3292 goto out;
3293
3294 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3295out_set_summed:
3296 skb->ip_summed = CHECKSUM_NONE;
3297out:
3298 return ret;
3299}
3300EXPORT_SYMBOL(skb_checksum_help);
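
/* Illustrative fallback in a driver transmit path (my_hw_can_csum() is a
 * hypothetical capability check): when the hardware cannot offload the
 * checksum for this particular packet, resolve it in software first:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */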
3301
3302int skb_crc32c_csum_help(struct sk_buff *skb)
3303{
3304 __le32 crc32c_csum;
3305 int ret = 0, offset, start;
3306
3307 if (skb->ip_summed != CHECKSUM_PARTIAL)
3308 goto out;
3309
3310 if (unlikely(skb_is_gso(skb)))
3311 goto out;
3312
	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity: the checksum could otherwise
	 * be wrong.
	 */
3316 if (unlikely(skb_has_shared_frag(skb))) {
3317 ret = __skb_linearize(skb);
3318 if (ret)
3319 goto out;
3320 }
3321 start = skb_checksum_start_offset(skb);
3322 offset = start + offsetof(struct sctphdr, checksum);
3323 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3324 ret = -EINVAL;
3325 goto out;
3326 }
3327
3328 ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3329 if (ret)
3330 goto out;
3331
3332 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3333 skb->len - start, ~(__u32)0,
3334 crc32c_csum_stub));
3335 *(__le32 *)(skb->data + offset) = crc32c_csum;
3336 skb_reset_csum_not_inet(skb);
3337out:
3338 return ret;
3339}
3340
3341__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3342{
3343 __be16 type = skb->protocol;
3344
3345 /* Tunnel gso handlers can set protocol to ethernet. */
3346 if (type == htons(ETH_P_TEB)) {
3347 struct ethhdr *eth;
3348
3349 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3350 return 0;
3351
3352 eth = (struct ethhdr *)skb->data;
3353 type = eth->h_proto;
3354 }
3355
3356 return vlan_get_protocol_and_depth(skb, type, depth);
3357}
3358
3359
3360/* Take action when hardware reception checksum errors are detected. */
3361#ifdef CONFIG_BUG
3362static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3363{
3364 netdev_err(dev, "hw csum failure\n");
3365 skb_dump(KERN_ERR, skb, true);
3366 dump_stack();
3367}
3368
3369void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3370{
3371 DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3372}
3373EXPORT_SYMBOL(netdev_rx_csum_fault);
3374#endif
3375
3376/* XXX: check that highmem exists at all on the given machine. */
3377static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3378{
3379#ifdef CONFIG_HIGHMEM
3380 int i;
3381
3382 if (!(dev->features & NETIF_F_HIGHDMA)) {
3383 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3384 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3385
3386 if (PageHighMem(skb_frag_page(frag)))
3387 return 1;
3388 }
3389 }
3390#endif
3391 return 0;
3392}
3393
3394/* If MPLS offload request, verify we are testing hardware MPLS features
3395 * instead of standard features for the netdev.
3396 */
3397#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3398static netdev_features_t net_mpls_features(struct sk_buff *skb,
3399 netdev_features_t features,
3400 __be16 type)
3401{
3402 if (eth_p_mpls(type))
3403 features &= skb->dev->mpls_features;
3404
3405 return features;
3406}
3407#else
3408static netdev_features_t net_mpls_features(struct sk_buff *skb,
3409 netdev_features_t features,
3410 __be16 type)
3411{
3412 return features;
3413}
3414#endif
3415
3416static netdev_features_t harmonize_features(struct sk_buff *skb,
3417 netdev_features_t features)
3418{
3419 __be16 type;
3420
3421 type = skb_network_protocol(skb, NULL);
3422 features = net_mpls_features(skb, features, type);
3423
3424 if (skb->ip_summed != CHECKSUM_NONE &&
3425 !can_checksum_protocol(features, type)) {
3426 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3427 }
3428 if (illegal_highdma(skb->dev, skb))
3429 features &= ~NETIF_F_SG;
3430
3431 return features;
3432}
3433
3434netdev_features_t passthru_features_check(struct sk_buff *skb,
3435 struct net_device *dev,
3436 netdev_features_t features)
3437{
3438 return features;
3439}
3440EXPORT_SYMBOL(passthru_features_check);
3441
3442static netdev_features_t dflt_features_check(struct sk_buff *skb,
3443 struct net_device *dev,
3444 netdev_features_t features)
3445{
3446 return vlan_features_check(skb, features);
3447}
3448
3449static netdev_features_t gso_features_check(const struct sk_buff *skb,
3450 struct net_device *dev,
3451 netdev_features_t features)
3452{
3453 u16 gso_segs = skb_shinfo(skb)->gso_segs;
3454
3455 if (gso_segs > READ_ONCE(dev->gso_max_segs))
3456 return features & ~NETIF_F_GSO_MASK;
3457
3458 if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
3459 return features & ~NETIF_F_GSO_MASK;
3460
3461 if (!skb_shinfo(skb)->gso_type) {
3462 skb_warn_bad_offload(skb);
3463 return features & ~NETIF_F_GSO_MASK;
3464 }
3465
	/* Support for GSO partial features requires software
	 * intervention before we can actually process the packets,
	 * so we need to strip support for any partial features now
	 * and pull them back in after we have partially segmented
	 * the frame.
	 */
3472 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3473 features &= ~dev->gso_partial_features;
3474
3475 /* Make sure to clear the IPv4 ID mangling feature if the
3476 * IPv4 header has the potential to be fragmented.
3477 */
3478 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3479 struct iphdr *iph = skb->encapsulation ?
3480 inner_ip_hdr(skb) : ip_hdr(skb);
3481
3482 if (!(iph->frag_off & htons(IP_DF)))
3483 features &= ~NETIF_F_TSO_MANGLEID;
3484 }
3485
3486 return features;
3487}
3488
3489netdev_features_t netif_skb_features(struct sk_buff *skb)
3490{
3491 struct net_device *dev = skb->dev;
3492 netdev_features_t features = dev->features;
3493
3494 if (skb_is_gso(skb))
3495 features = gso_features_check(skb, dev, features);
3496
3497 /* If encapsulation offload request, verify we are testing
3498 * hardware encapsulation features instead of standard
3499 * features for the netdev
3500 */
3501 if (skb->encapsulation)
3502 features &= dev->hw_enc_features;
3503
3504 if (skb_vlan_tagged(skb))
3505 features = netdev_intersect_features(features,
3506 dev->vlan_features |
3507 NETIF_F_HW_VLAN_CTAG_TX |
3508 NETIF_F_HW_VLAN_STAG_TX);
3509
3510 if (dev->netdev_ops->ndo_features_check)
3511 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3512 features);
3513 else
3514 features &= dflt_features_check(skb, dev, features);
3515
3516 return harmonize_features(skb, features);
3517}
3518EXPORT_SYMBOL(netif_skb_features);
3519
3520static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3521 struct netdev_queue *txq, bool more)
3522{
3523 unsigned int len;
3524 int rc;
3525
3526 if (dev_nit_active(dev))
3527 dev_queue_xmit_nit(skb, dev);
3528
3529 len = skb->len;
3530 trace_net_dev_start_xmit(skb, dev);
3531 rc = netdev_start_xmit(skb, dev, txq, more);
3532 trace_net_dev_xmit(skb, rc, dev, len);
3533
3534 return rc;
3535}
3536
3537struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3538 struct netdev_queue *txq, int *ret)
3539{
3540 struct sk_buff *skb = first;
3541 int rc = NETDEV_TX_OK;
3542
3543 while (skb) {
3544 struct sk_buff *next = skb->next;
3545
3546 skb_mark_not_on_list(skb);
3547 rc = xmit_one(skb, dev, txq, next != NULL);
3548 if (unlikely(!dev_xmit_complete(rc))) {
3549 skb->next = next;
3550 goto out;
3551 }
3552
3553 skb = next;
3554 if (netif_tx_queue_stopped(txq) && skb) {
3555 rc = NETDEV_TX_BUSY;
3556 break;
3557 }
3558 }
3559
3560out:
3561 *ret = rc;
3562 return skb;
3563}
3564
3565static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3566 netdev_features_t features)
3567{
3568 if (skb_vlan_tag_present(skb) &&
3569 !vlan_hw_offload_capable(features, skb->vlan_proto))
3570 skb = __vlan_hwaccel_push_inside(skb);
3571 return skb;
3572}
3573
3574int skb_csum_hwoffload_help(struct sk_buff *skb,
3575 const netdev_features_t features)
3576{
3577 if (unlikely(skb_csum_is_sctp(skb)))
3578 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3579 skb_crc32c_csum_help(skb);
3580
3581 if (features & NETIF_F_HW_CSUM)
3582 return 0;
3583
3584 if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3585 switch (skb->csum_offset) {
3586 case offsetof(struct tcphdr, check):
3587 case offsetof(struct udphdr, check):
3588 return 0;
3589 }
3590 }
3591
3592 return skb_checksum_help(skb);
3593}
3594EXPORT_SYMBOL(skb_csum_hwoffload_help);
3595
3596static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3597{
3598 netdev_features_t features;
3599
3600 features = netif_skb_features(skb);
3601 skb = validate_xmit_vlan(skb, features);
3602 if (unlikely(!skb))
3603 goto out_null;
3604
3605 skb = sk_validate_xmit_skb(skb, dev);
3606 if (unlikely(!skb))
3607 goto out_null;
3608
3609 if (netif_needs_gso(skb, features)) {
3610 struct sk_buff *segs;
3611
3612 segs = skb_gso_segment(skb, features);
3613 if (IS_ERR(segs)) {
3614 goto out_kfree_skb;
3615 } else if (segs) {
3616 consume_skb(skb);
3617 skb = segs;
3618 }
3619 } else {
3620 if (skb_needs_linearize(skb, features) &&
3621 __skb_linearize(skb))
3622 goto out_kfree_skb;
3623
3624 /* If packet is not checksummed and device does not
3625 * support checksumming for this protocol, complete
3626 * checksumming here.
3627 */
3628 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3629 if (skb->encapsulation)
3630 skb_set_inner_transport_header(skb,
3631 skb_checksum_start_offset(skb));
3632 else
3633 skb_set_transport_header(skb,
3634 skb_checksum_start_offset(skb));
3635 if (skb_csum_hwoffload_help(skb, features))
3636 goto out_kfree_skb;
3637 }
3638 }
3639
3640 skb = validate_xmit_xfrm(skb, features, again);
3641
3642 return skb;
3643
3644out_kfree_skb:
3645 kfree_skb(skb);
3646out_null:
3647 dev_core_stats_tx_dropped_inc(dev);
3648 return NULL;
3649}
3650
3651struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3652{
3653 struct sk_buff *next, *head = NULL, *tail;
3654
3655 for (; skb != NULL; skb = next) {
3656 next = skb->next;
3657 skb_mark_not_on_list(skb);
3658
		/* in case skb won't be segmented, point to itself */
3660 skb->prev = skb;
3661
3662 skb = validate_xmit_skb(skb, dev, again);
3663 if (!skb)
3664 continue;
3665
3666 if (!head)
3667 head = skb;
3668 else
3669 tail->next = skb;
3670 /* If skb was segmented, skb->prev points to
3671 * the last segment. If not, it still contains skb.
3672 */
3673 tail = skb->prev;
3674 }
3675 return head;
3676}
3677EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3678
3679static void qdisc_pkt_len_init(struct sk_buff *skb)
3680{
3681 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3682
3683 qdisc_skb_cb(skb)->pkt_len = skb->len;
3684
	/* To get a more precise estimate of the bytes sent on the wire,
	 * we add the header size of every segment to pkt_len.
	 */
3688 if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3689 u16 gso_segs = shinfo->gso_segs;
3690 unsigned int hdr_len;
3691
3692 /* mac layer + network layer */
3693 hdr_len = skb_transport_offset(skb);
3694
3695 /* + transport layer */
3696 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3697 const struct tcphdr *th;
3698 struct tcphdr _tcphdr;
3699
3700 th = skb_header_pointer(skb, hdr_len,
3701 sizeof(_tcphdr), &_tcphdr);
3702 if (likely(th))
3703 hdr_len += __tcp_hdrlen(th);
3704 } else {
3705 struct udphdr _udphdr;
3706
3707 if (skb_header_pointer(skb, hdr_len,
3708 sizeof(_udphdr), &_udphdr))
3709 hdr_len += sizeof(struct udphdr);
3710 }
3711
3712 if (shinfo->gso_type & SKB_GSO_DODGY)
3713 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3714 shinfo->gso_size);
3715
3716 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3717 }
3718}
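
/* Worked example for the estimate above: a TSO skb carrying 10 segments of
 * 1460 bytes (skb->len = 54 + 14600 = 14654) with a 14 + 20 + 20 = 54 byte
 * Ethernet/IPv4/TCP header gives
 *
 *	pkt_len = 14654 + (10 - 1) * 54 = 15140
 *
 * which matches the 10 * 1514 bytes actually sent on the wire.
 */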
3719
3720static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
3721 struct sk_buff **to_free,
3722 struct netdev_queue *txq)
3723{
3724 int rc;
3725
3726 rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
3727 if (rc == NET_XMIT_SUCCESS)
3728 trace_qdisc_enqueue(q, txq, skb);
3729 return rc;
3730}
3731
3732static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3733 struct net_device *dev,
3734 struct netdev_queue *txq)
3735{
3736 spinlock_t *root_lock = qdisc_lock(q);
3737 struct sk_buff *to_free = NULL;
3738 bool contended;
3739 int rc;
3740
3741 qdisc_calculate_pkt_len(skb, q);
3742
3743 tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
3744
3745 if (q->flags & TCQ_F_NOLOCK) {
3746 if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
3747 qdisc_run_begin(q)) {
3748 /* Retest nolock_qdisc_is_empty() within the protection
3749 * of q->seqlock to protect from racing with requeuing.
3750 */
3751 if (unlikely(!nolock_qdisc_is_empty(q))) {
3752 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3753 __qdisc_run(q);
3754 qdisc_run_end(q);
3755
3756 goto no_lock_out;
3757 }
3758
3759 qdisc_bstats_cpu_update(q, skb);
3760 if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
3761 !nolock_qdisc_is_empty(q))
3762 __qdisc_run(q);
3763
3764 qdisc_run_end(q);
3765 return NET_XMIT_SUCCESS;
3766 }
3767
3768 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3769 qdisc_run(q);
3770
3771no_lock_out:
3772 if (unlikely(to_free))
3773 kfree_skb_list_reason(to_free,
3774 tcf_get_drop_reason(to_free));
3775 return rc;
3776 }
3777
3778 if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
3779 kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
3780 return NET_XMIT_DROP;
3781 }
3782 /*
3783 * Heuristic to force contended enqueues to serialize on a
3784 * separate lock before trying to get qdisc main lock.
3785 * This permits qdisc->running owner to get the lock more
3786 * often and dequeue packets faster.
3787 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
3788 * and then other tasks will only enqueue packets. The packets will be
3789 * sent after the qdisc owner is scheduled again. To prevent this
	 * scenario the task always serializes on the lock.
3791 */
3792 contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
3793 if (unlikely(contended))
3794 spin_lock(&q->busylock);
3795
3796 spin_lock(root_lock);
3797 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3798 __qdisc_drop(skb, &to_free);
3799 rc = NET_XMIT_DROP;
3800 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3801 qdisc_run_begin(q)) {
3802 /*
3803 * This is a work-conserving queue; there are no old skbs
3804 * waiting to be sent out; and the qdisc is not running -
3805 * xmit the skb directly.
3806 */
3807
3808 qdisc_bstats_update(q, skb);
3809
3810 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3811 if (unlikely(contended)) {
3812 spin_unlock(&q->busylock);
3813 contended = false;
3814 }
3815 __qdisc_run(q);
3816 }
3817
3818 qdisc_run_end(q);
3819 rc = NET_XMIT_SUCCESS;
3820 } else {
3821 WRITE_ONCE(q->owner, smp_processor_id());
3822 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3823 WRITE_ONCE(q->owner, -1);
3824 if (qdisc_run_begin(q)) {
3825 if (unlikely(contended)) {
3826 spin_unlock(&q->busylock);
3827 contended = false;
3828 }
3829 __qdisc_run(q);
3830 qdisc_run_end(q);
3831 }
3832 }
3833 spin_unlock(root_lock);
3834 if (unlikely(to_free))
3835 kfree_skb_list_reason(to_free,
3836 tcf_get_drop_reason(to_free));
3837 if (unlikely(contended))
3838 spin_unlock(&q->busylock);
3839 return rc;
3840}
3841
3842#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3843static void skb_update_prio(struct sk_buff *skb)
3844{
3845 const struct netprio_map *map;
3846 const struct sock *sk;
3847 unsigned int prioidx;
3848
3849 if (skb->priority)
3850 return;
3851 map = rcu_dereference_bh(skb->dev->priomap);
3852 if (!map)
3853 return;
3854 sk = skb_to_full_sk(skb);
3855 if (!sk)
3856 return;
3857
3858 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3859
3860 if (prioidx < map->priomap_len)
3861 skb->priority = map->priomap[prioidx];
3862}
3863#else
3864#define skb_update_prio(skb)
3865#endif
3866
3867/**
3868 * dev_loopback_xmit - loop back @skb
3869 * @net: network namespace this loopback is happening in
 * @sk: socket; only needed so the function can serve as a netfilter okfn
3871 * @skb: buffer to transmit
3872 */
3873int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3874{
3875 skb_reset_mac_header(skb);
3876 __skb_pull(skb, skb_network_offset(skb));
3877 skb->pkt_type = PACKET_LOOPBACK;
3878 if (skb->ip_summed == CHECKSUM_NONE)
3879 skb->ip_summed = CHECKSUM_UNNECESSARY;
3880 DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
3881 skb_dst_force(skb);
3882 netif_rx(skb);
3883 return 0;
3884}
3885EXPORT_SYMBOL(dev_loopback_xmit);
3886
3887#ifdef CONFIG_NET_EGRESS
3888static struct netdev_queue *
3889netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
3890{
3891 int qm = skb_get_queue_mapping(skb);
3892
3893 return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
3894}
3895
3896static bool netdev_xmit_txqueue_skipped(void)
3897{
3898 return __this_cpu_read(softnet_data.xmit.skip_txqueue);
3899}
3900
3901void netdev_xmit_skip_txqueue(bool skip)
3902{
3903 __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
3904}
3905EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
3906#endif /* CONFIG_NET_EGRESS */
3907
3908#ifdef CONFIG_NET_XGRESS
3909static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
3910 enum skb_drop_reason *drop_reason)
3911{
3912 int ret = TC_ACT_UNSPEC;
3913#ifdef CONFIG_NET_CLS_ACT
3914 struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
3915 struct tcf_result res;
3916
3917 if (!miniq)
3918 return ret;
3919
3920 tc_skb_cb(skb)->mru = 0;
3921 tc_skb_cb(skb)->post_ct = false;
3922 tcf_set_drop_reason(skb, *drop_reason);
3923
3924 mini_qdisc_bstats_cpu_update(miniq, skb);
3925 ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
3926 /* Only tcf related quirks below. */
3927 switch (ret) {
3928 case TC_ACT_SHOT:
3929 *drop_reason = tcf_get_drop_reason(skb);
3930 mini_qdisc_qstats_cpu_drop(miniq);
3931 break;
3932 case TC_ACT_OK:
3933 case TC_ACT_RECLASSIFY:
3934 skb->tc_index = TC_H_MIN(res.classid);
3935 break;
3936 }
3937#endif /* CONFIG_NET_CLS_ACT */
3938 return ret;
3939}
3940
3941static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
3942
3943void tcx_inc(void)
3944{
3945 static_branch_inc(&tcx_needed_key);
3946}
3947
3948void tcx_dec(void)
3949{
3950 static_branch_dec(&tcx_needed_key);
3951}
3952
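/* Walk the tcx BPF programs attached to @entry in order; the first program
 * returning something other than TCX_NEXT decides the verdict, which is then
 * translated via tcx_action_code(). @needs_mac pushes/pops the MAC header so
 * ingress programs see the frame starting at L2.
 */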
3953static __always_inline enum tcx_action_base
3954tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
3955 const bool needs_mac)
3956{
3957 const struct bpf_mprog_fp *fp;
3958 const struct bpf_prog *prog;
3959 int ret = TCX_NEXT;
3960
3961 if (needs_mac)
3962 __skb_push(skb, skb->mac_len);
3963 bpf_mprog_foreach_prog(entry, fp, prog) {
3964 bpf_compute_data_pointers(skb);
3965 ret = bpf_prog_run(prog, skb);
3966 if (ret != TCX_NEXT)
3967 break;
3968 }
3969 if (needs_mac)
3970 __skb_pull(skb, skb->mac_len);
3971 return tcx_action_code(skb, ret);
3972}
3973
3974static __always_inline struct sk_buff *
3975sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3976 struct net_device *orig_dev, bool *another)
3977{
3978 struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
3979 enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
3980 int sch_ret;
3981
3982 if (!entry)
3983 return skb;
3984 if (*pt_prev) {
3985 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3986 *pt_prev = NULL;
3987 }
3988
3989 qdisc_skb_cb(skb)->pkt_len = skb->len;
3990 tcx_set_ingress(skb, true);
3991
3992 if (static_branch_unlikely(&tcx_needed_key)) {
3993 sch_ret = tcx_run(entry, skb, true);
3994 if (sch_ret != TC_ACT_UNSPEC)
3995 goto ingress_verdict;
3996 }
3997 sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
3998ingress_verdict:
3999 switch (sch_ret) {
4000 case TC_ACT_REDIRECT:
4001 /* skb_mac_header check was done by BPF, so we can safely
4002 * push the L2 header back before redirecting to another
4003 * netdev.
4004 */
4005 __skb_push(skb, skb->mac_len);
4006 if (skb_do_redirect(skb) == -EAGAIN) {
4007 __skb_pull(skb, skb->mac_len);
4008 *another = true;
4009 break;
4010 }
4011 *ret = NET_RX_SUCCESS;
4012 return NULL;
4013 case TC_ACT_SHOT:
4014 kfree_skb_reason(skb, drop_reason);
4015 *ret = NET_RX_DROP;
4016 return NULL;
4017 /* used by tc_run */
4018 case TC_ACT_STOLEN:
4019 case TC_ACT_QUEUED:
4020 case TC_ACT_TRAP:
4021 consume_skb(skb);
4022 fallthrough;
4023 case TC_ACT_CONSUMED:
4024 *ret = NET_RX_SUCCESS;
4025 return NULL;
4026 }
4027
4028 return skb;
4029}
4030
4031static __always_inline struct sk_buff *
4032sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4033{
4034 struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
4035 enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
4036 int sch_ret;
4037
4038 if (!entry)
4039 return skb;
4040
4041	/* qdisc_skb_cb(skb)->pkt_len was already set and tcx_set_ingress()
4042	 * already called by the caller.
4043 */
4044 if (static_branch_unlikely(&tcx_needed_key)) {
4045 sch_ret = tcx_run(entry, skb, false);
4046 if (sch_ret != TC_ACT_UNSPEC)
4047 goto egress_verdict;
4048 }
4049 sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4050egress_verdict:
4051 switch (sch_ret) {
4052 case TC_ACT_REDIRECT:
4053 /* No need to push/pop skb's mac_header here on egress! */
4054 skb_do_redirect(skb);
4055 *ret = NET_XMIT_SUCCESS;
4056 return NULL;
4057 case TC_ACT_SHOT:
4058 kfree_skb_reason(skb, drop_reason);
4059 *ret = NET_XMIT_DROP;
4060 return NULL;
4061 /* used by tc_run */
4062 case TC_ACT_STOLEN:
4063 case TC_ACT_QUEUED:
4064 case TC_ACT_TRAP:
4065 consume_skb(skb);
4066 fallthrough;
4067 case TC_ACT_CONSUMED:
4068 *ret = NET_XMIT_SUCCESS;
4069 return NULL;
4070 }
4071
4072 return skb;
4073}
4074#else
4075static __always_inline struct sk_buff *
4076sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4077 struct net_device *orig_dev, bool *another)
4078{
4079 return skb;
4080}
4081
4082static __always_inline struct sk_buff *
4083sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4084{
4085 return skb;
4086}
4087#endif /* CONFIG_NET_XGRESS */
4088
4089#ifdef CONFIG_XPS
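/* XPS lookup helper: combine the skb's traffic class and the given tc index
 * (CPU or RX queue based) into an index into the XPS attr_map, then pick a
 * TX queue from that map, hashing across its entries when more than one
 * queue is configured.
 */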
4090static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
4091 struct xps_dev_maps *dev_maps, unsigned int tci)
4092{
4093 int tc = netdev_get_prio_tc_map(dev, skb->priority);
4094 struct xps_map *map;
4095 int queue_index = -1;
4096
4097 if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
4098 return queue_index;
4099
4100 tci *= dev_maps->num_tc;
4101 tci += tc;
4102
4103 map = rcu_dereference(dev_maps->attr_map[tci]);
4104 if (map) {
4105 if (map->len == 1)
4106 queue_index = map->queues[0];
4107 else
4108 queue_index = map->queues[reciprocal_scale(
4109 skb_get_hash(skb), map->len)];
4110 if (unlikely(queue_index >= dev->real_num_tx_queues))
4111 queue_index = -1;
4112 }
4113 return queue_index;
4114}
4115#endif
4116
4117static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
4118 struct sk_buff *skb)
4119{
4120#ifdef CONFIG_XPS
4121 struct xps_dev_maps *dev_maps;
4122 struct sock *sk = skb->sk;
4123 int queue_index = -1;
4124
4125 if (!static_key_false(&xps_needed))
4126 return -1;
4127
4128 rcu_read_lock();
4129 if (!static_key_false(&xps_rxqs_needed))
4130 goto get_cpus_map;
4131
4132 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
4133 if (dev_maps) {
4134 int tci = sk_rx_queue_get(sk);
4135
4136 if (tci >= 0)
4137 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4138 tci);
4139 }
4140
4141get_cpus_map:
4142 if (queue_index < 0) {
4143 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
4144 if (dev_maps) {
4145 unsigned int tci = skb->sender_cpu - 1;
4146
4147 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4148 tci);
4149 }
4150 }
4151 rcu_read_unlock();
4152
4153 return queue_index;
4154#else
4155 return -1;
4156#endif
4157}
4158
4159u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4160 struct net_device *sb_dev)
4161{
4162 return 0;
4163}
4164EXPORT_SYMBOL(dev_pick_tx_zero);
4165
4166u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4167 struct net_device *sb_dev)
4168{
4169 return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4170}
4171EXPORT_SYMBOL(dev_pick_tx_cpu_id);
4172
4173u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4174 struct net_device *sb_dev)
4175{
4176 struct sock *sk = skb->sk;
4177 int queue_index = sk_tx_queue_get(sk);
4178
4179 sb_dev = sb_dev ? : dev;
4180
4181 if (queue_index < 0 || skb->ooo_okay ||
4182 queue_index >= dev->real_num_tx_queues) {
4183 int new_index = get_xps_queue(dev, sb_dev, skb);
4184
4185 if (new_index < 0)
4186 new_index = skb_tx_hash(dev, sb_dev, skb);
4187
4188 if (queue_index != new_index && sk &&
4189 sk_fullsock(sk) &&
4190 rcu_access_pointer(sk->sk_dst_cache))
4191 sk_tx_queue_set(sk, new_index);
4192
4193 queue_index = new_index;
4194 }
4195
4196 return queue_index;
4197}
4198EXPORT_SYMBOL(netdev_pick_tx);
4199
4200struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4201 struct sk_buff *skb,
4202 struct net_device *sb_dev)
4203{
4204 int queue_index = 0;
4205
4206#ifdef CONFIG_XPS
4207 u32 sender_cpu = skb->sender_cpu - 1;
4208
4209 if (sender_cpu >= (u32)NR_CPUS)
4210 skb->sender_cpu = raw_smp_processor_id() + 1;
4211#endif
4212
4213 if (dev->real_num_tx_queues != 1) {
4214 const struct net_device_ops *ops = dev->netdev_ops;
4215
4216 if (ops->ndo_select_queue)
4217 queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4218 else
4219 queue_index = netdev_pick_tx(dev, skb, sb_dev);
4220
4221 queue_index = netdev_cap_txqueue(dev, queue_index);
4222 }
4223
4224 skb_set_queue_mapping(skb, queue_index);
4225 return netdev_get_tx_queue(dev, queue_index);
4226}
4227
4228/**
4229 * __dev_queue_xmit() - transmit a buffer
4230 * @skb: buffer to transmit
4231 * @sb_dev: subordinate device used for L2 forwarding offload
4232 *
4233 * Queue a buffer for transmission to a network device. The caller must
4234 * have set the device and priority and built the buffer before calling
4235 * this function. The function can be called from an interrupt.
4236 *
4237 * When calling this method, interrupts MUST be enabled. This is because
4238 * the BH enable code must have IRQs enabled so that it will not deadlock.
4239 *
4240 * Regardless of the return value, the skb is consumed, so it is currently
4241 * difficult to retry a send to this method. (You can bump the ref count
4242 * before sending to hold a reference for retry if you are careful.)
4243 *
4244 * Return:
4245 * * 0 - buffer successfully transmitted
4246 * * positive qdisc return code - NET_XMIT_DROP etc.
4247 * * negative errno - other errors
4248 */
4249int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4250{
4251 struct net_device *dev = skb->dev;
4252 struct netdev_queue *txq = NULL;
4253 struct Qdisc *q;
4254 int rc = -ENOMEM;
4255 bool again = false;
4256
4257 skb_reset_mac_header(skb);
4258 skb_assert_len(skb);
4259
4260 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4261 __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4262
4263 /* Disable soft irqs for various locks below. Also
4264 * stops preemption for RCU.
4265 */
4266 rcu_read_lock_bh();
4267
4268 skb_update_prio(skb);
4269
4270 qdisc_pkt_len_init(skb);
4271 tcx_set_ingress(skb, false);
4272#ifdef CONFIG_NET_EGRESS
4273 if (static_branch_unlikely(&egress_needed_key)) {
4274 if (nf_hook_egress_active()) {
4275 skb = nf_hook_egress(skb, &rc, dev);
4276 if (!skb)
4277 goto out;
4278 }
4279
4280 netdev_xmit_skip_txqueue(false);
4281
4282 nf_skip_egress(skb, true);
4283 skb = sch_handle_egress(skb, &rc, dev);
4284 if (!skb)
4285 goto out;
4286 nf_skip_egress(skb, false);
4287
4288 if (netdev_xmit_txqueue_skipped())
4289 txq = netdev_tx_queue_mapping(dev, skb);
4290 }
4291#endif
4292 /* If device/qdisc don't need skb->dst, release it right now while
4293	 * it's hot in this CPU cache.
4294 */
4295 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4296 skb_dst_drop(skb);
4297 else
4298 skb_dst_force(skb);
4299
4300 if (!txq)
4301 txq = netdev_core_pick_tx(dev, skb, sb_dev);
4302
4303 q = rcu_dereference_bh(txq->qdisc);
4304
4305 trace_net_dev_queue(skb);
4306 if (q->enqueue) {
4307 rc = __dev_xmit_skb(skb, q, dev, txq);
4308 goto out;
4309 }
4310
4311	/* The device has no queue. Common case for software devices:
4312	 * loopback, all sorts of tunnels...
4313	 *
4314	 * Really, it is unlikely that netif_tx_lock protection is necessary
4315	 * here. (f.e. loopback and IP tunnels are clean ignoring statistics
4316	 * counters.)
4317	 * However, it is possible that they rely on the protection
4318	 * made by us here.
4319	 *
4320	 * Check this and take the lock; it is not prone to deadlocks.
4321	 * Or take the noqueue qdisc path instead, it is even simpler 8)
4322	 */
4323 if (dev->flags & IFF_UP) {
4324 int cpu = smp_processor_id(); /* ok because BHs are off */
4325
4326 /* Other cpus might concurrently change txq->xmit_lock_owner
4327 * to -1 or to their cpu id, but not to our id.
4328 */
4329 if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4330 if (dev_xmit_recursion())
4331 goto recursion_alert;
4332
4333 skb = validate_xmit_skb(skb, dev, &again);
4334 if (!skb)
4335 goto out;
4336
4337 HARD_TX_LOCK(dev, txq, cpu);
4338
4339 if (!netif_xmit_stopped(txq)) {
4340 dev_xmit_recursion_inc();
4341 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4342 dev_xmit_recursion_dec();
4343 if (dev_xmit_complete(rc)) {
4344 HARD_TX_UNLOCK(dev, txq);
4345 goto out;
4346 }
4347 }
4348 HARD_TX_UNLOCK(dev, txq);
4349 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4350 dev->name);
4351 } else {
4352 /* Recursion is detected! It is possible,
4353 * unfortunately
4354 */
4355recursion_alert:
4356 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4357 dev->name);
4358 }
4359 }
4360
4361 rc = -ENETDOWN;
4362 rcu_read_unlock_bh();
4363
4364 dev_core_stats_tx_dropped_inc(dev);
4365 kfree_skb_list(skb);
4366 return rc;
4367out:
4368 rcu_read_unlock_bh();
4369 return rc;
4370}
4371EXPORT_SYMBOL(__dev_queue_xmit);
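/* Illustrative sketch (not from this file): most callers go through the
 * dev_queue_xmit() wrapper once skb->dev and the payload are set up:
 *
 *	skb->dev = dev;
 *	// ... build headers and data ...
 *	rc = dev_queue_xmit(skb);	// skb is consumed regardless of rc
 */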
4372
4373int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4374{
4375 struct net_device *dev = skb->dev;
4376 struct sk_buff *orig_skb = skb;
4377 struct netdev_queue *txq;
4378 int ret = NETDEV_TX_BUSY;
4379 bool again = false;
4380
4381 if (unlikely(!netif_running(dev) ||
4382 !netif_carrier_ok(dev)))
4383 goto drop;
4384
4385 skb = validate_xmit_skb_list(skb, dev, &again);
4386 if (skb != orig_skb)
4387 goto drop;
4388
4389 skb_set_queue_mapping(skb, queue_id);
4390 txq = skb_get_tx_queue(dev, skb);
4391
4392 local_bh_disable();
4393
4394 dev_xmit_recursion_inc();
4395 HARD_TX_LOCK(dev, txq, smp_processor_id());
4396 if (!netif_xmit_frozen_or_drv_stopped(txq))
4397 ret = netdev_start_xmit(skb, dev, txq, false);
4398 HARD_TX_UNLOCK(dev, txq);
4399 dev_xmit_recursion_dec();
4400
4401 local_bh_enable();
4402 return ret;
4403drop:
4404 dev_core_stats_tx_dropped_inc(dev);
4405 kfree_skb_list(skb);
4406 return NET_XMIT_DROP;
4407}
4408EXPORT_SYMBOL(__dev_direct_xmit);
4409
4410/*************************************************************************
4411 * Receiver routines
4412 *************************************************************************/
4413
4414unsigned int sysctl_skb_defer_max __read_mostly = 64;
4415int weight_p __read_mostly = 64; /* old backlog weight */
4416int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
4417int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
4418
4419/* Called with irq disabled */
4420static inline void ____napi_schedule(struct softnet_data *sd,
4421 struct napi_struct *napi)
4422{
4423 struct task_struct *thread;
4424
4425 lockdep_assert_irqs_disabled();
4426
4427 if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4428 /* Paired with smp_mb__before_atomic() in
4429 * napi_enable()/dev_set_threaded().
4430 * Use READ_ONCE() to guarantee a complete
4431 * read on napi->thread. Only call
4432 * wake_up_process() when it's not NULL.
4433 */
4434 thread = READ_ONCE(napi->thread);
4435 if (thread) {
4436 /* Avoid doing set_bit() if the thread is in
4437		 * INTERRUPTIBLE state, because napi_thread_wait()
4438 * makes sure to proceed with napi polling
4439 * if the thread is explicitly woken from here.
4440 */
4441 if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
4442 set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4443 wake_up_process(thread);
4444 return;
4445 }
4446 }
4447
4448 list_add_tail(&napi->poll_list, &sd->poll_list);
4449 WRITE_ONCE(napi->list_owner, smp_processor_id());
4450 /* If not called from net_rx_action()
4451 * we have to raise NET_RX_SOFTIRQ.
4452 */
4453 if (!sd->in_net_rx_action)
4454 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4455}
4456
4457#ifdef CONFIG_RPS
4458
4459struct static_key_false rps_needed __read_mostly;
4460EXPORT_SYMBOL(rps_needed);
4461struct static_key_false rfs_needed __read_mostly;
4462EXPORT_SYMBOL(rfs_needed);
4463
4464static struct rps_dev_flow *
4465set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4466 struct rps_dev_flow *rflow, u16 next_cpu)
4467{
4468 if (next_cpu < nr_cpu_ids) {
4469#ifdef CONFIG_RFS_ACCEL
4470 struct netdev_rx_queue *rxqueue;
4471 struct rps_dev_flow_table *flow_table;
4472 struct rps_dev_flow *old_rflow;
4473 u32 flow_id;
4474 u16 rxq_index;
4475 int rc;
4476
4477 /* Should we steer this flow to a different hardware queue? */
4478 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4479 !(dev->features & NETIF_F_NTUPLE))
4480 goto out;
4481 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4482 if (rxq_index == skb_get_rx_queue(skb))
4483 goto out;
4484
4485 rxqueue = dev->_rx + rxq_index;
4486 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4487 if (!flow_table)
4488 goto out;
4489 flow_id = skb_get_hash(skb) & flow_table->mask;
4490 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4491 rxq_index, flow_id);
4492 if (rc < 0)
4493 goto out;
4494 old_rflow = rflow;
4495 rflow = &flow_table->flows[flow_id];
4496 rflow->filter = rc;
4497 if (old_rflow->filter == rflow->filter)
4498 old_rflow->filter = RPS_NO_FILTER;
4499 out:
4500#endif
4501 rflow->last_qtail =
4502 per_cpu(softnet_data, next_cpu).input_queue_head;
4503 }
4504
4505 rflow->cpu = next_cpu;
4506 return rflow;
4507}
4508
4509/*
4510 * get_rps_cpu is called from netif_receive_skb and returns the target
4511 * CPU from the RPS map of the receiving queue for a given skb.
4512 * rcu_read_lock must be held on entry.
4513 */
4514static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4515 struct rps_dev_flow **rflowp)
4516{
4517 const struct rps_sock_flow_table *sock_flow_table;
4518 struct netdev_rx_queue *rxqueue = dev->_rx;
4519 struct rps_dev_flow_table *flow_table;
4520 struct rps_map *map;
4521 int cpu = -1;
4522 u32 tcpu;
4523 u32 hash;
4524
4525 if (skb_rx_queue_recorded(skb)) {
4526 u16 index = skb_get_rx_queue(skb);
4527
4528 if (unlikely(index >= dev->real_num_rx_queues)) {
4529 WARN_ONCE(dev->real_num_rx_queues > 1,
4530 "%s received packet on queue %u, but number "
4531 "of RX queues is %u\n",
4532 dev->name, index, dev->real_num_rx_queues);
4533 goto done;
4534 }
4535 rxqueue += index;
4536 }
4537
4538 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4539
4540 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4541 map = rcu_dereference(rxqueue->rps_map);
4542 if (!flow_table && !map)
4543 goto done;
4544
4545 skb_reset_network_header(skb);
4546 hash = skb_get_hash(skb);
4547 if (!hash)
4548 goto done;
4549
4550 sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
4551 if (flow_table && sock_flow_table) {
4552 struct rps_dev_flow *rflow;
4553 u32 next_cpu;
4554 u32 ident;
4555
4556		/* First check the global flow table to see if there is a match.
4557 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4558 */
4559 ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4560 if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
4561 goto try_rps;
4562
4563 next_cpu = ident & net_hotdata.rps_cpu_mask;
4564
4565 /* OK, now we know there is a match,
4566 * we can look at the local (per receive queue) flow table
4567 */
4568 rflow = &flow_table->flows[hash & flow_table->mask];
4569 tcpu = rflow->cpu;
4570
4571 /*
4572 * If the desired CPU (where last recvmsg was done) is
4573 * different from current CPU (one in the rx-queue flow
4574 * table entry), switch if one of the following holds:
4575 * - Current CPU is unset (>= nr_cpu_ids).
4576 * - Current CPU is offline.
4577 * - The current CPU's queue tail has advanced beyond the
4578 * last packet that was enqueued using this table entry.
4579 * This guarantees that all previous packets for the flow
4580 * have been dequeued, thus preserving in order delivery.
4581 */
4582 if (unlikely(tcpu != next_cpu) &&
4583 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4584 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4585 rflow->last_qtail)) >= 0)) {
4586 tcpu = next_cpu;
4587 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4588 }
4589
4590 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4591 *rflowp = rflow;
4592 cpu = tcpu;
4593 goto done;
4594 }
4595 }
4596
4597try_rps:
4598
4599 if (map) {
4600 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4601 if (cpu_online(tcpu)) {
4602 cpu = tcpu;
4603 goto done;
4604 }
4605 }
4606
4607done:
4608 return cpu;
4609}
4610
4611#ifdef CONFIG_RFS_ACCEL
4612
4613/**
4614 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4615 * @dev: Device on which the filter was set
4616 * @rxq_index: RX queue index
4617 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4618 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4619 *
4620 * Drivers that implement ndo_rx_flow_steer() should periodically call
4621 * this function for each installed filter and remove the filters for
4622 * which it returns %true.
4623 */
4624bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4625 u32 flow_id, u16 filter_id)
4626{
4627 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4628 struct rps_dev_flow_table *flow_table;
4629 struct rps_dev_flow *rflow;
4630 bool expire = true;
4631 unsigned int cpu;
4632
4633 rcu_read_lock();
4634 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4635 if (flow_table && flow_id <= flow_table->mask) {
4636 rflow = &flow_table->flows[flow_id];
4637 cpu = READ_ONCE(rflow->cpu);
4638 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4639 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4640 rflow->last_qtail) <
4641 (int)(10 * flow_table->mask)))
4642 expire = false;
4643 }
4644 rcu_read_unlock();
4645 return expire;
4646}
4647EXPORT_SYMBOL(rps_may_expire_flow);
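/* Illustrative sketch (not from this file): a driver's ARFS expiry scan
 * might use this roughly as follows (the names below are hypothetical):
 *
 *	if (rps_may_expire_flow(netdev, f->rxq_index, f->flow_id, f->id))
 *		my_driver_remove_filter(f);
 */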
4648
4649#endif /* CONFIG_RFS_ACCEL */
4650
4651/* Called from hardirq (IPI) context */
4652static void rps_trigger_softirq(void *data)
4653{
4654 struct softnet_data *sd = data;
4655
4656 ____napi_schedule(sd, &sd->backlog);
4657 sd->received_rps++;
4658}
4659
4660#endif /* CONFIG_RPS */
4661
4662/* Called from hardirq (IPI) context */
4663static void trigger_rx_softirq(void *data)
4664{
4665 struct softnet_data *sd = data;
4666
4667 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4668 smp_store_release(&sd->defer_ipi_scheduled, 0);
4669}
4670
4671/*
4672 * After we queued a packet into sd->input_pkt_queue,
4673 * we need to make sure this queue is serviced soon.
4674 *
4675 * - If this is another cpu queue, link it to our rps_ipi_list,
4676 * and make sure we will process rps_ipi_list from net_rx_action().
4677 *
4678 * - If this is our own queue, NAPI schedule our backlog.
4679 * Note that this also raises NET_RX_SOFTIRQ.
4680 */
4681static void napi_schedule_rps(struct softnet_data *sd)
4682{
4683 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4684
4685#ifdef CONFIG_RPS
4686 if (sd != mysd) {
4687 sd->rps_ipi_next = mysd->rps_ipi_list;
4688 mysd->rps_ipi_list = sd;
4689
4690 /* If not called from net_rx_action() or napi_threaded_poll()
4691 * we have to raise NET_RX_SOFTIRQ.
4692 */
4693 if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
4694 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4695 return;
4696 }
4697#endif /* CONFIG_RPS */
4698 __napi_schedule_irqoff(&mysd->backlog);
4699}
4700
4701#ifdef CONFIG_NET_FLOW_LIMIT
4702int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4703#endif
4704
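/* Flow limiting for the backlog: once the queue is more than half of
 * max_backlog, track a short history of flow hashes and report true when a
 * single flow accounts for more than half of the last FLOW_LIMIT_HISTORY
 * packets, so that one heavy flow cannot monopolise the backlog.
 */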
4705static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4706{
4707#ifdef CONFIG_NET_FLOW_LIMIT
4708 struct sd_flow_limit *fl;
4709 struct softnet_data *sd;
4710 unsigned int old_flow, new_flow;
4711
4712 if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
4713 return false;
4714
4715 sd = this_cpu_ptr(&softnet_data);
4716
4717 rcu_read_lock();
4718 fl = rcu_dereference(sd->flow_limit);
4719 if (fl) {
4720 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4721 old_flow = fl->history[fl->history_head];
4722 fl->history[fl->history_head] = new_flow;
4723
4724 fl->history_head++;
4725 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4726
4727 if (likely(fl->buckets[old_flow]))
4728 fl->buckets[old_flow]--;
4729
4730 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4731 fl->count++;
4732 rcu_read_unlock();
4733 return true;
4734 }
4735 }
4736 rcu_read_unlock();
4737#endif
4738 return false;
4739}
4740
4741/*
4742 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4743 * queue (may be a remote CPU queue).
4744 */
4745static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4746 unsigned int *qtail)
4747{
4748 enum skb_drop_reason reason;
4749 struct softnet_data *sd;
4750 unsigned long flags;
4751 unsigned int qlen;
4752
4753 reason = SKB_DROP_REASON_NOT_SPECIFIED;
4754 sd = &per_cpu(softnet_data, cpu);
4755
4756 rps_lock_irqsave(sd, &flags);
4757 if (!netif_running(skb->dev))
4758 goto drop;
4759 qlen = skb_queue_len(&sd->input_pkt_queue);
4760 if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
4761 !skb_flow_limit(skb, qlen)) {
4762 if (qlen) {
4763enqueue:
4764 __skb_queue_tail(&sd->input_pkt_queue, skb);
4765 input_queue_tail_incr_save(sd, qtail);
4766 rps_unlock_irq_restore(sd, &flags);
4767 return NET_RX_SUCCESS;
4768 }
4769
4770		/* Schedule NAPI for the backlog device.
4771		 * We can use a non-atomic operation since we own the queue lock.
4772 */
4773 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
4774 napi_schedule_rps(sd);
4775 goto enqueue;
4776 }
4777 reason = SKB_DROP_REASON_CPU_BACKLOG;
4778
4779drop:
4780 sd->dropped++;
4781 rps_unlock_irq_restore(sd, &flags);
4782
4783 dev_core_stats_rx_dropped_inc(skb->dev);
4784 kfree_skb_reason(skb, reason);
4785 return NET_RX_DROP;
4786}
4787
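/* Map an skb to the netdev_rx_queue it was received on, falling back to the
 * device's first RX queue when no queue was recorded or the recorded index
 * is out of range.
 */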
4788static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4789{
4790 struct net_device *dev = skb->dev;
4791 struct netdev_rx_queue *rxqueue;
4792
4793 rxqueue = dev->_rx;
4794
4795 if (skb_rx_queue_recorded(skb)) {
4796 u16 index = skb_get_rx_queue(skb);
4797
4798 if (unlikely(index >= dev->real_num_rx_queues)) {
4799 WARN_ONCE(dev->real_num_rx_queues > 1,
4800 "%s received packet on queue %u, but number "
4801 "of RX queues is %u\n",
4802 dev->name, index, dev->real_num_rx_queues);
4803
4804 return rxqueue; /* Return first rxqueue */
4805 }
4806 rxqueue += index;
4807 }
4808 return rxqueue;
4809}
4810
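/* Build an xdp_buff covering the skb's linear data (and frags, if any), run
 * the XDP program on it, then resync the skb with any head/tail adjustments
 * the program made. Returns the XDP verdict.
 */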
4811u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
4812 struct bpf_prog *xdp_prog)
4813{
4814 void *orig_data, *orig_data_end, *hard_start;
4815 struct netdev_rx_queue *rxqueue;
4816 bool orig_bcast, orig_host;
4817 u32 mac_len, frame_sz;
4818 __be16 orig_eth_type;
4819 struct ethhdr *eth;
4820 u32 metalen, act;
4821 int off;
4822
4823 /* The XDP program wants to see the packet starting at the MAC
4824 * header.
4825 */
4826 mac_len = skb->data - skb_mac_header(skb);
4827 hard_start = skb->data - skb_headroom(skb);
4828
4829	/* SKB "head" area always has tailroom for skb_shared_info */
4830 frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4831 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4832
4833 rxqueue = netif_get_rxqueue(skb);
4834 xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4835 xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4836 skb_headlen(skb) + mac_len, true);
4837 if (skb_is_nonlinear(skb)) {
4838 skb_shinfo(skb)->xdp_frags_size = skb->data_len;
4839 xdp_buff_set_frags_flag(xdp);
4840 } else {
4841 xdp_buff_clear_frags_flag(xdp);
4842 }
4843
4844 orig_data_end = xdp->data_end;
4845 orig_data = xdp->data;
4846 eth = (struct ethhdr *)xdp->data;
4847 orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
4848 orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4849 orig_eth_type = eth->h_proto;
4850
4851 act = bpf_prog_run_xdp(xdp_prog, xdp);
4852
4853 /* check if bpf_xdp_adjust_head was used */
4854 off = xdp->data - orig_data;
4855 if (off) {
4856 if (off > 0)
4857 __skb_pull(skb, off);
4858 else if (off < 0)
4859 __skb_push(skb, -off);
4860
4861 skb->mac_header += off;
4862 skb_reset_network_header(skb);
4863 }
4864
4865 /* check if bpf_xdp_adjust_tail was used */
4866 off = xdp->data_end - orig_data_end;
4867 if (off != 0) {
4868 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4869 skb->len += off; /* positive on grow, negative on shrink */
4870 }
4871
4872	/* XDP frag metadata (e.g. nr_frags) is updated in eBPF helpers
4873	 * (e.g. bpf_xdp_adjust_tail), so we need to update data_len here.
4874 */
4875 if (xdp_buff_has_frags(xdp))
4876 skb->data_len = skb_shinfo(skb)->xdp_frags_size;
4877 else
4878 skb->data_len = 0;
4879
4880	/* check if XDP changed the eth hdr such that the SKB needs an update */
4881 eth = (struct ethhdr *)xdp->data;
4882 if ((orig_eth_type != eth->h_proto) ||
4883 (orig_host != ether_addr_equal_64bits(eth->h_dest,
4884 skb->dev->dev_addr)) ||
4885 (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4886 __skb_push(skb, ETH_HLEN);
4887 skb->pkt_type = PACKET_HOST;
4888 skb->protocol = eth_type_trans(skb, skb->dev);
4889 }
4890
4891	/* Redirect/Tx gives an L2 packet; code that will reuse the skb must
4892	 * __skb_pull it before calling us again on the redirect path. We do not
4893	 * call do_redirect as we leave that up to the caller.
4894 *
4895 * Caller is responsible for managing lifetime of skb (i.e. calling
4896 * kfree_skb in response to actions it cannot handle/XDP_DROP).
4897 */
4898 switch (act) {
4899 case XDP_REDIRECT:
4900 case XDP_TX:
4901 __skb_push(skb, mac_len);
4902 break;
4903 case XDP_PASS:
4904 metalen = xdp->data - xdp->data_meta;
4905 if (metalen)
4906 skb_metadata_set(skb, metalen);
4907 break;
4908 }
4909
4910 return act;
4911}
4912
4913static int
4914netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
4915{
4916 struct sk_buff *skb = *pskb;
4917 int err, hroom, troom;
4918
4919 if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
4920 return 0;
4921
4922	/* In case we have to go down this path and also linearize,
4923	 * let's do the pskb_expand_head() work just once here.
4924 */
4925 hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4926 troom = skb->tail + skb->data_len - skb->end;
4927 err = pskb_expand_head(skb,
4928 hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4929 troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
4930 if (err)
4931 return err;
4932
4933 return skb_linearize(skb);
4934}
4935
4936static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
4937 struct xdp_buff *xdp,
4938 struct bpf_prog *xdp_prog)
4939{
4940 struct sk_buff *skb = *pskb;
4941 u32 mac_len, act = XDP_DROP;
4942
4943 /* Reinjected packets coming from act_mirred or similar should
4944 * not get XDP generic processing.
4945 */
4946 if (skb_is_redirected(skb))
4947 return XDP_PASS;
4948
4949 /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
4950	 * bytes. This is the guarantee that native XDP also provides,
4951	 * thus we need to ensure it here as well.
4952 */
4953 mac_len = skb->data - skb_mac_header(skb);
4954 __skb_push(skb, mac_len);
4955
4956 if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4957 skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4958 if (netif_skb_check_for_xdp(pskb, xdp_prog))
4959 goto do_drop;
4960 }
4961
4962 __skb_pull(*pskb, mac_len);
4963
4964 act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
4965 switch (act) {
4966 case XDP_REDIRECT:
4967 case XDP_TX:
4968 case XDP_PASS:
4969 break;
4970 default:
4971 bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
4972 fallthrough;
4973 case XDP_ABORTED:
4974 trace_xdp_exception((*pskb)->dev, xdp_prog, act);
4975 fallthrough;
4976 case XDP_DROP:
4977 do_drop:
4978 kfree_skb(*pskb);
4979 break;
4980 }
4981
4982 return act;
4983}
4984
4985/* When doing generic XDP we have to bypass the qdisc layer and the
4986 * network taps in order to match in-driver-XDP behavior. This also means
4987 * that XDP packets are able to starve other packets going through a qdisc,
4988 * and DDoS attacks will be more effective. In-driver XDP uses dedicated TX
4989 * queues, so it does not have this starvation issue.
4990 */
4991void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4992{
4993 struct net_device *dev = skb->dev;
4994 struct netdev_queue *txq;
4995 bool free_skb = true;
4996 int cpu, rc;
4997
4998 txq = netdev_core_pick_tx(dev, skb, NULL);
4999 cpu = smp_processor_id();
5000 HARD_TX_LOCK(dev, txq, cpu);
5001 if (!netif_xmit_frozen_or_drv_stopped(txq)) {
5002 rc = netdev_start_xmit(skb, dev, txq, 0);
5003 if (dev_xmit_complete(rc))
5004 free_skb = false;
5005 }
5006 HARD_TX_UNLOCK(dev, txq);
5007 if (free_skb) {
5008 trace_xdp_exception(dev, xdp_prog, XDP_TX);
5009 dev_core_stats_tx_dropped_inc(dev);
5010 kfree_skb(skb);
5011 }
5012}
5013
5014static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
5015
5016int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
5017{
5018 if (xdp_prog) {
5019 struct xdp_buff xdp;
5020 u32 act;
5021 int err;
5022
5023 act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
5024 if (act != XDP_PASS) {
5025 switch (act) {
5026 case XDP_REDIRECT:
5027 err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
5028 &xdp, xdp_prog);
5029 if (err)
5030 goto out_redir;
5031 break;
5032 case XDP_TX:
5033 generic_xdp_tx(*pskb, xdp_prog);
5034 break;
5035 }
5036 return XDP_DROP;
5037 }
5038 }
5039 return XDP_PASS;
5040out_redir:
5041 kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
5042 return XDP_DROP;
5043}
5044EXPORT_SYMBOL_GPL(do_xdp_generic);
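/* Usage in this file: __netif_receive_skb_core() runs this under
 * migrate_disable() with the device's generic XDP program, e.g.
 *
 *	ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), &skb);
 *	if (ret2 != XDP_PASS)
 *		goto drop;
 */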
5045
5046static int netif_rx_internal(struct sk_buff *skb)
5047{
5048 int ret;
5049
5050 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5051
5052 trace_netif_rx(skb);
5053
5054#ifdef CONFIG_RPS
5055 if (static_branch_unlikely(&rps_needed)) {
5056 struct rps_dev_flow voidflow, *rflow = &voidflow;
5057 int cpu;
5058
5059 rcu_read_lock();
5060
5061 cpu = get_rps_cpu(skb->dev, skb, &rflow);
5062 if (cpu < 0)
5063 cpu = smp_processor_id();
5064
5065 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5066
5067 rcu_read_unlock();
5068 } else
5069#endif
5070 {
5071 unsigned int qtail;
5072
5073 ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
5074 }
5075 return ret;
5076}
5077
5078/**
5079 * __netif_rx - Slightly optimized version of netif_rx
5080 * @skb: buffer to post
5081 *
5082 * This behaves as netif_rx except that it does not disable bottom halves.
5083 * As a result this function may only be invoked from the interrupt context
5084 * (either hard or soft interrupt).
5085 */
5086int __netif_rx(struct sk_buff *skb)
5087{
5088 int ret;
5089
5090 lockdep_assert_once(hardirq_count() | softirq_count());
5091
5092 trace_netif_rx_entry(skb);
5093 ret = netif_rx_internal(skb);
5094 trace_netif_rx_exit(ret);
5095 return ret;
5096}
5097EXPORT_SYMBOL(__netif_rx);
5098
5099/**
5100 * netif_rx - post buffer to the network code
5101 * @skb: buffer to post
5102 *
5103 * This function receives a packet from a device driver and queues it for
5104 * the upper (protocol) levels to process via the backlog NAPI device. It
5105 * always succeeds. The buffer may be dropped during processing for
5106 * congestion control or by the protocol layers.
5107 * The network buffer is passed via the backlog NAPI device. Modern NIC
5108 * drivers should use NAPI and GRO.
5109 * This function can be used from interrupt and from process context. The
5110 * caller from process context must not disable interrupts before invoking
5111 * this function.
5112 *
5113 * return values:
5114 * NET_RX_SUCCESS (no congestion)
5115 * NET_RX_DROP (packet was dropped)
5116 *
5117 */
5118int netif_rx(struct sk_buff *skb)
5119{
5120 bool need_bh_off = !(hardirq_count() | softirq_count());
5121 int ret;
5122
5123 if (need_bh_off)
5124 local_bh_disable();
5125 trace_netif_rx_entry(skb);
5126 ret = netif_rx_internal(skb);
5127 trace_netif_rx_exit(ret);
5128 if (need_bh_off)
5129 local_bh_enable();
5130 return ret;
5131}
5132EXPORT_SYMBOL(netif_rx);
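/* Illustrative sketch (not from this file): a simple non-NAPI driver might
 * hand a received frame to the stack like this:
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);	// callable from IRQ or process context
 */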
5133
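/* NET_TX_SOFTIRQ handler: free skbs queued for release on the per-CPU
 * completion queue, then re-run the qdiscs that were scheduled for
 * transmission on the per-CPU output queue.
 */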
5134static __latent_entropy void net_tx_action(struct softirq_action *h)
5135{
5136 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5137
5138 if (sd->completion_queue) {
5139 struct sk_buff *clist;
5140
5141 local_irq_disable();
5142 clist = sd->completion_queue;
5143 sd->completion_queue = NULL;
5144 local_irq_enable();
5145
5146 while (clist) {
5147 struct sk_buff *skb = clist;
5148
5149 clist = clist->next;
5150
5151 WARN_ON(refcount_read(&skb->users));
5152 if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
5153 trace_consume_skb(skb, net_tx_action);
5154 else
5155 trace_kfree_skb(skb, net_tx_action,
5156 get_kfree_skb_cb(skb)->reason);
5157
5158 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5159 __kfree_skb(skb);
5160 else
5161 __napi_kfree_skb(skb,
5162 get_kfree_skb_cb(skb)->reason);
5163 }
5164 }
5165
5166 if (sd->output_queue) {
5167 struct Qdisc *head;
5168
5169 local_irq_disable();
5170 head = sd->output_queue;
5171 sd->output_queue = NULL;
5172 sd->output_queue_tailp = &sd->output_queue;
5173 local_irq_enable();
5174
5175 rcu_read_lock();
5176
5177 while (head) {
5178 struct Qdisc *q = head;
5179 spinlock_t *root_lock = NULL;
5180
5181 head = head->next_sched;
5182
5183 /* We need to make sure head->next_sched is read
5184 * before clearing __QDISC_STATE_SCHED
5185 */
5186 smp_mb__before_atomic();
5187
5188 if (!(q->flags & TCQ_F_NOLOCK)) {
5189 root_lock = qdisc_lock(q);
5190 spin_lock(root_lock);
5191 } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5192 &q->state))) {
5193 /* There is a synchronize_net() between
5194 * STATE_DEACTIVATED flag being set and
5195 * qdisc_reset()/some_qdisc_is_busy() in
5196 * dev_deactivate(), so we can safely bail out
5197 * early here to avoid data race between
5198 * qdisc_deactivate() and some_qdisc_is_busy()
5199 * for lockless qdisc.
5200 */
5201 clear_bit(__QDISC_STATE_SCHED, &q->state);
5202 continue;
5203 }
5204
5205 clear_bit(__QDISC_STATE_SCHED, &q->state);
5206 qdisc_run(q);
5207 if (root_lock)
5208 spin_unlock(root_lock);
5209 }
5210
5211 rcu_read_unlock();
5212 }
5213
5214 xfrm_dev_backlog(sd);
5215}
5216
5217#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5218/* This hook is defined here for ATM LANE */
5219int (*br_fdb_test_addr_hook)(struct net_device *dev,
5220 unsigned char *addr) __read_mostly;
5221EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5222#endif
5223
5224/**
5225 * netdev_is_rx_handler_busy - check if receive handler is registered
5226 * @dev: device to check
5227 *
5228 * Check if a receive handler is already registered for a given device.
5229 * Return true if there is one.
5230 *
5231 * The caller must hold the rtnl_mutex.
5232 */
5233bool netdev_is_rx_handler_busy(struct net_device *dev)
5234{
5235 ASSERT_RTNL();
5236 return dev && rtnl_dereference(dev->rx_handler);
5237}
5238EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5239
5240/**
5241 * netdev_rx_handler_register - register receive handler
5242 * @dev: device to register a handler for
5243 * @rx_handler: receive handler to register
5244 * @rx_handler_data: data pointer that is used by rx handler
5245 *
5246 * Register a receive handler for a device. This handler will then be
5247 * called from __netif_receive_skb. A negative errno code is returned
5248 * on a failure.
5249 *
5250 * The caller must hold the rtnl_mutex.
5251 *
5252 * For a general description of rx_handler, see enum rx_handler_result.
5253 */
5254int netdev_rx_handler_register(struct net_device *dev,
5255 rx_handler_func_t *rx_handler,
5256 void *rx_handler_data)
5257{
5258 if (netdev_is_rx_handler_busy(dev))
5259 return -EBUSY;
5260
5261 if (dev->priv_flags & IFF_NO_RX_HANDLER)
5262 return -EINVAL;
5263
5264 /* Note: rx_handler_data must be set before rx_handler */
5265 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5266 rcu_assign_pointer(dev->rx_handler, rx_handler);
5267
5268 return 0;
5269}
5270EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
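/* Illustrative sketch (not from this file): bridge/bonding-style users
 * register their handler under RTNL roughly as follows (the handler and
 * private pointer below are hypothetical):
 *
 *	err = netdev_rx_handler_register(port_dev, my_handle_frame, my_priv);
 *	if (err)
 *		goto err_unregister;
 */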
5271
5272/**
5273 * netdev_rx_handler_unregister - unregister receive handler
5274 * @dev: device to unregister a handler from
5275 *
5276 * Unregister a receive handler from a device.
5277 *
5278 * The caller must hold the rtnl_mutex.
5279 */
5280void netdev_rx_handler_unregister(struct net_device *dev)
5281{
5282
5283 ASSERT_RTNL();
5284 RCU_INIT_POINTER(dev->rx_handler, NULL);
5285	/* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
5286	 * section is guaranteed to see a non-NULL rx_handler_data
5287 * as well.
5288 */
5289 synchronize_net();
5290 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5291}
5292EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5293
5294/*
5295 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5296 * the special handling of PFMEMALLOC skbs.
5297 */
5298static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5299{
5300 switch (skb->protocol) {
5301 case htons(ETH_P_ARP):
5302 case htons(ETH_P_IP):
5303 case htons(ETH_P_IPV6):
5304 case htons(ETH_P_8021Q):
5305 case htons(ETH_P_8021AD):
5306 return true;
5307 default:
5308 return false;
5309 }
5310}
5311
5312static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5313 int *ret, struct net_device *orig_dev)
5314{
5315 if (nf_hook_ingress_active(skb)) {
5316 int ingress_retval;
5317
5318 if (*pt_prev) {
5319 *ret = deliver_skb(skb, *pt_prev, orig_dev);
5320 *pt_prev = NULL;
5321 }
5322
5323 rcu_read_lock();
5324 ingress_retval = nf_hook_ingress(skb);
5325 rcu_read_unlock();
5326 return ingress_retval;
5327 }
5328 return 0;
5329}
5330
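/* Core of the RX path: run generic XDP, deliver to taps, do ingress
 * classification and VLAN handling, invoke the device's rx_handler, and
 * finally pick the matching packet_type. The last matching ptype is handed
 * back via @ppt_prev so that callers can batch the final delivery.
 */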
5331static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5332 struct packet_type **ppt_prev)
5333{
5334 struct packet_type *ptype, *pt_prev;
5335 rx_handler_func_t *rx_handler;
5336 struct sk_buff *skb = *pskb;
5337 struct net_device *orig_dev;
5338 bool deliver_exact = false;
5339 int ret = NET_RX_DROP;
5340 __be16 type;
5341
5342 net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5343
5344 trace_netif_receive_skb(skb);
5345
5346 orig_dev = skb->dev;
5347
5348 skb_reset_network_header(skb);
5349 if (!skb_transport_header_was_set(skb))
5350 skb_reset_transport_header(skb);
5351 skb_reset_mac_len(skb);
5352
5353 pt_prev = NULL;
5354
5355another_round:
5356 skb->skb_iif = skb->dev->ifindex;
5357
5358 __this_cpu_inc(softnet_data.processed);
5359
5360 if (static_branch_unlikely(&generic_xdp_needed_key)) {
5361 int ret2;
5362
5363 migrate_disable();
5364 ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
5365 &skb);
5366 migrate_enable();
5367
5368 if (ret2 != XDP_PASS) {
5369 ret = NET_RX_DROP;
5370 goto out;
5371 }
5372 }
5373
5374 if (eth_type_vlan(skb->protocol)) {
5375 skb = skb_vlan_untag(skb);
5376 if (unlikely(!skb))
5377 goto out;
5378 }
5379
5380 if (skb_skip_tc_classify(skb))
5381 goto skip_classify;
5382
5383 if (pfmemalloc)
5384 goto skip_taps;
5385
5386 list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
5387 if (pt_prev)
5388 ret = deliver_skb(skb, pt_prev, orig_dev);
5389 pt_prev = ptype;
5390 }
5391
5392 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5393 if (pt_prev)
5394 ret = deliver_skb(skb, pt_prev, orig_dev);
5395 pt_prev = ptype;
5396 }
5397
5398skip_taps:
5399#ifdef CONFIG_NET_INGRESS
5400 if (static_branch_unlikely(&ingress_needed_key)) {
5401 bool another = false;
5402
5403 nf_skip_egress(skb, true);
5404 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5405 &another);
5406 if (another)
5407 goto another_round;
5408 if (!skb)
5409 goto out;
5410
5411 nf_skip_egress(skb, false);
5412 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5413 goto out;
5414 }
5415#endif
5416 skb_reset_redirect(skb);
5417skip_classify:
5418 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5419 goto drop;
5420
5421 if (skb_vlan_tag_present(skb)) {
5422 if (pt_prev) {
5423 ret = deliver_skb(skb, pt_prev, orig_dev);
5424 pt_prev = NULL;
5425 }
5426 if (vlan_do_receive(&skb))
5427 goto another_round;
5428 else if (unlikely(!skb))
5429 goto out;
5430 }
5431
5432 rx_handler = rcu_dereference(skb->dev->rx_handler);
5433 if (rx_handler) {
5434 if (pt_prev) {
5435 ret = deliver_skb(skb, pt_prev, orig_dev);
5436 pt_prev = NULL;
5437 }
5438 switch (rx_handler(&skb)) {
5439 case RX_HANDLER_CONSUMED:
5440 ret = NET_RX_SUCCESS;
5441 goto out;
5442 case RX_HANDLER_ANOTHER:
5443 goto another_round;
5444 case RX_HANDLER_EXACT:
5445 deliver_exact = true;
5446 break;
5447 case RX_HANDLER_PASS:
5448 break;
5449 default:
5450 BUG();
5451 }
5452 }
5453
5454 if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5455check_vlan_id:
5456 if (skb_vlan_tag_get_id(skb)) {
5457			/* Vlan id is non-zero and vlan_do_receive() above couldn't
5458			 * find a vlan device.
5459 */
5460 skb->pkt_type = PACKET_OTHERHOST;
5461 } else if (eth_type_vlan(skb->protocol)) {
5462 /* Outer header is 802.1P with vlan 0, inner header is
5463 * 802.1Q or 802.1AD and vlan_do_receive() above could
5464 * not find vlan dev for vlan id 0.
5465 */
5466 __vlan_hwaccel_clear_tag(skb);
5467 skb = skb_vlan_untag(skb);
5468 if (unlikely(!skb))
5469 goto out;
5470 if (vlan_do_receive(&skb))
5471 /* After stripping off 802.1P header with vlan 0
5472 * vlan dev is found for inner header.
5473 */
5474 goto another_round;
5475 else if (unlikely(!skb))
5476 goto out;
5477 else
5478			/* We have stripped the outer 802.1P vlan 0 header
5479			 * but could not find a vlan dev.
5480			 * Check again for a vlan id to set OTHERHOST.
5481 */
5482 goto check_vlan_id;
5483 }
5484 /* Note: we might in the future use prio bits
5485		 * and set skb->priority like in vlan_do_receive().
5486		 * For the time being, just ignore the Priority Code Point.
5487 */
5488 __vlan_hwaccel_clear_tag(skb);
5489 }
5490
5491 type = skb->protocol;
5492
5493 /* deliver only exact match when indicated */
5494 if (likely(!deliver_exact)) {
5495 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5496 &ptype_base[ntohs(type) &
5497 PTYPE_HASH_MASK]);
5498 }
5499
5500 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5501 &orig_dev->ptype_specific);
5502
5503 if (unlikely(skb->dev != orig_dev)) {
5504 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5505 &skb->dev->ptype_specific);
5506 }
5507
5508 if (pt_prev) {
5509 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5510 goto drop;
5511 *ppt_prev = pt_prev;
5512 } else {
5513drop:
5514 if (!deliver_exact)
5515 dev_core_stats_rx_dropped_inc(skb->dev);
5516 else
5517 dev_core_stats_rx_nohandler_inc(skb->dev);
5518 kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
5519		/* Jamal, now you will not be able to escape explaining
5520		 * to me how you were going to use this. :-)
5521 */
5522 ret = NET_RX_DROP;
5523 }
5524
5525out:
5526 /* The invariant here is that if *ppt_prev is not NULL
5527 * then skb should also be non-NULL.
5528 *
5529 * Apparently *ppt_prev assignment above holds this invariant due to
5530 * skb dereferencing near it.
5531 */
5532 *pskb = skb;
5533 return ret;
5534}
5535
5536static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5537{
5538 struct net_device *orig_dev = skb->dev;
5539 struct packet_type *pt_prev = NULL;
5540 int ret;
5541
5542 ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5543 if (pt_prev)
5544 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5545 skb->dev, pt_prev, orig_dev);
5546 return ret;
5547}
5548
5549/**
5550 * netif_receive_skb_core - special purpose version of netif_receive_skb
5551 * @skb: buffer to process
5552 *
5553 * More direct receive version of netif_receive_skb(). It should
5554 * only be used by callers that have a need to skip RPS and Generic XDP.
5555 * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5556 *
5557 * This function may only be called from softirq context and interrupts
5558 * should be enabled.
5559 *
5560 * Return values (usually ignored):
5561 * NET_RX_SUCCESS: no congestion
5562 * NET_RX_DROP: packet was dropped
5563 */
5564int netif_receive_skb_core(struct sk_buff *skb)
5565{
5566 int ret;
5567
5568 rcu_read_lock();
5569 ret = __netif_receive_skb_one_core(skb, false);
5570 rcu_read_unlock();
5571
5572 return ret;
5573}
5574EXPORT_SYMBOL(netif_receive_skb_core);
5575
5576static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5577 struct packet_type *pt_prev,
5578 struct net_device *orig_dev)
5579{
5580 struct sk_buff *skb, *next;
5581
5582 if (!pt_prev)
5583 return;
5584 if (list_empty(head))
5585 return;
5586 if (pt_prev->list_func != NULL)
5587 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5588 ip_list_rcv, head, pt_prev, orig_dev);
5589 else
5590 list_for_each_entry_safe(skb, next, head, list) {
5591 skb_list_del_init(skb);
5592 pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5593 }
5594}
5595
5596static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5597{
5598 /* Fast-path assumptions:
5599 * - There is no RX handler.
5600 * - Only one packet_type matches.
5601 * If either of these fails, we will end up doing some per-packet
5602 * processing in-line, then handling the 'last ptype' for the whole
5603 * sublist. This can't cause out-of-order delivery to any single ptype,
5604 * because the 'last ptype' must be constant across the sublist, and all
5605 * other ptypes are handled per-packet.
5606 */
5607 /* Current (common) ptype of sublist */
5608 struct packet_type *pt_curr = NULL;
5609 /* Current (common) orig_dev of sublist */
5610 struct net_device *od_curr = NULL;
5611 struct list_head sublist;
5612 struct sk_buff *skb, *next;
5613
5614 INIT_LIST_HEAD(&sublist);
5615 list_for_each_entry_safe(skb, next, head, list) {
5616 struct net_device *orig_dev = skb->dev;
5617 struct packet_type *pt_prev = NULL;
5618
5619 skb_list_del_init(skb);
5620 __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5621 if (!pt_prev)
5622 continue;
5623 if (pt_curr != pt_prev || od_curr != orig_dev) {
5624 /* dispatch old sublist */
5625 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5626 /* start new sublist */
5627 INIT_LIST_HEAD(&sublist);
5628 pt_curr = pt_prev;
5629 od_curr = orig_dev;
5630 }
5631 list_add_tail(&skb->list, &sublist);
5632 }
5633
5634 /* dispatch final sublist */
5635 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5636}
5637
5638static int __netif_receive_skb(struct sk_buff *skb)
5639{
5640 int ret;
5641
5642 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5643 unsigned int noreclaim_flag;
5644
5645 /*
5646 * PFMEMALLOC skbs are special, they should
5647 * - be delivered to SOCK_MEMALLOC sockets only
5648 * - stay away from userspace
5649 * - have bounded memory usage
5650 *
5651 * Use PF_MEMALLOC as this saves us from propagating the allocation
5652 * context down to all allocation sites.
5653 */
5654 noreclaim_flag = memalloc_noreclaim_save();
5655 ret = __netif_receive_skb_one_core(skb, true);
5656 memalloc_noreclaim_restore(noreclaim_flag);
5657 } else
5658 ret = __netif_receive_skb_one_core(skb, false);
5659
5660 return ret;
5661}
5662
5663static void __netif_receive_skb_list(struct list_head *head)
5664{
5665 unsigned long noreclaim_flag = 0;
5666 struct sk_buff *skb, *next;
5667 bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5668
5669 list_for_each_entry_safe(skb, next, head, list) {
5670 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5671 struct list_head sublist;
5672
5673 /* Handle the previous sublist */
5674 list_cut_before(&sublist, head, &skb->list);
5675 if (!list_empty(&sublist))
5676 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5677 pfmemalloc = !pfmemalloc;
5678 /* See comments in __netif_receive_skb */
5679 if (pfmemalloc)
5680 noreclaim_flag = memalloc_noreclaim_save();
5681 else
5682 memalloc_noreclaim_restore(noreclaim_flag);
5683 }
5684 }
5685 /* Handle the remaining sublist */
5686 if (!list_empty(head))
5687 __netif_receive_skb_list_core(head, pfmemalloc);
5688 /* Restore pflags */
5689 if (pfmemalloc)
5690 memalloc_noreclaim_restore(noreclaim_flag);
5691}
5692
5693static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5694{
5695 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5696 struct bpf_prog *new = xdp->prog;
5697 int ret = 0;
5698
5699 switch (xdp->command) {
5700 case XDP_SETUP_PROG:
5701 rcu_assign_pointer(dev->xdp_prog, new);
5702 if (old)
5703 bpf_prog_put(old);
5704
5705 if (old && !new) {
5706 static_branch_dec(&generic_xdp_needed_key);
5707 } else if (new && !old) {
5708 static_branch_inc(&generic_xdp_needed_key);
5709 dev_disable_lro(dev);
5710 dev_disable_gro_hw(dev);
5711 }
5712 break;
5713
5714 default:
5715 ret = -EINVAL;
5716 break;
5717 }
5718
5719 return ret;
5720}
5721
5722static int netif_receive_skb_internal(struct sk_buff *skb)
5723{
5724 int ret;
5725
5726 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5727
5728 if (skb_defer_rx_timestamp(skb))
5729 return NET_RX_SUCCESS;
5730
5731 rcu_read_lock();
5732#ifdef CONFIG_RPS
5733 if (static_branch_unlikely(&rps_needed)) {
5734 struct rps_dev_flow voidflow, *rflow = &voidflow;
5735 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5736
5737 if (cpu >= 0) {
5738 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5739 rcu_read_unlock();
5740 return ret;
5741 }
5742 }
5743#endif
5744 ret = __netif_receive_skb(skb);
5745 rcu_read_unlock();
5746 return ret;
5747}
5748
5749void netif_receive_skb_list_internal(struct list_head *head)
5750{
5751 struct sk_buff *skb, *next;
5752 struct list_head sublist;
5753
5754 INIT_LIST_HEAD(&sublist);
5755 list_for_each_entry_safe(skb, next, head, list) {
5756 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
5757 skb);
5758 skb_list_del_init(skb);
5759 if (!skb_defer_rx_timestamp(skb))
5760 list_add_tail(&skb->list, &sublist);
5761 }
5762 list_splice_init(&sublist, head);
5763
5764 rcu_read_lock();
5765#ifdef CONFIG_RPS
5766 if (static_branch_unlikely(&rps_needed)) {
5767 list_for_each_entry_safe(skb, next, head, list) {
5768 struct rps_dev_flow voidflow, *rflow = &voidflow;
5769 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5770
5771 if (cpu >= 0) {
5772 /* Will be handled, remove from list */
5773 skb_list_del_init(skb);
5774 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5775 }
5776 }
5777 }
5778#endif
5779 __netif_receive_skb_list(head);
5780 rcu_read_unlock();
5781}
5782
5783/**
5784 * netif_receive_skb - process receive buffer from network
5785 * @skb: buffer to process
5786 *
5787 * netif_receive_skb() is the main receive data processing function.
5788 * It always succeeds. The buffer may be dropped during processing
5789 * for congestion control or by the protocol layers.
5790 *
5791 * This function may only be called from softirq context and interrupts
5792 * should be enabled.
5793 *
5794 * Return values (usually ignored):
5795 * NET_RX_SUCCESS: no congestion
5796 * NET_RX_DROP: packet was dropped
5797 */
5798int netif_receive_skb(struct sk_buff *skb)
5799{
5800 int ret;
5801
5802 trace_netif_receive_skb_entry(skb);
5803
5804 ret = netif_receive_skb_internal(skb);
5805 trace_netif_receive_skb_exit(ret);
5806
5807 return ret;
5808}
5809EXPORT_SYMBOL(netif_receive_skb);
5810
5811/**
5812 * netif_receive_skb_list - process many receive buffers from network
5813 * @head: list of skbs to process.
5814 *
5815 * Since return value of netif_receive_skb() is normally ignored, and
5816 * wouldn't be meaningful for a list, this function returns void.
5817 *
5818 * This function may only be called from softirq context and interrupts
5819 * should be enabled.
5820 */
5821void netif_receive_skb_list(struct list_head *head)
5822{
5823 struct sk_buff *skb;
5824
5825 if (list_empty(head))
5826 return;
5827 if (trace_netif_receive_skb_list_entry_enabled()) {
5828 list_for_each_entry(skb, head, list)
5829 trace_netif_receive_skb_list_entry(skb);
5830 }
5831 netif_receive_skb_list_internal(head);
5832 trace_netif_receive_skb_list_exit(0);
5833}
5834EXPORT_SYMBOL(netif_receive_skb_list);
5835
5836static DEFINE_PER_CPU(struct work_struct, flush_works);
5837
5838/* Network device is going away, flush any packets still pending */
5839static void flush_backlog(struct work_struct *work)
5840{
5841 struct sk_buff *skb, *tmp;
5842 struct softnet_data *sd;
5843
5844 local_bh_disable();
5845 sd = this_cpu_ptr(&softnet_data);
5846
5847 rps_lock_irq_disable(sd);
5848 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5849 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5850 __skb_unlink(skb, &sd->input_pkt_queue);
5851 dev_kfree_skb_irq(skb);
5852 input_queue_head_incr(sd);
5853 }
5854 }
5855 rps_unlock_irq_enable(sd);
5856
5857 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5858 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5859 __skb_unlink(skb, &sd->process_queue);
5860 kfree_skb(skb);
5861 input_queue_head_incr(sd);
5862 }
5863 }
5864 local_bh_enable();
5865}
5866
5867static bool flush_required(int cpu)
5868{
5869#if IS_ENABLED(CONFIG_RPS)
5870 struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5871 bool do_flush;
5872
5873 rps_lock_irq_disable(sd);
5874
5875 /* as insertion into process_queue happens with the rps lock held,
5876 * process_queue access may race only with dequeue
5877 */
5878 do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5879 !skb_queue_empty_lockless(&sd->process_queue);
5880 rps_unlock_irq_enable(sd);
5881
5882 return do_flush;
5883#endif
	/* Without RPS we can't safely check input_pkt_queue: during a
	 * concurrent remote skb_queue_splice() we could see both
	 * input_pkt_queue and process_queue as empty even though the latter
	 * could end up containing a lot of packets.
	 */
5889 return true;
5890}
5891
5892static void flush_all_backlogs(void)
5893{
5894 static cpumask_t flush_cpus;
5895 unsigned int cpu;
5896
	/* Since we are under rtnl lock protection we can use static data
	 * for the cpumask and avoid allocating the possibly large mask
	 * on the stack.
	 */
5901 ASSERT_RTNL();
5902
5903 cpus_read_lock();
5904
5905 cpumask_clear(&flush_cpus);
5906 for_each_online_cpu(cpu) {
5907 if (flush_required(cpu)) {
5908 queue_work_on(cpu, system_highpri_wq,
5909 per_cpu_ptr(&flush_works, cpu));
5910 cpumask_set_cpu(cpu, &flush_cpus);
5911 }
5912 }
5913
	/* We can have in-flight packets on the cpus we are not flushing;
	 * synchronize_net() in unregister_netdevice_many() will take care
	 * of them.
	 */
5918 for_each_cpu(cpu, &flush_cpus)
5919 flush_work(per_cpu_ptr(&flush_works, cpu));
5920
5921 cpus_read_unlock();
5922}
5923
5924static void net_rps_send_ipi(struct softnet_data *remsd)
5925{
5926#ifdef CONFIG_RPS
5927 while (remsd) {
5928 struct softnet_data *next = remsd->rps_ipi_next;
5929
5930 if (cpu_online(remsd->cpu))
5931 smp_call_function_single_async(remsd->cpu, &remsd->csd);
5932 remsd = next;
5933 }
5934#endif
5935}
5936
/*
 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
5941static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5942{
5943#ifdef CONFIG_RPS
5944 struct softnet_data *remsd = sd->rps_ipi_list;
5945
5946 if (remsd) {
5947 sd->rps_ipi_list = NULL;
5948
5949 local_irq_enable();
5950
		/* Send pending IPIs to kick RPS processing on remote cpus. */
5952 net_rps_send_ipi(remsd);
5953 } else
5954#endif
5955 local_irq_enable();
5956}
5957
5958static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5959{
5960#ifdef CONFIG_RPS
5961 return sd->rps_ipi_list != NULL;
5962#else
5963 return false;
5964#endif
5965}
5966
5967static int process_backlog(struct napi_struct *napi, int quota)
5968{
5969 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5970 bool again = true;
5971 int work = 0;
5972
	/* Check if we have pending IPIs; it's better to send them now
	 * rather than waiting for net_rx_action() to end.
	 */
5976 if (sd_has_rps_ipi_waiting(sd)) {
5977 local_irq_disable();
5978 net_rps_action_and_irq_enable(sd);
5979 }
5980
5981 napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
5982 while (again) {
5983 struct sk_buff *skb;
5984
5985 while ((skb = __skb_dequeue(&sd->process_queue))) {
5986 rcu_read_lock();
5987 __netif_receive_skb(skb);
5988 rcu_read_unlock();
5989 input_queue_head_incr(sd);
5990 if (++work >= quota)
5991 return work;
5993 }
5994
5995 rps_lock_irq_disable(sd);
5996 if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * Only the current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on the backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we don't need an smp_mb() memory barrier.
			 */
6005 napi->state = 0;
6006 again = false;
6007 } else {
6008 skb_queue_splice_tail_init(&sd->input_pkt_queue,
6009 &sd->process_queue);
6010 }
6011 rps_unlock_irq_enable(sd);
6012 }
6013
6014 return work;
6015}
6016
6017/**
6018 * __napi_schedule - schedule for receive
6019 * @n: entry to schedule
6020 *
6021 * The entry's receive function will be scheduled to run.
6022 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6023 */
6024void __napi_schedule(struct napi_struct *n)
6025{
6026 unsigned long flags;
6027
6028 local_irq_save(flags);
6029 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6030 local_irq_restore(flags);
6031}
6032EXPORT_SYMBOL(__napi_schedule);
6033
6034/**
6035 * napi_schedule_prep - check if napi can be scheduled
6036 * @n: napi context
6037 *
 * Test if the NAPI routine is already running, and if not mark
 * it as running. This is used as a condition variable to
 * ensure only one NAPI poll instance runs. We also make
 * sure there is no pending NAPI disable.
6042 */
6043bool napi_schedule_prep(struct napi_struct *n)
6044{
6045 unsigned long new, val = READ_ONCE(n->state);
6046
6047 do {
6048 if (unlikely(val & NAPIF_STATE_DISABLE))
6049 return false;
6050 new = val | NAPIF_STATE_SCHED;
6051
		/* Set the STATE_MISSED bit if STATE_SCHED was already set.
		 * This was suggested by Alexander Duyck, as the compiler
		 * emits better code than:
		 * if (val & NAPIF_STATE_SCHED)
		 *	new |= NAPIF_STATE_MISSED;
		 */
6058 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6059 NAPIF_STATE_MISSED;
6060 } while (!try_cmpxchg(&n->state, &val, new));
6061
6062 return !(val & NAPIF_STATE_SCHED);
6063}
6064EXPORT_SYMBOL(napi_schedule_prep);
6065
6066/**
6067 * __napi_schedule_irqoff - schedule for receive
6068 * @n: entry to schedule
6069 *
6070 * Variant of __napi_schedule() assuming hard irqs are masked.
6071 *
6072 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6073 * because the interrupt disabled assumption might not be true
6074 * due to force-threaded interrupts and spinlock substitution.
6075 */
6076void __napi_schedule_irqoff(struct napi_struct *n)
6077{
6078 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6079 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6080 else
6081 __napi_schedule(n);
6082}
6083EXPORT_SYMBOL(__napi_schedule_irqoff);
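
/* Illustrative sketch: the usual split between napi_schedule_prep() and
 * __napi_schedule_irqoff() in a device hard interrupt handler, so the device
 * interrupt is only masked when this context actually won the right to poll.
 * foo_priv and foo_mask_irqs() are hypothetical driver names.
 *
 *	static irqreturn_t foo_isr(int irq, void *data)
 *	{
 *		struct foo_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			foo_mask_irqs(priv);			// stop further rx irqs
 *			__napi_schedule_irqoff(&priv->napi);	// poll runs in softirq
 *		}
 *		return IRQ_HANDLED;
 *	}
 */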
6084
6085bool napi_complete_done(struct napi_struct *n, int work_done)
6086{
6087 unsigned long flags, val, new, timeout = 0;
6088 bool ret = true;
6089
	/*
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case it's running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
	 */
6096 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6097 NAPIF_STATE_IN_BUSY_POLL)))
6098 return false;
6099
6100 if (work_done) {
6101 if (n->gro_bitmask)
6102 timeout = READ_ONCE(n->dev->gro_flush_timeout);
6103 n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6104 }
6105 if (n->defer_hard_irqs_count > 0) {
6106 n->defer_hard_irqs_count--;
6107 timeout = READ_ONCE(n->dev->gro_flush_timeout);
6108 if (timeout)
6109 ret = false;
6110 }
6111 if (n->gro_bitmask) {
		/* When the NAPI instance uses a timeout and keeps postponing
		 * it, we need to somehow bound the time packets are kept in
		 * the GRO layer.
		 */
6116 napi_gro_flush(n, !!timeout);
6117 }
6118
6119 gro_normal_list(n);
6120
6121 if (unlikely(!list_empty(&n->poll_list))) {
6122 /* If n->poll_list is not empty, we need to mask irqs */
6123 local_irq_save(flags);
6124 list_del_init(&n->poll_list);
6125 local_irq_restore(flags);
6126 }
6127 WRITE_ONCE(n->list_owner, -1);
6128
6129 val = READ_ONCE(n->state);
6130 do {
6131 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6132
6133 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6134 NAPIF_STATE_SCHED_THREADED |
6135 NAPIF_STATE_PREFER_BUSY_POLL);
6136
6137 /* If STATE_MISSED was set, leave STATE_SCHED set,
6138 * because we will call napi->poll() one more time.
6139 * This C code was suggested by Alexander Duyck to help gcc.
6140 */
6141 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6142 NAPIF_STATE_SCHED;
6143 } while (!try_cmpxchg(&n->state, &val, new));
6144
6145 if (unlikely(val & NAPIF_STATE_MISSED)) {
6146 __napi_schedule(n);
6147 return false;
6148 }
6149
6150 if (timeout)
6151 hrtimer_start(&n->timer, ns_to_ktime(timeout),
6152 HRTIMER_MODE_REL_PINNED);
6153 return ret;
6154}
6155EXPORT_SYMBOL(napi_complete_done);
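
/* Illustrative sketch: the canonical driver poll routine built around
 * napi_complete_done(); interrupts are only re-armed when it returns true,
 * i.e. when the NAPI really finished. foo_priv, foo_clean_rx() and
 * foo_unmask_irqs() are hypothetical driver names.
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
 *		int work = foo_clean_rx(priv, budget);
 *
 *		if (work < budget && napi_complete_done(napi, work))
 *			foo_unmask_irqs(priv);	// only re-arm if really done
 *		return work;
 *	}
 */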
6156
/* Must be called under rcu_read_lock(), as we don't take a reference. */
6158struct napi_struct *napi_by_id(unsigned int napi_id)
6159{
6160 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6161 struct napi_struct *napi;
6162
6163 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6164 if (napi->napi_id == napi_id)
6165 return napi;
6166
6167 return NULL;
6168}
6169
6170static void skb_defer_free_flush(struct softnet_data *sd)
6171{
6172 struct sk_buff *skb, *next;
6173
6174 /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
6175 if (!READ_ONCE(sd->defer_list))
6176 return;
6177
6178 spin_lock(&sd->defer_lock);
6179 skb = sd->defer_list;
6180 sd->defer_list = NULL;
6181 sd->defer_count = 0;
6182 spin_unlock(&sd->defer_lock);
6183
6184 while (skb != NULL) {
6185 next = skb->next;
6186 napi_consume_skb(skb, 1);
6187 skb = next;
6188 }
6189}
6190
6191#if defined(CONFIG_NET_RX_BUSY_POLL)
6192
6193static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6194{
6195 if (!skip_schedule) {
6196 gro_normal_list(napi);
6197 __napi_schedule(napi);
6198 return;
6199 }
6200
6201 if (napi->gro_bitmask) {
		/* Flush packets that are too old.
		 * If HZ < 1000, flush all packets.
		 */
6205 napi_gro_flush(napi, HZ >= 1000);
6206 }
6207
6208 gro_normal_list(napi);
6209 clear_bit(NAPI_STATE_SCHED, &napi->state);
6210}
6211
6212enum {
6213 NAPI_F_PREFER_BUSY_POLL = 1,
6214 NAPI_F_END_ON_RESCHED = 2,
6215};
6216
6217static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
6218 unsigned flags, u16 budget)
6219{
6220 bool skip_schedule = false;
6221 unsigned long timeout;
6222 int rc;
6223
	/* Busy polling means there is a high chance the device driver's hard
	 * irq could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
	 * set in napi_schedule_prep().
	 * Since we are about to call napi->poll() once more, we can safely
	 * clear NAPI_STATE_MISSED.
	 *
	 * Note: x86 could use a single "lock and ..." instruction
	 * to perform these two clear_bit() operations.
	 */
6233 clear_bit(NAPI_STATE_MISSED, &napi->state);
6234 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6235
6236 local_bh_disable();
6237
6238 if (flags & NAPI_F_PREFER_BUSY_POLL) {
6239 napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
6240 timeout = READ_ONCE(napi->dev->gro_flush_timeout);
6241 if (napi->defer_hard_irqs_count && timeout) {
6242 hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6243 skip_schedule = true;
6244 }
6245 }
6246
6247 /* All we really want here is to re-enable device interrupts.
6248 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6249 */
6250 rc = napi->poll(napi, budget);
6251 /* We can't gro_normal_list() here, because napi->poll() might have
6252 * rearmed the napi (napi_complete_done()) in which case it could
6253 * already be running on another CPU.
6254 */
6255 trace_napi_poll(napi, rc, budget);
6256 netpoll_poll_unlock(have_poll_lock);
6257 if (rc == budget)
6258 __busy_poll_stop(napi, skip_schedule);
6259 local_bh_enable();
6260}
6261
6262static void __napi_busy_loop(unsigned int napi_id,
6263 bool (*loop_end)(void *, unsigned long),
6264 void *loop_end_arg, unsigned flags, u16 budget)
6265{
6266 unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6267 int (*napi_poll)(struct napi_struct *napi, int budget);
6268 void *have_poll_lock = NULL;
6269 struct napi_struct *napi;
6270
6271 WARN_ON_ONCE(!rcu_read_lock_held());
6272
6273restart:
6274 napi_poll = NULL;
6275
6276 napi = napi_by_id(napi_id);
6277 if (!napi)
6278 return;
6279
6280 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6281 preempt_disable();
6282 for (;;) {
6283 int work = 0;
6284
6285 local_bh_disable();
6286 if (!napi_poll) {
6287 unsigned long val = READ_ONCE(napi->state);
6288
6289 /* If multiple threads are competing for this napi,
6290 * we avoid dirtying napi->state as much as we can.
6291 */
6292 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6293 NAPIF_STATE_IN_BUSY_POLL)) {
6294 if (flags & NAPI_F_PREFER_BUSY_POLL)
6295 set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6296 goto count;
6297 }
6298 if (cmpxchg(&napi->state, val,
6299 val | NAPIF_STATE_IN_BUSY_POLL |
6300 NAPIF_STATE_SCHED) != val) {
6301 if (flags & NAPI_F_PREFER_BUSY_POLL)
6302 set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6303 goto count;
6304 }
6305 have_poll_lock = netpoll_poll_lock(napi);
6306 napi_poll = napi->poll;
6307 }
6308 work = napi_poll(napi, budget);
6309 trace_napi_poll(napi, work, budget);
6310 gro_normal_list(napi);
6311count:
6312 if (work > 0)
6313 __NET_ADD_STATS(dev_net(napi->dev),
6314 LINUX_MIB_BUSYPOLLRXPACKETS, work);
6315 skb_defer_free_flush(this_cpu_ptr(&softnet_data));
6316 local_bh_enable();
6317
6318 if (!loop_end || loop_end(loop_end_arg, start_time))
6319 break;
6320
6321 if (unlikely(need_resched())) {
6322 if (flags & NAPI_F_END_ON_RESCHED)
6323 break;
6324 if (napi_poll)
6325 busy_poll_stop(napi, have_poll_lock, flags, budget);
6326 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6327 preempt_enable();
6328 rcu_read_unlock();
6329 cond_resched();
6330 rcu_read_lock();
6331 if (loop_end(loop_end_arg, start_time))
6332 return;
6333 goto restart;
6334 }
6335 cpu_relax();
6336 }
6337 if (napi_poll)
6338 busy_poll_stop(napi, have_poll_lock, flags, budget);
6339 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6340 preempt_enable();
6341}
6342
6343void napi_busy_loop_rcu(unsigned int napi_id,
6344 bool (*loop_end)(void *, unsigned long),
6345 void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6346{
6347 unsigned flags = NAPI_F_END_ON_RESCHED;
6348
6349 if (prefer_busy_poll)
6350 flags |= NAPI_F_PREFER_BUSY_POLL;
6351
6352 __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6353}
6354
6355void napi_busy_loop(unsigned int napi_id,
6356 bool (*loop_end)(void *, unsigned long),
6357 void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6358{
6359 unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
6360
6361 rcu_read_lock();
6362 __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6363 rcu_read_unlock();
6364}
6365EXPORT_SYMBOL(napi_busy_loop);
6366
6367#endif /* CONFIG_NET_RX_BUSY_POLL */
6368
6369static void napi_hash_add(struct napi_struct *napi)
6370{
6371 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6372 return;
6373
6374 spin_lock(&napi_hash_lock);
6375
6376 /* 0..NR_CPUS range is reserved for sender_cpu use */
6377 do {
6378 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6379 napi_gen_id = MIN_NAPI_ID;
6380 } while (napi_by_id(napi_gen_id));
6381 napi->napi_id = napi_gen_id;
6382
6383 hlist_add_head_rcu(&napi->napi_hash_node,
6384 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6385
6386 spin_unlock(&napi_hash_lock);
6387}
6388
/* Warning: the caller is responsible for making sure an RCU grace period
 * has elapsed before freeing the memory containing @napi.
 */
6392static void napi_hash_del(struct napi_struct *napi)
6393{
6394 spin_lock(&napi_hash_lock);
6395
6396 hlist_del_init_rcu(&napi->napi_hash_node);
6397
6398 spin_unlock(&napi_hash_lock);
6399}
6400
6401static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6402{
6403 struct napi_struct *napi;
6404
6405 napi = container_of(timer, struct napi_struct, timer);
6406
	/* Note: we use a relaxed variant of napi_schedule_prep() that does not
	 * set NAPI_STATE_MISSED, since we do not react to a device IRQ.
	 */
6410 if (!napi_disable_pending(napi) &&
6411 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6412 clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6413 __napi_schedule_irqoff(napi);
6414 }
6415
6416 return HRTIMER_NORESTART;
6417}
6418
6419static void init_gro_hash(struct napi_struct *napi)
6420{
6421 int i;
6422
6423 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6424 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6425 napi->gro_hash[i].count = 0;
6426 }
6427 napi->gro_bitmask = 0;
6428}
6429
6430int dev_set_threaded(struct net_device *dev, bool threaded)
6431{
6432 struct napi_struct *napi;
6433 int err = 0;
6434
6435 if (dev->threaded == threaded)
6436 return 0;
6437
6438 if (threaded) {
6439 list_for_each_entry(napi, &dev->napi_list, dev_list) {
6440 if (!napi->thread) {
6441 err = napi_kthread_create(napi);
6442 if (err) {
6443 threaded = false;
6444 break;
6445 }
6446 }
6447 }
6448 }
6449
6450 dev->threaded = threaded;
6451
6452 /* Make sure kthread is created before THREADED bit
6453 * is set.
6454 */
6455 smp_mb__before_atomic();
6456
6457 /* Setting/unsetting threaded mode on a napi might not immediately
6458 * take effect, if the current napi instance is actively being
6459 * polled. In this case, the switch between threaded mode and
6460 * softirq mode will happen in the next round of napi_schedule().
6461 * This should not cause hiccups/stalls to the live traffic.
6462 */
6463 list_for_each_entry(napi, &dev->napi_list, dev_list)
6464 assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
6465
6466 return err;
6467}
6468EXPORT_SYMBOL(dev_set_threaded);
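
/* Illustrative sketch: switching a registered device to threaded NAPI from
 * code, taking RTNL the same way the /sys/class/net/<iface>/threaded knob
 * does. foo_enable_threaded_napi() is a hypothetical helper.
 *
 *	static int foo_enable_threaded_napi(struct net_device *dev)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_set_threaded(dev, true);
 *		rtnl_unlock();
 *		return err;
 *	}
 */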
6469
6470/**
6471 * netif_queue_set_napi - Associate queue with the napi
6472 * @dev: device to which NAPI and queue belong
6473 * @queue_index: Index of queue
6474 * @type: queue type as RX or TX
6475 * @napi: NAPI context, pass NULL to clear previously set NAPI
6476 *
 * Set the queue's corresponding napi context. This should be done after
 * registering the NAPI handler for the queue-vector and after the queues
 * have been mapped to the corresponding interrupt vector.
6480 */
6481void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
6482 enum netdev_queue_type type, struct napi_struct *napi)
6483{
6484 struct netdev_rx_queue *rxq;
6485 struct netdev_queue *txq;
6486
6487 if (WARN_ON_ONCE(napi && !napi->dev))
6488 return;
6489 if (dev->reg_state >= NETREG_REGISTERED)
6490 ASSERT_RTNL();
6491
6492 switch (type) {
6493 case NETDEV_QUEUE_TYPE_RX:
6494 rxq = __netif_get_rx_queue(dev, queue_index);
6495 rxq->napi = napi;
6496 return;
6497 case NETDEV_QUEUE_TYPE_TX:
6498 txq = netdev_get_tx_queue(dev, queue_index);
6499 txq->napi = napi;
6500 return;
6501 default:
6502 return;
6503 }
6504}
6505EXPORT_SYMBOL(netif_queue_set_napi);
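
/* Illustrative sketch: associating a combined rx/tx queue pair with its NAPI
 * instance once the NAPI has been registered, under RTNL for an already
 * registered device. foo_priv and its fields are hypothetical.
 *
 *	static void foo_set_queue_napi(struct foo_priv *priv, int i)
 *	{
 *		netif_queue_set_napi(priv->netdev, i, NETDEV_QUEUE_TYPE_RX,
 *				     &priv->q[i].napi);
 *		netif_queue_set_napi(priv->netdev, i, NETDEV_QUEUE_TYPE_TX,
 *				     &priv->q[i].napi);
 *	}
 */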
6506
6507void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
6508 int (*poll)(struct napi_struct *, int), int weight)
6509{
6510 if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6511 return;
6512
6513 INIT_LIST_HEAD(&napi->poll_list);
6514 INIT_HLIST_NODE(&napi->napi_hash_node);
6515 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6516 napi->timer.function = napi_watchdog;
6517 init_gro_hash(napi);
6518 napi->skb = NULL;
6519 INIT_LIST_HEAD(&napi->rx_list);
6520 napi->rx_count = 0;
6521 napi->poll = poll;
6522 if (weight > NAPI_POLL_WEIGHT)
6523 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6524 weight);
6525 napi->weight = weight;
6526 napi->dev = dev;
6527#ifdef CONFIG_NETPOLL
6528 napi->poll_owner = -1;
6529#endif
6530 napi->list_owner = -1;
6531 set_bit(NAPI_STATE_SCHED, &napi->state);
6532 set_bit(NAPI_STATE_NPSVC, &napi->state);
6533 list_add_rcu(&napi->dev_list, &dev->napi_list);
6534 napi_hash_add(napi);
6535 napi_get_frags_check(napi);
6536 /* Create kthread for this napi if dev->threaded is set.
6537 * Clear dev->threaded if kthread creation failed so that
6538 * threaded mode will not be enabled in napi_enable().
6539 */
6540 if (dev->threaded && napi_kthread_create(napi))
6541 dev->threaded = 0;
6542 netif_napi_set_irq(napi, -1);
6543}
6544EXPORT_SYMBOL(netif_napi_add_weight);
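
/* Illustrative sketch: registering and enabling a NAPI instance at setup
 * time. Most drivers use the netif_napi_add() wrapper, which passes the
 * default NAPI_POLL_WEIGHT. foo_priv and foo_poll() are hypothetical.
 *
 *	static void foo_init_napi(struct foo_priv *priv)
 *	{
 *		netif_napi_add_weight(priv->netdev, &priv->napi, foo_poll,
 *				      NAPI_POLL_WEIGHT);
 *		napi_enable(&priv->napi);
 *	}
 */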
6545
6546void napi_disable(struct napi_struct *n)
6547{
6548 unsigned long val, new;
6549
6550 might_sleep();
6551 set_bit(NAPI_STATE_DISABLE, &n->state);
6552
6553 val = READ_ONCE(n->state);
6554 do {
6555 while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
6556 usleep_range(20, 200);
6557 val = READ_ONCE(n->state);
6558 }
6559
6560 new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
6561 new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
6562 } while (!try_cmpxchg(&n->state, &val, new));
6563
6564 hrtimer_cancel(&n->timer);
6565
6566 clear_bit(NAPI_STATE_DISABLE, &n->state);
6567}
6568EXPORT_SYMBOL(napi_disable);
6569
6570/**
6571 * napi_enable - enable NAPI scheduling
6572 * @n: NAPI context
6573 *
6574 * Resume NAPI from being scheduled on this context.
6575 * Must be paired with napi_disable.
6576 */
6577void napi_enable(struct napi_struct *n)
6578{
6579 unsigned long new, val = READ_ONCE(n->state);
6580
6581 do {
6582 BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
6583
6584 new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
6585 if (n->dev->threaded && n->thread)
6586 new |= NAPIF_STATE_THREADED;
6587 } while (!try_cmpxchg(&n->state, &val, new));
6588}
6589EXPORT_SYMBOL(napi_enable);
6590
6591static void flush_gro_hash(struct napi_struct *napi)
6592{
6593 int i;
6594
6595 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6596 struct sk_buff *skb, *n;
6597
6598 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6599 kfree_skb(skb);
6600 napi->gro_hash[i].count = 0;
6601 }
6602}
6603
6604/* Must be called in process context */
6605void __netif_napi_del(struct napi_struct *napi)
6606{
6607 if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6608 return;
6609
6610 napi_hash_del(napi);
6611 list_del_rcu(&napi->dev_list);
6612 napi_free_frags(napi);
6613
6614 flush_gro_hash(napi);
6615 napi->gro_bitmask = 0;
6616
6617 if (napi->thread) {
6618 kthread_stop(napi->thread);
6619 napi->thread = NULL;
6620 }
6621}
6622EXPORT_SYMBOL(__netif_napi_del);
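
/* Illustrative sketch: the usual teardown order mirroring the setup sketch
 * above, disabling the NAPI before unregistering it so no poll can be in
 * flight. foo_priv is hypothetical.
 *
 *	static void foo_fini_napi(struct foo_priv *priv)
 *	{
 *		napi_disable(&priv->napi);
 *		netif_napi_del(&priv->napi);
 *	}
 */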
6623
6624static int __napi_poll(struct napi_struct *n, bool *repoll)
6625{
6626 int work, weight;
6627
6628 weight = n->weight;
6629
6630 /* This NAPI_STATE_SCHED test is for avoiding a race
6631 * with netpoll's poll_napi(). Only the entity which
6632 * obtains the lock and sees NAPI_STATE_SCHED set will
6633 * actually make the ->poll() call. Therefore we avoid
6634 * accidentally calling ->poll() when NAPI is not scheduled.
6635 */
6636 work = 0;
6637 if (napi_is_scheduled(n)) {
6638 work = n->poll(n, weight);
6639 trace_napi_poll(n, work, weight);
6640
6641 xdp_do_check_flushed(n);
6642 }
6643
6644 if (unlikely(work > weight))
6645 netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6646 n->poll, work, weight);
6647
6648 if (likely(work < weight))
6649 return work;
6650
	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight. In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at will.
	 */
6656 if (unlikely(napi_disable_pending(n))) {
6657 napi_complete(n);
6658 return work;
6659 }
6660
6661 /* The NAPI context has more processing work, but busy-polling
6662 * is preferred. Exit early.
6663 */
6664 if (napi_prefer_busy_poll(n)) {
6665 if (napi_complete_done(n, work)) {
6666 /* If timeout is not set, we need to make sure
6667 * that the NAPI is re-scheduled.
6668 */
6669 napi_schedule(n);
6670 }
6671 return work;
6672 }
6673
6674 if (n->gro_bitmask) {
		/* Flush packets that are too old.
		 * If HZ < 1000, flush all packets.
		 */
6678 napi_gro_flush(n, HZ >= 1000);
6679 }
6680
6681 gro_normal_list(n);
6682
6683 /* Some drivers may have called napi_schedule
6684 * prior to exhausting their budget.
6685 */
6686 if (unlikely(!list_empty(&n->poll_list))) {
6687 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6688 n->dev ? n->dev->name : "backlog");
6689 return work;
6690 }
6691
6692 *repoll = true;
6693
6694 return work;
6695}
6696
6697static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6698{
6699 bool do_repoll = false;
6700 void *have;
6701 int work;
6702
6703 list_del_init(&n->poll_list);
6704
6705 have = netpoll_poll_lock(n);
6706
6707 work = __napi_poll(n, &do_repoll);
6708
6709 if (do_repoll)
6710 list_add_tail(&n->poll_list, repoll);
6711
6712 netpoll_poll_unlock(have);
6713
6714 return work;
6715}
6716
6717static int napi_thread_wait(struct napi_struct *napi)
6718{
6719 bool woken = false;
6720
6721 set_current_state(TASK_INTERRUPTIBLE);
6722
6723 while (!kthread_should_stop()) {
		/* Test the SCHED_THREADED bit here to make sure the current
		 * kthread owns this napi and can poll on this napi.
		 * Testing the SCHED bit is not enough because it might be
		 * set by some other busy poll thread or by napi_disable().
		 */
6729 if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
6730 WARN_ON(!list_empty(&napi->poll_list));
6731 __set_current_state(TASK_RUNNING);
6732 return 0;
6733 }
6734
6735 schedule();
6736 /* woken being true indicates this thread owns this napi. */
6737 woken = true;
6738 set_current_state(TASK_INTERRUPTIBLE);
6739 }
6740 __set_current_state(TASK_RUNNING);
6741
6742 return -1;
6743}
6744
6745static int napi_threaded_poll(void *data)
6746{
6747 struct napi_struct *napi = data;
6748 struct softnet_data *sd;
6749 void *have;
6750
6751 while (!napi_thread_wait(napi)) {
6752 unsigned long last_qs = jiffies;
6753
6754 for (;;) {
6755 bool repoll = false;
6756
6757 local_bh_disable();
6758 sd = this_cpu_ptr(&softnet_data);
6759 sd->in_napi_threaded_poll = true;
6760
6761 have = netpoll_poll_lock(napi);
6762 __napi_poll(napi, &repoll);
6763 netpoll_poll_unlock(have);
6764
6765 sd->in_napi_threaded_poll = false;
6766 barrier();
6767
6768 if (sd_has_rps_ipi_waiting(sd)) {
6769 local_irq_disable();
6770 net_rps_action_and_irq_enable(sd);
6771 }
6772 skb_defer_free_flush(sd);
6773 local_bh_enable();
6774
6775 if (!repoll)
6776 break;
6777
6778 rcu_softirq_qs_periodic(last_qs);
6779 cond_resched();
6780 }
6781 }
6782 return 0;
6783}
6784
6785static __latent_entropy void net_rx_action(struct softirq_action *h)
6786{
6787 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6788 unsigned long time_limit = jiffies +
6789 usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
6790 int budget = READ_ONCE(net_hotdata.netdev_budget);
6791 LIST_HEAD(list);
6792 LIST_HEAD(repoll);
6793
6794start:
6795 sd->in_net_rx_action = true;
6796 local_irq_disable();
6797 list_splice_init(&sd->poll_list, &list);
6798 local_irq_enable();
6799
6800 for (;;) {
6801 struct napi_struct *n;
6802
6803 skb_defer_free_flush(sd);
6804
6805 if (list_empty(&list)) {
6806 if (list_empty(&repoll)) {
6807 sd->in_net_rx_action = false;
6808 barrier();
6809 /* We need to check if ____napi_schedule()
6810 * had refilled poll_list while
6811 * sd->in_net_rx_action was true.
6812 */
6813 if (!list_empty(&sd->poll_list))
6814 goto start;
6815 if (!sd_has_rps_ipi_waiting(sd))
6816 goto end;
6817 }
6818 break;
6819 }
6820
6821 n = list_first_entry(&list, struct napi_struct, poll_list);
6822 budget -= napi_poll(n, &repoll);
6823
		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which gives
		 * an average latency of 1.5/HZ.
		 */
6828 if (unlikely(budget <= 0 ||
6829 time_after_eq(jiffies, time_limit))) {
6830 sd->time_squeeze++;
6831 break;
6832 }
6833 }
6834
6835 local_irq_disable();
6836
6837 list_splice_tail_init(&sd->poll_list, &list);
6838 list_splice_tail(&repoll, &list);
6839 list_splice(&list, &sd->poll_list);
6840 if (!list_empty(&sd->poll_list))
6841 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6842 else
6843 sd->in_net_rx_action = false;
6844
6845 net_rps_action_and_irq_enable(sd);
6846end:;
6847}
6848
6849struct netdev_adjacent {
6850 struct net_device *dev;
6851 netdevice_tracker dev_tracker;
6852
6853 /* upper master flag, there can only be one master device per list */
6854 bool master;
6855
6856 /* lookup ignore flag */
6857 bool ignore;
6858
6859 /* counter for the number of times this device was added to us */
6860 u16 ref_nr;
6861
6862 /* private field for the users */
6863 void *private;
6864
6865 struct list_head list;
6866 struct rcu_head rcu;
6867};
6868
6869static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6870 struct list_head *adj_list)
6871{
6872 struct netdev_adjacent *adj;
6873
6874 list_for_each_entry(adj, adj_list, list) {
6875 if (adj->dev == adj_dev)
6876 return adj;
6877 }
6878 return NULL;
6879}
6880
6881static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6882 struct netdev_nested_priv *priv)
6883{
6884 struct net_device *dev = (struct net_device *)priv->data;
6885
6886 return upper_dev == dev;
6887}
6888
6889/**
6890 * netdev_has_upper_dev - Check if device is linked to an upper device
6891 * @dev: device
6892 * @upper_dev: upper device to check
6893 *
 * Find out if a device is linked to the specified upper device and return
 * true in case it is. Note that this checks only the immediate upper device,
 * not the complete stack of devices. The caller must hold the RTNL lock.
6897 */
6898bool netdev_has_upper_dev(struct net_device *dev,
6899 struct net_device *upper_dev)
6900{
6901 struct netdev_nested_priv priv = {
6902 .data = (void *)upper_dev,
6903 };
6904
6905 ASSERT_RTNL();
6906
6907 return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6908 &priv);
6909}
6910EXPORT_SYMBOL(netdev_has_upper_dev);
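
/* Illustrative sketch: refusing to enslave a device whose immediate uppers
 * already include the prospective master, checked under RTNL.
 * foo_can_enslave() is a hypothetical helper.
 *
 *	static bool foo_can_enslave(struct net_device *master,
 *				    struct net_device *slave)
 *	{
 *		ASSERT_RTNL();
 *		return !netdev_has_upper_dev(slave, master);
 *	}
 */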
6911
6912/**
6913 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6914 * @dev: device
6915 * @upper_dev: upper device to check
6916 *
 * Find out if a device is linked to the specified upper device and return
 * true in case it is. Note that this checks the entire upper device chain.
 * The caller must hold the RCU read lock.
6920 */
6921
6922bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6923 struct net_device *upper_dev)
6924{
6925 struct netdev_nested_priv priv = {
6926 .data = (void *)upper_dev,
6927 };
6928
6929 return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6930 &priv);
6931}
6932EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6933
6934/**
6935 * netdev_has_any_upper_dev - Check if device is linked to some device
6936 * @dev: device
6937 *
6938 * Find out if a device is linked to an upper device and return true in case
6939 * it is. The caller must hold the RTNL lock.
6940 */
6941bool netdev_has_any_upper_dev(struct net_device *dev)
6942{
6943 ASSERT_RTNL();
6944
6945 return !list_empty(&dev->adj_list.upper);
6946}
6947EXPORT_SYMBOL(netdev_has_any_upper_dev);
6948
6949/**
6950 * netdev_master_upper_dev_get - Get master upper device
6951 * @dev: device
6952 *
6953 * Find a master upper device and return pointer to it or NULL in case
6954 * it's not there. The caller must hold the RTNL lock.
6955 */
6956struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6957{
6958 struct netdev_adjacent *upper;
6959
6960 ASSERT_RTNL();
6961
6962 if (list_empty(&dev->adj_list.upper))
6963 return NULL;
6964
6965 upper = list_first_entry(&dev->adj_list.upper,
6966 struct netdev_adjacent, list);
6967 if (likely(upper->master))
6968 return upper->dev;
6969 return NULL;
6970}
6971EXPORT_SYMBOL(netdev_master_upper_dev_get);
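
/* Illustrative sketch: checking whether a port is still free of any master,
 * under RTNL. foo_port_is_free() is a hypothetical helper.
 *
 *	static bool foo_port_is_free(struct net_device *port)
 *	{
 *		ASSERT_RTNL();
 *		return !netdev_master_upper_dev_get(port);
 *	}
 */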
6972
6973static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6974{
6975 struct netdev_adjacent *upper;
6976
6977 ASSERT_RTNL();
6978
6979 if (list_empty(&dev->adj_list.upper))
6980 return NULL;
6981
6982 upper = list_first_entry(&dev->adj_list.upper,
6983 struct netdev_adjacent, list);
6984 if (likely(upper->master) && !upper->ignore)
6985 return upper->dev;
6986 return NULL;
6987}
6988
6989/**
6990 * netdev_has_any_lower_dev - Check if device is linked to some device
6991 * @dev: device
6992 *
6993 * Find out if a device is linked to a lower device and return true in case
6994 * it is. The caller must hold the RTNL lock.
6995 */
6996static bool netdev_has_any_lower_dev(struct net_device *dev)
6997{
6998 ASSERT_RTNL();
6999
7000 return !list_empty(&dev->adj_list.lower);
7001}
7002
7003void *netdev_adjacent_get_private(struct list_head *adj_list)
7004{
7005 struct netdev_adjacent *adj;
7006
7007 adj = list_entry(adj_list, struct netdev_adjacent, list);
7008
7009 return adj->private;
7010}
7011EXPORT_SYMBOL(netdev_adjacent_get_private);
7012
7013/**
7014 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
7015 * @dev: device
7016 * @iter: list_head ** of the current position
7017 *
7018 * Gets the next device from the dev's upper list, starting from iter
7019 * position. The caller must hold RCU read lock.
7020 */
7021struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7022 struct list_head **iter)
7023{
7024 struct netdev_adjacent *upper;
7025
7026 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7027
7028 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7029
7030 if (&upper->list == &dev->adj_list.upper)
7031 return NULL;
7032
7033 *iter = &upper->list;
7034
7035 return upper->dev;
7036}
7037EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
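
/* Illustrative sketch: iterating the immediate upper devices with the
 * netdev_for_each_upper_dev_rcu() helper from netdevice.h, which is built
 * on top of this function. foo_dump_uppers() is hypothetical.
 *
 *	static void foo_dump_uppers(struct net_device *dev)
 *	{
 *		struct net_device *upper;
 *		struct list_head *iter;
 *
 *		rcu_read_lock();
 *		netdev_for_each_upper_dev_rcu(dev, upper, iter)
 *			netdev_info(dev, "upper: %s\n", upper->name);
 *		rcu_read_unlock();
 *	}
 */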
7038
7039static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7040 struct list_head **iter,
7041 bool *ignore)
7042{
7043 struct netdev_adjacent *upper;
7044
7045 upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7046
7047 if (&upper->list == &dev->adj_list.upper)
7048 return NULL;
7049
7050 *iter = &upper->list;
7051 *ignore = upper->ignore;
7052
7053 return upper->dev;
7054}
7055
7056static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7057 struct list_head **iter)
7058{
7059 struct netdev_adjacent *upper;
7060
7061 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7062
7063 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7064
7065 if (&upper->list == &dev->adj_list.upper)
7066 return NULL;
7067
7068 *iter = &upper->list;
7069
7070 return upper->dev;
7071}
7072
7073static int __netdev_walk_all_upper_dev(struct net_device *dev,
7074 int (*fn)(struct net_device *dev,
7075 struct netdev_nested_priv *priv),
7076 struct netdev_nested_priv *priv)
7077{
7078 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7079 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7080 int ret, cur = 0;
7081 bool ignore;
7082
7083 now = dev;
7084 iter = &dev->adj_list.upper;
7085
7086 while (1) {
7087 if (now != dev) {
7088 ret = fn(now, priv);
7089 if (ret)
7090 return ret;
7091 }
7092
7093 next = NULL;
7094 while (1) {
7095 udev = __netdev_next_upper_dev(now, &iter, &ignore);
7096 if (!udev)
7097 break;
7098 if (ignore)
7099 continue;
7100
7101 next = udev;
7102 niter = &udev->adj_list.upper;
7103 dev_stack[cur] = now;
7104 iter_stack[cur++] = iter;
7105 break;
7106 }
7107
7108 if (!next) {
7109 if (!cur)
7110 return 0;
7111 next = dev_stack[--cur];
7112 niter = iter_stack[cur];
7113 }
7114
7115 now = next;
7116 iter = niter;
7117 }
7118
7119 return 0;
7120}
7121
7122int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7123 int (*fn)(struct net_device *dev,
7124 struct netdev_nested_priv *priv),
7125 struct netdev_nested_priv *priv)
7126{
7127 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7128 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7129 int ret, cur = 0;
7130
7131 now = dev;
7132 iter = &dev->adj_list.upper;
7133
7134 while (1) {
7135 if (now != dev) {
7136 ret = fn(now, priv);
7137 if (ret)
7138 return ret;
7139 }
7140
7141 next = NULL;
7142 while (1) {
7143 udev = netdev_next_upper_dev_rcu(now, &iter);
7144 if (!udev)
7145 break;
7146
7147 next = udev;
7148 niter = &udev->adj_list.upper;
7149 dev_stack[cur] = now;
7150 iter_stack[cur++] = iter;
7151 break;
7152 }
7153
7154 if (!next) {
7155 if (!cur)
7156 return 0;
7157 next = dev_stack[--cur];
7158 niter = iter_stack[cur];
7159 }
7160
7161 now = next;
7162 iter = niter;
7163 }
7164
7165 return 0;
7166}
7167EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
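
/* Illustrative sketch: counting every device in the upper chain with the
 * walker, using netdev_nested_priv.data to carry the accumulator.
 * foo_count_one() and foo_count_uppers() are hypothetical.
 *
 *	static int foo_count_one(struct net_device *dev,
 *				 struct netdev_nested_priv *priv)
 *	{
 *		(*(int *)priv->data)++;
 *		return 0;		// non-zero would stop the walk
 *	}
 *
 *	static int foo_count_uppers(struct net_device *dev)
 *	{
 *		int count = 0;
 *		struct netdev_nested_priv priv = { .data = &count };
 *
 *		rcu_read_lock();
 *		netdev_walk_all_upper_dev_rcu(dev, foo_count_one, &priv);
 *		rcu_read_unlock();
 *		return count;
 *	}
 */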
7168
7169static bool __netdev_has_upper_dev(struct net_device *dev,
7170 struct net_device *upper_dev)
7171{
7172 struct netdev_nested_priv priv = {
7173 .flags = 0,
7174 .data = (void *)upper_dev,
7175 };
7176
7177 ASSERT_RTNL();
7178
7179 return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7180 &priv);
7181}
7182
7183/**
7184 * netdev_lower_get_next_private - Get the next ->private from the
7185 * lower neighbour list
7186 * @dev: device
7187 * @iter: list_head ** of the current position
7188 *
7189 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must either hold the
 * RTNL lock or its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
7193 */
7194void *netdev_lower_get_next_private(struct net_device *dev,
7195 struct list_head **iter)
7196{
7197 struct netdev_adjacent *lower;
7198
7199 lower = list_entry(*iter, struct netdev_adjacent, list);
7200
7201 if (&lower->list == &dev->adj_list.lower)
7202 return NULL;
7203
7204 *iter = lower->list.next;
7205
7206 return lower->private;
7207}
7208EXPORT_SYMBOL(netdev_lower_get_next_private);
7209
7210/**
7211 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7212 * lower neighbour list, RCU
7213 * variant
7214 * @dev: device
7215 * @iter: list_head ** of the current position
7216 *
7217 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7218 * list, starting from iter position. The caller must hold RCU read lock.
7219 */
7220void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7221 struct list_head **iter)
7222{
7223 struct netdev_adjacent *lower;
7224
7225 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
7226
7227 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7228
7229 if (&lower->list == &dev->adj_list.lower)
7230 return NULL;
7231
7232 *iter = &lower->list;
7233
7234 return lower->private;
7235}
7236EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7237
7238/**
7239 * netdev_lower_get_next - Get the next device from the lower neighbour
7240 * list
7241 * @dev: device
7242 * @iter: list_head ** of the current position
7243 *
7244 * Gets the next netdev_adjacent from the dev's lower neighbour
7245 * list, starting from iter position. The caller must hold RTNL lock or
7246 * its own locking that guarantees that the neighbour lower
7247 * list will remain unchanged.
7248 */
7249void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7250{
7251 struct netdev_adjacent *lower;
7252
7253 lower = list_entry(*iter, struct netdev_adjacent, list);
7254
7255 if (&lower->list == &dev->adj_list.lower)
7256 return NULL;
7257
7258 *iter = lower->list.next;
7259
7260 return lower->dev;
7261}
7262EXPORT_SYMBOL(netdev_lower_get_next);
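
/* Illustrative sketch: walking the immediate lower devices under RTNL with
 * the netdev_for_each_lower_dev() helper from netdevice.h, which wraps this
 * function. foo_count_ports() is hypothetical.
 *
 *	static unsigned int foo_count_ports(struct net_device *master)
 *	{
 *		struct net_device *lower;
 *		struct list_head *iter;
 *		unsigned int n = 0;
 *
 *		ASSERT_RTNL();
 *		netdev_for_each_lower_dev(master, lower, iter)
 *			n++;
 *		return n;
 *	}
 */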
7263
7264static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7265 struct list_head **iter)
7266{
7267 struct netdev_adjacent *lower;
7268
7269 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7270
7271 if (&lower->list == &dev->adj_list.lower)
7272 return NULL;
7273
7274 *iter = &lower->list;
7275
7276 return lower->dev;
7277}
7278
7279static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7280 struct list_head **iter,
7281 bool *ignore)
7282{
7283 struct netdev_adjacent *lower;
7284
7285 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7286
7287 if (&lower->list == &dev->adj_list.lower)
7288 return NULL;
7289
7290 *iter = &lower->list;
7291 *ignore = lower->ignore;
7292
7293 return lower->dev;
7294}
7295
7296int netdev_walk_all_lower_dev(struct net_device *dev,
7297 int (*fn)(struct net_device *dev,
7298 struct netdev_nested_priv *priv),
7299 struct netdev_nested_priv *priv)
7300{
7301 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7302 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7303 int ret, cur = 0;
7304
7305 now = dev;
7306 iter = &dev->adj_list.lower;
7307
7308 while (1) {
7309 if (now != dev) {
7310 ret = fn(now, priv);
7311 if (ret)
7312 return ret;
7313 }
7314
7315 next = NULL;
7316 while (1) {
7317 ldev = netdev_next_lower_dev(now, &iter);
7318 if (!ldev)
7319 break;
7320
7321 next = ldev;
7322 niter = &ldev->adj_list.lower;
7323 dev_stack[cur] = now;
7324 iter_stack[cur++] = iter;
7325 break;
7326 }
7327
7328 if (!next) {
7329 if (!cur)
7330 return 0;
7331 next = dev_stack[--cur];
7332 niter = iter_stack[cur];
7333 }
7334
7335 now = next;
7336 iter = niter;
7337 }
7338
7339 return 0;
7340}
7341EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7342
7343static int __netdev_walk_all_lower_dev(struct net_device *dev,
7344 int (*fn)(struct net_device *dev,
7345 struct netdev_nested_priv *priv),
7346 struct netdev_nested_priv *priv)
7347{
7348 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7349 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7350 int ret, cur = 0;
7351 bool ignore;
7352
7353 now = dev;
7354 iter = &dev->adj_list.lower;
7355
7356 while (1) {
7357 if (now != dev) {
7358 ret = fn(now, priv);
7359 if (ret)
7360 return ret;
7361 }
7362
7363 next = NULL;
7364 while (1) {
7365 ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7366 if (!ldev)
7367 break;
7368 if (ignore)
7369 continue;
7370
7371 next = ldev;
7372 niter = &ldev->adj_list.lower;
7373 dev_stack[cur] = now;
7374 iter_stack[cur++] = iter;
7375 break;
7376 }
7377
7378 if (!next) {
7379 if (!cur)
7380 return 0;
7381 next = dev_stack[--cur];
7382 niter = iter_stack[cur];
7383 }
7384
7385 now = next;
7386 iter = niter;
7387 }
7388
7389 return 0;
7390}
7391
7392struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7393 struct list_head **iter)
7394{
7395 struct netdev_adjacent *lower;
7396
7397 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7398 if (&lower->list == &dev->adj_list.lower)
7399 return NULL;
7400
7401 *iter = &lower->list;
7402
7403 return lower->dev;
7404}
7405EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7406
7407static u8 __netdev_upper_depth(struct net_device *dev)
7408{
7409 struct net_device *udev;
7410 struct list_head *iter;
7411 u8 max_depth = 0;
7412 bool ignore;
7413
7414 for (iter = &dev->adj_list.upper,
7415 udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7416 udev;
7417 udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7418 if (ignore)
7419 continue;
7420 if (max_depth < udev->upper_level)
7421 max_depth = udev->upper_level;
7422 }
7423
7424 return max_depth;
7425}
7426
7427static u8 __netdev_lower_depth(struct net_device *dev)
7428{
7429 struct net_device *ldev;
7430 struct list_head *iter;
7431 u8 max_depth = 0;
7432 bool ignore;
7433
7434 for (iter = &dev->adj_list.lower,
7435 ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7436 ldev;
7437 ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7438 if (ignore)
7439 continue;
7440 if (max_depth < ldev->lower_level)
7441 max_depth = ldev->lower_level;
7442 }
7443
7444 return max_depth;
7445}
7446
7447static int __netdev_update_upper_level(struct net_device *dev,
7448 struct netdev_nested_priv *__unused)
7449{
7450 dev->upper_level = __netdev_upper_depth(dev) + 1;
7451 return 0;
7452}
7453
7454#ifdef CONFIG_LOCKDEP
7455static LIST_HEAD(net_unlink_list);
7456
7457static void net_unlink_todo(struct net_device *dev)
7458{
7459 if (list_empty(&dev->unlink_list))
7460 list_add_tail(&dev->unlink_list, &net_unlink_list);
7461}
7462#endif
7463
7464static int __netdev_update_lower_level(struct net_device *dev,
7465 struct netdev_nested_priv *priv)
7466{
7467 dev->lower_level = __netdev_lower_depth(dev) + 1;
7468
7469#ifdef CONFIG_LOCKDEP
7470 if (!priv)
7471 return 0;
7472
7473 if (priv->flags & NESTED_SYNC_IMM)
7474 dev->nested_level = dev->lower_level - 1;
7475 if (priv->flags & NESTED_SYNC_TODO)
7476 net_unlink_todo(dev);
7477#endif
7478 return 0;
7479}
7480
7481int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7482 int (*fn)(struct net_device *dev,
7483 struct netdev_nested_priv *priv),
7484 struct netdev_nested_priv *priv)
7485{
7486 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7487 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7488 int ret, cur = 0;
7489
7490 now = dev;
7491 iter = &dev->adj_list.lower;
7492
7493 while (1) {
7494 if (now != dev) {
7495 ret = fn(now, priv);
7496 if (ret)
7497 return ret;
7498 }
7499
7500 next = NULL;
7501 while (1) {
7502 ldev = netdev_next_lower_dev_rcu(now, &iter);
7503 if (!ldev)
7504 break;
7505
7506 next = ldev;
7507 niter = &ldev->adj_list.lower;
7508 dev_stack[cur] = now;
7509 iter_stack[cur++] = iter;
7510 break;
7511 }
7512
7513 if (!next) {
7514 if (!cur)
7515 return 0;
7516 next = dev_stack[--cur];
7517 niter = iter_stack[cur];
7518 }
7519
7520 now = next;
7521 iter = niter;
7522 }
7523
7524 return 0;
7525}
7526EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7527
7528/**
7529 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7530 * lower neighbour list, RCU
7531 * variant
7532 * @dev: device
7533 *
7534 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7535 * list. The caller must hold RCU read lock.
7536 */
7537void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7538{
7539 struct netdev_adjacent *lower;
7540
7541 lower = list_first_or_null_rcu(&dev->adj_list.lower,
7542 struct netdev_adjacent, list);
7543 if (lower)
7544 return lower->private;
7545 return NULL;
7546}
7547EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7548
7549/**
7550 * netdev_master_upper_dev_get_rcu - Get master upper device
7551 * @dev: device
7552 *
7553 * Find a master upper device and return pointer to it or NULL in case
7554 * it's not there. The caller must hold the RCU read lock.
7555 */
7556struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7557{
7558 struct netdev_adjacent *upper;
7559
7560 upper = list_first_or_null_rcu(&dev->adj_list.upper,
7561 struct netdev_adjacent, list);
7562 if (upper && likely(upper->master))
7563 return upper->dev;
7564 return NULL;
7565}
7566EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7567
7568static int netdev_adjacent_sysfs_add(struct net_device *dev,
7569 struct net_device *adj_dev,
7570 struct list_head *dev_list)
7571{
7572 char linkname[IFNAMSIZ+7];
7573
7574 sprintf(linkname, dev_list == &dev->adj_list.upper ?
7575 "upper_%s" : "lower_%s", adj_dev->name);
7576 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7577 linkname);
7578}
7579static void netdev_adjacent_sysfs_del(struct net_device *dev,
7580 char *name,
7581 struct list_head *dev_list)
7582{
7583 char linkname[IFNAMSIZ+7];
7584
7585 sprintf(linkname, dev_list == &dev->adj_list.upper ?
7586 "upper_%s" : "lower_%s", name);
7587 sysfs_remove_link(&(dev->dev.kobj), linkname);
7588}
7589
7590static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7591 struct net_device *adj_dev,
7592 struct list_head *dev_list)
7593{
7594 return (dev_list == &dev->adj_list.upper ||
7595 dev_list == &dev->adj_list.lower) &&
7596 net_eq(dev_net(dev), dev_net(adj_dev));
7597}
7598
7599static int __netdev_adjacent_dev_insert(struct net_device *dev,
7600 struct net_device *adj_dev,
7601 struct list_head *dev_list,
7602 void *private, bool master)
7603{
7604 struct netdev_adjacent *adj;
7605 int ret;
7606
7607 adj = __netdev_find_adj(adj_dev, dev_list);
7608
7609 if (adj) {
7610 adj->ref_nr += 1;
7611 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7612 dev->name, adj_dev->name, adj->ref_nr);
7613
7614 return 0;
7615 }
7616
7617 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7618 if (!adj)
7619 return -ENOMEM;
7620
7621 adj->dev = adj_dev;
7622 adj->master = master;
7623 adj->ref_nr = 1;
7624 adj->private = private;
7625 adj->ignore = false;
7626 netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
7627
7628 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7629 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7630
7631 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7632 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7633 if (ret)
7634 goto free_adj;
7635 }
7636
7637 /* Ensure that master link is always the first item in list. */
7638 if (master) {
7639 ret = sysfs_create_link(&(dev->dev.kobj),
7640 &(adj_dev->dev.kobj), "master");
7641 if (ret)
7642 goto remove_symlinks;
7643
7644 list_add_rcu(&adj->list, dev_list);
7645 } else {
7646 list_add_tail_rcu(&adj->list, dev_list);
7647 }
7648
7649 return 0;
7650
7651remove_symlinks:
7652 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7653 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7654free_adj:
7655 netdev_put(adj_dev, &adj->dev_tracker);
7656 kfree(adj);
7657
7658 return ret;
7659}
7660
7661static void __netdev_adjacent_dev_remove(struct net_device *dev,
7662 struct net_device *adj_dev,
7663 u16 ref_nr,
7664 struct list_head *dev_list)
7665{
7666 struct netdev_adjacent *adj;
7667
7668 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7669 dev->name, adj_dev->name, ref_nr);
7670
7671 adj = __netdev_find_adj(adj_dev, dev_list);
7672
7673 if (!adj) {
7674 pr_err("Adjacency does not exist for device %s from %s\n",
7675 dev->name, adj_dev->name);
7676 WARN_ON(1);
7677 return;
7678 }
7679
7680 if (adj->ref_nr > ref_nr) {
7681 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7682 dev->name, adj_dev->name, ref_nr,
7683 adj->ref_nr - ref_nr);
7684 adj->ref_nr -= ref_nr;
7685 return;
7686 }
7687
7688 if (adj->master)
7689 sysfs_remove_link(&(dev->dev.kobj), "master");
7690
7691 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7692 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7693
7694 list_del_rcu(&adj->list);
7695 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7696 adj_dev->name, dev->name, adj_dev->name);
7697 netdev_put(adj_dev, &adj->dev_tracker);
7698 kfree_rcu(adj, rcu);
7699}
7700
7701static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7702 struct net_device *upper_dev,
7703 struct list_head *up_list,
7704 struct list_head *down_list,
7705 void *private, bool master)
7706{
7707 int ret;
7708
7709 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7710 private, master);
7711 if (ret)
7712 return ret;
7713
7714 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7715 private, false);
7716 if (ret) {
7717 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7718 return ret;
7719 }
7720
7721 return 0;
7722}
7723
7724static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7725 struct net_device *upper_dev,
7726 u16 ref_nr,
7727 struct list_head *up_list,
7728 struct list_head *down_list)
7729{
7730 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7731 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7732}
7733
7734static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7735 struct net_device *upper_dev,
7736 void *private, bool master)
7737{
7738 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7739 &dev->adj_list.upper,
7740 &upper_dev->adj_list.lower,
7741 private, master);
7742}
7743
7744static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7745 struct net_device *upper_dev)
7746{
7747 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7748 &dev->adj_list.upper,
7749 &upper_dev->adj_list.lower);
7750}
7751
7752static int __netdev_upper_dev_link(struct net_device *dev,
7753 struct net_device *upper_dev, bool master,
7754 void *upper_priv, void *upper_info,
7755 struct netdev_nested_priv *priv,
7756 struct netlink_ext_ack *extack)
7757{
7758 struct netdev_notifier_changeupper_info changeupper_info = {
7759 .info = {
7760 .dev = dev,
7761 .extack = extack,
7762 },
7763 .upper_dev = upper_dev,
7764 .master = master,
7765 .linking = true,
7766 .upper_info = upper_info,
7767 };
7768 struct net_device *master_dev;
7769 int ret = 0;
7770
7771 ASSERT_RTNL();
7772
7773 if (dev == upper_dev)
7774 return -EBUSY;
7775
	/* To prevent loops, check that dev is not an upper device of upper_dev. */
7777 if (__netdev_has_upper_dev(upper_dev, dev))
7778 return -EBUSY;
7779
7780 if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7781 return -EMLINK;
7782
7783 if (!master) {
7784 if (__netdev_has_upper_dev(dev, upper_dev))
7785 return -EEXIST;
7786 } else {
7787 master_dev = __netdev_master_upper_dev_get(dev);
7788 if (master_dev)
7789 return master_dev == upper_dev ? -EEXIST : -EBUSY;
7790 }
7791
7792 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7793 &changeupper_info.info);
7794 ret = notifier_to_errno(ret);
7795 if (ret)
7796 return ret;
7797
7798 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7799 master);
7800 if (ret)
7801 return ret;
7802
7803 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7804 &changeupper_info.info);
7805 ret = notifier_to_errno(ret);
7806 if (ret)
7807 goto rollback;
7808
7809 __netdev_update_upper_level(dev, NULL);
7810 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7811
7812 __netdev_update_lower_level(upper_dev, priv);
7813 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7814 priv);
7815
7816 return 0;
7817
7818rollback:
7819 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7820
7821 return ret;
7822}
7823
7824/**
7825 * netdev_upper_dev_link - Add a link to the upper device
7826 * @dev: device
7827 * @upper_dev: new upper device
7828 * @extack: netlink extended ack
7829 *
 * Adds a link to a device which is upper to this one. The caller must hold
7831 * the RTNL lock. On a failure a negative errno code is returned.
7832 * On success the reference counts are adjusted and the function
7833 * returns zero.
7834 */
7835int netdev_upper_dev_link(struct net_device *dev,
7836 struct net_device *upper_dev,
7837 struct netlink_ext_ack *extack)
7838{
7839 struct netdev_nested_priv priv = {
7840 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7841 .data = NULL,
7842 };
7843
7844 return __netdev_upper_dev_link(dev, upper_dev, false,
7845 NULL, NULL, &priv, extack);
7846}
7847EXPORT_SYMBOL(netdev_upper_dev_link);
7848
7849/**
7850 * netdev_master_upper_dev_link - Add a master link to the upper device
7851 * @dev: device
7852 * @upper_dev: new upper device
7853 * @upper_priv: upper device private
7854 * @upper_info: upper info to be passed down via notifier
7855 * @extack: netlink extended ack
7856 *
 * Adds a link to a device which is upper to this one. In this case, only
7858 * one master upper device can be linked, although other non-master devices
7859 * might be linked as well. The caller must hold the RTNL lock.
7860 * On a failure a negative errno code is returned. On success the reference
7861 * counts are adjusted and the function returns zero.
7862 */
7863int netdev_master_upper_dev_link(struct net_device *dev,
7864 struct net_device *upper_dev,
7865 void *upper_priv, void *upper_info,
7866 struct netlink_ext_ack *extack)
7867{
7868 struct netdev_nested_priv priv = {
7869 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7870 .data = NULL,
7871 };
7872
7873 return __netdev_upper_dev_link(dev, upper_dev, true,
7874 upper_priv, upper_info, &priv, extack);
7875}
7876EXPORT_SYMBOL(netdev_master_upper_dev_link);
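
/* Illustrative sketch: how a virtual master driver might link a new lower
 * device and undo it on teardown, under RTNL. The NULL upper_priv/upper_info
 * are placeholders; real masters (bonding, team, bridge) pass their own
 * private and info structures. foo_enslave() and foo_release() are
 * hypothetical.
 *
 *	static int foo_enslave(struct net_device *master, struct net_device *port,
 *			       struct netlink_ext_ack *extack)
 *	{
 *		return netdev_master_upper_dev_link(port, master, NULL, NULL,
 *						    extack);
 *	}
 *
 *	static void foo_release(struct net_device *master, struct net_device *port)
 *	{
 *		netdev_upper_dev_unlink(port, master);
 *	}
 */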
7877
7878static void __netdev_upper_dev_unlink(struct net_device *dev,
7879 struct net_device *upper_dev,
7880 struct netdev_nested_priv *priv)
7881{
7882 struct netdev_notifier_changeupper_info changeupper_info = {
7883 .info = {
7884 .dev = dev,
7885 },
7886 .upper_dev = upper_dev,
7887 .linking = false,
7888 };
7889
7890 ASSERT_RTNL();
7891
7892 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7893
7894 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7895 &changeupper_info.info);
7896
7897 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7898
7899 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7900 &changeupper_info.info);
7901
7902 __netdev_update_upper_level(dev, NULL);
7903 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7904
7905 __netdev_update_lower_level(upper_dev, priv);
7906 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7907 priv);
7908}
7909
7910/**
7911 * netdev_upper_dev_unlink - Removes a link to upper device
7912 * @dev: device
7913 * @upper_dev: upper device to unlink
7914 *
7915 * Removes a link to a device which is upper to this one. The caller must hold
7916 * the RTNL lock.
7917 */
7918void netdev_upper_dev_unlink(struct net_device *dev,
7919 struct net_device *upper_dev)
7920{
7921 struct netdev_nested_priv priv = {
7922 .flags = NESTED_SYNC_TODO,
7923 .data = NULL,
7924 };
7925
7926 __netdev_upper_dev_unlink(dev, upper_dev, &priv);
7927}
7928EXPORT_SYMBOL(netdev_upper_dev_unlink);
7929
7930static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7931 struct net_device *lower_dev,
7932 bool val)
7933{
7934 struct netdev_adjacent *adj;
7935
7936 adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7937 if (adj)
7938 adj->ignore = val;
7939
7940 adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7941 if (adj)
7942 adj->ignore = val;
7943}
7944
7945static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7946 struct net_device *lower_dev)
7947{
7948 __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7949}
7950
7951static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7952 struct net_device *lower_dev)
7953{
7954 __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7955}
7956
7957int netdev_adjacent_change_prepare(struct net_device *old_dev,
7958 struct net_device *new_dev,
7959 struct net_device *dev,
7960 struct netlink_ext_ack *extack)
7961{
7962 struct netdev_nested_priv priv = {
7963 .flags = 0,
7964 .data = NULL,
7965 };
7966 int err;
7967
7968 if (!new_dev)
7969 return 0;
7970
7971 if (old_dev && new_dev != old_dev)
7972 netdev_adjacent_dev_disable(dev, old_dev);
7973 err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
7974 extack);
7975 if (err) {
7976 if (old_dev && new_dev != old_dev)
7977 netdev_adjacent_dev_enable(dev, old_dev);
7978 return err;
7979 }
7980
7981 return 0;
7982}
7983EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7984
7985void netdev_adjacent_change_commit(struct net_device *old_dev,
7986 struct net_device *new_dev,
7987 struct net_device *dev)
7988{
7989 struct netdev_nested_priv priv = {
7990 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7991 .data = NULL,
7992 };
7993
7994 if (!new_dev || !old_dev)
7995 return;
7996
7997 if (new_dev == old_dev)
7998 return;
7999
8000 netdev_adjacent_dev_enable(dev, old_dev);
8001 __netdev_upper_dev_unlink(old_dev, dev, &priv);
8002}
8003EXPORT_SYMBOL(netdev_adjacent_change_commit);
8004
8005void netdev_adjacent_change_abort(struct net_device *old_dev,
8006 struct net_device *new_dev,
8007 struct net_device *dev)
8008{
8009 struct netdev_nested_priv priv = {
8010 .flags = 0,
8011 .data = NULL,
8012 };
8013
8014 if (!new_dev)
8015 return;
8016
8017 if (old_dev && new_dev != old_dev)
8018 netdev_adjacent_dev_enable(dev, old_dev);
8019
8020 __netdev_upper_dev_unlink(new_dev, dev, &priv);
8021}
8022EXPORT_SYMBOL(netdev_adjacent_change_abort);
8023
8024/**
8025 * netdev_bonding_info_change - Dispatch event about slave change
8026 * @dev: device
8027 * @bonding_info: info to dispatch
8028 *
8029 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8030 * The caller must hold the RTNL lock.
8031 */
8032void netdev_bonding_info_change(struct net_device *dev,
8033 struct netdev_bonding_info *bonding_info)
8034{
8035 struct netdev_notifier_bonding_info info = {
8036 .info.dev = dev,
8037 };
8038
8039 memcpy(&info.bonding_info, bonding_info,
8040 sizeof(struct netdev_bonding_info));
8041 call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8042 &info.info);
8043}
8044EXPORT_SYMBOL(netdev_bonding_info_change);
8045
8046static int netdev_offload_xstats_enable_l3(struct net_device *dev,
8047 struct netlink_ext_ack *extack)
8048{
8049 struct netdev_notifier_offload_xstats_info info = {
8050 .info.dev = dev,
8051 .info.extack = extack,
8052 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8053 };
8054 int err;
8055 int rc;
8056
8057 dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
8058 GFP_KERNEL);
8059 if (!dev->offload_xstats_l3)
8060 return -ENOMEM;
8061
8062 rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
8063 NETDEV_OFFLOAD_XSTATS_DISABLE,
8064 &info.info);
8065 err = notifier_to_errno(rc);
8066 if (err)
8067 goto free_stats;
8068
8069 return 0;
8070
8071free_stats:
8072 kfree(dev->offload_xstats_l3);
8073 dev->offload_xstats_l3 = NULL;
8074 return err;
8075}
8076
8077int netdev_offload_xstats_enable(struct net_device *dev,
8078 enum netdev_offload_xstats_type type,
8079 struct netlink_ext_ack *extack)
8080{
8081 ASSERT_RTNL();
8082
8083 if (netdev_offload_xstats_enabled(dev, type))
8084 return -EALREADY;
8085
8086 switch (type) {
8087 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8088 return netdev_offload_xstats_enable_l3(dev, extack);
8089 }
8090
8091 WARN_ON(1);
8092 return -EINVAL;
8093}
8094EXPORT_SYMBOL(netdev_offload_xstats_enable);
8095
8096static void netdev_offload_xstats_disable_l3(struct net_device *dev)
8097{
8098 struct netdev_notifier_offload_xstats_info info = {
8099 .info.dev = dev,
8100 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8101 };
8102
8103 call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
8104 &info.info);
8105 kfree(dev->offload_xstats_l3);
8106 dev->offload_xstats_l3 = NULL;
8107}
8108
8109int netdev_offload_xstats_disable(struct net_device *dev,
8110 enum netdev_offload_xstats_type type)
8111{
8112 ASSERT_RTNL();
8113
8114 if (!netdev_offload_xstats_enabled(dev, type))
8115 return -EALREADY;
8116
8117 switch (type) {
8118 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8119 netdev_offload_xstats_disable_l3(dev);
8120 return 0;
8121 }
8122
8123 WARN_ON(1);
8124 return -EINVAL;
8125}
8126EXPORT_SYMBOL(netdev_offload_xstats_disable);
8127
8128static void netdev_offload_xstats_disable_all(struct net_device *dev)
8129{
8130 netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
8131}
8132
8133static struct rtnl_hw_stats64 *
8134netdev_offload_xstats_get_ptr(const struct net_device *dev,
8135 enum netdev_offload_xstats_type type)
8136{
8137 switch (type) {
8138 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8139 return dev->offload_xstats_l3;
8140 }
8141
8142 WARN_ON(1);
8143 return NULL;
8144}
8145
8146bool netdev_offload_xstats_enabled(const struct net_device *dev,
8147 enum netdev_offload_xstats_type type)
8148{
8149 ASSERT_RTNL();
8150
8151 return netdev_offload_xstats_get_ptr(dev, type);
8152}
8153EXPORT_SYMBOL(netdev_offload_xstats_enabled);
8154
8155struct netdev_notifier_offload_xstats_ru {
8156 bool used;
8157};
8158
8159struct netdev_notifier_offload_xstats_rd {
8160 struct rtnl_hw_stats64 stats;
8161 bool used;
8162};
8163
8164static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
8165 const struct rtnl_hw_stats64 *src)
8166{
8167 dest->rx_packets += src->rx_packets;
8168 dest->tx_packets += src->tx_packets;
8169 dest->rx_bytes += src->rx_bytes;
8170 dest->tx_bytes += src->tx_bytes;
8171 dest->rx_errors += src->rx_errors;
8172 dest->tx_errors += src->tx_errors;
8173 dest->rx_dropped += src->rx_dropped;
8174 dest->tx_dropped += src->tx_dropped;
8175 dest->multicast += src->multicast;
8176}
8177
8178static int netdev_offload_xstats_get_used(struct net_device *dev,
8179 enum netdev_offload_xstats_type type,
8180 bool *p_used,
8181 struct netlink_ext_ack *extack)
8182{
8183 struct netdev_notifier_offload_xstats_ru report_used = {};
8184 struct netdev_notifier_offload_xstats_info info = {
8185 .info.dev = dev,
8186 .info.extack = extack,
8187 .type = type,
8188 .report_used = &report_used,
8189 };
8190 int rc;
8191
8192 WARN_ON(!netdev_offload_xstats_enabled(dev, type));
8193 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
8194 &info.info);
8195 *p_used = report_used.used;
8196 return notifier_to_errno(rc);
8197}
8198
8199static int netdev_offload_xstats_get_stats(struct net_device *dev,
8200 enum netdev_offload_xstats_type type,
8201 struct rtnl_hw_stats64 *p_stats,
8202 bool *p_used,
8203 struct netlink_ext_ack *extack)
8204{
8205 struct netdev_notifier_offload_xstats_rd report_delta = {};
8206 struct netdev_notifier_offload_xstats_info info = {
8207 .info.dev = dev,
8208 .info.extack = extack,
8209 .type = type,
8210 .report_delta = &report_delta,
8211 };
8212 struct rtnl_hw_stats64 *stats;
8213 int rc;
8214
8215 stats = netdev_offload_xstats_get_ptr(dev, type);
8216 if (WARN_ON(!stats))
8217 return -EINVAL;
8218
8219 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
8220 &info.info);
8221
8222 /* Cache whatever we got, even if there was an error, otherwise the
8223 * successful stats retrievals would get lost.
8224 */
8225 netdev_hw_stats64_add(stats, &report_delta.stats);
8226
8227 if (p_stats)
8228 *p_stats = *stats;
8229 *p_used = report_delta.used;
8230
8231 return notifier_to_errno(rc);
8232}
8233
8234int netdev_offload_xstats_get(struct net_device *dev,
8235 enum netdev_offload_xstats_type type,
8236 struct rtnl_hw_stats64 *p_stats, bool *p_used,
8237 struct netlink_ext_ack *extack)
8238{
8239 ASSERT_RTNL();
8240
8241 if (p_stats)
8242 return netdev_offload_xstats_get_stats(dev, type, p_stats,
8243 p_used, extack);
8244 else
8245 return netdev_offload_xstats_get_used(dev, type, p_used,
8246 extack);
8247}
8248EXPORT_SYMBOL(netdev_offload_xstats_get);
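
/*
 * Usage sketch (illustrative): a reader such as an rtnetlink stats dump
 * queries the accumulated HW stats under RTNL. Passing a NULL stats
 * pointer only asks whether any notifier actually uses the counters.
 *
 *	struct rtnl_hw_stats64 stats;
 *	bool used;
 *
 *	ASSERT_RTNL();
 *	if (netdev_offload_xstats_enabled(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3))
 *		err = netdev_offload_xstats_get(dev,
 *						NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 *						&stats, &used, extack);
 */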
8249
8250void
8251netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
8252 const struct rtnl_hw_stats64 *stats)
8253{
8254 report_delta->used = true;
8255 netdev_hw_stats64_add(&report_delta->stats, stats);
8256}
8257EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
8258
8259void
8260netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
8261{
8262 report_used->used = true;
8263}
8264EXPORT_SYMBOL(netdev_offload_xstats_report_used);
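
/*
 * Driver-side sketch (illustrative): a netdevice notifier handling
 * NETDEV_OFFLOAD_XSTATS_REPORT_DELTA feeds its hardware counters back
 * through the report helper above. my_read_hw_counters() is a
 * hypothetical driver function.
 *
 *	struct netdev_notifier_offload_xstats_info *info = ptr;
 *	struct rtnl_hw_stats64 stats = {};
 *
 *	if (info->type != NETDEV_OFFLOAD_XSTATS_TYPE_L3)
 *		return NOTIFY_DONE;
 *	my_read_hw_counters(dev, &stats);
 *	netdev_offload_xstats_report_delta(info->report_delta, &stats);
 *	return NOTIFY_OK;
 */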
8265
8266void netdev_offload_xstats_push_delta(struct net_device *dev,
8267 enum netdev_offload_xstats_type type,
8268 const struct rtnl_hw_stats64 *p_stats)
8269{
8270 struct rtnl_hw_stats64 *stats;
8271
8272 ASSERT_RTNL();
8273
8274 stats = netdev_offload_xstats_get_ptr(dev, type);
8275 if (WARN_ON(!stats))
8276 return;
8277
8278 netdev_hw_stats64_add(stats, p_stats);
8279}
8280EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
8281
8282/**
8283 * netdev_get_xmit_slave - Get the xmit slave of master device
8284 * @dev: device
8285 * @skb: The packet
8286 * @all_slaves: assume all the slaves are active
8287 *
8288 * The reference counters are not incremented, so the caller must be
8289 * careful with locks. The caller must hold the RCU read lock.
8290 * %NULL is returned if no slave is found.
8291 */
8292
8293struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8294 struct sk_buff *skb,
8295 bool all_slaves)
8296{
8297 const struct net_device_ops *ops = dev->netdev_ops;
8298
8299 if (!ops->ndo_get_xmit_slave)
8300 return NULL;
8301 return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8302}
8303EXPORT_SYMBOL(netdev_get_xmit_slave);
8304
8305static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8306 struct sock *sk)
8307{
8308 const struct net_device_ops *ops = dev->netdev_ops;
8309
8310 if (!ops->ndo_sk_get_lower_dev)
8311 return NULL;
8312 return ops->ndo_sk_get_lower_dev(dev, sk);
8313}
8314
8315/**
8316 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8317 * @dev: device
8318 * @sk: the socket
8319 *
8320 * %NULL is returned if no lower device is found.
8321 */
8322
8323struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8324 struct sock *sk)
8325{
8326 struct net_device *lower;
8327
8328 lower = netdev_sk_get_lower_dev(dev, sk);
8329 while (lower) {
8330 dev = lower;
8331 lower = netdev_sk_get_lower_dev(dev, sk);
8332 }
8333
8334 return dev;
8335}
8336EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8337
8338static void netdev_adjacent_add_links(struct net_device *dev)
8339{
8340 struct netdev_adjacent *iter;
8341
8342 struct net *net = dev_net(dev);
8343
8344 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8345 if (!net_eq(net, dev_net(iter->dev)))
8346 continue;
8347 netdev_adjacent_sysfs_add(iter->dev, dev,
8348 &iter->dev->adj_list.lower);
8349 netdev_adjacent_sysfs_add(dev, iter->dev,
8350 &dev->adj_list.upper);
8351 }
8352
8353 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8354 if (!net_eq(net, dev_net(iter->dev)))
8355 continue;
8356 netdev_adjacent_sysfs_add(iter->dev, dev,
8357 &iter->dev->adj_list.upper);
8358 netdev_adjacent_sysfs_add(dev, iter->dev,
8359 &dev->adj_list.lower);
8360 }
8361}
8362
8363static void netdev_adjacent_del_links(struct net_device *dev)
8364{
8365 struct netdev_adjacent *iter;
8366
8367 struct net *net = dev_net(dev);
8368
8369 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8370 if (!net_eq(net, dev_net(iter->dev)))
8371 continue;
8372 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8373 &iter->dev->adj_list.lower);
8374 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8375 &dev->adj_list.upper);
8376 }
8377
8378 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8379 if (!net_eq(net, dev_net(iter->dev)))
8380 continue;
8381 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8382 &iter->dev->adj_list.upper);
8383 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8384 &dev->adj_list.lower);
8385 }
8386}
8387
8388void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8389{
8390 struct netdev_adjacent *iter;
8391
8392 struct net *net = dev_net(dev);
8393
8394 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8395 if (!net_eq(net, dev_net(iter->dev)))
8396 continue;
8397 netdev_adjacent_sysfs_del(iter->dev, oldname,
8398 &iter->dev->adj_list.lower);
8399 netdev_adjacent_sysfs_add(iter->dev, dev,
8400 &iter->dev->adj_list.lower);
8401 }
8402
8403 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8404 if (!net_eq(net, dev_net(iter->dev)))
8405 continue;
8406 netdev_adjacent_sysfs_del(iter->dev, oldname,
8407 &iter->dev->adj_list.upper);
8408 netdev_adjacent_sysfs_add(iter->dev, dev,
8409 &iter->dev->adj_list.upper);
8410 }
8411}
8412
8413void *netdev_lower_dev_get_private(struct net_device *dev,
8414 struct net_device *lower_dev)
8415{
8416 struct netdev_adjacent *lower;
8417
8418 if (!lower_dev)
8419 return NULL;
8420 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8421 if (!lower)
8422 return NULL;
8423
8424 return lower->private;
8425}
8426EXPORT_SYMBOL(netdev_lower_dev_get_private);
8427
8428
8429/**
8430 * netdev_lower_state_changed - Dispatch event about lower device state change
8431 * @lower_dev: device
8432 * @lower_state_info: state to dispatch
8433 *
8434 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8435 * The caller must hold the RTNL lock.
8436 */
8437void netdev_lower_state_changed(struct net_device *lower_dev,
8438 void *lower_state_info)
8439{
8440 struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8441 .info.dev = lower_dev,
8442 };
8443
8444 ASSERT_RTNL();
8445 changelowerstate_info.lower_state_info = lower_state_info;
8446 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8447 &changelowerstate_info.info);
8448}
8449EXPORT_SYMBOL(netdev_lower_state_changed);
8450
8451static void dev_change_rx_flags(struct net_device *dev, int flags)
8452{
8453 const struct net_device_ops *ops = dev->netdev_ops;
8454
8455 if (ops->ndo_change_rx_flags)
8456 ops->ndo_change_rx_flags(dev, flags);
8457}
8458
8459static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8460{
8461 unsigned int old_flags = dev->flags;
8462 kuid_t uid;
8463 kgid_t gid;
8464
8465 ASSERT_RTNL();
8466
8467 dev->flags |= IFF_PROMISC;
8468 dev->promiscuity += inc;
8469 if (dev->promiscuity == 0) {
8470 /*
8471 * Avoid overflow.
8472 * If inc causes overflow, leave promisc untouched and return an error.
8473 */
8474 if (inc < 0)
8475 dev->flags &= ~IFF_PROMISC;
8476 else {
8477 dev->promiscuity -= inc;
8478 netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
8479 return -EOVERFLOW;
8480 }
8481 }
8482 if (dev->flags != old_flags) {
8483 netdev_info(dev, "%s promiscuous mode\n",
8484 dev->flags & IFF_PROMISC ? "entered" : "left");
8485 if (audit_enabled) {
8486 current_uid_gid(&uid, &gid);
8487 audit_log(audit_context(), GFP_ATOMIC,
8488 AUDIT_ANOM_PROMISCUOUS,
8489 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8490 dev->name, (dev->flags & IFF_PROMISC),
8491 (old_flags & IFF_PROMISC),
8492 from_kuid(&init_user_ns, audit_get_loginuid(current)),
8493 from_kuid(&init_user_ns, uid),
8494 from_kgid(&init_user_ns, gid),
8495 audit_get_sessionid(current));
8496 }
8497
8498 dev_change_rx_flags(dev, IFF_PROMISC);
8499 }
8500 if (notify)
8501 __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
8502 return 0;
8503}
8504
8505/**
8506 * dev_set_promiscuity - update promiscuity count on a device
8507 * @dev: device
8508 * @inc: modifier
8509 *
8510 * Add or remove promiscuity from a device. While the count in the device
8511 * remains above zero the interface remains promiscuous. Once it hits zero
8512 * the device reverts to normal filtering operation. A negative inc
8513 * value is used to drop promiscuity on the device.
8514 * Return 0 if successful or a negative errno code on error.
8515 */
8516int dev_set_promiscuity(struct net_device *dev, int inc)
8517{
8518 unsigned int old_flags = dev->flags;
8519 int err;
8520
8521 err = __dev_set_promiscuity(dev, inc, true);
8522 if (err < 0)
8523 return err;
8524 if (dev->flags != old_flags)
8525 dev_set_rx_mode(dev);
8526 return err;
8527}
8528EXPORT_SYMBOL(dev_set_promiscuity);
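
/*
 * Usage sketch (illustrative): callers that need promiscuous reception,
 * e.g. a packet-capture path, bump the count while active and drop it
 * again with a matching negative increment, both under RTNL.
 * dev_set_allmulti() below follows the same counting scheme.
 *
 *	err = dev_set_promiscuity(dev, 1);
 *	if (err)
 *		return err;
 *	...
 *	dev_set_promiscuity(dev, -1);
 */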
8529
8530static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8531{
8532 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8533
8534 ASSERT_RTNL();
8535
8536 dev->flags |= IFF_ALLMULTI;
8537 dev->allmulti += inc;
8538 if (dev->allmulti == 0) {
8539 /*
8540 * Avoid overflow.
8541 * If inc causes overflow, leave allmulti untouched and return an error.
8542 */
8543 if (inc < 0)
8544 dev->flags &= ~IFF_ALLMULTI;
8545 else {
8546 dev->allmulti -= inc;
8547 netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
8548 return -EOVERFLOW;
8549 }
8550 }
8551 if (dev->flags ^ old_flags) {
8552 netdev_info(dev, "%s allmulticast mode\n",
8553 dev->flags & IFF_ALLMULTI ? "entered" : "left");
8554 dev_change_rx_flags(dev, IFF_ALLMULTI);
8555 dev_set_rx_mode(dev);
8556 if (notify)
8557 __dev_notify_flags(dev, old_flags,
8558 dev->gflags ^ old_gflags, 0, NULL);
8559 }
8560 return 0;
8561}
8562
8563/**
8564 * dev_set_allmulti - update allmulti count on a device
8565 * @dev: device
8566 * @inc: modifier
8567 *
8568 * Add or remove reception of all multicast frames on a device. While the
8569 * count in the device remains above zero the interface remains listening
8570 * to all multicast frames. Once it hits zero the device reverts to normal
8571 * filtering operation. A negative @inc value is used to drop the counter
8572 * when releasing a resource needing all multicasts.
8573 * Return 0 if successful or a negative errno code on error.
8574 */
8575
8576int dev_set_allmulti(struct net_device *dev, int inc)
8577{
8578 return __dev_set_allmulti(dev, inc, true);
8579}
8580EXPORT_SYMBOL(dev_set_allmulti);
8581
8582/*
8583 * Upload unicast and multicast address lists to device and
8584 * configure RX filtering. When the device doesn't support unicast
8585 * filtering it is put in promiscuous mode while unicast addresses
8586 * are present.
8587 */
8588void __dev_set_rx_mode(struct net_device *dev)
8589{
8590 const struct net_device_ops *ops = dev->netdev_ops;
8591
8592 /* dev_open will call this function so the list will stay sane. */
8593 if (!(dev->flags&IFF_UP))
8594 return;
8595
8596 if (!netif_device_present(dev))
8597 return;
8598
8599 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8600 /* Unicast address changes may only happen under the rtnl,
8601 * therefore calling __dev_set_promiscuity here is safe.
8602 */
8603 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8604 __dev_set_promiscuity(dev, 1, false);
8605 dev->uc_promisc = true;
8606 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8607 __dev_set_promiscuity(dev, -1, false);
8608 dev->uc_promisc = false;
8609 }
8610 }
8611
8612 if (ops->ndo_set_rx_mode)
8613 ops->ndo_set_rx_mode(dev);
8614}
8615
8616void dev_set_rx_mode(struct net_device *dev)
8617{
8618 netif_addr_lock_bh(dev);
8619 __dev_set_rx_mode(dev);
8620 netif_addr_unlock_bh(dev);
8621}
8622
8623/**
8624 * dev_get_flags - get flags reported to userspace
8625 * @dev: device
8626 *
8627 * Get the combination of flag bits exported through APIs to userspace.
8628 */
8629unsigned int dev_get_flags(const struct net_device *dev)
8630{
8631 unsigned int flags;
8632
8633 flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
8634 IFF_ALLMULTI |
8635 IFF_RUNNING |
8636 IFF_LOWER_UP |
8637 IFF_DORMANT)) |
8638 (READ_ONCE(dev->gflags) & (IFF_PROMISC |
8639 IFF_ALLMULTI));
8640
8641 if (netif_running(dev)) {
8642 if (netif_oper_up(dev))
8643 flags |= IFF_RUNNING;
8644 if (netif_carrier_ok(dev))
8645 flags |= IFF_LOWER_UP;
8646 if (netif_dormant(dev))
8647 flags |= IFF_DORMANT;
8648 }
8649
8650 return flags;
8651}
8652EXPORT_SYMBOL(dev_get_flags);
8653
8654int __dev_change_flags(struct net_device *dev, unsigned int flags,
8655 struct netlink_ext_ack *extack)
8656{
8657 unsigned int old_flags = dev->flags;
8658 int ret;
8659
8660 ASSERT_RTNL();
8661
8662 /*
8663 * Set the flags on our device.
8664 */
8665
8666 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8667 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8668 IFF_AUTOMEDIA)) |
8669 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8670 IFF_ALLMULTI));
8671
8672 /*
8673 * Load in the correct multicast list now the flags have changed.
8674 */
8675
8676 if ((old_flags ^ flags) & IFF_MULTICAST)
8677 dev_change_rx_flags(dev, IFF_MULTICAST);
8678
8679 dev_set_rx_mode(dev);
8680
8681 /*
8682 * Have we downed the interface? We handle IFF_UP ourselves
8683 * according to user attempts to set it, rather than blindly
8684 * setting it.
8685 */
8686
8687 ret = 0;
8688 if ((old_flags ^ flags) & IFF_UP) {
8689 if (old_flags & IFF_UP)
8690 __dev_close(dev);
8691 else
8692 ret = __dev_open(dev, extack);
8693 }
8694
8695 if ((flags ^ dev->gflags) & IFF_PROMISC) {
8696 int inc = (flags & IFF_PROMISC) ? 1 : -1;
8697 unsigned int old_flags = dev->flags;
8698
8699 dev->gflags ^= IFF_PROMISC;
8700
8701 if (__dev_set_promiscuity(dev, inc, false) >= 0)
8702 if (dev->flags != old_flags)
8703 dev_set_rx_mode(dev);
8704 }
8705
8706 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8707 * is important. Some (broken) drivers set IFF_PROMISC when
8708 * IFF_ALLMULTI is requested, without asking us and without reporting it.
8709 */
8710 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8711 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8712
8713 dev->gflags ^= IFF_ALLMULTI;
8714 __dev_set_allmulti(dev, inc, false);
8715 }
8716
8717 return ret;
8718}
8719
8720void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8721 unsigned int gchanges, u32 portid,
8722 const struct nlmsghdr *nlh)
8723{
8724 unsigned int changes = dev->flags ^ old_flags;
8725
8726 if (gchanges)
8727 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
8728
8729 if (changes & IFF_UP) {
8730 if (dev->flags & IFF_UP)
8731 call_netdevice_notifiers(NETDEV_UP, dev);
8732 else
8733 call_netdevice_notifiers(NETDEV_DOWN, dev);
8734 }
8735
8736 if (dev->flags & IFF_UP &&
8737 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8738 struct netdev_notifier_change_info change_info = {
8739 .info = {
8740 .dev = dev,
8741 },
8742 .flags_changed = changes,
8743 };
8744
8745 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8746 }
8747}
8748
8749/**
8750 * dev_change_flags - change device settings
8751 * @dev: device
8752 * @flags: device state flags
8753 * @extack: netlink extended ack
8754 *
8755 * Change settings on a device based on state flags. The flags are
8756 * in the userspace-exported format.
8757 */
8758int dev_change_flags(struct net_device *dev, unsigned int flags,
8759 struct netlink_ext_ack *extack)
8760{
8761 int ret;
8762 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8763
8764 ret = __dev_change_flags(dev, flags, extack);
8765 if (ret < 0)
8766 return ret;
8767
8768 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8769 __dev_notify_flags(dev, old_flags, changes, 0, NULL);
8770 return ret;
8771}
8772EXPORT_SYMBOL(dev_change_flags);
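
/*
 * Usage sketch (illustrative): in-kernel callers bring an interface up or
 * down by editing the userspace-visible flags under RTNL, reading the
 * current flags first so unrelated bits are preserved.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP, NULL);
 *	rtnl_unlock();
 */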
8773
8774int __dev_set_mtu(struct net_device *dev, int new_mtu)
8775{
8776 const struct net_device_ops *ops = dev->netdev_ops;
8777
8778 if (ops->ndo_change_mtu)
8779 return ops->ndo_change_mtu(dev, new_mtu);
8780
8781 /* Pairs with all the lockless reads of dev->mtu in the stack */
8782 WRITE_ONCE(dev->mtu, new_mtu);
8783 return 0;
8784}
8785EXPORT_SYMBOL(__dev_set_mtu);
8786
8787int dev_validate_mtu(struct net_device *dev, int new_mtu,
8788 struct netlink_ext_ack *extack)
8789{
8790 /* MTU must be non-negative and within the device's supported range */
8791 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8792 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8793 return -EINVAL;
8794 }
8795
8796 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8797 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8798 return -EINVAL;
8799 }
8800 return 0;
8801}
8802
8803/**
8804 * dev_set_mtu_ext - Change maximum transfer unit
8805 * @dev: device
8806 * @new_mtu: new transfer unit
8807 * @extack: netlink extended ack
8808 *
8809 * Change the maximum transfer size of the network device.
8810 */
8811int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8812 struct netlink_ext_ack *extack)
8813{
8814 int err, orig_mtu;
8815
8816 if (new_mtu == dev->mtu)
8817 return 0;
8818
8819 err = dev_validate_mtu(dev, new_mtu, extack);
8820 if (err)
8821 return err;
8822
8823 if (!netif_device_present(dev))
8824 return -ENODEV;
8825
8826 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8827 err = notifier_to_errno(err);
8828 if (err)
8829 return err;
8830
8831 orig_mtu = dev->mtu;
8832 err = __dev_set_mtu(dev, new_mtu);
8833
8834 if (!err) {
8835 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8836 orig_mtu);
8837 err = notifier_to_errno(err);
8838 if (err) {
8839 /* setting mtu back and notifying everyone again,
8840 * so that they have a chance to revert changes.
8841 */
8842 __dev_set_mtu(dev, orig_mtu);
8843 call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8844 new_mtu);
8845 }
8846 }
8847 return err;
8848}
8849
8850int dev_set_mtu(struct net_device *dev, int new_mtu)
8851{
8852 struct netlink_ext_ack extack;
8853 int err;
8854
8855 memset(&extack, 0, sizeof(extack));
8856 err = dev_set_mtu_ext(dev, new_mtu, &extack);
8857 if (err && extack._msg)
8858 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8859 return err;
8860}
8861EXPORT_SYMBOL(dev_set_mtu);
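
/*
 * Usage sketch (illustrative): tunnel and encapsulation drivers commonly
 * derive their MTU from a lower device minus their header overhead;
 * "headroom" below is a placeholder for that overhead.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, lower_dev->mtu - headroom);
 *	rtnl_unlock();
 *	if (err)
 *		netdev_warn(dev, "failed to set MTU: %d\n", err);
 */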
8862
8863/**
8864 * dev_change_tx_queue_len - Change TX queue length of a netdevice
8865 * @dev: device
8866 * @new_len: new tx queue length
8867 */
8868int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8869{
8870 unsigned int orig_len = dev->tx_queue_len;
8871 int res;
8872
8873 if (new_len != (unsigned int)new_len)
8874 return -ERANGE;
8875
8876 if (new_len != orig_len) {
8877 dev->tx_queue_len = new_len;
8878 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8879 res = notifier_to_errno(res);
8880 if (res)
8881 goto err_rollback;
8882 res = dev_qdisc_change_tx_queue_len(dev);
8883 if (res)
8884 goto err_rollback;
8885 }
8886
8887 return 0;
8888
8889err_rollback:
8890 netdev_err(dev, "refused to change device tx_queue_len\n");
8891 dev->tx_queue_len = orig_len;
8892 return res;
8893}
8894
8895/**
8896 * dev_set_group - Change group this device belongs to
8897 * @dev: device
8898 * @new_group: group this device should belong to
8899 */
8900void dev_set_group(struct net_device *dev, int new_group)
8901{
8902 dev->group = new_group;
8903}
8904
8905/**
8906 * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8907 * @dev: device
8908 * @addr: new address
8909 * @extack: netlink extended ack
8910 */
8911int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8912 struct netlink_ext_ack *extack)
8913{
8914 struct netdev_notifier_pre_changeaddr_info info = {
8915 .info.dev = dev,
8916 .info.extack = extack,
8917 .dev_addr = addr,
8918 };
8919 int rc;
8920
8921 rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8922 return notifier_to_errno(rc);
8923}
8924EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8925
8926/**
8927 * dev_set_mac_address - Change Media Access Control Address
8928 * @dev: device
8929 * @sa: new address
8930 * @extack: netlink extended ack
8931 *
8932 * Change the hardware (MAC) address of the device
8933 */
8934int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8935 struct netlink_ext_ack *extack)
8936{
8937 const struct net_device_ops *ops = dev->netdev_ops;
8938 int err;
8939
8940 if (!ops->ndo_set_mac_address)
8941 return -EOPNOTSUPP;
8942 if (sa->sa_family != dev->type)
8943 return -EINVAL;
8944 if (!netif_device_present(dev))
8945 return -ENODEV;
8946 err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8947 if (err)
8948 return err;
8949 if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
8950 err = ops->ndo_set_mac_address(dev, sa);
8951 if (err)
8952 return err;
8953 }
8954 dev->addr_assign_type = NET_ADDR_SET;
8955 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8956 add_device_randomness(dev->dev_addr, dev->addr_len);
8957 return 0;
8958}
8959EXPORT_SYMBOL(dev_set_mac_address);
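
/*
 * Usage sketch (illustrative): an in-kernel caller fills a struct sockaddr
 * with the device type and the new hardware address before calling in
 * under RTNL; new_addr is a placeholder buffer of dev->addr_len bytes.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	ASSERT_RTNL();
 *	err = dev_set_mac_address(dev, &sa, NULL);
 */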
8960
8961DECLARE_RWSEM(dev_addr_sem);
8962
8963int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8964 struct netlink_ext_ack *extack)
8965{
8966 int ret;
8967
8968 down_write(&dev_addr_sem);
8969 ret = dev_set_mac_address(dev, sa, extack);
8970 up_write(&dev_addr_sem);
8971 return ret;
8972}
8973EXPORT_SYMBOL(dev_set_mac_address_user);
8974
8975int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8976{
8977 size_t size = sizeof(sa->sa_data_min);
8978 struct net_device *dev;
8979 int ret = 0;
8980
8981 down_read(&dev_addr_sem);
8982 rcu_read_lock();
8983
8984 dev = dev_get_by_name_rcu(net, dev_name);
8985 if (!dev) {
8986 ret = -ENODEV;
8987 goto unlock;
8988 }
8989 if (!dev->addr_len)
8990 memset(sa->sa_data, 0, size);
8991 else
8992 memcpy(sa->sa_data, dev->dev_addr,
8993 min_t(size_t, size, dev->addr_len));
8994 sa->sa_family = dev->type;
8995
8996unlock:
8997 rcu_read_unlock();
8998 up_read(&dev_addr_sem);
8999 return ret;
9000}
9001EXPORT_SYMBOL(dev_get_mac_address);
9002
9003/**
9004 * dev_change_carrier - Change device carrier
9005 * @dev: device
9006 * @new_carrier: new value
9007 *
9008 * Change device carrier
9009 */
9010int dev_change_carrier(struct net_device *dev, bool new_carrier)
9011{
9012 const struct net_device_ops *ops = dev->netdev_ops;
9013
9014 if (!ops->ndo_change_carrier)
9015 return -EOPNOTSUPP;
9016 if (!netif_device_present(dev))
9017 return -ENODEV;
9018 return ops->ndo_change_carrier(dev, new_carrier);
9019}
9020
9021/**
9022 * dev_get_phys_port_id - Get device physical port ID
9023 * @dev: device
9024 * @ppid: port ID
9025 *
9026 * Get device physical port ID
9027 */
9028int dev_get_phys_port_id(struct net_device *dev,
9029 struct netdev_phys_item_id *ppid)
9030{
9031 const struct net_device_ops *ops = dev->netdev_ops;
9032
9033 if (!ops->ndo_get_phys_port_id)
9034 return -EOPNOTSUPP;
9035 return ops->ndo_get_phys_port_id(dev, ppid);
9036}
9037
9038/**
9039 * dev_get_phys_port_name - Get device physical port name
9040 * @dev: device
9041 * @name: port name
9042 * @len: limit of bytes to copy to name
9043 *
9044 * Get device physical port name
9045 */
9046int dev_get_phys_port_name(struct net_device *dev,
9047 char *name, size_t len)
9048{
9049 const struct net_device_ops *ops = dev->netdev_ops;
9050 int err;
9051
9052 if (ops->ndo_get_phys_port_name) {
9053 err = ops->ndo_get_phys_port_name(dev, name, len);
9054 if (err != -EOPNOTSUPP)
9055 return err;
9056 }
9057 return devlink_compat_phys_port_name_get(dev, name, len);
9058}
9059
9060/**
9061 * dev_get_port_parent_id - Get the device's port parent identifier
9062 * @dev: network device
9063 * @ppid: pointer to a storage for the port's parent identifier
9064 * @recurse: allow/disallow recursion to lower devices
9065 *
9066 * Get the device's port parent identifier
9067 */
9068int dev_get_port_parent_id(struct net_device *dev,
9069 struct netdev_phys_item_id *ppid,
9070 bool recurse)
9071{
9072 const struct net_device_ops *ops = dev->netdev_ops;
9073 struct netdev_phys_item_id first = { };
9074 struct net_device *lower_dev;
9075 struct list_head *iter;
9076 int err;
9077
9078 if (ops->ndo_get_port_parent_id) {
9079 err = ops->ndo_get_port_parent_id(dev, ppid);
9080 if (err != -EOPNOTSUPP)
9081 return err;
9082 }
9083
9084 err = devlink_compat_switch_id_get(dev, ppid);
9085 if (!recurse || err != -EOPNOTSUPP)
9086 return err;
9087
9088 netdev_for_each_lower_dev(dev, lower_dev, iter) {
9089 err = dev_get_port_parent_id(lower_dev, ppid, true);
9090 if (err)
9091 break;
9092 if (!first.id_len)
9093 first = *ppid;
9094 else if (memcmp(&first, ppid, sizeof(*ppid)))
9095 return -EOPNOTSUPP;
9096 }
9097
9098 return err;
9099}
9100EXPORT_SYMBOL(dev_get_port_parent_id);
9101
9102/**
9103 * netdev_port_same_parent_id - Indicate if two network devices have
9104 * the same port parent identifier
9105 * @a: first network device
9106 * @b: second network device
9107 */
9108bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
9109{
9110 struct netdev_phys_item_id a_id = { };
9111 struct netdev_phys_item_id b_id = { };
9112
9113 if (dev_get_port_parent_id(a, &a_id, true) ||
9114 dev_get_port_parent_id(b, &b_id, true))
9115 return false;
9116
9117 return netdev_phys_item_id_same(&a_id, &b_id);
9118}
9119EXPORT_SYMBOL(netdev_port_same_parent_id);
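
/*
 * Usage sketch (illustrative): switchdev-style code uses the port parent
 * ID to tell whether two ports sit on the same switch ASIC, e.g. when
 * deciding whether bridged traffic between them can be offloaded:
 *
 *	bool same_hw = netdev_port_same_parent_id(port_dev_a, port_dev_b);
 *
 * Forwarding state may be shared in hardware only when same_hw is true;
 * otherwise the ports fall back to software forwarding.
 */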
9120
9121/**
9122 * dev_change_proto_down - set carrier according to proto_down.
9123 *
9124 * @dev: device
9125 * @proto_down: new value
9126 */
9127int dev_change_proto_down(struct net_device *dev, bool proto_down)
9128{
9129 if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
9130 return -EOPNOTSUPP;
9131 if (!netif_device_present(dev))
9132 return -ENODEV;
9133 if (proto_down)
9134 netif_carrier_off(dev);
9135 else
9136 netif_carrier_on(dev);
9137 dev->proto_down = proto_down;
9138 return 0;
9139}
9140
9141/**
9142 * dev_change_proto_down_reason - update the proto_down reason bits
9143 *
9144 * @dev: device
9145 * @mask: bit mask selecting which reason bits to update; 0 replaces them all
9146 * @value: new value for the selected reason bits
9147 */
9148void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
9149 u32 value)
9150{
9151 int b;
9152
9153 if (!mask) {
9154 dev->proto_down_reason = value;
9155 } else {
9156 for_each_set_bit(b, &mask, 32) {
9157 if (value & (1 << b))
9158 dev->proto_down_reason |= BIT(b);
9159 else
9160 dev->proto_down_reason &= ~BIT(b);
9161 }
9162 }
9163}
9164
9165struct bpf_xdp_link {
9166 struct bpf_link link;
9167 struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9168 int flags;
9169};
9170
9171static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9172{
9173 if (flags & XDP_FLAGS_HW_MODE)
9174 return XDP_MODE_HW;
9175 if (flags & XDP_FLAGS_DRV_MODE)
9176 return XDP_MODE_DRV;
9177 if (flags & XDP_FLAGS_SKB_MODE)
9178 return XDP_MODE_SKB;
9179 return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9180}
9181
9182static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9183{
9184 switch (mode) {
9185 case XDP_MODE_SKB:
9186 return generic_xdp_install;
9187 case XDP_MODE_DRV:
9188 case XDP_MODE_HW:
9189 return dev->netdev_ops->ndo_bpf;
9190 default:
9191 return NULL;
9192 }
9193}
9194
9195static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9196 enum bpf_xdp_mode mode)
9197{
9198 return dev->xdp_state[mode].link;
9199}
9200
9201static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9202 enum bpf_xdp_mode mode)
9203{
9204 struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9205
9206 if (link)
9207 return link->link.prog;
9208 return dev->xdp_state[mode].prog;
9209}
9210
9211u8 dev_xdp_prog_count(struct net_device *dev)
9212{
9213 u8 count = 0;
9214 int i;
9215
9216 for (i = 0; i < __MAX_XDP_MODE; i++)
9217 if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9218 count++;
9219 return count;
9220}
9221EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
9222
9223u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9224{
9225 struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9226
9227 return prog ? prog->aux->id : 0;
9228}
9229
9230static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9231 struct bpf_xdp_link *link)
9232{
9233 dev->xdp_state[mode].link = link;
9234 dev->xdp_state[mode].prog = NULL;
9235}
9236
9237static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9238 struct bpf_prog *prog)
9239{
9240 dev->xdp_state[mode].link = NULL;
9241 dev->xdp_state[mode].prog = prog;
9242}
9243
9244static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9245 bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9246 u32 flags, struct bpf_prog *prog)
9247{
9248 struct netdev_bpf xdp;
9249 int err;
9250
9251 memset(&xdp, 0, sizeof(xdp));
9252 xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9253 xdp.extack = extack;
9254 xdp.flags = flags;
9255 xdp.prog = prog;
9256
9257 /* Drivers assume refcnt is already incremented (i.e., prog pointer is
9258 * "moved" into driver), so they don't increment it on their own, but
9259 * they do decrement refcnt when program is detached or replaced.
9260 * Given net_device also owns link/prog, we need to bump refcnt here
9261 * to prevent drivers from underflowing it.
9262 */
9263 if (prog)
9264 bpf_prog_inc(prog);
9265 err = bpf_op(dev, &xdp);
9266 if (err) {
9267 if (prog)
9268 bpf_prog_put(prog);
9269 return err;
9270 }
9271
9272 if (mode != XDP_MODE_HW)
9273 bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9274
9275 return 0;
9276}
9277
9278static void dev_xdp_uninstall(struct net_device *dev)
9279{
9280 struct bpf_xdp_link *link;
9281 struct bpf_prog *prog;
9282 enum bpf_xdp_mode mode;
9283 bpf_op_t bpf_op;
9284
9285 ASSERT_RTNL();
9286
9287 for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9288 prog = dev_xdp_prog(dev, mode);
9289 if (!prog)
9290 continue;
9291
9292 bpf_op = dev_xdp_bpf_op(dev, mode);
9293 if (!bpf_op)
9294 continue;
9295
9296 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9297
9298 /* auto-detach link from net device */
9299 link = dev_xdp_link(dev, mode);
9300 if (link)
9301 link->dev = NULL;
9302 else
9303 bpf_prog_put(prog);
9304
9305 dev_xdp_set_link(dev, mode, NULL);
9306 }
9307}
9308
9309static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9310 struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9311 struct bpf_prog *old_prog, u32 flags)
9312{
9313 unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9314 struct bpf_prog *cur_prog;
9315 struct net_device *upper;
9316 struct list_head *iter;
9317 enum bpf_xdp_mode mode;
9318 bpf_op_t bpf_op;
9319 int err;
9320
9321 ASSERT_RTNL();
9322
9323 /* either link or prog attachment, never both */
9324 if (link && (new_prog || old_prog))
9325 return -EINVAL;
9326 /* link supports only XDP mode flags */
9327 if (link && (flags & ~XDP_FLAGS_MODES)) {
9328 NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9329 return -EINVAL;
9330 }
9331 /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9332 if (num_modes > 1) {
9333 NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9334 return -EINVAL;
9335 }
9336 /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9337 if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9338 NL_SET_ERR_MSG(extack,
9339 "More than one program loaded, unset mode is ambiguous");
9340 return -EINVAL;
9341 }
9342 /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9343 if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9344 NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9345 return -EINVAL;
9346 }
9347
9348 mode = dev_xdp_mode(dev, flags);
9349 /* can't replace attached link */
9350 if (dev_xdp_link(dev, mode)) {
9351 NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9352 return -EBUSY;
9353 }
9354
9355 /* don't allow if an upper device already has a program */
9356 netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9357 if (dev_xdp_prog_count(upper) > 0) {
9358 NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9359 return -EEXIST;
9360 }
9361 }
9362
9363 cur_prog = dev_xdp_prog(dev, mode);
9364 /* can't replace attached prog with link */
9365 if (link && cur_prog) {
9366 NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9367 return -EBUSY;
9368 }
9369 if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9370 NL_SET_ERR_MSG(extack, "Active program does not match expected");
9371 return -EEXIST;
9372 }
9373
9374 /* put effective new program into new_prog */
9375 if (link)
9376 new_prog = link->link.prog;
9377
9378 if (new_prog) {
9379 bool offload = mode == XDP_MODE_HW;
9380 enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9381 ? XDP_MODE_DRV : XDP_MODE_SKB;
9382
9383 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9384 NL_SET_ERR_MSG(extack, "XDP program already attached");
9385 return -EBUSY;
9386 }
9387 if (!offload && dev_xdp_prog(dev, other_mode)) {
9388 NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9389 return -EEXIST;
9390 }
9391 if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
9392 NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
9393 return -EINVAL;
9394 }
9395 if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
9396 NL_SET_ERR_MSG(extack, "Program bound to different device");
9397 return -EINVAL;
9398 }
9399 if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9400 NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9401 return -EINVAL;
9402 }
9403 if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9404 NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9405 return -EINVAL;
9406 }
9407 }
9408
9409 /* don't call drivers if the effective program didn't change */
9410 if (new_prog != cur_prog) {
9411 bpf_op = dev_xdp_bpf_op(dev, mode);
9412 if (!bpf_op) {
9413 NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9414 return -EOPNOTSUPP;
9415 }
9416
9417 err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9418 if (err)
9419 return err;
9420 }
9421
9422 if (link)
9423 dev_xdp_set_link(dev, mode, link);
9424 else
9425 dev_xdp_set_prog(dev, mode, new_prog);
9426 if (cur_prog)
9427 bpf_prog_put(cur_prog);
9428
9429 return 0;
9430}
9431
9432static int dev_xdp_attach_link(struct net_device *dev,
9433 struct netlink_ext_ack *extack,
9434 struct bpf_xdp_link *link)
9435{
9436 return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9437}
9438
9439static int dev_xdp_detach_link(struct net_device *dev,
9440 struct netlink_ext_ack *extack,
9441 struct bpf_xdp_link *link)
9442{
9443 enum bpf_xdp_mode mode;
9444 bpf_op_t bpf_op;
9445
9446 ASSERT_RTNL();
9447
9448 mode = dev_xdp_mode(dev, link->flags);
9449 if (dev_xdp_link(dev, mode) != link)
9450 return -EINVAL;
9451
9452 bpf_op = dev_xdp_bpf_op(dev, mode);
9453 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9454 dev_xdp_set_link(dev, mode, NULL);
9455 return 0;
9456}
9457
9458static void bpf_xdp_link_release(struct bpf_link *link)
9459{
9460 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9461
9462 rtnl_lock();
9463
9464 /* if racing with net_device's tear down, xdp_link->dev might already
9465 * be NULL, in which case the link was already auto-detached
9466 */
9467 if (xdp_link->dev) {
9468 WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9469 xdp_link->dev = NULL;
9470 }
9471
9472 rtnl_unlock();
9473}
9474
9475static int bpf_xdp_link_detach(struct bpf_link *link)
9476{
9477 bpf_xdp_link_release(link);
9478 return 0;
9479}
9480
9481static void bpf_xdp_link_dealloc(struct bpf_link *link)
9482{
9483 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9484
9485 kfree(xdp_link);
9486}
9487
9488static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9489 struct seq_file *seq)
9490{
9491 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9492 u32 ifindex = 0;
9493
9494 rtnl_lock();
9495 if (xdp_link->dev)
9496 ifindex = xdp_link->dev->ifindex;
9497 rtnl_unlock();
9498
9499 seq_printf(seq, "ifindex:\t%u\n", ifindex);
9500}
9501
9502static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9503 struct bpf_link_info *info)
9504{
9505 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9506 u32 ifindex = 0;
9507
9508 rtnl_lock();
9509 if (xdp_link->dev)
9510 ifindex = xdp_link->dev->ifindex;
9511 rtnl_unlock();
9512
9513 info->xdp.ifindex = ifindex;
9514 return 0;
9515}
9516
9517static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9518 struct bpf_prog *old_prog)
9519{
9520 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9521 enum bpf_xdp_mode mode;
9522 bpf_op_t bpf_op;
9523 int err = 0;
9524
9525 rtnl_lock();
9526
9527 /* link might have been auto-released already, so fail */
9528 if (!xdp_link->dev) {
9529 err = -ENOLINK;
9530 goto out_unlock;
9531 }
9532
9533 if (old_prog && link->prog != old_prog) {
9534 err = -EPERM;
9535 goto out_unlock;
9536 }
9537 old_prog = link->prog;
9538 if (old_prog->type != new_prog->type ||
9539 old_prog->expected_attach_type != new_prog->expected_attach_type) {
9540 err = -EINVAL;
9541 goto out_unlock;
9542 }
9543
9544 if (old_prog == new_prog) {
9545 /* no-op, don't disturb drivers */
9546 bpf_prog_put(new_prog);
9547 goto out_unlock;
9548 }
9549
9550 mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9551 bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9552 err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9553 xdp_link->flags, new_prog);
9554 if (err)
9555 goto out_unlock;
9556
9557 old_prog = xchg(&link->prog, new_prog);
9558 bpf_prog_put(old_prog);
9559
9560out_unlock:
9561 rtnl_unlock();
9562 return err;
9563}
9564
9565static const struct bpf_link_ops bpf_xdp_link_lops = {
9566 .release = bpf_xdp_link_release,
9567 .dealloc = bpf_xdp_link_dealloc,
9568 .detach = bpf_xdp_link_detach,
9569 .show_fdinfo = bpf_xdp_link_show_fdinfo,
9570 .fill_link_info = bpf_xdp_link_fill_link_info,
9571 .update_prog = bpf_xdp_link_update,
9572};
9573
9574int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9575{
9576 struct net *net = current->nsproxy->net_ns;
9577 struct bpf_link_primer link_primer;
9578 struct netlink_ext_ack extack = {};
9579 struct bpf_xdp_link *link;
9580 struct net_device *dev;
9581 int err, fd;
9582
9583 rtnl_lock();
9584 dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9585 if (!dev) {
9586 rtnl_unlock();
9587 return -EINVAL;
9588 }
9589
9590 link = kzalloc(sizeof(*link), GFP_USER);
9591 if (!link) {
9592 err = -ENOMEM;
9593 goto unlock;
9594 }
9595
9596 bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9597 link->dev = dev;
9598 link->flags = attr->link_create.flags;
9599
9600 err = bpf_link_prime(&link->link, &link_primer);
9601 if (err) {
9602 kfree(link);
9603 goto unlock;
9604 }
9605
9606 err = dev_xdp_attach_link(dev, &extack, link);
9607 rtnl_unlock();
9608
9609 if (err) {
9610 link->dev = NULL;
9611 bpf_link_cleanup(&link_primer);
9612 trace_bpf_xdp_link_attach_failed(extack._msg);
9613 goto out_put_dev;
9614 }
9615
9616 fd = bpf_link_settle(&link_primer);
9617 /* the link itself doesn't hold dev's refcnt, to avoid complicating shutdown */
9618 dev_put(dev);
9619 return fd;
9620
9621unlock:
9622 rtnl_unlock();
9623
9624out_put_dev:
9625 dev_put(dev);
9626 return err;
9627}
9628
9629/**
9630 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
9631 * @dev: device
9632 * @extack: netlink extended ack
9633 * @fd: new program fd or negative value to clear
9634 * @expected_fd: old program fd that userspace expects to replace or clear
9635 * @flags: xdp-related flags
9636 *
9637 * Set or clear a bpf program for a device
9638 */
9639int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9640 int fd, int expected_fd, u32 flags)
9641{
9642 enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9643 struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9644 int err;
9645
9646 ASSERT_RTNL();
9647
9648 if (fd >= 0) {
9649 new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9650 mode != XDP_MODE_SKB);
9651 if (IS_ERR(new_prog))
9652 return PTR_ERR(new_prog);
9653 }
9654
9655 if (expected_fd >= 0) {
9656 old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9657 mode != XDP_MODE_SKB);
9658 if (IS_ERR(old_prog)) {
9659 err = PTR_ERR(old_prog);
9660 old_prog = NULL;
9661 goto err_out;
9662 }
9663 }
9664
9665 err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9666
9667err_out:
9668 if (err && new_prog)
9669 bpf_prog_put(new_prog);
9670 if (old_prog)
9671 bpf_prog_put(old_prog);
9672 return err;
9673}
9674
9675/**
9676 * dev_index_reserve() - allocate an ifindex in a namespace
9677 * @net: the applicable net namespace
9678 * @ifindex: requested ifindex, pass %0 to get one allocated
9679 *
9680 * Allocate an ifindex for a new device. Caller must either use the ifindex
9681 * to store the device (via list_netdevice()) or call dev_index_release()
9682 * to give the index up.
9683 *
9684 * Return: a suitable unique value for a new device interface number or -errno.
9685 */
9686static int dev_index_reserve(struct net *net, u32 ifindex)
9687{
9688 int err;
9689
9690 if (ifindex > INT_MAX) {
9691 DEBUG_NET_WARN_ON_ONCE(1);
9692 return -EINVAL;
9693 }
9694
9695 if (!ifindex)
9696 err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
9697 xa_limit_31b, &net->ifindex, GFP_KERNEL);
9698 else
9699 err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
9700 if (err < 0)
9701 return err;
9702
9703 return ifindex;
9704}
9705
9706static void dev_index_release(struct net *net, int ifindex)
9707{
9708 /* Expect only unused indexes; unlist_netdevice() removes the used ones */
9709 WARN_ON(xa_erase(&net->dev_by_index, ifindex));
9710}
9711
9712 /* Delayed registration/unregistration */
9713LIST_HEAD(net_todo_list);
9714DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9715atomic_t dev_unreg_count = ATOMIC_INIT(0);
9716
9717static void net_set_todo(struct net_device *dev)
9718{
9719 list_add_tail(&dev->todo_list, &net_todo_list);
9720}
9721
9722static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9723 struct net_device *upper, netdev_features_t features)
9724{
9725 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9726 netdev_features_t feature;
9727 int feature_bit;
9728
9729 for_each_netdev_feature(upper_disables, feature_bit) {
9730 feature = __NETIF_F_BIT(feature_bit);
9731 if (!(upper->wanted_features & feature)
9732 && (features & feature)) {
9733 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9734 &feature, upper->name);
9735 features &= ~feature;
9736 }
9737 }
9738
9739 return features;
9740}
9741
9742static void netdev_sync_lower_features(struct net_device *upper,
9743 struct net_device *lower, netdev_features_t features)
9744{
9745 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9746 netdev_features_t feature;
9747 int feature_bit;
9748
9749 for_each_netdev_feature(upper_disables, feature_bit) {
9750 feature = __NETIF_F_BIT(feature_bit);
9751 if (!(features & feature) && (lower->features & feature)) {
9752 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9753 &feature, lower->name);
9754 lower->wanted_features &= ~feature;
9755 __netdev_update_features(lower);
9756
9757 if (unlikely(lower->features & feature))
9758 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9759 &feature, lower->name);
9760 else
9761 netdev_features_change(lower);
9762 }
9763 }
9764}
9765
9766static netdev_features_t netdev_fix_features(struct net_device *dev,
9767 netdev_features_t features)
9768{
9769 /* Fix illegal checksum combinations */
9770 if ((features & NETIF_F_HW_CSUM) &&
9771 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9772 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9773 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9774 }
9775
9776 /* TSO requires that SG is present as well. */
9777 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9778 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9779 features &= ~NETIF_F_ALL_TSO;
9780 }
9781
9782 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9783 !(features & NETIF_F_IP_CSUM)) {
9784 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9785 features &= ~NETIF_F_TSO;
9786 features &= ~NETIF_F_TSO_ECN;
9787 }
9788
9789 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9790 !(features & NETIF_F_IPV6_CSUM)) {
9791 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9792 features &= ~NETIF_F_TSO6;
9793 }
9794
9795 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9796 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9797 features &= ~NETIF_F_TSO_MANGLEID;
9798
9799 /* TSO ECN requires that TSO is present as well. */
9800 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9801 features &= ~NETIF_F_TSO_ECN;
9802
9803 /* Software GSO depends on SG. */
9804 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9805 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9806 features &= ~NETIF_F_GSO;
9807 }
9808
9809 /* GSO partial features require GSO partial be set */
9810 if ((features & dev->gso_partial_features) &&
9811 !(features & NETIF_F_GSO_PARTIAL)) {
9812 netdev_dbg(dev,
9813 "Dropping partially supported GSO features since no GSO partial.\n");
9814 features &= ~dev->gso_partial_features;
9815 }
9816
9817 if (!(features & NETIF_F_RXCSUM)) {
9818 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9819 * successfully merged by hardware must also have the
9820 * checksum verified by hardware. If the user does not
9821 * want to enable RXCSUM, logically, we should disable GRO_HW.
9822 */
9823 if (features & NETIF_F_GRO_HW) {
9824 netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9825 features &= ~NETIF_F_GRO_HW;
9826 }
9827 }
9828
9829 /* LRO/HW-GRO features cannot be combined with RX-FCS */
9830 if (features & NETIF_F_RXFCS) {
9831 if (features & NETIF_F_LRO) {
9832 netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9833 features &= ~NETIF_F_LRO;
9834 }
9835
9836 if (features & NETIF_F_GRO_HW) {
9837 netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9838 features &= ~NETIF_F_GRO_HW;
9839 }
9840 }
9841
9842 if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
9843 netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
9844 features &= ~NETIF_F_LRO;
9845 }
9846
9847 if (features & NETIF_F_HW_TLS_TX) {
9848 bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
9849 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
9850 bool hw_csum = features & NETIF_F_HW_CSUM;
9851
9852 if (!ip_csum && !hw_csum) {
9853 netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
9854 features &= ~NETIF_F_HW_TLS_TX;
9855 }
9856 }
9857
9858 if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9859 netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9860 features &= ~NETIF_F_HW_TLS_RX;
9861 }
9862
9863 return features;
9864}
9865
9866int __netdev_update_features(struct net_device *dev)
9867{
9868 struct net_device *upper, *lower;
9869 netdev_features_t features;
9870 struct list_head *iter;
9871 int err = -1;
9872
9873 ASSERT_RTNL();
9874
9875 features = netdev_get_wanted_features(dev);
9876
9877 if (dev->netdev_ops->ndo_fix_features)
9878 features = dev->netdev_ops->ndo_fix_features(dev, features);
9879
9880 /* driver might be less strict about feature dependencies */
9881 features = netdev_fix_features(dev, features);
9882
9883 /* some features can't be enabled if they're off on an upper device */
9884 netdev_for_each_upper_dev_rcu(dev, upper, iter)
9885 features = netdev_sync_upper_features(dev, upper, features);
9886
9887 if (dev->features == features)
9888 goto sync_lower;
9889
9890 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9891 &dev->features, &features);
9892
9893 if (dev->netdev_ops->ndo_set_features)
9894 err = dev->netdev_ops->ndo_set_features(dev, features);
9895 else
9896 err = 0;
9897
9898 if (unlikely(err < 0)) {
9899 netdev_err(dev,
9900 "set_features() failed (%d); wanted %pNF, left %pNF\n",
9901 err, &features, &dev->features);
9902 /* return non-0 since some features might have changed and
9903 * it's better to fire a spurious notification than miss it
9904 */
9905 return -1;
9906 }
9907
9908sync_lower:
9909 /* some features must be disabled on lower devices when disabled
9910 * on an upper device (think: bonding master or bridge)
9911 */
9912 netdev_for_each_lower_dev(dev, lower, iter)
9913 netdev_sync_lower_features(dev, lower, features);
9914
9915 if (!err) {
9916 netdev_features_t diff = features ^ dev->features;
9917
9918 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9919 /* udp_tunnel_{get,drop}_rx_info both need
9920 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9921 * device, or they won't do anything.
9922 * Thus we need to update dev->features
9923 * *before* calling udp_tunnel_get_rx_info,
9924 * but *after* calling udp_tunnel_drop_rx_info.
9925 */
9926 if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9927 dev->features = features;
9928 udp_tunnel_get_rx_info(dev);
9929 } else {
9930 udp_tunnel_drop_rx_info(dev);
9931 }
9932 }
9933
9934 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9935 if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9936 dev->features = features;
9937 err |= vlan_get_rx_ctag_filter_info(dev);
9938 } else {
9939 vlan_drop_rx_ctag_filter_info(dev);
9940 }
9941 }
9942
9943 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9944 if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9945 dev->features = features;
9946 err |= vlan_get_rx_stag_filter_info(dev);
9947 } else {
9948 vlan_drop_rx_stag_filter_info(dev);
9949 }
9950 }
9951
9952 dev->features = features;
9953 }
9954
9955 return err < 0 ? 0 : 1;
9956}
9957
9958/**
9959 * netdev_update_features - recalculate device features
9960 * @dev: the device to check
9961 *
9962 * Recalculate dev->features set and send notifications if it
9963 * has changed. Should be called after driver or hardware dependent
9964 * conditions might have changed that influence the features.
9965 */
9966void netdev_update_features(struct net_device *dev)
9967{
9968 if (__netdev_update_features(dev))
9969 netdev_features_change(dev);
9970}
9971EXPORT_SYMBOL(netdev_update_features);
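
/*
 * Usage sketch (hypothetical driver, not part of this file): after a
 * firmware or configuration event that changes which offloads the
 * hardware can do, the driver adjusts hw_features and re-runs the
 * negotiation under RTNL.  "foo_priv" and "foo_fw_supports_tso" are
 * illustrative names only.
 *
 *	static void foo_handle_fw_event(struct foo_priv *priv)
 *	{
 *		struct net_device *dev = priv->netdev;
 *
 *		rtnl_lock();
 *		if (!foo_fw_supports_tso(priv))
 *			dev->hw_features &= ~NETIF_F_ALL_TSO;
 *		netdev_update_features(dev);	// notifies only if something changed
 *		rtnl_unlock();
 *	}
 */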
9972
9973/**
9974 * netdev_change_features - recalculate device features
9975 * @dev: the device to check
9976 *
9977 * Recalculate dev->features set and send notifications even
9978 * if they have not changed. Should be called instead of
9979 * netdev_update_features() if also dev->vlan_features might
9980 * have changed to allow the changes to be propagated to stacked
9981 * VLAN devices.
9982 */
9983void netdev_change_features(struct net_device *dev)
9984{
9985 __netdev_update_features(dev);
9986 netdev_features_change(dev);
9987}
9988EXPORT_SYMBOL(netdev_change_features);
9989
9990/**
9991 * netif_stacked_transfer_operstate - transfer operstate
9992 * @rootdev: the root or lower level device to transfer state from
9993 * @dev: the device to transfer operstate to
9994 *
9995 * Transfer operational state from root to device. This is normally
9996 * called when a stacking relationship exists between the root
 * device and the device (a leaf device).
9998 */
9999void netif_stacked_transfer_operstate(const struct net_device *rootdev,
10000 struct net_device *dev)
10001{
10002 if (rootdev->operstate == IF_OPER_DORMANT)
10003 netif_dormant_on(dev);
10004 else
10005 netif_dormant_off(dev);
10006
10007 if (rootdev->operstate == IF_OPER_TESTING)
10008 netif_testing_on(dev);
10009 else
10010 netif_testing_off(dev);
10011
10012 if (netif_carrier_ok(rootdev))
10013 netif_carrier_on(dev);
10014 else
10015 netif_carrier_off(dev);
10016}
10017EXPORT_SYMBOL(netif_stacked_transfer_operstate);
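
/*
 * Usage sketch (hypothetical stacked driver, not part of this file):
 * mirror the lower device's operational state into the upper device
 * from a netdevice notifier.  "foo_get_upper" is an illustrative
 * lookup helper.
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *lower = netdev_notifier_info_to_dev(ptr);
 *		struct net_device *upper = foo_get_upper(lower);
 *
 *		if (upper && event == NETDEV_CHANGE)
 *			netif_stacked_transfer_operstate(lower, upper);
 *		return NOTIFY_DONE;
 *	}
 */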
10018
10019static int netif_alloc_rx_queues(struct net_device *dev)
10020{
10021 unsigned int i, count = dev->num_rx_queues;
10022 struct netdev_rx_queue *rx;
10023 size_t sz = count * sizeof(*rx);
10024 int err = 0;
10025
10026 BUG_ON(count < 1);
10027
10028 rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10029 if (!rx)
10030 return -ENOMEM;
10031
10032 dev->_rx = rx;
10033
10034 for (i = 0; i < count; i++) {
10035 rx[i].dev = dev;
10036
10037 /* XDP RX-queue setup */
10038 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
10039 if (err < 0)
10040 goto err_rxq_info;
10041 }
10042 return 0;
10043
10044err_rxq_info:
10045 /* Rollback successful reg's and free other resources */
10046 while (i--)
10047 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10048 kvfree(dev->_rx);
10049 dev->_rx = NULL;
10050 return err;
10051}
10052
10053static void netif_free_rx_queues(struct net_device *dev)
10054{
10055 unsigned int i, count = dev->num_rx_queues;
10056
10057 /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
10058 if (!dev->_rx)
10059 return;
10060
10061 for (i = 0; i < count; i++)
10062 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10063
10064 kvfree(dev->_rx);
10065}
10066
10067static void netdev_init_one_queue(struct net_device *dev,
10068 struct netdev_queue *queue, void *_unused)
10069{
10070 /* Initialize queue lock */
10071 spin_lock_init(&queue->_xmit_lock);
10072 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10073 queue->xmit_lock_owner = -1;
10074 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10075 queue->dev = dev;
10076#ifdef CONFIG_BQL
10077 dql_init(&queue->dql, HZ);
10078#endif
10079}
10080
10081static void netif_free_tx_queues(struct net_device *dev)
10082{
10083 kvfree(dev->_tx);
10084}
10085
10086static int netif_alloc_netdev_queues(struct net_device *dev)
10087{
10088 unsigned int count = dev->num_tx_queues;
10089 struct netdev_queue *tx;
10090 size_t sz = count * sizeof(*tx);
10091
10092 if (count < 1 || count > 0xffff)
10093 return -EINVAL;
10094
10095 tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10096 if (!tx)
10097 return -ENOMEM;
10098
10099 dev->_tx = tx;
10100
10101 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10102 spin_lock_init(&dev->tx_global_lock);
10103
10104 return 0;
10105}
10106
10107void netif_tx_stop_all_queues(struct net_device *dev)
10108{
10109 unsigned int i;
10110
10111 for (i = 0; i < dev->num_tx_queues; i++) {
10112 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10113
10114 netif_tx_stop_queue(txq);
10115 }
10116}
10117EXPORT_SYMBOL(netif_tx_stop_all_queues);
10118
10119static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10120{
10121 void __percpu *v;
10122
10123 /* Drivers implementing ndo_get_peer_dev must support tstat
10124 * accounting, so that skb_do_redirect() can bump the dev's
10125 * RX stats upon network namespace switch.
10126 */
10127 if (dev->netdev_ops->ndo_get_peer_dev &&
10128 dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10129 return -EOPNOTSUPP;
10130
10131 switch (dev->pcpu_stat_type) {
10132 case NETDEV_PCPU_STAT_NONE:
10133 return 0;
10134 case NETDEV_PCPU_STAT_LSTATS:
10135 v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10136 break;
10137 case NETDEV_PCPU_STAT_TSTATS:
10138 v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10139 break;
10140 case NETDEV_PCPU_STAT_DSTATS:
10141 v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10142 break;
10143 default:
10144 return -EINVAL;
10145 }
10146
10147 return v ? 0 : -ENOMEM;
10148}
10149
10150static void netdev_do_free_pcpu_stats(struct net_device *dev)
10151{
10152 switch (dev->pcpu_stat_type) {
10153 case NETDEV_PCPU_STAT_NONE:
10154 return;
10155 case NETDEV_PCPU_STAT_LSTATS:
10156 free_percpu(dev->lstats);
10157 break;
10158 case NETDEV_PCPU_STAT_TSTATS:
10159 free_percpu(dev->tstats);
10160 break;
10161 case NETDEV_PCPU_STAT_DSTATS:
10162 free_percpu(dev->dstats);
10163 break;
10164 }
10165}
10166
10167/**
10168 * register_netdevice() - register a network device
10169 * @dev: device to register
10170 *
10171 * Take a prepared network device structure and make it externally accessible.
10172 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
10173 * Callers must hold the rtnl lock - you may want register_netdev()
10174 * instead of this.
10175 */
10176int register_netdevice(struct net_device *dev)
10177{
10178 int ret;
10179 struct net *net = dev_net(dev);
10180
10181 BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10182 NETDEV_FEATURE_COUNT);
10183 BUG_ON(dev_boot_phase);
10184 ASSERT_RTNL();
10185
10186 might_sleep();
10187
10188 /* When net_device's are persistent, this will be fatal. */
10189 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10190 BUG_ON(!net);
10191
10192 ret = ethtool_check_ops(dev->ethtool_ops);
10193 if (ret)
10194 return ret;
10195
10196 spin_lock_init(&dev->addr_list_lock);
10197 netdev_set_addr_lockdep_class(dev);
10198
10199 ret = dev_get_valid_name(net, dev, dev->name);
10200 if (ret < 0)
10201 goto out;
10202
10203 ret = -ENOMEM;
10204 dev->name_node = netdev_name_node_head_alloc(dev);
10205 if (!dev->name_node)
10206 goto out;
10207
10208 /* Init, if this function is available */
10209 if (dev->netdev_ops->ndo_init) {
10210 ret = dev->netdev_ops->ndo_init(dev);
10211 if (ret) {
10212 if (ret > 0)
10213 ret = -EIO;
10214 goto err_free_name;
10215 }
10216 }
10217
10218 if (((dev->hw_features | dev->features) &
10219 NETIF_F_HW_VLAN_CTAG_FILTER) &&
10220 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10221 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10222 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10223 ret = -EINVAL;
10224 goto err_uninit;
10225 }
10226
10227 ret = netdev_do_alloc_pcpu_stats(dev);
10228 if (ret)
10229 goto err_uninit;
10230
10231 ret = dev_index_reserve(net, dev->ifindex);
10232 if (ret < 0)
10233 goto err_free_pcpu;
10234 dev->ifindex = ret;
10235
10236 /* Transfer changeable features to wanted_features and enable
10237 * software offloads (GSO and GRO).
10238 */
10239 dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10240 dev->features |= NETIF_F_SOFT_FEATURES;
10241
10242 if (dev->udp_tunnel_nic_info) {
10243 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10244 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10245 }
10246
10247 dev->wanted_features = dev->features & dev->hw_features;
10248
10249 if (!(dev->flags & IFF_LOOPBACK))
10250 dev->hw_features |= NETIF_F_NOCACHE_COPY;
10251
10252 /* If IPv4 TCP segmentation offload is supported we should also
10253 * allow the device to enable segmenting the frame with the option
10254 * of ignoring a static IP ID value. This doesn't enable the
10255 * feature itself but allows the user to enable it later.
10256 */
10257 if (dev->hw_features & NETIF_F_TSO)
10258 dev->hw_features |= NETIF_F_TSO_MANGLEID;
10259 if (dev->vlan_features & NETIF_F_TSO)
10260 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10261 if (dev->mpls_features & NETIF_F_TSO)
10262 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10263 if (dev->hw_enc_features & NETIF_F_TSO)
10264 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10265
10266 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10267 */
10268 dev->vlan_features |= NETIF_F_HIGHDMA;
10269
10270 /* Make NETIF_F_SG inheritable to tunnel devices.
10271 */
10272 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10273
10274 /* Make NETIF_F_SG inheritable to MPLS.
10275 */
10276 dev->mpls_features |= NETIF_F_SG;
10277
10278 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10279 ret = notifier_to_errno(ret);
10280 if (ret)
10281 goto err_ifindex_release;
10282
10283 ret = netdev_register_kobject(dev);
10284
10285 WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
10286
10287 if (ret)
10288 goto err_uninit_notify;
10289
10290 __netdev_update_features(dev);
10291
10292 /*
	 * Default initial state at registration is that the
	 * device is present.
10295 */
10296
10297 set_bit(__LINK_STATE_PRESENT, &dev->state);
10298
10299 linkwatch_init_dev(dev);
10300
10301 dev_init_scheduler(dev);
10302
10303 netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10304 list_netdevice(dev);
10305
10306 add_device_randomness(dev->dev_addr, dev->addr_len);
10307
	/* If the device has a permanent hardware address, the driver should
	 * set dev_addr and addr_assign_type should be left at NET_ADDR_PERM
	 * (the default value).
10311 */
10312 if (dev->addr_assign_type == NET_ADDR_PERM)
10313 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10314
10315 /* Notify protocols, that a new device appeared. */
10316 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10317 ret = notifier_to_errno(ret);
10318 if (ret) {
10319 /* Expect explicit free_netdev() on failure */
10320 dev->needs_free_netdev = false;
10321 unregister_netdevice_queue(dev, NULL);
10322 goto out;
10323 }
10324 /*
10325 * Prevent userspace races by waiting until the network
10326 * device is fully setup before sending notifications.
10327 */
10328 if (!dev->rtnl_link_ops ||
10329 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10330 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10331
10332out:
10333 return ret;
10334
10335err_uninit_notify:
10336 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10337err_ifindex_release:
10338 dev_index_release(net, dev->ifindex);
10339err_free_pcpu:
10340 netdev_do_free_pcpu_stats(dev);
10341err_uninit:
10342 if (dev->netdev_ops->ndo_uninit)
10343 dev->netdev_ops->ndo_uninit(dev);
10344 if (dev->priv_destructor)
10345 dev->priv_destructor(dev);
10346err_free_name:
10347 netdev_name_node_free(dev->name_node);
10348 goto out;
10349}
10350EXPORT_SYMBOL(register_netdevice);
10351
10352/**
10353 * init_dummy_netdev - init a dummy network device for NAPI
10354 * @dev: device to init
10355 *
 * This takes a network device structure and initializes the minimum
 * number of fields so it can be used to schedule NAPI polls without
10358 * registering a full blown interface. This is to be used by drivers
10359 * that need to tie several hardware interfaces to a single NAPI
10360 * poll scheduler due to HW limitations.
10361 */
10362void init_dummy_netdev(struct net_device *dev)
10363{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls.
10368 */
10369 memset(dev, 0, sizeof(struct net_device));
10370
10371 /* make sure we BUG if trying to hit standard
10372 * register/unregister code path
10373 */
10374 dev->reg_state = NETREG_DUMMY;
10375
10376 /* NAPI wants this */
10377 INIT_LIST_HEAD(&dev->napi_list);
10378
10379 /* a dummy interface is started by default */
10380 set_bit(__LINK_STATE_PRESENT, &dev->state);
10381 set_bit(__LINK_STATE_START, &dev->state);
10382
10383 /* napi_busy_loop stats accounting wants this */
10384 dev_net_set(dev, &init_net);
10385
	/* Note: We don't allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' don't need to change
10388 * its refcount.
10389 */
10390}
10391EXPORT_SYMBOL_GPL(init_dummy_netdev);
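
/*
 * Usage sketch (hypothetical driver, not part of this file): a device
 * with several hardware queues but no net_device of its own can hang
 * its NAPI contexts off a dummy netdev.  "foo_hw" and "foo_poll" are
 * illustrative names.
 *
 *	struct foo_hw {
 *		struct net_device napi_dev;	// never registered
 *		struct napi_struct napi;
 *	};
 *
 *	static void foo_hw_napi_init(struct foo_hw *hw)
 *	{
 *		init_dummy_netdev(&hw->napi_dev);
 *		netif_napi_add(&hw->napi_dev, &hw->napi, foo_poll);
 *		napi_enable(&hw->napi);
 *	}
 */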
10392
10393
10394/**
10395 * register_netdev - register a network device
10396 * @dev: device to register
10397 *
10398 * Take a completed network device structure and add it to the kernel
10399 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10400 * chain. 0 is returned on success. A negative errno code is returned
10401 * on a failure to set up the device, or if the name is a duplicate.
10402 *
10403 * This is a wrapper around register_netdevice that takes the rtnl semaphore
10404 * and expands the device name if you passed a format string to
10405 * alloc_netdev.
10406 */
10407int register_netdev(struct net_device *dev)
10408{
10409 int err;
10410
10411 if (rtnl_lock_killable())
10412 return -EINTR;
10413 err = register_netdevice(dev);
10414 rtnl_unlock();
10415 return err;
10416}
10417EXPORT_SYMBOL(register_netdev);
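
/*
 * Usage sketch (hypothetical Ethernet driver probe path, not part of
 * this file): allocate, fill in and register a device, unwinding with
 * free_netdev() on failure.  "foo_probe", "foo_priv" and
 * "foo_netdev_ops" are illustrative names.
 *
 *	static int foo_probe(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct foo_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *		dev->netdev_ops = &foo_netdev_ops;
 *		SET_NETDEV_DEV(dev, &pdev->dev);
 *
 *		err = register_netdev(dev);	// takes and releases RTNL
 *		if (err) {
 *			free_netdev(dev);
 *			return err;
 *		}
 *		return 0;
 *	}
 */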
10418
10419int netdev_refcnt_read(const struct net_device *dev)
10420{
10421#ifdef CONFIG_PCPU_DEV_REFCNT
10422 int i, refcnt = 0;
10423
10424 for_each_possible_cpu(i)
10425 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10426 return refcnt;
10427#else
10428 return refcount_read(&dev->dev_refcnt);
10429#endif
10430}
10431EXPORT_SYMBOL(netdev_refcnt_read);
10432
10433int netdev_unregister_timeout_secs __read_mostly = 10;
10434
10435#define WAIT_REFS_MIN_MSECS 1
10436#define WAIT_REFS_MAX_MSECS 250
10437/**
10438 * netdev_wait_allrefs_any - wait until all references are gone.
10439 * @list: list of net_devices to wait on
10440 *
10441 * This is called when unregistering network devices.
10442 *
10443 * Any protocol or device that holds a reference should register
 * for netdevice notification, and clean up and put back the
10445 * reference if they receive an UNREGISTER event.
10446 * We can get stuck here if buggy protocols don't correctly
10447 * call dev_put.
10448 */
10449static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10450{
10451 unsigned long rebroadcast_time, warning_time;
10452 struct net_device *dev;
10453 int wait = 0;
10454
10455 rebroadcast_time = warning_time = jiffies;
10456
10457 list_for_each_entry(dev, list, todo_list)
10458 if (netdev_refcnt_read(dev) == 1)
10459 return dev;
10460
10461 while (true) {
10462 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10463 rtnl_lock();
10464
10465 /* Rebroadcast unregister notification */
10466 list_for_each_entry(dev, list, todo_list)
10467 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10468
10469 __rtnl_unlock();
10470 rcu_barrier();
10471 rtnl_lock();
10472
10473 list_for_each_entry(dev, list, todo_list)
10474 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10475 &dev->state)) {
10476 /* We must not have linkwatch events
10477 * pending on unregister. If this
10478 * happens, we simply run the queue
10479 * unscheduled, resulting in a noop
10480 * for this device.
10481 */
10482 linkwatch_run_queue();
10483 break;
10484 }
10485
10486 __rtnl_unlock();
10487
10488 rebroadcast_time = jiffies;
10489 }
10490
10491 rcu_barrier();
10492
10493 if (!wait) {
10494 wait = WAIT_REFS_MIN_MSECS;
10495 } else {
10496 msleep(wait);
10497 wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10498 }
10499
10500 list_for_each_entry(dev, list, todo_list)
10501 if (netdev_refcnt_read(dev) == 1)
10502 return dev;
10503
10504 if (time_after(jiffies, warning_time +
10505 READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10506 list_for_each_entry(dev, list, todo_list) {
10507 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10508 dev->name, netdev_refcnt_read(dev));
10509 ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10510 }
10511
10512 warning_time = jiffies;
10513 }
10514 }
10515}
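
/*
 * Sketch of the cooperation described above (hypothetical subsystem,
 * not part of this file): anything holding a long-lived reference on a
 * net_device should drop it from its netdevice notifier on
 * NETDEV_UNREGISTER, otherwise netdev_wait_allrefs_any() stalls.
 * "foo_state_lookup", "foo_state_free" and the "tracker" member are
 * illustrative.
 *
 *	static int foo_device_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *		struct foo_state *st = foo_state_lookup(dev);
 *
 *		if (st && event == NETDEV_UNREGISTER) {
 *			netdev_put(dev, &st->tracker);	// release our hold
 *			foo_state_free(st);
 *		}
 *		return NOTIFY_DONE;
 *	}
 */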
10516
10517/* The sequence is:
10518 *
10519 * rtnl_lock();
10520 * ...
10521 * register_netdevice(x1);
10522 * register_netdevice(x2);
10523 * ...
10524 * unregister_netdevice(y1);
10525 * unregister_netdevice(y2);
10526 * ...
10527 * rtnl_unlock();
10528 * free_netdev(y1);
10529 * free_netdev(y2);
10530 *
10531 * We are invoked by rtnl_unlock().
10532 * This allows us to deal with problems:
10533 * 1) We can delete sysfs objects which invoke hotplug
10534 * without deadlocking with linkwatch via keventd.
10535 * 2) Since we run with the RTNL semaphore not held, we can sleep
10536 * safely in order to wait for the netdev refcnt to drop to zero.
10537 *
10538 * We must not return until all unregister events added during
10539 * the interval the lock was held have been completed.
10540 */
10541void netdev_run_todo(void)
10542{
10543 struct net_device *dev, *tmp;
10544 struct list_head list;
10545 int cnt;
10546#ifdef CONFIG_LOCKDEP
10547 struct list_head unlink_list;
10548
10549 list_replace_init(&net_unlink_list, &unlink_list);
10550
10551 while (!list_empty(&unlink_list)) {
10552 struct net_device *dev = list_first_entry(&unlink_list,
10553 struct net_device,
10554 unlink_list);
10555 list_del_init(&dev->unlink_list);
10556 dev->nested_level = dev->lower_level - 1;
10557 }
10558#endif
10559
10560 /* Snapshot list, allow later requests */
10561 list_replace_init(&net_todo_list, &list);
10562
10563 __rtnl_unlock();
10564
10565 /* Wait for rcu callbacks to finish before next phase */
10566 if (!list_empty(&list))
10567 rcu_barrier();
10568
10569 list_for_each_entry_safe(dev, tmp, &list, todo_list) {
10570 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10571 netdev_WARN(dev, "run_todo but not unregistering\n");
10572 list_del(&dev->todo_list);
10573 continue;
10574 }
10575
10576 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
10577 linkwatch_sync_dev(dev);
10578 }
10579
10580 cnt = 0;
10581 while (!list_empty(&list)) {
10582 dev = netdev_wait_allrefs_any(&list);
10583 list_del(&dev->todo_list);
10584
10585 /* paranoia */
10586 BUG_ON(netdev_refcnt_read(dev) != 1);
10587 BUG_ON(!list_empty(&dev->ptype_all));
10588 BUG_ON(!list_empty(&dev->ptype_specific));
10589 WARN_ON(rcu_access_pointer(dev->ip_ptr));
10590 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10591
10592 netdev_do_free_pcpu_stats(dev);
10593 if (dev->priv_destructor)
10594 dev->priv_destructor(dev);
10595 if (dev->needs_free_netdev)
10596 free_netdev(dev);
10597
10598 cnt++;
10599
10600 /* Free network device */
10601 kobject_put(&dev->dev.kobj);
10602 }
10603 if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
10604 wake_up(&netdev_unregistering_wq);
10605}
10606
10607/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10608 * all the same fields in the same order as net_device_stats, with only
10609 * the type differing, but rtnl_link_stats64 may have additional fields
10610 * at the end for newer counters.
10611 */
10612void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10613 const struct net_device_stats *netdev_stats)
10614{
10615 size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10616 const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10617 u64 *dst = (u64 *)stats64;
10618
10619 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10620 for (i = 0; i < n; i++)
10621 dst[i] = (unsigned long)atomic_long_read(&src[i]);
10622 /* zero out counters that only exist in rtnl_link_stats64 */
10623 memset((char *)stats64 + n * sizeof(u64), 0,
10624 sizeof(*stats64) - n * sizeof(u64));
10625}
10626EXPORT_SYMBOL(netdev_stats_to_stats64);
10627
10628static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
10629 struct net_device *dev)
10630{
10631 struct net_device_core_stats __percpu *p;
10632
10633 p = alloc_percpu_gfp(struct net_device_core_stats,
10634 GFP_ATOMIC | __GFP_NOWARN);
10635
10636 if (p && cmpxchg(&dev->core_stats, NULL, p))
10637 free_percpu(p);
10638
10639 /* This READ_ONCE() pairs with the cmpxchg() above */
10640 return READ_ONCE(dev->core_stats);
10641}
10642
10643noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
10644{
10645 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10646 struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
10647 unsigned long __percpu *field;
10648
10649 if (unlikely(!p)) {
10650 p = netdev_core_stats_alloc(dev);
10651 if (!p)
10652 return;
10653 }
10654
10655 field = (__force unsigned long __percpu *)((__force void *)p + offset);
10656 this_cpu_inc(*field);
10657}
10658EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
10659
10660/**
10661 * dev_get_stats - get network device statistics
10662 * @dev: device to get statistics from
10663 * @storage: place to store stats
10664 *
10665 * Get network statistics from device. Return @storage.
10666 * The device driver may provide its own method by setting
10667 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10668 * otherwise the internal statistics structure is used.
10669 */
10670struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10671 struct rtnl_link_stats64 *storage)
10672{
10673 const struct net_device_ops *ops = dev->netdev_ops;
10674 const struct net_device_core_stats __percpu *p;
10675
10676 if (ops->ndo_get_stats64) {
10677 memset(storage, 0, sizeof(*storage));
10678 ops->ndo_get_stats64(dev, storage);
10679 } else if (ops->ndo_get_stats) {
10680 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10681 } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
10682 dev_get_tstats64(dev, storage);
10683 } else {
10684 netdev_stats_to_stats64(storage, &dev->stats);
10685 }
10686
10687 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10688 p = READ_ONCE(dev->core_stats);
10689 if (p) {
10690 const struct net_device_core_stats *core_stats;
10691 int i;
10692
10693 for_each_possible_cpu(i) {
10694 core_stats = per_cpu_ptr(p, i);
10695 storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10696 storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10697 storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10698 storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10699 }
10700 }
10701 return storage;
10702}
10703EXPORT_SYMBOL(dev_get_stats);
10704
10705/**
10706 * dev_fetch_sw_netstats - get per-cpu network device statistics
10707 * @s: place to store stats
10708 * @netstats: per-cpu network stats to read from
10709 *
10710 * Read per-cpu network statistics and populate the related fields in @s.
10711 */
10712void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10713 const struct pcpu_sw_netstats __percpu *netstats)
10714{
10715 int cpu;
10716
10717 for_each_possible_cpu(cpu) {
10718 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10719 const struct pcpu_sw_netstats *stats;
10720 unsigned int start;
10721
10722 stats = per_cpu_ptr(netstats, cpu);
10723 do {
10724 start = u64_stats_fetch_begin(&stats->syncp);
10725 rx_packets = u64_stats_read(&stats->rx_packets);
10726 rx_bytes = u64_stats_read(&stats->rx_bytes);
10727 tx_packets = u64_stats_read(&stats->tx_packets);
10728 tx_bytes = u64_stats_read(&stats->tx_bytes);
10729 } while (u64_stats_fetch_retry(&stats->syncp, start));
10730
10731 s->rx_packets += rx_packets;
10732 s->rx_bytes += rx_bytes;
10733 s->tx_packets += tx_packets;
10734 s->tx_bytes += tx_bytes;
10735 }
10736}
10737EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
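
/*
 * Usage sketch (hypothetical tunnel driver, not part of this file):
 * fold driver-private per-cpu counters into the reply from
 * ->ndo_get_stats64().  "foo_priv", its "pcpu_stats" member and the
 * "rx_dropped" counter are illustrative.
 *
 *	static void foo_get_stats64(struct net_device *dev,
 *				    struct rtnl_link_stats64 *s)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		dev_fetch_sw_netstats(s, priv->pcpu_stats);
 *		s->rx_dropped = atomic_long_read(&priv->rx_dropped);
 *	}
 */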
10738
10739/**
10740 * dev_get_tstats64 - ndo_get_stats64 implementation
10741 * @dev: device to get statistics from
10742 * @s: place to store stats
10743 *
10744 * Populate @s from dev->stats and dev->tstats. Can be used as
10745 * ndo_get_stats64() callback.
10746 */
10747void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10748{
10749 netdev_stats_to_stats64(s, &dev->stats);
10750 dev_fetch_sw_netstats(s, dev->tstats);
10751}
10752EXPORT_SYMBOL_GPL(dev_get_tstats64);
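
/*
 * Usage sketch (hypothetical driver, not part of this file): a device
 * that accounts all of its traffic in dev->tstats can plug this helper
 * straight into its ops.  "foo_netdev_ops" and "foo_start_xmit" are
 * illustrative; dev->tstats must have been allocated, e.g. by setting
 * dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS before registration.
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_start_xmit	 = foo_start_xmit,
 *		.ndo_get_stats64 = dev_get_tstats64,
 *	};
 */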
10753
10754struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10755{
10756 struct netdev_queue *queue = dev_ingress_queue(dev);
10757
10758#ifdef CONFIG_NET_CLS_ACT
10759 if (queue)
10760 return queue;
10761 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10762 if (!queue)
10763 return NULL;
10764 netdev_init_one_queue(dev, queue, NULL);
10765 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10766 RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
10767 rcu_assign_pointer(dev->ingress_queue, queue);
10768#endif
10769 return queue;
10770}
10771
10772static const struct ethtool_ops default_ethtool_ops;
10773
10774void netdev_set_default_ethtool_ops(struct net_device *dev,
10775 const struct ethtool_ops *ops)
10776{
10777 if (dev->ethtool_ops == &default_ethtool_ops)
10778 dev->ethtool_ops = ops;
10779}
10780EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10781
10782/**
10783 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10784 * @dev: netdev to enable the IRQ coalescing on
10785 *
10786 * Sets a conservative default for SW IRQ coalescing. Users can use
10787 * sysfs attributes to override the default values.
10788 */
10789void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10790{
10791 WARN_ON(dev->reg_state == NETREG_REGISTERED);
10792
10793 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
10794 dev->gro_flush_timeout = 20000;
10795 dev->napi_defer_hard_irqs = 1;
10796 }
10797}
10798EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
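
/*
 * Usage sketch (hypothetical driver probe, not part of this file):
 * opt in to the conservative software IRQ coalescing defaults before
 * registration; users can still override them later through
 * /sys/class/net/<dev>/gro_flush_timeout and napi_defer_hard_irqs.
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	netdev_sw_irq_coalesce_default_on(dev);	// before register_netdev()
 *	err = register_netdev(dev);
 */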
10799
10800void netdev_freemem(struct net_device *dev)
10801{
10802 char *addr = (char *)dev - dev->padded;
10803
10804 kvfree(addr);
10805}
10806
10807/**
10808 * alloc_netdev_mqs - allocate network device
10809 * @sizeof_priv: size of private data to allocate space for
10810 * @name: device name format string
10811 * @name_assign_type: origin of device name
10812 * @setup: callback to initialize device
10813 * @txqs: the number of TX subqueues to allocate
10814 * @rxqs: the number of RX subqueues to allocate
10815 *
10816 * Allocates a struct net_device with private data area for driver use
10817 * and performs basic initialization. Also allocates subqueue structs
10818 * for each queue on the device.
10819 */
10820struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10821 unsigned char name_assign_type,
10822 void (*setup)(struct net_device *),
10823 unsigned int txqs, unsigned int rxqs)
10824{
10825 struct net_device *dev;
10826 unsigned int alloc_size;
10827 struct net_device *p;
10828
10829 BUG_ON(strlen(name) >= sizeof(dev->name));
10830
10831 if (txqs < 1) {
10832 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10833 return NULL;
10834 }
10835
10836 if (rxqs < 1) {
10837 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10838 return NULL;
10839 }
10840
10841 alloc_size = sizeof(struct net_device);
10842 if (sizeof_priv) {
10843 /* ensure 32-byte alignment of private area */
10844 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10845 alloc_size += sizeof_priv;
10846 }
10847 /* ensure 32-byte alignment of whole construct */
10848 alloc_size += NETDEV_ALIGN - 1;
10849
10850 p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10851 if (!p)
10852 return NULL;
10853
10854 dev = PTR_ALIGN(p, NETDEV_ALIGN);
10855 dev->padded = (char *)dev - (char *)p;
10856
10857 ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
10858#ifdef CONFIG_PCPU_DEV_REFCNT
10859 dev->pcpu_refcnt = alloc_percpu(int);
10860 if (!dev->pcpu_refcnt)
10861 goto free_dev;
10862 __dev_hold(dev);
10863#else
10864 refcount_set(&dev->dev_refcnt, 1);
10865#endif
10866
10867 if (dev_addr_init(dev))
10868 goto free_pcpu;
10869
10870 dev_mc_init(dev);
10871 dev_uc_init(dev);
10872
10873 dev_net_set(dev, &init_net);
10874
10875 dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
10876 dev->xdp_zc_max_segs = 1;
10877 dev->gso_max_segs = GSO_MAX_SEGS;
10878 dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
10879 dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
10880 dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
10881 dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
10882 dev->tso_max_segs = TSO_MAX_SEGS;
10883 dev->upper_level = 1;
10884 dev->lower_level = 1;
10885#ifdef CONFIG_LOCKDEP
10886 dev->nested_level = 0;
10887 INIT_LIST_HEAD(&dev->unlink_list);
10888#endif
10889
10890 INIT_LIST_HEAD(&dev->napi_list);
10891 INIT_LIST_HEAD(&dev->unreg_list);
10892 INIT_LIST_HEAD(&dev->close_list);
10893 INIT_LIST_HEAD(&dev->link_watch_list);
10894 INIT_LIST_HEAD(&dev->adj_list.upper);
10895 INIT_LIST_HEAD(&dev->adj_list.lower);
10896 INIT_LIST_HEAD(&dev->ptype_all);
10897 INIT_LIST_HEAD(&dev->ptype_specific);
10898 INIT_LIST_HEAD(&dev->net_notifier_list);
10899#ifdef CONFIG_NET_SCHED
10900 hash_init(dev->qdisc_hash);
10901#endif
10902 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10903 setup(dev);
10904
10905 if (!dev->tx_queue_len) {
10906 dev->priv_flags |= IFF_NO_QUEUE;
10907 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10908 }
10909
10910 dev->num_tx_queues = txqs;
10911 dev->real_num_tx_queues = txqs;
10912 if (netif_alloc_netdev_queues(dev))
10913 goto free_all;
10914
10915 dev->num_rx_queues = rxqs;
10916 dev->real_num_rx_queues = rxqs;
10917 if (netif_alloc_rx_queues(dev))
10918 goto free_all;
10919
10920 strcpy(dev->name, name);
10921 dev->name_assign_type = name_assign_type;
10922 dev->group = INIT_NETDEV_GROUP;
10923 if (!dev->ethtool_ops)
10924 dev->ethtool_ops = &default_ethtool_ops;
10925
10926 nf_hook_netdev_init(dev);
10927
10928 return dev;
10929
10930free_all:
10931 free_netdev(dev);
10932 return NULL;
10933
10934free_pcpu:
10935#ifdef CONFIG_PCPU_DEV_REFCNT
10936 free_percpu(dev->pcpu_refcnt);
10937free_dev:
10938#endif
10939 netdev_freemem(dev);
10940 return NULL;
10941}
10942EXPORT_SYMBOL(alloc_netdev_mqs);
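
/*
 * Usage sketch (hypothetical driver, not part of this file): allocate
 * a device with 8 TX and 8 RX queues and an ether_setup()-style init
 * callback.  "foo_setup" and "foo_priv" are illustrative; most
 * Ethernet drivers use the alloc_etherdev()/alloc_etherdev_mq()
 * wrappers rather than calling this directly.
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, foo_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */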
10943
10944/**
10945 * free_netdev - free network device
10946 * @dev: device
10947 *
10948 * This function does the last stage of destroying an allocated device
10949 * interface. The reference to the device object is released. If this
10950 * is the last reference then it will be freed.Must be called in process
10951 * context.
10952 */
10953void free_netdev(struct net_device *dev)
10954{
10955 struct napi_struct *p, *n;
10956
10957 might_sleep();
10958
	/* When called immediately after register_netdevice() failed, the unwind
10960 * handling may still be dismantling the device. Handle that case by
10961 * deferring the free.
10962 */
10963 if (dev->reg_state == NETREG_UNREGISTERING) {
10964 ASSERT_RTNL();
10965 dev->needs_free_netdev = true;
10966 return;
10967 }
10968
10969 netif_free_tx_queues(dev);
10970 netif_free_rx_queues(dev);
10971
10972 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10973
10974 /* Flush device addresses */
10975 dev_addr_flush(dev);
10976
10977 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10978 netif_napi_del(p);
10979
10980 ref_tracker_dir_exit(&dev->refcnt_tracker);
10981#ifdef CONFIG_PCPU_DEV_REFCNT
10982 free_percpu(dev->pcpu_refcnt);
10983 dev->pcpu_refcnt = NULL;
10984#endif
10985 free_percpu(dev->core_stats);
10986 dev->core_stats = NULL;
10987 free_percpu(dev->xdp_bulkq);
10988 dev->xdp_bulkq = NULL;
10989
10990 /* Compatibility with error handling in drivers */
10991 if (dev->reg_state == NETREG_UNINITIALIZED) {
10992 netdev_freemem(dev);
10993 return;
10994 }
10995
10996 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10997 WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
10998
10999 /* will free via device release */
11000 put_device(&dev->dev);
11001}
11002EXPORT_SYMBOL(free_netdev);
11003
11004/**
11005 * synchronize_net - Synchronize with packet receive processing
11006 *
11007 * Wait for packets currently being received to be done.
11008 * Does not block later packets from starting.
11009 */
11010void synchronize_net(void)
11011{
11012 might_sleep();
11013 if (rtnl_is_locked())
11014 synchronize_rcu_expedited();
11015 else
11016 synchronize_rcu();
11017}
11018EXPORT_SYMBOL(synchronize_net);
11019
11020/**
11021 * unregister_netdevice_queue - remove device from the kernel
11022 * @dev: device
11023 * @head: list
11024 *
11025 * This function shuts down a device interface and removes it
11026 * from the kernel tables.
 * If @head is not NULL, the device is queued to be unregistered later.
11028 *
11029 * Callers must hold the rtnl semaphore. You may want
11030 * unregister_netdev() instead of this.
11031 */
11032
11033void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
11034{
11035 ASSERT_RTNL();
11036
11037 if (head) {
11038 list_move_tail(&dev->unreg_list, head);
11039 } else {
11040 LIST_HEAD(single);
11041
11042 list_add(&dev->unreg_list, &single);
11043 unregister_netdevice_many(&single);
11044 }
11045}
11046EXPORT_SYMBOL(unregister_netdevice_queue);
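
/*
 * Usage sketch (not part of this file): tearing several devices down
 * in one batch under a single RTNL section amortizes the expensive
 * synchronization done by unregister_netdevice_many().  The
 * "foo_list"/"foo_node"/"foo_port" iteration is illustrative.
 *
 *	LIST_HEAD(kill_list);
 *	struct foo_port *port;
 *
 *	rtnl_lock();
 *	list_for_each_entry(port, &foo_list, foo_node)
 *		unregister_netdevice_queue(port->dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */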
11047
11048void unregister_netdevice_many_notify(struct list_head *head,
11049 u32 portid, const struct nlmsghdr *nlh)
11050{
11051 struct net_device *dev, *tmp;
11052 LIST_HEAD(close_head);
11053 int cnt = 0;
11054
11055 BUG_ON(dev_boot_phase);
11056 ASSERT_RTNL();
11057
11058 if (list_empty(head))
11059 return;
11060
11061 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11062 /* Some devices call without registering
11063 * for initialization unwind. Remove those
11064 * devices and proceed with the remaining.
11065 */
11066 if (dev->reg_state == NETREG_UNINITIALIZED) {
11067 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11068 dev->name, dev);
11069
11070 WARN_ON(1);
11071 list_del(&dev->unreg_list);
11072 continue;
11073 }
11074 dev->dismantle = true;
11075 BUG_ON(dev->reg_state != NETREG_REGISTERED);
11076 }
11077
11078 /* If device is running, close it first. */
11079 list_for_each_entry(dev, head, unreg_list)
11080 list_add_tail(&dev->close_list, &close_head);
11081 dev_close_many(&close_head, true);
11082
11083 list_for_each_entry(dev, head, unreg_list) {
11084 /* And unlink it from device chain. */
11085 unlist_netdevice(dev);
11086 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
11087 }
11088 flush_all_backlogs();
11089
11090 synchronize_net();
11091
11092 list_for_each_entry(dev, head, unreg_list) {
11093 struct sk_buff *skb = NULL;
11094
11095 /* Shutdown queueing discipline. */
11096 dev_shutdown(dev);
11097 dev_tcx_uninstall(dev);
11098 dev_xdp_uninstall(dev);
11099 bpf_dev_bound_netdev_unregister(dev);
11100
11101 netdev_offload_xstats_disable_all(dev);
11102
11103 /* Notify protocols, that we are about to destroy
11104 * this device. They should clean all the things.
11105 */
11106 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11107
11108 if (!dev->rtnl_link_ops ||
11109 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11110 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11111 GFP_KERNEL, NULL, 0,
11112 portid, nlh);
11113
11114 /*
11115 * Flush the unicast and multicast chains
11116 */
11117 dev_uc_flush(dev);
11118 dev_mc_flush(dev);
11119
11120 netdev_name_node_alt_flush(dev);
11121 netdev_name_node_free(dev->name_node);
11122
11123 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11124
11125 if (dev->netdev_ops->ndo_uninit)
11126 dev->netdev_ops->ndo_uninit(dev);
11127
11128 if (skb)
11129 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11130
11131 /* Notifier chain MUST detach us all upper devices. */
11132 WARN_ON(netdev_has_any_upper_dev(dev));
11133 WARN_ON(netdev_has_any_lower_dev(dev));
11134
11135 /* Remove entries from kobject tree */
11136 netdev_unregister_kobject(dev);
11137#ifdef CONFIG_XPS
11138 /* Remove XPS queueing entries */
11139 netif_reset_xps_queues_gt(dev, 0);
11140#endif
11141 }
11142
11143 synchronize_net();
11144
11145 list_for_each_entry(dev, head, unreg_list) {
11146 netdev_put(dev, &dev->dev_registered_tracker);
11147 net_set_todo(dev);
11148 cnt++;
11149 }
11150 atomic_add(cnt, &dev_unreg_count);
11151
11152 list_del(head);
11153}
11154
11155/**
11156 * unregister_netdevice_many - unregister many devices
11157 * @head: list of devices
11158 *
11159 * Note: As most callers use a stack allocated list_head,
 * we force a list_del() to make sure the stack won't be corrupted later.
11161 */
11162void unregister_netdevice_many(struct list_head *head)
11163{
11164 unregister_netdevice_many_notify(head, 0, NULL);
11165}
11166EXPORT_SYMBOL(unregister_netdevice_many);
11167
11168/**
11169 * unregister_netdev - remove device from the kernel
11170 * @dev: device
11171 *
11172 * This function shuts down a device interface and removes it
11173 * from the kernel tables.
11174 *
11175 * This is just a wrapper for unregister_netdevice that takes
11176 * the rtnl semaphore. In general you want to use this and not
11177 * unregister_netdevice.
11178 */
11179void unregister_netdev(struct net_device *dev)
11180{
11181 rtnl_lock();
11182 unregister_netdevice(dev);
11183 rtnl_unlock();
11184}
11185EXPORT_SYMBOL(unregister_netdev);
11186
11187/**
 * __dev_change_net_namespace - move device to a different network namespace
11189 * @dev: device
11190 * @net: network namespace
11191 * @pat: If not NULL name pattern to try if the current device name
11192 * is already taken in the destination network namespace.
11193 * @new_ifindex: If not zero, specifies device index in the target
11194 * namespace.
11195 *
11196 * This function shuts down a device interface and moves it
11197 * to a new network namespace. On success 0 is returned, on
 * a failure a negative errno code is returned.
11199 *
11200 * Callers must hold the rtnl semaphore.
11201 */
11202
11203int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11204 const char *pat, int new_ifindex)
11205{
11206 struct netdev_name_node *name_node;
11207 struct net *net_old = dev_net(dev);
11208 char new_name[IFNAMSIZ] = {};
11209 int err, new_nsid;
11210
11211 ASSERT_RTNL();
11212
11213 /* Don't allow namespace local devices to be moved. */
11214 err = -EINVAL;
11215 if (dev->features & NETIF_F_NETNS_LOCAL)
11216 goto out;
11217
	/* Ensure the device has been registered */
11219 if (dev->reg_state != NETREG_REGISTERED)
11220 goto out;
11221
	/* Get out if there is nothing to do */
11223 err = 0;
11224 if (net_eq(net_old, net))
11225 goto out;
11226
11227 /* Pick the destination device name, and ensure
11228 * we can use it in the destination network namespace.
11229 */
11230 err = -EEXIST;
11231 if (netdev_name_in_use(net, dev->name)) {
11232 /* We get here if we can't use the current device name */
11233 if (!pat)
11234 goto out;
11235 err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
11236 if (err < 0)
11237 goto out;
11238 }
11239 /* Check that none of the altnames conflicts. */
11240 err = -EEXIST;
11241 netdev_for_each_altname(dev, name_node)
11242 if (netdev_name_in_use(net, name_node->name))
11243 goto out;
11244
11245 /* Check that new_ifindex isn't used yet. */
11246 if (new_ifindex) {
11247 err = dev_index_reserve(net, new_ifindex);
11248 if (err < 0)
11249 goto out;
11250 } else {
11251 /* If there is an ifindex conflict assign a new one */
11252 err = dev_index_reserve(net, dev->ifindex);
11253 if (err == -EBUSY)
11254 err = dev_index_reserve(net, 0);
11255 if (err < 0)
11256 goto out;
11257 new_ifindex = err;
11258 }
11259
11260 /*
11261 * And now a mini version of register_netdevice unregister_netdevice.
11262 */
11263
11264 /* If device is running close it first. */
11265 dev_close(dev);
11266
11267 /* And unlink it from device chain */
11268 unlist_netdevice(dev);
11269
11270 synchronize_net();
11271
11272 /* Shutdown queueing discipline. */
11273 dev_shutdown(dev);
11274
11275 /* Notify protocols, that we are about to destroy
11276 * this device. They should clean all the things.
11277 *
11278 * Note that dev->reg_state stays at NETREG_REGISTERED.
11279 * This is wanted because this way 8021q and macvlan know
11280 * the device is just moving and can keep their slaves up.
11281 */
11282 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11283 rcu_barrier();
11284
11285 new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11286
11287 rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11288 new_ifindex);
11289
11290 /*
11291 * Flush the unicast and multicast chains
11292 */
11293 dev_uc_flush(dev);
11294 dev_mc_flush(dev);
11295
11296 /* Send a netdev-removed uevent to the old namespace */
11297 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11298 netdev_adjacent_del_links(dev);
11299
11300 /* Move per-net netdevice notifiers that are following the netdevice */
11301 move_netdevice_notifiers_dev_net(dev, net);
11302
11303 /* Actually switch the network namespace */
11304 dev_net_set(dev, net);
11305 dev->ifindex = new_ifindex;
11306
11307 if (new_name[0]) /* Rename the netdev to prepared name */
11308 strscpy(dev->name, new_name, IFNAMSIZ);
11309
11310 /* Fixup kobjects */
11311 dev_set_uevent_suppress(&dev->dev, 1);
11312 err = device_rename(&dev->dev, dev->name);
11313 dev_set_uevent_suppress(&dev->dev, 0);
11314 WARN_ON(err);
11315
11316 /* Send a netdev-add uevent to the new namespace */
11317 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11318 netdev_adjacent_add_links(dev);
11319
11320 /* Adapt owner in case owning user namespace of target network
11321 * namespace is different from the original one.
11322 */
11323 err = netdev_change_owner(dev, net_old, net);
11324 WARN_ON(err);
11325
11326 /* Add the device back in the hashes */
11327 list_netdevice(dev);
11328
11329 /* Notify protocols, that a new device appeared. */
11330 call_netdevice_notifiers(NETDEV_REGISTER, dev);
11331
11332 /*
11333 * Prevent userspace races by waiting until the network
11334 * device is fully setup before sending notifications.
11335 */
11336 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11337
11338 synchronize_net();
11339 err = 0;
11340out:
11341 return err;
11342}
11343EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
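
/*
 * Usage sketch (not part of this file): move a device into another
 * namespace, falling back to a "dev%d" style name if the current name
 * is already taken there and letting the core pick a free ifindex.
 * The caller holds RTNL; most in-tree users go through the
 * dev_change_net_namespace() wrapper instead.
 *
 *	err = __dev_change_net_namespace(dev, target_net, "dev%d", 0);
 *	if (err)
 *		netdev_warn(dev, "failed to change netns: %d\n", err);
 */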
11344
11345static int dev_cpu_dead(unsigned int oldcpu)
11346{
11347 struct sk_buff **list_skb;
11348 struct sk_buff *skb;
11349 unsigned int cpu;
11350 struct softnet_data *sd, *oldsd, *remsd = NULL;
11351
11352 local_irq_disable();
11353 cpu = smp_processor_id();
11354 sd = &per_cpu(softnet_data, cpu);
11355 oldsd = &per_cpu(softnet_data, oldcpu);
11356
11357 /* Find end of our completion_queue. */
11358 list_skb = &sd->completion_queue;
11359 while (*list_skb)
11360 list_skb = &(*list_skb)->next;
11361 /* Append completion queue from offline CPU. */
11362 *list_skb = oldsd->completion_queue;
11363 oldsd->completion_queue = NULL;
11364
11365 /* Append output queue from offline CPU. */
11366 if (oldsd->output_queue) {
11367 *sd->output_queue_tailp = oldsd->output_queue;
11368 sd->output_queue_tailp = oldsd->output_queue_tailp;
11369 oldsd->output_queue = NULL;
11370 oldsd->output_queue_tailp = &oldsd->output_queue;
11371 }
	/* Append NAPI poll list from offline CPU, with one exception:
	 * process_backlog() must be called by the CPU owning the percpu backlog.
11374 * We properly handle process_queue & input_pkt_queue later.
11375 */
11376 while (!list_empty(&oldsd->poll_list)) {
11377 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11378 struct napi_struct,
11379 poll_list);
11380
11381 list_del_init(&napi->poll_list);
11382 if (napi->poll == process_backlog)
11383 napi->state = 0;
11384 else
11385 ____napi_schedule(sd, napi);
11386 }
11387
11388 raise_softirq_irqoff(NET_TX_SOFTIRQ);
11389 local_irq_enable();
11390
11391#ifdef CONFIG_RPS
11392 remsd = oldsd->rps_ipi_list;
11393 oldsd->rps_ipi_list = NULL;
11394#endif
11395 /* send out pending IPI's on offline CPU */
11396 net_rps_send_ipi(remsd);
11397
11398 /* Process offline CPU's input_pkt_queue */
11399 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11400 netif_rx(skb);
11401 input_queue_head_incr(oldsd);
11402 }
11403 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11404 netif_rx(skb);
11405 input_queue_head_incr(oldsd);
11406 }
11407
11408 return 0;
11409}
11410
11411/**
11412 * netdev_increment_features - increment feature set by one
11413 * @all: current feature set
11414 * @one: new feature set
11415 * @mask: mask feature set
11416 *
11417 * Computes a new feature set after adding a device with feature set
11418 * @one to the master device with current feature set @all. Will not
11419 * enable anything that is off in @mask. Returns the new feature set.
11420 */
11421netdev_features_t netdev_increment_features(netdev_features_t all,
11422 netdev_features_t one, netdev_features_t mask)
11423{
11424 if (mask & NETIF_F_HW_CSUM)
11425 mask |= NETIF_F_CSUM_MASK;
11426 mask |= NETIF_F_VLAN_CHALLENGED;
11427
11428 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11429 all &= one | ~NETIF_F_ALL_FOR_ALL;
11430
11431 /* If one device supports hw checksumming, set for all. */
11432 if (all & NETIF_F_HW_CSUM)
11433 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11434
11435 return all;
11436}
11437EXPORT_SYMBOL(netdev_increment_features);
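
/*
 * Usage sketch (hypothetical aggregation driver, not part of this
 * file): recompute a master's VLAN feature set from scratch as slave
 * devices come and go, in the spirit of bonding/team.
 * "foo_for_each_slave" (iterating slave net_devices) and
 * FOO_VLAN_FEATURES are illustrative.
 *
 *	netdev_features_t features = FOO_VLAN_FEATURES;
 *
 *	foo_for_each_slave(master, slave)
 *		features = netdev_increment_features(features,
 *						     slave->features,
 *						     FOO_VLAN_FEATURES);
 *	master->vlan_features = features;
 */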
11438
11439static struct hlist_head * __net_init netdev_create_hash(void)
11440{
11441 int i;
11442 struct hlist_head *hash;
11443
11444 hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11445 if (hash != NULL)
11446 for (i = 0; i < NETDEV_HASHENTRIES; i++)
11447 INIT_HLIST_HEAD(&hash[i]);
11448
11449 return hash;
11450}
11451
11452/* Initialize per network namespace state */
11453static int __net_init netdev_init(struct net *net)
11454{
11455 BUILD_BUG_ON(GRO_HASH_BUCKETS >
11456 8 * sizeof_field(struct napi_struct, gro_bitmask));
11457
11458 INIT_LIST_HEAD(&net->dev_base_head);
11459
11460 net->dev_name_head = netdev_create_hash();
11461 if (net->dev_name_head == NULL)
11462 goto err_name;
11463
11464 net->dev_index_head = netdev_create_hash();
11465 if (net->dev_index_head == NULL)
11466 goto err_idx;
11467
11468 xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11469
11470 RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11471
11472 return 0;
11473
11474err_idx:
11475 kfree(net->dev_name_head);
11476err_name:
11477 return -ENOMEM;
11478}
11479
11480/**
11481 * netdev_drivername - network driver for the device
11482 * @dev: network device
11483 *
11484 * Determine network driver for device.
11485 */
11486const char *netdev_drivername(const struct net_device *dev)
11487{
11488 const struct device_driver *driver;
11489 const struct device *parent;
11490 const char *empty = "";
11491
11492 parent = dev->dev.parent;
11493 if (!parent)
11494 return empty;
11495
11496 driver = parent->driver;
11497 if (driver && driver->name)
11498 return driver->name;
11499 return empty;
11500}
11501
11502static void __netdev_printk(const char *level, const struct net_device *dev,
11503 struct va_format *vaf)
11504{
11505 if (dev && dev->dev.parent) {
11506 dev_printk_emit(level[1] - '0',
11507 dev->dev.parent,
11508 "%s %s %s%s: %pV",
11509 dev_driver_string(dev->dev.parent),
11510 dev_name(dev->dev.parent),
11511 netdev_name(dev), netdev_reg_state(dev),
11512 vaf);
11513 } else if (dev) {
11514 printk("%s%s%s: %pV",
11515 level, netdev_name(dev), netdev_reg_state(dev), vaf);
11516 } else {
11517 printk("%s(NULL net_device): %pV", level, vaf);
11518 }
11519}
11520
11521void netdev_printk(const char *level, const struct net_device *dev,
11522 const char *format, ...)
11523{
11524 struct va_format vaf;
11525 va_list args;
11526
11527 va_start(args, format);
11528
11529 vaf.fmt = format;
11530 vaf.va = &args;
11531
11532 __netdev_printk(level, dev, &vaf);
11533
11534 va_end(args);
11535}
11536EXPORT_SYMBOL(netdev_printk);
11537
11538#define define_netdev_printk_level(func, level) \
11539void func(const struct net_device *dev, const char *fmt, ...) \
11540{ \
11541 struct va_format vaf; \
11542 va_list args; \
11543 \
11544 va_start(args, fmt); \
11545 \
11546 vaf.fmt = fmt; \
11547 vaf.va = &args; \
11548 \
11549 __netdev_printk(level, dev, &vaf); \
11550 \
11551 va_end(args); \
11552} \
11553EXPORT_SYMBOL(func);
11554
11555define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11556define_netdev_printk_level(netdev_alert, KERN_ALERT);
11557define_netdev_printk_level(netdev_crit, KERN_CRIT);
11558define_netdev_printk_level(netdev_err, KERN_ERR);
11559define_netdev_printk_level(netdev_warn, KERN_WARNING);
11560define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11561define_netdev_printk_level(netdev_info, KERN_INFO);
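
/*
 * The helpers generated above are used like dev_err()/dev_info() but
 * additionally print the driver, bus and netdev name.  Hypothetical
 * driver usage:
 *
 *	netdev_err(dev, "TX timeout on queue %u\n", txq);
 *	netdev_info(dev, "link up, %u Mbps full duplex\n", speed);
 */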
11562
11563static void __net_exit netdev_exit(struct net *net)
11564{
11565 kfree(net->dev_name_head);
11566 kfree(net->dev_index_head);
11567 xa_destroy(&net->dev_by_index);
11568 if (net != &init_net)
11569 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11570}
11571
11572static struct pernet_operations __net_initdata netdev_net_ops = {
11573 .init = netdev_init,
11574 .exit = netdev_exit,
11575};
11576
11577static void __net_exit default_device_exit_net(struct net *net)
11578{
11579 struct netdev_name_node *name_node, *tmp;
11580 struct net_device *dev, *aux;
11581 /*
11582 * Push all migratable network devices back to the
11583 * initial network namespace
11584 */
11585 ASSERT_RTNL();
11586 for_each_netdev_safe(net, dev, aux) {
11587 int err;
11588 char fb_name[IFNAMSIZ];
11589
		/* Ignore unmovable devices (e.g. loopback) */
11591 if (dev->features & NETIF_F_NETNS_LOCAL)
11592 continue;
11593
11594 /* Leave virtual devices for the generic cleanup */
11595 if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11596 continue;
11597
11598 /* Push remaining network devices to init_net */
11599 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11600 if (netdev_name_in_use(&init_net, fb_name))
11601 snprintf(fb_name, IFNAMSIZ, "dev%%d");
11602
11603 netdev_for_each_altname_safe(dev, name_node, tmp)
11604 if (netdev_name_in_use(&init_net, name_node->name))
11605 __netdev_name_node_alt_destroy(name_node);
11606
11607 err = dev_change_net_namespace(dev, &init_net, fb_name);
11608 if (err) {
11609 pr_emerg("%s: failed to move %s to init_net: %d\n",
11610 __func__, dev->name, err);
11611 BUG();
11612 }
11613 }
11614}
11615
11616static void __net_exit default_device_exit_batch(struct list_head *net_list)
11617{
	/* At exit all network devices must be removed from a network
11619 * namespace. Do this in the reverse order of registration.
11620 * Do this across as many network namespaces as possible to
11621 * improve batching efficiency.
11622 */
11623 struct net_device *dev;
11624 struct net *net;
11625 LIST_HEAD(dev_kill_list);
11626
11627 rtnl_lock();
11628 list_for_each_entry(net, net_list, exit_list) {
11629 default_device_exit_net(net);
11630 cond_resched();
11631 }
11632
11633 list_for_each_entry(net, net_list, exit_list) {
11634 for_each_netdev_reverse(net, dev) {
11635 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11636 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11637 else
11638 unregister_netdevice_queue(dev, &dev_kill_list);
11639 }
11640 }
11641 unregister_netdevice_many(&dev_kill_list);
11642 rtnl_unlock();
11643}
11644
11645static struct pernet_operations __net_initdata default_device_ops = {
11646 .exit_batch = default_device_exit_batch,
11647};
11648
11649static void __init net_dev_struct_check(void)
11650{
11651 /* TX read-mostly hotpath */
11652 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
11653 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
11654 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
11655 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
11656 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
11657 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
11658 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
11659 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
11660 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
11661 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
11662 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
11663 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
11664 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
11665#ifdef CONFIG_XPS
11666 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
11667#endif
11668#ifdef CONFIG_NETFILTER_EGRESS
11669 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
11670#endif
11671#ifdef CONFIG_NET_XGRESS
11672 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
11673#endif
11674 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
11675
11676 /* TXRX read-mostly hotpath */
11677 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
11678 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
11679 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
11680 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
11681 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
11682 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
11683 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
11684
11685 /* RX read-mostly hotpath */
11686 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
11687 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
11688 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
11689 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
11690 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
11691 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
11692 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
11693 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
11694 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
11695 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
11696 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
11697#ifdef CONFIG_NETPOLL
11698 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
11699#endif
11700#ifdef CONFIG_NET_XGRESS
11701 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
11702#endif
11703 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
11704}
11705
11706/*
11707 * Initialize the DEV module. At boot time this walks the device list and
11708 * unhooks any devices that fail to initialise (normally hardware not
11709 * present) and leaves us with a valid list of present and active devices.
11710 *
11711 */
11712
11713/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
11714#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)
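
/* With 4 KiB pages (PAGE_SHIFT == 12) this works out to
 * (1 << 20) / (1 << 12) == 256 pool entries per CPU; with 64 KiB pages
 * it is 16 entries, in both cases roughly 1 MiB of recyclable pages per CPU.
 */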
11715
11716static int net_page_pool_create(int cpuid)
11717{
11718#if IS_ENABLED(CONFIG_PAGE_POOL)
11719 struct page_pool_params page_pool_params = {
11720 .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
11721 .flags = PP_FLAG_SYSTEM_POOL,
11722 .nid = NUMA_NO_NODE,
11723 };
11724 struct page_pool *pp_ptr;
11725
11726 pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
11727 if (IS_ERR(pp_ptr))
11728 return -ENOMEM;
11729
11730 per_cpu(system_page_pool, cpuid) = pp_ptr;
11731#endif
11732 return 0;
11733}
11734
11735/*
11736 * This is called single threaded during boot, so no need
11737 * to take the rtnl semaphore.
11738 */
11739static int __init net_dev_init(void)
11740{
11741 int i, rc = -ENOMEM;
11742
11743 BUG_ON(!dev_boot_phase);
11744
11745 net_dev_struct_check();
11746
11747 if (dev_proc_init())
11748 goto out;
11749
11750 if (netdev_kobject_init())
11751 goto out;
11752
11753 for (i = 0; i < PTYPE_HASH_SIZE; i++)
11754 INIT_LIST_HEAD(&ptype_base[i]);
11755
11756 if (register_pernet_subsys(&netdev_net_ops))
11757 goto out;
11758
11759 /*
11760 * Initialise the packet receive queues.
11761 */
11762
11763 for_each_possible_cpu(i) {
11764 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11765 struct softnet_data *sd = &per_cpu(softnet_data, i);
11766
11767 INIT_WORK(flush, flush_backlog);
11768
11769 skb_queue_head_init(&sd->input_pkt_queue);
11770 skb_queue_head_init(&sd->process_queue);
11771#ifdef CONFIG_XFRM_OFFLOAD
11772 skb_queue_head_init(&sd->xfrm_backlog);
11773#endif
11774 INIT_LIST_HEAD(&sd->poll_list);
11775 sd->output_queue_tailp = &sd->output_queue;
11776#ifdef CONFIG_RPS
11777 INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11778 sd->cpu = i;
11779#endif
11780 INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
11781 spin_lock_init(&sd->defer_lock);
11782
11783 init_gro_hash(&sd->backlog);
11784 sd->backlog.poll = process_backlog;
11785 sd->backlog.weight = weight_p;
11786
11787 if (net_page_pool_create(i))
11788 goto out;
11789 }
11790
11791 dev_boot_phase = 0;
11792
	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, maintain this invariant by keeping the
	 * loopback device first on the list of network devices, so that
	 * it is the first device to appear and the last to disappear.
	 */
11802 if (register_pernet_device(&loopback_net_ops))
11803 goto out;
11804
11805 if (register_pernet_device(&default_device_ops))
11806 goto out;
11807
11808 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11809 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11810
11811 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11812 NULL, dev_cpu_dead);
11813 WARN_ON(rc < 0);
11814 rc = 0;
11815out:
11816 if (rc < 0) {
11817 for_each_possible_cpu(i) {
11818 struct page_pool *pp_ptr;
11819
11820 pp_ptr = per_cpu(system_page_pool, i);
11821 if (!pp_ptr)
11822 continue;
11823
11824 page_pool_destroy(pp_ptr);
11825 per_cpu(system_page_pool, i) = NULL;
11826 }
11827 }
11828
11829 return rc;
11830}
11831
11832subsys_initcall(net_dev_init);
74
75#include <linux/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <linux/bpf.h>
98#include <net/net_namespace.h>
99#include <net/sock.h>
100#include <net/busy_poll.h>
101#include <linux/rtnetlink.h>
102#include <linux/stat.h>
103#include <net/dst.h>
104#include <net/dst_metadata.h>
105#include <net/pkt_sched.h>
106#include <net/checksum.h>
107#include <net/xfrm.h>
108#include <linux/highmem.h>
109#include <linux/init.h>
110#include <linux/module.h>
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
114#include <net/iw_handler.h>
115#include <asm/current.h>
116#include <linux/audit.h>
117#include <linux/dmaengine.h>
118#include <linux/err.h>
119#include <linux/ctype.h>
120#include <linux/if_arp.h>
121#include <linux/if_vlan.h>
122#include <linux/ip.h>
123#include <net/ip.h>
124#include <net/mpls.h>
125#include <linux/ipv6.h>
126#include <linux/in.h>
127#include <linux/jhash.h>
128#include <linux/random.h>
129#include <trace/events/napi.h>
130#include <trace/events/net.h>
131#include <trace/events/skb.h>
132#include <linux/pci.h>
133#include <linux/inetdevice.h>
134#include <linux/cpu_rmap.h>
135#include <linux/static_key.h>
136#include <linux/hashtable.h>
137#include <linux/vmalloc.h>
138#include <linux/if_macvlan.h>
139#include <linux/errqueue.h>
140#include <linux/hrtimer.h>
141#include <linux/netfilter_ingress.h>
142#include <linux/crash_dump.h>
143
144#include "net-sysfs.h"
145
146/* Instead of increasing this, you should create a hash table. */
147#define MAX_GRO_SKBS 8
148
149/* This should be increased if a protocol with a bigger head is added. */
150#define GRO_MAX_HEAD (MAX_HEADER + 128)
151
152static DEFINE_SPINLOCK(ptype_lock);
153static DEFINE_SPINLOCK(offload_lock);
154struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155struct list_head ptype_all __read_mostly; /* Taps */
156static struct list_head offload_base __read_mostly;
157
158static int netif_rx_internal(struct sk_buff *skb);
159static int call_netdevice_notifiers_info(unsigned long val,
160 struct net_device *dev,
161 struct netdev_notifier_info *info);
162
163/*
164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165 * semaphore.
166 *
167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168 *
169 * Writers must hold the rtnl semaphore while they loop through the
170 * dev_base_head list, and hold dev_base_lock for writing when they do the
171 * actual updates. This allows pure readers to access the list even
172 * while a writer is preparing to update it.
173 *
174 * To put it another way, dev_base_lock is held for writing only to
175 * protect against pure readers; the rtnl semaphore provides the
176 * protection against other writers.
177 *
178 * See, for example usages, register_netdevice() and
179 * unregister_netdevice(), which must be called with the rtnl
180 * semaphore held.
181 */
182DEFINE_RWLOCK(dev_base_lock);
183EXPORT_SYMBOL(dev_base_lock);
184
185/* protects napi_hash addition/deletion and napi_gen_id */
186static DEFINE_SPINLOCK(napi_hash_lock);
187
188static unsigned int napi_gen_id = NR_CPUS;
189static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190
191static seqcount_t devnet_rename_seq;
192
193static inline void dev_base_seq_inc(struct net *net)
194{
195 while (++net->dev_base_seq == 0);
196}
197
198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199{
200 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201
202 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203}
204
205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206{
207 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208}
209
210static inline void rps_lock(struct softnet_data *sd)
211{
212#ifdef CONFIG_RPS
213 spin_lock(&sd->input_pkt_queue.lock);
214#endif
215}
216
217static inline void rps_unlock(struct softnet_data *sd)
218{
219#ifdef CONFIG_RPS
220 spin_unlock(&sd->input_pkt_queue.lock);
221#endif
222}
223
224/* Device list insertion */
225static void list_netdevice(struct net_device *dev)
226{
227 struct net *net = dev_net(dev);
228
229 ASSERT_RTNL();
230
231 write_lock_bh(&dev_base_lock);
232 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 hlist_add_head_rcu(&dev->index_hlist,
235 dev_index_hash(net, dev->ifindex));
236 write_unlock_bh(&dev_base_lock);
237
238 dev_base_seq_inc(net);
239}
240
241/* Device list removal
242 * caller must respect a RCU grace period before freeing/reusing dev
243 */
244static void unlist_netdevice(struct net_device *dev)
245{
246 ASSERT_RTNL();
247
248 /* Unlink dev from the device chain */
249 write_lock_bh(&dev_base_lock);
250 list_del_rcu(&dev->dev_list);
251 hlist_del_rcu(&dev->name_hlist);
252 hlist_del_rcu(&dev->index_hlist);
253 write_unlock_bh(&dev_base_lock);
254
255 dev_base_seq_inc(dev_net(dev));
256}
257
258/*
259 * Our notifier list
260 */
261
262static RAW_NOTIFIER_HEAD(netdev_chain);
263
264/*
265 * Device drivers call our routines to queue packets here. We empty the
266 * queue in the local softnet handler.
267 */
268
269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272#ifdef CONFIG_LOCKDEP
273/*
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
276 */
277static const unsigned short netdev_lock_type[] =
278 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293
294static const char *const netdev_lock_name[] =
295 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310
311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313
314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315{
316 int i;
317
318 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 if (netdev_lock_type[i] == dev_type)
320 return i;
321 /* the last key is used by default */
322 return ARRAY_SIZE(netdev_lock_type) - 1;
323}
324
325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 unsigned short dev_type)
327{
328 int i;
329
330 i = netdev_lock_pos(dev_type);
331 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 netdev_lock_name[i]);
333}
334
335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336{
337 int i;
338
339 i = netdev_lock_pos(dev->type);
340 lockdep_set_class_and_name(&dev->addr_list_lock,
341 &netdev_addr_lock_key[i],
342 netdev_lock_name[i]);
343}
344#else
345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 unsigned short dev_type)
347{
348}
349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350{
351}
352#endif
353
354/*******************************************************************************
355
356 Protocol management and registration routines
357
358*******************************************************************************/
359
360/*
361 * Add a protocol ID to the list. Now that the input handler is
362 * smarter we can dispense with all the messy stuff that used to be
363 * here.
364 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in the hash buckets, and protocol checking
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true today; do not change it.
 *	Explanation: if a packet-mangling handler were first on the
 *	list, it could not tell that the packet is cloned and must be
 *	copied on write, so it would modify the packet in place and
 *	subsequent readers would see a corrupted packet.
 *	--ANK (980803)
374 */
375
376static inline struct list_head *ptype_head(const struct packet_type *pt)
377{
378 if (pt->type == htons(ETH_P_ALL))
379 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380 else
381 return pt->dev ? &pt->dev->ptype_specific :
382 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383}
384
385/**
386 * dev_add_pack - add packet handler
387 * @pt: packet type declaration
388 *
389 * Add a protocol handler to the networking stack. The passed &packet_type
390 * is linked into kernel lists and may not be freed until it has been
391 * removed from the kernel lists.
392 *
 *	This call does not sleep, therefore it cannot guarantee that
 *	CPUs currently in the middle of receiving packets will see the
 *	new packet type (until their next received packet).
396 */
397
398void dev_add_pack(struct packet_type *pt)
399{
400 struct list_head *head = ptype_head(pt);
401
402 spin_lock(&ptype_lock);
403 list_add_rcu(&pt->list, head);
404 spin_unlock(&ptype_lock);
405}
406EXPORT_SYMBOL(dev_add_pack);
407
408/**
409 * __dev_remove_pack - remove packet handler
410 * @pt: packet type declaration
411 *
412 * Remove a protocol handler that was previously added to the kernel
413 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
414 * from the kernel lists and can be freed or reused once this function
415 * returns.
416 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
420 */
421void __dev_remove_pack(struct packet_type *pt)
422{
423 struct list_head *head = ptype_head(pt);
424 struct packet_type *pt1;
425
426 spin_lock(&ptype_lock);
427
428 list_for_each_entry(pt1, head, list) {
429 if (pt == pt1) {
430 list_del_rcu(&pt->list);
431 goto out;
432 }
433 }
434
435 pr_warn("dev_remove_pack: %p not found\n", pt);
436out:
437 spin_unlock(&ptype_lock);
438}
439EXPORT_SYMBOL(__dev_remove_pack);
440
441/**
442 * dev_remove_pack - remove packet handler
443 * @pt: packet type declaration
444 *
445 * Remove a protocol handler that was previously added to the kernel
446 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
447 * from the kernel lists and can be freed or reused once this function
448 * returns.
449 *
450 * This call sleeps to guarantee that no CPU is looking at the packet
451 * type after return.
452 */
453void dev_remove_pack(struct packet_type *pt)
454{
455 __dev_remove_pack(pt);
456
457 synchronize_net();
458}
459EXPORT_SYMBOL(dev_remove_pack);
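
/*
 * Illustrative sketch (not used here): a module that taps every frame
 * would pair dev_add_pack() with dev_remove_pack() roughly as below.
 * The example_* names are hypothetical.
 */
#if 0
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler owns the reference it is given and must consume it. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),	/* all protocols, all devices */
	.func	= example_tap_rcv,
};

static int __init example_tap_init(void)
{
	dev_add_pack(&example_tap);
	return 0;
}

static void __exit example_tap_exit(void)
{
	dev_remove_pack(&example_tap);	/* sleeps until readers are done */
}
#endif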
460
461
462/**
463 * dev_add_offload - register offload handlers
464 * @po: protocol offload declaration
465 *
466 * Add protocol offload handlers to the networking stack. The passed
467 * &proto_offload is linked into kernel lists and may not be freed until
468 * it has been removed from the kernel lists.
469 *
 *	This call does not sleep, therefore it cannot guarantee that
 *	CPUs currently in the middle of receiving packets will see the
 *	new offload handlers (until their next received packet).
473 */
474void dev_add_offload(struct packet_offload *po)
475{
476 struct packet_offload *elem;
477
478 spin_lock(&offload_lock);
479 list_for_each_entry(elem, &offload_base, list) {
480 if (po->priority < elem->priority)
481 break;
482 }
483 list_add_rcu(&po->list, elem->list.prev);
484 spin_unlock(&offload_lock);
485}
486EXPORT_SYMBOL(dev_add_offload);
487
488/**
489 * __dev_remove_offload - remove offload handler
490 * @po: packet offload declaration
491 *
492 * Remove a protocol offload handler that was previously added to the
493 * kernel offload handlers by dev_add_offload(). The passed &offload_type
494 * is removed from the kernel lists and can be freed or reused once this
495 * function returns.
496 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
500 */
501static void __dev_remove_offload(struct packet_offload *po)
502{
503 struct list_head *head = &offload_base;
504 struct packet_offload *po1;
505
506 spin_lock(&offload_lock);
507
508 list_for_each_entry(po1, head, list) {
509 if (po == po1) {
510 list_del_rcu(&po->list);
511 goto out;
512 }
513 }
514
515 pr_warn("dev_remove_offload: %p not found\n", po);
516out:
517 spin_unlock(&offload_lock);
518}
519
520/**
521 * dev_remove_offload - remove packet offload handler
522 * @po: packet offload declaration
523 *
524 * Remove a packet offload handler that was previously added to the kernel
525 * offload handlers by dev_add_offload(). The passed &offload_type is
526 * removed from the kernel lists and can be freed or reused once this
527 * function returns.
528 *
529 * This call sleeps to guarantee that no CPU is looking at the packet
530 * type after return.
531 */
532void dev_remove_offload(struct packet_offload *po)
533{
534 __dev_remove_offload(po);
535
536 synchronize_net();
537}
538EXPORT_SYMBOL(dev_remove_offload);
539
540/******************************************************************************
541
542 Device Boot-time Settings Routines
543
544*******************************************************************************/
545
546/* Boot time configuration table */
547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548
549/**
550 * netdev_boot_setup_add - add new setup entry
551 * @name: name of the device
552 * @map: configured settings for the device
553 *
 *	Adds a new setup entry to the dev_boot_setup list. The function
 *	returns 0 on error and 1 on success. This is a generic routine
 *	for all netdevices.
557 */
558static int netdev_boot_setup_add(char *name, struct ifmap *map)
559{
560 struct netdev_boot_setup *s;
561 int i;
562
563 s = dev_boot_setup;
564 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 memset(s[i].name, 0, sizeof(s[i].name));
567 strlcpy(s[i].name, name, IFNAMSIZ);
568 memcpy(&s[i].map, map, sizeof(s[i].map));
569 break;
570 }
571 }
572
573 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574}
575
576/**
577 * netdev_boot_setup_check - check boot time settings
578 * @dev: the netdevice
579 *
 *	Check the boot time settings for the device. Any settings found
 *	are copied into the device for use later during device probing.
 *	Returns 1 if settings were found, 0 otherwise.
584 */
585int netdev_boot_setup_check(struct net_device *dev)
586{
587 struct netdev_boot_setup *s = dev_boot_setup;
588 int i;
589
590 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 !strcmp(dev->name, s[i].name)) {
593 dev->irq = s[i].map.irq;
594 dev->base_addr = s[i].map.base_addr;
595 dev->mem_start = s[i].map.mem_start;
596 dev->mem_end = s[i].map.mem_end;
597 return 1;
598 }
599 }
600 return 0;
601}
602EXPORT_SYMBOL(netdev_boot_setup_check);
603
604
605/**
606 * netdev_boot_base - get address from boot time settings
607 * @prefix: prefix for network device
608 * @unit: id for network device
609 *
 *	Check the boot time settings for the base address of the device.
 *	Returns the configured base address, 1 if the device is already
 *	registered (so it will not be probed), or 0 if no settings are found.
614 */
615unsigned long netdev_boot_base(const char *prefix, int unit)
616{
617 const struct netdev_boot_setup *s = dev_boot_setup;
618 char name[IFNAMSIZ];
619 int i;
620
621 sprintf(name, "%s%d", prefix, unit);
622
623 /*
624 * If device already registered then return base of 1
625 * to indicate not to probe for this interface
626 */
627 if (__dev_get_by_name(&init_net, name))
628 return 1;
629
630 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 if (!strcmp(name, s[i].name))
632 return s[i].map.base_addr;
633 return 0;
634}
635
636/*
637 * Saves at boot time configured settings for any netdevice.
638 */
639int __init netdev_boot_setup(char *str)
640{
641 int ints[5];
642 struct ifmap map;
643
644 str = get_options(str, ARRAY_SIZE(ints), ints);
645 if (!str || !*str)
646 return 0;
647
648 /* Save settings */
649 memset(&map, 0, sizeof(map));
650 if (ints[0] > 0)
651 map.irq = ints[1];
652 if (ints[0] > 1)
653 map.base_addr = ints[2];
654 if (ints[0] > 2)
655 map.mem_start = ints[3];
656 if (ints[0] > 3)
657 map.mem_end = ints[4];
658
659 /* Add new entry to the list */
660 return netdev_boot_setup_add(str, &map);
661}
662
663__setup("netdev=", netdev_boot_setup);
664
665/*******************************************************************************
666
667 Device Interface Subroutines
668
669*******************************************************************************/
670
671/**
672 * dev_get_iflink - get 'iflink' value of a interface
673 * @dev: targeted interface
674 *
675 * Indicates the ifindex the interface is linked to.
676 * Physical interfaces have the same 'ifindex' and 'iflink' values.
677 */
678
679int dev_get_iflink(const struct net_device *dev)
680{
681 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 return dev->netdev_ops->ndo_get_iflink(dev);
683
684 return dev->ifindex;
685}
686EXPORT_SYMBOL(dev_get_iflink);
687
688/**
689 * dev_fill_metadata_dst - Retrieve tunnel egress information.
690 * @dev: targeted interface
691 * @skb: The packet.
692 *
 *	For better visibility of tunnel traffic, OVS needs to retrieve
 *	the egress tunnel information for a packet. This API lets the
 *	caller obtain that information.
696 */
697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698{
699 struct ip_tunnel_info *info;
700
701 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
702 return -EINVAL;
703
704 info = skb_tunnel_info_unclone(skb);
705 if (!info)
706 return -ENOMEM;
707 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 return -EINVAL;
709
710 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711}
712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713
714/**
715 * __dev_get_by_name - find a device by its name
716 * @net: the applicable net namespace
717 * @name: name to find
718 *
719 * Find an interface by name. Must be called under RTNL semaphore
720 * or @dev_base_lock. If the name is found a pointer to the device
721 * is returned. If the name is not found then %NULL is returned. The
722 * reference counters are not incremented so the caller must be
723 * careful with locks.
724 */
725
726struct net_device *__dev_get_by_name(struct net *net, const char *name)
727{
728 struct net_device *dev;
729 struct hlist_head *head = dev_name_hash(net, name);
730
731 hlist_for_each_entry(dev, head, name_hlist)
732 if (!strncmp(dev->name, name, IFNAMSIZ))
733 return dev;
734
735 return NULL;
736}
737EXPORT_SYMBOL(__dev_get_by_name);
738
739/**
740 * dev_get_by_name_rcu - find a device by its name
741 * @net: the applicable net namespace
742 * @name: name to find
743 *
744 * Find an interface by name.
745 * If the name is found a pointer to the device is returned.
746 * If the name is not found then %NULL is returned.
747 * The reference counters are not incremented so the caller must be
748 * careful with locks. The caller must hold RCU lock.
749 */
750
751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752{
753 struct net_device *dev;
754 struct hlist_head *head = dev_name_hash(net, name);
755
756 hlist_for_each_entry_rcu(dev, head, name_hlist)
757 if (!strncmp(dev->name, name, IFNAMSIZ))
758 return dev;
759
760 return NULL;
761}
762EXPORT_SYMBOL(dev_get_by_name_rcu);
763
764/**
765 * dev_get_by_name - find a device by its name
766 * @net: the applicable net namespace
767 * @name: name to find
768 *
769 * Find an interface by name. This can be called from any
770 * context and does its own locking. The returned handle has
771 * the usage count incremented and the caller must use dev_put() to
772 * release it when it is no longer needed. %NULL is returned if no
773 * matching device is found.
774 */
775
776struct net_device *dev_get_by_name(struct net *net, const char *name)
777{
778 struct net_device *dev;
779
780 rcu_read_lock();
781 dev = dev_get_by_name_rcu(net, name);
782 if (dev)
783 dev_hold(dev);
784 rcu_read_unlock();
785 return dev;
786}
787EXPORT_SYMBOL(dev_get_by_name);
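
/*
 * Illustrative sketch: a refcounted lookup usable from process context
 * without RTNL or RCU. The example_* name is hypothetical.
 */
#if 0
static int example_get_mtu(struct net *net, const char *ifname)
{
	struct net_device *dev;
	int mtu;

	dev = dev_get_by_name(net, ifname);
	if (!dev)
		return -ENODEV;
	mtu = dev->mtu;
	dev_put(dev);	/* drop the reference taken by dev_get_by_name() */
	return mtu;
}
#endif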
788
789/**
790 * __dev_get_by_index - find a device by its ifindex
791 * @net: the applicable net namespace
792 * @ifindex: index of device
793 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or %NULL if it is not found. The device has not had its reference
 *	counter increased, so the caller must be careful about locking.
 *	The caller must hold either the RTNL semaphore or @dev_base_lock.
799 */
800
801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802{
803 struct net_device *dev;
804 struct hlist_head *head = dev_index_hash(net, ifindex);
805
806 hlist_for_each_entry(dev, head, index_hlist)
807 if (dev->ifindex == ifindex)
808 return dev;
809
810 return NULL;
811}
812EXPORT_SYMBOL(__dev_get_by_index);
813
814/**
815 * dev_get_by_index_rcu - find a device by its ifindex
816 * @net: the applicable net namespace
817 * @ifindex: index of device
818 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or %NULL if it is not found. The device has not had its reference
 *	counter increased, so the caller must be careful about locking.
 *	The caller must hold the RCU lock.
823 */
824
825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826{
827 struct net_device *dev;
828 struct hlist_head *head = dev_index_hash(net, ifindex);
829
830 hlist_for_each_entry_rcu(dev, head, index_hlist)
831 if (dev->ifindex == ifindex)
832 return dev;
833
834 return NULL;
835}
836EXPORT_SYMBOL(dev_get_by_index_rcu);
837
838
839/**
840 * dev_get_by_index - find a device by its ifindex
841 * @net: the applicable net namespace
842 * @ifindex: index of device
843 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or NULL if it is not found. The returned device has had a reference
 *	added and the pointer is safe until the user calls dev_put() to
 *	indicate they have finished with it.
848 */
849
850struct net_device *dev_get_by_index(struct net *net, int ifindex)
851{
852 struct net_device *dev;
853
854 rcu_read_lock();
855 dev = dev_get_by_index_rcu(net, ifindex);
856 if (dev)
857 dev_hold(dev);
858 rcu_read_unlock();
859 return dev;
860}
861EXPORT_SYMBOL(dev_get_by_index);
862
863/**
864 * netdev_get_name - get a netdevice name, knowing its ifindex.
865 * @net: network namespace
866 * @name: a pointer to the buffer where the name will be stored.
867 * @ifindex: the ifindex of the interface to get the name from.
868 *
869 * The use of raw_seqcount_begin() and cond_resched() before
870 * retrying is required as we want to give the writers a chance
871 * to complete when CONFIG_PREEMPT is not set.
872 */
873int netdev_get_name(struct net *net, char *name, int ifindex)
874{
875 struct net_device *dev;
876 unsigned int seq;
877
878retry:
879 seq = raw_seqcount_begin(&devnet_rename_seq);
880 rcu_read_lock();
881 dev = dev_get_by_index_rcu(net, ifindex);
882 if (!dev) {
883 rcu_read_unlock();
884 return -ENODEV;
885 }
886
887 strcpy(name, dev->name);
888 rcu_read_unlock();
889 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 cond_resched();
891 goto retry;
892 }
893
894 return 0;
895}
896
897/**
898 * dev_getbyhwaddr_rcu - find a device by its hardware address
899 * @net: the applicable net namespace
900 * @type: media type of device
901 * @ha: hardware address
902 *
 *	Search for an interface by MAC address. Returns a pointer to the
 *	device, or NULL if it is not found.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its reference count increased,
 *	so the caller must be careful about locking.
909 */
910
911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 const char *ha)
913{
914 struct net_device *dev;
915
916 for_each_netdev_rcu(net, dev)
917 if (dev->type == type &&
918 !memcmp(dev->dev_addr, ha, dev->addr_len))
919 return dev;
920
921 return NULL;
922}
923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
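
/*
 * Illustrative sketch: the returned pointer is only valid inside the RCU
 * read-side critical section. The example_* name is hypothetical.
 */
#if 0
static bool example_hwaddr_in_use(struct net *net, const char *ha)
{
	bool in_use;

	rcu_read_lock();
	in_use = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, ha) != NULL;
	rcu_read_unlock();
	return in_use;
}
#endif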
924
925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926{
927 struct net_device *dev;
928
929 ASSERT_RTNL();
930 for_each_netdev(net, dev)
931 if (dev->type == type)
932 return dev;
933
934 return NULL;
935}
936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937
938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939{
940 struct net_device *dev, *ret = NULL;
941
942 rcu_read_lock();
943 for_each_netdev_rcu(net, dev)
944 if (dev->type == type) {
945 dev_hold(dev);
946 ret = dev;
947 break;
948 }
949 rcu_read_unlock();
950 return ret;
951}
952EXPORT_SYMBOL(dev_getfirstbyhwtype);
953
954/**
955 * __dev_get_by_flags - find any device with given flags
956 * @net: the applicable net namespace
957 * @if_flags: IFF_* values
958 * @mask: bitmask of bits in if_flags to check
959 *
 *	Search for any interface with the given flags. Returns a pointer to
 *	the first matching device, or NULL if none is found. Must be called
 *	inside rtnl_lock(), and the result's refcount is unchanged.
963 */
964
965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 unsigned short mask)
967{
968 struct net_device *dev, *ret;
969
970 ASSERT_RTNL();
971
972 ret = NULL;
973 for_each_netdev(net, dev) {
974 if (((dev->flags ^ if_flags) & mask) == 0) {
975 ret = dev;
976 break;
977 }
978 }
979 return ret;
980}
981EXPORT_SYMBOL(__dev_get_by_flags);
982
983/**
984 * dev_valid_name - check if name is okay for network device
985 * @name: name string
986 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work. We also disallow any kind of
 *	whitespace.
990 */
991bool dev_valid_name(const char *name)
992{
993 if (*name == '\0')
994 return false;
995 if (strlen(name) >= IFNAMSIZ)
996 return false;
997 if (!strcmp(name, ".") || !strcmp(name, ".."))
998 return false;
999
1000 while (*name) {
1001 if (*name == '/' || *name == ':' || isspace(*name))
1002 return false;
1003 name++;
1004 }
1005 return true;
1006}
1007EXPORT_SYMBOL(dev_valid_name);
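
/*
 * For example, "eth0" and "wan-uplink" are accepted, while "", ".", "..",
 * "a/b", "a:b", "my eth" (whitespace) and any name of IFNAMSIZ or more
 * characters are rejected.
 */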
1008
1009/**
1010 * __dev_alloc_name - allocate a name for a device
1011 * @net: network namespace to allocate the device name in
1012 * @name: name format string
1013 * @buf: scratch buffer and result name string
1014 *
 *	Passed a format string - eg "lt%d" - it will try to find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
1017 * the first empty slot. The caller must hold the dev_base or rtnl lock
1018 * while allocating the name and adding the device in order to avoid
1019 * duplicates.
1020 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 * Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025{
1026 int i = 0;
1027 const char *p;
1028 const int max_netdevices = 8*PAGE_SIZE;
1029 unsigned long *inuse;
1030 struct net_device *d;
1031
1032 p = strnchr(name, IFNAMSIZ-1, '%');
1033 if (p) {
1034 /*
		 * Verify the string, as it may have come from
		 * the user. There must be exactly one "%d" and no other "%"
		 * characters.
1038 */
1039 if (p[1] != 'd' || strchr(p + 2, '%'))
1040 return -EINVAL;
1041
1042 /* Use one page as a bit array of possible slots */
1043 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044 if (!inuse)
1045 return -ENOMEM;
1046
1047 for_each_netdev(net, d) {
1048 if (!sscanf(d->name, name, &i))
1049 continue;
1050 if (i < 0 || i >= max_netdevices)
1051 continue;
1052
1053 /* avoid cases where sscanf is not exact inverse of printf */
1054 snprintf(buf, IFNAMSIZ, name, i);
1055 if (!strncmp(buf, d->name, IFNAMSIZ))
1056 set_bit(i, inuse);
1057 }
1058
1059 i = find_first_zero_bit(inuse, max_netdevices);
1060 free_page((unsigned long) inuse);
1061 }
1062
1063 if (buf != name)
1064 snprintf(buf, IFNAMSIZ, name, i);
1065 if (!__dev_get_by_name(net, buf))
1066 return i;
1067
1068 /* It is possible to run out of possible slots
1069 * when the name is long and there isn't enough space left
1070 * for the digits, or if all bits are used.
1071 */
1072 return -ENFILE;
1073}
1074
1075/**
1076 * dev_alloc_name - allocate a name for a device
1077 * @dev: device
1078 * @name: name format string
1079 *
 *	Passed a format string - eg "lt%d" - it will try to find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
1082 * the first empty slot. The caller must hold the dev_base or rtnl lock
1083 * while allocating the name and adding the device in order to avoid
1084 * duplicates.
1085 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 * Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091 char buf[IFNAMSIZ];
1092 struct net *net;
1093 int ret;
1094
1095 BUG_ON(!dev_net(dev));
1096 net = dev_net(dev);
1097 ret = __dev_alloc_name(net, name, buf);
1098 if (ret >= 0)
1099 strlcpy(dev->name, buf, IFNAMSIZ);
1100 return ret;
1101}
1102EXPORT_SYMBOL(dev_alloc_name);
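
/*
 * Illustrative sketch (caller holds the rtnl or dev_base lock): "eth%d"
 * picks the lowest free unit, e.g. it writes "eth3" into dev->name and
 * returns 3, or a negative errno on failure. The example_* name is
 * hypothetical.
 */
#if 0
static int example_pick_name(struct net_device *dev)
{
	return dev_alloc_name(dev, "eth%d");
}
#endif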
1103
1104static int dev_alloc_name_ns(struct net *net,
1105 struct net_device *dev,
1106 const char *name)
1107{
1108 char buf[IFNAMSIZ];
1109 int ret;
1110
1111 ret = __dev_alloc_name(net, name, buf);
1112 if (ret >= 0)
1113 strlcpy(dev->name, buf, IFNAMSIZ);
1114 return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118 struct net_device *dev,
1119 const char *name)
1120{
1121 BUG_ON(!net);
1122
1123 if (!dev_valid_name(name))
1124 return -EINVAL;
1125
1126 if (strchr(name, '%'))
1127 return dev_alloc_name_ns(net, dev, name);
1128 else if (__dev_get_by_name(net, name))
1129 return -EEXIST;
1130 else if (dev->name != name)
1131 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133 return 0;
1134}
1135
1136/**
1137 * dev_change_name - change name of a device
1138 * @dev: device
1139 * @newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141 * Change name of a device, can pass format strings "eth%d".
1142 * for wildcarding.
1143 */
1144int dev_change_name(struct net_device *dev, const char *newname)
1145{
1146 unsigned char old_assign_type;
1147 char oldname[IFNAMSIZ];
1148 int err = 0;
1149 int ret;
1150 struct net *net;
1151
1152 ASSERT_RTNL();
1153 BUG_ON(!dev_net(dev));
1154
1155 net = dev_net(dev);
1156 if (dev->flags & IFF_UP)
1157 return -EBUSY;
1158
1159 write_seqcount_begin(&devnet_rename_seq);
1160
1161 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162 write_seqcount_end(&devnet_rename_seq);
1163 return 0;
1164 }
1165
1166 memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168 err = dev_get_valid_name(net, dev, newname);
1169 if (err < 0) {
1170 write_seqcount_end(&devnet_rename_seq);
1171 return err;
1172 }
1173
1174 if (oldname[0] && !strchr(oldname, '%'))
1175 netdev_info(dev, "renamed from %s\n", oldname);
1176
1177 old_assign_type = dev->name_assign_type;
1178 dev->name_assign_type = NET_NAME_RENAMED;
1179
1180rollback:
1181 ret = device_rename(&dev->dev, dev->name);
1182 if (ret) {
1183 memcpy(dev->name, oldname, IFNAMSIZ);
1184 dev->name_assign_type = old_assign_type;
1185 write_seqcount_end(&devnet_rename_seq);
1186 return ret;
1187 }
1188
1189 write_seqcount_end(&devnet_rename_seq);
1190
1191 netdev_adjacent_rename_links(dev, oldname);
1192
1193 write_lock_bh(&dev_base_lock);
1194 hlist_del_rcu(&dev->name_hlist);
1195 write_unlock_bh(&dev_base_lock);
1196
1197 synchronize_rcu();
1198
1199 write_lock_bh(&dev_base_lock);
1200 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201 write_unlock_bh(&dev_base_lock);
1202
1203 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204 ret = notifier_to_errno(ret);
1205
1206 if (ret) {
1207 /* err >= 0 after dev_alloc_name() or stores the first errno */
1208 if (err >= 0) {
1209 err = ret;
1210 write_seqcount_begin(&devnet_rename_seq);
1211 memcpy(dev->name, oldname, IFNAMSIZ);
1212 memcpy(oldname, newname, IFNAMSIZ);
1213 dev->name_assign_type = old_assign_type;
1214 old_assign_type = NET_NAME_RENAMED;
1215 goto rollback;
1216 } else {
1217 pr_err("%s: name change rollback failed: %d\n",
1218 dev->name, ret);
1219 }
1220 }
1221
1222 return err;
1223}
1224
1225/**
1226 * dev_set_alias - change ifalias of a device
1227 * @dev: device
1228 * @alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from @alias
 *
 *	Set the ifalias for a device.
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
1235 char *new_ifalias;
1236
1237 ASSERT_RTNL();
1238
1239 if (len >= IFALIASZ)
1240 return -EINVAL;
1241
1242 if (!len) {
1243 kfree(dev->ifalias);
1244 dev->ifalias = NULL;
1245 return 0;
1246 }
1247
1248 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 if (!new_ifalias)
1250 return -ENOMEM;
1251 dev->ifalias = new_ifalias;
1252
1253 strlcpy(dev->ifalias, alias, len+1);
1254 return len;
1255}
1256
1257
1258/**
1259 * netdev_features_change - device changes features
1260 * @dev: device to cause notification
1261 *
1262 * Called to indicate a device has changed features.
1263 */
1264void netdev_features_change(struct net_device *dev)
1265{
1266 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267}
1268EXPORT_SYMBOL(netdev_features_change);
1269
1270/**
1271 * netdev_state_change - device changes state
1272 * @dev: device to cause notification
1273 *
1274 * Called to indicate a device has changed state. This function calls
1275 * the notifier chains for netdev_chain and sends a NEWLINK message
1276 * to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280 if (dev->flags & IFF_UP) {
1281 struct netdev_notifier_change_info change_info;
1282
1283 change_info.flags_changed = 0;
1284 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 &change_info.info);
1286 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 }
1288}
1289EXPORT_SYMBOL(netdev_state_change);
1290
1291/**
1292 * netdev_notify_peers - notify network peers about existence of @dev
1293 * @dev: network device
1294 *
1295 * Generate traffic such that interested network peers are aware of
1296 * @dev, such as by generating a gratuitous ARP. This may be used when
1297 * a device wants to inform the rest of the network about some sort of
1298 * reconfiguration such as a failover event or virtual machine
1299 * migration.
1300 */
1301void netdev_notify_peers(struct net_device *dev)
1302{
1303 rtnl_lock();
1304 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305 rtnl_unlock();
1306}
1307EXPORT_SYMBOL(netdev_notify_peers);
1308
1309static int __dev_open(struct net_device *dev)
1310{
1311 const struct net_device_ops *ops = dev->netdev_ops;
1312 int ret;
1313
1314 ASSERT_RTNL();
1315
1316 if (!netif_device_present(dev))
1317 return -ENODEV;
1318
1319 /* Block netpoll from trying to do any rx path servicing.
1320 * If we don't do this there is a chance ndo_poll_controller
1321 * or ndo_poll may be running while we open the device
1322 */
1323 netpoll_poll_disable(dev);
1324
1325 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 ret = notifier_to_errno(ret);
1327 if (ret)
1328 return ret;
1329
1330 set_bit(__LINK_STATE_START, &dev->state);
1331
1332 if (ops->ndo_validate_addr)
1333 ret = ops->ndo_validate_addr(dev);
1334
1335 if (!ret && ops->ndo_open)
1336 ret = ops->ndo_open(dev);
1337
1338 netpoll_poll_enable(dev);
1339
1340 if (ret)
1341 clear_bit(__LINK_STATE_START, &dev->state);
1342 else {
1343 dev->flags |= IFF_UP;
1344 dev_set_rx_mode(dev);
1345 dev_activate(dev);
1346 add_device_randomness(dev->dev_addr, dev->addr_len);
1347 }
1348
1349 return ret;
1350}
1351
1352/**
1353 * dev_open - prepare an interface for use.
1354 * @dev: device to open
1355 *
1356 * Takes a device from down to up state. The device's private open
1357 * function is invoked and then the multicast lists are loaded. Finally
1358 * the device is moved into the up state and a %NETDEV_UP message is
1359 * sent to the netdev notifier chain.
1360 *
1361 * Calling this function on an active interface is a nop. On a failure
1362 * a negative errno code is returned.
1363 */
1364int dev_open(struct net_device *dev)
1365{
1366 int ret;
1367
1368 if (dev->flags & IFF_UP)
1369 return 0;
1370
1371 ret = __dev_open(dev);
1372 if (ret < 0)
1373 return ret;
1374
1375 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378 return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
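
/*
 * Illustrative sketch: bringing an interface up from process context
 * requires the RTNL lock. The example_* name is hypothetical.
 */
#if 0
static int example_bring_up(struct net *net, const char *ifname)
{
	struct net_device *dev;
	int err;

	rtnl_lock();
	dev = __dev_get_by_name(net, ifname);
	err = dev ? dev_open(dev) : -ENODEV;
	rtnl_unlock();
	return err;
}
#endif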
1381
1382static int __dev_close_many(struct list_head *head)
1383{
1384 struct net_device *dev;
1385
1386 ASSERT_RTNL();
1387 might_sleep();
1388
1389 list_for_each_entry(dev, head, close_list) {
1390 /* Temporarily disable netpoll until the interface is down */
1391 netpoll_poll_disable(dev);
1392
1393 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395 clear_bit(__LINK_STATE_START, &dev->state);
1396
		/* Synchronize to the scheduled poll. We cannot touch the
		 * poll list; it may even be on a different CPU. So just
		 * clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
1402 */
1403 smp_mb__after_atomic(); /* Commit netif_running(). */
1404 }
1405
1406 dev_deactivate_many(head);
1407
1408 list_for_each_entry(dev, head, close_list) {
1409 const struct net_device_ops *ops = dev->netdev_ops;
1410
1411 /*
		 *	Call the device-specific close. This cannot fail and
		 *	is only done if the device is UP.
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
1417 */
1418 if (ops->ndo_stop)
1419 ops->ndo_stop(dev);
1420
1421 dev->flags &= ~IFF_UP;
1422 netpoll_poll_enable(dev);
1423 }
1424
1425 return 0;
1426}
1427
1428static int __dev_close(struct net_device *dev)
1429{
1430 int retval;
1431 LIST_HEAD(single);
1432
1433 list_add(&dev->close_list, &single);
1434 retval = __dev_close_many(&single);
1435 list_del(&single);
1436
1437 return retval;
1438}
1439
1440int dev_close_many(struct list_head *head, bool unlink)
1441{
1442 struct net_device *dev, *tmp;
1443
1444 /* Remove the devices that don't need to be closed */
1445 list_for_each_entry_safe(dev, tmp, head, close_list)
1446 if (!(dev->flags & IFF_UP))
1447 list_del_init(&dev->close_list);
1448
1449 __dev_close_many(head);
1450
1451 list_for_each_entry_safe(dev, tmp, head, close_list) {
1452 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454 if (unlink)
1455 list_del_init(&dev->close_list);
1456 }
1457
1458 return 0;
1459}
1460EXPORT_SYMBOL(dev_close_many);
1461
1462/**
1463 * dev_close - shutdown an interface.
1464 * @dev: device to shutdown
1465 *
1466 * This function moves an active device into down state. A
1467 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 * chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
1473 if (dev->flags & IFF_UP) {
1474 LIST_HEAD(single);
1475
1476 list_add(&dev->close_list, &single);
1477 dev_close_many(&single, true);
1478 list_del(&single);
1479 }
1480 return 0;
1481}
1482EXPORT_SYMBOL(dev_close);
1483
1484
1485/**
1486 * dev_disable_lro - disable Large Receive Offload on a device
1487 * @dev: device
1488 *
1489 * Disable Large Receive Offload (LRO) on a net device. Must be
1490 * called under RTNL. This is needed if received packets may be
1491 * forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
1495 struct net_device *lower_dev;
1496 struct list_head *iter;
1497
1498 dev->wanted_features &= ~NETIF_F_LRO;
1499 netdev_update_features(dev);
1500
1501 if (unlikely(dev->features & NETIF_F_LRO))
1502 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504 netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 dev_disable_lro(lower_dev);
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
1509static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 struct net_device *dev)
1511{
1512 struct netdev_notifier_info info;
1513
1514 netdev_notifier_info_init(&info, dev);
1515 return nb->notifier_call(nb, val, &info);
1516}
1517
1518static int dev_boot_phase = 1;
1519
1520/**
1521 * register_netdevice_notifier - register a network notifier block
1522 * @nb: notifier
1523 *
1524 * Register a notifier to be called when network device events occur.
1525 * The notifier passed is linked into the kernel structures and must
1526 * not be reused until it has been unregistered. A negative errno code
1527 * is returned on a failure.
1528 *
 *	When registered, all registration and up events are replayed
 *	to the new notifier so that it has a race-free view of the
 *	network device list.
1532 */
1533
1534int register_netdevice_notifier(struct notifier_block *nb)
1535{
1536 struct net_device *dev;
1537 struct net_device *last;
1538 struct net *net;
1539 int err;
1540
1541 rtnl_lock();
1542 err = raw_notifier_chain_register(&netdev_chain, nb);
1543 if (err)
1544 goto unlock;
1545 if (dev_boot_phase)
1546 goto unlock;
1547 for_each_net(net) {
1548 for_each_netdev(net, dev) {
1549 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550 err = notifier_to_errno(err);
1551 if (err)
1552 goto rollback;
1553
1554 if (!(dev->flags & IFF_UP))
1555 continue;
1556
1557 call_netdevice_notifier(nb, NETDEV_UP, dev);
1558 }
1559 }
1560
1561unlock:
1562 rtnl_unlock();
1563 return err;
1564
1565rollback:
1566 last = dev;
1567 for_each_net(net) {
1568 for_each_netdev(net, dev) {
1569 if (dev == last)
1570 goto outroll;
1571
1572 if (dev->flags & IFF_UP) {
1573 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574 dev);
1575 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576 }
1577 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578 }
1579 }
1580
1581outroll:
1582 raw_notifier_chain_unregister(&netdev_chain, nb);
1583 goto unlock;
1584}
1585EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587/**
1588 * unregister_netdevice_notifier - unregister a network notifier block
1589 * @nb: notifier
1590 *
1591 * Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
1594 * is returned on a failure.
1595 *
 *	After unregistering, unregister and down device events are synthesized
 *	for all devices on the device list and delivered to the removed
 *	notifier, removing the need for special-case cleanup code.
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
1603 struct net_device *dev;
1604 struct net *net;
1605 int err;
1606
1607 rtnl_lock();
1608 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609 if (err)
1610 goto unlock;
1611
1612 for_each_net(net) {
1613 for_each_netdev(net, dev) {
1614 if (dev->flags & IFF_UP) {
1615 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 dev);
1617 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 }
1619 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620 }
1621 }
1622unlock:
1623 rtnl_unlock();
1624 return err;
1625}
1626EXPORT_SYMBOL(unregister_netdevice_notifier);
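
/*
 * Illustrative sketch of a notifier user; the example_* names are
 * hypothetical. The block would be registered from module init with
 * register_netdevice_notifier() and removed from module exit with
 * unregister_netdevice_notifier().
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s: up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s: going down\n", dev->name);
		break;
	default:
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};
#endif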
1627
1628/**
1629 * call_netdevice_notifiers_info - call all network notifier blocks
1630 * @val: value passed unmodified to notifier function
1631 * @dev: net_device pointer passed unmodified to notifier function
1632 * @info: notifier information data
1633 *
1634 * Call all network notifier blocks. Parameters and return value
1635 * are as for raw_notifier_call_chain().
1636 */
1637
1638static int call_netdevice_notifiers_info(unsigned long val,
1639 struct net_device *dev,
1640 struct netdev_notifier_info *info)
1641{
1642 ASSERT_RTNL();
1643 netdev_notifier_info_init(info, dev);
1644 return raw_notifier_call_chain(&netdev_chain, val, info);
1645}
1646
1647/**
1648 * call_netdevice_notifiers - call all network notifier blocks
1649 * @val: value passed unmodified to notifier function
1650 * @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 * Call all network notifier blocks. Parameters and return value
1653 * are as for raw_notifier_call_chain().
1654 */
1655
1656int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657{
1658 struct netdev_notifier_info info;
1659
1660 return call_netdevice_notifiers_info(val, dev, &info);
1661}
1662EXPORT_SYMBOL(call_netdevice_notifiers);
1663
1664#ifdef CONFIG_NET_INGRESS
1665static struct static_key ingress_needed __read_mostly;
1666
1667void net_inc_ingress_queue(void)
1668{
1669 static_key_slow_inc(&ingress_needed);
1670}
1671EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672
1673void net_dec_ingress_queue(void)
1674{
1675 static_key_slow_dec(&ingress_needed);
1676}
1677EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678#endif
1679
1680#ifdef CONFIG_NET_EGRESS
1681static struct static_key egress_needed __read_mostly;
1682
1683void net_inc_egress_queue(void)
1684{
1685 static_key_slow_inc(&egress_needed);
1686}
1687EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688
1689void net_dec_egress_queue(void)
1690{
1691 static_key_slow_dec(&egress_needed);
1692}
1693EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694#endif
1695
1696static struct static_key netstamp_needed __read_mostly;
1697#ifdef HAVE_JUMP_LABEL
1698static atomic_t netstamp_needed_deferred;
1699static atomic_t netstamp_wanted;
1700static void netstamp_clear(struct work_struct *work)
1701{
1702 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1703 int wanted;
1704
1705 wanted = atomic_add_return(deferred, &netstamp_wanted);
1706 if (wanted > 0)
1707 static_key_enable(&netstamp_needed);
1708 else
1709 static_key_disable(&netstamp_needed);
1710}
1711static DECLARE_WORK(netstamp_work, netstamp_clear);
1712#endif
1713
1714void net_enable_timestamp(void)
1715{
1716#ifdef HAVE_JUMP_LABEL
1717 int wanted;
1718
1719 while (1) {
1720 wanted = atomic_read(&netstamp_wanted);
1721 if (wanted <= 0)
1722 break;
1723 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1724 return;
1725 }
1726 atomic_inc(&netstamp_needed_deferred);
1727 schedule_work(&netstamp_work);
1728#else
1729 static_key_slow_inc(&netstamp_needed);
1730#endif
1731}
1732EXPORT_SYMBOL(net_enable_timestamp);
1733
1734void net_disable_timestamp(void)
1735{
1736#ifdef HAVE_JUMP_LABEL
1737 int wanted;
1738
1739 while (1) {
1740 wanted = atomic_read(&netstamp_wanted);
1741 if (wanted <= 1)
1742 break;
1743 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1744 return;
1745 }
1746 atomic_dec(&netstamp_needed_deferred);
1747 schedule_work(&netstamp_work);
1748#else
1749 static_key_slow_dec(&netstamp_needed);
1750#endif
1751}
1752EXPORT_SYMBOL(net_disable_timestamp);
1753
1754static inline void net_timestamp_set(struct sk_buff *skb)
1755{
1756 skb->tstamp = 0;
1757 if (static_key_false(&netstamp_needed))
1758 __net_timestamp(skb);
1759}
1760
1761#define net_timestamp_check(COND, SKB) \
1762 if (static_key_false(&netstamp_needed)) { \
1763 if ((COND) && !(SKB)->tstamp) \
1764 __net_timestamp(SKB); \
	}
1766
1767bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1768{
1769 unsigned int len;
1770
1771 if (!(dev->flags & IFF_UP))
1772 return false;
1773
1774 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1775 if (skb->len <= len)
1776 return true;
1777
	/* if TSO is enabled, we don't care about the length as the packet
	 * may be forwarded without being segmented first
	 */
1781 if (skb_is_gso(skb))
1782 return true;
1783
1784 return false;
1785}
1786EXPORT_SYMBOL_GPL(is_skb_forwardable);
1787
1788int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1789{
1790 int ret = ____dev_forward_skb(dev, skb);
1791
1792 if (likely(!ret)) {
1793 skb->protocol = eth_type_trans(skb, dev);
1794 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1795 }
1796
1797 return ret;
1798}
1799EXPORT_SYMBOL_GPL(__dev_forward_skb);
1800
1801/**
1802 * dev_forward_skb - loopback an skb to another netif
1803 *
1804 * @dev: destination network device
1805 * @skb: buffer to forward
1806 *
1807 * return values:
1808 * NET_RX_SUCCESS (no congestion)
1809 * NET_RX_DROP (packet was dropped, but freed)
1810 *
1811 * dev_forward_skb can be used for injecting an skb from the
1812 * start_xmit function of one device into the receive queue
1813 * of another device.
1814 *
1815 * The receiving device may be in another namespace, so
1816 * we have to clear all information in the skb that could
1817 * impact namespace isolation.
1818 */
1819int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1820{
1821 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1822}
1823EXPORT_SYMBOL_GPL(dev_forward_skb);
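
/* Illustrative sketch (hypothetical pair device, in the spirit of veth):
 * a virtual driver's ndo_start_xmit can hand the frame straight to its
 * peer's receive path with dev_forward_skb(); "struct foo_priv" and its
 * "peer" pointer are assumptions.
 *
 *	static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */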
1824
1825static inline int deliver_skb(struct sk_buff *skb,
1826 struct packet_type *pt_prev,
1827 struct net_device *orig_dev)
1828{
1829 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1830 return -ENOMEM;
1831 atomic_inc(&skb->users);
1832 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1833}
1834
1835static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1836 struct packet_type **pt,
1837 struct net_device *orig_dev,
1838 __be16 type,
1839 struct list_head *ptype_list)
1840{
1841 struct packet_type *ptype, *pt_prev = *pt;
1842
1843 list_for_each_entry_rcu(ptype, ptype_list, list) {
1844 if (ptype->type != type)
1845 continue;
1846 if (pt_prev)
1847 deliver_skb(skb, pt_prev, orig_dev);
1848 pt_prev = ptype;
1849 }
1850 *pt = pt_prev;
1851}
1852
1853static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1854{
1855 if (!ptype->af_packet_priv || !skb->sk)
1856 return false;
1857
1858 if (ptype->id_match)
1859 return ptype->id_match(ptype, skb->sk);
1860 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1861 return true;
1862
1863 return false;
1864}
1865
1866/*
1867 * Support routine. Sends outgoing frames to any network
1868 * taps currently in use.
1869 */
1870
1871void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1872{
1873 struct packet_type *ptype;
1874 struct sk_buff *skb2 = NULL;
1875 struct packet_type *pt_prev = NULL;
1876 struct list_head *ptype_list = &ptype_all;
1877
1878 rcu_read_lock();
1879again:
1880 list_for_each_entry_rcu(ptype, ptype_list, list) {
1881 /* Never send packets back to the socket
1882 * they originated from - MvS (miquels@drinkel.ow.org)
1883 */
1884 if (skb_loop_sk(ptype, skb))
1885 continue;
1886
1887 if (pt_prev) {
1888 deliver_skb(skb2, pt_prev, skb->dev);
1889 pt_prev = ptype;
1890 continue;
1891 }
1892
1893 /* need to clone skb, done only once */
1894 skb2 = skb_clone(skb, GFP_ATOMIC);
1895 if (!skb2)
1896 goto out_unlock;
1897
1898 net_timestamp_set(skb2);
1899
		/* skb->network_header should be correctly set by the
		 * sender, so that the second statement is just protection
		 * against buggy protocols.
		 */
1904 skb_reset_mac_header(skb2);
1905
1906 if (skb_network_header(skb2) < skb2->data ||
1907 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1908 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1909 ntohs(skb2->protocol),
1910 dev->name);
1911 skb_reset_network_header(skb2);
1912 }
1913
1914 skb2->transport_header = skb2->network_header;
1915 skb2->pkt_type = PACKET_OUTGOING;
1916 pt_prev = ptype;
1917 }
1918
1919 if (ptype_list == &ptype_all) {
1920 ptype_list = &dev->ptype_all;
1921 goto again;
1922 }
1923out_unlock:
1924 if (pt_prev)
1925 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1926 rcu_read_unlock();
1927}
1928EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1929
1930/**
1931 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1932 * @dev: Network device
1933 * @txq: number of queues available
1934 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not, zero the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case, if
 * TC0 is invalid nothing can be done, so disable priority mappings.
 * It is expected that drivers will fix this mapping if they can
 * before calling netif_set_real_num_tx_queues.
1942 */
1943static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944{
1945 int i;
1946 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1947
1948 /* If TC0 is invalidated disable TC mapping */
1949 if (tc->offset + tc->count > txq) {
1950 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1951 dev->num_tc = 0;
1952 return;
1953 }
1954
1955 /* Invalidated prio to tc mappings set to TC0 */
1956 for (i = 1; i < TC_BITMASK + 1; i++) {
1957 int q = netdev_get_prio_tc_map(dev, i);
1958
1959 tc = &dev->tc_to_txq[q];
1960 if (tc->offset + tc->count > txq) {
1961 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1962 i, q);
1963 netdev_set_prio_tc_map(dev, i, 0);
1964 }
1965 }
1966}
1967
1968int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1969{
1970 if (dev->num_tc) {
1971 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972 int i;
1973
1974 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1975 if ((txq - tc->offset) < tc->count)
1976 return i;
1977 }
1978
1979 return -1;
1980 }
1981
1982 return 0;
1983}
1984
1985#ifdef CONFIG_XPS
1986static DEFINE_MUTEX(xps_map_mutex);
1987#define xmap_dereference(P) \
1988 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1989
1990static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1991 int tci, u16 index)
1992{
1993 struct xps_map *map = NULL;
1994 int pos;
1995
1996 if (dev_maps)
1997 map = xmap_dereference(dev_maps->cpu_map[tci]);
1998 if (!map)
1999 return false;
2000
2001 for (pos = map->len; pos--;) {
2002 if (map->queues[pos] != index)
2003 continue;
2004
2005 if (map->len > 1) {
2006 map->queues[pos] = map->queues[--map->len];
2007 break;
2008 }
2009
2010 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2011 kfree_rcu(map, rcu);
2012 return false;
2013 }
2014
2015 return true;
2016}
2017
2018static bool remove_xps_queue_cpu(struct net_device *dev,
2019 struct xps_dev_maps *dev_maps,
2020 int cpu, u16 offset, u16 count)
2021{
2022 int num_tc = dev->num_tc ? : 1;
2023 bool active = false;
2024 int tci;
2025
2026 for (tci = cpu * num_tc; num_tc--; tci++) {
2027 int i, j;
2028
2029 for (i = count, j = offset; i--; j++) {
			if (!remove_xps_queue(dev_maps, tci, j))
2031 break;
2032 }
2033
2034 active |= i < 0;
2035 }
2036
2037 return active;
2038}
2039
2040static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2041 u16 count)
2042{
2043 struct xps_dev_maps *dev_maps;
2044 int cpu, i;
2045 bool active = false;
2046
2047 mutex_lock(&xps_map_mutex);
2048 dev_maps = xmap_dereference(dev->xps_maps);
2049
2050 if (!dev_maps)
2051 goto out_no_maps;
2052
2053 for_each_possible_cpu(cpu)
2054 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2055 offset, count);
2056
2057 if (!active) {
2058 RCU_INIT_POINTER(dev->xps_maps, NULL);
2059 kfree_rcu(dev_maps, rcu);
2060 }
2061
2062 for (i = offset + (count - 1); count--; i--)
2063 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2064 NUMA_NO_NODE);
2065
2066out_no_maps:
2067 mutex_unlock(&xps_map_mutex);
2068}
2069
2070static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2071{
2072 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2073}
2074
2075static struct xps_map *expand_xps_map(struct xps_map *map,
2076 int cpu, u16 index)
2077{
2078 struct xps_map *new_map;
2079 int alloc_len = XPS_MIN_MAP_ALLOC;
2080 int i, pos;
2081
2082 for (pos = 0; map && pos < map->len; pos++) {
2083 if (map->queues[pos] != index)
2084 continue;
2085 return map;
2086 }
2087
2088 /* Need to add queue to this CPU's existing map */
2089 if (map) {
2090 if (pos < map->alloc_len)
2091 return map;
2092
2093 alloc_len = map->alloc_len * 2;
2094 }
2095
	/* Need to allocate a new map to store the queue on this CPU */
2097 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2098 cpu_to_node(cpu));
2099 if (!new_map)
2100 return NULL;
2101
2102 for (i = 0; i < pos; i++)
2103 new_map->queues[i] = map->queues[i];
2104 new_map->alloc_len = alloc_len;
2105 new_map->len = pos;
2106
2107 return new_map;
2108}
2109
2110int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2111 u16 index)
2112{
2113 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2114 int i, cpu, tci, numa_node_id = -2;
2115 int maps_sz, num_tc = 1, tc = 0;
2116 struct xps_map *map, *new_map;
2117 bool active = false;
2118
2119 if (dev->num_tc) {
2120 num_tc = dev->num_tc;
2121 tc = netdev_txq_to_tc(dev, index);
2122 if (tc < 0)
2123 return -EINVAL;
2124 }
2125
2126 maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2127 if (maps_sz < L1_CACHE_BYTES)
2128 maps_sz = L1_CACHE_BYTES;
2129
2130 mutex_lock(&xps_map_mutex);
2131
2132 dev_maps = xmap_dereference(dev->xps_maps);
2133
2134 /* allocate memory for queue storage */
2135 for_each_cpu_and(cpu, cpu_online_mask, mask) {
2136 if (!new_dev_maps)
2137 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2138 if (!new_dev_maps) {
2139 mutex_unlock(&xps_map_mutex);
2140 return -ENOMEM;
2141 }
2142
2143 tci = cpu * num_tc + tc;
2144 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2145 NULL;
2146
2147 map = expand_xps_map(map, cpu, index);
2148 if (!map)
2149 goto error;
2150
2151 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2152 }
2153
2154 if (!new_dev_maps)
2155 goto out_no_new_maps;
2156
2157 for_each_possible_cpu(cpu) {
2158 /* copy maps belonging to foreign traffic classes */
2159 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2160 /* fill in the new device map from the old device map */
2161 map = xmap_dereference(dev_maps->cpu_map[tci]);
2162 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163 }
2164
		/* We need to explicitly update tci as the previous loop
		 * could break out early if dev_maps is NULL.
		 */
2168 tci = cpu * num_tc + tc;
2169
2170 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2171 /* add queue to CPU maps */
2172 int pos = 0;
2173
2174 map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2175 while ((pos < map->len) && (map->queues[pos] != index))
2176 pos++;
2177
2178 if (pos == map->len)
2179 map->queues[map->len++] = index;
2180#ifdef CONFIG_NUMA
2181 if (numa_node_id == -2)
2182 numa_node_id = cpu_to_node(cpu);
2183 else if (numa_node_id != cpu_to_node(cpu))
2184 numa_node_id = -1;
2185#endif
2186 } else if (dev_maps) {
2187 /* fill in the new device map from the old device map */
2188 map = xmap_dereference(dev_maps->cpu_map[tci]);
2189 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2190 }
2191
2192 /* copy maps belonging to foreign traffic classes */
2193 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2194 /* fill in the new device map from the old device map */
2195 map = xmap_dereference(dev_maps->cpu_map[tci]);
2196 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2197 }
2198 }
2199
2200 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2201
2202 /* Cleanup old maps */
2203 if (!dev_maps)
2204 goto out_no_old_maps;
2205
2206 for_each_possible_cpu(cpu) {
2207 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2208 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2209 map = xmap_dereference(dev_maps->cpu_map[tci]);
2210 if (map && map != new_map)
2211 kfree_rcu(map, rcu);
2212 }
2213 }
2214
2215 kfree_rcu(dev_maps, rcu);
2216
2217out_no_old_maps:
2218 dev_maps = new_dev_maps;
2219 active = true;
2220
2221out_no_new_maps:
2222 /* update Tx queue numa node */
2223 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2224 (numa_node_id >= 0) ? numa_node_id :
2225 NUMA_NO_NODE);
2226
2227 if (!dev_maps)
2228 goto out_no_maps;
2229
2230 /* removes queue from unused CPUs */
2231 for_each_possible_cpu(cpu) {
2232 for (i = tc, tci = cpu * num_tc; i--; tci++)
2233 active |= remove_xps_queue(dev_maps, tci, index);
2234 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2235 active |= remove_xps_queue(dev_maps, tci, index);
2236 for (i = num_tc - tc, tci++; --i; tci++)
2237 active |= remove_xps_queue(dev_maps, tci, index);
2238 }
2239
2240 /* free map if not active */
2241 if (!active) {
2242 RCU_INIT_POINTER(dev->xps_maps, NULL);
2243 kfree_rcu(dev_maps, rcu);
2244 }
2245
2246out_no_maps:
2247 mutex_unlock(&xps_map_mutex);
2248
2249 return 0;
2250error:
2251 /* remove any maps that we added */
2252 for_each_possible_cpu(cpu) {
2253 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2254 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2255 map = dev_maps ?
2256 xmap_dereference(dev_maps->cpu_map[tci]) :
2257 NULL;
2258 if (new_map && new_map != map)
2259 kfree(new_map);
2260 }
2261 }
2262
2263 mutex_unlock(&xps_map_mutex);
2264
2265 kfree(new_dev_maps);
2266 return -ENOMEM;
2267}
2268EXPORT_SYMBOL(netif_set_xps_queue);
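
/* Illustrative sketch (hypothetical driver): a multiqueue driver can seed a
 * 1:1 queue-to-CPU XPS affinity at setup time, assuming it allocated one TX
 * queue per online CPU ("priv->num_txq" is an assumption):
 *
 *	int qid;
 *
 *	for (qid = 0; qid < priv->num_txq; qid++)
 *		netif_set_xps_queue(dev, cpumask_of(qid), qid);
 *
 * The same mapping can also be changed from user space by writing a CPU
 * mask to /sys/class/net/<iface>/queues/tx-<n>/xps_cpus.
 */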
2269
2270#endif
2271void netdev_reset_tc(struct net_device *dev)
2272{
2273#ifdef CONFIG_XPS
2274 netif_reset_xps_queues_gt(dev, 0);
2275#endif
2276 dev->num_tc = 0;
2277 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2278 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2279}
2280EXPORT_SYMBOL(netdev_reset_tc);
2281
2282int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2283{
2284 if (tc >= dev->num_tc)
2285 return -EINVAL;
2286
2287#ifdef CONFIG_XPS
2288 netif_reset_xps_queues(dev, offset, count);
2289#endif
2290 dev->tc_to_txq[tc].count = count;
2291 dev->tc_to_txq[tc].offset = offset;
2292 return 0;
2293}
2294EXPORT_SYMBOL(netdev_set_tc_queue);
2295
2296int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2297{
2298 if (num_tc > TC_MAX_QUEUE)
2299 return -EINVAL;
2300
2301#ifdef CONFIG_XPS
2302 netif_reset_xps_queues_gt(dev, 0);
2303#endif
2304 dev->num_tc = num_tc;
2305 return 0;
2306}
2307EXPORT_SYMBOL(netdev_set_num_tc);
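
/* Illustrative sketch (hypothetical layout): carving eight TX queues into
 * two traffic classes, TC0 on queues 0-3 and TC1 on queues 4-7, and
 * steering priorities 0-3 to TC0 and 4-7 to TC1. A driver would typically
 * do this from its ndo_setup_tc() handler.
 *
 *	int prio;
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);
 *	netdev_set_tc_queue(dev, 1, 4, 4);
 *	for (prio = 0; prio < 8; prio++)
 *		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
 */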
2308
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
2313int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2314{
2315 int rc;
2316
2317 if (txq < 1 || txq > dev->num_tx_queues)
2318 return -EINVAL;
2319
2320 if (dev->reg_state == NETREG_REGISTERED ||
2321 dev->reg_state == NETREG_UNREGISTERING) {
2322 ASSERT_RTNL();
2323
2324 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2325 txq);
2326 if (rc)
2327 return rc;
2328
2329 if (dev->num_tc)
2330 netif_setup_tc(dev, txq);
2331
2332 if (txq < dev->real_num_tx_queues) {
2333 qdisc_reset_all_tx_gt(dev, txq);
2334#ifdef CONFIG_XPS
2335 netif_reset_xps_queues_gt(dev, txq);
2336#endif
2337 }
2338 }
2339
2340 dev->real_num_tx_queues = txq;
2341 return 0;
2342}
2343EXPORT_SYMBOL(netif_set_real_num_tx_queues);
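
/* Illustrative sketch (hypothetical reconfiguration path): a driver
 * resizing its channel set on an already registered device must hold RTNL
 * (ethtool's set_channels path already does) and should propagate errors:
 *
 *	err = netif_set_real_num_tx_queues(dev, new_txq);
 *	if (err)
 *		return err;
 *	err = netif_set_real_num_rx_queues(dev, new_rxq);
 *	if (err)
 *		return err;
 */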
2344
2345#ifdef CONFIG_SYSFS
2346/**
2347 * netif_set_real_num_rx_queues - set actual number of RX queues used
2348 * @dev: Network device
2349 * @rxq: Actual number of RX queues
2350 *
2351 * This must be called either with the rtnl_lock held or before
2352 * registration of the net device. Returns 0 on success, or a
2353 * negative error code. If called before registration, it always
2354 * succeeds.
2355 */
2356int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2357{
2358 int rc;
2359
2360 if (rxq < 1 || rxq > dev->num_rx_queues)
2361 return -EINVAL;
2362
2363 if (dev->reg_state == NETREG_REGISTERED) {
2364 ASSERT_RTNL();
2365
2366 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2367 rxq);
2368 if (rc)
2369 return rc;
2370 }
2371
2372 dev->real_num_rx_queues = rxq;
2373 return 0;
2374}
2375EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2376#endif
2377
2378/**
2379 * netif_get_num_default_rss_queues - default number of RSS queues
2380 *
2381 * This routine should set an upper limit on the number of RSS queues
2382 * used by default by multiqueue devices.
2383 */
2384int netif_get_num_default_rss_queues(void)
2385{
2386 return is_kdump_kernel() ?
2387 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2388}
2389EXPORT_SYMBOL(netif_get_num_default_rss_queues);
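
/* Illustrative sketch (hypothetical cap): drivers usually clamp their
 * default channel count with this helper instead of blindly creating one
 * queue per CPU; "FOO_MAX_QUEUES" is an assumption.
 *
 *	nr_queues = min_t(int, netif_get_num_default_rss_queues(),
 *			  FOO_MAX_QUEUES);
 */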
2390
2391static void __netif_reschedule(struct Qdisc *q)
2392{
2393 struct softnet_data *sd;
2394 unsigned long flags;
2395
2396 local_irq_save(flags);
2397 sd = this_cpu_ptr(&softnet_data);
2398 q->next_sched = NULL;
2399 *sd->output_queue_tailp = q;
2400 sd->output_queue_tailp = &q->next_sched;
2401 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2402 local_irq_restore(flags);
2403}
2404
2405void __netif_schedule(struct Qdisc *q)
2406{
2407 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2408 __netif_reschedule(q);
2409}
2410EXPORT_SYMBOL(__netif_schedule);
2411
2412struct dev_kfree_skb_cb {
2413 enum skb_free_reason reason;
2414};
2415
2416static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2417{
2418 return (struct dev_kfree_skb_cb *)skb->cb;
2419}
2420
2421void netif_schedule_queue(struct netdev_queue *txq)
2422{
2423 rcu_read_lock();
2424 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2425 struct Qdisc *q = rcu_dereference(txq->qdisc);
2426
2427 __netif_schedule(q);
2428 }
2429 rcu_read_unlock();
2430}
2431EXPORT_SYMBOL(netif_schedule_queue);
2432
2433/**
2434 * netif_wake_subqueue - allow sending packets on subqueue
2435 * @dev: network device
2436 * @queue_index: sub queue index
2437 *
2438 * Resume individual transmit queue of a device with multiple transmit queues.
2439 */
2440void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2441{
2442 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2443
2444 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2445 struct Qdisc *q;
2446
2447 rcu_read_lock();
2448 q = rcu_dereference(txq->qdisc);
2449 __netif_schedule(q);
2450 rcu_read_unlock();
2451 }
2452}
2453EXPORT_SYMBOL(netif_wake_subqueue);
2454
2455void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2456{
2457 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2458 struct Qdisc *q;
2459
2460 rcu_read_lock();
2461 q = rcu_dereference(dev_queue->qdisc);
2462 __netif_schedule(q);
2463 rcu_read_unlock();
2464 }
2465}
2466EXPORT_SYMBOL(netif_tx_wake_queue);
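
/* Illustrative sketch (hypothetical ring accounting): a TX completion
 * handler typically restarts a queue it stopped earlier once enough
 * descriptors have been reclaimed; "foo_ring_unused" and "ring" are
 * assumptions.
 *
 *	struct netdev_queue *txq = netdev_get_tx_queue(dev, ring->index);
 *
 *	if (netif_tx_queue_stopped(txq) &&
 *	    foo_ring_unused(ring) >= MAX_SKB_FRAGS + 1)
 *		netif_tx_wake_queue(txq);
 */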
2467
2468void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2469{
2470 unsigned long flags;
2471
2472 if (likely(atomic_read(&skb->users) == 1)) {
2473 smp_rmb();
2474 atomic_set(&skb->users, 0);
2475 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2476 return;
2477 }
2478 get_kfree_skb_cb(skb)->reason = reason;
2479 local_irq_save(flags);
2480 skb->next = __this_cpu_read(softnet_data.completion_queue);
2481 __this_cpu_write(softnet_data.completion_queue, skb);
2482 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2483 local_irq_restore(flags);
2484}
2485EXPORT_SYMBOL(__dev_kfree_skb_irq);
2486
2487void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2488{
2489 if (in_irq() || irqs_disabled())
2490 __dev_kfree_skb_irq(skb, reason);
2491 else
2492 dev_kfree_skb(skb);
2493}
2494EXPORT_SYMBOL(__dev_kfree_skb_any);
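
/* Illustrative sketch: code that may run either in hard IRQ or in process
 * context (e.g. a TX reclaim path shared between the interrupt handler and
 * ndo_stop) should use the *_any variants rather than dev_kfree_skb():
 *
 *	if (unlikely(tx_error))
 *		dev_kfree_skb_any(skb);
 *	else
 *		dev_consume_skb_any(skb);
 *
 * dev_consume_skb_any() records the skb as consumed rather than dropped,
 * which keeps drop-monitoring tools accurate.
 */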
2495
2496
2497/**
2498 * netif_device_detach - mark device as removed
2499 * @dev: network device
2500 *
2501 * Mark device as removed from system and therefore no longer available.
2502 */
2503void netif_device_detach(struct net_device *dev)
2504{
2505 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2506 netif_running(dev)) {
2507 netif_tx_stop_all_queues(dev);
2508 }
2509}
2510EXPORT_SYMBOL(netif_device_detach);
2511
2512/**
2513 * netif_device_attach - mark device as attached
2514 * @dev: network device
2515 *
 * Mark device as attached to the system and restart if needed.
2517 */
2518void netif_device_attach(struct net_device *dev)
2519{
2520 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2521 netif_running(dev)) {
2522 netif_tx_wake_all_queues(dev);
2523 __netdev_watchdog_up(dev);
2524 }
2525}
2526EXPORT_SYMBOL(netif_device_attach);
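
/* Illustrative sketch (hypothetical driver): a detach/attach pair around
 * suspend and resume keeps the stack from handing the driver packets while
 * the hardware is powered down; the "foo_hw_*" helpers are assumptions.
 *
 *	static int foo_suspend(struct device *d)
 *	{
 *		struct net_device *ndev = dev_get_drvdata(d);
 *
 *		netif_device_detach(ndev);
 *		foo_hw_down(netdev_priv(ndev));
 *		return 0;
 *	}
 *
 *	static int foo_resume(struct device *d)
 *	{
 *		struct net_device *ndev = dev_get_drvdata(d);
 *
 *		foo_hw_up(netdev_priv(ndev));
 *		netif_device_attach(ndev);
 *		return 0;
 *	}
 */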
2527
/*
 * Returns a Tx hash based on the given packet descriptor and the number of
 * Tx queues to be used as a distribution range.
 */
2532u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2533 unsigned int num_tx_queues)
2534{
2535 u32 hash;
2536 u16 qoffset = 0;
2537 u16 qcount = num_tx_queues;
2538
2539 if (skb_rx_queue_recorded(skb)) {
2540 hash = skb_get_rx_queue(skb);
2541 while (unlikely(hash >= num_tx_queues))
2542 hash -= num_tx_queues;
2543 return hash;
2544 }
2545
2546 if (dev->num_tc) {
2547 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2548 qoffset = dev->tc_to_txq[tc].offset;
2549 qcount = dev->tc_to_txq[tc].count;
2550 }
2551
2552 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2553}
2554EXPORT_SYMBOL(__skb_tx_hash);
2555
2556static void skb_warn_bad_offload(const struct sk_buff *skb)
2557{
2558 static const netdev_features_t null_features;
2559 struct net_device *dev = skb->dev;
2560 const char *name = "";
2561
2562 if (!net_ratelimit())
2563 return;
2564
2565 if (dev) {
2566 if (dev->dev.parent)
2567 name = dev_driver_string(dev->dev.parent);
2568 else
2569 name = netdev_name(dev);
2570 }
2571 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2572 "gso_type=%d ip_summed=%d\n",
2573 name, dev ? &dev->features : &null_features,
2574 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2575 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2576 skb_shinfo(skb)->gso_type, skb->ip_summed);
2577}
2578
2579/*
2580 * Invalidate hardware checksum when packet is to be mangled, and
2581 * complete checksum manually on outgoing path.
2582 */
2583int skb_checksum_help(struct sk_buff *skb)
2584{
2585 __wsum csum;
2586 int ret = 0, offset;
2587
2588 if (skb->ip_summed == CHECKSUM_COMPLETE)
2589 goto out_set_summed;
2590
2591 if (unlikely(skb_shinfo(skb)->gso_size)) {
2592 skb_warn_bad_offload(skb);
2593 return -EINVAL;
2594 }
2595
2596 /* Before computing a checksum, we should make sure no frag could
2597 * be modified by an external entity : checksum could be wrong.
2598 */
2599 if (skb_has_shared_frag(skb)) {
2600 ret = __skb_linearize(skb);
2601 if (ret)
2602 goto out;
2603 }
2604
2605 offset = skb_checksum_start_offset(skb);
2606 BUG_ON(offset >= skb_headlen(skb));
2607 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2608
2609 offset += skb->csum_offset;
2610 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2611
2612 if (skb_cloned(skb) &&
2613 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2614 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2615 if (ret)
2616 goto out;
2617 }
2618
2619 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2620out_set_summed:
2621 skb->ip_summed = CHECKSUM_NONE;
2622out:
2623 return ret;
2624}
2625EXPORT_SYMBOL(skb_checksum_help);
2626
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629 __be16 type = skb->protocol;
2630
2631 /* Tunnel gso handlers can set protocol to ethernet. */
2632 if (type == htons(ETH_P_TEB)) {
2633 struct ethhdr *eth;
2634
2635 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636 return 0;
2637
2638 eth = (struct ethhdr *)skb_mac_header(skb);
2639 type = eth->h_proto;
2640 }
2641
2642 return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 * skb_mac_gso_segment - mac layer segmentation handler.
2647 * @skb: buffer to segment
2648 * @features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651 netdev_features_t features)
2652{
2653 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654 struct packet_offload *ptype;
2655 int vlan_depth = skb->mac_len;
2656 __be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658 if (unlikely(!type))
2659 return ERR_PTR(-EINVAL);
2660
2661 __skb_pull(skb, vlan_depth);
2662
2663 rcu_read_lock();
2664 list_for_each_entry_rcu(ptype, &offload_base, list) {
2665 if (ptype->type == type && ptype->callbacks.gso_segment) {
2666 segs = ptype->callbacks.gso_segment(skb, features);
2667 break;
2668 }
2669 }
2670 rcu_read_unlock();
2671
2672 __skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674 return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683 if (tx_path)
2684 return skb->ip_summed != CHECKSUM_PARTIAL;
2685 else
2686 return skb->ip_summed == CHECKSUM_NONE;
2687}
2688
2689/**
2690 * __skb_gso_segment - Perform segmentation on skb.
2691 * @skb: buffer to segment
2692 * @features: features for the output path (see dev->features)
2693 * @tx_path: whether it is called in TX path
2694 *
2695 * This function segments the given skb and returns a list of segments.
2696 *
2697 * It may return NULL if the skb requires no segmentation. This is
2698 * only possible when GSO is used for verifying header integrity.
2699 *
2700 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703 netdev_features_t features, bool tx_path)
2704{
2705 if (unlikely(skb_needs_check(skb, tx_path))) {
2706 int err;
2707
2708 skb_warn_bad_offload(skb);
2709
2710 err = skb_cow_head(skb, 0);
2711 if (err < 0)
2712 return ERR_PTR(err);
2713 }
2714
2715 /* Only report GSO partial support if it will enable us to
2716 * support segmentation on this frame without needing additional
2717 * work.
2718 */
2719 if (features & NETIF_F_GSO_PARTIAL) {
2720 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721 struct net_device *dev = skb->dev;
2722
2723 partial_features |= dev->features & dev->gso_partial_features;
2724 if (!skb_gso_ok(skb, features | partial_features))
2725 features &= ~NETIF_F_GSO_PARTIAL;
2726 }
2727
2728 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732 SKB_GSO_CB(skb)->encap_level = 0;
2733
2734 skb_reset_mac_header(skb);
2735 skb_reset_mac_len(skb);
2736
2737 return skb_mac_gso_segment(skb, features);
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
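
/* Illustrative sketch (hypothetical software fallback): a driver that
 * cannot offload a particular GSO packet can segment it here and transmit
 * the resulting list one frame at a time; "foo_xmit_one" is an assumption.
 *
 *	struct sk_buff *segs, *next;
 *
 *	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	if (segs) {
 *		consume_skb(skb);
 *		skb = segs;
 *	}
 *	for (; skb; skb = next) {
 *		next = skb->next;
 *		skb->next = NULL;
 *		foo_xmit_one(skb);
 *	}
 */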
2740
2741/* Take action when hardware reception checksum errors are detected. */
2742#ifdef CONFIG_BUG
2743void netdev_rx_csum_fault(struct net_device *dev)
2744{
2745 if (net_ratelimit()) {
2746 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2747 dump_stack();
2748 }
2749}
2750EXPORT_SYMBOL(netdev_rx_csum_fault);
2751#endif
2752
/* Actually, we should eliminate this check as soon as we know that:
 * 1. An IOMMU is present and can map all of the machine's memory.
 * 2. No high memory really exists on this machine.
 */
2757
2758static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2759{
2760#ifdef CONFIG_HIGHMEM
2761 int i;
2762 if (!(dev->features & NETIF_F_HIGHDMA)) {
2763 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2764 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2765 if (PageHighMem(skb_frag_page(frag)))
2766 return 1;
2767 }
2768 }
2769
2770 if (PCI_DMA_BUS_IS_PHYS) {
2771 struct device *pdev = dev->dev.parent;
2772
2773 if (!pdev)
2774 return 0;
2775 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2776 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2777 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2778 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2779 return 1;
2780 }
2781 }
2782#endif
2783 return 0;
2784}
2785
2786/* If MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of standard features for the netdev.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791 netdev_features_t features,
2792 __be16 type)
2793{
2794 if (eth_p_mpls(type))
2795 features &= skb->dev->mpls_features;
2796
2797 return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801 netdev_features_t features,
2802 __be16 type)
2803{
2804 return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809 netdev_features_t features)
2810{
2811 int tmp;
2812 __be16 type;
2813
2814 type = skb_network_protocol(skb, &tmp);
2815 features = net_mpls_features(skb, features, type);
2816
2817 if (skb->ip_summed != CHECKSUM_NONE &&
2818 !can_checksum_protocol(features, type)) {
2819 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820 }
2821 if (illegal_highdma(skb->dev, skb))
2822 features &= ~NETIF_F_SG;
2823
2824 return features;
2825}
2826
2827netdev_features_t passthru_features_check(struct sk_buff *skb,
2828 struct net_device *dev,
2829 netdev_features_t features)
2830{
2831 return features;
2832}
2833EXPORT_SYMBOL(passthru_features_check);
2834
2835static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836 struct net_device *dev,
2837 netdev_features_t features)
2838{
2839 return vlan_features_check(skb, features);
2840}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843 struct net_device *dev,
2844 netdev_features_t features)
2845{
2846 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848 if (gso_segs > dev->gso_max_segs)
2849 return features & ~NETIF_F_GSO_MASK;
2850
2851 /* Support for GSO partial features requires software
2852 * intervention before we can actually process the packets
2853 * so we need to strip support for any partial features now
2854 * and we can pull them back in after we have partially
2855 * segmented the frame.
2856 */
2857 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858 features &= ~dev->gso_partial_features;
2859
2860 /* Make sure to clear the IPv4 ID mangling feature if the
2861 * IPv4 header has the potential to be fragmented.
2862 */
2863 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864 struct iphdr *iph = skb->encapsulation ?
2865 inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867 if (!(iph->frag_off & htons(IP_DF)))
2868 features &= ~NETIF_F_TSO_MANGLEID;
2869 }
2870
2871 return features;
2872}
2873
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876 struct net_device *dev = skb->dev;
2877 netdev_features_t features = dev->features;
2878
2879 if (skb_is_gso(skb))
2880 features = gso_features_check(skb, dev, features);
2881
2882 /* If encapsulation offload request, verify we are testing
2883 * hardware encapsulation features instead of standard
2884 * features for the netdev
2885 */
2886 if (skb->encapsulation)
2887 features &= dev->hw_enc_features;
2888
2889 if (skb_vlan_tagged(skb))
2890 features = netdev_intersect_features(features,
2891 dev->vlan_features |
2892 NETIF_F_HW_VLAN_CTAG_TX |
2893 NETIF_F_HW_VLAN_STAG_TX);
2894
2895 if (dev->netdev_ops->ndo_features_check)
2896 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897 features);
2898 else
2899 features &= dflt_features_check(skb, dev, features);
2900
2901 return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906 struct netdev_queue *txq, bool more)
2907{
2908 unsigned int len;
2909 int rc;
2910
2911 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912 dev_queue_xmit_nit(skb, dev);
2913
2914 len = skb->len;
2915 trace_net_dev_start_xmit(skb, dev);
2916 rc = netdev_start_xmit(skb, dev, txq, more);
2917 trace_net_dev_xmit(skb, rc, dev, len);
2918
2919 return rc;
2920}
2921
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923 struct netdev_queue *txq, int *ret)
2924{
2925 struct sk_buff *skb = first;
2926 int rc = NETDEV_TX_OK;
2927
2928 while (skb) {
2929 struct sk_buff *next = skb->next;
2930
2931 skb->next = NULL;
2932 rc = xmit_one(skb, dev, txq, next != NULL);
2933 if (unlikely(!dev_xmit_complete(rc))) {
2934 skb->next = next;
2935 goto out;
2936 }
2937
2938 skb = next;
2939 if (netif_xmit_stopped(txq) && skb) {
2940 rc = NETDEV_TX_BUSY;
2941 break;
2942 }
2943 }
2944
2945out:
2946 *ret = rc;
2947 return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951 netdev_features_t features)
2952{
2953 if (skb_vlan_tag_present(skb) &&
2954 !vlan_hw_offload_capable(features, skb->vlan_proto))
2955 skb = __vlan_hwaccel_push_inside(skb);
2956 return skb;
2957}
2958
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2960{
2961 netdev_features_t features;
2962
2963 features = netif_skb_features(skb);
2964 skb = validate_xmit_vlan(skb, features);
2965 if (unlikely(!skb))
2966 goto out_null;
2967
2968 if (netif_needs_gso(skb, features)) {
2969 struct sk_buff *segs;
2970
2971 segs = skb_gso_segment(skb, features);
2972 if (IS_ERR(segs)) {
2973 goto out_kfree_skb;
2974 } else if (segs) {
2975 consume_skb(skb);
2976 skb = segs;
2977 }
2978 } else {
2979 if (skb_needs_linearize(skb, features) &&
2980 __skb_linearize(skb))
2981 goto out_kfree_skb;
2982
2983 /* If packet is not checksummed and device does not
2984 * support checksumming for this protocol, complete
2985 * checksumming here.
2986 */
2987 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988 if (skb->encapsulation)
2989 skb_set_inner_transport_header(skb,
2990 skb_checksum_start_offset(skb));
2991 else
2992 skb_set_transport_header(skb,
2993 skb_checksum_start_offset(skb));
2994 if (!(features & NETIF_F_CSUM_MASK) &&
2995 skb_checksum_help(skb))
2996 goto out_kfree_skb;
2997 }
2998 }
2999
3000 return skb;
3001
3002out_kfree_skb:
3003 kfree_skb(skb);
3004out_null:
3005 atomic_long_inc(&dev->tx_dropped);
3006 return NULL;
3007}
3008
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
3011 struct sk_buff *next, *head = NULL, *tail;
3012
3013 for (; skb != NULL; skb = next) {
3014 next = skb->next;
3015 skb->next = NULL;
3016
		/* in case the skb won't be segmented, point it to itself */
3018 skb->prev = skb;
3019
3020 skb = validate_xmit_skb(skb, dev);
3021 if (!skb)
3022 continue;
3023
3024 if (!head)
3025 head = skb;
3026 else
3027 tail->next = skb;
3028 /* If skb was segmented, skb->prev points to
3029 * the last segment. If not, it still contains skb.
3030 */
3031 tail = skb->prev;
3032 }
3033 return head;
3034}
3035EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3036
3037static void qdisc_pkt_len_init(struct sk_buff *skb)
3038{
3039 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3040
3041 qdisc_skb_cb(skb)->pkt_len = skb->len;
3042
	/* To get a more precise estimate of the bytes sent on the wire,
	 * we add the header size of every segment to pkt_len.
	 */
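	/* Worked example (illustrative numbers): a TSO skb carrying 45
	 * segments of 1448 bytes of payload behind a 66 byte
	 * Ethernet + IPv4 + TCP header has skb->len = 66 + 45 * 1448 = 65226.
	 * The block below then adds (45 - 1) * 66 = 2904, giving
	 * pkt_len = 45 * 1514 = 68130, the bytes actually sent on the wire.
	 */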
3046 if (shinfo->gso_size) {
3047 unsigned int hdr_len;
3048 u16 gso_segs = shinfo->gso_segs;
3049
3050 /* mac layer + network layer */
3051 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3052
3053 /* + transport layer */
3054 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3055 hdr_len += tcp_hdrlen(skb);
3056 else
3057 hdr_len += sizeof(struct udphdr);
3058
3059 if (shinfo->gso_type & SKB_GSO_DODGY)
3060 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3061 shinfo->gso_size);
3062
3063 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3064 }
3065}
3066
3067static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3068 struct net_device *dev,
3069 struct netdev_queue *txq)
3070{
3071 spinlock_t *root_lock = qdisc_lock(q);
3072 struct sk_buff *to_free = NULL;
3073 bool contended;
3074 int rc;
3075
3076 qdisc_calculate_pkt_len(skb, q);
3077 /*
3078 * Heuristic to force contended enqueues to serialize on a
3079 * separate lock before trying to get qdisc main lock.
3080 * This permits qdisc->running owner to get the lock more
3081 * often and dequeue packets faster.
3082 */
3083 contended = qdisc_is_running(q);
3084 if (unlikely(contended))
3085 spin_lock(&q->busylock);
3086
3087 spin_lock(root_lock);
3088 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3089 __qdisc_drop(skb, &to_free);
3090 rc = NET_XMIT_DROP;
3091 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3092 qdisc_run_begin(q)) {
3093 /*
3094 * This is a work-conserving queue; there are no old skbs
3095 * waiting to be sent out; and the qdisc is not running -
3096 * xmit the skb directly.
3097 */
3098
3099 qdisc_bstats_update(q, skb);
3100
3101 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3102 if (unlikely(contended)) {
3103 spin_unlock(&q->busylock);
3104 contended = false;
3105 }
3106 __qdisc_run(q);
3107 } else
3108 qdisc_run_end(q);
3109
3110 rc = NET_XMIT_SUCCESS;
3111 } else {
3112 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3113 if (qdisc_run_begin(q)) {
3114 if (unlikely(contended)) {
3115 spin_unlock(&q->busylock);
3116 contended = false;
3117 }
3118 __qdisc_run(q);
3119 }
3120 }
3121 spin_unlock(root_lock);
3122 if (unlikely(to_free))
3123 kfree_skb_list(to_free);
3124 if (unlikely(contended))
3125 spin_unlock(&q->busylock);
3126 return rc;
3127}
3128
3129#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3130static void skb_update_prio(struct sk_buff *skb)
3131{
3132 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3133
3134 if (!skb->priority && skb->sk && map) {
3135 unsigned int prioidx =
3136 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3137
3138 if (prioidx < map->priomap_len)
3139 skb->priority = map->priomap[prioidx];
3140 }
3141}
3142#else
3143#define skb_update_prio(skb)
3144#endif
3145
3146DEFINE_PER_CPU(int, xmit_recursion);
3147EXPORT_SYMBOL(xmit_recursion);
3148
3149/**
3150 * dev_loopback_xmit - loop back @skb
3151 * @net: network namespace this loopback is happening in
3152 * @sk: sk needed to be a netfilter okfn
3153 * @skb: buffer to transmit
3154 */
3155int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3156{
3157 skb_reset_mac_header(skb);
3158 __skb_pull(skb, skb_network_offset(skb));
3159 skb->pkt_type = PACKET_LOOPBACK;
3160 skb->ip_summed = CHECKSUM_UNNECESSARY;
3161 WARN_ON(!skb_dst(skb));
3162 skb_dst_force(skb);
3163 netif_rx_ni(skb);
3164 return 0;
3165}
3166EXPORT_SYMBOL(dev_loopback_xmit);
3167
3168#ifdef CONFIG_NET_EGRESS
3169static struct sk_buff *
3170sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3171{
3172 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3173 struct tcf_result cl_res;
3174
3175 if (!cl)
3176 return skb;
3177
3178 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3179 * earlier by the caller.
3180 */
3181 qdisc_bstats_cpu_update(cl->q, skb);
3182
3183 switch (tc_classify(skb, cl, &cl_res, false)) {
3184 case TC_ACT_OK:
3185 case TC_ACT_RECLASSIFY:
3186 skb->tc_index = TC_H_MIN(cl_res.classid);
3187 break;
3188 case TC_ACT_SHOT:
3189 qdisc_qstats_cpu_drop(cl->q);
3190 *ret = NET_XMIT_DROP;
3191 kfree_skb(skb);
3192 return NULL;
3193 case TC_ACT_STOLEN:
3194 case TC_ACT_QUEUED:
3195 *ret = NET_XMIT_SUCCESS;
3196 consume_skb(skb);
3197 return NULL;
3198 case TC_ACT_REDIRECT:
3199 /* No need to push/pop skb's mac_header here on egress! */
3200 skb_do_redirect(skb);
3201 *ret = NET_XMIT_SUCCESS;
3202 return NULL;
3203 default:
3204 break;
3205 }
3206
3207 return skb;
3208}
3209#endif /* CONFIG_NET_EGRESS */
3210
3211static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3212{
3213#ifdef CONFIG_XPS
3214 struct xps_dev_maps *dev_maps;
3215 struct xps_map *map;
3216 int queue_index = -1;
3217
3218 rcu_read_lock();
3219 dev_maps = rcu_dereference(dev->xps_maps);
3220 if (dev_maps) {
3221 unsigned int tci = skb->sender_cpu - 1;
3222
3223 if (dev->num_tc) {
3224 tci *= dev->num_tc;
3225 tci += netdev_get_prio_tc_map(dev, skb->priority);
3226 }
3227
3228 map = rcu_dereference(dev_maps->cpu_map[tci]);
3229 if (map) {
3230 if (map->len == 1)
3231 queue_index = map->queues[0];
3232 else
3233 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3234 map->len)];
3235 if (unlikely(queue_index >= dev->real_num_tx_queues))
3236 queue_index = -1;
3237 }
3238 }
3239 rcu_read_unlock();
3240
3241 return queue_index;
3242#else
3243 return -1;
3244#endif
3245}
3246
3247static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3248{
3249 struct sock *sk = skb->sk;
3250 int queue_index = sk_tx_queue_get(sk);
3251
3252 if (queue_index < 0 || skb->ooo_okay ||
3253 queue_index >= dev->real_num_tx_queues) {
3254 int new_index = get_xps_queue(dev, skb);
3255 if (new_index < 0)
3256 new_index = skb_tx_hash(dev, skb);
3257
3258 if (queue_index != new_index && sk &&
3259 sk_fullsock(sk) &&
3260 rcu_access_pointer(sk->sk_dst_cache))
3261 sk_tx_queue_set(sk, new_index);
3262
3263 queue_index = new_index;
3264 }
3265
3266 return queue_index;
3267}
3268
3269struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3270 struct sk_buff *skb,
3271 void *accel_priv)
3272{
3273 int queue_index = 0;
3274
3275#ifdef CONFIG_XPS
3276 u32 sender_cpu = skb->sender_cpu - 1;
3277
3278 if (sender_cpu >= (u32)NR_CPUS)
3279 skb->sender_cpu = raw_smp_processor_id() + 1;
3280#endif
3281
3282 if (dev->real_num_tx_queues != 1) {
3283 const struct net_device_ops *ops = dev->netdev_ops;
3284 if (ops->ndo_select_queue)
3285 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3286 __netdev_pick_tx);
3287 else
3288 queue_index = __netdev_pick_tx(dev, skb);
3289
3290 if (!accel_priv)
3291 queue_index = netdev_cap_txqueue(dev, queue_index);
3292 }
3293
3294 skb_set_queue_mapping(skb, queue_index);
3295 return netdev_get_tx_queue(dev, queue_index);
3296}
3297
3298/**
3299 * __dev_queue_xmit - transmit a buffer
3300 * @skb: buffer to transmit
3301 * @accel_priv: private data used for L2 forwarding offload
3302 *
3303 * Queue a buffer for transmission to a network device. The caller must
3304 * have set the device and priority and built the buffer before calling
3305 * this function. The function can be called from an interrupt.
3306 *
3307 * A negative errno code is returned on a failure. A success does not
3308 * guarantee the frame will be transmitted as it may be dropped due
3309 * to congestion or traffic shaping.
3310 *
3311 * -----------------------------------------------------------------------------------
3312 * I notice this method can also return errors from the queue disciplines,
3313 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3314 * be positive.
3315 *
3316 * Regardless of the return value, the skb is consumed, so it is currently
3317 * difficult to retry a send to this method. (You can bump the ref count
3318 * before sending to hold a reference for retry if you are careful.)
3319 *
3320 * When calling this method, interrupts MUST be enabled. This is because
3321 * the BH enable code must have IRQs enabled so that it will not deadlock.
3322 * --BLG
3323 */
3324static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3325{
3326 struct net_device *dev = skb->dev;
3327 struct netdev_queue *txq;
3328 struct Qdisc *q;
3329 int rc = -ENOMEM;
3330
3331 skb_reset_mac_header(skb);
3332
3333 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3334 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3335
3336 /* Disable soft irqs for various locks below. Also
3337 * stops preemption for RCU.
3338 */
3339 rcu_read_lock_bh();
3340
3341 skb_update_prio(skb);
3342
3343 qdisc_pkt_len_init(skb);
3344#ifdef CONFIG_NET_CLS_ACT
3345 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3346# ifdef CONFIG_NET_EGRESS
3347 if (static_key_false(&egress_needed)) {
3348 skb = sch_handle_egress(skb, &rc, dev);
3349 if (!skb)
3350 goto out;
3351 }
3352# endif
3353#endif
	/* If device/qdisc doesn't need skb->dst, release it right now while
	 * it's hot in this CPU's cache.
	 */
3357 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3358 skb_dst_drop(skb);
3359 else
3360 skb_dst_force(skb);
3361
3362 txq = netdev_pick_tx(dev, skb, accel_priv);
3363 q = rcu_dereference_bh(txq->qdisc);
3364
3365 trace_net_dev_queue(skb);
3366 if (q->enqueue) {
3367 rc = __dev_xmit_skb(skb, q, dev, txq);
3368 goto out;
3369 }
3370
	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
	 * counters.)
	 * However, it is possible that they rely on the protection
	 * made by us here.
	 *
	 * Check this and shoot the lock. It is not prone to deadlocks.
	 * Or shoot the noqueue qdisc, it is even simpler 8)
	 */
3383 if (dev->flags & IFF_UP) {
3384 int cpu = smp_processor_id(); /* ok because BHs are off */
3385
3386 if (txq->xmit_lock_owner != cpu) {
3387 if (unlikely(__this_cpu_read(xmit_recursion) >
3388 XMIT_RECURSION_LIMIT))
3389 goto recursion_alert;
3390
3391 skb = validate_xmit_skb(skb, dev);
3392 if (!skb)
3393 goto out;
3394
3395 HARD_TX_LOCK(dev, txq, cpu);
3396
3397 if (!netif_xmit_stopped(txq)) {
3398 __this_cpu_inc(xmit_recursion);
3399 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3400 __this_cpu_dec(xmit_recursion);
3401 if (dev_xmit_complete(rc)) {
3402 HARD_TX_UNLOCK(dev, txq);
3403 goto out;
3404 }
3405 }
3406 HARD_TX_UNLOCK(dev, txq);
3407 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3408 dev->name);
3409 } else {
3410 /* Recursion is detected! It is possible,
3411 * unfortunately
3412 */
3413recursion_alert:
3414 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3415 dev->name);
3416 }
3417 }
3418
3419 rc = -ENETDOWN;
3420 rcu_read_unlock_bh();
3421
3422 atomic_long_inc(&dev->tx_dropped);
3423 kfree_skb_list(skb);
3424 return rc;
3425out:
3426 rcu_read_unlock_bh();
3427 return rc;
3428}
3429
3430int dev_queue_xmit(struct sk_buff *skb)
3431{
3432 return __dev_queue_xmit(skb, NULL);
3433}
3434EXPORT_SYMBOL(dev_queue_xmit);
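
/* Illustrative sketch (hypothetical tunnel): a virtual device that has
 * encapsulated a frame re-injects it towards its lower device simply by
 * retargeting the skb and calling dev_queue_xmit(); "lowerdev" is assumed
 * to be a valid, held net_device.
 *
 *	skb->dev = lowerdev;
 *	err = dev_queue_xmit(skb);
 *	if (net_xmit_eval(err))
 *		dev->stats.tx_errors++;
 *
 * The skb is consumed whatever the return value, so it must not be touched
 * after the call.
 */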
3435
3436int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3437{
3438 return __dev_queue_xmit(skb, accel_priv);
3439}
3440EXPORT_SYMBOL(dev_queue_xmit_accel);
3441
3442
3443/*=======================================================================
3444 Receiver routines
3445 =======================================================================*/
3446
3447int netdev_max_backlog __read_mostly = 1000;
3448EXPORT_SYMBOL(netdev_max_backlog);
3449
3450int netdev_tstamp_prequeue __read_mostly = 1;
3451int netdev_budget __read_mostly = 300;
3452int weight_p __read_mostly = 64; /* old backlog weight */
3453
3454/* Called with irq disabled */
3455static inline void ____napi_schedule(struct softnet_data *sd,
3456 struct napi_struct *napi)
3457{
3458 list_add_tail(&napi->poll_list, &sd->poll_list);
3459 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3460}
3461
3462#ifdef CONFIG_RPS
3463
3464/* One global table that all flow-based protocols share. */
3465struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3466EXPORT_SYMBOL(rps_sock_flow_table);
3467u32 rps_cpu_mask __read_mostly;
3468EXPORT_SYMBOL(rps_cpu_mask);
3469
3470struct static_key rps_needed __read_mostly;
3471EXPORT_SYMBOL(rps_needed);
3472struct static_key rfs_needed __read_mostly;
3473EXPORT_SYMBOL(rfs_needed);
3474
3475static struct rps_dev_flow *
3476set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477 struct rps_dev_flow *rflow, u16 next_cpu)
3478{
3479 if (next_cpu < nr_cpu_ids) {
3480#ifdef CONFIG_RFS_ACCEL
3481 struct netdev_rx_queue *rxqueue;
3482 struct rps_dev_flow_table *flow_table;
3483 struct rps_dev_flow *old_rflow;
3484 u32 flow_id;
3485 u16 rxq_index;
3486 int rc;
3487
3488 /* Should we steer this flow to a different hardware queue? */
3489 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490 !(dev->features & NETIF_F_NTUPLE))
3491 goto out;
3492 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3493 if (rxq_index == skb_get_rx_queue(skb))
3494 goto out;
3495
3496 rxqueue = dev->_rx + rxq_index;
3497 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498 if (!flow_table)
3499 goto out;
3500 flow_id = skb_get_hash(skb) & flow_table->mask;
3501 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502 rxq_index, flow_id);
3503 if (rc < 0)
3504 goto out;
3505 old_rflow = rflow;
3506 rflow = &flow_table->flows[flow_id];
3507 rflow->filter = rc;
3508 if (old_rflow->filter == rflow->filter)
3509 old_rflow->filter = RPS_NO_FILTER;
3510 out:
3511#endif
3512 rflow->last_qtail =
3513 per_cpu(softnet_data, next_cpu).input_queue_head;
3514 }
3515
3516 rflow->cpu = next_cpu;
3517 return rflow;
3518}
3519
3520/*
3521 * get_rps_cpu is called from netif_receive_skb and returns the target
3522 * CPU from the RPS map of the receiving queue for a given skb.
3523 * rcu_read_lock must be held on entry.
3524 */
3525static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526 struct rps_dev_flow **rflowp)
3527{
3528 const struct rps_sock_flow_table *sock_flow_table;
3529 struct netdev_rx_queue *rxqueue = dev->_rx;
3530 struct rps_dev_flow_table *flow_table;
3531 struct rps_map *map;
3532 int cpu = -1;
3533 u32 tcpu;
3534 u32 hash;
3535
3536 if (skb_rx_queue_recorded(skb)) {
3537 u16 index = skb_get_rx_queue(skb);
3538
3539 if (unlikely(index >= dev->real_num_rx_queues)) {
3540 WARN_ONCE(dev->real_num_rx_queues > 1,
3541 "%s received packet on queue %u, but number "
3542 "of RX queues is %u\n",
3543 dev->name, index, dev->real_num_rx_queues);
3544 goto done;
3545 }
3546 rxqueue += index;
3547 }
3548
3549 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552 map = rcu_dereference(rxqueue->rps_map);
3553 if (!flow_table && !map)
3554 goto done;
3555
3556 skb_reset_network_header(skb);
3557 hash = skb_get_hash(skb);
3558 if (!hash)
3559 goto done;
3560
3561 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562 if (flow_table && sock_flow_table) {
3563 struct rps_dev_flow *rflow;
3564 u32 next_cpu;
3565 u32 ident;
3566
		/* First check the global flow table for a match */
3568 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3569 if ((ident ^ hash) & ~rps_cpu_mask)
3570 goto try_rps;
3571
3572 next_cpu = ident & rps_cpu_mask;
3573
3574 /* OK, now we know there is a match,
3575 * we can look at the local (per receive queue) flow table
3576 */
3577 rflow = &flow_table->flows[hash & flow_table->mask];
3578 tcpu = rflow->cpu;
3579
3580 /*
3581 * If the desired CPU (where last recvmsg was done) is
3582 * different from current CPU (one in the rx-queue flow
3583 * table entry), switch if one of the following holds:
3584 * - Current CPU is unset (>= nr_cpu_ids).
3585 * - Current CPU is offline.
3586 * - The current CPU's queue tail has advanced beyond the
3587 * last packet that was enqueued using this table entry.
3588 * This guarantees that all previous packets for the flow
3589 * have been dequeued, thus preserving in order delivery.
3590 */
3591 if (unlikely(tcpu != next_cpu) &&
3592 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594 rflow->last_qtail)) >= 0)) {
3595 tcpu = next_cpu;
3596 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597 }
3598
3599 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600 *rflowp = rflow;
3601 cpu = tcpu;
3602 goto done;
3603 }
3604 }
3605
3606try_rps:
3607
3608 if (map) {
3609 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610 if (cpu_online(tcpu)) {
3611 cpu = tcpu;
3612 goto done;
3613 }
3614 }
3615
3616done:
3617 return cpu;
3618}
3619
3620#ifdef CONFIG_RFS_ACCEL
3621
3622/**
3623 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624 * @dev: Device on which the filter was set
3625 * @rxq_index: RX queue index
3626 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628 *
3629 * Drivers that implement ndo_rx_flow_steer() should periodically call
3630 * this function for each installed filter and remove the filters for
3631 * which it returns %true.
3632 */
3633bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634 u32 flow_id, u16 filter_id)
3635{
3636 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637 struct rps_dev_flow_table *flow_table;
3638 struct rps_dev_flow *rflow;
3639 bool expire = true;
3640 unsigned int cpu;
3641
3642 rcu_read_lock();
3643 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644 if (flow_table && flow_id <= flow_table->mask) {
3645 rflow = &flow_table->flows[flow_id];
3646 cpu = ACCESS_ONCE(rflow->cpu);
3647 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649 rflow->last_qtail) <
3650 (int)(10 * flow_table->mask)))
3651 expire = false;
3652 }
3653 rcu_read_unlock();
3654 return expire;
3655}
3656EXPORT_SYMBOL(rps_may_expire_flow);
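
/* Illustrative sketch (hypothetical driver state): a periodic expiry scan
 * over a driver's installed aRFS filters, as the kernel-doc above
 * suggests; "struct foo_filter", the "arfs_filters" list and
 * foo_remove_hw_filter() are assumptions.
 *
 *	struct foo_filter *f, *tmp;
 *
 *	list_for_each_entry_safe(f, tmp, &priv->arfs_filters, list) {
 *		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, f->filter_id)) {
 *			foo_remove_hw_filter(priv, f);
 *			list_del(&f->list);
 *			kfree(f);
 *		}
 *	}
 */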
3657
3658#endif /* CONFIG_RFS_ACCEL */
3659
3660/* Called from hardirq (IPI) context */
3661static void rps_trigger_softirq(void *data)
3662{
3663 struct softnet_data *sd = data;
3664
3665 ____napi_schedule(sd, &sd->backlog);
3666 sd->received_rps++;
3667}
3668
3669#endif /* CONFIG_RPS */
3670
/*
 * Check if this softnet_data structure belongs to another CPU.
 * If yes, queue it to our IPI list and return 1.
 * If not, return 0.
 */
3676static int rps_ipi_queued(struct softnet_data *sd)
3677{
3678#ifdef CONFIG_RPS
3679 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3680
3681 if (sd != mysd) {
3682 sd->rps_ipi_next = mysd->rps_ipi_list;
3683 mysd->rps_ipi_list = sd;
3684
3685 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3686 return 1;
3687 }
3688#endif /* CONFIG_RPS */
3689 return 0;
3690}
3691
3692#ifdef CONFIG_NET_FLOW_LIMIT
3693int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694#endif
3695
3696static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697{
3698#ifdef CONFIG_NET_FLOW_LIMIT
3699 struct sd_flow_limit *fl;
3700 struct softnet_data *sd;
3701 unsigned int old_flow, new_flow;
3702
3703 if (qlen < (netdev_max_backlog >> 1))
3704 return false;
3705
3706 sd = this_cpu_ptr(&softnet_data);
3707
3708 rcu_read_lock();
3709 fl = rcu_dereference(sd->flow_limit);
3710 if (fl) {
3711 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712 old_flow = fl->history[fl->history_head];
3713 fl->history[fl->history_head] = new_flow;
3714
3715 fl->history_head++;
3716 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718 if (likely(fl->buckets[old_flow]))
3719 fl->buckets[old_flow]--;
3720
3721 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722 fl->count++;
3723 rcu_read_unlock();
3724 return true;
3725 }
3726 }
3727 rcu_read_unlock();
3728#endif
3729 return false;
3730}
3731
3732/*
3733 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3734 * queue (may be a remote CPU queue).
3735 */
3736static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737 unsigned int *qtail)
3738{
3739 struct softnet_data *sd;
3740 unsigned long flags;
3741 unsigned int qlen;
3742
3743 sd = &per_cpu(softnet_data, cpu);
3744
3745 local_irq_save(flags);
3746
3747 rps_lock(sd);
3748 if (!netif_running(skb->dev))
3749 goto drop;
3750 qlen = skb_queue_len(&sd->input_pkt_queue);
3751 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3752 if (qlen) {
3753enqueue:
3754 __skb_queue_tail(&sd->input_pkt_queue, skb);
3755 input_queue_tail_incr_save(sd, qtail);
3756 rps_unlock(sd);
3757 local_irq_restore(flags);
3758 return NET_RX_SUCCESS;
3759 }
3760
		/* Schedule NAPI for the backlog device.
		 * We can use a non-atomic operation since we own the queue lock.
		 */
3764 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765 if (!rps_ipi_queued(sd))
3766 ____napi_schedule(sd, &sd->backlog);
3767 }
3768 goto enqueue;
3769 }
3770
3771drop:
3772 sd->dropped++;
3773 rps_unlock(sd);
3774
3775 local_irq_restore(flags);
3776
3777 atomic_long_inc(&skb->dev->rx_dropped);
3778 kfree_skb(skb);
3779 return NET_RX_DROP;
3780}
3781
3782static int netif_rx_internal(struct sk_buff *skb)
3783{
3784 int ret;
3785
3786 net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788 trace_netif_rx(skb);
3789#ifdef CONFIG_RPS
3790 if (static_key_false(&rps_needed)) {
3791 struct rps_dev_flow voidflow, *rflow = &voidflow;
3792 int cpu;
3793
3794 preempt_disable();
3795 rcu_read_lock();
3796
3797 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798 if (cpu < 0)
3799 cpu = smp_processor_id();
3800
3801 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803 rcu_read_unlock();
3804 preempt_enable();
3805 } else
3806#endif
3807 {
3808 unsigned int qtail;
3809 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810 put_cpu();
3811 }
3812 return ret;
3813}
3814
3815/**
3816 * netif_rx - post buffer to the network code
3817 * @skb: buffer to post
3818 *
3819 * This function receives a packet from a device driver and queues it for
3820 * the upper (protocol) levels to process. It always succeeds. The buffer
3821 * may be dropped during processing for congestion control or by the
3822 * protocol layers.
3823 *
3824 * return values:
3825 * NET_RX_SUCCESS (no congestion)
3826 * NET_RX_DROP (packet was dropped)
3827 *
3828 */
3829
3830int netif_rx(struct sk_buff *skb)
3831{
3832 trace_netif_rx_entry(skb);
3833
3834 return netif_rx_internal(skb);
3835}
3836EXPORT_SYMBOL(netif_rx);
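
/*
 * Illustrative sketch (the "exdrv_" names, raw buffer layout and lack of
 * NAPI support are assumptions, not taken from this file): how a simple
 * non-NAPI driver could hand a received frame to netif_rx() from its RX
 * interrupt path.
 */
static inline void exdrv_rx_example(struct net_device *dev,
				    const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (unlikely(!skb)) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);

	/* Set the protocol before queueing; netif_rx() never sleeps and
	 * may drop the packet under congestion, as documented above.
	 */
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
}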
3837
3838int netif_rx_ni(struct sk_buff *skb)
3839{
3840 int err;
3841
3842 trace_netif_rx_ni_entry(skb);
3843
3844 preempt_disable();
3845 err = netif_rx_internal(skb);
3846 if (local_softirq_pending())
3847 do_softirq();
3848 preempt_enable();
3849
3850 return err;
3851}
3852EXPORT_SYMBOL(netif_rx_ni);
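
/*
 * Minimal sketch (the "exdrv_" helper and its process-context caller are
 * assumptions) of why netif_rx_ni() exists: when packets are injected
 * from process context, nothing else will run the softirq that the
 * enqueue may have raised, so netif_rx_ni() runs it itself.
 */
static inline int exdrv_inject_from_process_ctx(struct sk_buff *skb,
						struct net_device *dev)
{
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx_ni(skb);
}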
3853
3854static __latent_entropy void net_tx_action(struct softirq_action *h)
3855{
3856 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858 if (sd->completion_queue) {
3859 struct sk_buff *clist;
3860
3861 local_irq_disable();
3862 clist = sd->completion_queue;
3863 sd->completion_queue = NULL;
3864 local_irq_enable();
3865
3866 while (clist) {
3867 struct sk_buff *skb = clist;
3868 clist = clist->next;
3869
3870 WARN_ON(atomic_read(&skb->users));
3871 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872 trace_consume_skb(skb);
3873 else
3874 trace_kfree_skb(skb, net_tx_action);
3875
3876 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877 __kfree_skb(skb);
3878 else
3879 __kfree_skb_defer(skb);
3880 }
3881
3882 __kfree_skb_flush();
3883 }
3884
3885 if (sd->output_queue) {
3886 struct Qdisc *head;
3887
3888 local_irq_disable();
3889 head = sd->output_queue;
3890 sd->output_queue = NULL;
3891 sd->output_queue_tailp = &sd->output_queue;
3892 local_irq_enable();
3893
3894 while (head) {
3895 struct Qdisc *q = head;
3896 spinlock_t *root_lock;
3897
3898 head = head->next_sched;
3899
3900 root_lock = qdisc_lock(q);
3901 spin_lock(root_lock);
3902 /* We need to make sure head->next_sched is read
3903 * before clearing __QDISC_STATE_SCHED
3904 */
3905 smp_mb__before_atomic();
3906 clear_bit(__QDISC_STATE_SCHED, &q->state);
3907 qdisc_run(q);
3908 spin_unlock(root_lock);
3909 }
3910 }
3911}
3912
3913#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3914/* This hook is defined here for ATM LANE */
3915int (*br_fdb_test_addr_hook)(struct net_device *dev,
3916 unsigned char *addr) __read_mostly;
3917EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3918#endif
3919
3920static inline struct sk_buff *
3921sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3922 struct net_device *orig_dev)
3923{
3924#ifdef CONFIG_NET_CLS_ACT
3925 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3926 struct tcf_result cl_res;
3927
3928 /* If there's at least one ingress present somewhere (so
3929 * we get here via enabled static key), remaining devices
3930 * that are not configured with an ingress qdisc will bail
3931 * out here.
3932 */
3933 if (!cl)
3934 return skb;
3935 if (*pt_prev) {
3936 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3937 *pt_prev = NULL;
3938 }
3939
3940 qdisc_skb_cb(skb)->pkt_len = skb->len;
3941 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3942 qdisc_bstats_cpu_update(cl->q, skb);
3943
3944 switch (tc_classify(skb, cl, &cl_res, false)) {
3945 case TC_ACT_OK:
3946 case TC_ACT_RECLASSIFY:
3947 skb->tc_index = TC_H_MIN(cl_res.classid);
3948 break;
3949 case TC_ACT_SHOT:
3950 qdisc_qstats_cpu_drop(cl->q);
3951 kfree_skb(skb);
3952 return NULL;
3953 case TC_ACT_STOLEN:
3954 case TC_ACT_QUEUED:
3955 consume_skb(skb);
3956 return NULL;
3957 case TC_ACT_REDIRECT:
3958 /* skb_mac_header check was done by cls/act_bpf, so
3959 * we can safely push the L2 header back before
3960 * redirecting to another netdev
3961 */
3962 __skb_push(skb, skb->mac_len);
3963 skb_do_redirect(skb);
3964 return NULL;
3965 default:
3966 break;
3967 }
3968#endif /* CONFIG_NET_CLS_ACT */
3969 return skb;
3970}
3971
3972/**
3973 * netdev_is_rx_handler_busy - check if receive handler is registered
3974 * @dev: device to check
3975 *
3976 * Check if a receive handler is already registered for a given device.
3977 * Return true if there is one.
3978 *
3979 * The caller must hold the rtnl_mutex.
3980 */
3981bool netdev_is_rx_handler_busy(struct net_device *dev)
3982{
3983 ASSERT_RTNL();
3984 return dev && rtnl_dereference(dev->rx_handler);
3985}
3986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3987
3988/**
3989 * netdev_rx_handler_register - register receive handler
3990 * @dev: device to register a handler for
3991 * @rx_handler: receive handler to register
3992 * @rx_handler_data: data pointer that is used by rx handler
3993 *
3994 * Register a receive handler for a device. This handler will then be
3995 * called from __netif_receive_skb. A negative errno code is returned
3996 * on a failure.
3997 *
3998 * The caller must hold the rtnl_mutex.
3999 *
4000 * For a general description of rx_handler, see enum rx_handler_result.
4001 */
4002int netdev_rx_handler_register(struct net_device *dev,
4003 rx_handler_func_t *rx_handler,
4004 void *rx_handler_data)
4005{
4006 ASSERT_RTNL();
4007
4008 if (dev->rx_handler)
4009 return -EBUSY;
4010
4011 /* Note: rx_handler_data must be set before rx_handler */
4012 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4013 rcu_assign_pointer(dev->rx_handler, rx_handler);
4014
4015 return 0;
4016}
4017EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
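
/*
 * Sketch of a typical rx_handler user, in the style of stacking drivers
 * such as bridge or bonding. All "exstack_" names and the port cookie
 * are hypothetical; the point is the RTNL requirement and the
 * rx_handler_result contract described above.
 */
static inline rx_handler_result_t exstack_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	void *port = rcu_dereference(skb->dev->rx_handler_data);

	if (!port)
		return RX_HANDLER_PASS;

	/* A real handler would consume, redirect or rewrite the skb here,
	 * returning the matching rx_handler_result value.
	 */
	return RX_HANDLER_PASS;
}

static inline int exstack_attach_port(struct net_device *port_dev, void *port)
{
	int err;

	rtnl_lock();
	if (netdev_is_rx_handler_busy(port_dev)) {
		rtnl_unlock();
		return -EBUSY;
	}
	err = netdev_rx_handler_register(port_dev, exstack_handle_frame, port);
	rtnl_unlock();
	return err;
}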
4018
4019/**
4020 * netdev_rx_handler_unregister - unregister receive handler
4021 * @dev: device to unregister a handler from
4022 *
4023 * Unregister a receive handler from a device.
4024 *
4025 * The caller must hold the rtnl_mutex.
4026 */
4027void netdev_rx_handler_unregister(struct net_device *dev)
4028{
4029
4030 ASSERT_RTNL();
4031 RCU_INIT_POINTER(dev->rx_handler, NULL);
4032	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4033	 * section is guaranteed to see a non-NULL rx_handler_data
4034	 * as well.
4035 */
4036 synchronize_net();
4037 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4038}
4039EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4040
4041/*
4042 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4043 * the special handling of PFMEMALLOC skbs.
4044 */
4045static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4046{
4047 switch (skb->protocol) {
4048 case htons(ETH_P_ARP):
4049 case htons(ETH_P_IP):
4050 case htons(ETH_P_IPV6):
4051 case htons(ETH_P_8021Q):
4052 case htons(ETH_P_8021AD):
4053 return true;
4054 default:
4055 return false;
4056 }
4057}
4058
4059static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4060 int *ret, struct net_device *orig_dev)
4061{
4062#ifdef CONFIG_NETFILTER_INGRESS
4063 if (nf_hook_ingress_active(skb)) {
4064 int ingress_retval;
4065
4066 if (*pt_prev) {
4067 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4068 *pt_prev = NULL;
4069 }
4070
4071 rcu_read_lock();
4072 ingress_retval = nf_hook_ingress(skb);
4073 rcu_read_unlock();
4074 return ingress_retval;
4075 }
4076#endif /* CONFIG_NETFILTER_INGRESS */
4077 return 0;
4078}
4079
4080static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4081{
4082 struct packet_type *ptype, *pt_prev;
4083 rx_handler_func_t *rx_handler;
4084 struct net_device *orig_dev;
4085 bool deliver_exact = false;
4086 int ret = NET_RX_DROP;
4087 __be16 type;
4088
4089 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4090
4091 trace_netif_receive_skb(skb);
4092
4093 orig_dev = skb->dev;
4094
4095 skb_reset_network_header(skb);
4096 if (!skb_transport_header_was_set(skb))
4097 skb_reset_transport_header(skb);
4098 skb_reset_mac_len(skb);
4099
4100 pt_prev = NULL;
4101
4102another_round:
4103 skb->skb_iif = skb->dev->ifindex;
4104
4105 __this_cpu_inc(softnet_data.processed);
4106
4107 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4108 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4109 skb = skb_vlan_untag(skb);
4110 if (unlikely(!skb))
4111 goto out;
4112 }
4113
4114#ifdef CONFIG_NET_CLS_ACT
4115 if (skb->tc_verd & TC_NCLS) {
4116 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4117 goto ncls;
4118 }
4119#endif
4120
4121 if (pfmemalloc)
4122 goto skip_taps;
4123
4124 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4125 if (pt_prev)
4126 ret = deliver_skb(skb, pt_prev, orig_dev);
4127 pt_prev = ptype;
4128 }
4129
4130 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4131 if (pt_prev)
4132 ret = deliver_skb(skb, pt_prev, orig_dev);
4133 pt_prev = ptype;
4134 }
4135
4136skip_taps:
4137#ifdef CONFIG_NET_INGRESS
4138 if (static_key_false(&ingress_needed)) {
4139 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4140 if (!skb)
4141 goto out;
4142
4143 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4144 goto out;
4145 }
4146#endif
4147#ifdef CONFIG_NET_CLS_ACT
4148 skb->tc_verd = 0;
4149ncls:
4150#endif
4151 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4152 goto drop;
4153
4154 if (skb_vlan_tag_present(skb)) {
4155 if (pt_prev) {
4156 ret = deliver_skb(skb, pt_prev, orig_dev);
4157 pt_prev = NULL;
4158 }
4159 if (vlan_do_receive(&skb))
4160 goto another_round;
4161 else if (unlikely(!skb))
4162 goto out;
4163 }
4164
4165 rx_handler = rcu_dereference(skb->dev->rx_handler);
4166 if (rx_handler) {
4167 if (pt_prev) {
4168 ret = deliver_skb(skb, pt_prev, orig_dev);
4169 pt_prev = NULL;
4170 }
4171 switch (rx_handler(&skb)) {
4172 case RX_HANDLER_CONSUMED:
4173 ret = NET_RX_SUCCESS;
4174 goto out;
4175 case RX_HANDLER_ANOTHER:
4176 goto another_round;
4177 case RX_HANDLER_EXACT:
4178 deliver_exact = true;
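			/* fall through */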
4179 case RX_HANDLER_PASS:
4180 break;
4181 default:
4182 BUG();
4183 }
4184 }
4185
4186 if (unlikely(skb_vlan_tag_present(skb))) {
4187 if (skb_vlan_tag_get_id(skb))
4188 skb->pkt_type = PACKET_OTHERHOST;
4189 /* Note: we might in the future use prio bits
4190 * and set skb->priority like in vlan_do_receive()
4191 * For the time being, just ignore Priority Code Point
4192 */
4193 skb->vlan_tci = 0;
4194 }
4195
4196 type = skb->protocol;
4197
4198 /* deliver only exact match when indicated */
4199 if (likely(!deliver_exact)) {
4200 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4201 &ptype_base[ntohs(type) &
4202 PTYPE_HASH_MASK]);
4203 }
4204
4205 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4206 &orig_dev->ptype_specific);
4207
4208 if (unlikely(skb->dev != orig_dev)) {
4209 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4210 &skb->dev->ptype_specific);
4211 }
4212
4213 if (pt_prev) {
4214 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4215 goto drop;
4216 else
4217 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4218 } else {
4219drop:
4220 if (!deliver_exact)
4221 atomic_long_inc(&skb->dev->rx_dropped);
4222 else
4223 atomic_long_inc(&skb->dev->rx_nohandler);
4224 kfree_skb(skb);
4225		/* Jamal, now you will not be able to escape explaining
4226		 * to me how you were going to use this. :-)
4227 */
4228 ret = NET_RX_DROP;
4229 }
4230
4231out:
4232 return ret;
4233}
4234
4235static int __netif_receive_skb(struct sk_buff *skb)
4236{
4237 int ret;
4238
4239 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4240 unsigned long pflags = current->flags;
4241
4242 /*
4243 * PFMEMALLOC skbs are special, they should
4244 * - be delivered to SOCK_MEMALLOC sockets only
4245 * - stay away from userspace
4246 * - have bounded memory usage
4247 *
4248 * Use PF_MEMALLOC as this saves us from propagating the allocation
4249 * context down to all allocation sites.
4250 */
4251 current->flags |= PF_MEMALLOC;
4252 ret = __netif_receive_skb_core(skb, true);
4253 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4254 } else
4255 ret = __netif_receive_skb_core(skb, false);
4256
4257 return ret;
4258}
4259
4260static int netif_receive_skb_internal(struct sk_buff *skb)
4261{
4262 int ret;
4263
4264 net_timestamp_check(netdev_tstamp_prequeue, skb);
4265
4266 if (skb_defer_rx_timestamp(skb))
4267 return NET_RX_SUCCESS;
4268
4269 rcu_read_lock();
4270
4271#ifdef CONFIG_RPS
4272 if (static_key_false(&rps_needed)) {
4273 struct rps_dev_flow voidflow, *rflow = &voidflow;
4274 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4275
4276 if (cpu >= 0) {
4277 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4278 rcu_read_unlock();
4279 return ret;
4280 }
4281 }
4282#endif
4283 ret = __netif_receive_skb(skb);
4284 rcu_read_unlock();
4285 return ret;
4286}
4287
4288/**
4289 * netif_receive_skb - process receive buffer from network
4290 * @skb: buffer to process
4291 *
4292 * netif_receive_skb() is the main receive data processing function.
4293 * It always succeeds. The buffer may be dropped during processing
4294 * for congestion control or by the protocol layers.
4295 *
4296 * This function may only be called from softirq context and interrupts
4297 * should be enabled.
4298 *
4299 * Return values (usually ignored):
4300 * NET_RX_SUCCESS: no congestion
4301 * NET_RX_DROP: packet was dropped
4302 */
4303int netif_receive_skb(struct sk_buff *skb)
4304{
4305 trace_netif_receive_skb_entry(skb);
4306
4307 return netif_receive_skb_internal(skb);
4308}
4309EXPORT_SYMBOL(netif_receive_skb);
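
/*
 * Minimal sketch of the softirq-context entry point above: a
 * hypothetical NAPI driver ("exnapi_") delivering a completed RX
 * descriptor from its ->poll() callback. Real drivers would usually
 * prefer napi_gro_receive(), defined later in this file.
 */
static inline void exnapi_deliver(struct napi_struct *napi,
				  struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	netif_receive_skb(skb);
}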
4310
4311DEFINE_PER_CPU(struct work_struct, flush_works);
4312
4313/* Network device is going away, flush any packets still pending */
4314static void flush_backlog(struct work_struct *work)
4315{
4316 struct sk_buff *skb, *tmp;
4317 struct softnet_data *sd;
4318
4319 local_bh_disable();
4320 sd = this_cpu_ptr(&softnet_data);
4321
4322 local_irq_disable();
4323 rps_lock(sd);
4324 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4325 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4326 __skb_unlink(skb, &sd->input_pkt_queue);
4327 kfree_skb(skb);
4328 input_queue_head_incr(sd);
4329 }
4330 }
4331 rps_unlock(sd);
4332 local_irq_enable();
4333
4334 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4335 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4336 __skb_unlink(skb, &sd->process_queue);
4337 kfree_skb(skb);
4338 input_queue_head_incr(sd);
4339 }
4340 }
4341 local_bh_enable();
4342}
4343
4344static void flush_all_backlogs(void)
4345{
4346 unsigned int cpu;
4347
4348 get_online_cpus();
4349
4350 for_each_online_cpu(cpu)
4351 queue_work_on(cpu, system_highpri_wq,
4352 per_cpu_ptr(&flush_works, cpu));
4353
4354 for_each_online_cpu(cpu)
4355 flush_work(per_cpu_ptr(&flush_works, cpu));
4356
4357 put_online_cpus();
4358}
4359
4360static int napi_gro_complete(struct sk_buff *skb)
4361{
4362 struct packet_offload *ptype;
4363 __be16 type = skb->protocol;
4364 struct list_head *head = &offload_base;
4365 int err = -ENOENT;
4366
4367 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4368
4369 if (NAPI_GRO_CB(skb)->count == 1) {
4370 skb_shinfo(skb)->gso_size = 0;
4371 goto out;
4372 }
4373
4374 rcu_read_lock();
4375 list_for_each_entry_rcu(ptype, head, list) {
4376 if (ptype->type != type || !ptype->callbacks.gro_complete)
4377 continue;
4378
4379 err = ptype->callbacks.gro_complete(skb, 0);
4380 break;
4381 }
4382 rcu_read_unlock();
4383
4384 if (err) {
4385 WARN_ON(&ptype->list == head);
4386 kfree_skb(skb);
4387 return NET_RX_SUCCESS;
4388 }
4389
4390out:
4391 return netif_receive_skb_internal(skb);
4392}
4393
4394/* napi->gro_list contains packets ordered by age;
4395 * the youngest packets are at the head of it.
4396 * Complete skbs in reverse order to reduce latencies.
4397 */
4398void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4399{
4400 struct sk_buff *skb, *prev = NULL;
4401
4402 /* scan list and build reverse chain */
4403 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4404 skb->prev = prev;
4405 prev = skb;
4406 }
4407
4408 for (skb = prev; skb; skb = prev) {
4409 skb->next = NULL;
4410
4411 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4412 return;
4413
4414 prev = skb->prev;
4415 napi_gro_complete(skb);
4416 napi->gro_count--;
4417 }
4418
4419 napi->gro_list = NULL;
4420}
4421EXPORT_SYMBOL(napi_gro_flush);
4422
4423static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4424{
4425 struct sk_buff *p;
4426 unsigned int maclen = skb->dev->hard_header_len;
4427 u32 hash = skb_get_hash_raw(skb);
4428
4429 for (p = napi->gro_list; p; p = p->next) {
4430 unsigned long diffs;
4431
4432 NAPI_GRO_CB(p)->flush = 0;
4433
4434 if (hash != skb_get_hash_raw(p)) {
4435 NAPI_GRO_CB(p)->same_flow = 0;
4436 continue;
4437 }
4438
4439 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4440 diffs |= p->vlan_tci ^ skb->vlan_tci;
4441 diffs |= skb_metadata_dst_cmp(p, skb);
4442 if (maclen == ETH_HLEN)
4443 diffs |= compare_ether_header(skb_mac_header(p),
4444 skb_mac_header(skb));
4445 else if (!diffs)
4446 diffs = memcmp(skb_mac_header(p),
4447 skb_mac_header(skb),
4448 maclen);
4449 NAPI_GRO_CB(p)->same_flow = !diffs;
4450 }
4451}
4452
4453static void skb_gro_reset_offset(struct sk_buff *skb)
4454{
4455 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4456 const skb_frag_t *frag0 = &pinfo->frags[0];
4457
4458 NAPI_GRO_CB(skb)->data_offset = 0;
4459 NAPI_GRO_CB(skb)->frag0 = NULL;
4460 NAPI_GRO_CB(skb)->frag0_len = 0;
4461
4462 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4463 pinfo->nr_frags &&
4464 !PageHighMem(skb_frag_page(frag0))) {
4465 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4466 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4467 skb_frag_size(frag0),
4468 skb->end - skb->tail);
4469 }
4470}
4471
4472static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4473{
4474 struct skb_shared_info *pinfo = skb_shinfo(skb);
4475
4476 BUG_ON(skb->end - skb->tail < grow);
4477
4478 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4479
4480 skb->data_len -= grow;
4481 skb->tail += grow;
4482
4483 pinfo->frags[0].page_offset += grow;
4484 skb_frag_size_sub(&pinfo->frags[0], grow);
4485
4486 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4487 skb_frag_unref(skb, 0);
4488 memmove(pinfo->frags, pinfo->frags + 1,
4489 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4490 }
4491}
4492
4493static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4494{
4495 struct sk_buff **pp = NULL;
4496 struct packet_offload *ptype;
4497 __be16 type = skb->protocol;
4498 struct list_head *head = &offload_base;
4499 int same_flow;
4500 enum gro_result ret;
4501 int grow;
4502
4503 if (!(skb->dev->features & NETIF_F_GRO))
4504 goto normal;
4505
4506 if (skb->csum_bad)
4507 goto normal;
4508
4509 gro_list_prepare(napi, skb);
4510
4511 rcu_read_lock();
4512 list_for_each_entry_rcu(ptype, head, list) {
4513 if (ptype->type != type || !ptype->callbacks.gro_receive)
4514 continue;
4515
4516 skb_set_network_header(skb, skb_gro_offset(skb));
4517 skb_reset_mac_len(skb);
4518 NAPI_GRO_CB(skb)->same_flow = 0;
4519 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4520 NAPI_GRO_CB(skb)->free = 0;
4521 NAPI_GRO_CB(skb)->encap_mark = 0;
4522 NAPI_GRO_CB(skb)->recursion_counter = 0;
4523 NAPI_GRO_CB(skb)->is_fou = 0;
4524 NAPI_GRO_CB(skb)->is_atomic = 1;
4525 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4526
4527 /* Setup for GRO checksum validation */
4528 switch (skb->ip_summed) {
4529 case CHECKSUM_COMPLETE:
4530 NAPI_GRO_CB(skb)->csum = skb->csum;
4531 NAPI_GRO_CB(skb)->csum_valid = 1;
4532 NAPI_GRO_CB(skb)->csum_cnt = 0;
4533 break;
4534 case CHECKSUM_UNNECESSARY:
4535 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4536 NAPI_GRO_CB(skb)->csum_valid = 0;
4537 break;
4538 default:
4539 NAPI_GRO_CB(skb)->csum_cnt = 0;
4540 NAPI_GRO_CB(skb)->csum_valid = 0;
4541 }
4542
4543 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4544 break;
4545 }
4546 rcu_read_unlock();
4547
4548 if (&ptype->list == head)
4549 goto normal;
4550
4551 same_flow = NAPI_GRO_CB(skb)->same_flow;
4552 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4553
4554 if (pp) {
4555 struct sk_buff *nskb = *pp;
4556
4557 *pp = nskb->next;
4558 nskb->next = NULL;
4559 napi_gro_complete(nskb);
4560 napi->gro_count--;
4561 }
4562
4563 if (same_flow)
4564 goto ok;
4565
4566 if (NAPI_GRO_CB(skb)->flush)
4567 goto normal;
4568
4569 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4570 struct sk_buff *nskb = napi->gro_list;
4571
4572 /* locate the end of the list to select the 'oldest' flow */
4573 while (nskb->next) {
4574 pp = &nskb->next;
4575 nskb = *pp;
4576 }
4577 *pp = NULL;
4578 nskb->next = NULL;
4579 napi_gro_complete(nskb);
4580 } else {
4581 napi->gro_count++;
4582 }
4583 NAPI_GRO_CB(skb)->count = 1;
4584 NAPI_GRO_CB(skb)->age = jiffies;
4585 NAPI_GRO_CB(skb)->last = skb;
4586 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4587 skb->next = napi->gro_list;
4588 napi->gro_list = skb;
4589 ret = GRO_HELD;
4590
4591pull:
4592 grow = skb_gro_offset(skb) - skb_headlen(skb);
4593 if (grow > 0)
4594 gro_pull_from_frag0(skb, grow);
4595ok:
4596 return ret;
4597
4598normal:
4599 ret = GRO_NORMAL;
4600 goto pull;
4601}
4602
4603struct packet_offload *gro_find_receive_by_type(__be16 type)
4604{
4605 struct list_head *offload_head = &offload_base;
4606 struct packet_offload *ptype;
4607
4608 list_for_each_entry_rcu(ptype, offload_head, list) {
4609 if (ptype->type != type || !ptype->callbacks.gro_receive)
4610 continue;
4611 return ptype;
4612 }
4613 return NULL;
4614}
4615EXPORT_SYMBOL(gro_find_receive_by_type);
4616
4617struct packet_offload *gro_find_complete_by_type(__be16 type)
4618{
4619 struct list_head *offload_head = &offload_base;
4620 struct packet_offload *ptype;
4621
4622 list_for_each_entry_rcu(ptype, offload_head, list) {
4623 if (ptype->type != type || !ptype->callbacks.gro_complete)
4624 continue;
4625 return ptype;
4626 }
4627 return NULL;
4628}
4629EXPORT_SYMBOL(gro_find_complete_by_type);
4630
4631static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4632{
4633 switch (ret) {
4634 case GRO_NORMAL:
4635 if (netif_receive_skb_internal(skb))
4636 ret = GRO_DROP;
4637 break;
4638
4639 case GRO_DROP:
4640 kfree_skb(skb);
4641 break;
4642
4643 case GRO_MERGED_FREE:
4644 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4645 skb_dst_drop(skb);
4646 kmem_cache_free(skbuff_head_cache, skb);
4647 } else {
4648 __kfree_skb(skb);
4649 }
4650 break;
4651
4652 case GRO_HELD:
4653 case GRO_MERGED:
4654 break;
4655 }
4656
4657 return ret;
4658}
4659
4660gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4661{
4662 skb_mark_napi_id(skb, napi);
4663 trace_napi_gro_receive_entry(skb);
4664
4665 skb_gro_reset_offset(skb);
4666
4667 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4668}
4669EXPORT_SYMBOL(napi_gro_receive);
4670
4671static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4672{
4673 if (unlikely(skb->pfmemalloc)) {
4674 consume_skb(skb);
4675 return;
4676 }
4677 __skb_pull(skb, skb_headlen(skb));
4678 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4679 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4680 skb->vlan_tci = 0;
4681 skb->dev = napi->dev;
4682 skb->skb_iif = 0;
4683 skb->encapsulation = 0;
4684 skb_shinfo(skb)->gso_type = 0;
4685 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4686
4687 napi->skb = skb;
4688}
4689
4690struct sk_buff *napi_get_frags(struct napi_struct *napi)
4691{
4692 struct sk_buff *skb = napi->skb;
4693
4694 if (!skb) {
4695 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4696 if (skb) {
4697 napi->skb = skb;
4698 skb_mark_napi_id(skb, napi);
4699 }
4700 }
4701 return skb;
4702}
4703EXPORT_SYMBOL(napi_get_frags);
4704
4705static gro_result_t napi_frags_finish(struct napi_struct *napi,
4706 struct sk_buff *skb,
4707 gro_result_t ret)
4708{
4709 switch (ret) {
4710 case GRO_NORMAL:
4711 case GRO_HELD:
4712 __skb_push(skb, ETH_HLEN);
4713 skb->protocol = eth_type_trans(skb, skb->dev);
4714 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4715 ret = GRO_DROP;
4716 break;
4717
4718 case GRO_DROP:
4719 case GRO_MERGED_FREE:
4720 napi_reuse_skb(napi, skb);
4721 break;
4722
4723 case GRO_MERGED:
4724 break;
4725 }
4726
4727 return ret;
4728}
4729
4730/* Upper GRO stack assumes network header starts at gro_offset=0
4731 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4732 * We copy ethernet header into skb->data to have a common layout.
4733 */
4734static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4735{
4736 struct sk_buff *skb = napi->skb;
4737 const struct ethhdr *eth;
4738 unsigned int hlen = sizeof(*eth);
4739
4740 napi->skb = NULL;
4741
4742 skb_reset_mac_header(skb);
4743 skb_gro_reset_offset(skb);
4744
4745 eth = skb_gro_header_fast(skb, 0);
4746 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4747 eth = skb_gro_header_slow(skb, hlen, 0);
4748 if (unlikely(!eth)) {
4749 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4750 __func__, napi->dev->name);
4751 napi_reuse_skb(napi, skb);
4752 return NULL;
4753 }
4754 } else {
4755 gro_pull_from_frag0(skb, hlen);
4756 NAPI_GRO_CB(skb)->frag0 += hlen;
4757 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4758 }
4759 __skb_pull(skb, hlen);
4760
4761 /*
4762 * This works because the only protocols we care about don't require
4763 * special handling.
4764 * We'll fix it up properly in napi_frags_finish()
4765 */
4766 skb->protocol = eth->h_proto;
4767
4768 return skb;
4769}
4770
4771gro_result_t napi_gro_frags(struct napi_struct *napi)
4772{
4773 struct sk_buff *skb = napi_frags_skb(napi);
4774
4775 if (!skb)
4776 return GRO_DROP;
4777
4778 trace_napi_gro_frags_entry(skb);
4779
4780 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4781}
4782EXPORT_SYMBOL(napi_gro_frags);
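
/*
 * Sketch of the napi_get_frags()/napi_gro_frags() pattern used by
 * page-based RX drivers. The page, offset, length and truesize
 * arguments are assumptions; the key point is that the driver attaches
 * page fragments to the skb handed out by napi_get_frags() and lets
 * napi_frags_skb() above pull and parse the Ethernet header.
 */
static inline void exnapi_rx_frag(struct napi_struct *napi, struct page *page,
				  unsigned int offset, unsigned int len,
				  unsigned int truesize)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);
		return;
	}
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
			len, truesize);
	napi_gro_frags(napi);
}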
4783
4784/* Compute the checksum from gro_offset and return the folded value
4785 * after adding in any pseudo checksum.
4786 */
4787__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4788{
4789 __wsum wsum;
4790 __sum16 sum;
4791
4792 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4793
4794 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4795 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4796 if (likely(!sum)) {
4797 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4798 !skb->csum_complete_sw)
4799 netdev_rx_csum_fault(skb->dev);
4800 }
4801
4802 NAPI_GRO_CB(skb)->csum = wsum;
4803 NAPI_GRO_CB(skb)->csum_valid = 1;
4804
4805 return sum;
4806}
4807EXPORT_SYMBOL(__skb_gro_checksum_complete);
4808
4809/*
4810 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4811 * Note: called with local irq disabled, but exits with local irq enabled.
4812 */
4813static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4814{
4815#ifdef CONFIG_RPS
4816 struct softnet_data *remsd = sd->rps_ipi_list;
4817
4818 if (remsd) {
4819 sd->rps_ipi_list = NULL;
4820
4821 local_irq_enable();
4822
4823 /* Send pending IPI's to kick RPS processing on remote cpus. */
4824 while (remsd) {
4825 struct softnet_data *next = remsd->rps_ipi_next;
4826
4827 if (cpu_online(remsd->cpu))
4828 smp_call_function_single_async(remsd->cpu,
4829 &remsd->csd);
4830 remsd = next;
4831 }
4832 } else
4833#endif
4834 local_irq_enable();
4835}
4836
4837static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4838{
4839#ifdef CONFIG_RPS
4840 return sd->rps_ipi_list != NULL;
4841#else
4842 return false;
4843#endif
4844}
4845
4846static int process_backlog(struct napi_struct *napi, int quota)
4847{
4848 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4849 bool again = true;
4850 int work = 0;
4851
4852	/* Check if we have pending IPIs; it's better to send them now
4853	 * rather than waiting for net_rx_action() to end.
4854 */
4855 if (sd_has_rps_ipi_waiting(sd)) {
4856 local_irq_disable();
4857 net_rps_action_and_irq_enable(sd);
4858 }
4859
4860 napi->weight = weight_p;
4861 while (again) {
4862 struct sk_buff *skb;
4863
4864 while ((skb = __skb_dequeue(&sd->process_queue))) {
4865 rcu_read_lock();
4866 __netif_receive_skb(skb);
4867 rcu_read_unlock();
4868 input_queue_head_incr(sd);
4869 if (++work >= quota)
4870 return work;
4871
4872 }
4873
4874 local_irq_disable();
4875 rps_lock(sd);
4876 if (skb_queue_empty(&sd->input_pkt_queue)) {
4877 /*
4878 * Inline a custom version of __napi_complete().
4879			 * Only the current CPU owns and manipulates this napi,
4880 * and NAPI_STATE_SCHED is the only possible flag set
4881 * on backlog.
4882 * We can use a plain write instead of clear_bit(),
4883			 * and we don't need an smp_mb() memory barrier.
4884 */
4885 napi->state = 0;
4886 again = false;
4887 } else {
4888 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4889 &sd->process_queue);
4890 }
4891 rps_unlock(sd);
4892 local_irq_enable();
4893 }
4894
4895 return work;
4896}
4897
4898/**
4899 * __napi_schedule - schedule for receive
4900 * @n: entry to schedule
4901 *
4902 * The entry's receive function will be scheduled to run.
4903 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4904 */
4905void __napi_schedule(struct napi_struct *n)
4906{
4907 unsigned long flags;
4908
4909 local_irq_save(flags);
4910 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4911 local_irq_restore(flags);
4912}
4913EXPORT_SYMBOL(__napi_schedule);
4914
4915/**
4916 * napi_schedule_prep - check if napi can be scheduled
4917 * @n: napi context
4918 *
4919 * Test if NAPI routine is already running, and if not mark
4920 * it as running. This is used as a condition variable to
4921 * ensure only one NAPI poll instance runs. We also make
4922 * sure there is no pending NAPI disable.
4923 */
4924bool napi_schedule_prep(struct napi_struct *n)
4925{
4926 unsigned long val, new;
4927
4928 do {
4929 val = READ_ONCE(n->state);
4930 if (unlikely(val & NAPIF_STATE_DISABLE))
4931 return false;
4932 new = val | NAPIF_STATE_SCHED;
4933
4934 /* Sets STATE_MISSED bit if STATE_SCHED was already set
4935		 * This was suggested by Alexander Duyck, as the compiler
4936		 * emits better code than:
4937 * if (val & NAPIF_STATE_SCHED)
4938 * new |= NAPIF_STATE_MISSED;
4939 */
4940 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4941 NAPIF_STATE_MISSED;
4942 } while (cmpxchg(&n->state, val, new) != val);
4943
4944 return !(val & NAPIF_STATE_SCHED);
4945}
4946EXPORT_SYMBOL(napi_schedule_prep);
4947
4948/**
4949 * __napi_schedule_irqoff - schedule for receive
4950 * @n: entry to schedule
4951 *
4952 * Variant of __napi_schedule() assuming hard irqs are masked
4953 */
4954void __napi_schedule_irqoff(struct napi_struct *n)
4955{
4956 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4957}
4958EXPORT_SYMBOL(__napi_schedule_irqoff);
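
/*
 * Sketch of the canonical hard-irq scheduling pattern that
 * napi_schedule_prep() and __napi_schedule_irqoff() are meant for.
 * "exnapi_irq" and the private structure are hypothetical; all real
 * work is deferred to the driver's ->poll() callback.
 */
struct exnapi_priv {
	struct napi_struct napi;
	/* device registers, RX/TX rings, ... (omitted) */
};

static inline irqreturn_t exnapi_irq(int irq, void *dev_id)
{
	struct exnapi_priv *priv = dev_id;

	/* A real driver would mask its RX interrupt here, then let the
	 * poll callback re-enable it once the ring has been drained.
	 */
	if (napi_schedule_prep(&priv->napi))
		__napi_schedule_irqoff(&priv->napi);

	return IRQ_HANDLED;
}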
4959
4960bool __napi_complete(struct napi_struct *n)
4961{
4962 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4963
4964 /* Some drivers call us directly, instead of calling
4965 * napi_complete_done().
4966 */
4967 if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4968 return false;
4969
4970 list_del_init(&n->poll_list);
4971 smp_mb__before_atomic();
4972 clear_bit(NAPI_STATE_SCHED, &n->state);
4973 return true;
4974}
4975EXPORT_SYMBOL(__napi_complete);
4976
4977bool napi_complete_done(struct napi_struct *n, int work_done)
4978{
4979 unsigned long flags, val, new;
4980
4981 /*
4982 * 1) Don't let napi dequeue from the cpu poll list
4983	 *    just in case it's running on a different cpu.
4984 * 2) If we are busy polling, do nothing here, we have
4985 * the guarantee we will be called later.
4986 */
4987 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4988 NAPIF_STATE_IN_BUSY_POLL)))
4989 return false;
4990
4991 if (n->gro_list) {
4992 unsigned long timeout = 0;
4993
4994 if (work_done)
4995 timeout = n->dev->gro_flush_timeout;
4996
4997 if (timeout)
4998 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4999 HRTIMER_MODE_REL_PINNED);
5000 else
5001 napi_gro_flush(n, false);
5002 }
5003 if (unlikely(!list_empty(&n->poll_list))) {
5004 /* If n->poll_list is not empty, we need to mask irqs */
5005 local_irq_save(flags);
5006 list_del_init(&n->poll_list);
5007 local_irq_restore(flags);
5008 }
5009
5010 do {
5011 val = READ_ONCE(n->state);
5012
5013 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5014
5015 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5016
5017 /* If STATE_MISSED was set, leave STATE_SCHED set,
5018 * because we will call napi->poll() one more time.
5019 * This C code was suggested by Alexander Duyck to help gcc.
5020 */
5021 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5022 NAPIF_STATE_SCHED;
5023 } while (cmpxchg(&n->state, val, new) != val);
5024
5025 if (unlikely(val & NAPIF_STATE_MISSED)) {
5026 __napi_schedule(n);
5027 return false;
5028 }
5029
5030 return true;
5031}
5032EXPORT_SYMBOL(napi_complete_done);
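
/*
 * Sketch of a driver ->poll() callback built around the
 * napi_complete_done() contract above: completion is only reported when
 * less than the full budget was used, and device interrupts are only
 * re-armed once napi_complete_done() returns true (i.e. no
 * NAPIF_STATE_MISSED rescheduling is pending). The "exdrv_" helpers are
 * placeholders.
 */
static inline int exdrv_clean_rx(struct napi_struct *napi, int budget)
{
	/* Placeholder: a real driver walks its RX ring here, feeding each
	 * completed skb to napi_gro_receive(), and returns the number of
	 * packets processed (never more than @budget).
	 */
	return 0;
}

static inline void exdrv_irq_enable(struct net_device *dev)
{
	/* Placeholder for re-arming the device RX interrupt. */
}

static inline int exdrv_poll_example(struct napi_struct *napi, int budget)
{
	int work_done = exdrv_clean_rx(napi, budget);

	if (work_done < budget && napi_complete_done(napi, work_done))
		exdrv_irq_enable(napi->dev);

	return work_done;
}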
5033
5034/* must be called under rcu_read_lock(), as we dont take a reference */
5035static struct napi_struct *napi_by_id(unsigned int napi_id)
5036{
5037 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5038 struct napi_struct *napi;
5039
5040 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5041 if (napi->napi_id == napi_id)
5042 return napi;
5043
5044 return NULL;
5045}
5046
5047#if defined(CONFIG_NET_RX_BUSY_POLL)
5048
5049#define BUSY_POLL_BUDGET 8
5050
5051static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5052{
5053 int rc;
5054
5055	/* Busy polling means there is a high chance the device driver's hard irq
5056 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5057 * set in napi_schedule_prep().
5058 * Since we are about to call napi->poll() once more, we can safely
5059 * clear NAPI_STATE_MISSED.
5060 *
5061 * Note: x86 could use a single "lock and ..." instruction
5062 * to perform these two clear_bit()
5063 */
5064 clear_bit(NAPI_STATE_MISSED, &napi->state);
5065 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5066
5067 local_bh_disable();
5068
5069 /* All we really want here is to re-enable device interrupts.
5070 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5071 */
5072 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5073 netpoll_poll_unlock(have_poll_lock);
5074 if (rc == BUSY_POLL_BUDGET)
5075 __napi_schedule(napi);
5076 local_bh_enable();
5077 if (local_softirq_pending())
5078 do_softirq();
5079}
5080
5081bool sk_busy_loop(struct sock *sk, int nonblock)
5082{
5083 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5084 int (*napi_poll)(struct napi_struct *napi, int budget);
5085 int (*busy_poll)(struct napi_struct *dev);
5086 void *have_poll_lock = NULL;
5087 struct napi_struct *napi;
5088 int rc;
5089
5090restart:
5091 rc = false;
5092 napi_poll = NULL;
5093
5094 rcu_read_lock();
5095
5096 napi = napi_by_id(sk->sk_napi_id);
5097 if (!napi)
5098 goto out;
5099
5100 /* Note: ndo_busy_poll method is optional in linux-4.5 */
5101 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5102
5103 preempt_disable();
5104 for (;;) {
5105 rc = 0;
5106 local_bh_disable();
5107 if (busy_poll) {
5108 rc = busy_poll(napi);
5109 goto count;
5110 }
5111 if (!napi_poll) {
5112 unsigned long val = READ_ONCE(napi->state);
5113
5114 /* If multiple threads are competing for this napi,
5115 * we avoid dirtying napi->state as much as we can.
5116 */
5117 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5118 NAPIF_STATE_IN_BUSY_POLL))
5119 goto count;
5120 if (cmpxchg(&napi->state, val,
5121 val | NAPIF_STATE_IN_BUSY_POLL |
5122 NAPIF_STATE_SCHED) != val)
5123 goto count;
5124 have_poll_lock = netpoll_poll_lock(napi);
5125 napi_poll = napi->poll;
5126 }
5127 rc = napi_poll(napi, BUSY_POLL_BUDGET);
5128 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5129count:
5130 if (rc > 0)
5131 __NET_ADD_STATS(sock_net(sk),
5132 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5133 local_bh_enable();
5134
5135 if (rc == LL_FLUSH_FAILED)
5136 break; /* permanent failure */
5137
5138 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5139 busy_loop_timeout(end_time))
5140 break;
5141
5142 if (unlikely(need_resched())) {
5143 if (napi_poll)
5144 busy_poll_stop(napi, have_poll_lock);
5145 preempt_enable();
5146 rcu_read_unlock();
5147 cond_resched();
5148 rc = !skb_queue_empty(&sk->sk_receive_queue);
5149 if (rc || busy_loop_timeout(end_time))
5150 return rc;
5151 goto restart;
5152 }
5153 cpu_relax();
5154 }
5155 if (napi_poll)
5156 busy_poll_stop(napi, have_poll_lock);
5157 preempt_enable();
5158 rc = !skb_queue_empty(&sk->sk_receive_queue);
5159out:
5160 rcu_read_unlock();
5161 return rc;
5162}
5163EXPORT_SYMBOL(sk_busy_loop);
5164
5165#endif /* CONFIG_NET_RX_BUSY_POLL */
5166
5167static void napi_hash_add(struct napi_struct *napi)
5168{
5169 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5170 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5171 return;
5172
5173 spin_lock(&napi_hash_lock);
5174
5175 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5176 do {
5177 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5178 napi_gen_id = NR_CPUS + 1;
5179 } while (napi_by_id(napi_gen_id));
5180 napi->napi_id = napi_gen_id;
5181
5182 hlist_add_head_rcu(&napi->napi_hash_node,
5183 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5184
5185 spin_unlock(&napi_hash_lock);
5186}
5187
5188/* Warning: the caller is responsible for making sure an RCU grace period
5189 * is respected before freeing the memory containing @napi
5190 */
5191bool napi_hash_del(struct napi_struct *napi)
5192{
5193 bool rcu_sync_needed = false;
5194
5195 spin_lock(&napi_hash_lock);
5196
5197 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5198 rcu_sync_needed = true;
5199 hlist_del_rcu(&napi->napi_hash_node);
5200 }
5201 spin_unlock(&napi_hash_lock);
5202 return rcu_sync_needed;
5203}
5204EXPORT_SYMBOL_GPL(napi_hash_del);
5205
5206static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5207{
5208 struct napi_struct *napi;
5209
5210 napi = container_of(timer, struct napi_struct, timer);
5211
5212 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
5213 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5214 */
5215 if (napi->gro_list && !napi_disable_pending(napi) &&
5216 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5217 __napi_schedule_irqoff(napi);
5218
5219 return HRTIMER_NORESTART;
5220}
5221
5222void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5223 int (*poll)(struct napi_struct *, int), int weight)
5224{
5225 INIT_LIST_HEAD(&napi->poll_list);
5226 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5227 napi->timer.function = napi_watchdog;
5228 napi->gro_count = 0;
5229 napi->gro_list = NULL;
5230 napi->skb = NULL;
5231 napi->poll = poll;
5232 if (weight > NAPI_POLL_WEIGHT)
5233 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5234 weight, dev->name);
5235 napi->weight = weight;
5236 list_add(&napi->dev_list, &dev->napi_list);
5237 napi->dev = dev;
5238#ifdef CONFIG_NETPOLL
5239 napi->poll_owner = -1;
5240#endif
5241 set_bit(NAPI_STATE_SCHED, &napi->state);
5242 napi_hash_add(napi);
5243}
5244EXPORT_SYMBOL(netif_napi_add);
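
/*
 * Sketch of probe-time NAPI setup using netif_napi_add() above. The
 * "exsetup_" names are hypothetical; NAPI_POLL_WEIGHT is the weight
 * drivers are expected to pass, matching the pr_err_once() check above.
 */
static inline int exsetup_poll(struct napi_struct *napi, int budget)
{
	/* Placeholder body; see the napi_complete_done() sketch above. */
	napi_complete_done(napi, 0);
	return 0;
}

static inline void exsetup_napi(struct net_device *dev,
				struct napi_struct *napi)
{
	netif_napi_add(dev, napi, exsetup_poll, NAPI_POLL_WEIGHT);
	napi_enable(napi);	/* usually done from ndo_open */
}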
5245
5246void napi_disable(struct napi_struct *n)
5247{
5248 might_sleep();
5249 set_bit(NAPI_STATE_DISABLE, &n->state);
5250
5251 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5252 msleep(1);
5253 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5254 msleep(1);
5255
5256 hrtimer_cancel(&n->timer);
5257
5258 clear_bit(NAPI_STATE_DISABLE, &n->state);
5259}
5260EXPORT_SYMBOL(napi_disable);
5261
5262/* Must be called in process context */
5263void netif_napi_del(struct napi_struct *napi)
5264{
5265 might_sleep();
5266 if (napi_hash_del(napi))
5267 synchronize_net();
5268 list_del_init(&napi->dev_list);
5269 napi_free_frags(napi);
5270
5271 kfree_skb_list(napi->gro_list);
5272 napi->gro_list = NULL;
5273 napi->gro_count = 0;
5274}
5275EXPORT_SYMBOL(netif_napi_del);
5276
5277static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5278{
5279 void *have;
5280 int work, weight;
5281
5282 list_del_init(&n->poll_list);
5283
5284 have = netpoll_poll_lock(n);
5285
5286 weight = n->weight;
5287
5288 /* This NAPI_STATE_SCHED test is for avoiding a race
5289 * with netpoll's poll_napi(). Only the entity which
5290 * obtains the lock and sees NAPI_STATE_SCHED set will
5291 * actually make the ->poll() call. Therefore we avoid
5292 * accidentally calling ->poll() when NAPI is not scheduled.
5293 */
5294 work = 0;
5295 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5296 work = n->poll(n, weight);
5297 trace_napi_poll(n, work, weight);
5298 }
5299
5300 WARN_ON_ONCE(work > weight);
5301
5302 if (likely(work < weight))
5303 goto out_unlock;
5304
5305 /* Drivers must not modify the NAPI state if they
5306 * consume the entire weight. In such cases this code
5307 * still "owns" the NAPI instance and therefore can
5308 * move the instance around on the list at-will.
5309 */
5310 if (unlikely(napi_disable_pending(n))) {
5311 napi_complete(n);
5312 goto out_unlock;
5313 }
5314
5315 if (n->gro_list) {
5316 /* flush too old packets
5317 * If HZ < 1000, flush all packets.
5318 */
5319 napi_gro_flush(n, HZ >= 1000);
5320 }
5321
5322 /* Some drivers may have called napi_schedule
5323 * prior to exhausting their budget.
5324 */
5325 if (unlikely(!list_empty(&n->poll_list))) {
5326 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5327 n->dev ? n->dev->name : "backlog");
5328 goto out_unlock;
5329 }
5330
5331 list_add_tail(&n->poll_list, repoll);
5332
5333out_unlock:
5334 netpoll_poll_unlock(have);
5335
5336 return work;
5337}
5338
5339static __latent_entropy void net_rx_action(struct softirq_action *h)
5340{
5341 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5342 unsigned long time_limit = jiffies + 2;
5343 int budget = netdev_budget;
5344 LIST_HEAD(list);
5345 LIST_HEAD(repoll);
5346
5347 local_irq_disable();
5348 list_splice_init(&sd->poll_list, &list);
5349 local_irq_enable();
5350
5351 for (;;) {
5352 struct napi_struct *n;
5353
5354 if (list_empty(&list)) {
5355 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5356 goto out;
5357 break;
5358 }
5359
5360 n = list_first_entry(&list, struct napi_struct, poll_list);
5361 budget -= napi_poll(n, &repoll);
5362
5363 /* If softirq window is exhausted then punt.
5364		 * Allow this to run for 2 jiffies, which will allow
5365 * an average latency of 1.5/HZ.
5366 */
5367 if (unlikely(budget <= 0 ||
5368 time_after_eq(jiffies, time_limit))) {
5369 sd->time_squeeze++;
5370 break;
5371 }
5372 }
5373
5374 local_irq_disable();
5375
5376 list_splice_tail_init(&sd->poll_list, &list);
5377 list_splice_tail(&repoll, &list);
5378 list_splice(&list, &sd->poll_list);
5379 if (!list_empty(&sd->poll_list))
5380 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5381
5382 net_rps_action_and_irq_enable(sd);
5383out:
5384 __kfree_skb_flush();
5385}
5386
5387struct netdev_adjacent {
5388 struct net_device *dev;
5389
5390 /* upper master flag, there can only be one master device per list */
5391 bool master;
5392
5393 /* counter for the number of times this device was added to us */
5394 u16 ref_nr;
5395
5396 /* private field for the users */
5397 void *private;
5398
5399 struct list_head list;
5400 struct rcu_head rcu;
5401};
5402
5403static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5404 struct list_head *adj_list)
5405{
5406 struct netdev_adjacent *adj;
5407
5408 list_for_each_entry(adj, adj_list, list) {
5409 if (adj->dev == adj_dev)
5410 return adj;
5411 }
5412 return NULL;
5413}
5414
5415static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5416{
5417 struct net_device *dev = data;
5418
5419 return upper_dev == dev;
5420}
5421
5422/**
5423 * netdev_has_upper_dev - Check if device is linked to an upper device
5424 * @dev: device
5425 * @upper_dev: upper device to check
5426 *
5427 * Find out if a device is linked to the specified upper device and return true
5428 * in case it is. Note that this checks only the immediate upper device,
5429 * not through a complete stack of devices. The caller must hold the RTNL lock.
5430 */
5431bool netdev_has_upper_dev(struct net_device *dev,
5432 struct net_device *upper_dev)
5433{
5434 ASSERT_RTNL();
5435
5436 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5437 upper_dev);
5438}
5439EXPORT_SYMBOL(netdev_has_upper_dev);
5440
5441/**
5442 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5443 * @dev: device
5444 * @upper_dev: upper device to check
5445 *
5446 * Find out if a device is linked to the specified upper device and return true
5447 * in case it is. Note that this checks the entire upper device chain.
5448 * The caller must hold the RCU read lock.
5449 */
5450
5451bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5452 struct net_device *upper_dev)
5453{
5454 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5455 upper_dev);
5456}
5457EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5458
5459/**
5460 * netdev_has_any_upper_dev - Check if device is linked to some device
5461 * @dev: device
5462 *
5463 * Find out if a device is linked to an upper device and return true in case
5464 * it is. The caller must hold the RTNL lock.
5465 */
5466static bool netdev_has_any_upper_dev(struct net_device *dev)
5467{
5468 ASSERT_RTNL();
5469
5470 return !list_empty(&dev->adj_list.upper);
5471}
5472
5473/**
5474 * netdev_master_upper_dev_get - Get master upper device
5475 * @dev: device
5476 *
5477 * Find a master upper device and return pointer to it or NULL in case
5478 * it's not there. The caller must hold the RTNL lock.
5479 */
5480struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5481{
5482 struct netdev_adjacent *upper;
5483
5484 ASSERT_RTNL();
5485
5486 if (list_empty(&dev->adj_list.upper))
5487 return NULL;
5488
5489 upper = list_first_entry(&dev->adj_list.upper,
5490 struct netdev_adjacent, list);
5491 if (likely(upper->master))
5492 return upper->dev;
5493 return NULL;
5494}
5495EXPORT_SYMBOL(netdev_master_upper_dev_get);
5496
5497/**
5498 * netdev_has_any_lower_dev - Check if device is linked to some device
5499 * @dev: device
5500 *
5501 * Find out if a device is linked to a lower device and return true in case
5502 * it is. The caller must hold the RTNL lock.
5503 */
5504static bool netdev_has_any_lower_dev(struct net_device *dev)
5505{
5506 ASSERT_RTNL();
5507
5508 return !list_empty(&dev->adj_list.lower);
5509}
5510
5511void *netdev_adjacent_get_private(struct list_head *adj_list)
5512{
5513 struct netdev_adjacent *adj;
5514
5515 adj = list_entry(adj_list, struct netdev_adjacent, list);
5516
5517 return adj->private;
5518}
5519EXPORT_SYMBOL(netdev_adjacent_get_private);
5520
5521/**
5522 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5523 * @dev: device
5524 * @iter: list_head ** of the current position
5525 *
5526 * Gets the next device from the dev's upper list, starting from iter
5527 * position. The caller must hold RCU read lock.
5528 */
5529struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5530 struct list_head **iter)
5531{
5532 struct netdev_adjacent *upper;
5533
5534 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5535
5536 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5537
5538 if (&upper->list == &dev->adj_list.upper)
5539 return NULL;
5540
5541 *iter = &upper->list;
5542
5543 return upper->dev;
5544}
5545EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5546
5547static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5548 struct list_head **iter)
5549{
5550 struct netdev_adjacent *upper;
5551
5552 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5553
5554 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5555
5556 if (&upper->list == &dev->adj_list.upper)
5557 return NULL;
5558
5559 *iter = &upper->list;
5560
5561 return upper->dev;
5562}
5563
5564int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5565 int (*fn)(struct net_device *dev,
5566 void *data),
5567 void *data)
5568{
5569 struct net_device *udev;
5570 struct list_head *iter;
5571 int ret;
5572
5573 for (iter = &dev->adj_list.upper,
5574 udev = netdev_next_upper_dev_rcu(dev, &iter);
5575 udev;
5576 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5577 /* first is the upper device itself */
5578 ret = fn(udev, data);
5579 if (ret)
5580 return ret;
5581
5582 /* then look at all of its upper devices */
5583 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5584 if (ret)
5585 return ret;
5586 }
5587
5588 return 0;
5589}
5590EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
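
/*
 * Sketch of how the recursive walker above is used: the callback is
 * invoked for every device in the upper chain together with the opaque
 * @data cookie, and a non-zero return value stops the walk. The
 * counting example is hypothetical.
 */
static inline int excount_one_upper(struct net_device *upper, void *data)
{
	(*(int *)data)++;
	return 0;	/* zero means: keep walking */
}

static inline int excount_all_uppers(struct net_device *dev)
{
	int count = 0;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, excount_one_upper, &count);
	rcu_read_unlock();
	return count;
}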
5591
5592/**
5593 * netdev_lower_get_next_private - Get the next ->private from the
5594 * lower neighbour list
5595 * @dev: device
5596 * @iter: list_head ** of the current position
5597 *
5598 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5599 * list, starting from iter position. The caller must either hold the
5600 * RTNL lock or its own locking that guarantees that the neighbour lower
5601 * list will remain unchanged.
5602 */
5603void *netdev_lower_get_next_private(struct net_device *dev,
5604 struct list_head **iter)
5605{
5606 struct netdev_adjacent *lower;
5607
5608 lower = list_entry(*iter, struct netdev_adjacent, list);
5609
5610 if (&lower->list == &dev->adj_list.lower)
5611 return NULL;
5612
5613 *iter = lower->list.next;
5614
5615 return lower->private;
5616}
5617EXPORT_SYMBOL(netdev_lower_get_next_private);
5618
5619/**
5620 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5621 * lower neighbour list, RCU
5622 * variant
5623 * @dev: device
5624 * @iter: list_head ** of the current position
5625 *
5626 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5627 * list, starting from iter position. The caller must hold RCU read lock.
5628 */
5629void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5630 struct list_head **iter)
5631{
5632 struct netdev_adjacent *lower;
5633
5634 WARN_ON_ONCE(!rcu_read_lock_held());
5635
5636 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5637
5638 if (&lower->list == &dev->adj_list.lower)
5639 return NULL;
5640
5641 *iter = &lower->list;
5642
5643 return lower->private;
5644}
5645EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5646
5647/**
5648 * netdev_lower_get_next - Get the next device from the lower neighbour
5649 * list
5650 * @dev: device
5651 * @iter: list_head ** of the current position
5652 *
5653 * Gets the next netdev_adjacent from the dev's lower neighbour
5654 * list, starting from iter position. The caller must hold RTNL lock or
5655 * its own locking that guarantees that the neighbour lower
5656 * list will remain unchanged.
5657 */
5658void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5659{
5660 struct netdev_adjacent *lower;
5661
5662 lower = list_entry(*iter, struct netdev_adjacent, list);
5663
5664 if (&lower->list == &dev->adj_list.lower)
5665 return NULL;
5666
5667 *iter = lower->list.next;
5668
5669 return lower->dev;
5670}
5671EXPORT_SYMBOL(netdev_lower_get_next);
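
/*
 * Sketch of iterating the immediate lower devices with the accessor
 * above; this is roughly what the netdev_for_each_lower_dev() helper in
 * netdevice.h expands to. Assumes the caller holds RTNL.
 */
static inline void exwalk_lowers(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;

	ASSERT_RTNL();

	for (iter = dev->adj_list.lower.next,
	     lower = netdev_lower_get_next(dev, &iter);
	     lower;
	     lower = netdev_lower_get_next(dev, &iter))
		netdev_dbg(dev, "lower device: %s\n", lower->name);
}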
5672
5673static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5674 struct list_head **iter)
5675{
5676 struct netdev_adjacent *lower;
5677
5678 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5679
5680 if (&lower->list == &dev->adj_list.lower)
5681 return NULL;
5682
5683 *iter = &lower->list;
5684
5685 return lower->dev;
5686}
5687
5688int netdev_walk_all_lower_dev(struct net_device *dev,
5689 int (*fn)(struct net_device *dev,
5690 void *data),
5691 void *data)
5692{
5693 struct net_device *ldev;
5694 struct list_head *iter;
5695 int ret;
5696
5697 for (iter = &dev->adj_list.lower,
5698 ldev = netdev_next_lower_dev(dev, &iter);
5699 ldev;
5700 ldev = netdev_next_lower_dev(dev, &iter)) {
5701 /* first is the lower device itself */
5702 ret = fn(ldev, data);
5703 if (ret)
5704 return ret;
5705
5706 /* then look at all of its lower devices */
5707 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5708 if (ret)
5709 return ret;
5710 }
5711
5712 return 0;
5713}
5714EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5715
5716static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5717 struct list_head **iter)
5718{
5719 struct netdev_adjacent *lower;
5720
5721 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5722 if (&lower->list == &dev->adj_list.lower)
5723 return NULL;
5724
5725 *iter = &lower->list;
5726
5727 return lower->dev;
5728}
5729
5730int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5731 int (*fn)(struct net_device *dev,
5732 void *data),
5733 void *data)
5734{
5735 struct net_device *ldev;
5736 struct list_head *iter;
5737 int ret;
5738
5739 for (iter = &dev->adj_list.lower,
5740 ldev = netdev_next_lower_dev_rcu(dev, &iter);
5741 ldev;
5742 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5743 /* first is the lower device itself */
5744 ret = fn(ldev, data);
5745 if (ret)
5746 return ret;
5747
5748 /* then look at all of its lower devices */
5749 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5750 if (ret)
5751 return ret;
5752 }
5753
5754 return 0;
5755}
5756EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5757
5758/**
5759 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5760 * lower neighbour list, RCU
5761 * variant
5762 * @dev: device
5763 *
5764 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5765 * list. The caller must hold RCU read lock.
5766 */
5767void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5768{
5769 struct netdev_adjacent *lower;
5770
5771 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5772 struct netdev_adjacent, list);
5773 if (lower)
5774 return lower->private;
5775 return NULL;
5776}
5777EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5778
5779/**
5780 * netdev_master_upper_dev_get_rcu - Get master upper device
5781 * @dev: device
5782 *
5783 * Find a master upper device and return pointer to it or NULL in case
5784 * it's not there. The caller must hold the RCU read lock.
5785 */
5786struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5787{
5788 struct netdev_adjacent *upper;
5789
5790 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5791 struct netdev_adjacent, list);
5792 if (upper && likely(upper->master))
5793 return upper->dev;
5794 return NULL;
5795}
5796EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5797
5798static int netdev_adjacent_sysfs_add(struct net_device *dev,
5799 struct net_device *adj_dev,
5800 struct list_head *dev_list)
5801{
5802 char linkname[IFNAMSIZ+7];
5803 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5804 "upper_%s" : "lower_%s", adj_dev->name);
5805 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5806 linkname);
5807}
5808static void netdev_adjacent_sysfs_del(struct net_device *dev,
5809 char *name,
5810 struct list_head *dev_list)
5811{
5812 char linkname[IFNAMSIZ+7];
5813 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5814 "upper_%s" : "lower_%s", name);
5815 sysfs_remove_link(&(dev->dev.kobj), linkname);
5816}
5817
5818static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5819 struct net_device *adj_dev,
5820 struct list_head *dev_list)
5821{
5822 return (dev_list == &dev->adj_list.upper ||
5823 dev_list == &dev->adj_list.lower) &&
5824 net_eq(dev_net(dev), dev_net(adj_dev));
5825}
5826
5827static int __netdev_adjacent_dev_insert(struct net_device *dev,
5828 struct net_device *adj_dev,
5829 struct list_head *dev_list,
5830 void *private, bool master)
5831{
5832 struct netdev_adjacent *adj;
5833 int ret;
5834
5835 adj = __netdev_find_adj(adj_dev, dev_list);
5836
5837 if (adj) {
5838 adj->ref_nr += 1;
5839 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5840 dev->name, adj_dev->name, adj->ref_nr);
5841
5842 return 0;
5843 }
5844
5845 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5846 if (!adj)
5847 return -ENOMEM;
5848
5849 adj->dev = adj_dev;
5850 adj->master = master;
5851 adj->ref_nr = 1;
5852 adj->private = private;
5853 dev_hold(adj_dev);
5854
5855 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5856 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5857
5858 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5859 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5860 if (ret)
5861 goto free_adj;
5862 }
5863
5864 /* Ensure that master link is always the first item in list. */
5865 if (master) {
5866 ret = sysfs_create_link(&(dev->dev.kobj),
5867 &(adj_dev->dev.kobj), "master");
5868 if (ret)
5869 goto remove_symlinks;
5870
5871 list_add_rcu(&adj->list, dev_list);
5872 } else {
5873 list_add_tail_rcu(&adj->list, dev_list);
5874 }
5875
5876 return 0;
5877
5878remove_symlinks:
5879 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5880 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5881free_adj:
5882 kfree(adj);
5883 dev_put(adj_dev);
5884
5885 return ret;
5886}
5887
5888static void __netdev_adjacent_dev_remove(struct net_device *dev,
5889 struct net_device *adj_dev,
5890 u16 ref_nr,
5891 struct list_head *dev_list)
5892{
5893 struct netdev_adjacent *adj;
5894
5895 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5896 dev->name, adj_dev->name, ref_nr);
5897
5898 adj = __netdev_find_adj(adj_dev, dev_list);
5899
5900 if (!adj) {
5901 pr_err("Adjacency does not exist for device %s from %s\n",
5902 dev->name, adj_dev->name);
5903 WARN_ON(1);
5904 return;
5905 }
5906
5907 if (adj->ref_nr > ref_nr) {
5908 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5909 dev->name, adj_dev->name, ref_nr,
5910 adj->ref_nr - ref_nr);
5911 adj->ref_nr -= ref_nr;
5912 return;
5913 }
5914
5915 if (adj->master)
5916 sysfs_remove_link(&(dev->dev.kobj), "master");
5917
5918 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5919 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5920
5921 list_del_rcu(&adj->list);
5922 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5923 adj_dev->name, dev->name, adj_dev->name);
5924 dev_put(adj_dev);
5925 kfree_rcu(adj, rcu);
5926}
5927
5928static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5929 struct net_device *upper_dev,
5930 struct list_head *up_list,
5931 struct list_head *down_list,
5932 void *private, bool master)
5933{
5934 int ret;
5935
5936 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5937 private, master);
5938 if (ret)
5939 return ret;
5940
5941 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5942 private, false);
5943 if (ret) {
5944 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5945 return ret;
5946 }
5947
5948 return 0;
5949}
5950
5951static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5952 struct net_device *upper_dev,
5953 u16 ref_nr,
5954 struct list_head *up_list,
5955 struct list_head *down_list)
5956{
5957 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5958 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5959}
5960
5961static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5962 struct net_device *upper_dev,
5963 void *private, bool master)
5964{
5965 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5966 &dev->adj_list.upper,
5967 &upper_dev->adj_list.lower,
5968 private, master);
5969}
5970
5971static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5972 struct net_device *upper_dev)
5973{
5974 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5975 &dev->adj_list.upper,
5976 &upper_dev->adj_list.lower);
5977}
5978
5979static int __netdev_upper_dev_link(struct net_device *dev,
5980 struct net_device *upper_dev, bool master,
5981 void *upper_priv, void *upper_info)
5982{
5983 struct netdev_notifier_changeupper_info changeupper_info;
5984 int ret = 0;
5985
5986 ASSERT_RTNL();
5987
5988 if (dev == upper_dev)
5989 return -EBUSY;
5990
5991 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
5992 if (netdev_has_upper_dev(upper_dev, dev))
5993 return -EBUSY;
5994
5995 if (netdev_has_upper_dev(dev, upper_dev))
5996 return -EEXIST;
5997
5998 if (master && netdev_master_upper_dev_get(dev))
5999 return -EBUSY;
6000
6001 changeupper_info.upper_dev = upper_dev;
6002 changeupper_info.master = master;
6003 changeupper_info.linking = true;
6004 changeupper_info.upper_info = upper_info;
6005
6006 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6007 &changeupper_info.info);
6008 ret = notifier_to_errno(ret);
6009 if (ret)
6010 return ret;
6011
6012 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6013 master);
6014 if (ret)
6015 return ret;
6016
6017 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6018 &changeupper_info.info);
6019 ret = notifier_to_errno(ret);
6020 if (ret)
6021 goto rollback;
6022
6023 return 0;
6024
6025rollback:
6026 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6027
6028 return ret;
6029}
6030
6031/**
6032 * netdev_upper_dev_link - Add a link to the upper device
6033 * @dev: device
6034 * @upper_dev: new upper device
6035 *
6036 * Adds a link to device which is upper to this one. The caller must hold
6037 * the RTNL lock. On a failure a negative errno code is returned.
6038 * On success the reference counts are adjusted and the function
6039 * returns zero.
6040 */
6041int netdev_upper_dev_link(struct net_device *dev,
6042 struct net_device *upper_dev)
6043{
6044 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
6045}
6046EXPORT_SYMBOL(netdev_upper_dev_link);
6047
6048/**
6049 * netdev_master_upper_dev_link - Add a master link to the upper device
6050 * @dev: device
6051 * @upper_dev: new upper device
6052 * @upper_priv: upper device private
6053 * @upper_info: upper info to be passed down via notifier
6054 *
6055 * Adds a link to device which is upper to this one. In this case, only
6056 * one master upper device can be linked, although other non-master devices
6057 * might be linked as well. The caller must hold the RTNL lock.
6058 * On a failure a negative errno code is returned. On success the reference
6059 * counts are adjusted and the function returns zero.
6060 */
6061int netdev_master_upper_dev_link(struct net_device *dev,
6062 struct net_device *upper_dev,
6063 void *upper_priv, void *upper_info)
6064{
6065 return __netdev_upper_dev_link(dev, upper_dev, true,
6066 upper_priv, upper_info);
6067}
6068EXPORT_SYMBOL(netdev_master_upper_dev_link);
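
/*
 * Illustrative sketch (editor's example, not part of this file): a
 * bonding/bridge-like driver enslaving a port links it to its master
 * upper device under RTNL and undoes the link on release. example_enslave()
 * and the NULL private/info arguments are assumptions for this sketch.
 *
 *    static int example_enslave(struct net_device *master,
 *                               struct net_device *port)
 *    {
 *        int err;
 *
 *        ASSERT_RTNL();
 *        err = netdev_master_upper_dev_link(port, master, NULL, NULL);
 *        if (err)
 *            return err;
 *        // ... configure the enslaved port ...
 *        return 0;
 *    }
 *
 * The reverse operation is netdev_upper_dev_unlink(port, master), also
 * under RTNL (see below).
 */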
6069
6070/**
6071 * netdev_upper_dev_unlink - Removes a link to upper device
6072 * @dev: device
6073 * @upper_dev: upper device to unlink
6074 *
6075 * Removes a link to device which is upper to this one. The caller must hold
6076 * the RTNL lock.
6077 */
6078void netdev_upper_dev_unlink(struct net_device *dev,
6079 struct net_device *upper_dev)
6080{
6081 struct netdev_notifier_changeupper_info changeupper_info;
6082 ASSERT_RTNL();
6083
6084 changeupper_info.upper_dev = upper_dev;
6085 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6086 changeupper_info.linking = false;
6087
6088 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6089 &changeupper_info.info);
6090
6091 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6092
6093 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6094 &changeupper_info.info);
6095}
6096EXPORT_SYMBOL(netdev_upper_dev_unlink);
6097
6098/**
6099 * netdev_bonding_info_change - Dispatch event about slave change
6100 * @dev: device
6101 * @bonding_info: info to dispatch
6102 *
6103 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6104 * The caller must hold the RTNL lock.
6105 */
6106void netdev_bonding_info_change(struct net_device *dev,
6107 struct netdev_bonding_info *bonding_info)
6108{
6109 struct netdev_notifier_bonding_info info;
6110
6111 memcpy(&info.bonding_info, bonding_info,
6112 sizeof(struct netdev_bonding_info));
6113 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6114 &info.info);
6115}
6116EXPORT_SYMBOL(netdev_bonding_info_change);
6117
6118static void netdev_adjacent_add_links(struct net_device *dev)
6119{
6120 struct netdev_adjacent *iter;
6121
6122 struct net *net = dev_net(dev);
6123
6124 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6125 if (!net_eq(net, dev_net(iter->dev)))
6126 continue;
6127 netdev_adjacent_sysfs_add(iter->dev, dev,
6128 &iter->dev->adj_list.lower);
6129 netdev_adjacent_sysfs_add(dev, iter->dev,
6130 &dev->adj_list.upper);
6131 }
6132
6133 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6134 if (!net_eq(net, dev_net(iter->dev)))
6135 continue;
6136 netdev_adjacent_sysfs_add(iter->dev, dev,
6137 &iter->dev->adj_list.upper);
6138 netdev_adjacent_sysfs_add(dev, iter->dev,
6139 &dev->adj_list.lower);
6140 }
6141}
6142
6143static void netdev_adjacent_del_links(struct net_device *dev)
6144{
6145 struct netdev_adjacent *iter;
6146
6147 struct net *net = dev_net(dev);
6148
6149 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6150 if (!net_eq(net, dev_net(iter->dev)))
6151 continue;
6152 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6153 &iter->dev->adj_list.lower);
6154 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6155 &dev->adj_list.upper);
6156 }
6157
6158 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6159 if (!net_eq(net, dev_net(iter->dev)))
6160 continue;
6161 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6162 &iter->dev->adj_list.upper);
6163 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6164 &dev->adj_list.lower);
6165 }
6166}
6167
6168void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6169{
6170 struct netdev_adjacent *iter;
6171
6172 struct net *net = dev_net(dev);
6173
6174 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6175 if (!net_eq(net, dev_net(iter->dev)))
6176 continue;
6177 netdev_adjacent_sysfs_del(iter->dev, oldname,
6178 &iter->dev->adj_list.lower);
6179 netdev_adjacent_sysfs_add(iter->dev, dev,
6180 &iter->dev->adj_list.lower);
6181 }
6182
6183 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6184 if (!net_eq(net, dev_net(iter->dev)))
6185 continue;
6186 netdev_adjacent_sysfs_del(iter->dev, oldname,
6187 &iter->dev->adj_list.upper);
6188 netdev_adjacent_sysfs_add(iter->dev, dev,
6189 &iter->dev->adj_list.upper);
6190 }
6191}
6192
6193void *netdev_lower_dev_get_private(struct net_device *dev,
6194 struct net_device *lower_dev)
6195{
6196 struct netdev_adjacent *lower;
6197
6198 if (!lower_dev)
6199 return NULL;
6200 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6201 if (!lower)
6202 return NULL;
6203
6204 return lower->private;
6205}
6206EXPORT_SYMBOL(netdev_lower_dev_get_private);
6207
6208
6209int dev_get_nest_level(struct net_device *dev)
6210{
6211 struct net_device *lower = NULL;
6212 struct list_head *iter;
6213 int max_nest = -1;
6214 int nest;
6215
6216 ASSERT_RTNL();
6217
6218 netdev_for_each_lower_dev(dev, lower, iter) {
6219 nest = dev_get_nest_level(lower);
6220 if (max_nest < nest)
6221 max_nest = nest;
6222 }
6223
6224 return max_nest + 1;
6225}
6226EXPORT_SYMBOL(dev_get_nest_level);
6227
6228/**
6229 * netdev_lower_state_changed - Dispatch event about lower device state change
6230 * @lower_dev: device
6231 * @lower_state_info: state to dispatch
6232 *
6233 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6234 * The caller must hold the RTNL lock.
6235 */
6236void netdev_lower_state_changed(struct net_device *lower_dev,
6237 void *lower_state_info)
6238{
6239 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6240
6241 ASSERT_RTNL();
6242 changelowerstate_info.lower_state_info = lower_state_info;
6243 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6244 &changelowerstate_info.info);
6245}
6246EXPORT_SYMBOL(netdev_lower_state_changed);
6247
6248int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6249 struct neighbour *n)
6250{
6251 struct net_device *lower_dev, *stop_dev;
6252 struct list_head *iter;
6253 int err;
6254
6255 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6256 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6257 continue;
6258 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6259 if (err) {
6260 stop_dev = lower_dev;
6261 goto rollback;
6262 }
6263 }
6264 return 0;
6265
6266rollback:
6267 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6268 if (lower_dev == stop_dev)
6269 break;
6270 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6271 continue;
6272 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6273 }
6274 return err;
6275}
6276EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6277
6278void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6279 struct neighbour *n)
6280{
6281 struct net_device *lower_dev;
6282 struct list_head *iter;
6283
6284 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6285 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6286 continue;
6287 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6288 }
6289}
6290EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6291
6292static void dev_change_rx_flags(struct net_device *dev, int flags)
6293{
6294 const struct net_device_ops *ops = dev->netdev_ops;
6295
6296 if (ops->ndo_change_rx_flags)
6297 ops->ndo_change_rx_flags(dev, flags);
6298}
6299
6300static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6301{
6302 unsigned int old_flags = dev->flags;
6303 kuid_t uid;
6304 kgid_t gid;
6305
6306 ASSERT_RTNL();
6307
6308 dev->flags |= IFF_PROMISC;
6309 dev->promiscuity += inc;
6310 if (dev->promiscuity == 0) {
6311 /*
6312 * Avoid overflow.
6313 * If inc causes overflow, leave promiscuity untouched and return an error.
6314 */
6315 if (inc < 0)
6316 dev->flags &= ~IFF_PROMISC;
6317 else {
6318 dev->promiscuity -= inc;
6319 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6320 dev->name);
6321 return -EOVERFLOW;
6322 }
6323 }
6324 if (dev->flags != old_flags) {
6325 pr_info("device %s %s promiscuous mode\n",
6326 dev->name,
6327 dev->flags & IFF_PROMISC ? "entered" : "left");
6328 if (audit_enabled) {
6329 current_uid_gid(&uid, &gid);
6330 audit_log(current->audit_context, GFP_ATOMIC,
6331 AUDIT_ANOM_PROMISCUOUS,
6332 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6333 dev->name, (dev->flags & IFF_PROMISC),
6334 (old_flags & IFF_PROMISC),
6335 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6336 from_kuid(&init_user_ns, uid),
6337 from_kgid(&init_user_ns, gid),
6338 audit_get_sessionid(current));
6339 }
6340
6341 dev_change_rx_flags(dev, IFF_PROMISC);
6342 }
6343 if (notify)
6344 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6345 return 0;
6346}
6347
6348/**
6349 * dev_set_promiscuity - update promiscuity count on a device
6350 * @dev: device
6351 * @inc: modifier
6352 *
6353 * Add or remove promiscuity from a device. While the count in the device
6354 * remains above zero the interface remains promiscuous. Once it hits zero
6355 * the device reverts back to normal filtering operation. A negative inc
6356 * value is used to drop promiscuity on the device.
6357 * Return 0 if successful or a negative errno code on error.
6358 */
6359int dev_set_promiscuity(struct net_device *dev, int inc)
6360{
6361 unsigned int old_flags = dev->flags;
6362 int err;
6363
6364 err = __dev_set_promiscuity(dev, inc, true);
6365 if (err < 0)
6366 return err;
6367 if (dev->flags != old_flags)
6368 dev_set_rx_mode(dev);
6369 return err;
6370}
6371EXPORT_SYMBOL(dev_set_promiscuity);
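
/*
 * Illustrative sketch (editor's example, not part of this file): a feature
 * that needs to see all frames takes a promiscuity reference while active
 * and drops it when done, always under RTNL. example_capture_start() and
 * example_capture_stop() are assumed names for this sketch.
 *
 *    static int example_capture_start(struct net_device *dev)
 *    {
 *        int err;
 *
 *        rtnl_lock();
 *        err = dev_set_promiscuity(dev, 1);    // take one reference
 *        rtnl_unlock();
 *        return err;
 *    }
 *
 *    static void example_capture_stop(struct net_device *dev)
 *    {
 *        rtnl_lock();
 *        dev_set_promiscuity(dev, -1);         // drop our reference
 *        rtnl_unlock();
 *    }
 */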
6372
6373static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6374{
6375 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6376
6377 ASSERT_RTNL();
6378
6379 dev->flags |= IFF_ALLMULTI;
6380 dev->allmulti += inc;
6381 if (dev->allmulti == 0) {
6382 /*
6383 * Avoid overflow.
6384 * If inc causes overflow, leave allmulti untouched and return an error.
6385 */
6386 if (inc < 0)
6387 dev->flags &= ~IFF_ALLMULTI;
6388 else {
6389 dev->allmulti -= inc;
6390 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6391 dev->name);
6392 return -EOVERFLOW;
6393 }
6394 }
6395 if (dev->flags ^ old_flags) {
6396 dev_change_rx_flags(dev, IFF_ALLMULTI);
6397 dev_set_rx_mode(dev);
6398 if (notify)
6399 __dev_notify_flags(dev, old_flags,
6400 dev->gflags ^ old_gflags);
6401 }
6402 return 0;
6403}
6404
6405/**
6406 * dev_set_allmulti - update allmulti count on a device
6407 * @dev: device
6408 * @inc: modifier
6409 *
6410 * Add or remove reception of all multicast frames to a device. While the
6411 * count in the device remains above zero the interface remains listening
6412 * to all multicast frames. Once it hits zero the device reverts back to normal
6413 * filtering operation. A negative @inc value is used to drop the counter
6414 * when releasing a resource needing all multicasts.
6415 * Return 0 if successful or a negative errno code on error.
6416 */
6417
6418int dev_set_allmulti(struct net_device *dev, int inc)
6419{
6420 return __dev_set_allmulti(dev, inc, true);
6421}
6422EXPORT_SYMBOL(dev_set_allmulti);
6423
6424/*
6425 * Upload unicast and multicast address lists to device and
6426 * configure RX filtering. When the device doesn't support unicast
6427 * filtering it is put in promiscuous mode while unicast addresses
6428 * are present.
6429 */
6430void __dev_set_rx_mode(struct net_device *dev)
6431{
6432 const struct net_device_ops *ops = dev->netdev_ops;
6433
6434 /* dev_open will call this function so the list will stay sane. */
6435 if (!(dev->flags&IFF_UP))
6436 return;
6437
6438 if (!netif_device_present(dev))
6439 return;
6440
6441 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6442 /* Unicast address changes may only happen under the rtnl,
6443 * therefore calling __dev_set_promiscuity here is safe.
6444 */
6445 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6446 __dev_set_promiscuity(dev, 1, false);
6447 dev->uc_promisc = true;
6448 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6449 __dev_set_promiscuity(dev, -1, false);
6450 dev->uc_promisc = false;
6451 }
6452 }
6453
6454 if (ops->ndo_set_rx_mode)
6455 ops->ndo_set_rx_mode(dev);
6456}
6457
6458void dev_set_rx_mode(struct net_device *dev)
6459{
6460 netif_addr_lock_bh(dev);
6461 __dev_set_rx_mode(dev);
6462 netif_addr_unlock_bh(dev);
6463}
6464
6465/**
6466 * dev_get_flags - get flags reported to userspace
6467 * @dev: device
6468 *
6469 * Get the combination of flag bits exported through APIs to userspace.
6470 */
6471unsigned int dev_get_flags(const struct net_device *dev)
6472{
6473 unsigned int flags;
6474
6475 flags = (dev->flags & ~(IFF_PROMISC |
6476 IFF_ALLMULTI |
6477 IFF_RUNNING |
6478 IFF_LOWER_UP |
6479 IFF_DORMANT)) |
6480 (dev->gflags & (IFF_PROMISC |
6481 IFF_ALLMULTI));
6482
6483 if (netif_running(dev)) {
6484 if (netif_oper_up(dev))
6485 flags |= IFF_RUNNING;
6486 if (netif_carrier_ok(dev))
6487 flags |= IFF_LOWER_UP;
6488 if (netif_dormant(dev))
6489 flags |= IFF_DORMANT;
6490 }
6491
6492 return flags;
6493}
6494EXPORT_SYMBOL(dev_get_flags);
6495
6496int __dev_change_flags(struct net_device *dev, unsigned int flags)
6497{
6498 unsigned int old_flags = dev->flags;
6499 int ret;
6500
6501 ASSERT_RTNL();
6502
6503 /*
6504 * Set the flags on our device.
6505 */
6506
6507 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6508 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6509 IFF_AUTOMEDIA)) |
6510 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6511 IFF_ALLMULTI));
6512
6513 /*
6514 * Load in the correct multicast list now the flags have changed.
6515 */
6516
6517 if ((old_flags ^ flags) & IFF_MULTICAST)
6518 dev_change_rx_flags(dev, IFF_MULTICAST);
6519
6520 dev_set_rx_mode(dev);
6521
6522 /*
6523 * Have we downed the interface? We handle IFF_UP ourselves
6524 * according to user attempts to set it, rather than blindly
6525 * setting it.
6526 */
6527
6528 ret = 0;
6529 if ((old_flags ^ flags) & IFF_UP)
6530 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6531
6532 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6533 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6534 unsigned int old_flags = dev->flags;
6535
6536 dev->gflags ^= IFF_PROMISC;
6537
6538 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6539 if (dev->flags != old_flags)
6540 dev_set_rx_mode(dev);
6541 }
6542
6543 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6544 is important. Some (broken) drivers set IFF_PROMISC when
6545 IFF_ALLMULTI is requested, without asking us and without reporting it.
6546 */
6547 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6548 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6549
6550 dev->gflags ^= IFF_ALLMULTI;
6551 __dev_set_allmulti(dev, inc, false);
6552 }
6553
6554 return ret;
6555}
6556
6557void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6558 unsigned int gchanges)
6559{
6560 unsigned int changes = dev->flags ^ old_flags;
6561
6562 if (gchanges)
6563 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6564
6565 if (changes & IFF_UP) {
6566 if (dev->flags & IFF_UP)
6567 call_netdevice_notifiers(NETDEV_UP, dev);
6568 else
6569 call_netdevice_notifiers(NETDEV_DOWN, dev);
6570 }
6571
6572 if (dev->flags & IFF_UP &&
6573 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6574 struct netdev_notifier_change_info change_info;
6575
6576 change_info.flags_changed = changes;
6577 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6578 &change_info.info);
6579 }
6580}
6581
6582/**
6583 * dev_change_flags - change device settings
6584 * @dev: device
6585 * @flags: device state flags
6586 *
6587 * Change settings on device based state flags. The flags are
6588 * in the userspace exported format.
6589 */
6590int dev_change_flags(struct net_device *dev, unsigned int flags)
6591{
6592 int ret;
6593 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6594
6595 ret = __dev_change_flags(dev, flags);
6596 if (ret < 0)
6597 return ret;
6598
6599 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6600 __dev_notify_flags(dev, old_flags, changes);
6601 return ret;
6602}
6603EXPORT_SYMBOL(dev_change_flags);
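
/*
 * Illustrative sketch (editor's example, not part of this file): bringing
 * an interface administratively up, SIOCSIFFLAGS-style, by feeding the
 * userspace-format flags back in with IFF_UP set. example_if_up() is an
 * assumed name for this sketch.
 *
 *    static int example_if_up(struct net_device *dev)
 *    {
 *        int err;
 *
 *        rtnl_lock();
 *        err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *        rtnl_unlock();
 *        return err;
 *    }
 */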
6604
6605static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6606{
6607 const struct net_device_ops *ops = dev->netdev_ops;
6608
6609 if (ops->ndo_change_mtu)
6610 return ops->ndo_change_mtu(dev, new_mtu);
6611
6612 dev->mtu = new_mtu;
6613 return 0;
6614}
6615
6616/**
6617 * dev_set_mtu - Change maximum transfer unit
6618 * @dev: device
6619 * @new_mtu: new transfer unit
6620 *
6621 * Change the maximum transfer size of the network device.
6622 */
6623int dev_set_mtu(struct net_device *dev, int new_mtu)
6624{
6625 int err, orig_mtu;
6626
6627 if (new_mtu == dev->mtu)
6628 return 0;
6629
6630 /* MTU must be positive, and in range */
6631 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6632 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6633 dev->name, new_mtu, dev->min_mtu);
6634 return -EINVAL;
6635 }
6636
6637 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6638 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6639 dev->name, new_mtu, dev->max_mtu);
6640 return -EINVAL;
6641 }
6642
6643 if (!netif_device_present(dev))
6644 return -ENODEV;
6645
6646 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6647 err = notifier_to_errno(err);
6648 if (err)
6649 return err;
6650
6651 orig_mtu = dev->mtu;
6652 err = __dev_set_mtu(dev, new_mtu);
6653
6654 if (!err) {
6655 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6656 err = notifier_to_errno(err);
6657 if (err) {
6658 /* setting mtu back and notifying everyone again,
6659 * so that they have a chance to revert changes.
6660 */
6661 __dev_set_mtu(dev, orig_mtu);
6662 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6663 }
6664 }
6665 return err;
6666}
6667EXPORT_SYMBOL(dev_set_mtu);
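
/*
 * Illustrative sketch (editor's example, not part of this file): callers
 * change the MTU under RTNL and should expect -EINVAL when the request
 * falls outside the device's min_mtu/max_mtu range checked above. The
 * jumbo-frame value and example_set_jumbo() name are assumptions.
 *
 *    static int example_set_jumbo(struct net_device *dev)
 *    {
 *        int err;
 *
 *        rtnl_lock();
 *        err = dev_set_mtu(dev, 9000);
 *        rtnl_unlock();
 *        return err;
 *    }
 */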
6668
6669/**
6670 * dev_set_group - Change group this device belongs to
6671 * @dev: device
6672 * @new_group: group this device should belong to
6673 */
6674void dev_set_group(struct net_device *dev, int new_group)
6675{
6676 dev->group = new_group;
6677}
6678EXPORT_SYMBOL(dev_set_group);
6679
6680/**
6681 * dev_set_mac_address - Change Media Access Control Address
6682 * @dev: device
6683 * @sa: new address
6684 *
6685 * Change the hardware (MAC) address of the device
6686 */
6687int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6688{
6689 const struct net_device_ops *ops = dev->netdev_ops;
6690 int err;
6691
6692 if (!ops->ndo_set_mac_address)
6693 return -EOPNOTSUPP;
6694 if (sa->sa_family != dev->type)
6695 return -EINVAL;
6696 if (!netif_device_present(dev))
6697 return -ENODEV;
6698 err = ops->ndo_set_mac_address(dev, sa);
6699 if (err)
6700 return err;
6701 dev->addr_assign_type = NET_ADDR_SET;
6702 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6703 add_device_randomness(dev->dev_addr, dev->addr_len);
6704 return 0;
6705}
6706EXPORT_SYMBOL(dev_set_mac_address);
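
/*
 * Illustrative sketch (editor's example, not part of this file): the
 * sockaddr handed to dev_set_mac_address() must carry the device's own
 * address family (dev->type) and the new address in sa_data, as the checks
 * above require. example_set_mac() and new_mac are assumed names.
 *
 *    static int example_set_mac(struct net_device *dev, const u8 *new_mac)
 *    {
 *        struct sockaddr sa;
 *        int err;
 *
 *        sa.sa_family = dev->type;
 *        memcpy(sa.sa_data, new_mac, dev->addr_len);
 *
 *        rtnl_lock();
 *        err = dev_set_mac_address(dev, &sa);
 *        rtnl_unlock();
 *        return err;
 *    }
 */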
6707
6708/**
6709 * dev_change_carrier - Change device carrier
6710 * @dev: device
6711 * @new_carrier: new value
6712 *
6713 * Change device carrier
6714 */
6715int dev_change_carrier(struct net_device *dev, bool new_carrier)
6716{
6717 const struct net_device_ops *ops = dev->netdev_ops;
6718
6719 if (!ops->ndo_change_carrier)
6720 return -EOPNOTSUPP;
6721 if (!netif_device_present(dev))
6722 return -ENODEV;
6723 return ops->ndo_change_carrier(dev, new_carrier);
6724}
6725EXPORT_SYMBOL(dev_change_carrier);
6726
6727/**
6728 * dev_get_phys_port_id - Get device physical port ID
6729 * @dev: device
6730 * @ppid: port ID
6731 *
6732 * Get device physical port ID
6733 */
6734int dev_get_phys_port_id(struct net_device *dev,
6735 struct netdev_phys_item_id *ppid)
6736{
6737 const struct net_device_ops *ops = dev->netdev_ops;
6738
6739 if (!ops->ndo_get_phys_port_id)
6740 return -EOPNOTSUPP;
6741 return ops->ndo_get_phys_port_id(dev, ppid);
6742}
6743EXPORT_SYMBOL(dev_get_phys_port_id);
6744
6745/**
6746 * dev_get_phys_port_name - Get device physical port name
6747 * @dev: device
6748 * @name: port name
6749 * @len: limit of bytes to copy to name
6750 *
6751 * Get device physical port name
6752 */
6753int dev_get_phys_port_name(struct net_device *dev,
6754 char *name, size_t len)
6755{
6756 const struct net_device_ops *ops = dev->netdev_ops;
6757
6758 if (!ops->ndo_get_phys_port_name)
6759 return -EOPNOTSUPP;
6760 return ops->ndo_get_phys_port_name(dev, name, len);
6761}
6762EXPORT_SYMBOL(dev_get_phys_port_name);
6763
6764/**
6765 * dev_change_proto_down - update protocol port state information
6766 * @dev: device
6767 * @proto_down: new value
6768 *
6769 * This info can be used by switch drivers to set the phys state of the
6770 * port.
6771 */
6772int dev_change_proto_down(struct net_device *dev, bool proto_down)
6773{
6774 const struct net_device_ops *ops = dev->netdev_ops;
6775
6776 if (!ops->ndo_change_proto_down)
6777 return -EOPNOTSUPP;
6778 if (!netif_device_present(dev))
6779 return -ENODEV;
6780 return ops->ndo_change_proto_down(dev, proto_down);
6781}
6782EXPORT_SYMBOL(dev_change_proto_down);
6783
6784/**
6785 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6786 * @dev: device
6787 * @fd: new program fd or negative value to clear
6788 * @flags: xdp-related flags
6789 *
6790 * Set or clear a bpf program for a device
6791 */
6792int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6793{
6794 const struct net_device_ops *ops = dev->netdev_ops;
6795 struct bpf_prog *prog = NULL;
6796 struct netdev_xdp xdp;
6797 int err;
6798
6799 ASSERT_RTNL();
6800
6801 if (!ops->ndo_xdp)
6802 return -EOPNOTSUPP;
6803 if (fd >= 0) {
6804 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6805 memset(&xdp, 0, sizeof(xdp));
6806 xdp.command = XDP_QUERY_PROG;
6807
6808 err = ops->ndo_xdp(dev, &xdp);
6809 if (err < 0)
6810 return err;
6811 if (xdp.prog_attached)
6812 return -EBUSY;
6813 }
6814
6815 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6816 if (IS_ERR(prog))
6817 return PTR_ERR(prog);
6818 }
6819
6820 memset(&xdp, 0, sizeof(xdp));
6821 xdp.command = XDP_SETUP_PROG;
6822 xdp.prog = prog;
6823
6824 err = ops->ndo_xdp(dev, &xdp);
6825 if (err < 0 && prog)
6826 bpf_prog_put(prog);
6827
6828 return err;
6829}
6830EXPORT_SYMBOL(dev_change_xdp_fd);
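
/*
 * Illustrative sketch (editor's example, not part of this file): the fd
 * refers to an already loaded BPF_PROG_TYPE_XDP program and the call must
 * run under RTNL, as rtnetlink does. example_attach_xdp() is an assumed
 * name; passing a negative fd clears the program.
 *
 *    static int example_attach_xdp(struct net_device *dev, int prog_fd)
 *    {
 *        int err;
 *
 *        rtnl_lock();
 *        err = dev_change_xdp_fd(dev, prog_fd,
 *                                XDP_FLAGS_UPDATE_IF_NOEXIST);
 *        rtnl_unlock();
 *        return err;
 *    }
 */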
6831
6832/**
6833 * dev_new_index - allocate an ifindex
6834 * @net: the applicable net namespace
6835 *
6836 * Returns a suitable unique value for a new device interface
6837 * number. The caller must hold the rtnl semaphore or the
6838 * dev_base_lock to be sure it remains unique.
6839 */
6840static int dev_new_index(struct net *net)
6841{
6842 int ifindex = net->ifindex;
6843 for (;;) {
6844 if (++ifindex <= 0)
6845 ifindex = 1;
6846 if (!__dev_get_by_index(net, ifindex))
6847 return net->ifindex = ifindex;
6848 }
6849}
6850
6851/* Delayed registration/unregisteration */
6852static LIST_HEAD(net_todo_list);
6853DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6854
6855static void net_set_todo(struct net_device *dev)
6856{
6857 list_add_tail(&dev->todo_list, &net_todo_list);
6858 dev_net(dev)->dev_unreg_count++;
6859}
6860
6861static void rollback_registered_many(struct list_head *head)
6862{
6863 struct net_device *dev, *tmp;
6864 LIST_HEAD(close_head);
6865
6866 BUG_ON(dev_boot_phase);
6867 ASSERT_RTNL();
6868
6869 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6870 /* Some devices call without registering
6871 * for initialization unwind. Remove those
6872 * devices and proceed with the remaining.
6873 */
6874 if (dev->reg_state == NETREG_UNINITIALIZED) {
6875 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6876 dev->name, dev);
6877
6878 WARN_ON(1);
6879 list_del(&dev->unreg_list);
6880 continue;
6881 }
6882 dev->dismantle = true;
6883 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6884 }
6885
6886 /* If device is running, close it first. */
6887 list_for_each_entry(dev, head, unreg_list)
6888 list_add_tail(&dev->close_list, &close_head);
6889 dev_close_many(&close_head, true);
6890
6891 list_for_each_entry(dev, head, unreg_list) {
6892 /* And unlink it from device chain. */
6893 unlist_netdevice(dev);
6894
6895 dev->reg_state = NETREG_UNREGISTERING;
6896 }
6897 flush_all_backlogs();
6898
6899 synchronize_net();
6900
6901 list_for_each_entry(dev, head, unreg_list) {
6902 struct sk_buff *skb = NULL;
6903
6904 /* Shutdown queueing discipline. */
6905 dev_shutdown(dev);
6906
6907
6908 /* Notify protocols that we are about to destroy
6909 this device. They should clean up all their state.
6910 */
6911 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6912
6913 if (!dev->rtnl_link_ops ||
6914 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6915 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6916 GFP_KERNEL);
6917
6918 /*
6919 * Flush the unicast and multicast chains
6920 */
6921 dev_uc_flush(dev);
6922 dev_mc_flush(dev);
6923
6924 if (dev->netdev_ops->ndo_uninit)
6925 dev->netdev_ops->ndo_uninit(dev);
6926
6927 if (skb)
6928 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6929
6930 /* Notifier chain MUST detach us all upper devices. */
6931 WARN_ON(netdev_has_any_upper_dev(dev));
6932 WARN_ON(netdev_has_any_lower_dev(dev));
6933
6934 /* Remove entries from kobject tree */
6935 netdev_unregister_kobject(dev);
6936#ifdef CONFIG_XPS
6937 /* Remove XPS queueing entries */
6938 netif_reset_xps_queues_gt(dev, 0);
6939#endif
6940 }
6941
6942 synchronize_net();
6943
6944 list_for_each_entry(dev, head, unreg_list)
6945 dev_put(dev);
6946}
6947
6948static void rollback_registered(struct net_device *dev)
6949{
6950 LIST_HEAD(single);
6951
6952 list_add(&dev->unreg_list, &single);
6953 rollback_registered_many(&single);
6954 list_del(&single);
6955}
6956
6957static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6958 struct net_device *upper, netdev_features_t features)
6959{
6960 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6961 netdev_features_t feature;
6962 int feature_bit;
6963
6964 for_each_netdev_feature(&upper_disables, feature_bit) {
6965 feature = __NETIF_F_BIT(feature_bit);
6966 if (!(upper->wanted_features & feature)
6967 && (features & feature)) {
6968 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6969 &feature, upper->name);
6970 features &= ~feature;
6971 }
6972 }
6973
6974 return features;
6975}
6976
6977static void netdev_sync_lower_features(struct net_device *upper,
6978 struct net_device *lower, netdev_features_t features)
6979{
6980 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6981 netdev_features_t feature;
6982 int feature_bit;
6983
6984 for_each_netdev_feature(&upper_disables, feature_bit) {
6985 feature = __NETIF_F_BIT(feature_bit);
6986 if (!(features & feature) && (lower->features & feature)) {
6987 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6988 &feature, lower->name);
6989 lower->wanted_features &= ~feature;
6990 netdev_update_features(lower);
6991
6992 if (unlikely(lower->features & feature))
6993 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6994 &feature, lower->name);
6995 }
6996 }
6997}
6998
6999static netdev_features_t netdev_fix_features(struct net_device *dev,
7000 netdev_features_t features)
7001{
7002 /* Fix illegal checksum combinations */
7003 if ((features & NETIF_F_HW_CSUM) &&
7004 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7005 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7006 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7007 }
7008
7009 /* TSO requires that SG is present as well. */
7010 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7011 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7012 features &= ~NETIF_F_ALL_TSO;
7013 }
7014
7015 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7016 !(features & NETIF_F_IP_CSUM)) {
7017 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7018 features &= ~NETIF_F_TSO;
7019 features &= ~NETIF_F_TSO_ECN;
7020 }
7021
7022 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7023 !(features & NETIF_F_IPV6_CSUM)) {
7024 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7025 features &= ~NETIF_F_TSO6;
7026 }
7027
7028 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7029 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7030 features &= ~NETIF_F_TSO_MANGLEID;
7031
7032 /* TSO ECN requires that TSO is present as well. */
7033 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7034 features &= ~NETIF_F_TSO_ECN;
7035
7036 /* Software GSO depends on SG. */
7037 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7038 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7039 features &= ~NETIF_F_GSO;
7040 }
7041
7042 /* UFO needs SG and checksumming */
7043 if (features & NETIF_F_UFO) {
7044 /* maybe split UFO into V4 and V6? */
7045 if (!(features & NETIF_F_HW_CSUM) &&
7046 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
7047 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
7048 netdev_dbg(dev,
7049 "Dropping NETIF_F_UFO since no checksum offload features.\n");
7050 features &= ~NETIF_F_UFO;
7051 }
7052
7053 if (!(features & NETIF_F_SG)) {
7054 netdev_dbg(dev,
7055 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
7056 features &= ~NETIF_F_UFO;
7057 }
7058 }
7059
7060 /* GSO partial features require GSO partial be set */
7061 if ((features & dev->gso_partial_features) &&
7062 !(features & NETIF_F_GSO_PARTIAL)) {
7063 netdev_dbg(dev,
7064 "Dropping partially supported GSO features since no GSO partial.\n");
7065 features &= ~dev->gso_partial_features;
7066 }
7067
7068#ifdef CONFIG_NET_RX_BUSY_POLL
7069 if (dev->netdev_ops->ndo_busy_poll)
7070 features |= NETIF_F_BUSY_POLL;
7071 else
7072#endif
7073 features &= ~NETIF_F_BUSY_POLL;
7074
7075 return features;
7076}
7077
7078int __netdev_update_features(struct net_device *dev)
7079{
7080 struct net_device *upper, *lower;
7081 netdev_features_t features;
7082 struct list_head *iter;
7083 int err = -1;
7084
7085 ASSERT_RTNL();
7086
7087 features = netdev_get_wanted_features(dev);
7088
7089 if (dev->netdev_ops->ndo_fix_features)
7090 features = dev->netdev_ops->ndo_fix_features(dev, features);
7091
7092 /* driver might be less strict about feature dependencies */
7093 features = netdev_fix_features(dev, features);
7094
7095 /* some features can't be enabled if they're off on an upper device */
7096 netdev_for_each_upper_dev_rcu(dev, upper, iter)
7097 features = netdev_sync_upper_features(dev, upper, features);
7098
7099 if (dev->features == features)
7100 goto sync_lower;
7101
7102 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7103 &dev->features, &features);
7104
7105 if (dev->netdev_ops->ndo_set_features)
7106 err = dev->netdev_ops->ndo_set_features(dev, features);
7107 else
7108 err = 0;
7109
7110 if (unlikely(err < 0)) {
7111 netdev_err(dev,
7112 "set_features() failed (%d); wanted %pNF, left %pNF\n",
7113 err, &features, &dev->features);
7114 /* return non-0 since some features might have changed and
7115 * it's better to fire a spurious notification than miss it
7116 */
7117 return -1;
7118 }
7119
7120sync_lower:
7121 /* some features must be disabled on lower devices when disabled
7122 * on an upper device (think: bonding master or bridge)
7123 */
7124 netdev_for_each_lower_dev(dev, lower, iter)
7125 netdev_sync_lower_features(dev, lower, features);
7126
7127 if (!err)
7128 dev->features = features;
7129
7130 return err < 0 ? 0 : 1;
7131}
7132
7133/**
7134 * netdev_update_features - recalculate device features
7135 * @dev: the device to check
7136 *
7137 * Recalculate dev->features set and send notifications if it
7138 * has changed. Should be called after driver or hardware dependent
7139 * conditions might have changed that influence the features.
7140 */
7141void netdev_update_features(struct net_device *dev)
7142{
7143 if (__netdev_update_features(dev))
7144 netdev_features_change(dev);
7145}
7146EXPORT_SYMBOL(netdev_update_features);
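
/*
 * Illustrative sketch (editor's example, not part of this file): a driver
 * that loses or gains a hardware capability at runtime adjusts
 * dev->hw_features and then asks the core to recompute dev->features.
 * The LRO capability and example_fw_reconfigured() name are assumptions
 * for this sketch; such driver paths already run under RTNL.
 *
 *    static void example_fw_reconfigured(struct net_device *dev,
 *                                        bool lro_supported)
 *    {
 *        if (lro_supported)
 *            dev->hw_features |= NETIF_F_LRO;
 *        else
 *            dev->hw_features &= ~NETIF_F_LRO;
 *
 *        netdev_update_features(dev);
 *    }
 */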
7147
7148/**
7149 * netdev_change_features - recalculate device features
7150 * @dev: the device to check
7151 *
7152 * Recalculate dev->features set and send notifications even
7153 * if they have not changed. Should be called instead of
7154 * netdev_update_features() if also dev->vlan_features might
7155 * have changed to allow the changes to be propagated to stacked
7156 * VLAN devices.
7157 */
7158void netdev_change_features(struct net_device *dev)
7159{
7160 __netdev_update_features(dev);
7161 netdev_features_change(dev);
7162}
7163EXPORT_SYMBOL(netdev_change_features);
7164
7165/**
7166 * netif_stacked_transfer_operstate - transfer operstate
7167 * @rootdev: the root or lower level device to transfer state from
7168 * @dev: the device to transfer operstate to
7169 *
7170 * Transfer operational state from root to device. This is normally
7171 * called when a stacking relationship exists between the root
7172 * device and the device (a leaf device).
7173 */
7174void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7175 struct net_device *dev)
7176{
7177 if (rootdev->operstate == IF_OPER_DORMANT)
7178 netif_dormant_on(dev);
7179 else
7180 netif_dormant_off(dev);
7181
7182 if (netif_carrier_ok(rootdev)) {
7183 if (!netif_carrier_ok(dev))
7184 netif_carrier_on(dev);
7185 } else {
7186 if (netif_carrier_ok(dev))
7187 netif_carrier_off(dev);
7188 }
7189}
7190EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7191
7192#ifdef CONFIG_SYSFS
7193static int netif_alloc_rx_queues(struct net_device *dev)
7194{
7195 unsigned int i, count = dev->num_rx_queues;
7196 struct netdev_rx_queue *rx;
7197 size_t sz = count * sizeof(*rx);
7198
7199 BUG_ON(count < 1);
7200
7201 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7202 if (!rx) {
7203 rx = vzalloc(sz);
7204 if (!rx)
7205 return -ENOMEM;
7206 }
7207 dev->_rx = rx;
7208
7209 for (i = 0; i < count; i++)
7210 rx[i].dev = dev;
7211 return 0;
7212}
7213#endif
7214
7215static void netdev_init_one_queue(struct net_device *dev,
7216 struct netdev_queue *queue, void *_unused)
7217{
7218 /* Initialize queue lock */
7219 spin_lock_init(&queue->_xmit_lock);
7220 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7221 queue->xmit_lock_owner = -1;
7222 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7223 queue->dev = dev;
7224#ifdef CONFIG_BQL
7225 dql_init(&queue->dql, HZ);
7226#endif
7227}
7228
7229static void netif_free_tx_queues(struct net_device *dev)
7230{
7231 kvfree(dev->_tx);
7232}
7233
7234static int netif_alloc_netdev_queues(struct net_device *dev)
7235{
7236 unsigned int count = dev->num_tx_queues;
7237 struct netdev_queue *tx;
7238 size_t sz = count * sizeof(*tx);
7239
7240 if (count < 1 || count > 0xffff)
7241 return -EINVAL;
7242
7243 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7244 if (!tx) {
7245 tx = vzalloc(sz);
7246 if (!tx)
7247 return -ENOMEM;
7248 }
7249 dev->_tx = tx;
7250
7251 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7252 spin_lock_init(&dev->tx_global_lock);
7253
7254 return 0;
7255}
7256
7257void netif_tx_stop_all_queues(struct net_device *dev)
7258{
7259 unsigned int i;
7260
7261 for (i = 0; i < dev->num_tx_queues; i++) {
7262 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7263 netif_tx_stop_queue(txq);
7264 }
7265}
7266EXPORT_SYMBOL(netif_tx_stop_all_queues);
7267
7268/**
7269 * register_netdevice - register a network device
7270 * @dev: device to register
7271 *
7272 * Take a completed network device structure and add it to the kernel
7273 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7274 * chain. 0 is returned on success. A negative errno code is returned
7275 * on a failure to set up the device, or if the name is a duplicate.
7276 *
7277 * Callers must hold the rtnl semaphore. You may want
7278 * register_netdev() instead of this.
7279 *
7280 * BUGS:
7281 * The locking appears insufficient to guarantee two parallel registers
7282 * will not get the same name.
7283 */
7284
7285int register_netdevice(struct net_device *dev)
7286{
7287 int ret;
7288 struct net *net = dev_net(dev);
7289
7290 BUG_ON(dev_boot_phase);
7291 ASSERT_RTNL();
7292
7293 might_sleep();
7294
7295 /* When net_devices are persistent, this will be fatal. */
7296 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7297 BUG_ON(!net);
7298
7299 spin_lock_init(&dev->addr_list_lock);
7300 netdev_set_addr_lockdep_class(dev);
7301
7302 ret = dev_get_valid_name(net, dev, dev->name);
7303 if (ret < 0)
7304 goto out;
7305
7306 /* Init, if this function is available */
7307 if (dev->netdev_ops->ndo_init) {
7308 ret = dev->netdev_ops->ndo_init(dev);
7309 if (ret) {
7310 if (ret > 0)
7311 ret = -EIO;
7312 goto out;
7313 }
7314 }
7315
7316 if (((dev->hw_features | dev->features) &
7317 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7318 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7319 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7320 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7321 ret = -EINVAL;
7322 goto err_uninit;
7323 }
7324
7325 ret = -EBUSY;
7326 if (!dev->ifindex)
7327 dev->ifindex = dev_new_index(net);
7328 else if (__dev_get_by_index(net, dev->ifindex))
7329 goto err_uninit;
7330
7331 /* Transfer changeable features to wanted_features and enable
7332 * software offloads (GSO and GRO).
7333 */
7334 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7335 dev->features |= NETIF_F_SOFT_FEATURES;
7336 dev->wanted_features = dev->features & dev->hw_features;
7337
7338 if (!(dev->flags & IFF_LOOPBACK))
7339 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7340
7341 /* If IPv4 TCP segmentation offload is supported we should also
7342 * allow the device to enable segmenting the frame with the option
7343 * of ignoring a static IP ID value. This doesn't enable the
7344 * feature itself but allows the user to enable it later.
7345 */
7346 if (dev->hw_features & NETIF_F_TSO)
7347 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7348 if (dev->vlan_features & NETIF_F_TSO)
7349 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7350 if (dev->mpls_features & NETIF_F_TSO)
7351 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7352 if (dev->hw_enc_features & NETIF_F_TSO)
7353 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7354
7355 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7356 */
7357 dev->vlan_features |= NETIF_F_HIGHDMA;
7358
7359 /* Make NETIF_F_SG inheritable to tunnel devices.
7360 */
7361 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7362
7363 /* Make NETIF_F_SG inheritable to MPLS.
7364 */
7365 dev->mpls_features |= NETIF_F_SG;
7366
7367 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7368 ret = notifier_to_errno(ret);
7369 if (ret)
7370 goto err_uninit;
7371
7372 ret = netdev_register_kobject(dev);
7373 if (ret)
7374 goto err_uninit;
7375 dev->reg_state = NETREG_REGISTERED;
7376
7377 __netdev_update_features(dev);
7378
7379 /*
7380 * Default initial state at registration is that the
7381 * device is present.
7382 */
7383
7384 set_bit(__LINK_STATE_PRESENT, &dev->state);
7385
7386 linkwatch_init_dev(dev);
7387
7388 dev_init_scheduler(dev);
7389 dev_hold(dev);
7390 list_netdevice(dev);
7391 add_device_randomness(dev->dev_addr, dev->addr_len);
7392
7393 /* If the device has a permanent device address, the driver should
7394 * set dev_addr and also addr_assign_type should be set to
7395 * NET_ADDR_PERM (default value).
7396 */
7397 if (dev->addr_assign_type == NET_ADDR_PERM)
7398 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7399
7400 /* Notify protocols, that a new device appeared. */
7401 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7402 ret = notifier_to_errno(ret);
7403 if (ret) {
7404 rollback_registered(dev);
7405 dev->reg_state = NETREG_UNREGISTERED;
7406 }
7407 /*
7408 * Prevent userspace races by waiting until the network
7409 * device is fully setup before sending notifications.
7410 */
7411 if (!dev->rtnl_link_ops ||
7412 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7413 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7414
7415out:
7416 return ret;
7417
7418err_uninit:
7419 if (dev->netdev_ops->ndo_uninit)
7420 dev->netdev_ops->ndo_uninit(dev);
7421 goto out;
7422}
7423EXPORT_SYMBOL(register_netdevice);
7424
7425/**
7426 * init_dummy_netdev - init a dummy network device for NAPI
7427 * @dev: device to init
7428 *
7429 * This takes a network device structure and initializes the minimum
7430 * number of fields so it can be used to schedule NAPI polls without
7431 * registering a full blown interface. This is to be used by drivers
7432 * that need to tie several hardware interfaces to a single NAPI
7433 * poll scheduler due to HW limitations.
7434 */
7435int init_dummy_netdev(struct net_device *dev)
7436{
7437 /* Clear everything. Note we don't initialize spinlocks
7438 * as they aren't supposed to be taken by any of the
7439 * NAPI code and this dummy netdev is supposed to be
7440 * only ever used for NAPI polls
7441 */
7442 memset(dev, 0, sizeof(struct net_device));
7443
7444 /* make sure we BUG if trying to hit standard
7445 * register/unregister code path
7446 */
7447 dev->reg_state = NETREG_DUMMY;
7448
7449 /* NAPI wants this */
7450 INIT_LIST_HEAD(&dev->napi_list);
7451
7452 /* a dummy interface is started by default */
7453 set_bit(__LINK_STATE_PRESENT, &dev->state);
7454 set_bit(__LINK_STATE_START, &dev->state);
7455
7456 /* Note: We don't allocate pcpu_refcnt for dummy devices,
7457 * because users of this 'device' don't need to change
7458 * its refcount.
7459 */
7460
7461 return 0;
7462}
7463EXPORT_SYMBOL_GPL(init_dummy_netdev);
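
/*
 * Illustrative sketch (editor's example, not part of this file): a driver
 * that funnels several hardware ports through one NAPI context can embed a
 * dummy netdev purely as the NAPI anchor. struct example_priv,
 * example_poll() and the weight of 64 are assumptions for this sketch.
 *
 *    struct example_priv {
 *        struct net_device napi_dev;
 *        struct napi_struct napi;
 *    };
 *
 *    struct example_priv *priv = ...;    // driver-private state
 *
 *    init_dummy_netdev(&priv->napi_dev);
 *    netif_napi_add(&priv->napi_dev, &priv->napi, example_poll, 64);
 */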
7464
7465
7466/**
7467 * register_netdev - register a network device
7468 * @dev: device to register
7469 *
7470 * Take a completed network device structure and add it to the kernel
7471 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7472 * chain. 0 is returned on success. A negative errno code is returned
7473 * on a failure to set up the device, or if the name is a duplicate.
7474 *
7475 * This is a wrapper around register_netdevice that takes the rtnl semaphore
7476 * and expands the device name if you passed a format string to
7477 * alloc_netdev.
7478 */
7479int register_netdev(struct net_device *dev)
7480{
7481 int err;
7482
7483 rtnl_lock();
7484 err = register_netdevice(dev);
7485 rtnl_unlock();
7486 return err;
7487}
7488EXPORT_SYMBOL(register_netdev);
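
/*
 * Illustrative sketch (editor's example, not part of this file): the usual
 * Ethernet driver probe sequence pairs an allocation with register_netdev()
 * and frees the device if registration fails. struct example_priv and
 * example_netdev_ops are assumptions for this sketch.
 *
 *    struct net_device *dev;
 *    int err;
 *
 *    dev = alloc_etherdev(sizeof(struct example_priv));
 *    if (!dev)
 *        return -ENOMEM;
 *
 *    dev->netdev_ops = &example_netdev_ops;
 *
 *    err = register_netdev(dev);
 *    if (err) {
 *        free_netdev(dev);
 *        return err;
 *    }
 */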
7489
7490int netdev_refcnt_read(const struct net_device *dev)
7491{
7492 int i, refcnt = 0;
7493
7494 for_each_possible_cpu(i)
7495 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7496 return refcnt;
7497}
7498EXPORT_SYMBOL(netdev_refcnt_read);
7499
7500/**
7501 * netdev_wait_allrefs - wait until all references are gone.
7502 * @dev: target net_device
7503 *
7504 * This is called when unregistering network devices.
7505 *
7506 * Any protocol or device that holds a reference should register
7507 * for netdevice notification, and cleanup and put back the
7508 * reference if they receive an UNREGISTER event.
7509 * We can get stuck here if buggy protocols don't correctly
7510 * call dev_put.
7511 */
7512static void netdev_wait_allrefs(struct net_device *dev)
7513{
7514 unsigned long rebroadcast_time, warning_time;
7515 int refcnt;
7516
7517 linkwatch_forget_dev(dev);
7518
7519 rebroadcast_time = warning_time = jiffies;
7520 refcnt = netdev_refcnt_read(dev);
7521
7522 while (refcnt != 0) {
7523 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7524 rtnl_lock();
7525
7526 /* Rebroadcast unregister notification */
7527 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7528
7529 __rtnl_unlock();
7530 rcu_barrier();
7531 rtnl_lock();
7532
7533 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7534 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7535 &dev->state)) {
7536 /* We must not have linkwatch events
7537 * pending on unregister. If this
7538 * happens, we simply run the queue
7539 * unscheduled, resulting in a noop
7540 * for this device.
7541 */
7542 linkwatch_run_queue();
7543 }
7544
7545 __rtnl_unlock();
7546
7547 rebroadcast_time = jiffies;
7548 }
7549
7550 msleep(250);
7551
7552 refcnt = netdev_refcnt_read(dev);
7553
7554 if (time_after(jiffies, warning_time + 10 * HZ)) {
7555 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7556 dev->name, refcnt);
7557 warning_time = jiffies;
7558 }
7559 }
7560}
7561
7562/* The sequence is:
7563 *
7564 * rtnl_lock();
7565 * ...
7566 * register_netdevice(x1);
7567 * register_netdevice(x2);
7568 * ...
7569 * unregister_netdevice(y1);
7570 * unregister_netdevice(y2);
7571 * ...
7572 * rtnl_unlock();
7573 * free_netdev(y1);
7574 * free_netdev(y2);
7575 *
7576 * We are invoked by rtnl_unlock().
7577 * This allows us to deal with problems:
7578 * 1) We can delete sysfs objects which invoke hotplug
7579 * without deadlocking with linkwatch via keventd.
7580 * 2) Since we run with the RTNL semaphore not held, we can sleep
7581 * safely in order to wait for the netdev refcnt to drop to zero.
7582 *
7583 * We must not return until all unregister events added during
7584 * the interval the lock was held have been completed.
7585 */
7586void netdev_run_todo(void)
7587{
7588 struct list_head list;
7589
7590 /* Snapshot list, allow later requests */
7591 list_replace_init(&net_todo_list, &list);
7592
7593 __rtnl_unlock();
7594
7595
7596 /* Wait for rcu callbacks to finish before next phase */
7597 if (!list_empty(&list))
7598 rcu_barrier();
7599
7600 while (!list_empty(&list)) {
7601 struct net_device *dev
7602 = list_first_entry(&list, struct net_device, todo_list);
7603 list_del(&dev->todo_list);
7604
7605 rtnl_lock();
7606 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7607 __rtnl_unlock();
7608
7609 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7610 pr_err("network todo '%s' but state %d\n",
7611 dev->name, dev->reg_state);
7612 dump_stack();
7613 continue;
7614 }
7615
7616 dev->reg_state = NETREG_UNREGISTERED;
7617
7618 netdev_wait_allrefs(dev);
7619
7620 /* paranoia */
7621 BUG_ON(netdev_refcnt_read(dev));
7622 BUG_ON(!list_empty(&dev->ptype_all));
7623 BUG_ON(!list_empty(&dev->ptype_specific));
7624 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7625 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7626 WARN_ON(dev->dn_ptr);
7627
7628 if (dev->destructor)
7629 dev->destructor(dev);
7630
7631 /* Report a network device has been unregistered */
7632 rtnl_lock();
7633 dev_net(dev)->dev_unreg_count--;
7634 __rtnl_unlock();
7635 wake_up(&netdev_unregistering_wq);
7636
7637 /* Free network device */
7638 kobject_put(&dev->dev.kobj);
7639 }
7640}
7641
7642/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7643 * all the same fields in the same order as net_device_stats, with only
7644 * the type differing, but rtnl_link_stats64 may have additional fields
7645 * at the end for newer counters.
7646 */
7647void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7648 const struct net_device_stats *netdev_stats)
7649{
7650#if BITS_PER_LONG == 64
7651 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7652 memcpy(stats64, netdev_stats, sizeof(*stats64));
7653 /* zero out counters that only exist in rtnl_link_stats64 */
7654 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7655 sizeof(*stats64) - sizeof(*netdev_stats));
7656#else
7657 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7658 const unsigned long *src = (const unsigned long *)netdev_stats;
7659 u64 *dst = (u64 *)stats64;
7660
7661 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7662 for (i = 0; i < n; i++)
7663 dst[i] = src[i];
7664 /* zero out counters that only exist in rtnl_link_stats64 */
7665 memset((char *)stats64 + n * sizeof(u64), 0,
7666 sizeof(*stats64) - n * sizeof(u64));
7667#endif
7668}
7669EXPORT_SYMBOL(netdev_stats_to_stats64);
7670
7671/**
7672 * dev_get_stats - get network device statistics
7673 * @dev: device to get statistics from
7674 * @storage: place to store stats
7675 *
7676 * Get network statistics from device. Return @storage.
7677 * The device driver may provide its own method by setting
7678 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7679 * otherwise the internal statistics structure is used.
7680 */
7681struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7682 struct rtnl_link_stats64 *storage)
7683{
7684 const struct net_device_ops *ops = dev->netdev_ops;
7685
7686 if (ops->ndo_get_stats64) {
7687 memset(storage, 0, sizeof(*storage));
7688 ops->ndo_get_stats64(dev, storage);
7689 } else if (ops->ndo_get_stats) {
7690 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7691 } else {
7692 netdev_stats_to_stats64(storage, &dev->stats);
7693 }
7694 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7695 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7696 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7697 return storage;
7698}
7699EXPORT_SYMBOL(dev_get_stats);
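
/*
 * Illustrative sketch (editor's example, not part of this file): readers
 * such as procfs or rtnetlink fill an on-stack rtnl_link_stats64; @storage
 * is caller-owned, so nothing is allocated or freed here.
 * example_dump_stats() is an assumed name for this sketch.
 *
 *    static void example_dump_stats(struct net_device *dev)
 *    {
 *        struct rtnl_link_stats64 stats;
 *
 *        dev_get_stats(dev, &stats);
 *        pr_info("%s: rx %llu tx %llu\n", dev->name,
 *                (unsigned long long)stats.rx_packets,
 *                (unsigned long long)stats.tx_packets);
 *    }
 */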
7700
7701struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7702{
7703 struct netdev_queue *queue = dev_ingress_queue(dev);
7704
7705#ifdef CONFIG_NET_CLS_ACT
7706 if (queue)
7707 return queue;
7708 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7709 if (!queue)
7710 return NULL;
7711 netdev_init_one_queue(dev, queue, NULL);
7712 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7713 queue->qdisc_sleeping = &noop_qdisc;
7714 rcu_assign_pointer(dev->ingress_queue, queue);
7715#endif
7716 return queue;
7717}
7718
7719static const struct ethtool_ops default_ethtool_ops;
7720
7721void netdev_set_default_ethtool_ops(struct net_device *dev,
7722 const struct ethtool_ops *ops)
7723{
7724 if (dev->ethtool_ops == &default_ethtool_ops)
7725 dev->ethtool_ops = ops;
7726}
7727EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7728
7729void netdev_freemem(struct net_device *dev)
7730{
7731 char *addr = (char *)dev - dev->padded;
7732
7733 kvfree(addr);
7734}
7735
7736/**
7737 * alloc_netdev_mqs - allocate network device
7738 * @sizeof_priv: size of private data to allocate space for
7739 * @name: device name format string
7740 * @name_assign_type: origin of device name
7741 * @setup: callback to initialize device
7742 * @txqs: the number of TX subqueues to allocate
7743 * @rxqs: the number of RX subqueues to allocate
7744 *
7745 * Allocates a struct net_device with private data area for driver use
7746 * and performs basic initialization. Also allocates subqueue structs
7747 * for each queue on the device.
7748 */
7749struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7750 unsigned char name_assign_type,
7751 void (*setup)(struct net_device *),
7752 unsigned int txqs, unsigned int rxqs)
7753{
7754 struct net_device *dev;
7755 size_t alloc_size;
7756 struct net_device *p;
7757
7758 BUG_ON(strlen(name) >= sizeof(dev->name));
7759
7760 if (txqs < 1) {
7761 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7762 return NULL;
7763 }
7764
7765#ifdef CONFIG_SYSFS
7766 if (rxqs < 1) {
7767 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7768 return NULL;
7769 }
7770#endif
7771
7772 alloc_size = sizeof(struct net_device);
7773 if (sizeof_priv) {
7774 /* ensure 32-byte alignment of private area */
7775 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7776 alloc_size += sizeof_priv;
7777 }
7778 /* ensure 32-byte alignment of whole construct */
7779 alloc_size += NETDEV_ALIGN - 1;
7780
7781 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7782 if (!p)
7783 p = vzalloc(alloc_size);
7784 if (!p)
7785 return NULL;
7786
7787 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7788 dev->padded = (char *)dev - (char *)p;
7789
7790 dev->pcpu_refcnt = alloc_percpu(int);
7791 if (!dev->pcpu_refcnt)
7792 goto free_dev;
7793
7794 if (dev_addr_init(dev))
7795 goto free_pcpu;
7796
7797 dev_mc_init(dev);
7798 dev_uc_init(dev);
7799
7800 dev_net_set(dev, &init_net);
7801
7802 dev->gso_max_size = GSO_MAX_SIZE;
7803 dev->gso_max_segs = GSO_MAX_SEGS;
7804
7805 INIT_LIST_HEAD(&dev->napi_list);
7806 INIT_LIST_HEAD(&dev->unreg_list);
7807 INIT_LIST_HEAD(&dev->close_list);
7808 INIT_LIST_HEAD(&dev->link_watch_list);
7809 INIT_LIST_HEAD(&dev->adj_list.upper);
7810 INIT_LIST_HEAD(&dev->adj_list.lower);
7811 INIT_LIST_HEAD(&dev->ptype_all);
7812 INIT_LIST_HEAD(&dev->ptype_specific);
7813#ifdef CONFIG_NET_SCHED
7814 hash_init(dev->qdisc_hash);
7815#endif
7816 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7817 setup(dev);
7818
7819 if (!dev->tx_queue_len) {
7820 dev->priv_flags |= IFF_NO_QUEUE;
7821 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7822 }
7823
7824 dev->num_tx_queues = txqs;
7825 dev->real_num_tx_queues = txqs;
7826 if (netif_alloc_netdev_queues(dev))
7827 goto free_all;
7828
7829#ifdef CONFIG_SYSFS
7830 dev->num_rx_queues = rxqs;
7831 dev->real_num_rx_queues = rxqs;
7832 if (netif_alloc_rx_queues(dev))
7833 goto free_all;
7834#endif
7835
7836 strcpy(dev->name, name);
7837 dev->name_assign_type = name_assign_type;
7838 dev->group = INIT_NETDEV_GROUP;
7839 if (!dev->ethtool_ops)
7840 dev->ethtool_ops = &default_ethtool_ops;
7841
7842 nf_hook_ingress_init(dev);
7843
7844 return dev;
7845
7846free_all:
7847 free_netdev(dev);
7848 return NULL;
7849
7850free_pcpu:
7851 free_percpu(dev->pcpu_refcnt);
7852free_dev:
7853 netdev_freemem(dev);
7854 return NULL;
7855}
7856EXPORT_SYMBOL(alloc_netdev_mqs);
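/*
 * Example (editorial sketch): most multiqueue Ethernet drivers do not call
 * alloc_netdev_mqs() directly but use the alloc_etherdev_mqs() wrapper,
 * which supplies the "eth%d" name template and ether_setup(). FOO_* and
 * foo_priv are hypothetical driver names.
 *
 *	struct net_device *dev;
 *	struct foo_priv *priv;
 *
 *	dev = alloc_etherdev_mqs(sizeof(*priv),
 *				 FOO_NUM_TX_QUEUES, FOO_NUM_RX_QUEUES);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 *
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */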
7857
7858/**
7859 * free_netdev - free network device
7860 * @dev: device
7861 *
7862 * This function does the last stage of destroying an allocated device
7863 * interface. The reference to the device object is released.
7864 * If this is the last reference then it will be freed.
7865 * Must be called in process context.
7866 */
7867void free_netdev(struct net_device *dev)
7868{
7869 struct napi_struct *p, *n;
7870
7871 might_sleep();
7872 netif_free_tx_queues(dev);
7873#ifdef CONFIG_SYSFS
7874 kvfree(dev->_rx);
7875#endif
7876
7877 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7878
7879 /* Flush device addresses */
7880 dev_addr_flush(dev);
7881
7882 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7883 netif_napi_del(p);
7884
7885 free_percpu(dev->pcpu_refcnt);
7886 dev->pcpu_refcnt = NULL;
7887
7888 /* Compatibility with error handling in drivers */
7889 if (dev->reg_state == NETREG_UNINITIALIZED) {
7890 netdev_freemem(dev);
7891 return;
7892 }
7893
7894 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7895 dev->reg_state = NETREG_RELEASED;
7896
7897 /* will free via device release */
7898 put_device(&dev->dev);
7899}
7900EXPORT_SYMBOL(free_netdev);
7901
7902/**
7903 * synchronize_net - Synchronize with packet receive processing
7904 *
7905 * Wait for packets currently being received to be done.
7906 * Does not block later packets from starting.
7907 */
7908void synchronize_net(void)
7909{
7910 might_sleep();
7911 if (rtnl_is_locked())
7912 synchronize_rcu_expedited();
7913 else
7914 synchronize_rcu();
7915}
7916EXPORT_SYMBOL(synchronize_net);
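/*
 * Example (editorial sketch): the usual pattern around synchronize_net()
 * is to unpublish an object that the receive path may still be reading
 * under rcu_read_lock(), wait for in-flight receive processing to finish,
 * and only then free it. The handler structure is hypothetical.
 *
 *	list_del_rcu(&handler->list);
 *	synchronize_net();
 *	kfree(handler);
 */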
7917
7918/**
7919 * unregister_netdevice_queue - remove device from the kernel
7920 * @dev: device
7921 * @head: list
7922 *
7923 * This function shuts down a device interface and removes it
7924 * from the kernel tables.
7925 * If @head is not NULL, the device is queued to be unregistered later.
7926 *
7927 * Callers must hold the rtnl semaphore. You may want
7928 * unregister_netdev() instead of this.
7929 */
7930
7931void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7932{
7933 ASSERT_RTNL();
7934
7935 if (head) {
7936 list_move_tail(&dev->unreg_list, head);
7937 } else {
7938 rollback_registered(dev);
7939 /* Finish processing unregister after unlock */
7940 net_set_todo(dev);
7941 }
7942}
7943EXPORT_SYMBOL(unregister_netdevice_queue);
7944
7945/**
7946 * unregister_netdevice_many - unregister many devices
7947 * @head: list of devices
7948 *
7949 * Note: as most callers use a stack-allocated list_head,
7950 * we force a list_del() to make sure the stack won't be corrupted later.
7951 */
7952void unregister_netdevice_many(struct list_head *head)
7953{
7954 struct net_device *dev;
7955
7956 if (!list_empty(head)) {
7957 rollback_registered_many(head);
7958 list_for_each_entry(dev, head, unreg_list)
7959 net_set_todo(dev);
7960 list_del(head);
7961 }
7962}
7963EXPORT_SYMBOL(unregister_netdevice_many);
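/*
 * Example (editorial sketch): queueing several devices and unregistering
 * them in one batch amortizes the notifier and synchronize_net() costs.
 * As noted above, a stack-allocated list head is fine because
 * unregister_netdevice_many() performs the final list_del().
 * foo_owns_device() and the target namespace @net are hypothetical.
 *
 *	LIST_HEAD(kill_list);
 *	struct net_device *dev;
 *
 *	rtnl_lock();
 *	for_each_netdev(net, dev)
 *		if (foo_owns_device(dev))
 *			unregister_netdevice_queue(dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */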
7964
7965/**
7966 * unregister_netdev - remove device from the kernel
7967 * @dev: device
7968 *
7969 * This function shuts down a device interface and removes it
7970 * from the kernel tables.
7971 *
7972 * This is just a wrapper for unregister_netdevice that takes
7973 * the rtnl semaphore. In general you want to use this and not
7974 * unregister_netdevice.
7975 */
7976void unregister_netdev(struct net_device *dev)
7977{
7978 rtnl_lock();
7979 unregister_netdevice(dev);
7980 rtnl_unlock();
7981}
7982EXPORT_SYMBOL(unregister_netdev);
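/*
 * Example (editorial sketch): a typical driver remove/disconnect path.
 * unregister_netdev() takes the rtnl lock itself, so it must not be called
 * with rtnl already held; free_netdev() then releases the allocation once
 * unregistration has completed. The foo_* names are hypothetical.
 *
 *	static void foo_remove(struct foo_device *fdev)
 *	{
 *		struct net_device *dev = fdev->netdev;
 *
 *		unregister_netdev(dev);
 *		foo_release_hw(fdev);
 *		free_netdev(dev);
 *	}
 */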
7983
7984/**
7985 * dev_change_net_namespace - move device to a different network namespace
7986 * @dev: device
7987 * @net: network namespace
7988 * @pat: if not NULL, name pattern to try if the current device name
7989 * is already taken in the destination network namespace.
7990 *
7991 * This function shuts down a device interface and moves it
7992 * to a new network namespace. On success 0 is returned, on
7993 * a failure a negative errno code is returned.
7994 *
7995 * Callers must hold the rtnl semaphore.
7996 */
7997
7998int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7999{
8000 int err;
8001
8002 ASSERT_RTNL();
8003
8004 /* Don't allow namespace local devices to be moved. */
8005 err = -EINVAL;
8006 if (dev->features & NETIF_F_NETNS_LOCAL)
8007 goto out;
8008
8009 /* Ensure the device has been registered */
8010 if (dev->reg_state != NETREG_REGISTERED)
8011 goto out;
8012
8013 /* Get out if there is nothing to do */
8014 err = 0;
8015 if (net_eq(dev_net(dev), net))
8016 goto out;
8017
8018 /* Pick the destination device name, and ensure
8019 * we can use it in the destination network namespace.
8020 */
8021 err = -EEXIST;
8022 if (__dev_get_by_name(net, dev->name)) {
8023 /* We get here if we can't use the current device name */
8024 if (!pat)
8025 goto out;
8026 if (dev_get_valid_name(net, dev, pat) < 0)
8027 goto out;
8028 }
8029
8030 /*
8031 * And now a mini version of register_netdevice()/unregister_netdevice().
8032 */
8033
8034 /* If the device is running, close it first. */
8035 dev_close(dev);
8036
8037 /* And unlink it from device chain */
8038 err = -ENODEV;
8039 unlist_netdevice(dev);
8040
8041 synchronize_net();
8042
8043 /* Shutdown queueing discipline. */
8044 dev_shutdown(dev);
8045
8046 /* Notify protocols that we are about to destroy
8047 this device. They should clean up all of their state.
8048
8049 Note that dev->reg_state stays at NETREG_REGISTERED.
8050 This is intentional: this way 8021q and macvlan know
8051 the device is just moving and can keep their slaves up.
8052 */
8053 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8054 rcu_barrier();
8055 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8056 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
8057
8058 /*
8059 * Flush the unicast and multicast chains
8060 */
8061 dev_uc_flush(dev);
8062 dev_mc_flush(dev);
8063
8064 /* Send a netdev-removed uevent to the old namespace */
8065 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8066 netdev_adjacent_del_links(dev);
8067
8068 /* Actually switch the network namespace */
8069 dev_net_set(dev, net);
8070
8071 /* If there is an ifindex conflict, assign a new one */
8072 if (__dev_get_by_index(net, dev->ifindex))
8073 dev->ifindex = dev_new_index(net);
8074
8075 /* Send a netdev-add uevent to the new namespace */
8076 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8077 netdev_adjacent_add_links(dev);
8078
8079 /* Fixup kobjects */
8080 err = device_rename(&dev->dev, dev->name);
8081 WARN_ON(err);
8082
8083 /* Add the device back in the hashes */
8084 list_netdevice(dev);
8085
8086 /* Notify protocols that a new device appeared. */
8087 call_netdevice_notifiers(NETDEV_REGISTER, dev);
8088
8089 /*
8090 * Prevent userspace races by waiting until the network
8091 * device is fully setup before sending notifications.
8092 */
8093 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8094
8095 synchronize_net();
8096 err = 0;
8097out:
8098 return err;
8099}
8100EXPORT_SYMBOL_GPL(dev_change_net_namespace);
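/*
 * Example (editorial sketch): how a caller (e.g. an RTM_NEWLINK handler)
 * might move a device into another namespace. The target namespace
 * reference is assumed to have been obtained elsewhere, for instance via
 * get_net_ns_by_fd(), and the rtnl lock must already be held.
 *
 *	ASSERT_RTNL();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	if (err)
 *		netdev_warn(dev, "failed to move to new netns: %d\n", err);
 */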
8101
8102static int dev_cpu_dead(unsigned int oldcpu)
8103{
8104 struct sk_buff **list_skb;
8105 struct sk_buff *skb;
8106 unsigned int cpu;
8107 struct softnet_data *sd, *oldsd;
8108
8109 local_irq_disable();
8110 cpu = smp_processor_id();
8111 sd = &per_cpu(softnet_data, cpu);
8112 oldsd = &per_cpu(softnet_data, oldcpu);
8113
8114 /* Find end of our completion_queue. */
8115 list_skb = &sd->completion_queue;
8116 while (*list_skb)
8117 list_skb = &(*list_skb)->next;
8118 /* Append completion queue from offline CPU. */
8119 *list_skb = oldsd->completion_queue;
8120 oldsd->completion_queue = NULL;
8121
8122 /* Append output queue from offline CPU. */
8123 if (oldsd->output_queue) {
8124 *sd->output_queue_tailp = oldsd->output_queue;
8125 sd->output_queue_tailp = oldsd->output_queue_tailp;
8126 oldsd->output_queue = NULL;
8127 oldsd->output_queue_tailp = &oldsd->output_queue;
8128 }
8129 /* Append NAPI poll list from the offline CPU, with one exception:
8130 * process_backlog() must be called by the CPU owning the per-cpu backlog.
8131 * We properly handle process_queue & input_pkt_queue later.
8132 */
8133 while (!list_empty(&oldsd->poll_list)) {
8134 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8135 struct napi_struct,
8136 poll_list);
8137
8138 list_del_init(&napi->poll_list);
8139 if (napi->poll == process_backlog)
8140 napi->state = 0;
8141 else
8142 ____napi_schedule(sd, napi);
8143 }
8144
8145 raise_softirq_irqoff(NET_TX_SOFTIRQ);
8146 local_irq_enable();
8147
8148 /* Process offline CPU's input_pkt_queue */
8149 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8150 netif_rx_ni(skb);
8151 input_queue_head_incr(oldsd);
8152 }
8153 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8154 netif_rx_ni(skb);
8155 input_queue_head_incr(oldsd);
8156 }
8157
8158 return 0;
8159}
8160
8161/**
8162 * netdev_increment_features - increment feature set by one
8163 * @all: current feature set
8164 * @one: new feature set
8165 * @mask: mask feature set
8166 *
8167 * Computes a new feature set after adding a device with feature set
8168 * @one to the master device with current feature set @all. Will not
8169 * enable anything that is off in @mask. Returns the new feature set.
8170 */
8171netdev_features_t netdev_increment_features(netdev_features_t all,
8172 netdev_features_t one, netdev_features_t mask)
8173{
8174 if (mask & NETIF_F_HW_CSUM)
8175 mask |= NETIF_F_CSUM_MASK;
8176 mask |= NETIF_F_VLAN_CHALLENGED;
8177
8178 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8179 all &= one | ~NETIF_F_ALL_FOR_ALL;
8180
8181 /* If one device supports hw checksumming, set for all. */
8182 if (all & NETIF_F_HW_CSUM)
8183 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8184
8185 return all;
8186}
8187EXPORT_SYMBOL(netdev_increment_features);
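/*
 * Example (editorial sketch): an aggregating master device (bonding/team
 * style) folds each lower device's feature set into its own, starting from
 * the full mask and narrowing as ports are added. FOO_MASTER_FEATURES and
 * the port list are hypothetical; the result would then be applied through
 * the driver's normal feature-update path.
 *
 *	netdev_features_t features = FOO_MASTER_FEATURES;
 *	struct foo_port *port;
 *
 *	list_for_each_entry(port, &priv->ports, list)
 *		features = netdev_increment_features(features,
 *						     port->dev->features,
 *						     FOO_MASTER_FEATURES);
 */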
8188
8189static struct hlist_head * __net_init netdev_create_hash(void)
8190{
8191 int i;
8192 struct hlist_head *hash;
8193
8194 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8195 if (hash != NULL)
8196 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8197 INIT_HLIST_HEAD(&hash[i]);
8198
8199 return hash;
8200}
8201
8202/* Initialize per network namespace state */
8203static int __net_init netdev_init(struct net *net)
8204{
8205 if (net != &init_net)
8206 INIT_LIST_HEAD(&net->dev_base_head);
8207
8208 net->dev_name_head = netdev_create_hash();
8209 if (net->dev_name_head == NULL)
8210 goto err_name;
8211
8212 net->dev_index_head = netdev_create_hash();
8213 if (net->dev_index_head == NULL)
8214 goto err_idx;
8215
8216 return 0;
8217
8218err_idx:
8219 kfree(net->dev_name_head);
8220err_name:
8221 return -ENOMEM;
8222}
8223
8224/**
8225 * netdev_drivername - network driver for the device
8226 * @dev: network device
8227 *
8228 * Determine network driver for device.
8229 */
8230const char *netdev_drivername(const struct net_device *dev)
8231{
8232 const struct device_driver *driver;
8233 const struct device *parent;
8234 const char *empty = "";
8235
8236 parent = dev->dev.parent;
8237 if (!parent)
8238 return empty;
8239
8240 driver = parent->driver;
8241 if (driver && driver->name)
8242 return driver->name;
8243 return empty;
8244}
8245
8246static void __netdev_printk(const char *level, const struct net_device *dev,
8247 struct va_format *vaf)
8248{
8249 if (dev && dev->dev.parent) {
8250 dev_printk_emit(level[1] - '0',
8251 dev->dev.parent,
8252 "%s %s %s%s: %pV",
8253 dev_driver_string(dev->dev.parent),
8254 dev_name(dev->dev.parent),
8255 netdev_name(dev), netdev_reg_state(dev),
8256 vaf);
8257 } else if (dev) {
8258 printk("%s%s%s: %pV",
8259 level, netdev_name(dev), netdev_reg_state(dev), vaf);
8260 } else {
8261 printk("%s(NULL net_device): %pV", level, vaf);
8262 }
8263}
8264
8265void netdev_printk(const char *level, const struct net_device *dev,
8266 const char *format, ...)
8267{
8268 struct va_format vaf;
8269 va_list args;
8270
8271 va_start(args, format);
8272
8273 vaf.fmt = format;
8274 vaf.va = &args;
8275
8276 __netdev_printk(level, dev, &vaf);
8277
8278 va_end(args);
8279}
8280EXPORT_SYMBOL(netdev_printk);
8281
8282#define define_netdev_printk_level(func, level) \
8283void func(const struct net_device *dev, const char *fmt, ...) \
8284{ \
8285 struct va_format vaf; \
8286 va_list args; \
8287 \
8288 va_start(args, fmt); \
8289 \
8290 vaf.fmt = fmt; \
8291 vaf.va = &args; \
8292 \
8293 __netdev_printk(level, dev, &vaf); \
8294 \
8295 va_end(args); \
8296} \
8297EXPORT_SYMBOL(func);
8298
8299define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8300define_netdev_printk_level(netdev_alert, KERN_ALERT);
8301define_netdev_printk_level(netdev_crit, KERN_CRIT);
8302define_netdev_printk_level(netdev_err, KERN_ERR);
8303define_netdev_printk_level(netdev_warn, KERN_WARNING);
8304define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8305define_netdev_printk_level(netdev_info, KERN_INFO);
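/*
 * Example (editorial sketch): drivers use these wrappers instead of raw
 * printk() so that every message is prefixed with the driver name, the
 * parent bus device and the interface name (plus the registration state
 * while the device is being renamed or unregistered).
 *
 *	netdev_info(dev, "link up, %u Mbps, %s duplex\n",
 *		    speed, duplex ? "full" : "half");
 *	netdev_err(dev, "TX queue %u stalled, resetting\n", queue_idx);
 */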
8306
8307static void __net_exit netdev_exit(struct net *net)
8308{
8309 kfree(net->dev_name_head);
8310 kfree(net->dev_index_head);
8311}
8312
8313static struct pernet_operations __net_initdata netdev_net_ops = {
8314 .init = netdev_init,
8315 .exit = netdev_exit,
8316};
8317
8318static void __net_exit default_device_exit(struct net *net)
8319{
8320 struct net_device *dev, *aux;
8321 /*
8322 * Push all migratable network devices back to the
8323 * initial network namespace
8324 */
8325 rtnl_lock();
8326 for_each_netdev_safe(net, dev, aux) {
8327 int err;
8328 char fb_name[IFNAMSIZ];
8329
8330 /* Ignore unmovable devices (e.g. the loopback device) */
8331 if (dev->features & NETIF_F_NETNS_LOCAL)
8332 continue;
8333
8334 /* Leave virtual devices for the generic cleanup */
8335 if (dev->rtnl_link_ops)
8336 continue;
8337
8338 /* Push remaining network devices to init_net */
8339 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8340 err = dev_change_net_namespace(dev, &init_net, fb_name);
8341 if (err) {
8342 pr_emerg("%s: failed to move %s to init_net: %d\n",
8343 __func__, dev->name, err);
8344 BUG();
8345 }
8346 }
8347 rtnl_unlock();
8348}
8349
8350static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8351{
8352 /* Return with the rtnl_lock held when there are no network
8353 * devices unregistering in any network namespace in net_list.
8354 */
8355 struct net *net;
8356 bool unregistering;
8357 DEFINE_WAIT_FUNC(wait, woken_wake_function);
8358
8359 add_wait_queue(&netdev_unregistering_wq, &wait);
8360 for (;;) {
8361 unregistering = false;
8362 rtnl_lock();
8363 list_for_each_entry(net, net_list, exit_list) {
8364 if (net->dev_unreg_count > 0) {
8365 unregistering = true;
8366 break;
8367 }
8368 }
8369 if (!unregistering)
8370 break;
8371 __rtnl_unlock();
8372
8373 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8374 }
8375 remove_wait_queue(&netdev_unregistering_wq, &wait);
8376}
8377
8378static void __net_exit default_device_exit_batch(struct list_head *net_list)
8379{
8380 /* At exit, all network devices must be removed from a network
8381 * namespace. Do this in the reverse order of registration.
8382 * Do this across as many network namespaces as possible to
8383 * improve batching efficiency.
8384 */
8385 struct net_device *dev;
8386 struct net *net;
8387 LIST_HEAD(dev_kill_list);
8388
8389 /* To prevent network device cleanup code from dereferencing
8390 * loopback devices or network devices that have been freed,
8391 * wait here for all pending unregistrations to complete
8392 * before unregistering the loopback device and allowing the
8393 * network namespace to be freed.
8394 *
8395 * The netdev todo list containing all network device
8396 * unregistrations that happen in default_device_exit_batch
8397 * will run in the rtnl_unlock() at the end of
8398 * default_device_exit_batch.
8399 */
8400 rtnl_lock_unregistering(net_list);
8401 list_for_each_entry(net, net_list, exit_list) {
8402 for_each_netdev_reverse(net, dev) {
8403 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8404 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8405 else
8406 unregister_netdevice_queue(dev, &dev_kill_list);
8407 }
8408 }
8409 unregister_netdevice_many(&dev_kill_list);
8410 rtnl_unlock();
8411}
8412
8413static struct pernet_operations __net_initdata default_device_ops = {
8414 .exit = default_device_exit,
8415 .exit_batch = default_device_exit_batch,
8416};
8417
8418/*
8419 * Initialize the DEV module. At boot time this walks the device list and
8420 * unhooks any devices that fail to initialize (normally hardware not
8421 * present) and leaves us with a valid list of present and active devices.
8422 *
8423 */
8424
8425/*
8426 * This is called single threaded during boot, so no need
8427 * to take the rtnl semaphore.
8428 */
8429static int __init net_dev_init(void)
8430{
8431 int i, rc = -ENOMEM;
8432
8433 BUG_ON(!dev_boot_phase);
8434
8435 if (dev_proc_init())
8436 goto out;
8437
8438 if (netdev_kobject_init())
8439 goto out;
8440
8441 INIT_LIST_HEAD(&ptype_all);
8442 for (i = 0; i < PTYPE_HASH_SIZE; i++)
8443 INIT_LIST_HEAD(&ptype_base[i]);
8444
8445 INIT_LIST_HEAD(&offload_base);
8446
8447 if (register_pernet_subsys(&netdev_net_ops))
8448 goto out;
8449
8450 /*
8451 * Initialize the packet receive queues.
8452 */
8453
8454 for_each_possible_cpu(i) {
8455 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8456 struct softnet_data *sd = &per_cpu(softnet_data, i);
8457
8458 INIT_WORK(flush, flush_backlog);
8459
8460 skb_queue_head_init(&sd->input_pkt_queue);
8461 skb_queue_head_init(&sd->process_queue);
8462 INIT_LIST_HEAD(&sd->poll_list);
8463 sd->output_queue_tailp = &sd->output_queue;
8464#ifdef CONFIG_RPS
8465 sd->csd.func = rps_trigger_softirq;
8466 sd->csd.info = sd;
8467 sd->cpu = i;
8468#endif
8469
8470 sd->backlog.poll = process_backlog;
8471 sd->backlog.weight = weight_p;
8472 }
8473
8474 dev_boot_phase = 0;
8475
8476 /* The loopback device is special: if any other network device
8477 * is present in a network namespace, the loopback device must
8478 * be present. Since we now dynamically allocate and free the
8479 * loopback device, ensure this invariant is maintained by
8480 * keeping the loopback device as the first device on the
8481 * list of network devices, ensuring the loopback device
8482 * is the first device that appears and the last network device
8483 * that disappears.
8484 */
8485 if (register_pernet_device(&loopback_net_ops))
8486 goto out;
8487
8488 if (register_pernet_device(&default_device_ops))
8489 goto out;
8490
8491 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8492 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8493
8494 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8495 NULL, dev_cpu_dead);
8496 WARN_ON(rc < 0);
8497 dst_subsys_init();
8498 rc = 0;
8499out:
8500 return rc;
8501}
8502
8503subsys_initcall(net_dev_init);