// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	NET3	Protocol independent device support routines.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain :	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitmap.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/tcx.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <trace/events/qdisc.h>
#include <trace/events/xdp.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_netdev.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
#include <net/rps.h>

#include "dev.h"
#include "net-sysfs.h"

static DEFINE_SPINLOCK(ptype_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack);

static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static DECLARE_RWSEM(devnet_rename_sem);

static inline void dev_base_seq_inc(struct net *net)
{
	unsigned int val = net->dev_base_seq + 1;

	WRITE_ONCE(net->dev_base_seq, val ?: 1);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock_irqsave(struct softnet_data *sd,
				    unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_RPS))
		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_save(*flags);
}

static inline void rps_lock_irq_disable(struct softnet_data *sd)
{
	if (IS_ENABLED(CONFIG_RPS))
		spin_lock_irq(&sd->input_pkt_queue.lock);
	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_disable();
}

static inline void rps_unlock_irq_restore(struct softnet_data *sd,
					  unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_RPS))
		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static inline void rps_unlock_irq_enable(struct softnet_data *sd)
{
	if (IS_ENABLED(CONFIG_RPS))
		spin_unlock_irq(&sd->input_pkt_queue.lock);
	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_enable();
}

static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
							const char *name)
{
	struct netdev_name_node *name_node;

	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
	if (!name_node)
		return NULL;
	INIT_HLIST_NODE(&name_node->hlist);
	name_node->dev = dev;
	name_node->name = name;
	return name_node;
}

static struct netdev_name_node *
netdev_name_node_head_alloc(struct net_device *dev)
{
	struct netdev_name_node *name_node;

	name_node = netdev_name_node_alloc(dev, dev->name);
	if (!name_node)
		return NULL;
	INIT_LIST_HEAD(&name_node->list);
	return name_node;
}

static void netdev_name_node_free(struct netdev_name_node *name_node)
{
	kfree(name_node);
}

static void netdev_name_node_add(struct net *net,
				 struct netdev_name_node *name_node)
{
	hlist_add_head_rcu(&name_node->hlist,
			   dev_name_hash(net, name_node->name));
}

static void netdev_name_node_del(struct netdev_name_node *name_node)
{
	hlist_del_rcu(&name_node->hlist);
}

static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
							 const char *name)
{
	struct hlist_head *head = dev_name_hash(net, name);
	struct netdev_name_node *name_node;

	hlist_for_each_entry(name_node, head, hlist)
		if (!strcmp(name_node->name, name))
			return name_node;
	return NULL;
}

static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
							     const char *name)
{
	struct hlist_head *head = dev_name_hash(net, name);
	struct netdev_name_node *name_node;

	hlist_for_each_entry_rcu(name_node, head, hlist)
		if (!strcmp(name_node->name, name))
			return name_node;
	return NULL;
}

bool netdev_name_in_use(struct net *net, const char *name)
{
	return netdev_name_node_lookup(net, name);
}
EXPORT_SYMBOL(netdev_name_in_use);

int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
	struct netdev_name_node *name_node;
	struct net *net = dev_net(dev);

	name_node = netdev_name_node_lookup(net, name);
	if (name_node)
		return -EEXIST;
	name_node = netdev_name_node_alloc(dev, name);
	if (!name_node)
		return -ENOMEM;
	netdev_name_node_add(net, name_node);
	/* The node that holds dev->name acts as a head of per-device list. */
	list_add_tail_rcu(&name_node->list, &dev->name_node->list);

	return 0;
}

static void netdev_name_node_alt_free(struct rcu_head *head)
{
	struct netdev_name_node *name_node =
		container_of(head, struct netdev_name_node, rcu);

	kfree(name_node->name);
	netdev_name_node_free(name_node);
}

static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
	netdev_name_node_del(name_node);
	list_del(&name_node->list);
	call_rcu(&name_node->rcu, netdev_name_node_alt_free);
}

int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
	struct netdev_name_node *name_node;
	struct net *net = dev_net(dev);

	name_node = netdev_name_node_lookup(net, name);
	if (!name_node)
		return -ENOENT;
	/* lookup might have found our primary name or a name belonging
	 * to another device.
	 */
	if (name_node == dev->name_node || name_node->dev != dev)
		return -EINVAL;

	__netdev_name_node_alt_destroy(name_node);
	return 0;
}

static void netdev_name_node_alt_flush(struct net_device *dev)
{
	struct netdev_name_node *name_node, *tmp;

	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
		list_del(&name_node->list);
		netdev_name_node_alt_free(&name_node->rcu);
	}
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct netdev_name_node *name_node;
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	netdev_name_node_add(net, dev->name_node);
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));

	netdev_for_each_altname(dev, name_node)
		netdev_name_node_add(net, name_node);

	/* We reserved the ifindex, this can't fail */
	WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	struct netdev_name_node *name_node;
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	xa_erase(&net->dev_by_index, dev->ifindex);

	netdev_for_each_altname(dev, name_node)
		netdev_name_node_del(name_node);

	/* Unlink dev from the device chain */
	list_del_rcu(&dev->dev_list);
	netdev_name_node_del(dev->name_node);
	hlist_del_rcu(&dev->index_hlist);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

/* Page_pool has a lockless array/stack to alloc/recycle pages.
 * PP consumers must pay attention to run APIs in the appropriate context
 * (e.g. NAPI context).
 */
static DEFINE_PER_CPU(struct page_pool *, system_page_pool);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] = {
	ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] = {
	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************
 *
 *		Protocol management and registration routines
 *
 *******************************************************************************/


/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in the hash buckets, and protocol handler checks
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	This holds today; do not change it.
 *	Explanation: if a mangling protocol handler were first on the list,
 *	it could not detect that the packet is cloned and must be
 *	copied-on-write; it would modify the clone and subsequent readers
 *	would see a broken packet.
 *	--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 * dev_add_pack - add packet handler
 * @pt: packet type declaration
 *
 * Add a protocol handler to the networking stack. The passed &packet_type
 * is linked into kernel lists and may not be freed until it has been
 * removed from the kernel lists.
 *
 * This call does not sleep, therefore it cannot guarantee that all CPUs
 * currently in the middle of receiving packets will see the new packet
 * type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

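/* Typical usage (an illustrative sketch, not part of this file): a module
 * that wants to see every inbound IPv4 frame registers a handler with
 * dev_add_pack() and removes it with dev_remove_pack(). The handler and
 * variable names below are hypothetical.
 *
 *	static int my_ipv4_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		// inspect skb here; we own this reference and must release it
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ipv4_pt __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_ipv4_rcv,
 *	};
 *
 *	dev_add_pack(&my_ipv4_pt);	// e.g. in module init
 *	dev_remove_pack(&my_ipv4_pt);	// e.g. in module exit
 */
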
/**
 * __dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all CPUs have gone
 * through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 * dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * This call sleeps to guarantee that no CPU is looking at the packet
 * type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/*******************************************************************************
 *
 *			Device Interface Subroutines
 *
 *******************************************************************************/

/**
 * dev_get_iflink - get 'iflink' value of an interface
 * @dev: targeted interface
 *
 * Indicates the ifindex the interface is linked to.
 * Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return READ_ONCE(dev->ifindex);
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 * dev_fill_metadata_dst - Retrieve tunnel egress information.
 * @dev: targeted interface
 * @skb: The packet.
 *
 * For better visibility of tunnel traffic OVS needs to retrieve
 * egress tunnel information for a packet. The following API allows
 * the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
{
	int k = stack->num_paths++;

	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
		return NULL;

	return &stack->path[k];
}

int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
			  struct net_device_path_stack *stack)
{
	const struct net_device *last_dev;
	struct net_device_path_ctx ctx = {
		.dev = dev,
	};
	struct net_device_path *path;
	int ret = 0;

	memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
	stack->num_paths = 0;
	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
		last_dev = ctx.dev;
		path = dev_fwd_path(stack);
		if (!path)
			return -1;

		memset(path, 0, sizeof(struct net_device_path));
		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
		if (ret < 0)
			return -1;

		if (WARN_ON_ONCE(last_dev == ctx.dev))
			return -1;
	}

	if (!ctx.dev)
		return ret;

	path = dev_fwd_path(stack);
	if (!path)
		return -1;
	path->type = DEV_PATH_ETHERNET;
	path->dev = ctx.dev;

	return ret;
}
EXPORT_SYMBOL_GPL(dev_fill_forward_path);

/**
 * __dev_get_by_name - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name. Must be called under RTNL semaphore.
 * If the name is found a pointer to the device is returned.
 * If the name is not found then %NULL is returned. The
 * reference counters are not incremented so the caller must be
 * careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct netdev_name_node *node_name;

	node_name = netdev_name_node_lookup(net, name);
	return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 * dev_get_by_name_rcu - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name.
 * If the name is found a pointer to the device is returned.
 * If the name is not found then %NULL is returned.
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct netdev_name_node *node_name;

	node_name = netdev_name_node_lookup_rcu(net, name);
	return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/* Deprecated for new users, call netdev_get_by_name() instead */
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

/**
 * netdev_get_by_name() - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 * @tracker: tracking object for the acquired reference
 * @gfp: allocation flags for the tracker
 *
 * Find an interface by name. This can be called from any
 * context and does its own locking. The returned handle has
 * the usage count incremented and the caller must use netdev_put() to
 * release it when it is no longer needed. %NULL is returned if no
 * matching device is found.
 */
struct net_device *netdev_get_by_name(struct net *net, const char *name,
				      netdevice_tracker *tracker, gfp_t gfp)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, name);
	if (dev)
		netdev_tracker_alloc(dev, tracker, gfp);
	return dev;
}
EXPORT_SYMBOL(netdev_get_by_name);

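/* Illustrative sketch of the tracked-reference pattern (the variable names
 * and the "eth0" lookup are hypothetical): the tracker passed here must be
 * handed back to netdev_put() when the reference is dropped.
 *
 *	netdevice_tracker tracker;
 *	struct net_device *dev;
 *
 *	dev = netdev_get_by_name(net, "eth0", &tracker, GFP_KERNEL);
 *	if (dev) {
 *		// use dev ...
 *		netdev_put(dev, &tracker);
 *	}
 */
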
/**
 * __dev_get_by_index - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns a pointer to the device,
 * or %NULL if it is not found. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold the RTNL semaphore.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 * dev_get_by_index_rcu - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns a pointer to the device,
 * or %NULL if it is not found. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/* Deprecated for new users, call netdev_get_by_index() instead */
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 * netdev_get_by_index() - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 * @tracker: tracking object for the acquired reference
 * @gfp: allocation flags for the tracker
 *
 * Search for an interface by index. Returns a pointer to the device,
 * or NULL if it is not found. The device returned has
 * had a reference added and the pointer is safe until the user calls
 * netdev_put() to indicate they have finished with it.
 */
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
				       netdevice_tracker *tracker, gfp_t gfp)
{
	struct net_device *dev;

	dev = dev_get_by_index(net, ifindex);
	if (dev)
		netdev_tracker_alloc(dev, tracker, gfp);
	return dev;
}
EXPORT_SYMBOL(netdev_get_by_index);

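/* Same pattern as netdev_get_by_name(), keyed by ifindex (sketch only;
 * the ifindex value is hypothetical):
 *
 *	netdevice_tracker tracker;
 *	struct net_device *dev;
 *
 *	dev = netdev_get_by_index(net, 2, &tracker, GFP_KERNEL);
 *	if (dev) {
 *		// use dev ...
 *		netdev_put(dev, &tracker);
 *	}
 */
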
/**
 * dev_get_by_napi_id - find a device by napi_id
 * @napi_id: ID of the NAPI struct
 *
 * Search for an interface by NAPI ID. Returns a pointer to the device,
 * or %NULL if it is not found. The device has not had
 * its reference counter increased so the caller must be careful
 * about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
	struct napi_struct *napi;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (napi_id < MIN_NAPI_ID)
		return NULL;

	napi = napi_by_id(napi_id);

	return napi ? napi->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_napi_id);

/**
 * netdev_get_name - get a netdevice name, knowing its ifindex.
 * @net: network namespace
 * @name: a pointer to the buffer where the name will be stored.
 * @ifindex: the ifindex of the interface to get the name from.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	int ret;

	down_read(&devnet_rename_sem);
	rcu_read_lock();

	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		ret = -ENODEV;
		goto out;
	}

	strcpy(name, dev->name);

	ret = 0;
out:
	rcu_read_unlock();
	up_read(&devnet_rename_sem);
	return ret;
}

/**
 * dev_getbyhwaddr_rcu - find a device by its hardware address
 * @net: the applicable net namespace
 * @type: media type of device
 * @ha: hardware address
 *
 * Search for an interface by MAC address. Returns a pointer to the device,
 * or NULL if it is not found.
 * The caller must hold RCU or RTNL.
 * The returned device has not had its ref count increased
 * and the caller must therefore be careful about locking.
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

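/* Lookup by hardware address must run inside an RCU read-side section,
 * since the result is not refcounted (sketch; 'ha' stands for a
 * caller-provided MAC address of dev->addr_len bytes):
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, ha);
 *	if (dev)
 *		netdev_info(dev, "found by MAC\n");
 *	rcu_read_unlock();
 */
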
/**
 * __dev_get_by_flags - find any device with given flags
 * @net: the applicable net namespace
 * @if_flags: IFF_* values
 * @mask: bitmask of bits in if_flags to check
 *
 * Search for any interface with the given flags. Returns a pointer to the
 * first matching device, or NULL if none is found. Must be called inside
 * rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 * dev_valid_name - check if name is okay for network device
 * @name: name string
 *
 * Network device names need to be valid file names to
 * allow sysfs to work. We also disallow any kind of
 * whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 * __dev_alloc_name - allocate a name for a device
 * @net: network namespace to allocate the device name in
 * @name: name format string
 * @res: result name string
 *
 * Passed a format string, e.g. "lt%d", it will try to find a suitable id.
 * It scans the list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
 * Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;
	char buf[IFNAMSIZ];

	/* Verify the string as this thing may have come from the user.
	 * There must be one "%d" and no other "%" characters.
	 */
	p = strchr(name, '%');
	if (!p || p[1] != 'd' || strchr(p + 2, '%'))
		return -EINVAL;

	/* Use one page as a bit array of possible slots */
	inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
	if (!inuse)
		return -ENOMEM;

	for_each_netdev(net, d) {
		struct netdev_name_node *name_node;

		netdev_for_each_altname(d, name_node) {
			if (!sscanf(name_node->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, name_node->name, IFNAMSIZ))
				__set_bit(i, inuse);
		}
		if (!sscanf(d->name, name, &i))
			continue;
		if (i < 0 || i >= max_netdevices)
			continue;

		/* avoid cases where sscanf is not exact inverse of printf */
		snprintf(buf, IFNAMSIZ, name, i);
		if (!strncmp(buf, d->name, IFNAMSIZ))
			__set_bit(i, inuse);
	}

	i = find_first_zero_bit(inuse, max_netdevices);
	bitmap_free(inuse);
	if (i == max_netdevices)
		return -ENFILE;

	/* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
	strscpy(buf, name, IFNAMSIZ);
	snprintf(res, IFNAMSIZ, buf, i);
	return i;
}

/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
static int dev_prep_valid_name(struct net *net, struct net_device *dev,
			       const char *want_name, char *out_name,
			       int dup_errno)
{
	if (!dev_valid_name(want_name))
		return -EINVAL;

	if (strchr(want_name, '%'))
		return __dev_alloc_name(net, want_name, out_name);

	if (netdev_name_in_use(net, want_name))
		return -dup_errno;
	if (out_name != want_name)
		strscpy(out_name, want_name, IFNAMSIZ);
	return 0;
}

/**
 * dev_alloc_name - allocate a name for a device
 * @dev: device
 * @name: name format string
 *
 * Passed a format string, e.g. "lt%d", it will try to find a suitable id.
 * It scans the list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
 * Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
}
EXPORT_SYMBOL(dev_alloc_name);

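/* A driver that does not care about the unit number typically asks for a
 * patterned name before registering the device (sketch; the "dummy%d"
 * pattern and the error label are hypothetical):
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto out_free;	// no free unit, or invalid pattern
 *	// dev->name is now e.g. "dummy0"; err holds the unit number
 */
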
static int dev_get_valid_name(struct net *net, struct net_device *dev,
			      const char *name)
{
	int ret;

	ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
	return ret < 0 ? ret : 0;
}

/**
 * dev_change_name - change name of a device
 * @dev: device
 * @newname: name (or format string) must be at least IFNAMSIZ
 *
 * Change the name of a device. A format string such as "eth%d" can be
 * passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);

	down_write(&devnet_rename_sem);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		up_write(&devnet_rename_sem);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		up_write(&devnet_rename_sem);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s%s\n", oldname,
			    dev->flags & IFF_UP ? " (while UP)" : "");

	old_assign_type = dev->name_assign_type;
	WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		WRITE_ONCE(dev->name_assign_type, old_assign_type);
		up_write(&devnet_rename_sem);
		return ret;
	}

	up_write(&devnet_rename_sem);

	netdev_adjacent_rename_links(dev, oldname);

	netdev_name_node_del(dev->name_node);

	synchronize_net();

	netdev_name_node_add(net, dev->name_node);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			down_write(&devnet_rename_sem);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			WRITE_ONCE(dev->name_assign_type, old_assign_type);
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			netdev_err(dev, "name change rollback failed: %d\n",
				   ret);
		}
	}

	return err;
}

/**
 * dev_set_alias - change ifalias of a device
 * @dev: device
 * @alias: name up to IFALIASZ
 * @len: limit of bytes to copy from info
 *
 * Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	struct dev_ifalias *new_alias = NULL;

	if (len >= IFALIASZ)
		return -EINVAL;

	if (len) {
		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
		if (!new_alias)
			return -ENOMEM;

		memcpy(new_alias->ifalias, alias, len);
		new_alias->ifalias[len] = 0;
	}

	mutex_lock(&ifalias_mutex);
	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
					mutex_is_locked(&ifalias_mutex));
	mutex_unlock(&ifalias_mutex);

	if (new_alias)
		kfree_rcu(new_alias, rcuhead);

	return len;
}
EXPORT_SYMBOL(dev_set_alias);

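/* Setting and clearing an alias (sketch; the alias string is arbitrary and
 * RTNL is shown because the netlink and sysfs callers hold it, even though
 * the function itself serializes on ifalias_mutex):
 *
 *	rtnl_lock();
 *	dev_set_alias(dev, "uplink-to-core", strlen("uplink-to-core"));
 *	...
 *	dev_set_alias(dev, NULL, 0);	// len == 0 clears the alias
 *	rtnl_unlock();
 */
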
/**
 * dev_get_alias - get ifalias of a device
 * @dev: device
 * @name: buffer to store name of ifalias
 * @len: size of buffer
 *
 * Get the ifalias for a device. The caller must make sure dev cannot go
 * away, e.g. by holding the rcu read lock or owning a reference count
 * to the device.
 */
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
{
	const struct dev_ifalias *alias;
	int ret = 0;

	rcu_read_lock();
	alias = rcu_dereference(dev->ifalias);
	if (alias)
		ret = snprintf(name, len, "%s", alias->ifalias);
	rcu_read_unlock();

	return ret;
}

1306 * netdev_features_change - device changes features
1307 * @dev: device to cause notification
1308 *
1309 * Called to indicate a device has changed features.
1310 */
1311void netdev_features_change(struct net_device *dev)
1312{
1313 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1314}
1315EXPORT_SYMBOL(netdev_features_change);
1316
1317/**
1318 * netdev_state_change - device changes state
1319 * @dev: device to cause notification
1320 *
1321 * Called to indicate a device has changed state. This function calls
1322 * the notifier chains for netdev_chain and sends a NEWLINK message
1323 * to the routing socket.
1324 */
1325void netdev_state_change(struct net_device *dev)
1326{
1327 if (dev->flags & IFF_UP) {
1328 struct netdev_notifier_change_info change_info = {
1329 .info.dev = dev,
1330 };
1331
1332 call_netdevice_notifiers_info(NETDEV_CHANGE,
1333 &change_info.info);
1334 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1335 }
1336}
1337EXPORT_SYMBOL(netdev_state_change);
1338
/**
 * __netdev_notify_peers - notify network peers about existence of @dev,
 * to be called when rtnl lock is already held.
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void __netdev_notify_peers(struct net_device *dev)
{
	ASSERT_RTNL();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
EXPORT_SYMBOL(__netdev_notify_peers);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	__netdev_notify_peers(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

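/* Virtualization and failover drivers typically call this after the backing
 * device or link identity changes, so peers refresh their ARP/ND caches
 * (sketch; the netif_running() guard is a common but optional precaution):
 *
 *	if (netif_running(netdev))
 *		netdev_notify_peers(netdev);	// takes and releases RTNL itself
 */
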
static int napi_threaded_poll(void *data);

static int napi_kthread_create(struct napi_struct *n)
{
	int err = 0;

	/* Create and wake up the kthread once to put it in
	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
	 * warning and work with loadavg.
	 */
	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
				n->dev->name, n->napi_id);
	if (IS_ERR(n->thread)) {
		err = PTR_ERR(n->thread);
		pr_err("kthread_run failed with err %d\n", err);
		n->thread = NULL;
	}

	return err;
}

static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();
	dev_addr_check(dev);

	if (!netif_device_present(dev)) {
		/* may be detached because parent is runtime-suspended */
		if (dev->dev.parent)
			pm_runtime_resume(dev->dev.parent);
		if (!netif_device_present(dev))
			return -ENODEV;
	}

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 * dev_open - prepare an interface for use.
 * @dev: device to open
 * @extack: netlink extended ack
 *
 * Takes a device from down to up state. The device's private open
 * function is invoked and then the multicast lists are loaded. Finally
 * the device is moved into the up state and a %NETDEV_UP message is
 * sent to the netdev notifier chain.
 *
 * Calling this function on an active interface is a nop. On a failure
 * a negative errno code is returned.
 */
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev, extack);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

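/* Both dev_open() and dev_close() expect RTNL to be held by the caller
 * (sketch):
 *
 *	rtnl_lock();
 *	err = dev_open(dev, NULL);	// NULL extack: no netlink error reporting
 *	...
 *	dev_close(dev);
 *	rtnl_unlock();
 */
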
static void __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 * Call the device specific close. This cannot fail.
		 * Only if device is UP
		 *
		 * We allow it to be called even after a DETACH hot-plug
		 * event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}
}

static void __dev_close(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	__dev_close_many(&single);
	list_del(&single);
}

void dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}
}
EXPORT_SYMBOL(dev_close_many);

/**
 * dev_close - shutdown an interface.
 * @dev: device to shutdown
 *
 * This function moves an active device into down state. A
 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 * chain.
 */
void dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
}
EXPORT_SYMBOL(dev_close);


/**
 * dev_disable_lro - disable Large Receive Offload on a device
 * @dev: device
 *
 * Disable Large Receive Offload (LRO) on a net device. Must be
 * called under RTNL. This is needed if received packets may be
 * forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

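/* Typical caller pattern (sketch): forwarding setups, e.g. enslaving @dev to
 * a bridge or enabling IP forwarding, disable LRO so that aggregated frames
 * are not re-forwarded. The call recurses into lower devices itself.
 *
 *	ASSERT_RTNL();
 *	dev_disable_lro(dev);
 */
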
/**
 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 * @dev: device
 *
 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
 * called under RTNL. This is needed if Generic XDP is installed on
 * the device.
 */
static void dev_disable_gro_hw(struct net_device *dev)
{
	dev->wanted_features &= ~NETIF_F_GRO_HW;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_GRO_HW))
		netdev_WARN(dev, "failed to disable GRO_HW!\n");
}

const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
#define N(val) 						\
	case NETDEV_##val:				\
		return "NETDEV_" __stringify(val);
	switch (cmd) {
	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
	N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
	N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
	N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
	N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
	N(XDP_FEAT_CHANGE)
	}
#undef N
	return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info = {
		.dev = dev,
	};

	return nb->notifier_call(nb, val, &info);
}

static int call_netdevice_register_notifiers(struct notifier_block *nb,
					     struct net_device *dev)
{
	int err;

	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	if (!(dev->flags & IFF_UP))
		return 0;

	call_netdevice_notifier(nb, NETDEV_UP, dev);
	return 0;
}

static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
						struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
					dev);
		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
	}
	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}

static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
						 struct net *net)
{
	struct net_device *dev;
	int err;

	for_each_netdev(net, dev) {
		err = call_netdevice_register_notifiers(nb, dev);
		if (err)
			goto rollback;
	}
	return 0;

rollback:
	for_each_netdev_continue_reverse(net, dev)
		call_netdevice_unregister_notifiers(nb, dev);
	return err;
}

static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
						     struct net *net)
{
	struct net_device *dev;

	for_each_netdev(net, dev)
		call_netdevice_unregister_notifiers(nb, dev);
}

static int dev_boot_phase = 1;

/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered, all registration and up events are replayed
 * to the new notifier so that it has a race-free view of the
 * network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net *net;
	int err;

	/* Close race with setup_net() and cleanup_net() */
	down_write(&pernet_ops_rwsem);
	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		err = call_netdevice_register_net_notifiers(nb, net);
		if (err)
			goto rollback;
	}

unlock:
	rtnl_unlock();
	up_write(&pernet_ops_rwsem);
	return err;

rollback:
	for_each_net_continue_reverse(net)
		call_netdevice_unregister_net_notifiers(nb, net);

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

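/* A minimal notifier user (sketch; the names are hypothetical). The callback
 * runs under RTNL and receives every event listed in netdev_cmd_to_name():
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			netdev_info(dev, "saw %s\n", netdev_cmd_to_name(event));
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);	// replays REGISTER/UP
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */
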
/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering, unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net *net;
	int err;

	/* Close race with setup_net() and cleanup_net() */
	down_write(&pernet_ops_rwsem);
	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net)
		call_netdevice_unregister_net_notifiers(nb, net);

unlock:
	rtnl_unlock();
	up_write(&pernet_ops_rwsem);
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

static int __register_netdevice_notifier_net(struct net *net,
					     struct notifier_block *nb,
					     bool ignore_call_fail)
{
	int err;

	err = raw_notifier_chain_register(&net->netdev_chain, nb);
	if (err)
		return err;
	if (dev_boot_phase)
		return 0;

	err = call_netdevice_register_net_notifiers(nb, net);
	if (err && !ignore_call_fail)
		goto chain_unregister;

	return 0;

chain_unregister:
	raw_notifier_chain_unregister(&net->netdev_chain, nb);
	return err;
}

static int __unregister_netdevice_notifier_net(struct net *net,
					       struct notifier_block *nb)
{
	int err;

	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
	if (err)
		return err;

	call_netdevice_unregister_net_notifiers(nb, net);
	return 0;
}

/**
 * register_netdevice_notifier_net - register a per-netns network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered, all registration and up events are replayed
 * to the new notifier so that it has a race-free view of the
 * network device list.
 */

int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = __register_netdevice_notifier_net(net, nb, false);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);

/**
 * unregister_netdevice_notifier_net - unregister a per-netns
 *                                     network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier_net(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering, unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier_net(struct net *net,
				      struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = __unregister_netdevice_notifier_net(net, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);

1878static void __move_netdevice_notifier_net(struct net *src_net,
1879 struct net *dst_net,
1880 struct notifier_block *nb)
1881{
1882 __unregister_netdevice_notifier_net(src_net, nb);
1883 __register_netdevice_notifier_net(dst_net, nb, true);
1884}
1885
1886int register_netdevice_notifier_dev_net(struct net_device *dev,
1887 struct notifier_block *nb,
1888 struct netdev_net_notifier *nn)
1889{
1890 int err;
1891
1892 rtnl_lock();
1893 err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1894 if (!err) {
1895 nn->nb = nb;
1896 list_add(&nn->list, &dev->net_notifier_list);
1897 }
1898 rtnl_unlock();
1899 return err;
1900}
1901EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
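
/*
 * Illustrative sketch: a driver that wants its per-netns notifier to follow
 * the device when it is moved to another network namespace embeds a
 * struct netdev_net_notifier next to its notifier_block ("priv" and its
 * members are hypothetical):
 *
 *	err = register_netdevice_notifier_dev_net(priv->netdev,
 *						  &priv->netdevice_nb,
 *						  &priv->netdevice_nn);
 *
 * The core then uses move_netdevice_notifiers_dev_net() to re-home the
 * notifier on a namespace change.
 */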
1902
1903int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1904 struct notifier_block *nb,
1905 struct netdev_net_notifier *nn)
1906{
1907 int err;
1908
1909 rtnl_lock();
1910 list_del(&nn->list);
1911 err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1912 rtnl_unlock();
1913 return err;
1914}
1915EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1916
1917static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1918 struct net *net)
1919{
1920 struct netdev_net_notifier *nn;
1921
1922 list_for_each_entry(nn, &dev->net_notifier_list, list)
1923 __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
1924}
1925
1926/**
1927 * call_netdevice_notifiers_info - call all network notifier blocks
1928 * @val: value passed unmodified to notifier function
1929 * @info: notifier information data
1930 *
1931 * Call all network notifier blocks. Parameters and return value
1932 * are as for raw_notifier_call_chain().
1933 */
1934
1935int call_netdevice_notifiers_info(unsigned long val,
1936 struct netdev_notifier_info *info)
1937{
1938 struct net *net = dev_net(info->dev);
1939 int ret;
1940
1941 ASSERT_RTNL();
1942
1943 /* Run per-netns notifier block chain first, then run the global one.
	 * Hopefully, one day, the global chain can be removed once all
	 * notifier block registrants have been converted to be per-netns.
1946 */
1947 ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1948 if (ret & NOTIFY_STOP_MASK)
1949 return ret;
1950 return raw_notifier_call_chain(&netdev_chain, val, info);
1951}
1952
1953/**
 * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 * and roll back on error
1956 * @val_up: value passed unmodified to notifier function
1957 * @val_down: value passed unmodified to the notifier function when
1958 * recovering from an error on @val_up
1959 * @info: notifier information data
1960 *
1961 * Call all per-netns network notifier blocks, but not notifier blocks on
1962 * the global notifier chain. Parameters and return value are as for
1963 * raw_notifier_call_chain_robust().
1964 */
1965
1966static int
1967call_netdevice_notifiers_info_robust(unsigned long val_up,
1968 unsigned long val_down,
1969 struct netdev_notifier_info *info)
1970{
1971 struct net *net = dev_net(info->dev);
1972
1973 ASSERT_RTNL();
1974
1975 return raw_notifier_call_chain_robust(&net->netdev_chain,
1976 val_up, val_down, info);
1977}
1978
1979static int call_netdevice_notifiers_extack(unsigned long val,
1980 struct net_device *dev,
1981 struct netlink_ext_ack *extack)
1982{
1983 struct netdev_notifier_info info = {
1984 .dev = dev,
1985 .extack = extack,
1986 };
1987
1988 return call_netdevice_notifiers_info(val, &info);
1989}
1990
1991/**
1992 * call_netdevice_notifiers - call all network notifier blocks
1993 * @val: value passed unmodified to notifier function
1994 * @dev: net_device pointer passed unmodified to notifier function
1995 *
1996 * Call all network notifier blocks. Parameters and return value
1997 * are as for raw_notifier_call_chain().
1998 */
1999
2000int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2001{
2002 return call_netdevice_notifiers_extack(val, dev, NULL);
2003}
2004EXPORT_SYMBOL(call_netdevice_notifiers);
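
/*
 * Illustrative sketch: callers holding rtnl announce device state changes
 * through this helper, e.g. after updating a device attribute:
 *
 *	call_netdevice_notifiers(NETDEV_CHANGE, dev);
 */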
2005
2006/**
2007 * call_netdevice_notifiers_mtu - call all network notifier blocks
2008 * @val: value passed unmodified to notifier function
2009 * @dev: net_device pointer passed unmodified to notifier function
2010 * @arg: additional u32 argument passed to the notifier function
2011 *
2012 * Call all network notifier blocks. Parameters and return value
2013 * are as for raw_notifier_call_chain().
2014 */
2015static int call_netdevice_notifiers_mtu(unsigned long val,
2016 struct net_device *dev, u32 arg)
2017{
2018 struct netdev_notifier_info_ext info = {
2019 .info.dev = dev,
2020 .ext.mtu = arg,
2021 };
2022
2023 BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2024
2025 return call_netdevice_notifiers_info(val, &info.info);
2026}
2027
2028#ifdef CONFIG_NET_INGRESS
2029static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2030
2031void net_inc_ingress_queue(void)
2032{
2033 static_branch_inc(&ingress_needed_key);
2034}
2035EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2036
2037void net_dec_ingress_queue(void)
2038{
2039 static_branch_dec(&ingress_needed_key);
2040}
2041EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2042#endif
2043
2044#ifdef CONFIG_NET_EGRESS
2045static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2046
2047void net_inc_egress_queue(void)
2048{
2049 static_branch_inc(&egress_needed_key);
2050}
2051EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2052
2053void net_dec_egress_queue(void)
2054{
2055 static_branch_dec(&egress_needed_key);
2056}
2057EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2058#endif
2059
2060DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2061EXPORT_SYMBOL(netstamp_needed_key);
2062#ifdef CONFIG_JUMP_LABEL
2063static atomic_t netstamp_needed_deferred;
2064static atomic_t netstamp_wanted;
2065static void netstamp_clear(struct work_struct *work)
2066{
2067 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2068 int wanted;
2069
2070 wanted = atomic_add_return(deferred, &netstamp_wanted);
2071 if (wanted > 0)
2072 static_branch_enable(&netstamp_needed_key);
2073 else
2074 static_branch_disable(&netstamp_needed_key);
2075}
2076static DECLARE_WORK(netstamp_work, netstamp_clear);
2077#endif
2078
2079void net_enable_timestamp(void)
2080{
2081#ifdef CONFIG_JUMP_LABEL
2082 int wanted = atomic_read(&netstamp_wanted);
2083
2084 while (wanted > 0) {
2085 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2086 return;
2087 }
2088 atomic_inc(&netstamp_needed_deferred);
2089 schedule_work(&netstamp_work);
2090#else
2091 static_branch_inc(&netstamp_needed_key);
2092#endif
2093}
2094EXPORT_SYMBOL(net_enable_timestamp);
2095
2096void net_disable_timestamp(void)
2097{
2098#ifdef CONFIG_JUMP_LABEL
2099 int wanted = atomic_read(&netstamp_wanted);
2100
2101 while (wanted > 1) {
2102 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2103 return;
2104 }
2105 atomic_dec(&netstamp_needed_deferred);
2106 schedule_work(&netstamp_work);
2107#else
2108 static_branch_dec(&netstamp_needed_key);
2109#endif
2110}
2111EXPORT_SYMBOL(net_disable_timestamp);
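
/*
 * Illustrative sketch: net_enable_timestamp() and net_disable_timestamp()
 * must stay balanced; conceptually, the socket layer does
 *
 *	net_enable_timestamp();		// first user of timestamping appears
 *	...
 *	net_disable_timestamp();	// last user goes away
 *
 * so that netstamp_needed_key is only enabled while someone actually
 * consumes the software timestamps taken in net_timestamp_set().
 */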
2112
2113static inline void net_timestamp_set(struct sk_buff *skb)
2114{
2115 skb->tstamp = 0;
2116 skb->mono_delivery_time = 0;
2117 if (static_branch_unlikely(&netstamp_needed_key))
2118 skb->tstamp = ktime_get_real();
2119}
2120
2121#define net_timestamp_check(COND, SKB) \
2122 if (static_branch_unlikely(&netstamp_needed_key)) { \
2123 if ((COND) && !(SKB)->tstamp) \
2124 (SKB)->tstamp = ktime_get_real(); \
2125 } \
2126
2127bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2128{
2129 return __is_skb_forwardable(dev, skb, true);
2130}
2131EXPORT_SYMBOL_GPL(is_skb_forwardable);
2132
2133static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2134 bool check_mtu)
2135{
2136 int ret = ____dev_forward_skb(dev, skb, check_mtu);
2137
2138 if (likely(!ret)) {
2139 skb->protocol = eth_type_trans(skb, dev);
2140 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2141 }
2142
2143 return ret;
2144}
2145
2146int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2147{
2148 return __dev_forward_skb2(dev, skb, true);
2149}
2150EXPORT_SYMBOL_GPL(__dev_forward_skb);
2151
2152/**
2153 * dev_forward_skb - loopback an skb to another netif
2154 *
2155 * @dev: destination network device
2156 * @skb: buffer to forward
2157 *
2158 * return values:
2159 * NET_RX_SUCCESS (no congestion)
2160 * NET_RX_DROP (packet was dropped, but freed)
2161 *
2162 * dev_forward_skb can be used for injecting an skb from the
2163 * start_xmit function of one device into the receive queue
2164 * of another device.
2165 *
2166 * The receiving device may be in another namespace, so
2167 * we have to clear all information in the skb that could
2168 * impact namespace isolation.
2169 */
2170int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2171{
2172 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2173}
2174EXPORT_SYMBOL_GPL(dev_forward_skb);
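
/*
 * Illustrative sketch: a virtual device can hand a frame to its peer from
 * ndo_start_xmit().  "my_get_peer" is hypothetical; note that
 * dev_forward_skb() consumes the skb on both success and drop:
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);	// hypothetical
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev_core_stats_tx_dropped_inc(dev);
 *		return NETDEV_TX_OK;
 *	}
 */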
2175
2176int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2177{
2178 return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2179}
2180
2181static inline int deliver_skb(struct sk_buff *skb,
2182 struct packet_type *pt_prev,
2183 struct net_device *orig_dev)
2184{
2185 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2186 return -ENOMEM;
2187 refcount_inc(&skb->users);
2188 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2189}
2190
2191static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2192 struct packet_type **pt,
2193 struct net_device *orig_dev,
2194 __be16 type,
2195 struct list_head *ptype_list)
2196{
2197 struct packet_type *ptype, *pt_prev = *pt;
2198
2199 list_for_each_entry_rcu(ptype, ptype_list, list) {
2200 if (ptype->type != type)
2201 continue;
2202 if (pt_prev)
2203 deliver_skb(skb, pt_prev, orig_dev);
2204 pt_prev = ptype;
2205 }
2206 *pt = pt_prev;
2207}
2208
2209static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2210{
2211 if (!ptype->af_packet_priv || !skb->sk)
2212 return false;
2213
2214 if (ptype->id_match)
2215 return ptype->id_match(ptype, skb->sk);
2216 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2217 return true;
2218
2219 return false;
2220}
2221
2222/**
2223 * dev_nit_active - return true if any network interface taps are in use
2224 *
2225 * @dev: network device to check for the presence of taps
2226 */
2227bool dev_nit_active(struct net_device *dev)
2228{
2229 return !list_empty(&net_hotdata.ptype_all) ||
2230 !list_empty(&dev->ptype_all);
2231}
2232EXPORT_SYMBOL_GPL(dev_nit_active);
2233
2234/*
2235 * Support routine. Sends outgoing frames to any network
2236 * taps currently in use.
2237 */
2238
2239void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2240{
2241 struct list_head *ptype_list = &net_hotdata.ptype_all;
2242 struct packet_type *ptype, *pt_prev = NULL;
2243 struct sk_buff *skb2 = NULL;
2244
2245 rcu_read_lock();
2246again:
2247 list_for_each_entry_rcu(ptype, ptype_list, list) {
2248 if (READ_ONCE(ptype->ignore_outgoing))
2249 continue;
2250
2251 /* Never send packets back to the socket
2252 * they originated from - MvS (miquels@drinkel.ow.org)
2253 */
2254 if (skb_loop_sk(ptype, skb))
2255 continue;
2256
2257 if (pt_prev) {
2258 deliver_skb(skb2, pt_prev, skb->dev);
2259 pt_prev = ptype;
2260 continue;
2261 }
2262
2263 /* need to clone skb, done only once */
2264 skb2 = skb_clone(skb, GFP_ATOMIC);
2265 if (!skb2)
2266 goto out_unlock;
2267
2268 net_timestamp_set(skb2);
2269
2270 /* skb->nh should be correctly
2271 * set by sender, so that the second statement is
2272 * just protection against buggy protocols.
2273 */
2274 skb_reset_mac_header(skb2);
2275
2276 if (skb_network_header(skb2) < skb2->data ||
2277 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2278 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2279 ntohs(skb2->protocol),
2280 dev->name);
2281 skb_reset_network_header(skb2);
2282 }
2283
2284 skb2->transport_header = skb2->network_header;
2285 skb2->pkt_type = PACKET_OUTGOING;
2286 pt_prev = ptype;
2287 }
2288
2289 if (ptype_list == &net_hotdata.ptype_all) {
2290 ptype_list = &dev->ptype_all;
2291 goto again;
2292 }
2293out_unlock:
2294 if (pt_prev) {
2295 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2296 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2297 else
2298 kfree_skb(skb2);
2299 }
2300 rcu_read_unlock();
2301}
2302EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2303
2304/**
2305 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2306 * @dev: Network device
2307 * @txq: number of queues available
2308 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not, zero the mapping (map the priority to TC0). With no priorities
 * mapping to this offset/count pair it will no longer be used. In the
 * worst case, if TC0 is invalid, nothing can be done, so disable
 * priority mappings altogether. It is expected that drivers will fix
 * this mapping if they can before calling netif_set_real_num_tx_queues.
2316 */
2317static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2318{
2319 int i;
2320 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2321
2322 /* If TC0 is invalidated disable TC mapping */
2323 if (tc->offset + tc->count > txq) {
2324 netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2325 dev->num_tc = 0;
2326 return;
2327 }
2328
2329 /* Invalidated prio to tc mappings set to TC0 */
2330 for (i = 1; i < TC_BITMASK + 1; i++) {
2331 int q = netdev_get_prio_tc_map(dev, i);
2332
2333 tc = &dev->tc_to_txq[q];
2334 if (tc->offset + tc->count > txq) {
2335 netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2336 i, q);
2337 netdev_set_prio_tc_map(dev, i, 0);
2338 }
2339 }
2340}
2341
2342int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2343{
2344 if (dev->num_tc) {
2345 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2346 int i;
2347
2348 /* walk through the TCs and see if it falls into any of them */
2349 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2350 if ((txq - tc->offset) < tc->count)
2351 return i;
2352 }
2353
2354 /* didn't find it, just return -1 to indicate no match */
2355 return -1;
2356 }
2357
2358 return 0;
2359}
2360EXPORT_SYMBOL(netdev_txq_to_tc);
2361
2362#ifdef CONFIG_XPS
2363static struct static_key xps_needed __read_mostly;
2364static struct static_key xps_rxqs_needed __read_mostly;
2365static DEFINE_MUTEX(xps_map_mutex);
2366#define xmap_dereference(P) \
2367 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2368
2369static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2370 struct xps_dev_maps *old_maps, int tci, u16 index)
2371{
2372 struct xps_map *map = NULL;
2373 int pos;
2374
2375 map = xmap_dereference(dev_maps->attr_map[tci]);
2376 if (!map)
2377 return false;
2378
2379 for (pos = map->len; pos--;) {
2380 if (map->queues[pos] != index)
2381 continue;
2382
2383 if (map->len > 1) {
2384 map->queues[pos] = map->queues[--map->len];
2385 break;
2386 }
2387
2388 if (old_maps)
2389 RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2390 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2391 kfree_rcu(map, rcu);
2392 return false;
2393 }
2394
2395 return true;
2396}
2397
2398static bool remove_xps_queue_cpu(struct net_device *dev,
2399 struct xps_dev_maps *dev_maps,
2400 int cpu, u16 offset, u16 count)
2401{
2402 int num_tc = dev_maps->num_tc;
2403 bool active = false;
2404 int tci;
2405
2406 for (tci = cpu * num_tc; num_tc--; tci++) {
2407 int i, j;
2408
2409 for (i = count, j = offset; i--; j++) {
2410 if (!remove_xps_queue(dev_maps, NULL, tci, j))
2411 break;
2412 }
2413
2414 active |= i < 0;
2415 }
2416
2417 return active;
2418}
2419
2420static void reset_xps_maps(struct net_device *dev,
2421 struct xps_dev_maps *dev_maps,
2422 enum xps_map_type type)
2423{
2424 static_key_slow_dec_cpuslocked(&xps_needed);
2425 if (type == XPS_RXQS)
2426 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2427
2428 RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2429
2430 kfree_rcu(dev_maps, rcu);
2431}
2432
2433static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2434 u16 offset, u16 count)
2435{
2436 struct xps_dev_maps *dev_maps;
2437 bool active = false;
2438 int i, j;
2439
2440 dev_maps = xmap_dereference(dev->xps_maps[type]);
2441 if (!dev_maps)
2442 return;
2443
2444 for (j = 0; j < dev_maps->nr_ids; j++)
2445 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2446 if (!active)
2447 reset_xps_maps(dev, dev_maps, type);
2448
2449 if (type == XPS_CPUS) {
2450 for (i = offset + (count - 1); count--; i--)
2451 netdev_queue_numa_node_write(
2452 netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2453 }
2454}
2455
2456static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2457 u16 count)
2458{
2459 if (!static_key_false(&xps_needed))
2460 return;
2461
2462 cpus_read_lock();
2463 mutex_lock(&xps_map_mutex);
2464
2465 if (static_key_false(&xps_rxqs_needed))
2466 clean_xps_maps(dev, XPS_RXQS, offset, count);
2467
2468 clean_xps_maps(dev, XPS_CPUS, offset, count);
2469
2470 mutex_unlock(&xps_map_mutex);
2471 cpus_read_unlock();
2472}
2473
2474static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2475{
2476 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2477}
2478
2479static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2480 u16 index, bool is_rxqs_map)
2481{
2482 struct xps_map *new_map;
2483 int alloc_len = XPS_MIN_MAP_ALLOC;
2484 int i, pos;
2485
2486 for (pos = 0; map && pos < map->len; pos++) {
2487 if (map->queues[pos] != index)
2488 continue;
2489 return map;
2490 }
2491
2492 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2493 if (map) {
2494 if (pos < map->alloc_len)
2495 return map;
2496
2497 alloc_len = map->alloc_len * 2;
2498 }
2499
2500 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2501 * map
2502 */
2503 if (is_rxqs_map)
2504 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2505 else
2506 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2507 cpu_to_node(attr_index));
2508 if (!new_map)
2509 return NULL;
2510
2511 for (i = 0; i < pos; i++)
2512 new_map->queues[i] = map->queues[i];
2513 new_map->alloc_len = alloc_len;
2514 new_map->len = pos;
2515
2516 return new_map;
2517}
2518
2519/* Copy xps maps at a given index */
2520static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2521 struct xps_dev_maps *new_dev_maps, int index,
2522 int tc, bool skip_tc)
2523{
2524 int i, tci = index * dev_maps->num_tc;
2525 struct xps_map *map;
2526
2527 /* copy maps belonging to foreign traffic classes */
2528 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2529 if (i == tc && skip_tc)
2530 continue;
2531
2532 /* fill in the new device map from the old device map */
2533 map = xmap_dereference(dev_maps->attr_map[tci]);
2534 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2535 }
2536}
2537
2538/* Must be called under cpus_read_lock */
2539int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2540 u16 index, enum xps_map_type type)
2541{
2542 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2543 const unsigned long *online_mask = NULL;
2544 bool active = false, copy = false;
2545 int i, j, tci, numa_node_id = -2;
2546 int maps_sz, num_tc = 1, tc = 0;
2547 struct xps_map *map, *new_map;
2548 unsigned int nr_ids;
2549
2550 WARN_ON_ONCE(index >= dev->num_tx_queues);
2551
2552 if (dev->num_tc) {
2553 /* Do not allow XPS on subordinate device directly */
2554 num_tc = dev->num_tc;
2555 if (num_tc < 0)
2556 return -EINVAL;
2557
2558 /* If queue belongs to subordinate dev use its map */
2559 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2560
2561 tc = netdev_txq_to_tc(dev, index);
2562 if (tc < 0)
2563 return -EINVAL;
2564 }
2565
2566 mutex_lock(&xps_map_mutex);
2567
2568 dev_maps = xmap_dereference(dev->xps_maps[type]);
2569 if (type == XPS_RXQS) {
2570 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2571 nr_ids = dev->num_rx_queues;
2572 } else {
2573 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2574 if (num_possible_cpus() > 1)
2575 online_mask = cpumask_bits(cpu_online_mask);
2576 nr_ids = nr_cpu_ids;
2577 }
2578
2579 if (maps_sz < L1_CACHE_BYTES)
2580 maps_sz = L1_CACHE_BYTES;
2581
2582 /* The old dev_maps could be larger or smaller than the one we're
2583 * setting up now, as dev->num_tc or nr_ids could have been updated in
2584 * between. We could try to be smart, but let's be safe instead and only
2585 * copy foreign traffic classes if the two map sizes match.
2586 */
2587 if (dev_maps &&
2588 dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2589 copy = true;
2590
2591 /* allocate memory for queue storage */
2592 for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2593 j < nr_ids;) {
2594 if (!new_dev_maps) {
2595 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2596 if (!new_dev_maps) {
2597 mutex_unlock(&xps_map_mutex);
2598 return -ENOMEM;
2599 }
2600
2601 new_dev_maps->nr_ids = nr_ids;
2602 new_dev_maps->num_tc = num_tc;
2603 }
2604
2605 tci = j * num_tc + tc;
2606 map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2607
2608 map = expand_xps_map(map, j, index, type == XPS_RXQS);
2609 if (!map)
2610 goto error;
2611
2612 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2613 }
2614
2615 if (!new_dev_maps)
2616 goto out_no_new_maps;
2617
2618 if (!dev_maps) {
2619 /* Increment static keys at most once per type */
2620 static_key_slow_inc_cpuslocked(&xps_needed);
2621 if (type == XPS_RXQS)
2622 static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2623 }
2624
2625 for (j = 0; j < nr_ids; j++) {
2626 bool skip_tc = false;
2627
2628 tci = j * num_tc + tc;
2629 if (netif_attr_test_mask(j, mask, nr_ids) &&
2630 netif_attr_test_online(j, online_mask, nr_ids)) {
2631 /* add tx-queue to CPU/rx-queue maps */
2632 int pos = 0;
2633
2634 skip_tc = true;
2635
2636 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2637 while ((pos < map->len) && (map->queues[pos] != index))
2638 pos++;
2639
2640 if (pos == map->len)
2641 map->queues[map->len++] = index;
2642#ifdef CONFIG_NUMA
2643 if (type == XPS_CPUS) {
2644 if (numa_node_id == -2)
2645 numa_node_id = cpu_to_node(j);
2646 else if (numa_node_id != cpu_to_node(j))
2647 numa_node_id = -1;
2648 }
2649#endif
2650 }
2651
2652 if (copy)
2653 xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2654 skip_tc);
2655 }
2656
2657 rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2658
2659 /* Cleanup old maps */
2660 if (!dev_maps)
2661 goto out_no_old_maps;
2662
2663 for (j = 0; j < dev_maps->nr_ids; j++) {
2664 for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2665 map = xmap_dereference(dev_maps->attr_map[tci]);
2666 if (!map)
2667 continue;
2668
2669 if (copy) {
2670 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2671 if (map == new_map)
2672 continue;
2673 }
2674
2675 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2676 kfree_rcu(map, rcu);
2677 }
2678 }
2679
2680 old_dev_maps = dev_maps;
2681
2682out_no_old_maps:
2683 dev_maps = new_dev_maps;
2684 active = true;
2685
2686out_no_new_maps:
2687 if (type == XPS_CPUS)
2688 /* update Tx queue numa node */
2689 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2690 (numa_node_id >= 0) ?
2691 numa_node_id : NUMA_NO_NODE);
2692
2693 if (!dev_maps)
2694 goto out_no_maps;
2695
2696 /* removes tx-queue from unused CPUs/rx-queues */
2697 for (j = 0; j < dev_maps->nr_ids; j++) {
2698 tci = j * dev_maps->num_tc;
2699
2700 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2701 if (i == tc &&
2702 netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2703 netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2704 continue;
2705
2706 active |= remove_xps_queue(dev_maps,
2707 copy ? old_dev_maps : NULL,
2708 tci, index);
2709 }
2710 }
2711
2712 if (old_dev_maps)
2713 kfree_rcu(old_dev_maps, rcu);
2714
2715 /* free map if not active */
2716 if (!active)
2717 reset_xps_maps(dev, dev_maps, type);
2718
2719out_no_maps:
2720 mutex_unlock(&xps_map_mutex);
2721
2722 return 0;
2723error:
2724 /* remove any maps that we added */
2725 for (j = 0; j < nr_ids; j++) {
2726 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2727 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2728 map = copy ?
2729 xmap_dereference(dev_maps->attr_map[tci]) :
2730 NULL;
2731 if (new_map && new_map != map)
2732 kfree(new_map);
2733 }
2734 }
2735
2736 mutex_unlock(&xps_map_mutex);
2737
2738 kfree(new_dev_maps);
2739 return -ENOMEM;
2740}
2741EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2742
2743int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2744 u16 index)
2745{
2746 int ret;
2747
2748 cpus_read_lock();
2749 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2750 cpus_read_unlock();
2751
2752 return ret;
2753}
2754EXPORT_SYMBOL(netif_set_xps_queue);
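
/*
 * Illustrative sketch: a driver can steer transmit traffic from a given CPU
 * to the TX queue whose interrupt is serviced on that CPU, e.g. while
 * setting up queue i:
 *
 *	netif_set_xps_queue(dev, cpumask_of(cpu), i);
 *
 * Userspace can configure the same mapping through
 * /sys/class/net/<dev>/queues/tx-<i>/xps_cpus.
 */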
2755
2756#endif
2757static void netdev_unbind_all_sb_channels(struct net_device *dev)
2758{
2759 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2760
2761 /* Unbind any subordinate channels */
2762 while (txq-- != &dev->_tx[0]) {
2763 if (txq->sb_dev)
2764 netdev_unbind_sb_channel(dev, txq->sb_dev);
2765 }
2766}
2767
2768void netdev_reset_tc(struct net_device *dev)
2769{
2770#ifdef CONFIG_XPS
2771 netif_reset_xps_queues_gt(dev, 0);
2772#endif
2773 netdev_unbind_all_sb_channels(dev);
2774
2775 /* Reset TC configuration of device */
2776 dev->num_tc = 0;
2777 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2778 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2779}
2780EXPORT_SYMBOL(netdev_reset_tc);
2781
2782int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2783{
2784 if (tc >= dev->num_tc)
2785 return -EINVAL;
2786
2787#ifdef CONFIG_XPS
2788 netif_reset_xps_queues(dev, offset, count);
2789#endif
2790 dev->tc_to_txq[tc].count = count;
2791 dev->tc_to_txq[tc].offset = offset;
2792 return 0;
2793}
2794EXPORT_SYMBOL(netdev_set_tc_queue);
2795
2796int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2797{
2798 if (num_tc > TC_MAX_QUEUE)
2799 return -EINVAL;
2800
2801#ifdef CONFIG_XPS
2802 netif_reset_xps_queues_gt(dev, 0);
2803#endif
2804 netdev_unbind_all_sb_channels(dev);
2805
2806 dev->num_tc = num_tc;
2807 return 0;
2808}
2809EXPORT_SYMBOL(netdev_set_num_tc);
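
/*
 * Illustrative sketch: carving 8 TX queues into two traffic classes of four
 * queues each, as a driver's mqprio setup might do:
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	// TC0 -> queues 0..3
 *	netdev_set_tc_queue(dev, 1, 4, 4);	// TC1 -> queues 4..7
 *
 * netdev_reset_tc() undoes the whole configuration.
 */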
2810
2811void netdev_unbind_sb_channel(struct net_device *dev,
2812 struct net_device *sb_dev)
2813{
2814 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2815
2816#ifdef CONFIG_XPS
2817 netif_reset_xps_queues_gt(sb_dev, 0);
2818#endif
2819 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2820 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2821
2822 while (txq-- != &dev->_tx[0]) {
2823 if (txq->sb_dev == sb_dev)
2824 txq->sb_dev = NULL;
2825 }
2826}
2827EXPORT_SYMBOL(netdev_unbind_sb_channel);
2828
2829int netdev_bind_sb_channel_queue(struct net_device *dev,
2830 struct net_device *sb_dev,
2831 u8 tc, u16 count, u16 offset)
2832{
2833 /* Make certain the sb_dev and dev are already configured */
2834 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2835 return -EINVAL;
2836
2837 /* We cannot hand out queues we don't have */
2838 if ((offset + count) > dev->real_num_tx_queues)
2839 return -EINVAL;
2840
2841 /* Record the mapping */
2842 sb_dev->tc_to_txq[tc].count = count;
2843 sb_dev->tc_to_txq[tc].offset = offset;
2844
2845 /* Provide a way for Tx queue to find the tc_to_txq map or
2846 * XPS map for itself.
2847 */
2848 while (count--)
2849 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2850
2851 return 0;
2852}
2853EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2854
2855int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2856{
2857 /* Do not use a multiqueue device to represent a subordinate channel */
2858 if (netif_is_multiqueue(dev))
2859 return -ENODEV;
2860
2861 /* We allow channels 1 - 32767 to be used for subordinate channels.
2862 * Channel 0 is meant to be "native" mode and used only to represent
2863 * the main root device. We allow writing 0 to reset the device back
2864 * to normal mode after being used as a subordinate channel.
2865 */
2866 if (channel > S16_MAX)
2867 return -EINVAL;
2868
2869 dev->num_tc = -channel;
2870
2871 return 0;
2872}
2873EXPORT_SYMBOL(netdev_set_sb_channel);
2874
2875/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2878 */
2879int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2880{
2881 bool disabling;
2882 int rc;
2883
2884 disabling = txq < dev->real_num_tx_queues;
2885
2886 if (txq < 1 || txq > dev->num_tx_queues)
2887 return -EINVAL;
2888
2889 if (dev->reg_state == NETREG_REGISTERED ||
2890 dev->reg_state == NETREG_UNREGISTERING) {
2891 ASSERT_RTNL();
2892
2893 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2894 txq);
2895 if (rc)
2896 return rc;
2897
2898 if (dev->num_tc)
2899 netif_setup_tc(dev, txq);
2900
2901 dev_qdisc_change_real_num_tx(dev, txq);
2902
2903 dev->real_num_tx_queues = txq;
2904
2905 if (disabling) {
2906 synchronize_net();
2907 qdisc_reset_all_tx_gt(dev, txq);
2908#ifdef CONFIG_XPS
2909 netif_reset_xps_queues_gt(dev, txq);
2910#endif
2911 }
2912 } else {
2913 dev->real_num_tx_queues = txq;
2914 }
2915
2916 return 0;
2917}
2918EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2919
2920#ifdef CONFIG_SYSFS
2921/**
2922 * netif_set_real_num_rx_queues - set actual number of RX queues used
2923 * @dev: Network device
2924 * @rxq: Actual number of RX queues
2925 *
2926 * This must be called either with the rtnl_lock held or before
2927 * registration of the net device. Returns 0 on success, or a
2928 * negative error code. If called before registration, it always
2929 * succeeds.
2930 */
2931int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2932{
2933 int rc;
2934
2935 if (rxq < 1 || rxq > dev->num_rx_queues)
2936 return -EINVAL;
2937
2938 if (dev->reg_state == NETREG_REGISTERED) {
2939 ASSERT_RTNL();
2940
2941 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2942 rxq);
2943 if (rc)
2944 return rc;
2945 }
2946
2947 dev->real_num_rx_queues = rxq;
2948 return 0;
2949}
2950EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2951#endif
2952
2953/**
2954 * netif_set_real_num_queues - set actual number of RX and TX queues used
2955 * @dev: Network device
2956 * @txq: Actual number of TX queues
2957 * @rxq: Actual number of RX queues
2958 *
2959 * Set the real number of both TX and RX queues.
2960 * Does nothing if the number of queues is already correct.
2961 */
2962int netif_set_real_num_queues(struct net_device *dev,
2963 unsigned int txq, unsigned int rxq)
2964{
2965 unsigned int old_rxq = dev->real_num_rx_queues;
2966 int err;
2967
2968 if (txq < 1 || txq > dev->num_tx_queues ||
2969 rxq < 1 || rxq > dev->num_rx_queues)
2970 return -EINVAL;
2971
2972 /* Start from increases, so the error path only does decreases -
2973 * decreases can't fail.
2974 */
2975 if (rxq > dev->real_num_rx_queues) {
2976 err = netif_set_real_num_rx_queues(dev, rxq);
2977 if (err)
2978 return err;
2979 }
2980 if (txq > dev->real_num_tx_queues) {
2981 err = netif_set_real_num_tx_queues(dev, txq);
2982 if (err)
2983 goto undo_rx;
2984 }
2985 if (rxq < dev->real_num_rx_queues)
2986 WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
2987 if (txq < dev->real_num_tx_queues)
2988 WARN_ON(netif_set_real_num_tx_queues(dev, txq));
2989
2990 return 0;
2991undo_rx:
2992 WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
2993 return err;
2994}
2995EXPORT_SYMBOL(netif_set_real_num_queues);
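
/*
 * Illustrative sketch: a driver resizing its active queue set, e.g. from an
 * ethtool ->set_channels() callback run under rtnl ("new_txq" and "new_rxq"
 * are hypothetical):
 *
 *	err = netif_set_real_num_queues(dev, new_txq, new_rxq);
 *	if (err)
 *		return err;
 *
 * Increases are applied first, so a failure leaves the old, smaller
 * configuration intact.
 */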
2996
2997/**
2998 * netif_set_tso_max_size() - set the max size of TSO frames supported
2999 * @dev: netdev to update
3000 * @size: max skb->len of a TSO frame
3001 *
3002 * Set the limit on the size of TSO super-frames the device can handle.
3003 * Unless explicitly set the stack will assume the value of
3004 * %GSO_LEGACY_MAX_SIZE.
3005 */
3006void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3007{
3008 dev->tso_max_size = min(GSO_MAX_SIZE, size);
3009 if (size < READ_ONCE(dev->gso_max_size))
3010 netif_set_gso_max_size(dev, size);
3011 if (size < READ_ONCE(dev->gso_ipv4_max_size))
3012 netif_set_gso_ipv4_max_size(dev, size);
3013}
3014EXPORT_SYMBOL(netif_set_tso_max_size);
3015
3016/**
3017 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3018 * @dev: netdev to update
3019 * @segs: max number of TCP segments
3020 *
3021 * Set the limit on the number of TCP segments the device can generate from
3022 * a single TSO super-frame.
3023 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3024 */
3025void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3026{
3027 dev->tso_max_segs = segs;
3028 if (segs < READ_ONCE(dev->gso_max_segs))
3029 netif_set_gso_max_segs(dev, segs);
3030}
3031EXPORT_SYMBOL(netif_set_tso_max_segs);
3032
3033/**
3034 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3035 * @to: netdev to update
3036 * @from: netdev from which to copy the limits
3037 */
3038void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3039{
3040 netif_set_tso_max_size(to, from->tso_max_size);
3041 netif_set_tso_max_segs(to, from->tso_max_segs);
3042}
3043EXPORT_SYMBOL(netif_inherit_tso_max);
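
/*
 * Illustrative sketch: a device whose DMA engine can take at most 64kB and
 * 32 descriptors per TSO job could advertise that at probe time (the limits
 * here are made up):
 *
 *	netif_set_tso_max_size(netdev, SZ_64K);
 *	netif_set_tso_max_segs(netdev, 32);
 *
 * A stacked device would instead call netif_inherit_tso_max() against its
 * lower device.
 */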
3044
3045/**
3046 * netif_get_num_default_rss_queues - default number of RSS queues
3047 *
3048 * Default value is the number of physical cores if there are only 1 or 2, or
3049 * divided by 2 if there are more.
3050 */
3051int netif_get_num_default_rss_queues(void)
3052{
3053 cpumask_var_t cpus;
3054 int cpu, count = 0;
3055
3056 if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3057 return 1;
3058
3059 cpumask_copy(cpus, cpu_online_mask);
3060 for_each_cpu(cpu, cpus) {
3061 ++count;
3062 cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3063 }
3064 free_cpumask_var(cpus);
3065
3066 return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3067}
3068EXPORT_SYMBOL(netif_get_num_default_rss_queues);
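
/*
 * Illustrative sketch: drivers typically clamp this against their hardware
 * limit when sizing RX queues ("MY_MAX_QUEUES" is hypothetical):
 *
 *	num_rxq = min_t(unsigned int, netif_get_num_default_rss_queues(),
 *			MY_MAX_QUEUES);
 */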
3069
3070static void __netif_reschedule(struct Qdisc *q)
3071{
3072 struct softnet_data *sd;
3073 unsigned long flags;
3074
3075 local_irq_save(flags);
3076 sd = this_cpu_ptr(&softnet_data);
3077 q->next_sched = NULL;
3078 *sd->output_queue_tailp = q;
3079 sd->output_queue_tailp = &q->next_sched;
3080 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3081 local_irq_restore(flags);
3082}
3083
3084void __netif_schedule(struct Qdisc *q)
3085{
3086 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3087 __netif_reschedule(q);
3088}
3089EXPORT_SYMBOL(__netif_schedule);
3090
3091struct dev_kfree_skb_cb {
3092 enum skb_drop_reason reason;
3093};
3094
3095static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3096{
3097 return (struct dev_kfree_skb_cb *)skb->cb;
3098}
3099
3100void netif_schedule_queue(struct netdev_queue *txq)
3101{
3102 rcu_read_lock();
3103 if (!netif_xmit_stopped(txq)) {
3104 struct Qdisc *q = rcu_dereference(txq->qdisc);
3105
3106 __netif_schedule(q);
3107 }
3108 rcu_read_unlock();
3109}
3110EXPORT_SYMBOL(netif_schedule_queue);
3111
3112void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3113{
3114 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3115 struct Qdisc *q;
3116
3117 rcu_read_lock();
3118 q = rcu_dereference(dev_queue->qdisc);
3119 __netif_schedule(q);
3120 rcu_read_unlock();
3121 }
3122}
3123EXPORT_SYMBOL(netif_tx_wake_queue);
3124
3125void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3126{
3127 unsigned long flags;
3128
3129 if (unlikely(!skb))
3130 return;
3131
3132 if (likely(refcount_read(&skb->users) == 1)) {
3133 smp_rmb();
3134 refcount_set(&skb->users, 0);
3135 } else if (likely(!refcount_dec_and_test(&skb->users))) {
3136 return;
3137 }
3138 get_kfree_skb_cb(skb)->reason = reason;
3139 local_irq_save(flags);
3140 skb->next = __this_cpu_read(softnet_data.completion_queue);
3141 __this_cpu_write(softnet_data.completion_queue, skb);
3142 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3143 local_irq_restore(flags);
3144}
3145EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3146
3147void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3148{
3149 if (in_hardirq() || irqs_disabled())
3150 dev_kfree_skb_irq_reason(skb, reason);
3151 else
3152 kfree_skb_reason(skb, reason);
3153}
3154EXPORT_SYMBOL(dev_kfree_skb_any_reason);
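
/*
 * Drivers normally reach this through the dev_kfree_skb_any() and
 * dev_consume_skb_any() wrappers, for example from a TX completion handler
 * that may run in hard IRQ context (illustrative sketch, "tx_buf" is
 * hypothetical):
 *
 *	dev_consume_skb_any(tx_buf->skb);
 */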
3155
3156
3157/**
3158 * netif_device_detach - mark device as removed
3159 * @dev: network device
3160 *
 * Mark device as removed from the system and therefore no longer available.
3162 */
3163void netif_device_detach(struct net_device *dev)
3164{
3165 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3166 netif_running(dev)) {
3167 netif_tx_stop_all_queues(dev);
3168 }
3169}
3170EXPORT_SYMBOL(netif_device_detach);
3171
3172/**
3173 * netif_device_attach - mark device as attached
3174 * @dev: network device
3175 *
 * Mark device as attached to the system and restart it if needed.
3177 */
3178void netif_device_attach(struct net_device *dev)
3179{
3180 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3181 netif_running(dev)) {
3182 netif_tx_wake_all_queues(dev);
3183 __netdev_watchdog_up(dev);
3184 }
3185}
3186EXPORT_SYMBOL(netif_device_attach);
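
/*
 * Illustrative sketch: the usual pairing in a driver's power-management
 * callbacks ("my_suspend" and "my_resume" are hypothetical):
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		netif_device_detach(netdev);
 *		// ... stop DMA, save hardware state ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		// ... restore hardware state, restart DMA ...
 *		netif_device_attach(netdev);
 *		return 0;
 *	}
 */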
3187
3188/*
 * Returns a Tx hash based on the given packet descriptor and the number
 * of Tx queues to be used as a distribution range.
3191 */
3192static u16 skb_tx_hash(const struct net_device *dev,
3193 const struct net_device *sb_dev,
3194 struct sk_buff *skb)
3195{
3196 u32 hash;
3197 u16 qoffset = 0;
3198 u16 qcount = dev->real_num_tx_queues;
3199
3200 if (dev->num_tc) {
3201 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3202
3203 qoffset = sb_dev->tc_to_txq[tc].offset;
3204 qcount = sb_dev->tc_to_txq[tc].count;
3205 if (unlikely(!qcount)) {
3206 net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3207 sb_dev->name, qoffset, tc);
3208 qoffset = 0;
3209 qcount = dev->real_num_tx_queues;
3210 }
3211 }
3212
3213 if (skb_rx_queue_recorded(skb)) {
3214 DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3215 hash = skb_get_rx_queue(skb);
3216 if (hash >= qoffset)
3217 hash -= qoffset;
3218 while (unlikely(hash >= qcount))
3219 hash -= qcount;
3220 return hash + qoffset;
3221 }
3222
3223 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3224}
3225
3226void skb_warn_bad_offload(const struct sk_buff *skb)
3227{
3228 static const netdev_features_t null_features;
3229 struct net_device *dev = skb->dev;
3230 const char *name = "";
3231
3232 if (!net_ratelimit())
3233 return;
3234
3235 if (dev) {
3236 if (dev->dev.parent)
3237 name = dev_driver_string(dev->dev.parent);
3238 else
3239 name = netdev_name(dev);
3240 }
3241 skb_dump(KERN_WARNING, skb, false);
3242 WARN(1, "%s: caps=(%pNF, %pNF)\n",
3243 name, dev ? &dev->features : &null_features,
3244 skb->sk ? &skb->sk->sk_route_caps : &null_features);
3245}
3246
3247/*
3248 * Invalidate hardware checksum when packet is to be mangled, and
3249 * complete checksum manually on outgoing path.
3250 */
3251int skb_checksum_help(struct sk_buff *skb)
3252{
3253 __wsum csum;
3254 int ret = 0, offset;
3255
3256 if (skb->ip_summed == CHECKSUM_COMPLETE)
3257 goto out_set_summed;
3258
3259 if (unlikely(skb_is_gso(skb))) {
3260 skb_warn_bad_offload(skb);
3261 return -EINVAL;
3262 }
3263
3264 /* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity: checksum could be wrong.
3266 */
3267 if (skb_has_shared_frag(skb)) {
3268 ret = __skb_linearize(skb);
3269 if (ret)
3270 goto out;
3271 }
3272
3273 offset = skb_checksum_start_offset(skb);
3274 ret = -EINVAL;
3275 if (unlikely(offset >= skb_headlen(skb))) {
3276 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3277 WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3278 offset, skb_headlen(skb));
3279 goto out;
3280 }
3281 csum = skb_checksum(skb, offset, skb->len - offset, 0);
3282
3283 offset += skb->csum_offset;
3284 if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3285 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3286 WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3287 offset + sizeof(__sum16), skb_headlen(skb));
3288 goto out;
3289 }
3290 ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3291 if (ret)
3292 goto out;
3293
3294 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3295out_set_summed:
3296 skb->ip_summed = CHECKSUM_NONE;
3297out:
3298 return ret;
3299}
3300EXPORT_SYMBOL(skb_checksum_help);
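
/*
 * Illustrative sketch: a driver that cannot offload the checksum of a
 * particular packet can fall back to software before handing it to the
 * hardware ("my_hw_can_csum" is hypothetical):
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !my_hw_can_csum(skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop;
 */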
3301
3302int skb_crc32c_csum_help(struct sk_buff *skb)
3303{
3304 __le32 crc32c_csum;
3305 int ret = 0, offset, start;
3306
3307 if (skb->ip_summed != CHECKSUM_PARTIAL)
3308 goto out;
3309
3310 if (unlikely(skb_is_gso(skb)))
3311 goto out;
3312
3313 /* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity: checksum could be wrong.
3315 */
3316 if (unlikely(skb_has_shared_frag(skb))) {
3317 ret = __skb_linearize(skb);
3318 if (ret)
3319 goto out;
3320 }
3321 start = skb_checksum_start_offset(skb);
3322 offset = start + offsetof(struct sctphdr, checksum);
3323 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3324 ret = -EINVAL;
3325 goto out;
3326 }
3327
3328 ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3329 if (ret)
3330 goto out;
3331
3332 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3333 skb->len - start, ~(__u32)0,
3334 crc32c_csum_stub));
3335 *(__le32 *)(skb->data + offset) = crc32c_csum;
3336 skb_reset_csum_not_inet(skb);
3337out:
3338 return ret;
3339}
3340
3341__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3342{
3343 __be16 type = skb->protocol;
3344
3345 /* Tunnel gso handlers can set protocol to ethernet. */
3346 if (type == htons(ETH_P_TEB)) {
3347 struct ethhdr *eth;
3348
3349 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3350 return 0;
3351
3352 eth = (struct ethhdr *)skb->data;
3353 type = eth->h_proto;
3354 }
3355
3356 return vlan_get_protocol_and_depth(skb, type, depth);
3357}
3358
3359
3360/* Take action when hardware reception checksum errors are detected. */
3361#ifdef CONFIG_BUG
3362static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3363{
3364 netdev_err(dev, "hw csum failure\n");
3365 skb_dump(KERN_ERR, skb, true);
3366 dump_stack();
3367}
3368
3369void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3370{
3371 DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3372}
3373EXPORT_SYMBOL(netdev_rx_csum_fault);
3374#endif
3375
3376/* XXX: check that highmem exists at all on the given machine. */
3377static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3378{
3379#ifdef CONFIG_HIGHMEM
3380 int i;
3381
3382 if (!(dev->features & NETIF_F_HIGHDMA)) {
3383 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3384 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3385
3386 if (PageHighMem(skb_frag_page(frag)))
3387 return 1;
3388 }
3389 }
3390#endif
3391 return 0;
3392}
3393
3394/* If MPLS offload request, verify we are testing hardware MPLS features
3395 * instead of standard features for the netdev.
3396 */
3397#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3398static netdev_features_t net_mpls_features(struct sk_buff *skb,
3399 netdev_features_t features,
3400 __be16 type)
3401{
3402 if (eth_p_mpls(type))
3403 features &= skb->dev->mpls_features;
3404
3405 return features;
3406}
3407#else
3408static netdev_features_t net_mpls_features(struct sk_buff *skb,
3409 netdev_features_t features,
3410 __be16 type)
3411{
3412 return features;
3413}
3414#endif
3415
3416static netdev_features_t harmonize_features(struct sk_buff *skb,
3417 netdev_features_t features)
3418{
3419 __be16 type;
3420
3421 type = skb_network_protocol(skb, NULL);
3422 features = net_mpls_features(skb, features, type);
3423
3424 if (skb->ip_summed != CHECKSUM_NONE &&
3425 !can_checksum_protocol(features, type)) {
3426 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3427 }
3428 if (illegal_highdma(skb->dev, skb))
3429 features &= ~NETIF_F_SG;
3430
3431 return features;
3432}
3433
3434netdev_features_t passthru_features_check(struct sk_buff *skb,
3435 struct net_device *dev,
3436 netdev_features_t features)
3437{
3438 return features;
3439}
3440EXPORT_SYMBOL(passthru_features_check);
3441
3442static netdev_features_t dflt_features_check(struct sk_buff *skb,
3443 struct net_device *dev,
3444 netdev_features_t features)
3445{
3446 return vlan_features_check(skb, features);
3447}
3448
3449static netdev_features_t gso_features_check(const struct sk_buff *skb,
3450 struct net_device *dev,
3451 netdev_features_t features)
3452{
3453 u16 gso_segs = skb_shinfo(skb)->gso_segs;
3454
3455 if (gso_segs > READ_ONCE(dev->gso_max_segs))
3456 return features & ~NETIF_F_GSO_MASK;
3457
3458 if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
3459 return features & ~NETIF_F_GSO_MASK;
3460
3461 if (!skb_shinfo(skb)->gso_type) {
3462 skb_warn_bad_offload(skb);
3463 return features & ~NETIF_F_GSO_MASK;
3464 }
3465
3466 /* Support for GSO partial features requires software
3467 * intervention before we can actually process the packets
3468 * so we need to strip support for any partial features now
3469 * and we can pull them back in after we have partially
3470 * segmented the frame.
3471 */
3472 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3473 features &= ~dev->gso_partial_features;
3474
3475 /* Make sure to clear the IPv4 ID mangling feature if the
3476 * IPv4 header has the potential to be fragmented.
3477 */
3478 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3479 struct iphdr *iph = skb->encapsulation ?
3480 inner_ip_hdr(skb) : ip_hdr(skb);
3481
3482 if (!(iph->frag_off & htons(IP_DF)))
3483 features &= ~NETIF_F_TSO_MANGLEID;
3484 }
3485
3486 return features;
3487}
3488
3489netdev_features_t netif_skb_features(struct sk_buff *skb)
3490{
3491 struct net_device *dev = skb->dev;
3492 netdev_features_t features = dev->features;
3493
3494 if (skb_is_gso(skb))
3495 features = gso_features_check(skb, dev, features);
3496
3497 /* If encapsulation offload request, verify we are testing
3498 * hardware encapsulation features instead of standard
3499 * features for the netdev
3500 */
3501 if (skb->encapsulation)
3502 features &= dev->hw_enc_features;
3503
3504 if (skb_vlan_tagged(skb))
3505 features = netdev_intersect_features(features,
3506 dev->vlan_features |
3507 NETIF_F_HW_VLAN_CTAG_TX |
3508 NETIF_F_HW_VLAN_STAG_TX);
3509
3510 if (dev->netdev_ops->ndo_features_check)
3511 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3512 features);
3513 else
3514 features &= dflt_features_check(skb, dev, features);
3515
3516 return harmonize_features(skb, features);
3517}
3518EXPORT_SYMBOL(netif_skb_features);
3519
3520static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3521 struct netdev_queue *txq, bool more)
3522{
3523 unsigned int len;
3524 int rc;
3525
3526 if (dev_nit_active(dev))
3527 dev_queue_xmit_nit(skb, dev);
3528
3529 len = skb->len;
3530 trace_net_dev_start_xmit(skb, dev);
3531 rc = netdev_start_xmit(skb, dev, txq, more);
3532 trace_net_dev_xmit(skb, rc, dev, len);
3533
3534 return rc;
3535}
3536
3537struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3538 struct netdev_queue *txq, int *ret)
3539{
3540 struct sk_buff *skb = first;
3541 int rc = NETDEV_TX_OK;
3542
3543 while (skb) {
3544 struct sk_buff *next = skb->next;
3545
3546 skb_mark_not_on_list(skb);
3547 rc = xmit_one(skb, dev, txq, next != NULL);
3548 if (unlikely(!dev_xmit_complete(rc))) {
3549 skb->next = next;
3550 goto out;
3551 }
3552
3553 skb = next;
3554 if (netif_tx_queue_stopped(txq) && skb) {
3555 rc = NETDEV_TX_BUSY;
3556 break;
3557 }
3558 }
3559
3560out:
3561 *ret = rc;
3562 return skb;
3563}
3564
3565static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3566 netdev_features_t features)
3567{
3568 if (skb_vlan_tag_present(skb) &&
3569 !vlan_hw_offload_capable(features, skb->vlan_proto))
3570 skb = __vlan_hwaccel_push_inside(skb);
3571 return skb;
3572}
3573
3574int skb_csum_hwoffload_help(struct sk_buff *skb,
3575 const netdev_features_t features)
3576{
3577 if (unlikely(skb_csum_is_sctp(skb)))
3578 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3579 skb_crc32c_csum_help(skb);
3580
3581 if (features & NETIF_F_HW_CSUM)
3582 return 0;
3583
3584 if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3585 switch (skb->csum_offset) {
3586 case offsetof(struct tcphdr, check):
3587 case offsetof(struct udphdr, check):
3588 return 0;
3589 }
3590 }
3591
3592 return skb_checksum_help(skb);
3593}
3594EXPORT_SYMBOL(skb_csum_hwoffload_help);
3595
3596static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3597{
3598 netdev_features_t features;
3599
3600 features = netif_skb_features(skb);
3601 skb = validate_xmit_vlan(skb, features);
3602 if (unlikely(!skb))
3603 goto out_null;
3604
3605 skb = sk_validate_xmit_skb(skb, dev);
3606 if (unlikely(!skb))
3607 goto out_null;
3608
3609 if (netif_needs_gso(skb, features)) {
3610 struct sk_buff *segs;
3611
3612 segs = skb_gso_segment(skb, features);
3613 if (IS_ERR(segs)) {
3614 goto out_kfree_skb;
3615 } else if (segs) {
3616 consume_skb(skb);
3617 skb = segs;
3618 }
3619 } else {
3620 if (skb_needs_linearize(skb, features) &&
3621 __skb_linearize(skb))
3622 goto out_kfree_skb;
3623
3624 /* If packet is not checksummed and device does not
3625 * support checksumming for this protocol, complete
3626 * checksumming here.
3627 */
3628 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3629 if (skb->encapsulation)
3630 skb_set_inner_transport_header(skb,
3631 skb_checksum_start_offset(skb));
3632 else
3633 skb_set_transport_header(skb,
3634 skb_checksum_start_offset(skb));
3635 if (skb_csum_hwoffload_help(skb, features))
3636 goto out_kfree_skb;
3637 }
3638 }
3639
3640 skb = validate_xmit_xfrm(skb, features, again);
3641
3642 return skb;
3643
3644out_kfree_skb:
3645 kfree_skb(skb);
3646out_null:
3647 dev_core_stats_tx_dropped_inc(dev);
3648 return NULL;
3649}
3650
3651struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3652{
3653 struct sk_buff *next, *head = NULL, *tail;
3654
3655 for (; skb != NULL; skb = next) {
3656 next = skb->next;
3657 skb_mark_not_on_list(skb);
3658
		/* in case skb won't be segmented, point to itself */
3660 skb->prev = skb;
3661
3662 skb = validate_xmit_skb(skb, dev, again);
3663 if (!skb)
3664 continue;
3665
3666 if (!head)
3667 head = skb;
3668 else
3669 tail->next = skb;
3670 /* If skb was segmented, skb->prev points to
3671 * the last segment. If not, it still contains skb.
3672 */
3673 tail = skb->prev;
3674 }
3675 return head;
3676}
3677EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3678
3679static void qdisc_pkt_len_init(struct sk_buff *skb)
3680{
3681 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3682
3683 qdisc_skb_cb(skb)->pkt_len = skb->len;
3684
	/* To get a more precise estimate of the bytes sent on the wire,
	 * we add the header size of all segments to pkt_len
3687 */
3688 if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3689 u16 gso_segs = shinfo->gso_segs;
3690 unsigned int hdr_len;
3691
3692 /* mac layer + network layer */
3693 hdr_len = skb_transport_offset(skb);
3694
3695 /* + transport layer */
3696 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3697 const struct tcphdr *th;
3698 struct tcphdr _tcphdr;
3699
3700 th = skb_header_pointer(skb, hdr_len,
3701 sizeof(_tcphdr), &_tcphdr);
3702 if (likely(th))
3703 hdr_len += __tcp_hdrlen(th);
3704 } else {
3705 struct udphdr _udphdr;
3706
3707 if (skb_header_pointer(skb, hdr_len,
3708 sizeof(_udphdr), &_udphdr))
3709 hdr_len += sizeof(struct udphdr);
3710 }
3711
3712 if (shinfo->gso_type & SKB_GSO_DODGY)
3713 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3714 shinfo->gso_size);
3715
3716 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3717 }
3718}
3719
3720static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
3721 struct sk_buff **to_free,
3722 struct netdev_queue *txq)
3723{
3724 int rc;
3725
3726 rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
3727 if (rc == NET_XMIT_SUCCESS)
3728 trace_qdisc_enqueue(q, txq, skb);
3729 return rc;
3730}
3731
3732static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3733 struct net_device *dev,
3734 struct netdev_queue *txq)
3735{
3736 spinlock_t *root_lock = qdisc_lock(q);
3737 struct sk_buff *to_free = NULL;
3738 bool contended;
3739 int rc;
3740
3741 qdisc_calculate_pkt_len(skb, q);
3742
3743 tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
3744
3745 if (q->flags & TCQ_F_NOLOCK) {
3746 if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
3747 qdisc_run_begin(q)) {
3748 /* Retest nolock_qdisc_is_empty() within the protection
3749 * of q->seqlock to protect from racing with requeuing.
3750 */
3751 if (unlikely(!nolock_qdisc_is_empty(q))) {
3752 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3753 __qdisc_run(q);
3754 qdisc_run_end(q);
3755
3756 goto no_lock_out;
3757 }
3758
3759 qdisc_bstats_cpu_update(q, skb);
3760 if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
3761 !nolock_qdisc_is_empty(q))
3762 __qdisc_run(q);
3763
3764 qdisc_run_end(q);
3765 return NET_XMIT_SUCCESS;
3766 }
3767
3768 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3769 qdisc_run(q);
3770
3771no_lock_out:
3772 if (unlikely(to_free))
3773 kfree_skb_list_reason(to_free,
3774 tcf_get_drop_reason(to_free));
3775 return rc;
3776 }
3777
3778 if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
3779 kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
3780 return NET_XMIT_DROP;
3781 }
3782 /*
3783 * Heuristic to force contended enqueues to serialize on a
3784 * separate lock before trying to get qdisc main lock.
3785 * This permits qdisc->running owner to get the lock more
3786 * often and dequeue packets faster.
3787 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
3788 * and then other tasks will only enqueue packets. The packets will be
3789 * sent after the qdisc owner is scheduled again. To prevent this
	 * scenario, tasks always serialize on the lock.
3791 */
3792 contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
3793 if (unlikely(contended))
3794 spin_lock(&q->busylock);
3795
3796 spin_lock(root_lock);
3797 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3798 __qdisc_drop(skb, &to_free);
3799 rc = NET_XMIT_DROP;
3800 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3801 qdisc_run_begin(q)) {
3802 /*
3803 * This is a work-conserving queue; there are no old skbs
3804 * waiting to be sent out; and the qdisc is not running -
3805 * xmit the skb directly.
3806 */
3807
3808 qdisc_bstats_update(q, skb);
3809
3810 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3811 if (unlikely(contended)) {
3812 spin_unlock(&q->busylock);
3813 contended = false;
3814 }
3815 __qdisc_run(q);
3816 }
3817
3818 qdisc_run_end(q);
3819 rc = NET_XMIT_SUCCESS;
3820 } else {
3821 WRITE_ONCE(q->owner, smp_processor_id());
3822 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3823 WRITE_ONCE(q->owner, -1);
3824 if (qdisc_run_begin(q)) {
3825 if (unlikely(contended)) {
3826 spin_unlock(&q->busylock);
3827 contended = false;
3828 }
3829 __qdisc_run(q);
3830 qdisc_run_end(q);
3831 }
3832 }
3833 spin_unlock(root_lock);
3834 if (unlikely(to_free))
3835 kfree_skb_list_reason(to_free,
3836 tcf_get_drop_reason(to_free));
3837 if (unlikely(contended))
3838 spin_unlock(&q->busylock);
3839 return rc;
3840}
3841
3842#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3843static void skb_update_prio(struct sk_buff *skb)
3844{
3845 const struct netprio_map *map;
3846 const struct sock *sk;
3847 unsigned int prioidx;
3848
3849 if (skb->priority)
3850 return;
3851 map = rcu_dereference_bh(skb->dev->priomap);
3852 if (!map)
3853 return;
3854 sk = skb_to_full_sk(skb);
3855 if (!sk)
3856 return;
3857
3858 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3859
3860 if (prioidx < map->priomap_len)
3861 skb->priority = map->priomap[prioidx];
3862}
3863#else
3864#define skb_update_prio(skb)
3865#endif
3866
3867/**
3868 * dev_loopback_xmit - loop back @skb
3869 * @net: network namespace this loopback is happening in
3870 * @sk: sk needed to be a netfilter okfn
3871 * @skb: buffer to transmit
3872 */
3873int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3874{
3875 skb_reset_mac_header(skb);
3876 __skb_pull(skb, skb_network_offset(skb));
3877 skb->pkt_type = PACKET_LOOPBACK;
3878 if (skb->ip_summed == CHECKSUM_NONE)
3879 skb->ip_summed = CHECKSUM_UNNECESSARY;
3880 DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
3881 skb_dst_force(skb);
3882 netif_rx(skb);
3883 return 0;
3884}
3885EXPORT_SYMBOL(dev_loopback_xmit);
3886
3887#ifdef CONFIG_NET_EGRESS
3888static struct netdev_queue *
3889netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
3890{
3891 int qm = skb_get_queue_mapping(skb);
3892
3893 return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
3894}
3895
3896static bool netdev_xmit_txqueue_skipped(void)
3897{
3898 return __this_cpu_read(softnet_data.xmit.skip_txqueue);
3899}
3900
3901void netdev_xmit_skip_txqueue(bool skip)
3902{
3903 __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
3904}
3905EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
3906#endif /* CONFIG_NET_EGRESS */
3907
3908#ifdef CONFIG_NET_XGRESS
3909static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
3910 enum skb_drop_reason *drop_reason)
3911{
3912 int ret = TC_ACT_UNSPEC;
3913#ifdef CONFIG_NET_CLS_ACT
3914 struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
3915 struct tcf_result res;
3916
3917 if (!miniq)
3918 return ret;
3919
3920 tc_skb_cb(skb)->mru = 0;
3921 tc_skb_cb(skb)->post_ct = false;
3922 tcf_set_drop_reason(skb, *drop_reason);
3923
3924 mini_qdisc_bstats_cpu_update(miniq, skb);
3925 ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
3926 /* Only tcf related quirks below. */
3927 switch (ret) {
3928 case TC_ACT_SHOT:
3929 *drop_reason = tcf_get_drop_reason(skb);
3930 mini_qdisc_qstats_cpu_drop(miniq);
3931 break;
3932 case TC_ACT_OK:
3933 case TC_ACT_RECLASSIFY:
3934 skb->tc_index = TC_H_MIN(res.classid);
3935 break;
3936 }
3937#endif /* CONFIG_NET_CLS_ACT */
3938 return ret;
3939}
3940
3941static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
3942
3943void tcx_inc(void)
3944{
3945 static_branch_inc(&tcx_needed_key);
3946}
3947
3948void tcx_dec(void)
3949{
3950 static_branch_dec(&tcx_needed_key);
3951}
3952
3953static __always_inline enum tcx_action_base
3954tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
3955 const bool needs_mac)
3956{
3957 const struct bpf_mprog_fp *fp;
3958 const struct bpf_prog *prog;
3959 int ret = TCX_NEXT;
3960
3961 if (needs_mac)
3962 __skb_push(skb, skb->mac_len);
3963 bpf_mprog_foreach_prog(entry, fp, prog) {
3964 bpf_compute_data_pointers(skb);
3965 ret = bpf_prog_run(prog, skb);
3966 if (ret != TCX_NEXT)
3967 break;
3968 }
3969 if (needs_mac)
3970 __skb_pull(skb, skb->mac_len);
3971 return tcx_action_code(skb, ret);
3972}
3973
3974static __always_inline struct sk_buff *
3975sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3976 struct net_device *orig_dev, bool *another)
3977{
3978 struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
3979 enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
3980 int sch_ret;
3981
3982 if (!entry)
3983 return skb;
3984 if (*pt_prev) {
3985 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3986 *pt_prev = NULL;
3987 }
3988
3989 qdisc_skb_cb(skb)->pkt_len = skb->len;
3990 tcx_set_ingress(skb, true);
3991
3992 if (static_branch_unlikely(&tcx_needed_key)) {
3993 sch_ret = tcx_run(entry, skb, true);
3994 if (sch_ret != TC_ACT_UNSPEC)
3995 goto ingress_verdict;
3996 }
3997 sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
3998ingress_verdict:
3999 switch (sch_ret) {
4000 case TC_ACT_REDIRECT:
4001 /* skb_mac_header check was done by BPF, so we can safely
4002 * push the L2 header back before redirecting to another
4003 * netdev.
4004 */
4005 __skb_push(skb, skb->mac_len);
4006 if (skb_do_redirect(skb) == -EAGAIN) {
4007 __skb_pull(skb, skb->mac_len);
4008 *another = true;
4009 break;
4010 }
4011 *ret = NET_RX_SUCCESS;
4012 return NULL;
4013 case TC_ACT_SHOT:
4014 kfree_skb_reason(skb, drop_reason);
4015 *ret = NET_RX_DROP;
4016 return NULL;
4017 /* used by tc_run */
4018 case TC_ACT_STOLEN:
4019 case TC_ACT_QUEUED:
4020 case TC_ACT_TRAP:
4021 consume_skb(skb);
4022 fallthrough;
4023 case TC_ACT_CONSUMED:
4024 *ret = NET_RX_SUCCESS;
4025 return NULL;
4026 }
4027
4028 return skb;
4029}
4030
4031static __always_inline struct sk_buff *
4032sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4033{
4034 struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
4035 enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
4036 int sch_ret;
4037
4038 if (!entry)
4039 return skb;
4040
4041 /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
4042 * already set by the caller.
4043 */
4044 if (static_branch_unlikely(&tcx_needed_key)) {
4045 sch_ret = tcx_run(entry, skb, false);
4046 if (sch_ret != TC_ACT_UNSPEC)
4047 goto egress_verdict;
4048 }
4049 sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4050egress_verdict:
4051 switch (sch_ret) {
4052 case TC_ACT_REDIRECT:
4053 /* No need to push/pop skb's mac_header here on egress! */
4054 skb_do_redirect(skb);
4055 *ret = NET_XMIT_SUCCESS;
4056 return NULL;
4057 case TC_ACT_SHOT:
4058 kfree_skb_reason(skb, drop_reason);
4059 *ret = NET_XMIT_DROP;
4060 return NULL;
4061 /* used by tc_run */
4062 case TC_ACT_STOLEN:
4063 case TC_ACT_QUEUED:
4064 case TC_ACT_TRAP:
4065 consume_skb(skb);
4066 fallthrough;
4067 case TC_ACT_CONSUMED:
4068 *ret = NET_XMIT_SUCCESS;
4069 return NULL;
4070 }
4071
4072 return skb;
4073}
4074#else
4075static __always_inline struct sk_buff *
4076sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4077 struct net_device *orig_dev, bool *another)
4078{
4079 return skb;
4080}
4081
4082static __always_inline struct sk_buff *
4083sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4084{
4085 return skb;
4086}
4087#endif /* CONFIG_NET_XGRESS */
4088
4089#ifdef CONFIG_XPS
4090static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
4091 struct xps_dev_maps *dev_maps, unsigned int tci)
4092{
4093 int tc = netdev_get_prio_tc_map(dev, skb->priority);
4094 struct xps_map *map;
4095 int queue_index = -1;
4096
4097 if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
4098 return queue_index;
4099
4100 tci *= dev_maps->num_tc;
4101 tci += tc;
4102
4103 map = rcu_dereference(dev_maps->attr_map[tci]);
4104 if (map) {
4105 if (map->len == 1)
4106 queue_index = map->queues[0];
4107 else
4108 queue_index = map->queues[reciprocal_scale(
4109 skb_get_hash(skb), map->len)];
4110 if (unlikely(queue_index >= dev->real_num_tx_queues))
4111 queue_index = -1;
4112 }
4113 return queue_index;
4114}
4115#endif
4116
4117static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
4118 struct sk_buff *skb)
4119{
4120#ifdef CONFIG_XPS
4121 struct xps_dev_maps *dev_maps;
4122 struct sock *sk = skb->sk;
4123 int queue_index = -1;
4124
4125 if (!static_key_false(&xps_needed))
4126 return -1;
4127
4128 rcu_read_lock();
4129 if (!static_key_false(&xps_rxqs_needed))
4130 goto get_cpus_map;
4131
4132 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
4133 if (dev_maps) {
4134 int tci = sk_rx_queue_get(sk);
4135
4136 if (tci >= 0)
4137 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4138 tci);
4139 }
4140
4141get_cpus_map:
4142 if (queue_index < 0) {
4143 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
4144 if (dev_maps) {
4145 unsigned int tci = skb->sender_cpu - 1;
4146
4147 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4148 tci);
4149 }
4150 }
4151 rcu_read_unlock();
4152
4153 return queue_index;
4154#else
4155 return -1;
4156#endif
4157}
4158
4159u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4160 struct net_device *sb_dev)
4161{
4162 return 0;
4163}
4164EXPORT_SYMBOL(dev_pick_tx_zero);
4165
4166u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4167 struct net_device *sb_dev)
4168{
4169 return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4170}
4171EXPORT_SYMBOL(dev_pick_tx_cpu_id);
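/* Usage sketch (the ops structure below is hypothetical): drivers that do
 * not need their own queue-selection logic can wire one of these helpers
 * into their netdev_ops, since both match the ndo_select_queue() signature:
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_start_xmit		= foo_start_xmit,
 *		.ndo_select_queue	= dev_pick_tx_cpu_id,
 *	};
 *
 * dev_pick_tx_zero() pins all traffic to queue 0, dev_pick_tx_cpu_id()
 * spreads it by the transmitting CPU.
 */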
4172
4173u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4174 struct net_device *sb_dev)
4175{
4176 struct sock *sk = skb->sk;
4177 int queue_index = sk_tx_queue_get(sk);
4178
4179 sb_dev = sb_dev ? : dev;
4180
4181 if (queue_index < 0 || skb->ooo_okay ||
4182 queue_index >= dev->real_num_tx_queues) {
4183 int new_index = get_xps_queue(dev, sb_dev, skb);
4184
4185 if (new_index < 0)
4186 new_index = skb_tx_hash(dev, sb_dev, skb);
4187
4188 if (queue_index != new_index && sk &&
4189 sk_fullsock(sk) &&
4190 rcu_access_pointer(sk->sk_dst_cache))
4191 sk_tx_queue_set(sk, new_index);
4192
4193 queue_index = new_index;
4194 }
4195
4196 return queue_index;
4197}
4198EXPORT_SYMBOL(netdev_pick_tx);
4199
4200struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4201 struct sk_buff *skb,
4202 struct net_device *sb_dev)
4203{
4204 int queue_index = 0;
4205
4206#ifdef CONFIG_XPS
4207 u32 sender_cpu = skb->sender_cpu - 1;
4208
4209 if (sender_cpu >= (u32)NR_CPUS)
4210 skb->sender_cpu = raw_smp_processor_id() + 1;
4211#endif
4212
4213 if (dev->real_num_tx_queues != 1) {
4214 const struct net_device_ops *ops = dev->netdev_ops;
4215
4216 if (ops->ndo_select_queue)
4217 queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4218 else
4219 queue_index = netdev_pick_tx(dev, skb, sb_dev);
4220
4221 queue_index = netdev_cap_txqueue(dev, queue_index);
4222 }
4223
4224 skb_set_queue_mapping(skb, queue_index);
4225 return netdev_get_tx_queue(dev, queue_index);
4226}
4227
4228/**
4229 * __dev_queue_xmit() - transmit a buffer
4230 * @skb: buffer to transmit
4231 * @sb_dev: subordinate device used for L2 forwarding offload
4232 *
4233 * Queue a buffer for transmission to a network device. The caller must
4234 * have set the device and priority and built the buffer before calling
4235 * this function. The function can be called from an interrupt.
4236 *
4237 * When calling this method, interrupts MUST be enabled. This is because
4238 * the BH enable code must have IRQs enabled so that it will not deadlock.
4239 *
4240 * Regardless of the return value, the skb is consumed, so it is currently
4241 * difficult to retry a send with this function. (You can bump the ref count
4242 * before sending to hold a reference for retry if you are careful.)
4243 *
4244 * Return:
4245 * * 0 - buffer successfully transmitted
4246 * * positive qdisc return code - NET_XMIT_DROP etc.
4247 * * negative errno - other errors
4248 */
4249int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4250{
4251 struct net_device *dev = skb->dev;
4252 struct netdev_queue *txq = NULL;
4253 struct Qdisc *q;
4254 int rc = -ENOMEM;
4255 bool again = false;
4256
4257 skb_reset_mac_header(skb);
4258 skb_assert_len(skb);
4259
4260 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4261 __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4262
4263 /* Disable soft irqs for various locks below. Also
4264 * stops preemption for RCU.
4265 */
4266 rcu_read_lock_bh();
4267
4268 skb_update_prio(skb);
4269
4270 qdisc_pkt_len_init(skb);
4271 tcx_set_ingress(skb, false);
4272#ifdef CONFIG_NET_EGRESS
4273 if (static_branch_unlikely(&egress_needed_key)) {
4274 if (nf_hook_egress_active()) {
4275 skb = nf_hook_egress(skb, &rc, dev);
4276 if (!skb)
4277 goto out;
4278 }
4279
4280 netdev_xmit_skip_txqueue(false);
4281
4282 nf_skip_egress(skb, true);
4283 skb = sch_handle_egress(skb, &rc, dev);
4284 if (!skb)
4285 goto out;
4286 nf_skip_egress(skb, false);
4287
4288 if (netdev_xmit_txqueue_skipped())
4289 txq = netdev_tx_queue_mapping(dev, skb);
4290 }
4291#endif
4292 /* If device/qdisc don't need skb->dst, release it right now while
4293 * it is hot in this CPU's cache.
4294 */
4295 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4296 skb_dst_drop(skb);
4297 else
4298 skb_dst_force(skb);
4299
4300 if (!txq)
4301 txq = netdev_core_pick_tx(dev, skb, sb_dev);
4302
4303 q = rcu_dereference_bh(txq->qdisc);
4304
4305 trace_net_dev_queue(skb);
4306 if (q->enqueue) {
4307 rc = __dev_xmit_skb(skb, q, dev, txq);
4308 goto out;
4309 }
4310
4311 /* The device has no queue. Common case for software devices:
4312 * loopback, all the sorts of tunnels...
4313 *
4314 * Really, it is unlikely that netif_tx_lock protection is necessary
4315 * here. (e.g. loopback and IP tunnels are clean ignoring statistics
4316 * counters.)
4317 * However, it is possible that they rely on protection
4318 * made by us here.
4319 *
4320 * Check this and take the lock. It is not prone to deadlocks.
4321 * Either way, the noqueue qdisc path is even simpler 8)
4322 */
4323 if (dev->flags & IFF_UP) {
4324 int cpu = smp_processor_id(); /* ok because BHs are off */
4325
4326 /* Other cpus might concurrently change txq->xmit_lock_owner
4327 * to -1 or to their cpu id, but not to our id.
4328 */
4329 if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4330 if (dev_xmit_recursion())
4331 goto recursion_alert;
4332
4333 skb = validate_xmit_skb(skb, dev, &again);
4334 if (!skb)
4335 goto out;
4336
4337 HARD_TX_LOCK(dev, txq, cpu);
4338
4339 if (!netif_xmit_stopped(txq)) {
4340 dev_xmit_recursion_inc();
4341 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4342 dev_xmit_recursion_dec();
4343 if (dev_xmit_complete(rc)) {
4344 HARD_TX_UNLOCK(dev, txq);
4345 goto out;
4346 }
4347 }
4348 HARD_TX_UNLOCK(dev, txq);
4349 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4350 dev->name);
4351 } else {
4352 /* Recursion is detected! It is possible,
4353 * unfortunately
4354 */
4355recursion_alert:
4356 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4357 dev->name);
4358 }
4359 }
4360
4361 rc = -ENETDOWN;
4362 rcu_read_unlock_bh();
4363
4364 dev_core_stats_tx_dropped_inc(dev);
4365 kfree_skb_list(skb);
4366 return rc;
4367out:
4368 rcu_read_unlock_bh();
4369 return rc;
4370}
4371EXPORT_SYMBOL(__dev_queue_xmit);
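/* Minimal transmit sketch, assuming a fully built skb (identifiers other
 * than the kernel APIs are illustrative). Most callers use the
 * dev_queue_xmit() wrapper, which is __dev_queue_xmit(skb, NULL):
 *
 *	skb->dev = dev;
 *	skb->priority = prio;
 *	// dev_queue_xmit() consumes the skb regardless of the return value
 *	rc = dev_queue_xmit(skb);
 *	if (rc != NET_XMIT_SUCCESS)
 *		pr_debug("xmit returned %d\n", rc);
 */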
4372
4373int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4374{
4375 struct net_device *dev = skb->dev;
4376 struct sk_buff *orig_skb = skb;
4377 struct netdev_queue *txq;
4378 int ret = NETDEV_TX_BUSY;
4379 bool again = false;
4380
4381 if (unlikely(!netif_running(dev) ||
4382 !netif_carrier_ok(dev)))
4383 goto drop;
4384
4385 skb = validate_xmit_skb_list(skb, dev, &again);
4386 if (skb != orig_skb)
4387 goto drop;
4388
4389 skb_set_queue_mapping(skb, queue_id);
4390 txq = skb_get_tx_queue(dev, skb);
4391
4392 local_bh_disable();
4393
4394 dev_xmit_recursion_inc();
4395 HARD_TX_LOCK(dev, txq, smp_processor_id());
4396 if (!netif_xmit_frozen_or_drv_stopped(txq))
4397 ret = netdev_start_xmit(skb, dev, txq, false);
4398 HARD_TX_UNLOCK(dev, txq);
4399 dev_xmit_recursion_dec();
4400
4401 local_bh_enable();
4402 return ret;
4403drop:
4404 dev_core_stats_tx_dropped_inc(dev);
4405 kfree_skb_list(skb);
4406 return NET_XMIT_DROP;
4407}
4408EXPORT_SYMBOL(__dev_direct_xmit);
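/* Hedged note: most users reach this through the dev_direct_xmit() wrapper
 * in netdevice.h, which additionally frees the skb when the driver reports
 * NETDEV_TX_BUSY; the AF_PACKET/AF_XDP bypass transmit paths are typical
 * callers. A sketch of the calling convention (stats is illustrative):
 *
 *	ret = dev_direct_xmit(skb, queue_index);
 *	if (ret != NET_XMIT_SUCCESS)
 *		stats->tx_dropped++;	// the skb is no longer ours either way
 */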
4409
4410/*************************************************************************
4411 * Receiver routines
4412 *************************************************************************/
4413
4414unsigned int sysctl_skb_defer_max __read_mostly = 64;
4415int weight_p __read_mostly = 64; /* old backlog weight */
4416int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
4417int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
4418
4419/* Called with irq disabled */
4420static inline void ____napi_schedule(struct softnet_data *sd,
4421 struct napi_struct *napi)
4422{
4423 struct task_struct *thread;
4424
4425 lockdep_assert_irqs_disabled();
4426
4427 if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4428 /* Paired with smp_mb__before_atomic() in
4429 * napi_enable()/dev_set_threaded().
4430 * Use READ_ONCE() to guarantee a complete
4431 * read on napi->thread. Only call
4432 * wake_up_process() when it's not NULL.
4433 */
4434 thread = READ_ONCE(napi->thread);
4435 if (thread) {
4436 /* Avoid doing set_bit() if the thread is in
4437 * INTERRUPTIBLE state, because napi_thread_wait()
4438 * makes sure to proceed with napi polling
4439 * if the thread is explicitly woken from here.
4440 */
4441 if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
4442 set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4443 wake_up_process(thread);
4444 return;
4445 }
4446 }
4447
4448 list_add_tail(&napi->poll_list, &sd->poll_list);
4449 WRITE_ONCE(napi->list_owner, smp_processor_id());
4450 /* If not called from net_rx_action()
4451 * we have to raise NET_RX_SOFTIRQ.
4452 */
4453 if (!sd->in_net_rx_action)
4454 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4455}
4456
4457#ifdef CONFIG_RPS
4458
4459struct static_key_false rps_needed __read_mostly;
4460EXPORT_SYMBOL(rps_needed);
4461struct static_key_false rfs_needed __read_mostly;
4462EXPORT_SYMBOL(rfs_needed);
4463
4464static struct rps_dev_flow *
4465set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4466 struct rps_dev_flow *rflow, u16 next_cpu)
4467{
4468 if (next_cpu < nr_cpu_ids) {
4469#ifdef CONFIG_RFS_ACCEL
4470 struct netdev_rx_queue *rxqueue;
4471 struct rps_dev_flow_table *flow_table;
4472 struct rps_dev_flow *old_rflow;
4473 u32 flow_id;
4474 u16 rxq_index;
4475 int rc;
4476
4477 /* Should we steer this flow to a different hardware queue? */
4478 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4479 !(dev->features & NETIF_F_NTUPLE))
4480 goto out;
4481 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4482 if (rxq_index == skb_get_rx_queue(skb))
4483 goto out;
4484
4485 rxqueue = dev->_rx + rxq_index;
4486 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4487 if (!flow_table)
4488 goto out;
4489 flow_id = skb_get_hash(skb) & flow_table->mask;
4490 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4491 rxq_index, flow_id);
4492 if (rc < 0)
4493 goto out;
4494 old_rflow = rflow;
4495 rflow = &flow_table->flows[flow_id];
4496 rflow->filter = rc;
4497 if (old_rflow->filter == rflow->filter)
4498 old_rflow->filter = RPS_NO_FILTER;
4499 out:
4500#endif
4501 rflow->last_qtail =
4502 per_cpu(softnet_data, next_cpu).input_queue_head;
4503 }
4504
4505 rflow->cpu = next_cpu;
4506 return rflow;
4507}
4508
4509/*
4510 * get_rps_cpu is called from netif_receive_skb and returns the target
4511 * CPU from the RPS map of the receiving queue for a given skb.
4512 * rcu_read_lock must be held on entry.
4513 */
4514static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4515 struct rps_dev_flow **rflowp)
4516{
4517 const struct rps_sock_flow_table *sock_flow_table;
4518 struct netdev_rx_queue *rxqueue = dev->_rx;
4519 struct rps_dev_flow_table *flow_table;
4520 struct rps_map *map;
4521 int cpu = -1;
4522 u32 tcpu;
4523 u32 hash;
4524
4525 if (skb_rx_queue_recorded(skb)) {
4526 u16 index = skb_get_rx_queue(skb);
4527
4528 if (unlikely(index >= dev->real_num_rx_queues)) {
4529 WARN_ONCE(dev->real_num_rx_queues > 1,
4530 "%s received packet on queue %u, but number "
4531 "of RX queues is %u\n",
4532 dev->name, index, dev->real_num_rx_queues);
4533 goto done;
4534 }
4535 rxqueue += index;
4536 }
4537
4538 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4539
4540 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4541 map = rcu_dereference(rxqueue->rps_map);
4542 if (!flow_table && !map)
4543 goto done;
4544
4545 skb_reset_network_header(skb);
4546 hash = skb_get_hash(skb);
4547 if (!hash)
4548 goto done;
4549
4550 sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
4551 if (flow_table && sock_flow_table) {
4552 struct rps_dev_flow *rflow;
4553 u32 next_cpu;
4554 u32 ident;
4555
4556 /* First check into global flow table if there is a match.
4557 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4558 */
4559 ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4560 if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
4561 goto try_rps;
4562
4563 next_cpu = ident & net_hotdata.rps_cpu_mask;
4564
4565 /* OK, now we know there is a match,
4566 * we can look at the local (per receive queue) flow table
4567 */
4568 rflow = &flow_table->flows[hash & flow_table->mask];
4569 tcpu = rflow->cpu;
4570
4571 /*
4572 * If the desired CPU (where last recvmsg was done) is
4573 * different from current CPU (one in the rx-queue flow
4574 * table entry), switch if one of the following holds:
4575 * - Current CPU is unset (>= nr_cpu_ids).
4576 * - Current CPU is offline.
4577 * - The current CPU's queue tail has advanced beyond the
4578 * last packet that was enqueued using this table entry.
4579 * This guarantees that all previous packets for the flow
4580 * have been dequeued, thus preserving in order delivery.
4581 */
4582 if (unlikely(tcpu != next_cpu) &&
4583 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4584 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4585 rflow->last_qtail)) >= 0)) {
4586 tcpu = next_cpu;
4587 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4588 }
4589
4590 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4591 *rflowp = rflow;
4592 cpu = tcpu;
4593 goto done;
4594 }
4595 }
4596
4597try_rps:
4598
4599 if (map) {
4600 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4601 if (cpu_online(tcpu)) {
4602 cpu = tcpu;
4603 goto done;
4604 }
4605 }
4606
4607done:
4608 return cpu;
4609}
4610
4611#ifdef CONFIG_RFS_ACCEL
4612
4613/**
4614 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4615 * @dev: Device on which the filter was set
4616 * @rxq_index: RX queue index
4617 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4618 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4619 *
4620 * Drivers that implement ndo_rx_flow_steer() should periodically call
4621 * this function for each installed filter and remove the filters for
4622 * which it returns %true.
4623 */
4624bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4625 u32 flow_id, u16 filter_id)
4626{
4627 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4628 struct rps_dev_flow_table *flow_table;
4629 struct rps_dev_flow *rflow;
4630 bool expire = true;
4631 unsigned int cpu;
4632
4633 rcu_read_lock();
4634 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4635 if (flow_table && flow_id <= flow_table->mask) {
4636 rflow = &flow_table->flows[flow_id];
4637 cpu = READ_ONCE(rflow->cpu);
4638 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4639 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4640 rflow->last_qtail) <
4641 (int)(10 * flow_table->mask)))
4642 expire = false;
4643 }
4644 rcu_read_unlock();
4645 return expire;
4646}
4647EXPORT_SYMBOL(rps_may_expire_flow);
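/* Expected driver-side pattern (a sketch; the foo_* names are made up):
 * a driver that installed an aRFS filter from ndo_rx_flow_steer() keeps the
 * flow_id it was given and the filter_id it returned, and periodically asks
 * whether the filter may be removed:
 *
 *	if (rps_may_expire_flow(adapter->netdev, filter->rxq_index,
 *				filter->flow_id, filter->filter_id))
 *		foo_del_hw_filter(adapter, filter);
 */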
4648
4649#endif /* CONFIG_RFS_ACCEL */
4650
4651/* Called from hardirq (IPI) context */
4652static void rps_trigger_softirq(void *data)
4653{
4654 struct softnet_data *sd = data;
4655
4656 ____napi_schedule(sd, &sd->backlog);
4657 sd->received_rps++;
4658}
4659
4660#endif /* CONFIG_RPS */
4661
4662/* Called from hardirq (IPI) context */
4663static void trigger_rx_softirq(void *data)
4664{
4665 struct softnet_data *sd = data;
4666
4667 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4668 smp_store_release(&sd->defer_ipi_scheduled, 0);
4669}
4670
4671/*
4672 * After we queued a packet into sd->input_pkt_queue,
4673 * we need to make sure this queue is serviced soon.
4674 *
4675 * - If this is another cpu queue, link it to our rps_ipi_list,
4676 * and make sure we will process rps_ipi_list from net_rx_action().
4677 *
4678 * - If this is our own queue, NAPI schedule our backlog.
4679 * Note that this also raises NET_RX_SOFTIRQ.
4680 */
4681static void napi_schedule_rps(struct softnet_data *sd)
4682{
4683 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4684
4685#ifdef CONFIG_RPS
4686 if (sd != mysd) {
4687 sd->rps_ipi_next = mysd->rps_ipi_list;
4688 mysd->rps_ipi_list = sd;
4689
4690 /* If not called from net_rx_action() or napi_threaded_poll()
4691 * we have to raise NET_RX_SOFTIRQ.
4692 */
4693 if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
4694 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4695 return;
4696 }
4697#endif /* CONFIG_RPS */
4698 __napi_schedule_irqoff(&mysd->backlog);
4699}
4700
4701#ifdef CONFIG_NET_FLOW_LIMIT
4702int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4703#endif
4704
4705static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4706{
4707#ifdef CONFIG_NET_FLOW_LIMIT
4708 struct sd_flow_limit *fl;
4709 struct softnet_data *sd;
4710 unsigned int old_flow, new_flow;
4711
4712 if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
4713 return false;
4714
4715 sd = this_cpu_ptr(&softnet_data);
4716
4717 rcu_read_lock();
4718 fl = rcu_dereference(sd->flow_limit);
4719 if (fl) {
4720 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4721 old_flow = fl->history[fl->history_head];
4722 fl->history[fl->history_head] = new_flow;
4723
4724 fl->history_head++;
4725 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4726
4727 if (likely(fl->buckets[old_flow]))
4728 fl->buckets[old_flow]--;
4729
4730 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4731 fl->count++;
4732 rcu_read_unlock();
4733 return true;
4734 }
4735 }
4736 rcu_read_unlock();
4737#endif
4738 return false;
4739}
4740
4741/*
4742 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4743 * queue (may be a remote CPU queue).
4744 */
4745static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4746 unsigned int *qtail)
4747{
4748 enum skb_drop_reason reason;
4749 struct softnet_data *sd;
4750 unsigned long flags;
4751 unsigned int qlen;
4752
4753 reason = SKB_DROP_REASON_NOT_SPECIFIED;
4754 sd = &per_cpu(softnet_data, cpu);
4755
4756 rps_lock_irqsave(sd, &flags);
4757 if (!netif_running(skb->dev))
4758 goto drop;
4759 qlen = skb_queue_len(&sd->input_pkt_queue);
4760 if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
4761 !skb_flow_limit(skb, qlen)) {
4762 if (qlen) {
4763enqueue:
4764 __skb_queue_tail(&sd->input_pkt_queue, skb);
4765 input_queue_tail_incr_save(sd, qtail);
4766 rps_unlock_irq_restore(sd, &flags);
4767 return NET_RX_SUCCESS;
4768 }
4769
4770 /* Schedule NAPI for backlog device
4771 * We can use a non-atomic operation since we own the queue lock
4772 */
4773 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
4774 napi_schedule_rps(sd);
4775 goto enqueue;
4776 }
4777 reason = SKB_DROP_REASON_CPU_BACKLOG;
4778
4779drop:
4780 sd->dropped++;
4781 rps_unlock_irq_restore(sd, &flags);
4782
4783 dev_core_stats_rx_dropped_inc(skb->dev);
4784 kfree_skb_reason(skb, reason);
4785 return NET_RX_DROP;
4786}
4787
4788static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4789{
4790 struct net_device *dev = skb->dev;
4791 struct netdev_rx_queue *rxqueue;
4792
4793 rxqueue = dev->_rx;
4794
4795 if (skb_rx_queue_recorded(skb)) {
4796 u16 index = skb_get_rx_queue(skb);
4797
4798 if (unlikely(index >= dev->real_num_rx_queues)) {
4799 WARN_ONCE(dev->real_num_rx_queues > 1,
4800 "%s received packet on queue %u, but number "
4801 "of RX queues is %u\n",
4802 dev->name, index, dev->real_num_rx_queues);
4803
4804 return rxqueue; /* Return first rxqueue */
4805 }
4806 rxqueue += index;
4807 }
4808 return rxqueue;
4809}
4810
4811u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
4812 struct bpf_prog *xdp_prog)
4813{
4814 void *orig_data, *orig_data_end, *hard_start;
4815 struct netdev_rx_queue *rxqueue;
4816 bool orig_bcast, orig_host;
4817 u32 mac_len, frame_sz;
4818 __be16 orig_eth_type;
4819 struct ethhdr *eth;
4820 u32 metalen, act;
4821 int off;
4822
4823 /* The XDP program wants to see the packet starting at the MAC
4824 * header.
4825 */
4826 mac_len = skb->data - skb_mac_header(skb);
4827 hard_start = skb->data - skb_headroom(skb);
4828
4829 /* SKB "head" area always have tailroom for skb_shared_info */
4830 frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4831 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4832
4833 rxqueue = netif_get_rxqueue(skb);
4834 xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4835 xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4836 skb_headlen(skb) + mac_len, true);
4837 if (skb_is_nonlinear(skb)) {
4838 skb_shinfo(skb)->xdp_frags_size = skb->data_len;
4839 xdp_buff_set_frags_flag(xdp);
4840 } else {
4841 xdp_buff_clear_frags_flag(xdp);
4842 }
4843
4844 orig_data_end = xdp->data_end;
4845 orig_data = xdp->data;
4846 eth = (struct ethhdr *)xdp->data;
4847 orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
4848 orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4849 orig_eth_type = eth->h_proto;
4850
4851 act = bpf_prog_run_xdp(xdp_prog, xdp);
4852
4853 /* check if bpf_xdp_adjust_head was used */
4854 off = xdp->data - orig_data;
4855 if (off) {
4856 if (off > 0)
4857 __skb_pull(skb, off);
4858 else if (off < 0)
4859 __skb_push(skb, -off);
4860
4861 skb->mac_header += off;
4862 skb_reset_network_header(skb);
4863 }
4864
4865 /* check if bpf_xdp_adjust_tail was used */
4866 off = xdp->data_end - orig_data_end;
4867 if (off != 0) {
4868 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4869 skb->len += off; /* positive on grow, negative on shrink */
4870 }
4871
4872 /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
4873 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
4874 */
4875 if (xdp_buff_has_frags(xdp))
4876 skb->data_len = skb_shinfo(skb)->xdp_frags_size;
4877 else
4878 skb->data_len = 0;
4879
4880 /* check if XDP changed the eth hdr such that the SKB needs an update */
4881 eth = (struct ethhdr *)xdp->data;
4882 if ((orig_eth_type != eth->h_proto) ||
4883 (orig_host != ether_addr_equal_64bits(eth->h_dest,
4884 skb->dev->dev_addr)) ||
4885 (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4886 __skb_push(skb, ETH_HLEN);
4887 skb->pkt_type = PACKET_HOST;
4888 skb->protocol = eth_type_trans(skb, skb->dev);
4889 }
4890
4891 /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
4892 * before calling us again on redirect path. We do not call do_redirect
4893 * as we leave that up to the caller.
4894 *
4895 * Caller is responsible for managing lifetime of skb (i.e. calling
4896 * kfree_skb in response to actions it cannot handle/XDP_DROP).
4897 */
4898 switch (act) {
4899 case XDP_REDIRECT:
4900 case XDP_TX:
4901 __skb_push(skb, mac_len);
4902 break;
4903 case XDP_PASS:
4904 metalen = xdp->data - xdp->data_meta;
4905 if (metalen)
4906 skb_metadata_set(skb, metalen);
4907 break;
4908 }
4909
4910 return act;
4911}
4912
4913static int
4914netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
4915{
4916 struct sk_buff *skb = *pskb;
4917 int err, hroom, troom;
4918
4919 if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
4920 return 0;
4921
4922 /* In case we have to go down the path and also linearize,
4923 * then lets do the pskb_expand_head() work just once here.
4924 */
4925 hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4926 troom = skb->tail + skb->data_len - skb->end;
4927 err = pskb_expand_head(skb,
4928 hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4929 troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
4930 if (err)
4931 return err;
4932
4933 return skb_linearize(skb);
4934}
4935
4936static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
4937 struct xdp_buff *xdp,
4938 struct bpf_prog *xdp_prog)
4939{
4940 struct sk_buff *skb = *pskb;
4941 u32 mac_len, act = XDP_DROP;
4942
4943 /* Reinjected packets coming from act_mirred or similar should
4944 * not get XDP generic processing.
4945 */
4946 if (skb_is_redirected(skb))
4947 return XDP_PASS;
4948
4949 /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
4950 * bytes. This is the guarantee that native XDP also provides,
4951 * thus we need to do it here as well.
4952 */
4953 mac_len = skb->data - skb_mac_header(skb);
4954 __skb_push(skb, mac_len);
4955
4956 if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4957 skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4958 if (netif_skb_check_for_xdp(pskb, xdp_prog))
4959 goto do_drop;
4960 }
4961
4962 __skb_pull(*pskb, mac_len);
4963
4964 act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
4965 switch (act) {
4966 case XDP_REDIRECT:
4967 case XDP_TX:
4968 case XDP_PASS:
4969 break;
4970 default:
4971 bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
4972 fallthrough;
4973 case XDP_ABORTED:
4974 trace_xdp_exception((*pskb)->dev, xdp_prog, act);
4975 fallthrough;
4976 case XDP_DROP:
4977 do_drop:
4978 kfree_skb(*pskb);
4979 break;
4980 }
4981
4982 return act;
4983}
4984
4985/* When doing generic XDP we have to bypass the qdisc layer and the
4986 * network taps in order to match in-driver-XDP behavior. This also means
4987 * that XDP packets are able to starve other packets going through a qdisc,
4988 * and DDoS attacks will be more effective. In-driver XDP uses dedicated TX
4989 * queues, so it does not have this starvation issue.
4990 */
4991void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4992{
4993 struct net_device *dev = skb->dev;
4994 struct netdev_queue *txq;
4995 bool free_skb = true;
4996 int cpu, rc;
4997
4998 txq = netdev_core_pick_tx(dev, skb, NULL);
4999 cpu = smp_processor_id();
5000 HARD_TX_LOCK(dev, txq, cpu);
5001 if (!netif_xmit_frozen_or_drv_stopped(txq)) {
5002 rc = netdev_start_xmit(skb, dev, txq, 0);
5003 if (dev_xmit_complete(rc))
5004 free_skb = false;
5005 }
5006 HARD_TX_UNLOCK(dev, txq);
5007 if (free_skb) {
5008 trace_xdp_exception(dev, xdp_prog, XDP_TX);
5009 dev_core_stats_tx_dropped_inc(dev);
5010 kfree_skb(skb);
5011 }
5012}
5013
5014static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
5015
5016int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
5017{
5018 if (xdp_prog) {
5019 struct xdp_buff xdp;
5020 u32 act;
5021 int err;
5022
5023 act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
5024 if (act != XDP_PASS) {
5025 switch (act) {
5026 case XDP_REDIRECT:
5027 err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
5028 &xdp, xdp_prog);
5029 if (err)
5030 goto out_redir;
5031 break;
5032 case XDP_TX:
5033 generic_xdp_tx(*pskb, xdp_prog);
5034 break;
5035 }
5036 return XDP_DROP;
5037 }
5038 }
5039 return XDP_PASS;
5040out_redir:
5041 kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
5042 return XDP_DROP;
5043}
5044EXPORT_SYMBOL_GPL(do_xdp_generic);
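/* Usage sketch: drivers that build skbs themselves (tun-style) and want
 * generic XDP semantics call this with a program pointer they track under
 * RCU; how that pointer is obtained is up to the driver and is only assumed
 * here:
 *
 *	rcu_read_lock();
 *	if (xdp_prog && do_xdp_generic(xdp_prog, &skb) != XDP_PASS)
 *		skb = NULL;	// consumed: dropped, redirected or transmitted
 *	rcu_read_unlock();
 */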
5045
5046static int netif_rx_internal(struct sk_buff *skb)
5047{
5048 int ret;
5049
5050 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5051
5052 trace_netif_rx(skb);
5053
5054#ifdef CONFIG_RPS
5055 if (static_branch_unlikely(&rps_needed)) {
5056 struct rps_dev_flow voidflow, *rflow = &voidflow;
5057 int cpu;
5058
5059 rcu_read_lock();
5060
5061 cpu = get_rps_cpu(skb->dev, skb, &rflow);
5062 if (cpu < 0)
5063 cpu = smp_processor_id();
5064
5065 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5066
5067 rcu_read_unlock();
5068 } else
5069#endif
5070 {
5071 unsigned int qtail;
5072
5073 ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
5074 }
5075 return ret;
5076}
5077
5078/**
5079 * __netif_rx - Slightly optimized version of netif_rx
5080 * @skb: buffer to post
5081 *
5082 * This behaves as netif_rx except that it does not disable bottom halves.
5083 * As a result this function may only be invoked from the interrupt context
5084 * (either hard or soft interrupt).
5085 */
5086int __netif_rx(struct sk_buff *skb)
5087{
5088 int ret;
5089
5090 lockdep_assert_once(hardirq_count() | softirq_count());
5091
5092 trace_netif_rx_entry(skb);
5093 ret = netif_rx_internal(skb);
5094 trace_netif_rx_exit(ret);
5095 return ret;
5096}
5097EXPORT_SYMBOL(__netif_rx);
5098
5099/**
5100 * netif_rx - post buffer to the network code
5101 * @skb: buffer to post
5102 *
5103 * This function receives a packet from a device driver and queues it for
5104 * the upper (protocol) levels to process via the backlog NAPI device. It
5105 * always succeeds. The buffer may be dropped during processing for
5106 * congestion control or by the protocol layers.
5107 * The network buffer is passed via the backlog NAPI device. Modern NIC
5108 * drivers should use NAPI and GRO.
5109 * This function can be used from interrupt and from process context. The
5110 * caller from process context must not disable interrupts before invoking
5111 * this function.
5112 *
5113 * return values:
5114 * NET_RX_SUCCESS (no congestion)
5115 * NET_RX_DROP (packet was dropped)
5116 *
5117 */
5118int netif_rx(struct sk_buff *skb)
5119{
5120 bool need_bh_off = !(hardirq_count() | softirq_count());
5121 int ret;
5122
5123 if (need_bh_off)
5124 local_bh_disable();
5125 trace_netif_rx_entry(skb);
5126 ret = netif_rx_internal(skb);
5127 trace_netif_rx_exit(ret);
5128 if (need_bh_off)
5129 local_bh_enable();
5130 return ret;
5131}
5132EXPORT_SYMBOL(netif_rx);
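/* Minimal receive sketch for a non-NAPI driver (identifiers other than the
 * kernel APIs are illustrative); NAPI drivers should hand packets to
 * napi_gro_receive() from their poll callback instead:
 *
 *	skb = netdev_alloc_skb(dev, pkt_len);
 *	if (!skb)
 *		return;		// drop on allocation failure
 *	skb_put_data(skb, pkt_data, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);		// queued via the backlog NAPI device
 */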
5133
5134static __latent_entropy void net_tx_action(struct softirq_action *h)
5135{
5136 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5137
5138 if (sd->completion_queue) {
5139 struct sk_buff *clist;
5140
5141 local_irq_disable();
5142 clist = sd->completion_queue;
5143 sd->completion_queue = NULL;
5144 local_irq_enable();
5145
5146 while (clist) {
5147 struct sk_buff *skb = clist;
5148
5149 clist = clist->next;
5150
5151 WARN_ON(refcount_read(&skb->users));
5152 if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
5153 trace_consume_skb(skb, net_tx_action);
5154 else
5155 trace_kfree_skb(skb, net_tx_action,
5156 get_kfree_skb_cb(skb)->reason);
5157
5158 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5159 __kfree_skb(skb);
5160 else
5161 __napi_kfree_skb(skb,
5162 get_kfree_skb_cb(skb)->reason);
5163 }
5164 }
5165
5166 if (sd->output_queue) {
5167 struct Qdisc *head;
5168
5169 local_irq_disable();
5170 head = sd->output_queue;
5171 sd->output_queue = NULL;
5172 sd->output_queue_tailp = &sd->output_queue;
5173 local_irq_enable();
5174
5175 rcu_read_lock();
5176
5177 while (head) {
5178 struct Qdisc *q = head;
5179 spinlock_t *root_lock = NULL;
5180
5181 head = head->next_sched;
5182
5183 /* We need to make sure head->next_sched is read
5184 * before clearing __QDISC_STATE_SCHED
5185 */
5186 smp_mb__before_atomic();
5187
5188 if (!(q->flags & TCQ_F_NOLOCK)) {
5189 root_lock = qdisc_lock(q);
5190 spin_lock(root_lock);
5191 } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5192 &q->state))) {
5193 /* There is a synchronize_net() between
5194 * STATE_DEACTIVATED flag being set and
5195 * qdisc_reset()/some_qdisc_is_busy() in
5196 * dev_deactivate(), so we can safely bail out
5197 * early here to avoid data race between
5198 * qdisc_deactivate() and some_qdisc_is_busy()
5199 * for lockless qdisc.
5200 */
5201 clear_bit(__QDISC_STATE_SCHED, &q->state);
5202 continue;
5203 }
5204
5205 clear_bit(__QDISC_STATE_SCHED, &q->state);
5206 qdisc_run(q);
5207 if (root_lock)
5208 spin_unlock(root_lock);
5209 }
5210
5211 rcu_read_unlock();
5212 }
5213
5214 xfrm_dev_backlog(sd);
5215}
5216
5217#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5218/* This hook is defined here for ATM LANE */
5219int (*br_fdb_test_addr_hook)(struct net_device *dev,
5220 unsigned char *addr) __read_mostly;
5221EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5222#endif
5223
5224/**
5225 * netdev_is_rx_handler_busy - check if receive handler is registered
5226 * @dev: device to check
5227 *
5228 * Check if a receive handler is already registered for a given device.
5229 * Return true if there is one.
5230 *
5231 * The caller must hold the rtnl_mutex.
5232 */
5233bool netdev_is_rx_handler_busy(struct net_device *dev)
5234{
5235 ASSERT_RTNL();
5236 return dev && rtnl_dereference(dev->rx_handler);
5237}
5238EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5239
5240/**
5241 * netdev_rx_handler_register - register receive handler
5242 * @dev: device to register a handler for
5243 * @rx_handler: receive handler to register
5244 * @rx_handler_data: data pointer that is used by rx handler
5245 *
5246 * Register a receive handler for a device. This handler will then be
5247 * called from __netif_receive_skb. A negative errno code is returned
5248 * on a failure.
5249 *
5250 * The caller must hold the rtnl_mutex.
5251 *
5252 * For a general description of rx_handler, see enum rx_handler_result.
5253 */
5254int netdev_rx_handler_register(struct net_device *dev,
5255 rx_handler_func_t *rx_handler,
5256 void *rx_handler_data)
5257{
5258 if (netdev_is_rx_handler_busy(dev))
5259 return -EBUSY;
5260
5261 if (dev->priv_flags & IFF_NO_RX_HANDLER)
5262 return -EINVAL;
5263
5264 /* Note: rx_handler_data must be set before rx_handler */
5265 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5266 rcu_assign_pointer(dev->rx_handler, rx_handler);
5267
5268 return 0;
5269}
5270EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
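/* Registration sketch (foo_* identifiers are hypothetical), mirroring how
 * bridge/bonding/team-style upper devices claim a lower device:
 *
 *	static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		// consume, requeue or pass the skb, see enum rx_handler_result
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	err = netdev_rx_handler_register(port_dev, foo_handle_frame, port);
 *
 * with a matching netdev_rx_handler_unregister(port_dev) on teardown.
 */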
5271
5272/**
5273 * netdev_rx_handler_unregister - unregister receive handler
5274 * @dev: device to unregister a handler from
5275 *
5276 * Unregister a receive handler from a device.
5277 *
5278 * The caller must hold the rtnl_mutex.
5279 */
5280void netdev_rx_handler_unregister(struct net_device *dev)
5281{
5282
5283 ASSERT_RTNL();
5284 RCU_INIT_POINTER(dev->rx_handler, NULL);
5285 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
5286 * section is guaranteed to see a non-NULL rx_handler_data
5287 * as well.
5288 */
5289 synchronize_net();
5290 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5291}
5292EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5293
5294/*
5295 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5296 * the special handling of PFMEMALLOC skbs.
5297 */
5298static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5299{
5300 switch (skb->protocol) {
5301 case htons(ETH_P_ARP):
5302 case htons(ETH_P_IP):
5303 case htons(ETH_P_IPV6):
5304 case htons(ETH_P_8021Q):
5305 case htons(ETH_P_8021AD):
5306 return true;
5307 default:
5308 return false;
5309 }
5310}
5311
5312static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5313 int *ret, struct net_device *orig_dev)
5314{
5315 if (nf_hook_ingress_active(skb)) {
5316 int ingress_retval;
5317
5318 if (*pt_prev) {
5319 *ret = deliver_skb(skb, *pt_prev, orig_dev);
5320 *pt_prev = NULL;
5321 }
5322
5323 rcu_read_lock();
5324 ingress_retval = nf_hook_ingress(skb);
5325 rcu_read_unlock();
5326 return ingress_retval;
5327 }
5328 return 0;
5329}
5330
5331static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5332 struct packet_type **ppt_prev)
5333{
5334 struct packet_type *ptype, *pt_prev;
5335 rx_handler_func_t *rx_handler;
5336 struct sk_buff *skb = *pskb;
5337 struct net_device *orig_dev;
5338 bool deliver_exact = false;
5339 int ret = NET_RX_DROP;
5340 __be16 type;
5341
5342 net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5343
5344 trace_netif_receive_skb(skb);
5345
5346 orig_dev = skb->dev;
5347
5348 skb_reset_network_header(skb);
5349 if (!skb_transport_header_was_set(skb))
5350 skb_reset_transport_header(skb);
5351 skb_reset_mac_len(skb);
5352
5353 pt_prev = NULL;
5354
5355another_round:
5356 skb->skb_iif = skb->dev->ifindex;
5357
5358 __this_cpu_inc(softnet_data.processed);
5359
5360 if (static_branch_unlikely(&generic_xdp_needed_key)) {
5361 int ret2;
5362
5363 migrate_disable();
5364 ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
5365 &skb);
5366 migrate_enable();
5367
5368 if (ret2 != XDP_PASS) {
5369 ret = NET_RX_DROP;
5370 goto out;
5371 }
5372 }
5373
5374 if (eth_type_vlan(skb->protocol)) {
5375 skb = skb_vlan_untag(skb);
5376 if (unlikely(!skb))
5377 goto out;
5378 }
5379
5380 if (skb_skip_tc_classify(skb))
5381 goto skip_classify;
5382
5383 if (pfmemalloc)
5384 goto skip_taps;
5385
5386 list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
5387 if (pt_prev)
5388 ret = deliver_skb(skb, pt_prev, orig_dev);
5389 pt_prev = ptype;
5390 }
5391
5392 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5393 if (pt_prev)
5394 ret = deliver_skb(skb, pt_prev, orig_dev);
5395 pt_prev = ptype;
5396 }
5397
5398skip_taps:
5399#ifdef CONFIG_NET_INGRESS
5400 if (static_branch_unlikely(&ingress_needed_key)) {
5401 bool another = false;
5402
5403 nf_skip_egress(skb, true);
5404 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5405 &another);
5406 if (another)
5407 goto another_round;
5408 if (!skb)
5409 goto out;
5410
5411 nf_skip_egress(skb, false);
5412 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5413 goto out;
5414 }
5415#endif
5416 skb_reset_redirect(skb);
5417skip_classify:
5418 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5419 goto drop;
5420
5421 if (skb_vlan_tag_present(skb)) {
5422 if (pt_prev) {
5423 ret = deliver_skb(skb, pt_prev, orig_dev);
5424 pt_prev = NULL;
5425 }
5426 if (vlan_do_receive(&skb))
5427 goto another_round;
5428 else if (unlikely(!skb))
5429 goto out;
5430 }
5431
5432 rx_handler = rcu_dereference(skb->dev->rx_handler);
5433 if (rx_handler) {
5434 if (pt_prev) {
5435 ret = deliver_skb(skb, pt_prev, orig_dev);
5436 pt_prev = NULL;
5437 }
5438 switch (rx_handler(&skb)) {
5439 case RX_HANDLER_CONSUMED:
5440 ret = NET_RX_SUCCESS;
5441 goto out;
5442 case RX_HANDLER_ANOTHER:
5443 goto another_round;
5444 case RX_HANDLER_EXACT:
5445 deliver_exact = true;
5446 break;
5447 case RX_HANDLER_PASS:
5448 break;
5449 default:
5450 BUG();
5451 }
5452 }
5453
5454 if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5455check_vlan_id:
5456 if (skb_vlan_tag_get_id(skb)) {
5457 /* Vlan id is non 0 and vlan_do_receive() above couldn't
5458 * find vlan device.
5459 */
5460 skb->pkt_type = PACKET_OTHERHOST;
5461 } else if (eth_type_vlan(skb->protocol)) {
5462 /* Outer header is 802.1P with vlan 0, inner header is
5463 * 802.1Q or 802.1AD and vlan_do_receive() above could
5464 * not find vlan dev for vlan id 0.
5465 */
5466 __vlan_hwaccel_clear_tag(skb);
5467 skb = skb_vlan_untag(skb);
5468 if (unlikely(!skb))
5469 goto out;
5470 if (vlan_do_receive(&skb))
5471 /* After stripping off 802.1P header with vlan 0
5472 * vlan dev is found for inner header.
5473 */
5474 goto another_round;
5475 else if (unlikely(!skb))
5476 goto out;
5477 else
5478 /* We have stripped outer 802.1P vlan 0 header.
5479 * But could not find vlan dev.
5480 * check again for vlan id to set OTHERHOST.
5481 */
5482 goto check_vlan_id;
5483 }
5484 /* Note: we might in the future use prio bits
5485 * and set skb->priority like in vlan_do_receive()
5486 * For the time being, just ignore Priority Code Point
5487 */
5488 __vlan_hwaccel_clear_tag(skb);
5489 }
5490
5491 type = skb->protocol;
5492
5493 /* deliver only exact match when indicated */
5494 if (likely(!deliver_exact)) {
5495 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5496 &ptype_base[ntohs(type) &
5497 PTYPE_HASH_MASK]);
5498 }
5499
5500 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5501 &orig_dev->ptype_specific);
5502
5503 if (unlikely(skb->dev != orig_dev)) {
5504 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5505 &skb->dev->ptype_specific);
5506 }
5507
5508 if (pt_prev) {
5509 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5510 goto drop;
5511 *ppt_prev = pt_prev;
5512 } else {
5513drop:
5514 if (!deliver_exact)
5515 dev_core_stats_rx_dropped_inc(skb->dev);
5516 else
5517 dev_core_stats_rx_nohandler_inc(skb->dev);
5518 kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
5519 /* Jamal, now you will not be able to escape explaining
5520 * to me how you were going to use this. :-)
5521 */
5522 ret = NET_RX_DROP;
5523 }
5524
5525out:
5526 /* The invariant here is that if *ppt_prev is not NULL
5527 * then skb should also be non-NULL.
5528 *
5529 * Apparently *ppt_prev assignment above holds this invariant due to
5530 * skb dereferencing near it.
5531 */
5532 *pskb = skb;
5533 return ret;
5534}
5535
5536static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5537{
5538 struct net_device *orig_dev = skb->dev;
5539 struct packet_type *pt_prev = NULL;
5540 int ret;
5541
5542 ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5543 if (pt_prev)
5544 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5545 skb->dev, pt_prev, orig_dev);
5546 return ret;
5547}
5548
5549/**
5550 * netif_receive_skb_core - special purpose version of netif_receive_skb
5551 * @skb: buffer to process
5552 *
5553 * More direct receive version of netif_receive_skb(). It should
5554 * only be used by callers that have a need to skip RPS and Generic XDP.
5555 * The caller must also take care of handling ``(page_is_)pfmemalloc`` skbs itself.
5556 *
5557 * This function may only be called from softirq context and interrupts
5558 * should be enabled.
5559 *
5560 * Return values (usually ignored):
5561 * NET_RX_SUCCESS: no congestion
5562 * NET_RX_DROP: packet was dropped
5563 */
5564int netif_receive_skb_core(struct sk_buff *skb)
5565{
5566 int ret;
5567
5568 rcu_read_lock();
5569 ret = __netif_receive_skb_one_core(skb, false);
5570 rcu_read_unlock();
5571
5572 return ret;
5573}
5574EXPORT_SYMBOL(netif_receive_skb_core);
5575
5576static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5577 struct packet_type *pt_prev,
5578 struct net_device *orig_dev)
5579{
5580 struct sk_buff *skb, *next;
5581
5582 if (!pt_prev)
5583 return;
5584 if (list_empty(head))
5585 return;
5586 if (pt_prev->list_func != NULL)
5587 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5588 ip_list_rcv, head, pt_prev, orig_dev);
5589 else
5590 list_for_each_entry_safe(skb, next, head, list) {
5591 skb_list_del_init(skb);
5592 pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5593 }
5594}
5595
5596static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5597{
5598 /* Fast-path assumptions:
5599 * - There is no RX handler.
5600 * - Only one packet_type matches.
5601 * If either of these fails, we will end up doing some per-packet
5602 * processing in-line, then handling the 'last ptype' for the whole
5603 * sublist. This can't cause out-of-order delivery to any single ptype,
5604 * because the 'last ptype' must be constant across the sublist, and all
5605 * other ptypes are handled per-packet.
5606 */
5607 /* Current (common) ptype of sublist */
5608 struct packet_type *pt_curr = NULL;
5609 /* Current (common) orig_dev of sublist */
5610 struct net_device *od_curr = NULL;
5611 struct list_head sublist;
5612 struct sk_buff *skb, *next;
5613
5614 INIT_LIST_HEAD(&sublist);
5615 list_for_each_entry_safe(skb, next, head, list) {
5616 struct net_device *orig_dev = skb->dev;
5617 struct packet_type *pt_prev = NULL;
5618
5619 skb_list_del_init(skb);
5620 __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5621 if (!pt_prev)
5622 continue;
5623 if (pt_curr != pt_prev || od_curr != orig_dev) {
5624 /* dispatch old sublist */
5625 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5626 /* start new sublist */
5627 INIT_LIST_HEAD(&sublist);
5628 pt_curr = pt_prev;
5629 od_curr = orig_dev;
5630 }
5631 list_add_tail(&skb->list, &sublist);
5632 }
5633
5634 /* dispatch final sublist */
5635 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5636}
5637
5638static int __netif_receive_skb(struct sk_buff *skb)
5639{
5640 int ret;
5641
5642 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5643 unsigned int noreclaim_flag;
5644
5645 /*
5646 * PFMEMALLOC skbs are special, they should
5647 * - be delivered to SOCK_MEMALLOC sockets only
5648 * - stay away from userspace
5649 * - have bounded memory usage
5650 *
5651 * Use PF_MEMALLOC as this saves us from propagating the allocation
5652 * context down to all allocation sites.
5653 */
5654 noreclaim_flag = memalloc_noreclaim_save();
5655 ret = __netif_receive_skb_one_core(skb, true);
5656 memalloc_noreclaim_restore(noreclaim_flag);
5657 } else
5658 ret = __netif_receive_skb_one_core(skb, false);
5659
5660 return ret;
5661}
5662
5663static void __netif_receive_skb_list(struct list_head *head)
5664{
5665 unsigned long noreclaim_flag = 0;
5666 struct sk_buff *skb, *next;
5667 bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5668
5669 list_for_each_entry_safe(skb, next, head, list) {
5670 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5671 struct list_head sublist;
5672
5673 /* Handle the previous sublist */
5674 list_cut_before(&sublist, head, &skb->list);
5675 if (!list_empty(&sublist))
5676 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5677 pfmemalloc = !pfmemalloc;
5678 /* See comments in __netif_receive_skb */
5679 if (pfmemalloc)
5680 noreclaim_flag = memalloc_noreclaim_save();
5681 else
5682 memalloc_noreclaim_restore(noreclaim_flag);
5683 }
5684 }
5685 /* Handle the remaining sublist */
5686 if (!list_empty(head))
5687 __netif_receive_skb_list_core(head, pfmemalloc);
5688 /* Restore pflags */
5689 if (pfmemalloc)
5690 memalloc_noreclaim_restore(noreclaim_flag);
5691}
5692
5693static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5694{
5695 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5696 struct bpf_prog *new = xdp->prog;
5697 int ret = 0;
5698
5699 switch (xdp->command) {
5700 case XDP_SETUP_PROG:
5701 rcu_assign_pointer(dev->xdp_prog, new);
5702 if (old)
5703 bpf_prog_put(old);
5704
5705 if (old && !new) {
5706 static_branch_dec(&generic_xdp_needed_key);
5707 } else if (new && !old) {
5708 static_branch_inc(&generic_xdp_needed_key);
5709 dev_disable_lro(dev);
5710 dev_disable_gro_hw(dev);
5711 }
5712 break;
5713
5714 default:
5715 ret = -EINVAL;
5716 break;
5717 }
5718
5719 return ret;
5720}
5721
5722static int netif_receive_skb_internal(struct sk_buff *skb)
5723{
5724 int ret;
5725
5726 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5727
5728 if (skb_defer_rx_timestamp(skb))
5729 return NET_RX_SUCCESS;
5730
5731 rcu_read_lock();
5732#ifdef CONFIG_RPS
5733 if (static_branch_unlikely(&rps_needed)) {
5734 struct rps_dev_flow voidflow, *rflow = &voidflow;
5735 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5736
5737 if (cpu >= 0) {
5738 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5739 rcu_read_unlock();
5740 return ret;
5741 }
5742 }
5743#endif
5744 ret = __netif_receive_skb(skb);
5745 rcu_read_unlock();
5746 return ret;
5747}
5748
5749void netif_receive_skb_list_internal(struct list_head *head)
5750{
5751 struct sk_buff *skb, *next;
5752 struct list_head sublist;
5753
5754 INIT_LIST_HEAD(&sublist);
5755 list_for_each_entry_safe(skb, next, head, list) {
5756 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
5757 skb);
5758 skb_list_del_init(skb);
5759 if (!skb_defer_rx_timestamp(skb))
5760 list_add_tail(&skb->list, &sublist);
5761 }
5762 list_splice_init(&sublist, head);
5763
5764 rcu_read_lock();
5765#ifdef CONFIG_RPS
5766 if (static_branch_unlikely(&rps_needed)) {
5767 list_for_each_entry_safe(skb, next, head, list) {
5768 struct rps_dev_flow voidflow, *rflow = &voidflow;
5769 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5770
5771 if (cpu >= 0) {
5772 /* Will be handled, remove from list */
5773 skb_list_del_init(skb);
5774 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5775 }
5776 }
5777 }
5778#endif
5779 __netif_receive_skb_list(head);
5780 rcu_read_unlock();
5781}
5782
5783/**
5784 * netif_receive_skb - process receive buffer from network
5785 * @skb: buffer to process
5786 *
5787 * netif_receive_skb() is the main receive data processing function.
5788 * It always succeeds. The buffer may be dropped during processing
5789 * for congestion control or by the protocol layers.
5790 *
5791 * This function may only be called from softirq context and interrupts
5792 * should be enabled.
5793 *
5794 * Return values (usually ignored):
5795 * NET_RX_SUCCESS: no congestion
5796 * NET_RX_DROP: packet was dropped
5797 */
5798int netif_receive_skb(struct sk_buff *skb)
5799{
5800 int ret;
5801
5802 trace_netif_receive_skb_entry(skb);
5803
5804 ret = netif_receive_skb_internal(skb);
5805 trace_netif_receive_skb_exit(ret);
5806
5807 return ret;
5808}
5809EXPORT_SYMBOL(netif_receive_skb);
5810
5811/**
5812 * netif_receive_skb_list - process many receive buffers from network
5813 * @head: list of skbs to process.
5814 *
5815 * Since return value of netif_receive_skb() is normally ignored, and
5816 * wouldn't be meaningful for a list, this function returns void.
5817 *
5818 * This function may only be called from softirq context and interrupts
5819 * should be enabled.
5820 */
5821void netif_receive_skb_list(struct list_head *head)
5822{
5823 struct sk_buff *skb;
5824
5825 if (list_empty(head))
5826 return;
5827 if (trace_netif_receive_skb_list_entry_enabled()) {
5828 list_for_each_entry(skb, head, list)
5829 trace_netif_receive_skb_list_entry(skb);
5830 }
5831 netif_receive_skb_list_internal(head);
5832 trace_netif_receive_skb_list_exit(0);
5833}
5834EXPORT_SYMBOL(netif_receive_skb_list);
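/* Batching sketch (identifiers other than the kernel APIs are illustrative):
 * a driver's receive loop can collect skbs on a local list and hand them to
 * the stack in one call, amortizing per-packet entry costs:
 *
 *	LIST_HEAD(rx_list);
 *
 *	while ((skb = foo_next_rx_skb(ring)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, dev);
 *		list_add_tail(&skb->list, &rx_list);
 *	}
 *	netif_receive_skb_list(&rx_list);
 */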
5835
5836static DEFINE_PER_CPU(struct work_struct, flush_works);
5837
5838/* Network device is going away, flush any packets still pending */
5839static void flush_backlog(struct work_struct *work)
5840{
5841 struct sk_buff *skb, *tmp;
5842 struct softnet_data *sd;
5843
5844 local_bh_disable();
5845 sd = this_cpu_ptr(&softnet_data);
5846
5847 rps_lock_irq_disable(sd);
5848 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5849 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5850 __skb_unlink(skb, &sd->input_pkt_queue);
5851 dev_kfree_skb_irq(skb);
5852 input_queue_head_incr(sd);
5853 }
5854 }
5855 rps_unlock_irq_enable(sd);
5856
5857 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5858 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5859 __skb_unlink(skb, &sd->process_queue);
5860 kfree_skb(skb);
5861 input_queue_head_incr(sd);
5862 }
5863 }
5864 local_bh_enable();
5865}
5866
5867static bool flush_required(int cpu)
5868{
5869#if IS_ENABLED(CONFIG_RPS)
5870 struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5871 bool do_flush;
5872
5873 rps_lock_irq_disable(sd);
5874
5875 /* as insertion into process_queue happens with the rps lock held,
5876 * process_queue access may race only with dequeue
5877 */
5878 do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5879 !skb_queue_empty_lockless(&sd->process_queue);
5880 rps_unlock_irq_enable(sd);
5881
5882 return do_flush;
5883#endif
5884	/* Without RPS we can't safely check input_pkt_queue: during a
5885	 * concurrent remote skb_queue_splice() both input_pkt_queue and
5886	 * process_queue can appear empty, even though the latter could end
5887	 * up containing a lot of packets.
5888	 */
5889 return true;
5890}
5891
5892static void flush_all_backlogs(void)
5893{
5894 static cpumask_t flush_cpus;
5895 unsigned int cpu;
5896
5897	/* Since we are under rtnl lock protection we can use static data
5898	 * for the cpumask and avoid allocating the possibly large mask
5899	 * on the stack.
5900	 */
5901 ASSERT_RTNL();
5902
5903 cpus_read_lock();
5904
5905 cpumask_clear(&flush_cpus);
5906 for_each_online_cpu(cpu) {
5907 if (flush_required(cpu)) {
5908 queue_work_on(cpu, system_highpri_wq,
5909 per_cpu_ptr(&flush_works, cpu));
5910 cpumask_set_cpu(cpu, &flush_cpus);
5911 }
5912 }
5913
5914	/* We can have in-flight packets on the cpus we are not flushing;
5915	 * synchronize_net() in unregister_netdevice_many() will take care of
5916	 * them.
5917	 */
5918 for_each_cpu(cpu, &flush_cpus)
5919 flush_work(per_cpu_ptr(&flush_works, cpu));
5920
5921 cpus_read_unlock();
5922}
5923
5924static void net_rps_send_ipi(struct softnet_data *remsd)
5925{
5926#ifdef CONFIG_RPS
5927 while (remsd) {
5928 struct softnet_data *next = remsd->rps_ipi_next;
5929
5930 if (cpu_online(remsd->cpu))
5931 smp_call_function_single_async(remsd->cpu, &remsd->csd);
5932 remsd = next;
5933 }
5934#endif
5935}
5936
5937/*
5938 * net_rps_action_and_irq_enable() sends any pending IPIs for rps.
5939 * Note: called with local irq disabled, but exits with local irq enabled.
5940 */
5941static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5942{
5943#ifdef CONFIG_RPS
5944 struct softnet_data *remsd = sd->rps_ipi_list;
5945
5946 if (remsd) {
5947 sd->rps_ipi_list = NULL;
5948
5949 local_irq_enable();
5950
5951		/* Send pending IPIs to kick RPS processing on remote cpus. */
5952 net_rps_send_ipi(remsd);
5953 } else
5954#endif
5955 local_irq_enable();
5956}
5957
5958static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5959{
5960#ifdef CONFIG_RPS
5961 return sd->rps_ipi_list != NULL;
5962#else
5963 return false;
5964#endif
5965}
5966
5967static int process_backlog(struct napi_struct *napi, int quota)
5968{
5969 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5970 bool again = true;
5971 int work = 0;
5972
5973	/* Check if we have pending IPIs; it is better to send them now
5974	 * rather than waiting for net_rx_action() to end.
5975	 */
5976 if (sd_has_rps_ipi_waiting(sd)) {
5977 local_irq_disable();
5978 net_rps_action_and_irq_enable(sd);
5979 }
5980
5981 napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
5982 while (again) {
5983 struct sk_buff *skb;
5984
5985 while ((skb = __skb_dequeue(&sd->process_queue))) {
5986 rcu_read_lock();
5987 __netif_receive_skb(skb);
5988 rcu_read_unlock();
5989 input_queue_head_incr(sd);
5990 if (++work >= quota)
5991 return work;
5992
5993 }
5994
5995 rps_lock_irq_disable(sd);
5996 if (skb_queue_empty(&sd->input_pkt_queue)) {
5997 /*
5998 * Inline a custom version of __napi_complete().
5999			 * Only the current cpu owns and manipulates this napi,
6000			 * and NAPI_STATE_SCHED is the only possible flag set
6001			 * on backlog.
6002			 * We can use a plain write instead of clear_bit(),
6003			 * and we don't need an smp_mb() memory barrier.
6004 */
6005 napi->state = 0;
6006 again = false;
6007 } else {
6008 skb_queue_splice_tail_init(&sd->input_pkt_queue,
6009 &sd->process_queue);
6010 }
6011 rps_unlock_irq_enable(sd);
6012 }
6013
6014 return work;
6015}
6016
6017/**
6018 * __napi_schedule - schedule for receive
6019 * @n: entry to schedule
6020 *
6021 * The entry's receive function will be scheduled to run.
6022 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6023 */
6024void __napi_schedule(struct napi_struct *n)
6025{
6026 unsigned long flags;
6027
6028 local_irq_save(flags);
6029 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6030 local_irq_restore(flags);
6031}
6032EXPORT_SYMBOL(__napi_schedule);
6033
6034/**
6035 * napi_schedule_prep - check if napi can be scheduled
6036 * @n: napi context
6037 *
6038 * Test if NAPI routine is already running, and if not mark
6039 * it as running. This is used as a condition variable to
6040 * ensure only one NAPI poll instance runs. We also make
6041 * sure there is no pending NAPI disable.
6042 */
6043bool napi_schedule_prep(struct napi_struct *n)
6044{
6045 unsigned long new, val = READ_ONCE(n->state);
6046
6047 do {
6048 if (unlikely(val & NAPIF_STATE_DISABLE))
6049 return false;
6050 new = val | NAPIF_STATE_SCHED;
6051
6052		/* Sets the STATE_MISSED bit if STATE_SCHED was already set.
6053		 * This was suggested by Alexander Duyck, as the compiler
6054		 * emits better code than:
6055 * if (val & NAPIF_STATE_SCHED)
6056 * new |= NAPIF_STATE_MISSED;
6057 */
6058 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6059 NAPIF_STATE_MISSED;
6060 } while (!try_cmpxchg(&n->state, &val, new));
6061
6062 return !(val & NAPIF_STATE_SCHED);
6063}
6064EXPORT_SYMBOL(napi_schedule_prep);
6065
6066/**
6067 * __napi_schedule_irqoff - schedule for receive
6068 * @n: entry to schedule
6069 *
6070 * Variant of __napi_schedule() assuming hard irqs are masked.
6071 *
6072 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6073 * because the interrupt disabled assumption might not be true
6074 * due to force-threaded interrupts and spinlock substitution.
6075 */
6076void __napi_schedule_irqoff(struct napi_struct *n)
6077{
6078 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6079 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6080 else
6081 __napi_schedule(n);
6082}
6083EXPORT_SYMBOL(__napi_schedule_irqoff);
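
/*
 * Illustrative sketch (not part of this file): the usual interrupt-handler
 * pattern of pairing napi_schedule_prep() with __napi_schedule_irqoff().
 * mydrv_isr(), mydrv_mask_irqs() and struct mydrv_priv are hypothetical.
 */
#if 0
static irqreturn_t mydrv_isr(int irq, void *dev_id)
{
	struct mydrv_priv *priv = dev_id;

	/* Silence further RX interrupts; NAPI polling takes over from here. */
	mydrv_mask_irqs(priv);

	if (napi_schedule_prep(&priv->napi))
		__napi_schedule_irqoff(&priv->napi);

	return IRQ_HANDLED;
}
#endif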
6084
6085bool napi_complete_done(struct napi_struct *n, int work_done)
6086{
6087 unsigned long flags, val, new, timeout = 0;
6088 bool ret = true;
6089
6090 /*
6091 * 1) Don't let napi dequeue from the cpu poll list
6092	 *    just in case it's running on a different cpu.
6093 * 2) If we are busy polling, do nothing here, we have
6094 * the guarantee we will be called later.
6095 */
6096 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6097 NAPIF_STATE_IN_BUSY_POLL)))
6098 return false;
6099
6100 if (work_done) {
6101 if (n->gro_bitmask)
6102 timeout = READ_ONCE(n->dev->gro_flush_timeout);
6103 n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6104 }
6105 if (n->defer_hard_irqs_count > 0) {
6106 n->defer_hard_irqs_count--;
6107 timeout = READ_ONCE(n->dev->gro_flush_timeout);
6108 if (timeout)
6109 ret = false;
6110 }
6111 if (n->gro_bitmask) {
6112 /* When the NAPI instance uses a timeout and keeps postponing
6113		 * it, we need to somehow bound the time packets are kept in
6114		 * the GRO layer.
6115 */
6116 napi_gro_flush(n, !!timeout);
6117 }
6118
6119 gro_normal_list(n);
6120
6121 if (unlikely(!list_empty(&n->poll_list))) {
6122 /* If n->poll_list is not empty, we need to mask irqs */
6123 local_irq_save(flags);
6124 list_del_init(&n->poll_list);
6125 local_irq_restore(flags);
6126 }
6127 WRITE_ONCE(n->list_owner, -1);
6128
6129 val = READ_ONCE(n->state);
6130 do {
6131 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6132
6133 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6134 NAPIF_STATE_SCHED_THREADED |
6135 NAPIF_STATE_PREFER_BUSY_POLL);
6136
6137 /* If STATE_MISSED was set, leave STATE_SCHED set,
6138 * because we will call napi->poll() one more time.
6139 * This C code was suggested by Alexander Duyck to help gcc.
6140 */
6141 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6142 NAPIF_STATE_SCHED;
6143 } while (!try_cmpxchg(&n->state, &val, new));
6144
6145 if (unlikely(val & NAPIF_STATE_MISSED)) {
6146 __napi_schedule(n);
6147 return false;
6148 }
6149
6150 if (timeout)
6151 hrtimer_start(&n->timer, ns_to_ktime(timeout),
6152 HRTIMER_MODE_REL_PINNED);
6153 return ret;
6154}
6155EXPORT_SYMBOL(napi_complete_done);
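
/*
 * Illustrative sketch (not part of this file): a poll routine that re-arms
 * the device interrupt only when napi_complete_done() reports that polling
 * really is finished (it may return false when gro_flush_timeout or busy
 * polling keeps the instance alive). mydrv_clean_rx(), mydrv_unmask_irqs()
 * and struct mydrv_priv are hypothetical.
 */
#if 0
static int mydrv_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
	int work_done;

	work_done = mydrv_clean_rx(priv, budget);

	if (work_done < budget && napi_complete_done(napi, work_done))
		mydrv_unmask_irqs(priv);

	return work_done;
}
#endif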
6156
6157/* Must be called under rcu_read_lock(), as we don't take a reference */
6158struct napi_struct *napi_by_id(unsigned int napi_id)
6159{
6160 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6161 struct napi_struct *napi;
6162
6163 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6164 if (napi->napi_id == napi_id)
6165 return napi;
6166
6167 return NULL;
6168}
6169
6170static void skb_defer_free_flush(struct softnet_data *sd)
6171{
6172 struct sk_buff *skb, *next;
6173
6174 /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
6175 if (!READ_ONCE(sd->defer_list))
6176 return;
6177
6178 spin_lock(&sd->defer_lock);
6179 skb = sd->defer_list;
6180 sd->defer_list = NULL;
6181 sd->defer_count = 0;
6182 spin_unlock(&sd->defer_lock);
6183
6184 while (skb != NULL) {
6185 next = skb->next;
6186 napi_consume_skb(skb, 1);
6187 skb = next;
6188 }
6189}
6190
6191#if defined(CONFIG_NET_RX_BUSY_POLL)
6192
6193static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6194{
6195 if (!skip_schedule) {
6196 gro_normal_list(napi);
6197 __napi_schedule(napi);
6198 return;
6199 }
6200
6201 if (napi->gro_bitmask) {
6202 /* flush too old packets
6203 * If HZ < 1000, flush all packets.
6204 */
6205 napi_gro_flush(napi, HZ >= 1000);
6206 }
6207
6208 gro_normal_list(napi);
6209 clear_bit(NAPI_STATE_SCHED, &napi->state);
6210}
6211
6212enum {
6213 NAPI_F_PREFER_BUSY_POLL = 1,
6214 NAPI_F_END_ON_RESCHED = 2,
6215};
6216
6217static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
6218 unsigned flags, u16 budget)
6219{
6220 bool skip_schedule = false;
6221 unsigned long timeout;
6222 int rc;
6223
6224 /* Busy polling means there is a high chance device driver hard irq
6225 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6226 * set in napi_schedule_prep().
6227 * Since we are about to call napi->poll() once more, we can safely
6228 * clear NAPI_STATE_MISSED.
6229 *
6230 * Note: x86 could use a single "lock and ..." instruction
6231	 * to perform these two clear_bit() operations.
6232 */
6233 clear_bit(NAPI_STATE_MISSED, &napi->state);
6234 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6235
6236 local_bh_disable();
6237
6238 if (flags & NAPI_F_PREFER_BUSY_POLL) {
6239 napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
6240 timeout = READ_ONCE(napi->dev->gro_flush_timeout);
6241 if (napi->defer_hard_irqs_count && timeout) {
6242 hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6243 skip_schedule = true;
6244 }
6245 }
6246
6247 /* All we really want here is to re-enable device interrupts.
6248 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6249 */
6250 rc = napi->poll(napi, budget);
6251 /* We can't gro_normal_list() here, because napi->poll() might have
6252 * rearmed the napi (napi_complete_done()) in which case it could
6253 * already be running on another CPU.
6254 */
6255 trace_napi_poll(napi, rc, budget);
6256 netpoll_poll_unlock(have_poll_lock);
6257 if (rc == budget)
6258 __busy_poll_stop(napi, skip_schedule);
6259 local_bh_enable();
6260}
6261
6262static void __napi_busy_loop(unsigned int napi_id,
6263 bool (*loop_end)(void *, unsigned long),
6264 void *loop_end_arg, unsigned flags, u16 budget)
6265{
6266 unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6267 int (*napi_poll)(struct napi_struct *napi, int budget);
6268 void *have_poll_lock = NULL;
6269 struct napi_struct *napi;
6270
6271 WARN_ON_ONCE(!rcu_read_lock_held());
6272
6273restart:
6274 napi_poll = NULL;
6275
6276 napi = napi_by_id(napi_id);
6277 if (!napi)
6278 return;
6279
6280 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6281 preempt_disable();
6282 for (;;) {
6283 int work = 0;
6284
6285 local_bh_disable();
6286 if (!napi_poll) {
6287 unsigned long val = READ_ONCE(napi->state);
6288
6289 /* If multiple threads are competing for this napi,
6290 * we avoid dirtying napi->state as much as we can.
6291 */
6292 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6293 NAPIF_STATE_IN_BUSY_POLL)) {
6294 if (flags & NAPI_F_PREFER_BUSY_POLL)
6295 set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6296 goto count;
6297 }
6298 if (cmpxchg(&napi->state, val,
6299 val | NAPIF_STATE_IN_BUSY_POLL |
6300 NAPIF_STATE_SCHED) != val) {
6301 if (flags & NAPI_F_PREFER_BUSY_POLL)
6302 set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6303 goto count;
6304 }
6305 have_poll_lock = netpoll_poll_lock(napi);
6306 napi_poll = napi->poll;
6307 }
6308 work = napi_poll(napi, budget);
6309 trace_napi_poll(napi, work, budget);
6310 gro_normal_list(napi);
6311count:
6312 if (work > 0)
6313 __NET_ADD_STATS(dev_net(napi->dev),
6314 LINUX_MIB_BUSYPOLLRXPACKETS, work);
6315 skb_defer_free_flush(this_cpu_ptr(&softnet_data));
6316 local_bh_enable();
6317
6318 if (!loop_end || loop_end(loop_end_arg, start_time))
6319 break;
6320
6321 if (unlikely(need_resched())) {
6322 if (flags & NAPI_F_END_ON_RESCHED)
6323 break;
6324 if (napi_poll)
6325 busy_poll_stop(napi, have_poll_lock, flags, budget);
6326 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6327 preempt_enable();
6328 rcu_read_unlock();
6329 cond_resched();
6330 rcu_read_lock();
6331 if (loop_end(loop_end_arg, start_time))
6332 return;
6333 goto restart;
6334 }
6335 cpu_relax();
6336 }
6337 if (napi_poll)
6338 busy_poll_stop(napi, have_poll_lock, flags, budget);
6339 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6340 preempt_enable();
6341}
6342
6343void napi_busy_loop_rcu(unsigned int napi_id,
6344 bool (*loop_end)(void *, unsigned long),
6345 void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6346{
6347 unsigned flags = NAPI_F_END_ON_RESCHED;
6348
6349 if (prefer_busy_poll)
6350 flags |= NAPI_F_PREFER_BUSY_POLL;
6351
6352 __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6353}
6354
6355void napi_busy_loop(unsigned int napi_id,
6356 bool (*loop_end)(void *, unsigned long),
6357 void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6358{
6359 unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
6360
6361 rcu_read_lock();
6362 __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6363 rcu_read_unlock();
6364}
6365EXPORT_SYMBOL(napi_busy_loop);
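
/*
 * Illustrative sketch (not part of this file): driving napi_busy_loop()
 * with a simple loop_end callback that bounds the number of poll rounds.
 * The callback and struct mydrv_bp_state are hypothetical; real users such
 * as the socket layer typically bound the loop by elapsed time instead.
 */
#if 0
struct mydrv_bp_state {
	unsigned int polls_left;
};

static bool mydrv_busy_loop_end(void *arg, unsigned long start_time)
{
	struct mydrv_bp_state *st = arg;

	return --st->polls_left == 0;
}

static void mydrv_busy_poll(unsigned int napi_id)
{
	struct mydrv_bp_state st = { .polls_left = 16 };

	napi_busy_loop(napi_id, mydrv_busy_loop_end, &st,
		       false /* prefer_busy_poll */, 64 /* budget */);
}
#endif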
6366
6367#endif /* CONFIG_NET_RX_BUSY_POLL */
6368
6369static void napi_hash_add(struct napi_struct *napi)
6370{
6371 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6372 return;
6373
6374 spin_lock(&napi_hash_lock);
6375
6376 /* 0..NR_CPUS range is reserved for sender_cpu use */
6377 do {
6378 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6379 napi_gen_id = MIN_NAPI_ID;
6380 } while (napi_by_id(napi_gen_id));
6381 napi->napi_id = napi_gen_id;
6382
6383 hlist_add_head_rcu(&napi->napi_hash_node,
6384 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6385
6386 spin_unlock(&napi_hash_lock);
6387}
6388
6389/* Warning: the caller is responsible for making sure an rcu grace period
6390 * is respected before freeing the memory containing @napi.
6391 */
6392static void napi_hash_del(struct napi_struct *napi)
6393{
6394 spin_lock(&napi_hash_lock);
6395
6396 hlist_del_init_rcu(&napi->napi_hash_node);
6397
6398 spin_unlock(&napi_hash_lock);
6399}
6400
6401static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6402{
6403 struct napi_struct *napi;
6404
6405 napi = container_of(timer, struct napi_struct, timer);
6406
6407	/* Note: we use a relaxed variant of napi_schedule_prep() not setting
6408 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6409 */
6410 if (!napi_disable_pending(napi) &&
6411 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6412 clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6413 __napi_schedule_irqoff(napi);
6414 }
6415
6416 return HRTIMER_NORESTART;
6417}
6418
6419static void init_gro_hash(struct napi_struct *napi)
6420{
6421 int i;
6422
6423 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6424 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6425 napi->gro_hash[i].count = 0;
6426 }
6427 napi->gro_bitmask = 0;
6428}
6429
6430int dev_set_threaded(struct net_device *dev, bool threaded)
6431{
6432 struct napi_struct *napi;
6433 int err = 0;
6434
6435 if (dev->threaded == threaded)
6436 return 0;
6437
6438 if (threaded) {
6439 list_for_each_entry(napi, &dev->napi_list, dev_list) {
6440 if (!napi->thread) {
6441 err = napi_kthread_create(napi);
6442 if (err) {
6443 threaded = false;
6444 break;
6445 }
6446 }
6447 }
6448 }
6449
6450 dev->threaded = threaded;
6451
6452 /* Make sure kthread is created before THREADED bit
6453 * is set.
6454 */
6455 smp_mb__before_atomic();
6456
6457 /* Setting/unsetting threaded mode on a napi might not immediately
6458 * take effect, if the current napi instance is actively being
6459 * polled. In this case, the switch between threaded mode and
6460 * softirq mode will happen in the next round of napi_schedule().
6461 * This should not cause hiccups/stalls to the live traffic.
6462 */
6463 list_for_each_entry(napi, &dev->napi_list, dev_list)
6464 assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
6465
6466 return err;
6467}
6468EXPORT_SYMBOL(dev_set_threaded);
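
/*
 * Illustrative sketch (not part of this file): switching a device to
 * threaded NAPI from a context that does not already hold RTNL, mirroring
 * what the sysfs "threaded" attribute does. mydrv_enable_threaded_napi()
 * is a hypothetical helper.
 */
#if 0
static int mydrv_enable_threaded_napi(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_threaded(dev, true);
	rtnl_unlock();

	return err;
}
#endif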
6469
6470/**
6471 * netif_queue_set_napi - Associate queue with the napi
6472 * @dev: device to which NAPI and queue belong
6473 * @queue_index: Index of queue
6474 * @type: queue type as RX or TX
6475 * @napi: NAPI context, pass NULL to clear previously set NAPI
6476 *
6477 * Set queue with its corresponding napi context. This should be done after
6478 * registering the NAPI handler for the queue-vector and the queues have been
6479 * mapped to the corresponding interrupt vector.
6480 */
6481void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
6482 enum netdev_queue_type type, struct napi_struct *napi)
6483{
6484 struct netdev_rx_queue *rxq;
6485 struct netdev_queue *txq;
6486
6487 if (WARN_ON_ONCE(napi && !napi->dev))
6488 return;
6489 if (dev->reg_state >= NETREG_REGISTERED)
6490 ASSERT_RTNL();
6491
6492 switch (type) {
6493 case NETDEV_QUEUE_TYPE_RX:
6494 rxq = __netif_get_rx_queue(dev, queue_index);
6495 rxq->napi = napi;
6496 return;
6497 case NETDEV_QUEUE_TYPE_TX:
6498 txq = netdev_get_tx_queue(dev, queue_index);
6499 txq->napi = napi;
6500 return;
6501 default:
6502 return;
6503 }
6504}
6505EXPORT_SYMBOL(netif_queue_set_napi);
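
/*
 * Illustrative sketch (not part of this file): associating each RX/TX queue
 * pair with the NAPI instance that services it, typically done after the
 * queue vectors have been mapped to interrupts. struct mydrv_priv and its
 * rings[] array are hypothetical.
 */
#if 0
static void mydrv_set_queue_napi(struct mydrv_priv *priv)
{
	unsigned int i;

	for (i = 0; i < priv->num_rings; i++) {
		netif_queue_set_napi(priv->netdev, i, NETDEV_QUEUE_TYPE_RX,
				     &priv->rings[i].napi);
		netif_queue_set_napi(priv->netdev, i, NETDEV_QUEUE_TYPE_TX,
				     &priv->rings[i].napi);
	}
}
#endif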
6506
6507void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
6508 int (*poll)(struct napi_struct *, int), int weight)
6509{
6510 if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6511 return;
6512
6513 INIT_LIST_HEAD(&napi->poll_list);
6514 INIT_HLIST_NODE(&napi->napi_hash_node);
6515 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6516 napi->timer.function = napi_watchdog;
6517 init_gro_hash(napi);
6518 napi->skb = NULL;
6519 INIT_LIST_HEAD(&napi->rx_list);
6520 napi->rx_count = 0;
6521 napi->poll = poll;
6522 if (weight > NAPI_POLL_WEIGHT)
6523 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6524 weight);
6525 napi->weight = weight;
6526 napi->dev = dev;
6527#ifdef CONFIG_NETPOLL
6528 napi->poll_owner = -1;
6529#endif
6530 napi->list_owner = -1;
6531 set_bit(NAPI_STATE_SCHED, &napi->state);
6532 set_bit(NAPI_STATE_NPSVC, &napi->state);
6533 list_add_rcu(&napi->dev_list, &dev->napi_list);
6534 napi_hash_add(napi);
6535 napi_get_frags_check(napi);
6536 /* Create kthread for this napi if dev->threaded is set.
6537 * Clear dev->threaded if kthread creation failed so that
6538 * threaded mode will not be enabled in napi_enable().
6539 */
6540 if (dev->threaded && napi_kthread_create(napi))
6541 dev->threaded = 0;
6542 netif_napi_set_irq(napi, -1);
6543}
6544EXPORT_SYMBOL(netif_napi_add_weight);
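
/*
 * Illustrative sketch (not part of this file): registering a NAPI instance
 * at probe time with the conventional NAPI_POLL_WEIGHT. mydrv_poll() and
 * struct mydrv_priv are hypothetical; most drivers use the netif_napi_add()
 * wrapper, which supplies the default weight.
 */
#if 0
static void mydrv_init_napi(struct mydrv_priv *priv)
{
	netif_napi_add_weight(priv->netdev, &priv->napi, mydrv_poll,
			      NAPI_POLL_WEIGHT);
}
#endif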
6545
6546void napi_disable(struct napi_struct *n)
6547{
6548 unsigned long val, new;
6549
6550 might_sleep();
6551 set_bit(NAPI_STATE_DISABLE, &n->state);
6552
6553 val = READ_ONCE(n->state);
6554 do {
6555 while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
6556 usleep_range(20, 200);
6557 val = READ_ONCE(n->state);
6558 }
6559
6560 new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
6561 new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
6562 } while (!try_cmpxchg(&n->state, &val, new));
6563
6564 hrtimer_cancel(&n->timer);
6565
6566 clear_bit(NAPI_STATE_DISABLE, &n->state);
6567}
6568EXPORT_SYMBOL(napi_disable);
6569
6570/**
6571 * napi_enable - enable NAPI scheduling
6572 * @n: NAPI context
6573 *
6574 * Resume NAPI from being scheduled on this context.
6575 * Must be paired with napi_disable().
6576 */
6577void napi_enable(struct napi_struct *n)
6578{
6579 unsigned long new, val = READ_ONCE(n->state);
6580
6581 do {
6582 BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
6583
6584 new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
6585 if (n->dev->threaded && n->thread)
6586 new |= NAPIF_STATE_THREADED;
6587 } while (!try_cmpxchg(&n->state, &val, new));
6588}
6589EXPORT_SYMBOL(napi_enable);
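
/*
 * Illustrative sketch (not part of this file): the usual napi_disable() /
 * napi_enable() bracket around a ring reconfiguration. napi_disable()
 * sleeps until any in-flight poll finishes, so this must run in process
 * context. mydrv_resize_ring() and struct mydrv_priv are hypothetical.
 */
#if 0
static void mydrv_reconfigure(struct mydrv_priv *priv)
{
	napi_disable(&priv->napi);
	mydrv_resize_ring(priv);
	napi_enable(&priv->napi);
}
#endif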
6590
6591static void flush_gro_hash(struct napi_struct *napi)
6592{
6593 int i;
6594
6595 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6596 struct sk_buff *skb, *n;
6597
6598 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6599 kfree_skb(skb);
6600 napi->gro_hash[i].count = 0;
6601 }
6602}
6603
6604/* Must be called in process context */
6605void __netif_napi_del(struct napi_struct *napi)
6606{
6607 if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6608 return;
6609
6610 napi_hash_del(napi);
6611 list_del_rcu(&napi->dev_list);
6612 napi_free_frags(napi);
6613
6614 flush_gro_hash(napi);
6615 napi->gro_bitmask = 0;
6616
6617 if (napi->thread) {
6618 kthread_stop(napi->thread);
6619 napi->thread = NULL;
6620 }
6621}
6622EXPORT_SYMBOL(__netif_napi_del);
6623
6624static int __napi_poll(struct napi_struct *n, bool *repoll)
6625{
6626 int work, weight;
6627
6628 weight = n->weight;
6629
6630 /* This NAPI_STATE_SCHED test is for avoiding a race
6631 * with netpoll's poll_napi(). Only the entity which
6632 * obtains the lock and sees NAPI_STATE_SCHED set will
6633 * actually make the ->poll() call. Therefore we avoid
6634 * accidentally calling ->poll() when NAPI is not scheduled.
6635 */
6636 work = 0;
6637 if (napi_is_scheduled(n)) {
6638 work = n->poll(n, weight);
6639 trace_napi_poll(n, work, weight);
6640
6641 xdp_do_check_flushed(n);
6642 }
6643
6644 if (unlikely(work > weight))
6645 netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6646 n->poll, work, weight);
6647
6648 if (likely(work < weight))
6649 return work;
6650
6651 /* Drivers must not modify the NAPI state if they
6652 * consume the entire weight. In such cases this code
6653 * still "owns" the NAPI instance and therefore can
6654 * move the instance around on the list at-will.
6655 */
6656 if (unlikely(napi_disable_pending(n))) {
6657 napi_complete(n);
6658 return work;
6659 }
6660
6661 /* The NAPI context has more processing work, but busy-polling
6662 * is preferred. Exit early.
6663 */
6664 if (napi_prefer_busy_poll(n)) {
6665 if (napi_complete_done(n, work)) {
6666 /* If timeout is not set, we need to make sure
6667 * that the NAPI is re-scheduled.
6668 */
6669 napi_schedule(n);
6670 }
6671 return work;
6672 }
6673
6674 if (n->gro_bitmask) {
6675 /* flush too old packets
6676 * If HZ < 1000, flush all packets.
6677 */
6678 napi_gro_flush(n, HZ >= 1000);
6679 }
6680
6681 gro_normal_list(n);
6682
6683 /* Some drivers may have called napi_schedule
6684 * prior to exhausting their budget.
6685 */
6686 if (unlikely(!list_empty(&n->poll_list))) {
6687 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6688 n->dev ? n->dev->name : "backlog");
6689 return work;
6690 }
6691
6692 *repoll = true;
6693
6694 return work;
6695}
6696
6697static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6698{
6699 bool do_repoll = false;
6700 void *have;
6701 int work;
6702
6703 list_del_init(&n->poll_list);
6704
6705 have = netpoll_poll_lock(n);
6706
6707 work = __napi_poll(n, &do_repoll);
6708
6709 if (do_repoll)
6710 list_add_tail(&n->poll_list, repoll);
6711
6712 netpoll_poll_unlock(have);
6713
6714 return work;
6715}
6716
6717static int napi_thread_wait(struct napi_struct *napi)
6718{
6719 bool woken = false;
6720
6721 set_current_state(TASK_INTERRUPTIBLE);
6722
6723 while (!kthread_should_stop()) {
6724 /* Testing SCHED_THREADED bit here to make sure the current
6725 * kthread owns this napi and could poll on this napi.
6726 * Testing SCHED bit is not enough because SCHED bit might be
6727 * set by some other busy poll thread or by napi_disable().
6728 */
6729 if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
6730 WARN_ON(!list_empty(&napi->poll_list));
6731 __set_current_state(TASK_RUNNING);
6732 return 0;
6733 }
6734
6735 schedule();
6736 /* woken being true indicates this thread owns this napi. */
6737 woken = true;
6738 set_current_state(TASK_INTERRUPTIBLE);
6739 }
6740 __set_current_state(TASK_RUNNING);
6741
6742 return -1;
6743}
6744
6745static int napi_threaded_poll(void *data)
6746{
6747 struct napi_struct *napi = data;
6748 struct softnet_data *sd;
6749 void *have;
6750
6751 while (!napi_thread_wait(napi)) {
6752 unsigned long last_qs = jiffies;
6753
6754 for (;;) {
6755 bool repoll = false;
6756
6757 local_bh_disable();
6758 sd = this_cpu_ptr(&softnet_data);
6759 sd->in_napi_threaded_poll = true;
6760
6761 have = netpoll_poll_lock(napi);
6762 __napi_poll(napi, &repoll);
6763 netpoll_poll_unlock(have);
6764
6765 sd->in_napi_threaded_poll = false;
6766 barrier();
6767
6768 if (sd_has_rps_ipi_waiting(sd)) {
6769 local_irq_disable();
6770 net_rps_action_and_irq_enable(sd);
6771 }
6772 skb_defer_free_flush(sd);
6773 local_bh_enable();
6774
6775 if (!repoll)
6776 break;
6777
6778 rcu_softirq_qs_periodic(last_qs);
6779 cond_resched();
6780 }
6781 }
6782 return 0;
6783}
6784
6785static __latent_entropy void net_rx_action(struct softirq_action *h)
6786{
6787 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6788 unsigned long time_limit = jiffies +
6789 usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
6790 int budget = READ_ONCE(net_hotdata.netdev_budget);
6791 LIST_HEAD(list);
6792 LIST_HEAD(repoll);
6793
6794start:
6795 sd->in_net_rx_action = true;
6796 local_irq_disable();
6797 list_splice_init(&sd->poll_list, &list);
6798 local_irq_enable();
6799
6800 for (;;) {
6801 struct napi_struct *n;
6802
6803 skb_defer_free_flush(sd);
6804
6805 if (list_empty(&list)) {
6806 if (list_empty(&repoll)) {
6807 sd->in_net_rx_action = false;
6808 barrier();
6809 /* We need to check if ____napi_schedule()
6810 * had refilled poll_list while
6811 * sd->in_net_rx_action was true.
6812 */
6813 if (!list_empty(&sd->poll_list))
6814 goto start;
6815 if (!sd_has_rps_ipi_waiting(sd))
6816 goto end;
6817 }
6818 break;
6819 }
6820
6821 n = list_first_entry(&list, struct napi_struct, poll_list);
6822 budget -= napi_poll(n, &repoll);
6823
6824 /* If softirq window is exhausted then punt.
6825		 * Allow this to run for 2 jiffies, which allows
6826		 * an average latency of 1.5/HZ.
6827 */
6828 if (unlikely(budget <= 0 ||
6829 time_after_eq(jiffies, time_limit))) {
6830 sd->time_squeeze++;
6831 break;
6832 }
6833 }
6834
6835 local_irq_disable();
6836
6837 list_splice_tail_init(&sd->poll_list, &list);
6838 list_splice_tail(&repoll, &list);
6839 list_splice(&list, &sd->poll_list);
6840 if (!list_empty(&sd->poll_list))
6841 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6842 else
6843 sd->in_net_rx_action = false;
6844
6845 net_rps_action_and_irq_enable(sd);
6846end:;
6847}
6848
6849struct netdev_adjacent {
6850 struct net_device *dev;
6851 netdevice_tracker dev_tracker;
6852
6853 /* upper master flag, there can only be one master device per list */
6854 bool master;
6855
6856 /* lookup ignore flag */
6857 bool ignore;
6858
6859 /* counter for the number of times this device was added to us */
6860 u16 ref_nr;
6861
6862 /* private field for the users */
6863 void *private;
6864
6865 struct list_head list;
6866 struct rcu_head rcu;
6867};
6868
6869static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6870 struct list_head *adj_list)
6871{
6872 struct netdev_adjacent *adj;
6873
6874 list_for_each_entry(adj, adj_list, list) {
6875 if (adj->dev == adj_dev)
6876 return adj;
6877 }
6878 return NULL;
6879}
6880
6881static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6882 struct netdev_nested_priv *priv)
6883{
6884 struct net_device *dev = (struct net_device *)priv->data;
6885
6886 return upper_dev == dev;
6887}
6888
6889/**
6890 * netdev_has_upper_dev - Check if device is linked to an upper device
6891 * @dev: device
6892 * @upper_dev: upper device to check
6893 *
6894 * Find out if a device is linked to the specified upper device and return
6895 * true in case it is. Note that this checks only the immediate upper device,
6896 * not the complete stack of devices. The caller must hold the RTNL lock.
6897 */
6898bool netdev_has_upper_dev(struct net_device *dev,
6899 struct net_device *upper_dev)
6900{
6901 struct netdev_nested_priv priv = {
6902 .data = (void *)upper_dev,
6903 };
6904
6905 ASSERT_RTNL();
6906
6907 return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6908 &priv);
6909}
6910EXPORT_SYMBOL(netdev_has_upper_dev);
6911
6912/**
6913 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6914 * @dev: device
6915 * @upper_dev: upper device to check
6916 *
6917 * Find out if a device is linked to the specified upper device and return
6918 * true in case it is. Note that this checks the entire upper device chain.
6919 * The caller must hold the RCU read lock.
6920 */
6921
6922bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6923 struct net_device *upper_dev)
6924{
6925 struct netdev_nested_priv priv = {
6926 .data = (void *)upper_dev,
6927 };
6928
6929 return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6930 &priv);
6931}
6932EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6933
6934/**
6935 * netdev_has_any_upper_dev - Check if device is linked to some device
6936 * @dev: device
6937 *
6938 * Find out if a device is linked to an upper device and return true in case
6939 * it is. The caller must hold the RTNL lock.
6940 */
6941bool netdev_has_any_upper_dev(struct net_device *dev)
6942{
6943 ASSERT_RTNL();
6944
6945 return !list_empty(&dev->adj_list.upper);
6946}
6947EXPORT_SYMBOL(netdev_has_any_upper_dev);
6948
6949/**
6950 * netdev_master_upper_dev_get - Get master upper device
6951 * @dev: device
6952 *
6953 * Find a master upper device and return pointer to it or NULL in case
6954 * it's not there. The caller must hold the RTNL lock.
6955 */
6956struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6957{
6958 struct netdev_adjacent *upper;
6959
6960 ASSERT_RTNL();
6961
6962 if (list_empty(&dev->adj_list.upper))
6963 return NULL;
6964
6965 upper = list_first_entry(&dev->adj_list.upper,
6966 struct netdev_adjacent, list);
6967 if (likely(upper->master))
6968 return upper->dev;
6969 return NULL;
6970}
6971EXPORT_SYMBOL(netdev_master_upper_dev_get);
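
/*
 * Illustrative sketch (not part of this file): using
 * netdev_master_upper_dev_get() under RTNL to find out whether a device is
 * currently enslaved to a bonding master. mydrv_is_bond_slave() is a
 * hypothetical helper.
 */
#if 0
static bool mydrv_is_bond_slave(struct net_device *dev)
{
	struct net_device *master;

	ASSERT_RTNL();

	master = netdev_master_upper_dev_get(dev);
	return master && netif_is_bond_master(master);
}
#endif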
6972
6973static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6974{
6975 struct netdev_adjacent *upper;
6976
6977 ASSERT_RTNL();
6978
6979 if (list_empty(&dev->adj_list.upper))
6980 return NULL;
6981
6982 upper = list_first_entry(&dev->adj_list.upper,
6983 struct netdev_adjacent, list);
6984 if (likely(upper->master) && !upper->ignore)
6985 return upper->dev;
6986 return NULL;
6987}
6988
6989/**
6990 * netdev_has_any_lower_dev - Check if device is linked to some device
6991 * @dev: device
6992 *
6993 * Find out if a device is linked to a lower device and return true in case
6994 * it is. The caller must hold the RTNL lock.
6995 */
6996static bool netdev_has_any_lower_dev(struct net_device *dev)
6997{
6998 ASSERT_RTNL();
6999
7000 return !list_empty(&dev->adj_list.lower);
7001}
7002
7003void *netdev_adjacent_get_private(struct list_head *adj_list)
7004{
7005 struct netdev_adjacent *adj;
7006
7007 adj = list_entry(adj_list, struct netdev_adjacent, list);
7008
7009 return adj->private;
7010}
7011EXPORT_SYMBOL(netdev_adjacent_get_private);
7012
7013/**
7014 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
7015 * @dev: device
7016 * @iter: list_head ** of the current position
7017 *
7018 * Gets the next device from the dev's upper list, starting from iter
7019 * position. The caller must hold RCU read lock.
7020 */
7021struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7022 struct list_head **iter)
7023{
7024 struct netdev_adjacent *upper;
7025
7026 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7027
7028 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7029
7030 if (&upper->list == &dev->adj_list.upper)
7031 return NULL;
7032
7033 *iter = &upper->list;
7034
7035 return upper->dev;
7036}
7037EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
7038
7039static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7040 struct list_head **iter,
7041 bool *ignore)
7042{
7043 struct netdev_adjacent *upper;
7044
7045 upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7046
7047 if (&upper->list == &dev->adj_list.upper)
7048 return NULL;
7049
7050 *iter = &upper->list;
7051 *ignore = upper->ignore;
7052
7053 return upper->dev;
7054}
7055
7056static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7057 struct list_head **iter)
7058{
7059 struct netdev_adjacent *upper;
7060
7061 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7062
7063 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7064
7065 if (&upper->list == &dev->adj_list.upper)
7066 return NULL;
7067
7068 *iter = &upper->list;
7069
7070 return upper->dev;
7071}
7072
7073static int __netdev_walk_all_upper_dev(struct net_device *dev,
7074 int (*fn)(struct net_device *dev,
7075 struct netdev_nested_priv *priv),
7076 struct netdev_nested_priv *priv)
7077{
7078 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7079 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7080 int ret, cur = 0;
7081 bool ignore;
7082
7083 now = dev;
7084 iter = &dev->adj_list.upper;
7085
7086 while (1) {
7087 if (now != dev) {
7088 ret = fn(now, priv);
7089 if (ret)
7090 return ret;
7091 }
7092
7093 next = NULL;
7094 while (1) {
7095 udev = __netdev_next_upper_dev(now, &iter, &ignore);
7096 if (!udev)
7097 break;
7098 if (ignore)
7099 continue;
7100
7101 next = udev;
7102 niter = &udev->adj_list.upper;
7103 dev_stack[cur] = now;
7104 iter_stack[cur++] = iter;
7105 break;
7106 }
7107
7108 if (!next) {
7109 if (!cur)
7110 return 0;
7111 next = dev_stack[--cur];
7112 niter = iter_stack[cur];
7113 }
7114
7115 now = next;
7116 iter = niter;
7117 }
7118
7119 return 0;
7120}
7121
7122int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7123 int (*fn)(struct net_device *dev,
7124 struct netdev_nested_priv *priv),
7125 struct netdev_nested_priv *priv)
7126{
7127 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7128 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7129 int ret, cur = 0;
7130
7131 now = dev;
7132 iter = &dev->adj_list.upper;
7133
7134 while (1) {
7135 if (now != dev) {
7136 ret = fn(now, priv);
7137 if (ret)
7138 return ret;
7139 }
7140
7141 next = NULL;
7142 while (1) {
7143 udev = netdev_next_upper_dev_rcu(now, &iter);
7144 if (!udev)
7145 break;
7146
7147 next = udev;
7148 niter = &udev->adj_list.upper;
7149 dev_stack[cur] = now;
7150 iter_stack[cur++] = iter;
7151 break;
7152 }
7153
7154 if (!next) {
7155 if (!cur)
7156 return 0;
7157 next = dev_stack[--cur];
7158 niter = iter_stack[cur];
7159 }
7160
7161 now = next;
7162 iter = niter;
7163 }
7164
7165 return 0;
7166}
7167EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
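
/*
 * Illustrative sketch (not part of this file): counting all upper devices
 * with netdev_walk_all_upper_dev_rcu() and a netdev_nested_priv cursor.
 * mydrv_count_one() and mydrv_count_uppers() are hypothetical.
 */
#if 0
static int mydrv_count_one(struct net_device *dev,
			   struct netdev_nested_priv *priv)
{
	unsigned int *count = (unsigned int *)priv->data;

	(*count)++;
	return 0;	/* returning non-zero would stop the walk early */
}

static unsigned int mydrv_count_uppers(struct net_device *dev)
{
	unsigned int count = 0;
	struct netdev_nested_priv priv = {
		.data = (void *)&count,
	};

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, mydrv_count_one, &priv);
	rcu_read_unlock();

	return count;
}
#endif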
7168
7169static bool __netdev_has_upper_dev(struct net_device *dev,
7170 struct net_device *upper_dev)
7171{
7172 struct netdev_nested_priv priv = {
7173 .flags = 0,
7174 .data = (void *)upper_dev,
7175 };
7176
7177 ASSERT_RTNL();
7178
7179 return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7180 &priv);
7181}
7182
7183/**
7184 * netdev_lower_get_next_private - Get the next ->private from the
7185 * lower neighbour list
7186 * @dev: device
7187 * @iter: list_head ** of the current position
7188 *
7189 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7190 * list, starting from iter position. The caller must either hold the
7191 * RTNL lock or its own locking that guarantees that the neighbour lower
7192 * list will remain unchanged.
7193 */
7194void *netdev_lower_get_next_private(struct net_device *dev,
7195 struct list_head **iter)
7196{
7197 struct netdev_adjacent *lower;
7198
7199 lower = list_entry(*iter, struct netdev_adjacent, list);
7200
7201 if (&lower->list == &dev->adj_list.lower)
7202 return NULL;
7203
7204 *iter = lower->list.next;
7205
7206 return lower->private;
7207}
7208EXPORT_SYMBOL(netdev_lower_get_next_private);
7209
7210/**
7211 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7212 * lower neighbour list, RCU
7213 * variant
7214 * @dev: device
7215 * @iter: list_head ** of the current position
7216 *
7217 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7218 * list, starting from iter position. The caller must hold RCU read lock.
7219 */
7220void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7221 struct list_head **iter)
7222{
7223 struct netdev_adjacent *lower;
7224
7225 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
7226
7227 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7228
7229 if (&lower->list == &dev->adj_list.lower)
7230 return NULL;
7231
7232 *iter = &lower->list;
7233
7234 return lower->private;
7235}
7236EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7237
7238/**
7239 * netdev_lower_get_next - Get the next device from the lower neighbour
7240 * list
7241 * @dev: device
7242 * @iter: list_head ** of the current position
7243 *
7244 * Gets the next netdev_adjacent from the dev's lower neighbour
7245 * list, starting from iter position. The caller must hold the RTNL lock or
7246 * its own locking that guarantees that the neighbour lower
7247 * list will remain unchanged.
7248 */
7249void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7250{
7251 struct netdev_adjacent *lower;
7252
7253 lower = list_entry(*iter, struct netdev_adjacent, list);
7254
7255 if (&lower->list == &dev->adj_list.lower)
7256 return NULL;
7257
7258 *iter = lower->list.next;
7259
7260 return lower->dev;
7261}
7262EXPORT_SYMBOL(netdev_lower_get_next);
7263
7264static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7265 struct list_head **iter)
7266{
7267 struct netdev_adjacent *lower;
7268
7269 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7270
7271 if (&lower->list == &dev->adj_list.lower)
7272 return NULL;
7273
7274 *iter = &lower->list;
7275
7276 return lower->dev;
7277}
7278
7279static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7280 struct list_head **iter,
7281 bool *ignore)
7282{
7283 struct netdev_adjacent *lower;
7284
7285 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7286
7287 if (&lower->list == &dev->adj_list.lower)
7288 return NULL;
7289
7290 *iter = &lower->list;
7291 *ignore = lower->ignore;
7292
7293 return lower->dev;
7294}
7295
7296int netdev_walk_all_lower_dev(struct net_device *dev,
7297 int (*fn)(struct net_device *dev,
7298 struct netdev_nested_priv *priv),
7299 struct netdev_nested_priv *priv)
7300{
7301 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7302 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7303 int ret, cur = 0;
7304
7305 now = dev;
7306 iter = &dev->adj_list.lower;
7307
7308 while (1) {
7309 if (now != dev) {
7310 ret = fn(now, priv);
7311 if (ret)
7312 return ret;
7313 }
7314
7315 next = NULL;
7316 while (1) {
7317 ldev = netdev_next_lower_dev(now, &iter);
7318 if (!ldev)
7319 break;
7320
7321 next = ldev;
7322 niter = &ldev->adj_list.lower;
7323 dev_stack[cur] = now;
7324 iter_stack[cur++] = iter;
7325 break;
7326 }
7327
7328 if (!next) {
7329 if (!cur)
7330 return 0;
7331 next = dev_stack[--cur];
7332 niter = iter_stack[cur];
7333 }
7334
7335 now = next;
7336 iter = niter;
7337 }
7338
7339 return 0;
7340}
7341EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7342
7343static int __netdev_walk_all_lower_dev(struct net_device *dev,
7344 int (*fn)(struct net_device *dev,
7345 struct netdev_nested_priv *priv),
7346 struct netdev_nested_priv *priv)
7347{
7348 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7349 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7350 int ret, cur = 0;
7351 bool ignore;
7352
7353 now = dev;
7354 iter = &dev->adj_list.lower;
7355
7356 while (1) {
7357 if (now != dev) {
7358 ret = fn(now, priv);
7359 if (ret)
7360 return ret;
7361 }
7362
7363 next = NULL;
7364 while (1) {
7365 ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7366 if (!ldev)
7367 break;
7368 if (ignore)
7369 continue;
7370
7371 next = ldev;
7372 niter = &ldev->adj_list.lower;
7373 dev_stack[cur] = now;
7374 iter_stack[cur++] = iter;
7375 break;
7376 }
7377
7378 if (!next) {
7379 if (!cur)
7380 return 0;
7381 next = dev_stack[--cur];
7382 niter = iter_stack[cur];
7383 }
7384
7385 now = next;
7386 iter = niter;
7387 }
7388
7389 return 0;
7390}
7391
7392struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7393 struct list_head **iter)
7394{
7395 struct netdev_adjacent *lower;
7396
7397 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7398 if (&lower->list == &dev->adj_list.lower)
7399 return NULL;
7400
7401 *iter = &lower->list;
7402
7403 return lower->dev;
7404}
7405EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7406
7407static u8 __netdev_upper_depth(struct net_device *dev)
7408{
7409 struct net_device *udev;
7410 struct list_head *iter;
7411 u8 max_depth = 0;
7412 bool ignore;
7413
7414 for (iter = &dev->adj_list.upper,
7415 udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7416 udev;
7417 udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7418 if (ignore)
7419 continue;
7420 if (max_depth < udev->upper_level)
7421 max_depth = udev->upper_level;
7422 }
7423
7424 return max_depth;
7425}
7426
7427static u8 __netdev_lower_depth(struct net_device *dev)
7428{
7429 struct net_device *ldev;
7430 struct list_head *iter;
7431 u8 max_depth = 0;
7432 bool ignore;
7433
7434 for (iter = &dev->adj_list.lower,
7435 ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7436 ldev;
7437 ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7438 if (ignore)
7439 continue;
7440 if (max_depth < ldev->lower_level)
7441 max_depth = ldev->lower_level;
7442 }
7443
7444 return max_depth;
7445}
7446
7447static int __netdev_update_upper_level(struct net_device *dev,
7448 struct netdev_nested_priv *__unused)
7449{
7450 dev->upper_level = __netdev_upper_depth(dev) + 1;
7451 return 0;
7452}
7453
7454#ifdef CONFIG_LOCKDEP
7455static LIST_HEAD(net_unlink_list);
7456
7457static void net_unlink_todo(struct net_device *dev)
7458{
7459 if (list_empty(&dev->unlink_list))
7460 list_add_tail(&dev->unlink_list, &net_unlink_list);
7461}
7462#endif
7463
7464static int __netdev_update_lower_level(struct net_device *dev,
7465 struct netdev_nested_priv *priv)
7466{
7467 dev->lower_level = __netdev_lower_depth(dev) + 1;
7468
7469#ifdef CONFIG_LOCKDEP
7470 if (!priv)
7471 return 0;
7472
7473 if (priv->flags & NESTED_SYNC_IMM)
7474 dev->nested_level = dev->lower_level - 1;
7475 if (priv->flags & NESTED_SYNC_TODO)
7476 net_unlink_todo(dev);
7477#endif
7478 return 0;
7479}
7480
7481int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7482 int (*fn)(struct net_device *dev,
7483 struct netdev_nested_priv *priv),
7484 struct netdev_nested_priv *priv)
7485{
7486 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7487 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7488 int ret, cur = 0;
7489
7490 now = dev;
7491 iter = &dev->adj_list.lower;
7492
7493 while (1) {
7494 if (now != dev) {
7495 ret = fn(now, priv);
7496 if (ret)
7497 return ret;
7498 }
7499
7500 next = NULL;
7501 while (1) {
7502 ldev = netdev_next_lower_dev_rcu(now, &iter);
7503 if (!ldev)
7504 break;
7505
7506 next = ldev;
7507 niter = &ldev->adj_list.lower;
7508 dev_stack[cur] = now;
7509 iter_stack[cur++] = iter;
7510 break;
7511 }
7512
7513 if (!next) {
7514 if (!cur)
7515 return 0;
7516 next = dev_stack[--cur];
7517 niter = iter_stack[cur];
7518 }
7519
7520 now = next;
7521 iter = niter;
7522 }
7523
7524 return 0;
7525}
7526EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7527
7528/**
7529 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7530 * lower neighbour list, RCU
7531 * variant
7532 * @dev: device
7533 *
7534 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7535 * list. The caller must hold RCU read lock.
7536 */
7537void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7538{
7539 struct netdev_adjacent *lower;
7540
7541 lower = list_first_or_null_rcu(&dev->adj_list.lower,
7542 struct netdev_adjacent, list);
7543 if (lower)
7544 return lower->private;
7545 return NULL;
7546}
7547EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7548
7549/**
7550 * netdev_master_upper_dev_get_rcu - Get master upper device
7551 * @dev: device
7552 *
7553 * Find a master upper device and return pointer to it or NULL in case
7554 * it's not there. The caller must hold the RCU read lock.
7555 */
7556struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7557{
7558 struct netdev_adjacent *upper;
7559
7560 upper = list_first_or_null_rcu(&dev->adj_list.upper,
7561 struct netdev_adjacent, list);
7562 if (upper && likely(upper->master))
7563 return upper->dev;
7564 return NULL;
7565}
7566EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7567
7568static int netdev_adjacent_sysfs_add(struct net_device *dev,
7569 struct net_device *adj_dev,
7570 struct list_head *dev_list)
7571{
7572 char linkname[IFNAMSIZ+7];
7573
7574 sprintf(linkname, dev_list == &dev->adj_list.upper ?
7575 "upper_%s" : "lower_%s", adj_dev->name);
7576 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7577 linkname);
7578}
7579static void netdev_adjacent_sysfs_del(struct net_device *dev,
7580 char *name,
7581 struct list_head *dev_list)
7582{
7583 char linkname[IFNAMSIZ+7];
7584
7585 sprintf(linkname, dev_list == &dev->adj_list.upper ?
7586 "upper_%s" : "lower_%s", name);
7587 sysfs_remove_link(&(dev->dev.kobj), linkname);
7588}
7589
7590static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7591 struct net_device *adj_dev,
7592 struct list_head *dev_list)
7593{
7594 return (dev_list == &dev->adj_list.upper ||
7595 dev_list == &dev->adj_list.lower) &&
7596 net_eq(dev_net(dev), dev_net(adj_dev));
7597}
7598
7599static int __netdev_adjacent_dev_insert(struct net_device *dev,
7600 struct net_device *adj_dev,
7601 struct list_head *dev_list,
7602 void *private, bool master)
7603{
7604 struct netdev_adjacent *adj;
7605 int ret;
7606
7607 adj = __netdev_find_adj(adj_dev, dev_list);
7608
7609 if (adj) {
7610 adj->ref_nr += 1;
7611 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7612 dev->name, adj_dev->name, adj->ref_nr);
7613
7614 return 0;
7615 }
7616
7617 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7618 if (!adj)
7619 return -ENOMEM;
7620
7621 adj->dev = adj_dev;
7622 adj->master = master;
7623 adj->ref_nr = 1;
7624 adj->private = private;
7625 adj->ignore = false;
7626 netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
7627
7628 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7629 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7630
7631 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7632 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7633 if (ret)
7634 goto free_adj;
7635 }
7636
7637 /* Ensure that master link is always the first item in list. */
7638 if (master) {
7639 ret = sysfs_create_link(&(dev->dev.kobj),
7640 &(adj_dev->dev.kobj), "master");
7641 if (ret)
7642 goto remove_symlinks;
7643
7644 list_add_rcu(&adj->list, dev_list);
7645 } else {
7646 list_add_tail_rcu(&adj->list, dev_list);
7647 }
7648
7649 return 0;
7650
7651remove_symlinks:
7652 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7653 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7654free_adj:
7655 netdev_put(adj_dev, &adj->dev_tracker);
7656 kfree(adj);
7657
7658 return ret;
7659}
7660
7661static void __netdev_adjacent_dev_remove(struct net_device *dev,
7662 struct net_device *adj_dev,
7663 u16 ref_nr,
7664 struct list_head *dev_list)
7665{
7666 struct netdev_adjacent *adj;
7667
7668 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7669 dev->name, adj_dev->name, ref_nr);
7670
7671 adj = __netdev_find_adj(adj_dev, dev_list);
7672
7673 if (!adj) {
7674 pr_err("Adjacency does not exist for device %s from %s\n",
7675 dev->name, adj_dev->name);
7676 WARN_ON(1);
7677 return;
7678 }
7679
7680 if (adj->ref_nr > ref_nr) {
7681 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7682 dev->name, adj_dev->name, ref_nr,
7683 adj->ref_nr - ref_nr);
7684 adj->ref_nr -= ref_nr;
7685 return;
7686 }
7687
7688 if (adj->master)
7689 sysfs_remove_link(&(dev->dev.kobj), "master");
7690
7691 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7692 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7693
7694 list_del_rcu(&adj->list);
7695 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7696 adj_dev->name, dev->name, adj_dev->name);
7697 netdev_put(adj_dev, &adj->dev_tracker);
7698 kfree_rcu(adj, rcu);
7699}
7700
7701static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7702 struct net_device *upper_dev,
7703 struct list_head *up_list,
7704 struct list_head *down_list,
7705 void *private, bool master)
7706{
7707 int ret;
7708
7709 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7710 private, master);
7711 if (ret)
7712 return ret;
7713
7714 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7715 private, false);
7716 if (ret) {
7717 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7718 return ret;
7719 }
7720
7721 return 0;
7722}
7723
7724static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7725 struct net_device *upper_dev,
7726 u16 ref_nr,
7727 struct list_head *up_list,
7728 struct list_head *down_list)
7729{
7730 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7731 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7732}
7733
7734static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7735 struct net_device *upper_dev,
7736 void *private, bool master)
7737{
7738 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7739 &dev->adj_list.upper,
7740 &upper_dev->adj_list.lower,
7741 private, master);
7742}
7743
7744static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7745 struct net_device *upper_dev)
7746{
7747 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7748 &dev->adj_list.upper,
7749 &upper_dev->adj_list.lower);
7750}
7751
7752static int __netdev_upper_dev_link(struct net_device *dev,
7753 struct net_device *upper_dev, bool master,
7754 void *upper_priv, void *upper_info,
7755 struct netdev_nested_priv *priv,
7756 struct netlink_ext_ack *extack)
7757{
7758 struct netdev_notifier_changeupper_info changeupper_info = {
7759 .info = {
7760 .dev = dev,
7761 .extack = extack,
7762 },
7763 .upper_dev = upper_dev,
7764 .master = master,
7765 .linking = true,
7766 .upper_info = upper_info,
7767 };
7768 struct net_device *master_dev;
7769 int ret = 0;
7770
7771 ASSERT_RTNL();
7772
7773 if (dev == upper_dev)
7774 return -EBUSY;
7775
7776 /* To prevent loops, check if dev is not upper device to upper_dev. */
7777 if (__netdev_has_upper_dev(upper_dev, dev))
7778 return -EBUSY;
7779
7780 if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7781 return -EMLINK;
7782
7783 if (!master) {
7784 if (__netdev_has_upper_dev(dev, upper_dev))
7785 return -EEXIST;
7786 } else {
7787 master_dev = __netdev_master_upper_dev_get(dev);
7788 if (master_dev)
7789 return master_dev == upper_dev ? -EEXIST : -EBUSY;
7790 }
7791
7792 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7793 &changeupper_info.info);
7794 ret = notifier_to_errno(ret);
7795 if (ret)
7796 return ret;
7797
7798 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7799 master);
7800 if (ret)
7801 return ret;
7802
7803 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7804 &changeupper_info.info);
7805 ret = notifier_to_errno(ret);
7806 if (ret)
7807 goto rollback;
7808
7809 __netdev_update_upper_level(dev, NULL);
7810 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7811
7812 __netdev_update_lower_level(upper_dev, priv);
7813 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7814 priv);
7815
7816 return 0;
7817
7818rollback:
7819 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7820
7821 return ret;
7822}
7823
7824/**
7825 * netdev_upper_dev_link - Add a link to the upper device
7826 * @dev: device
7827 * @upper_dev: new upper device
7828 * @extack: netlink extended ack
7829 *
7830 * Adds a link to device which is upper to this one. The caller must hold
7831 * the RTNL lock. On a failure a negative errno code is returned.
7832 * On success the reference counts are adjusted and the function
7833 * returns zero.
7834 */
7835int netdev_upper_dev_link(struct net_device *dev,
7836 struct net_device *upper_dev,
7837 struct netlink_ext_ack *extack)
7838{
7839 struct netdev_nested_priv priv = {
7840 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7841 .data = NULL,
7842 };
7843
7844 return __netdev_upper_dev_link(dev, upper_dev, false,
7845 NULL, NULL, &priv, extack);
7846}
7847EXPORT_SYMBOL(netdev_upper_dev_link);
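
/*
 * Illustrative sketch (not part of this file): linking a lower port device
 * underneath an aggregating upper device, as a stacking driver would do when
 * a port is added. mydrv_add_port() is a hypothetical helper; note that
 * @dev is the lower device and @upper_dev the device stacked on top of it.
 */
#if 0
static int mydrv_add_port(struct net_device *upper, struct net_device *port,
			  struct netlink_ext_ack *extack)
{
	ASSERT_RTNL();

	return netdev_upper_dev_link(port, upper, extack);
}
#endif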
7848
7849/**
7850 * netdev_master_upper_dev_link - Add a master link to the upper device
7851 * @dev: device
7852 * @upper_dev: new upper device
7853 * @upper_priv: upper device private
7854 * @upper_info: upper info to be passed down via notifier
7855 * @extack: netlink extended ack
7856 *
7857 * Adds a link to device which is upper to this one. In this case, only
7858 * one master upper device can be linked, although other non-master devices
7859 * might be linked as well. The caller must hold the RTNL lock.
7860 * On a failure a negative errno code is returned. On success the reference
7861 * counts are adjusted and the function returns zero.
7862 */
7863int netdev_master_upper_dev_link(struct net_device *dev,
7864 struct net_device *upper_dev,
7865 void *upper_priv, void *upper_info,
7866 struct netlink_ext_ack *extack)
7867{
7868 struct netdev_nested_priv priv = {
7869 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7870 .data = NULL,
7871 };
7872
7873 return __netdev_upper_dev_link(dev, upper_dev, true,
7874 upper_priv, upper_info, &priv, extack);
7875}
7876EXPORT_SYMBOL(netdev_master_upper_dev_link);
7877
7878static void __netdev_upper_dev_unlink(struct net_device *dev,
7879 struct net_device *upper_dev,
7880 struct netdev_nested_priv *priv)
7881{
7882 struct netdev_notifier_changeupper_info changeupper_info = {
7883 .info = {
7884 .dev = dev,
7885 },
7886 .upper_dev = upper_dev,
7887 .linking = false,
7888 };
7889
7890 ASSERT_RTNL();
7891
7892 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7893
7894 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7895 &changeupper_info.info);
7896
7897 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7898
7899 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7900 &changeupper_info.info);
7901
7902 __netdev_update_upper_level(dev, NULL);
7903 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7904
7905 __netdev_update_lower_level(upper_dev, priv);
7906 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7907 priv);
7908}
7909
7910/**
7911 * netdev_upper_dev_unlink - Removes a link to upper device
7912 * @dev: device
7913 * @upper_dev: upper device to remove the link to
7914 *
7915 * Removes a link to a device which is upper to this one. The caller must hold
7916 * the RTNL lock.
7917 */
7918void netdev_upper_dev_unlink(struct net_device *dev,
7919 struct net_device *upper_dev)
7920{
7921 struct netdev_nested_priv priv = {
7922 .flags = NESTED_SYNC_TODO,
7923 .data = NULL,
7924 };
7925
7926 __netdev_upper_dev_unlink(dev, upper_dev, &priv);
7927}
7928EXPORT_SYMBOL(netdev_upper_dev_unlink);
7929
7930static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7931 struct net_device *lower_dev,
7932 bool val)
7933{
7934 struct netdev_adjacent *adj;
7935
7936 adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7937 if (adj)
7938 adj->ignore = val;
7939
7940 adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7941 if (adj)
7942 adj->ignore = val;
7943}
7944
7945static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7946 struct net_device *lower_dev)
7947{
7948 __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7949}
7950
7951static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7952 struct net_device *lower_dev)
7953{
7954 __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7955}
7956
7957int netdev_adjacent_change_prepare(struct net_device *old_dev,
7958 struct net_device *new_dev,
7959 struct net_device *dev,
7960 struct netlink_ext_ack *extack)
7961{
7962 struct netdev_nested_priv priv = {
7963 .flags = 0,
7964 .data = NULL,
7965 };
7966 int err;
7967
7968 if (!new_dev)
7969 return 0;
7970
7971 if (old_dev && new_dev != old_dev)
7972 netdev_adjacent_dev_disable(dev, old_dev);
7973 err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
7974 extack);
7975 if (err) {
7976 if (old_dev && new_dev != old_dev)
7977 netdev_adjacent_dev_enable(dev, old_dev);
7978 return err;
7979 }
7980
7981 return 0;
7982}
7983EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7984
7985void netdev_adjacent_change_commit(struct net_device *old_dev,
7986 struct net_device *new_dev,
7987 struct net_device *dev)
7988{
7989 struct netdev_nested_priv priv = {
7990 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7991 .data = NULL,
7992 };
7993
7994 if (!new_dev || !old_dev)
7995 return;
7996
7997 if (new_dev == old_dev)
7998 return;
7999
8000 netdev_adjacent_dev_enable(dev, old_dev);
8001 __netdev_upper_dev_unlink(old_dev, dev, &priv);
8002}
8003EXPORT_SYMBOL(netdev_adjacent_change_commit);
8004
8005void netdev_adjacent_change_abort(struct net_device *old_dev,
8006 struct net_device *new_dev,
8007 struct net_device *dev)
8008{
8009 struct netdev_nested_priv priv = {
8010 .flags = 0,
8011 .data = NULL,
8012 };
8013
8014 if (!new_dev)
8015 return;
8016
8017 if (old_dev && new_dev != old_dev)
8018 netdev_adjacent_dev_enable(dev, old_dev);
8019
8020 __netdev_upper_dev_unlink(new_dev, dev, &priv);
8021}
8022EXPORT_SYMBOL(netdev_adjacent_change_abort);
8023
8024/**
8025 * netdev_bonding_info_change - Dispatch event about slave change
8026 * @dev: device
8027 * @bonding_info: info to dispatch
8028 *
8029 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8030 * The caller must hold the RTNL lock.
8031 */
8032void netdev_bonding_info_change(struct net_device *dev,
8033 struct netdev_bonding_info *bonding_info)
8034{
8035 struct netdev_notifier_bonding_info info = {
8036 .info.dev = dev,
8037 };
8038
8039 memcpy(&info.bonding_info, bonding_info,
8040 sizeof(struct netdev_bonding_info));
8041 call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8042 &info.info);
8043}
8044EXPORT_SYMBOL(netdev_bonding_info_change);
8045
8046static int netdev_offload_xstats_enable_l3(struct net_device *dev,
8047 struct netlink_ext_ack *extack)
8048{
8049 struct netdev_notifier_offload_xstats_info info = {
8050 .info.dev = dev,
8051 .info.extack = extack,
8052 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8053 };
8054 int err;
8055 int rc;
8056
8057 dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
8058 GFP_KERNEL);
8059 if (!dev->offload_xstats_l3)
8060 return -ENOMEM;
8061
8062 rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
8063 NETDEV_OFFLOAD_XSTATS_DISABLE,
8064 &info.info);
8065 err = notifier_to_errno(rc);
8066 if (err)
8067 goto free_stats;
8068
8069 return 0;
8070
8071free_stats:
8072 kfree(dev->offload_xstats_l3);
8073 dev->offload_xstats_l3 = NULL;
8074 return err;
8075}
8076
8077int netdev_offload_xstats_enable(struct net_device *dev,
8078 enum netdev_offload_xstats_type type,
8079 struct netlink_ext_ack *extack)
8080{
8081 ASSERT_RTNL();
8082
8083 if (netdev_offload_xstats_enabled(dev, type))
8084 return -EALREADY;
8085
8086 switch (type) {
8087 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8088 return netdev_offload_xstats_enable_l3(dev, extack);
8089 }
8090
8091 WARN_ON(1);
8092 return -EINVAL;
8093}
8094EXPORT_SYMBOL(netdev_offload_xstats_enable);
8095
8096static void netdev_offload_xstats_disable_l3(struct net_device *dev)
8097{
8098 struct netdev_notifier_offload_xstats_info info = {
8099 .info.dev = dev,
8100 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8101 };
8102
8103 call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
8104 &info.info);
8105 kfree(dev->offload_xstats_l3);
8106 dev->offload_xstats_l3 = NULL;
8107}
8108
8109int netdev_offload_xstats_disable(struct net_device *dev,
8110 enum netdev_offload_xstats_type type)
8111{
8112 ASSERT_RTNL();
8113
8114 if (!netdev_offload_xstats_enabled(dev, type))
8115 return -EALREADY;
8116
8117 switch (type) {
8118 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8119 netdev_offload_xstats_disable_l3(dev);
8120 return 0;
8121 }
8122
8123 WARN_ON(1);
8124 return -EINVAL;
8125}
8126EXPORT_SYMBOL(netdev_offload_xstats_disable);
8127
8128static void netdev_offload_xstats_disable_all(struct net_device *dev)
8129{
8130 netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
8131}
8132
8133static struct rtnl_hw_stats64 *
8134netdev_offload_xstats_get_ptr(const struct net_device *dev,
8135 enum netdev_offload_xstats_type type)
8136{
8137 switch (type) {
8138 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8139 return dev->offload_xstats_l3;
8140 }
8141
8142 WARN_ON(1);
8143 return NULL;
8144}
8145
8146bool netdev_offload_xstats_enabled(const struct net_device *dev,
8147 enum netdev_offload_xstats_type type)
8148{
8149 ASSERT_RTNL();
8150
8151 return netdev_offload_xstats_get_ptr(dev, type);
8152}
8153EXPORT_SYMBOL(netdev_offload_xstats_enabled);
8154
8155struct netdev_notifier_offload_xstats_ru {
8156 bool used;
8157};
8158
8159struct netdev_notifier_offload_xstats_rd {
8160 struct rtnl_hw_stats64 stats;
8161 bool used;
8162};
8163
8164static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
8165 const struct rtnl_hw_stats64 *src)
8166{
8167 dest->rx_packets += src->rx_packets;
8168 dest->tx_packets += src->tx_packets;
8169 dest->rx_bytes += src->rx_bytes;
8170 dest->tx_bytes += src->tx_bytes;
8171 dest->rx_errors += src->rx_errors;
8172 dest->tx_errors += src->tx_errors;
8173 dest->rx_dropped += src->rx_dropped;
8174 dest->tx_dropped += src->tx_dropped;
8175 dest->multicast += src->multicast;
8176}
8177
8178static int netdev_offload_xstats_get_used(struct net_device *dev,
8179 enum netdev_offload_xstats_type type,
8180 bool *p_used,
8181 struct netlink_ext_ack *extack)
8182{
8183 struct netdev_notifier_offload_xstats_ru report_used = {};
8184 struct netdev_notifier_offload_xstats_info info = {
8185 .info.dev = dev,
8186 .info.extack = extack,
8187 .type = type,
8188 .report_used = &report_used,
8189 };
8190 int rc;
8191
8192 WARN_ON(!netdev_offload_xstats_enabled(dev, type));
8193 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
8194 &info.info);
8195 *p_used = report_used.used;
8196 return notifier_to_errno(rc);
8197}
8198
8199static int netdev_offload_xstats_get_stats(struct net_device *dev,
8200 enum netdev_offload_xstats_type type,
8201 struct rtnl_hw_stats64 *p_stats,
8202 bool *p_used,
8203 struct netlink_ext_ack *extack)
8204{
8205 struct netdev_notifier_offload_xstats_rd report_delta = {};
8206 struct netdev_notifier_offload_xstats_info info = {
8207 .info.dev = dev,
8208 .info.extack = extack,
8209 .type = type,
8210 .report_delta = &report_delta,
8211 };
8212 struct rtnl_hw_stats64 *stats;
8213 int rc;
8214
8215 stats = netdev_offload_xstats_get_ptr(dev, type);
8216 if (WARN_ON(!stats))
8217 return -EINVAL;
8218
8219 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
8220 &info.info);
8221
8222 /* Cache whatever we got, even if there was an error, otherwise the
8223 * successful stats retrievals would get lost.
8224 */
8225 netdev_hw_stats64_add(stats, &report_delta.stats);
8226
8227 if (p_stats)
8228 *p_stats = *stats;
8229 *p_used = report_delta.used;
8230
8231 return notifier_to_errno(rc);
8232}
8233
8234int netdev_offload_xstats_get(struct net_device *dev,
8235 enum netdev_offload_xstats_type type,
8236 struct rtnl_hw_stats64 *p_stats, bool *p_used,
8237 struct netlink_ext_ack *extack)
8238{
8239 ASSERT_RTNL();
8240
8241 if (p_stats)
8242 return netdev_offload_xstats_get_stats(dev, type, p_stats,
8243 p_used, extack);
8244 else
8245 return netdev_offload_xstats_get_used(dev, type, p_used,
8246 extack);
8247}
8248EXPORT_SYMBOL(netdev_offload_xstats_get);
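
/*
 * Illustrative sketch, not part of the upstream file: reading the cumulative
 * hardware L3 counters of a device, assuming they were previously enabled
 * with netdev_offload_xstats_enable(). The function name is hypothetical;
 * the caller must hold the RTNL lock.
 */
static int __maybe_unused
example_read_l3_hw_stats(struct net_device *dev, struct netlink_ext_ack *extack)
{
        struct rtnl_hw_stats64 stats = {};
        bool used;
        int err;

        err = netdev_offload_xstats_get(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3,
                                        &stats, &used, extack);
        if (err)
                return err;

        netdev_dbg(dev, "HW L3 rx_packets=%llu tx_packets=%llu used=%d\n",
                   stats.rx_packets, stats.tx_packets, used);
        return 0;
}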
8249
8250void
8251netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
8252 const struct rtnl_hw_stats64 *stats)
8253{
8254 report_delta->used = true;
8255 netdev_hw_stats64_add(&report_delta->stats, stats);
8256}
8257EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
8258
8259void
8260netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
8261{
8262 report_used->used = true;
8263}
8264EXPORT_SYMBOL(netdev_offload_xstats_report_used);
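
/*
 * Illustrative sketch, not part of the upstream file: how a driver handling
 * the NETDEV_OFFLOAD_XSTATS_REPORT_DELTA notifier event might feed the delta
 * it read from hardware back to the core. Filling 'delta' is driver specific
 * and only hinted at; the helper call itself matches the API above.
 */
static void __maybe_unused
example_report_l3_delta(struct netdev_notifier_offload_xstats_info *info)
{
        struct rtnl_hw_stats64 delta = {};

        /* ... driver-specific code would read and clear HW counters here ... */

        netdev_offload_xstats_report_delta(info->report_delta, &delta);
}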
8265
8266void netdev_offload_xstats_push_delta(struct net_device *dev,
8267 enum netdev_offload_xstats_type type,
8268 const struct rtnl_hw_stats64 *p_stats)
8269{
8270 struct rtnl_hw_stats64 *stats;
8271
8272 ASSERT_RTNL();
8273
8274 stats = netdev_offload_xstats_get_ptr(dev, type);
8275 if (WARN_ON(!stats))
8276 return;
8277
8278 netdev_hw_stats64_add(stats, p_stats);
8279}
8280EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
8281
8282/**
8283 * netdev_get_xmit_slave - Get the xmit slave of master device
8284 * @dev: device
8285 * @skb: The packet
8286 * @all_slaves: assume all the slaves are active
8287 *
8288 * The reference counters are not incremented so the caller must be
8289 * careful with locks. The caller must hold the RCU read lock.
8290 * %NULL is returned if no slave is found.
8291 */
8292
8293struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8294 struct sk_buff *skb,
8295 bool all_slaves)
8296{
8297 const struct net_device_ops *ops = dev->netdev_ops;
8298
8299 if (!ops->ndo_get_xmit_slave)
8300 return NULL;
8301 return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8302}
8303EXPORT_SYMBOL(netdev_get_xmit_slave);
8304
8305static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8306 struct sock *sk)
8307{
8308 const struct net_device_ops *ops = dev->netdev_ops;
8309
8310 if (!ops->ndo_sk_get_lower_dev)
8311 return NULL;
8312 return ops->ndo_sk_get_lower_dev(dev, sk);
8313}
8314
8315/**
8316 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8317 * @dev: device
8318 * @sk: the socket
8319 *
8320 * %NULL is returned if no lower device is found.
8321 */
8322
8323struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8324 struct sock *sk)
8325{
8326 struct net_device *lower;
8327
8328 lower = netdev_sk_get_lower_dev(dev, sk);
8329 while (lower) {
8330 dev = lower;
8331 lower = netdev_sk_get_lower_dev(dev, sk);
8332 }
8333
8334 return dev;
8335}
8336EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8337
8338static void netdev_adjacent_add_links(struct net_device *dev)
8339{
8340 struct netdev_adjacent *iter;
8341
8342 struct net *net = dev_net(dev);
8343
8344 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8345 if (!net_eq(net, dev_net(iter->dev)))
8346 continue;
8347 netdev_adjacent_sysfs_add(iter->dev, dev,
8348 &iter->dev->adj_list.lower);
8349 netdev_adjacent_sysfs_add(dev, iter->dev,
8350 &dev->adj_list.upper);
8351 }
8352
8353 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8354 if (!net_eq(net, dev_net(iter->dev)))
8355 continue;
8356 netdev_adjacent_sysfs_add(iter->dev, dev,
8357 &iter->dev->adj_list.upper);
8358 netdev_adjacent_sysfs_add(dev, iter->dev,
8359 &dev->adj_list.lower);
8360 }
8361}
8362
8363static void netdev_adjacent_del_links(struct net_device *dev)
8364{
8365 struct netdev_adjacent *iter;
8366
8367 struct net *net = dev_net(dev);
8368
8369 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8370 if (!net_eq(net, dev_net(iter->dev)))
8371 continue;
8372 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8373 &iter->dev->adj_list.lower);
8374 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8375 &dev->adj_list.upper);
8376 }
8377
8378 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8379 if (!net_eq(net, dev_net(iter->dev)))
8380 continue;
8381 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8382 &iter->dev->adj_list.upper);
8383 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8384 &dev->adj_list.lower);
8385 }
8386}
8387
8388void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8389{
8390 struct netdev_adjacent *iter;
8391
8392 struct net *net = dev_net(dev);
8393
8394 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8395 if (!net_eq(net, dev_net(iter->dev)))
8396 continue;
8397 netdev_adjacent_sysfs_del(iter->dev, oldname,
8398 &iter->dev->adj_list.lower);
8399 netdev_adjacent_sysfs_add(iter->dev, dev,
8400 &iter->dev->adj_list.lower);
8401 }
8402
8403 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8404 if (!net_eq(net, dev_net(iter->dev)))
8405 continue;
8406 netdev_adjacent_sysfs_del(iter->dev, oldname,
8407 &iter->dev->adj_list.upper);
8408 netdev_adjacent_sysfs_add(iter->dev, dev,
8409 &iter->dev->adj_list.upper);
8410 }
8411}
8412
8413void *netdev_lower_dev_get_private(struct net_device *dev,
8414 struct net_device *lower_dev)
8415{
8416 struct netdev_adjacent *lower;
8417
8418 if (!lower_dev)
8419 return NULL;
8420 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8421 if (!lower)
8422 return NULL;
8423
8424 return lower->private;
8425}
8426EXPORT_SYMBOL(netdev_lower_dev_get_private);
8427
8428
8429/**
8430 * netdev_lower_state_changed - Dispatch event about lower device state change
8431 * @lower_dev: device
8432 * @lower_state_info: state to dispatch
8433 *
8434 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8435 * The caller must hold the RTNL lock.
8436 */
8437void netdev_lower_state_changed(struct net_device *lower_dev,
8438 void *lower_state_info)
8439{
8440 struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8441 .info.dev = lower_dev,
8442 };
8443
8444 ASSERT_RTNL();
8445 changelowerstate_info.lower_state_info = lower_state_info;
8446 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8447 &changelowerstate_info.info);
8448}
8449EXPORT_SYMBOL(netdev_lower_state_changed);
8450
8451static void dev_change_rx_flags(struct net_device *dev, int flags)
8452{
8453 const struct net_device_ops *ops = dev->netdev_ops;
8454
8455 if (ops->ndo_change_rx_flags)
8456 ops->ndo_change_rx_flags(dev, flags);
8457}
8458
8459static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8460{
8461 unsigned int old_flags = dev->flags;
8462 kuid_t uid;
8463 kgid_t gid;
8464
8465 ASSERT_RTNL();
8466
8467 dev->flags |= IFF_PROMISC;
8468 dev->promiscuity += inc;
8469 if (dev->promiscuity == 0) {
8470 /*
8471 * Avoid overflow.
8472 * If inc causes overflow, leave promisc untouched and return an error.
8473 */
8474 if (inc < 0)
8475 dev->flags &= ~IFF_PROMISC;
8476 else {
8477 dev->promiscuity -= inc;
8478 netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
8479 return -EOVERFLOW;
8480 }
8481 }
8482 if (dev->flags != old_flags) {
8483 netdev_info(dev, "%s promiscuous mode\n",
8484 dev->flags & IFF_PROMISC ? "entered" : "left");
8485 if (audit_enabled) {
8486 current_uid_gid(&uid, &gid);
8487 audit_log(audit_context(), GFP_ATOMIC,
8488 AUDIT_ANOM_PROMISCUOUS,
8489 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8490 dev->name, (dev->flags & IFF_PROMISC),
8491 (old_flags & IFF_PROMISC),
8492 from_kuid(&init_user_ns, audit_get_loginuid(current)),
8493 from_kuid(&init_user_ns, uid),
8494 from_kgid(&init_user_ns, gid),
8495 audit_get_sessionid(current));
8496 }
8497
8498 dev_change_rx_flags(dev, IFF_PROMISC);
8499 }
8500 if (notify)
8501 __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
8502 return 0;
8503}
8504
8505/**
8506 * dev_set_promiscuity - update promiscuity count on a device
8507 * @dev: device
8508 * @inc: modifier
8509 *
8510 * Add or remove promiscuity from a device. While the count in the device
8511 * remains above zero the interface remains promiscuous. Once it hits zero
8512 * the device reverts back to normal filtering operation. A negative inc
8513 * value is used to drop promiscuity on the device.
8514 * Return 0 if successful or a negative errno code on error.
8515 */
8516int dev_set_promiscuity(struct net_device *dev, int inc)
8517{
8518 unsigned int old_flags = dev->flags;
8519 int err;
8520
8521 err = __dev_set_promiscuity(dev, inc, true);
8522 if (err < 0)
8523 return err;
8524 if (dev->flags != old_flags)
8525 dev_set_rx_mode(dev);
8526 return err;
8527}
8528EXPORT_SYMBOL(dev_set_promiscuity);
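
/*
 * Illustrative sketch, not part of the upstream file: a feature that must see
 * all traffic while active takes one promiscuity reference when it starts and
 * drops it again when it stops. The function names are hypothetical; both
 * paths run under the RTNL lock.
 */
static int __maybe_unused example_feature_start(struct net_device *dev)
{
        /* +1: the interface stays promiscuous while this reference is held. */
        return dev_set_promiscuity(dev, 1);
}

static void __maybe_unused example_feature_stop(struct net_device *dev)
{
        /* -1: once the count drops back to zero, normal filtering resumes. */
        dev_set_promiscuity(dev, -1);
}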
8529
8530static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8531{
8532 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8533
8534 ASSERT_RTNL();
8535
8536 dev->flags |= IFF_ALLMULTI;
8537 dev->allmulti += inc;
8538 if (dev->allmulti == 0) {
8539 /*
8540 * Avoid overflow.
8541 * If inc causes overflow, leave allmulti untouched and return an error.
8542 */
8543 if (inc < 0)
8544 dev->flags &= ~IFF_ALLMULTI;
8545 else {
8546 dev->allmulti -= inc;
8547 netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
8548 return -EOVERFLOW;
8549 }
8550 }
8551 if (dev->flags ^ old_flags) {
8552 netdev_info(dev, "%s allmulticast mode\n",
8553 dev->flags & IFF_ALLMULTI ? "entered" : "left");
8554 dev_change_rx_flags(dev, IFF_ALLMULTI);
8555 dev_set_rx_mode(dev);
8556 if (notify)
8557 __dev_notify_flags(dev, old_flags,
8558 dev->gflags ^ old_gflags, 0, NULL);
8559 }
8560 return 0;
8561}
8562
8563/**
8564 * dev_set_allmulti - update allmulti count on a device
8565 * @dev: device
8566 * @inc: modifier
8567 *
8568 * Add or remove reception of all multicast frames to a device. While the
8569 * count in the device remains above zero the interface keeps listening
8570 * to all multicast frames. Once it hits zero the device reverts back to normal
8571 * filtering operation. A negative @inc value is used to drop the counter
8572 * when releasing a resource needing all multicasts.
8573 * Return 0 if successful or a negative errno code on error.
8574 */
8575
8576int dev_set_allmulti(struct net_device *dev, int inc)
8577{
8578 return __dev_set_allmulti(dev, inc, true);
8579}
8580EXPORT_SYMBOL(dev_set_allmulti);
8581
8582/*
8583 * Upload unicast and multicast address lists to device and
8584 * configure RX filtering. When the device doesn't support unicast
8585 * filtering it is put in promiscuous mode while unicast addresses
8586 * are present.
8587 */
8588void __dev_set_rx_mode(struct net_device *dev)
8589{
8590 const struct net_device_ops *ops = dev->netdev_ops;
8591
8592 /* dev_open will call this function so the list will stay sane. */
8593 if (!(dev->flags&IFF_UP))
8594 return;
8595
8596 if (!netif_device_present(dev))
8597 return;
8598
8599 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8600	/* Unicast address changes may only happen under the rtnl,
8601 * therefore calling __dev_set_promiscuity here is safe.
8602 */
8603 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8604 __dev_set_promiscuity(dev, 1, false);
8605 dev->uc_promisc = true;
8606 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8607 __dev_set_promiscuity(dev, -1, false);
8608 dev->uc_promisc = false;
8609 }
8610 }
8611
8612 if (ops->ndo_set_rx_mode)
8613 ops->ndo_set_rx_mode(dev);
8614}
8615
8616void dev_set_rx_mode(struct net_device *dev)
8617{
8618 netif_addr_lock_bh(dev);
8619 __dev_set_rx_mode(dev);
8620 netif_addr_unlock_bh(dev);
8621}
8622
8623/**
8624 * dev_get_flags - get flags reported to userspace
8625 * @dev: device
8626 *
8627 * Get the combination of flag bits exported through APIs to userspace.
8628 */
8629unsigned int dev_get_flags(const struct net_device *dev)
8630{
8631 unsigned int flags;
8632
8633 flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
8634 IFF_ALLMULTI |
8635 IFF_RUNNING |
8636 IFF_LOWER_UP |
8637 IFF_DORMANT)) |
8638 (READ_ONCE(dev->gflags) & (IFF_PROMISC |
8639 IFF_ALLMULTI));
8640
8641 if (netif_running(dev)) {
8642 if (netif_oper_up(dev))
8643 flags |= IFF_RUNNING;
8644 if (netif_carrier_ok(dev))
8645 flags |= IFF_LOWER_UP;
8646 if (netif_dormant(dev))
8647 flags |= IFF_DORMANT;
8648 }
8649
8650 return flags;
8651}
8652EXPORT_SYMBOL(dev_get_flags);
8653
8654int __dev_change_flags(struct net_device *dev, unsigned int flags,
8655 struct netlink_ext_ack *extack)
8656{
8657 unsigned int old_flags = dev->flags;
8658 int ret;
8659
8660 ASSERT_RTNL();
8661
8662 /*
8663 * Set the flags on our device.
8664 */
8665
8666 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8667 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8668 IFF_AUTOMEDIA)) |
8669 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8670 IFF_ALLMULTI));
8671
8672 /*
8673 * Load in the correct multicast list now the flags have changed.
8674 */
8675
8676 if ((old_flags ^ flags) & IFF_MULTICAST)
8677 dev_change_rx_flags(dev, IFF_MULTICAST);
8678
8679 dev_set_rx_mode(dev);
8680
8681 /*
8682	 * Have we downed the interface? We handle IFF_UP ourselves
8683 * according to user attempts to set it, rather than blindly
8684 * setting it.
8685 */
8686
8687 ret = 0;
8688 if ((old_flags ^ flags) & IFF_UP) {
8689 if (old_flags & IFF_UP)
8690 __dev_close(dev);
8691 else
8692 ret = __dev_open(dev, extack);
8693 }
8694
8695 if ((flags ^ dev->gflags) & IFF_PROMISC) {
8696 int inc = (flags & IFF_PROMISC) ? 1 : -1;
8697 unsigned int old_flags = dev->flags;
8698
8699 dev->gflags ^= IFF_PROMISC;
8700
8701 if (__dev_set_promiscuity(dev, inc, false) >= 0)
8702 if (dev->flags != old_flags)
8703 dev_set_rx_mode(dev);
8704 }
8705
8706 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8707	 * is important. Some (broken) drivers set IFF_PROMISC when
8708	 * IFF_ALLMULTI is requested, without asking us and without reporting it.
8709 */
8710 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8711 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8712
8713 dev->gflags ^= IFF_ALLMULTI;
8714 __dev_set_allmulti(dev, inc, false);
8715 }
8716
8717 return ret;
8718}
8719
8720void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8721 unsigned int gchanges, u32 portid,
8722 const struct nlmsghdr *nlh)
8723{
8724 unsigned int changes = dev->flags ^ old_flags;
8725
8726 if (gchanges)
8727 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
8728
8729 if (changes & IFF_UP) {
8730 if (dev->flags & IFF_UP)
8731 call_netdevice_notifiers(NETDEV_UP, dev);
8732 else
8733 call_netdevice_notifiers(NETDEV_DOWN, dev);
8734 }
8735
8736 if (dev->flags & IFF_UP &&
8737 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8738 struct netdev_notifier_change_info change_info = {
8739 .info = {
8740 .dev = dev,
8741 },
8742 .flags_changed = changes,
8743 };
8744
8745 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8746 }
8747}
8748
8749/**
8750 * dev_change_flags - change device settings
8751 * @dev: device
8752 * @flags: device state flags
8753 * @extack: netlink extended ack
8754 *
8755 * Change settings on a device based on state flags. The flags are
8756 * in the userspace exported format.
8757 */
8758int dev_change_flags(struct net_device *dev, unsigned int flags,
8759 struct netlink_ext_ack *extack)
8760{
8761 int ret;
8762 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8763
8764 ret = __dev_change_flags(dev, flags, extack);
8765 if (ret < 0)
8766 return ret;
8767
8768 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8769 __dev_notify_flags(dev, old_flags, changes, 0, NULL);
8770 return ret;
8771}
8772EXPORT_SYMBOL(dev_change_flags);
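
/*
 * Illustrative sketch, not part of the upstream file: administratively
 * bringing an interface up by setting IFF_UP in the userspace-visible flag
 * format, much as the SIOCSIFFLAGS ioctl path ends up doing. The function
 * name is hypothetical; the caller must hold the RTNL lock.
 */
static int __maybe_unused example_bring_up(struct net_device *dev,
                                           struct netlink_ext_ack *extack)
{
        unsigned int flags = dev_get_flags(dev) | IFF_UP;

        return dev_change_flags(dev, flags, extack);
}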
8773
8774int __dev_set_mtu(struct net_device *dev, int new_mtu)
8775{
8776 const struct net_device_ops *ops = dev->netdev_ops;
8777
8778 if (ops->ndo_change_mtu)
8779 return ops->ndo_change_mtu(dev, new_mtu);
8780
8781 /* Pairs with all the lockless reads of dev->mtu in the stack */
8782 WRITE_ONCE(dev->mtu, new_mtu);
8783 return 0;
8784}
8785EXPORT_SYMBOL(__dev_set_mtu);
8786
8787int dev_validate_mtu(struct net_device *dev, int new_mtu,
8788 struct netlink_ext_ack *extack)
8789{
8790 /* MTU must be positive, and in range */
8791 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8792 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8793 return -EINVAL;
8794 }
8795
8796 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8797 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8798 return -EINVAL;
8799 }
8800 return 0;
8801}
8802
8803/**
8804 * dev_set_mtu_ext - Change maximum transfer unit
8805 * @dev: device
8806 * @new_mtu: new transfer unit
8807 * @extack: netlink extended ack
8808 *
8809 * Change the maximum transfer size of the network device.
8810 */
8811int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8812 struct netlink_ext_ack *extack)
8813{
8814 int err, orig_mtu;
8815
8816 if (new_mtu == dev->mtu)
8817 return 0;
8818
8819 err = dev_validate_mtu(dev, new_mtu, extack);
8820 if (err)
8821 return err;
8822
8823 if (!netif_device_present(dev))
8824 return -ENODEV;
8825
8826 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8827 err = notifier_to_errno(err);
8828 if (err)
8829 return err;
8830
8831 orig_mtu = dev->mtu;
8832 err = __dev_set_mtu(dev, new_mtu);
8833
8834 if (!err) {
8835 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8836 orig_mtu);
8837 err = notifier_to_errno(err);
8838 if (err) {
8839 /* setting mtu back and notifying everyone again,
8840 * so that they have a chance to revert changes.
8841 */
8842 __dev_set_mtu(dev, orig_mtu);
8843 call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8844 new_mtu);
8845 }
8846 }
8847 return err;
8848}
8849
8850int dev_set_mtu(struct net_device *dev, int new_mtu)
8851{
8852 struct netlink_ext_ack extack;
8853 int err;
8854
8855 memset(&extack, 0, sizeof(extack));
8856 err = dev_set_mtu_ext(dev, new_mtu, &extack);
8857 if (err && extack._msg)
8858 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8859 return err;
8860}
8861EXPORT_SYMBOL(dev_set_mtu);
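
/*
 * Illustrative sketch, not part of the upstream file: lowering a device's MTU
 * to leave room for a hypothetical tunnel encapsulation overhead. Range
 * validation and notifier handling are done by dev_set_mtu() itself. The
 * macro and function names are hypothetical; the caller must hold RTNL.
 */
#define EXAMPLE_ENCAP_OVERHEAD 50       /* hypothetical per-packet overhead */

static int __maybe_unused example_shrink_mtu(struct net_device *dev)
{
        return dev_set_mtu(dev, dev->mtu - EXAMPLE_ENCAP_OVERHEAD);
}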
8862
8863/**
8864 * dev_change_tx_queue_len - Change TX queue length of a netdevice
8865 * @dev: device
8866 * @new_len: new tx queue length
8867 */
8868int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8869{
8870 unsigned int orig_len = dev->tx_queue_len;
8871 int res;
8872
8873 if (new_len != (unsigned int)new_len)
8874 return -ERANGE;
8875
8876 if (new_len != orig_len) {
8877 dev->tx_queue_len = new_len;
8878 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8879 res = notifier_to_errno(res);
8880 if (res)
8881 goto err_rollback;
8882 res = dev_qdisc_change_tx_queue_len(dev);
8883 if (res)
8884 goto err_rollback;
8885 }
8886
8887 return 0;
8888
8889err_rollback:
8890 netdev_err(dev, "refused to change device tx_queue_len\n");
8891 dev->tx_queue_len = orig_len;
8892 return res;
8893}
8894
8895/**
8896 * dev_set_group - Change group this device belongs to
8897 * @dev: device
8898 * @new_group: group this device should belong to
8899 */
8900void dev_set_group(struct net_device *dev, int new_group)
8901{
8902 dev->group = new_group;
8903}
8904
8905/**
8906 * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8907 * @dev: device
8908 * @addr: new address
8909 * @extack: netlink extended ack
8910 */
8911int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8912 struct netlink_ext_ack *extack)
8913{
8914 struct netdev_notifier_pre_changeaddr_info info = {
8915 .info.dev = dev,
8916 .info.extack = extack,
8917 .dev_addr = addr,
8918 };
8919 int rc;
8920
8921 rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8922 return notifier_to_errno(rc);
8923}
8924EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8925
8926/**
8927 * dev_set_mac_address - Change Media Access Control Address
8928 * @dev: device
8929 * @sa: new address
8930 * @extack: netlink extended ack
8931 *
8932 * Change the hardware (MAC) address of the device
8933 */
8934int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8935 struct netlink_ext_ack *extack)
8936{
8937 const struct net_device_ops *ops = dev->netdev_ops;
8938 int err;
8939
8940 if (!ops->ndo_set_mac_address)
8941 return -EOPNOTSUPP;
8942 if (sa->sa_family != dev->type)
8943 return -EINVAL;
8944 if (!netif_device_present(dev))
8945 return -ENODEV;
8946 err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8947 if (err)
8948 return err;
8949 if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
8950 err = ops->ndo_set_mac_address(dev, sa);
8951 if (err)
8952 return err;
8953 }
8954 dev->addr_assign_type = NET_ADDR_SET;
8955 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8956 add_device_randomness(dev->dev_addr, dev->addr_len);
8957 return 0;
8958}
8959EXPORT_SYMBOL(dev_set_mac_address);
8960
8961DECLARE_RWSEM(dev_addr_sem);
8962
8963int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8964 struct netlink_ext_ack *extack)
8965{
8966 int ret;
8967
8968 down_write(&dev_addr_sem);
8969 ret = dev_set_mac_address(dev, sa, extack);
8970 up_write(&dev_addr_sem);
8971 return ret;
8972}
8973EXPORT_SYMBOL(dev_set_mac_address_user);
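
/*
 * Illustrative sketch, not part of the upstream file: programming a MAC
 * address from a raw ETH_ALEN byte array, assuming an Ethernet-like device.
 * The sockaddr family must match dev->type, exactly as dev_set_mac_address()
 * checks above. The function name is hypothetical; the caller holds RTNL.
 */
static int __maybe_unused example_set_mac(struct net_device *dev,
                                          const u8 *addr,
                                          struct netlink_ext_ack *extack)
{
        struct sockaddr sa;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, addr, ETH_ALEN);

        return dev_set_mac_address_user(dev, &sa, extack);
}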
8974
8975int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8976{
8977 size_t size = sizeof(sa->sa_data_min);
8978 struct net_device *dev;
8979 int ret = 0;
8980
8981 down_read(&dev_addr_sem);
8982 rcu_read_lock();
8983
8984 dev = dev_get_by_name_rcu(net, dev_name);
8985 if (!dev) {
8986 ret = -ENODEV;
8987 goto unlock;
8988 }
8989 if (!dev->addr_len)
8990 memset(sa->sa_data, 0, size);
8991 else
8992 memcpy(sa->sa_data, dev->dev_addr,
8993 min_t(size_t, size, dev->addr_len));
8994 sa->sa_family = dev->type;
8995
8996unlock:
8997 rcu_read_unlock();
8998 up_read(&dev_addr_sem);
8999 return ret;
9000}
9001EXPORT_SYMBOL(dev_get_mac_address);
9002
9003/**
9004 * dev_change_carrier - Change device carrier
9005 * @dev: device
9006 * @new_carrier: new value
9007 *
9008 * Change device carrier
9009 */
9010int dev_change_carrier(struct net_device *dev, bool new_carrier)
9011{
9012 const struct net_device_ops *ops = dev->netdev_ops;
9013
9014 if (!ops->ndo_change_carrier)
9015 return -EOPNOTSUPP;
9016 if (!netif_device_present(dev))
9017 return -ENODEV;
9018 return ops->ndo_change_carrier(dev, new_carrier);
9019}
9020
9021/**
9022 * dev_get_phys_port_id - Get device physical port ID
9023 * @dev: device
9024 * @ppid: port ID
9025 *
9026 * Get device physical port ID
9027 */
9028int dev_get_phys_port_id(struct net_device *dev,
9029 struct netdev_phys_item_id *ppid)
9030{
9031 const struct net_device_ops *ops = dev->netdev_ops;
9032
9033 if (!ops->ndo_get_phys_port_id)
9034 return -EOPNOTSUPP;
9035 return ops->ndo_get_phys_port_id(dev, ppid);
9036}
9037
9038/**
9039 * dev_get_phys_port_name - Get device physical port name
9040 * @dev: device
9041 * @name: port name
9042 * @len: limit of bytes to copy to name
9043 *
9044 * Get device physical port name
9045 */
9046int dev_get_phys_port_name(struct net_device *dev,
9047 char *name, size_t len)
9048{
9049 const struct net_device_ops *ops = dev->netdev_ops;
9050 int err;
9051
9052 if (ops->ndo_get_phys_port_name) {
9053 err = ops->ndo_get_phys_port_name(dev, name, len);
9054 if (err != -EOPNOTSUPP)
9055 return err;
9056 }
9057 return devlink_compat_phys_port_name_get(dev, name, len);
9058}
9059
9060/**
9061 * dev_get_port_parent_id - Get the device's port parent identifier
9062 * @dev: network device
9063 * @ppid: pointer to a storage for the port's parent identifier
9064 * @recurse: allow/disallow recursion to lower devices
9065 *
9066 * Get the device's port parent identifier
9067 */
9068int dev_get_port_parent_id(struct net_device *dev,
9069 struct netdev_phys_item_id *ppid,
9070 bool recurse)
9071{
9072 const struct net_device_ops *ops = dev->netdev_ops;
9073 struct netdev_phys_item_id first = { };
9074 struct net_device *lower_dev;
9075 struct list_head *iter;
9076 int err;
9077
9078 if (ops->ndo_get_port_parent_id) {
9079 err = ops->ndo_get_port_parent_id(dev, ppid);
9080 if (err != -EOPNOTSUPP)
9081 return err;
9082 }
9083
9084 err = devlink_compat_switch_id_get(dev, ppid);
9085 if (!recurse || err != -EOPNOTSUPP)
9086 return err;
9087
9088 netdev_for_each_lower_dev(dev, lower_dev, iter) {
9089 err = dev_get_port_parent_id(lower_dev, ppid, true);
9090 if (err)
9091 break;
9092 if (!first.id_len)
9093 first = *ppid;
9094 else if (memcmp(&first, ppid, sizeof(*ppid)))
9095 return -EOPNOTSUPP;
9096 }
9097
9098 return err;
9099}
9100EXPORT_SYMBOL(dev_get_port_parent_id);
9101
9102/**
9103 * netdev_port_same_parent_id - Indicate if two network devices have
9104 * the same port parent identifier
9105 * @a: first network device
9106 * @b: second network device
9107 */
9108bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
9109{
9110 struct netdev_phys_item_id a_id = { };
9111 struct netdev_phys_item_id b_id = { };
9112
9113 if (dev_get_port_parent_id(a, &a_id, true) ||
9114 dev_get_port_parent_id(b, &b_id, true))
9115 return false;
9116
9117 return netdev_phys_item_id_same(&a_id, &b_id);
9118}
9119EXPORT_SYMBOL(netdev_port_same_parent_id);
9120
9121/**
9122 * dev_change_proto_down - set carrier according to proto_down.
9123 *
9124 * @dev: device
9125 * @proto_down: new value
9126 */
9127int dev_change_proto_down(struct net_device *dev, bool proto_down)
9128{
9129 if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
9130 return -EOPNOTSUPP;
9131 if (!netif_device_present(dev))
9132 return -ENODEV;
9133 if (proto_down)
9134 netif_carrier_off(dev);
9135 else
9136 netif_carrier_on(dev);
9137 dev->proto_down = proto_down;
9138 return 0;
9139}
9140
9141/**
9142 * dev_change_proto_down_reason - update the proto_down reason bits
9143 *
9144 * @dev: device
9145 * @mask: proto down mask
9146 * @value: proto down value
9147 */
9148void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
9149 u32 value)
9150{
9151 int b;
9152
9153 if (!mask) {
9154 dev->proto_down_reason = value;
9155 } else {
9156 for_each_set_bit(b, &mask, 32) {
9157 if (value & (1 << b))
9158 dev->proto_down_reason |= BIT(b);
9159 else
9160 dev->proto_down_reason &= ~BIT(b);
9161 }
9162 }
9163}
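
/*
 * Illustrative sketch, not part of the upstream file: setting a single
 * proto_down reason bit while leaving the others untouched, by passing a
 * mask that selects only that bit. The bit number used here is arbitrary.
 */
static void __maybe_unused example_mark_reason(struct net_device *dev)
{
        /* Only bit 2 is set in the mask, so only bit 2 of the reason changes. */
        dev_change_proto_down_reason(dev, BIT(2), BIT(2));
}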
9164
9165struct bpf_xdp_link {
9166 struct bpf_link link;
9167 struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9168 int flags;
9169};
9170
9171static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9172{
9173 if (flags & XDP_FLAGS_HW_MODE)
9174 return XDP_MODE_HW;
9175 if (flags & XDP_FLAGS_DRV_MODE)
9176 return XDP_MODE_DRV;
9177 if (flags & XDP_FLAGS_SKB_MODE)
9178 return XDP_MODE_SKB;
9179 return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9180}
9181
9182static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9183{
9184 switch (mode) {
9185 case XDP_MODE_SKB:
9186 return generic_xdp_install;
9187 case XDP_MODE_DRV:
9188 case XDP_MODE_HW:
9189 return dev->netdev_ops->ndo_bpf;
9190 default:
9191 return NULL;
9192 }
9193}
9194
9195static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9196 enum bpf_xdp_mode mode)
9197{
9198 return dev->xdp_state[mode].link;
9199}
9200
9201static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9202 enum bpf_xdp_mode mode)
9203{
9204 struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9205
9206 if (link)
9207 return link->link.prog;
9208 return dev->xdp_state[mode].prog;
9209}
9210
9211u8 dev_xdp_prog_count(struct net_device *dev)
9212{
9213 u8 count = 0;
9214 int i;
9215
9216 for (i = 0; i < __MAX_XDP_MODE; i++)
9217 if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9218 count++;
9219 return count;
9220}
9221EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
9222
9223u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9224{
9225 struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9226
9227 return prog ? prog->aux->id : 0;
9228}
9229
9230static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9231 struct bpf_xdp_link *link)
9232{
9233 dev->xdp_state[mode].link = link;
9234 dev->xdp_state[mode].prog = NULL;
9235}
9236
9237static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9238 struct bpf_prog *prog)
9239{
9240 dev->xdp_state[mode].link = NULL;
9241 dev->xdp_state[mode].prog = prog;
9242}
9243
9244static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9245 bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9246 u32 flags, struct bpf_prog *prog)
9247{
9248 struct netdev_bpf xdp;
9249 int err;
9250
9251 memset(&xdp, 0, sizeof(xdp));
9252 xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9253 xdp.extack = extack;
9254 xdp.flags = flags;
9255 xdp.prog = prog;
9256
9257	/* Drivers assume refcnt is already incremented (i.e., prog pointer is
9258 * "moved" into driver), so they don't increment it on their own, but
9259 * they do decrement refcnt when program is detached or replaced.
9260 * Given net_device also owns link/prog, we need to bump refcnt here
9261 * to prevent drivers from underflowing it.
9262 */
9263 if (prog)
9264 bpf_prog_inc(prog);
9265 err = bpf_op(dev, &xdp);
9266 if (err) {
9267 if (prog)
9268 bpf_prog_put(prog);
9269 return err;
9270 }
9271
9272 if (mode != XDP_MODE_HW)
9273 bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9274
9275 return 0;
9276}
9277
9278static void dev_xdp_uninstall(struct net_device *dev)
9279{
9280 struct bpf_xdp_link *link;
9281 struct bpf_prog *prog;
9282 enum bpf_xdp_mode mode;
9283 bpf_op_t bpf_op;
9284
9285 ASSERT_RTNL();
9286
9287 for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9288 prog = dev_xdp_prog(dev, mode);
9289 if (!prog)
9290 continue;
9291
9292 bpf_op = dev_xdp_bpf_op(dev, mode);
9293 if (!bpf_op)
9294 continue;
9295
9296 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9297
9298 /* auto-detach link from net device */
9299 link = dev_xdp_link(dev, mode);
9300 if (link)
9301 link->dev = NULL;
9302 else
9303 bpf_prog_put(prog);
9304
9305 dev_xdp_set_link(dev, mode, NULL);
9306 }
9307}
9308
9309static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9310 struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9311 struct bpf_prog *old_prog, u32 flags)
9312{
9313 unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9314 struct bpf_prog *cur_prog;
9315 struct net_device *upper;
9316 struct list_head *iter;
9317 enum bpf_xdp_mode mode;
9318 bpf_op_t bpf_op;
9319 int err;
9320
9321 ASSERT_RTNL();
9322
9323 /* either link or prog attachment, never both */
9324 if (link && (new_prog || old_prog))
9325 return -EINVAL;
9326 /* link supports only XDP mode flags */
9327 if (link && (flags & ~XDP_FLAGS_MODES)) {
9328 NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9329 return -EINVAL;
9330 }
9331 /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9332 if (num_modes > 1) {
9333 NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9334 return -EINVAL;
9335 }
9336 /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9337 if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9338 NL_SET_ERR_MSG(extack,
9339 "More than one program loaded, unset mode is ambiguous");
9340 return -EINVAL;
9341 }
9342 /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9343 if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9344 NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9345 return -EINVAL;
9346 }
9347
9348 mode = dev_xdp_mode(dev, flags);
9349 /* can't replace attached link */
9350 if (dev_xdp_link(dev, mode)) {
9351 NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9352 return -EBUSY;
9353 }
9354
9355 /* don't allow if an upper device already has a program */
9356 netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9357 if (dev_xdp_prog_count(upper) > 0) {
9358 NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9359 return -EEXIST;
9360 }
9361 }
9362
9363 cur_prog = dev_xdp_prog(dev, mode);
9364 /* can't replace attached prog with link */
9365 if (link && cur_prog) {
9366 NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9367 return -EBUSY;
9368 }
9369 if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9370 NL_SET_ERR_MSG(extack, "Active program does not match expected");
9371 return -EEXIST;
9372 }
9373
9374 /* put effective new program into new_prog */
9375 if (link)
9376 new_prog = link->link.prog;
9377
9378 if (new_prog) {
9379 bool offload = mode == XDP_MODE_HW;
9380 enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9381 ? XDP_MODE_DRV : XDP_MODE_SKB;
9382
9383 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9384 NL_SET_ERR_MSG(extack, "XDP program already attached");
9385 return -EBUSY;
9386 }
9387 if (!offload && dev_xdp_prog(dev, other_mode)) {
9388 NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9389 return -EEXIST;
9390 }
9391 if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
9392 NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
9393 return -EINVAL;
9394 }
9395 if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
9396 NL_SET_ERR_MSG(extack, "Program bound to different device");
9397 return -EINVAL;
9398 }
9399 if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9400 NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9401 return -EINVAL;
9402 }
9403 if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9404 NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9405 return -EINVAL;
9406 }
9407 }
9408
9409 /* don't call drivers if the effective program didn't change */
9410 if (new_prog != cur_prog) {
9411 bpf_op = dev_xdp_bpf_op(dev, mode);
9412 if (!bpf_op) {
9413 NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9414 return -EOPNOTSUPP;
9415 }
9416
9417 err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9418 if (err)
9419 return err;
9420 }
9421
9422 if (link)
9423 dev_xdp_set_link(dev, mode, link);
9424 else
9425 dev_xdp_set_prog(dev, mode, new_prog);
9426 if (cur_prog)
9427 bpf_prog_put(cur_prog);
9428
9429 return 0;
9430}
9431
9432static int dev_xdp_attach_link(struct net_device *dev,
9433 struct netlink_ext_ack *extack,
9434 struct bpf_xdp_link *link)
9435{
9436 return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9437}
9438
9439static int dev_xdp_detach_link(struct net_device *dev,
9440 struct netlink_ext_ack *extack,
9441 struct bpf_xdp_link *link)
9442{
9443 enum bpf_xdp_mode mode;
9444 bpf_op_t bpf_op;
9445
9446 ASSERT_RTNL();
9447
9448 mode = dev_xdp_mode(dev, link->flags);
9449 if (dev_xdp_link(dev, mode) != link)
9450 return -EINVAL;
9451
9452 bpf_op = dev_xdp_bpf_op(dev, mode);
9453 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9454 dev_xdp_set_link(dev, mode, NULL);
9455 return 0;
9456}
9457
9458static void bpf_xdp_link_release(struct bpf_link *link)
9459{
9460 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9461
9462 rtnl_lock();
9463
9464 /* if racing with net_device's tear down, xdp_link->dev might be
9465 * already NULL, in which case link was already auto-detached
9466 */
9467 if (xdp_link->dev) {
9468 WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9469 xdp_link->dev = NULL;
9470 }
9471
9472 rtnl_unlock();
9473}
9474
9475static int bpf_xdp_link_detach(struct bpf_link *link)
9476{
9477 bpf_xdp_link_release(link);
9478 return 0;
9479}
9480
9481static void bpf_xdp_link_dealloc(struct bpf_link *link)
9482{
9483 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9484
9485 kfree(xdp_link);
9486}
9487
9488static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9489 struct seq_file *seq)
9490{
9491 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9492 u32 ifindex = 0;
9493
9494 rtnl_lock();
9495 if (xdp_link->dev)
9496 ifindex = xdp_link->dev->ifindex;
9497 rtnl_unlock();
9498
9499 seq_printf(seq, "ifindex:\t%u\n", ifindex);
9500}
9501
9502static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9503 struct bpf_link_info *info)
9504{
9505 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9506 u32 ifindex = 0;
9507
9508 rtnl_lock();
9509 if (xdp_link->dev)
9510 ifindex = xdp_link->dev->ifindex;
9511 rtnl_unlock();
9512
9513 info->xdp.ifindex = ifindex;
9514 return 0;
9515}
9516
9517static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9518 struct bpf_prog *old_prog)
9519{
9520 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9521 enum bpf_xdp_mode mode;
9522 bpf_op_t bpf_op;
9523 int err = 0;
9524
9525 rtnl_lock();
9526
9527 /* link might have been auto-released already, so fail */
9528 if (!xdp_link->dev) {
9529 err = -ENOLINK;
9530 goto out_unlock;
9531 }
9532
9533 if (old_prog && link->prog != old_prog) {
9534 err = -EPERM;
9535 goto out_unlock;
9536 }
9537 old_prog = link->prog;
9538 if (old_prog->type != new_prog->type ||
9539 old_prog->expected_attach_type != new_prog->expected_attach_type) {
9540 err = -EINVAL;
9541 goto out_unlock;
9542 }
9543
9544 if (old_prog == new_prog) {
9545 /* no-op, don't disturb drivers */
9546 bpf_prog_put(new_prog);
9547 goto out_unlock;
9548 }
9549
9550 mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9551 bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9552 err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9553 xdp_link->flags, new_prog);
9554 if (err)
9555 goto out_unlock;
9556
9557 old_prog = xchg(&link->prog, new_prog);
9558 bpf_prog_put(old_prog);
9559
9560out_unlock:
9561 rtnl_unlock();
9562 return err;
9563}
9564
9565static const struct bpf_link_ops bpf_xdp_link_lops = {
9566 .release = bpf_xdp_link_release,
9567 .dealloc = bpf_xdp_link_dealloc,
9568 .detach = bpf_xdp_link_detach,
9569 .show_fdinfo = bpf_xdp_link_show_fdinfo,
9570 .fill_link_info = bpf_xdp_link_fill_link_info,
9571 .update_prog = bpf_xdp_link_update,
9572};
9573
9574int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9575{
9576 struct net *net = current->nsproxy->net_ns;
9577 struct bpf_link_primer link_primer;
9578 struct netlink_ext_ack extack = {};
9579 struct bpf_xdp_link *link;
9580 struct net_device *dev;
9581 int err, fd;
9582
9583 rtnl_lock();
9584 dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9585 if (!dev) {
9586 rtnl_unlock();
9587 return -EINVAL;
9588 }
9589
9590 link = kzalloc(sizeof(*link), GFP_USER);
9591 if (!link) {
9592 err = -ENOMEM;
9593 goto unlock;
9594 }
9595
9596 bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9597 link->dev = dev;
9598 link->flags = attr->link_create.flags;
9599
9600 err = bpf_link_prime(&link->link, &link_primer);
9601 if (err) {
9602 kfree(link);
9603 goto unlock;
9604 }
9605
9606 err = dev_xdp_attach_link(dev, &extack, link);
9607 rtnl_unlock();
9608
9609 if (err) {
9610 link->dev = NULL;
9611 bpf_link_cleanup(&link_primer);
9612 trace_bpf_xdp_link_attach_failed(extack._msg);
9613 goto out_put_dev;
9614 }
9615
9616 fd = bpf_link_settle(&link_primer);
9617 /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9618 dev_put(dev);
9619 return fd;
9620
9621unlock:
9622 rtnl_unlock();
9623
9624out_put_dev:
9625 dev_put(dev);
9626 return err;
9627}
9628
9629/**
9630 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
9631 * @dev: device
9632 * @extack: netlink extended ack
9633 * @fd: new program fd or negative value to clear
9634 * @expected_fd: old program fd that userspace expects to replace or clear
9635 * @flags: xdp-related flags
9636 *
9637 * Set or clear a bpf program for a device
9638 */
9639int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9640 int fd, int expected_fd, u32 flags)
9641{
9642 enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9643 struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9644 int err;
9645
9646 ASSERT_RTNL();
9647
9648 if (fd >= 0) {
9649 new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9650 mode != XDP_MODE_SKB);
9651 if (IS_ERR(new_prog))
9652 return PTR_ERR(new_prog);
9653 }
9654
9655 if (expected_fd >= 0) {
9656 old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9657 mode != XDP_MODE_SKB);
9658 if (IS_ERR(old_prog)) {
9659 err = PTR_ERR(old_prog);
9660 old_prog = NULL;
9661 goto err_out;
9662 }
9663 }
9664
9665 err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9666
9667err_out:
9668 if (err && new_prog)
9669 bpf_prog_put(new_prog);
9670 if (old_prog)
9671 bpf_prog_put(old_prog);
9672 return err;
9673}
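
/*
 * Illustrative sketch, not part of the upstream file: attaching an XDP
 * program by fd in generic (skb) mode and detaching it again by passing a
 * negative fd. "prog_fd" is a hypothetical fd obtained from the BPF syscall;
 * the function names are hypothetical and the caller must hold RTNL.
 */
static int __maybe_unused example_attach_generic_xdp(struct net_device *dev,
                                                     struct netlink_ext_ack *extack,
                                                     int prog_fd)
{
        /* expected_fd < 0: no XDP_FLAGS_REPLACE-style expectation. */
        return dev_change_xdp_fd(dev, extack, prog_fd, -1, XDP_FLAGS_SKB_MODE);
}

static int __maybe_unused example_detach_generic_xdp(struct net_device *dev,
                                                     struct netlink_ext_ack *extack)
{
        /* fd < 0 clears whatever generic-mode program is installed. */
        return dev_change_xdp_fd(dev, extack, -1, -1, XDP_FLAGS_SKB_MODE);
}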
9674
9675/**
9676 * dev_index_reserve() - allocate an ifindex in a namespace
9677 * @net: the applicable net namespace
9678 * @ifindex: requested ifindex, pass %0 to get one allocated
9679 *
9680 * Allocate an ifindex for a new device. Caller must either use the ifindex
9681 * to store the device (via list_netdevice()) or call dev_index_release()
9682 * to give the index up.
9683 *
9684 * Return: a suitable unique value for a new device interface number or -errno.
9685 */
9686static int dev_index_reserve(struct net *net, u32 ifindex)
9687{
9688 int err;
9689
9690 if (ifindex > INT_MAX) {
9691 DEBUG_NET_WARN_ON_ONCE(1);
9692 return -EINVAL;
9693 }
9694
9695 if (!ifindex)
9696 err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
9697 xa_limit_31b, &net->ifindex, GFP_KERNEL);
9698 else
9699 err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
9700 if (err < 0)
9701 return err;
9702
9703 return ifindex;
9704}
9705
9706static void dev_index_release(struct net *net, int ifindex)
9707{
9708	/* Expect only unused indexes, unlist_netdevice() removes the used ones */
9709 WARN_ON(xa_erase(&net->dev_by_index, ifindex));
9710}
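
/*
 * Illustrative sketch, not part of the upstream file: the intended calling
 * pattern for the two helpers above - reserve an index (0 means "pick one"),
 * then either publish a device under it or hand the index back on failure.
 * The function name is hypothetical.
 */
static int __maybe_unused example_claim_ifindex(struct net *net)
{
        int ifindex = dev_index_reserve(net, 0);

        if (ifindex < 0)
                return ifindex;

        /* ... list_netdevice() would normally store the device here ... */

        dev_index_release(net, ifindex);        /* nothing was stored after all */
        return 0;
}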
9711
9712/* Delayed registration/unregistration */
9713LIST_HEAD(net_todo_list);
9714DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9715atomic_t dev_unreg_count = ATOMIC_INIT(0);
9716
9717static void net_set_todo(struct net_device *dev)
9718{
9719 list_add_tail(&dev->todo_list, &net_todo_list);
9720}
9721
9722static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9723 struct net_device *upper, netdev_features_t features)
9724{
9725 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9726 netdev_features_t feature;
9727 int feature_bit;
9728
9729 for_each_netdev_feature(upper_disables, feature_bit) {
9730 feature = __NETIF_F_BIT(feature_bit);
9731 if (!(upper->wanted_features & feature)
9732 && (features & feature)) {
9733 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9734 &feature, upper->name);
9735 features &= ~feature;
9736 }
9737 }
9738
9739 return features;
9740}
9741
9742static void netdev_sync_lower_features(struct net_device *upper,
9743 struct net_device *lower, netdev_features_t features)
9744{
9745 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9746 netdev_features_t feature;
9747 int feature_bit;
9748
9749 for_each_netdev_feature(upper_disables, feature_bit) {
9750 feature = __NETIF_F_BIT(feature_bit);
9751 if (!(features & feature) && (lower->features & feature)) {
9752 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9753 &feature, lower->name);
9754 lower->wanted_features &= ~feature;
9755 __netdev_update_features(lower);
9756
9757 if (unlikely(lower->features & feature))
9758 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9759 &feature, lower->name);
9760 else
9761 netdev_features_change(lower);
9762 }
9763 }
9764}
9765
9766static netdev_features_t netdev_fix_features(struct net_device *dev,
9767 netdev_features_t features)
9768{
9769 /* Fix illegal checksum combinations */
9770 if ((features & NETIF_F_HW_CSUM) &&
9771 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9772 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9773 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9774 }
9775
9776 /* TSO requires that SG is present as well. */
9777 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9778 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9779 features &= ~NETIF_F_ALL_TSO;
9780 }
9781
9782 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9783 !(features & NETIF_F_IP_CSUM)) {
9784 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9785 features &= ~NETIF_F_TSO;
9786 features &= ~NETIF_F_TSO_ECN;
9787 }
9788
9789 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9790 !(features & NETIF_F_IPV6_CSUM)) {
9791 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9792 features &= ~NETIF_F_TSO6;
9793 }
9794
9795 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9796 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9797 features &= ~NETIF_F_TSO_MANGLEID;
9798
9799 /* TSO ECN requires that TSO is present as well. */
9800 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9801 features &= ~NETIF_F_TSO_ECN;
9802
9803 /* Software GSO depends on SG. */
9804 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9805 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9806 features &= ~NETIF_F_GSO;
9807 }
9808
9809 /* GSO partial features require GSO partial be set */
9810 if ((features & dev->gso_partial_features) &&
9811 !(features & NETIF_F_GSO_PARTIAL)) {
9812 netdev_dbg(dev,
9813 "Dropping partially supported GSO features since no GSO partial.\n");
9814 features &= ~dev->gso_partial_features;
9815 }
9816
9817 if (!(features & NETIF_F_RXCSUM)) {
9818 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9819 * successfully merged by hardware must also have the
9820 * checksum verified by hardware. If the user does not
9821 * want to enable RXCSUM, logically, we should disable GRO_HW.
9822 */
9823 if (features & NETIF_F_GRO_HW) {
9824 netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9825 features &= ~NETIF_F_GRO_HW;
9826 }
9827 }
9828
9829 /* LRO/HW-GRO features cannot be combined with RX-FCS */
9830 if (features & NETIF_F_RXFCS) {
9831 if (features & NETIF_F_LRO) {
9832 netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9833 features &= ~NETIF_F_LRO;
9834 }
9835
9836 if (features & NETIF_F_GRO_HW) {
9837 netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9838 features &= ~NETIF_F_GRO_HW;
9839 }
9840 }
9841
9842 if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
9843 netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
9844 features &= ~NETIF_F_LRO;
9845 }
9846
9847 if (features & NETIF_F_HW_TLS_TX) {
9848 bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
9849 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
9850 bool hw_csum = features & NETIF_F_HW_CSUM;
9851
9852 if (!ip_csum && !hw_csum) {
9853 netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
9854 features &= ~NETIF_F_HW_TLS_TX;
9855 }
9856 }
9857
9858 if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9859 netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9860 features &= ~NETIF_F_HW_TLS_RX;
9861 }
9862
9863 return features;
9864}
9865
9866int __netdev_update_features(struct net_device *dev)
9867{
9868 struct net_device *upper, *lower;
9869 netdev_features_t features;
9870 struct list_head *iter;
9871 int err = -1;
9872
9873 ASSERT_RTNL();
9874
9875 features = netdev_get_wanted_features(dev);
9876
9877 if (dev->netdev_ops->ndo_fix_features)
9878 features = dev->netdev_ops->ndo_fix_features(dev, features);
9879
9880 /* driver might be less strict about feature dependencies */
9881 features = netdev_fix_features(dev, features);
9882
9883 /* some features can't be enabled if they're off on an upper device */
9884 netdev_for_each_upper_dev_rcu(dev, upper, iter)
9885 features = netdev_sync_upper_features(dev, upper, features);
9886
9887 if (dev->features == features)
9888 goto sync_lower;
9889
9890 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9891 &dev->features, &features);
9892
9893 if (dev->netdev_ops->ndo_set_features)
9894 err = dev->netdev_ops->ndo_set_features(dev, features);
9895 else
9896 err = 0;
9897
9898 if (unlikely(err < 0)) {
9899 netdev_err(dev,
9900 "set_features() failed (%d); wanted %pNF, left %pNF\n",
9901 err, &features, &dev->features);
9902 /* return non-0 since some features might have changed and
9903 * it's better to fire a spurious notification than miss it
9904 */
9905 return -1;
9906 }
9907
9908sync_lower:
9909 /* some features must be disabled on lower devices when disabled
9910 * on an upper device (think: bonding master or bridge)
9911 */
9912 netdev_for_each_lower_dev(dev, lower, iter)
9913 netdev_sync_lower_features(dev, lower, features);
9914
9915 if (!err) {
9916 netdev_features_t diff = features ^ dev->features;
9917
9918 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9919 /* udp_tunnel_{get,drop}_rx_info both need
9920 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9921 * device, or they won't do anything.
9922 * Thus we need to update dev->features
9923 * *before* calling udp_tunnel_get_rx_info,
9924 * but *after* calling udp_tunnel_drop_rx_info.
9925 */
9926 if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9927 dev->features = features;
9928 udp_tunnel_get_rx_info(dev);
9929 } else {
9930 udp_tunnel_drop_rx_info(dev);
9931 }
9932 }
9933
9934 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9935 if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9936 dev->features = features;
9937 err |= vlan_get_rx_ctag_filter_info(dev);
9938 } else {
9939 vlan_drop_rx_ctag_filter_info(dev);
9940 }
9941 }
9942
9943 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9944 if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9945 dev->features = features;
9946 err |= vlan_get_rx_stag_filter_info(dev);
9947 } else {
9948 vlan_drop_rx_stag_filter_info(dev);
9949 }
9950 }
9951
9952 dev->features = features;
9953 }
9954
9955 return err < 0 ? 0 : 1;
9956}
9957
9958/**
9959 * netdev_update_features - recalculate device features
9960 * @dev: the device to check
9961 *
9962 * Recalculate the dev->features set and send notifications if it
9963 * has changed. Should be called after driver- or hardware-dependent
9964 * conditions that influence the features might have changed.
9965 */
9966void netdev_update_features(struct net_device *dev)
9967{
9968 if (__netdev_update_features(dev))
9969 netdev_features_change(dev);
9970}
9971EXPORT_SYMBOL(netdev_update_features);
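/*
 * Illustrative usage sketch (not part of this file): a driver that has just
 * learned a hardware capability changed can adjust dev->hw_features and ask
 * the core to re-evaluate. my_set_csum_capable() is a hypothetical helper;
 * per the ASSERT_RTNL() in __netdev_update_features() above, the rtnl lock
 * must be held around the update.
 *
 *	static void my_set_csum_capable(struct net_device *dev, bool csum_ok)
 *	{
 *		rtnl_lock();
 *		if (csum_ok)
 *			dev->hw_features |= NETIF_F_HW_CSUM;
 *		else
 *			dev->hw_features &= ~NETIF_F_HW_CSUM;
 *		netdev_update_features(dev);
 *		rtnl_unlock();
 *	}
 */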
9972
9973/**
9974 * netdev_change_features - recalculate device features
9975 * @dev: the device to check
9976 *
9977 * Recalculate the dev->features set and send notifications even
9978 * if it has not changed. Should be called instead of
9979 * netdev_update_features() if dev->vlan_features might also
9980 * have changed, to allow the changes to be propagated to stacked
9981 * VLAN devices.
9982 */
9983void netdev_change_features(struct net_device *dev)
9984{
9985 __netdev_update_features(dev);
9986 netdev_features_change(dev);
9987}
9988EXPORT_SYMBOL(netdev_change_features);
9989
9990/**
9991 * netif_stacked_transfer_operstate - transfer operstate
9992 * @rootdev: the root or lower level device to transfer state from
9993 * @dev: the device to transfer operstate to
9994 *
9995 * Transfer operational state from root to device. This is normally
9996 * called when a stacking relationship exists between the root
9997 * device and the device (a leaf device).
9998 */
9999void netif_stacked_transfer_operstate(const struct net_device *rootdev,
10000 struct net_device *dev)
10001{
10002 if (rootdev->operstate == IF_OPER_DORMANT)
10003 netif_dormant_on(dev);
10004 else
10005 netif_dormant_off(dev);
10006
10007 if (rootdev->operstate == IF_OPER_TESTING)
10008 netif_testing_on(dev);
10009 else
10010 netif_testing_off(dev);
10011
10012 if (netif_carrier_ok(rootdev))
10013 netif_carrier_on(dev);
10014 else
10015 netif_carrier_off(dev);
10016}
10017EXPORT_SYMBOL(netif_stacked_transfer_operstate);
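/*
 * Illustrative usage sketch (not part of this file): a stacked driver
 * mirroring its lower device's state from a netdevice notifier.
 * my_upper_from_lower() is a hypothetical lookup helper.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *lower = netdev_notifier_info_to_dev(ptr);
 *		struct net_device *upper = my_upper_from_lower(lower);
 *
 *		if (upper && event == NETDEV_CHANGE)
 *			netif_stacked_transfer_operstate(lower, upper);
 *		return NOTIFY_DONE;
 *	}
 */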
10018
10019static int netif_alloc_rx_queues(struct net_device *dev)
10020{
10021 unsigned int i, count = dev->num_rx_queues;
10022 struct netdev_rx_queue *rx;
10023 size_t sz = count * sizeof(*rx);
10024 int err = 0;
10025
10026 BUG_ON(count < 1);
10027
10028 rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10029 if (!rx)
10030 return -ENOMEM;
10031
10032 dev->_rx = rx;
10033
10034 for (i = 0; i < count; i++) {
10035 rx[i].dev = dev;
10036
10037 /* XDP RX-queue setup */
10038 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
10039 if (err < 0)
10040 goto err_rxq_info;
10041 }
10042 return 0;
10043
10044err_rxq_info:
10045 /* Roll back successful registrations and free other resources */
10046 while (i--)
10047 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10048 kvfree(dev->_rx);
10049 dev->_rx = NULL;
10050 return err;
10051}
10052
10053static void netif_free_rx_queues(struct net_device *dev)
10054{
10055 unsigned int i, count = dev->num_rx_queues;
10056
10057 /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
10058 if (!dev->_rx)
10059 return;
10060
10061 for (i = 0; i < count; i++)
10062 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10063
10064 kvfree(dev->_rx);
10065}
10066
10067static void netdev_init_one_queue(struct net_device *dev,
10068 struct netdev_queue *queue, void *_unused)
10069{
10070 /* Initialize queue lock */
10071 spin_lock_init(&queue->_xmit_lock);
10072 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10073 queue->xmit_lock_owner = -1;
10074 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10075 queue->dev = dev;
10076#ifdef CONFIG_BQL
10077 dql_init(&queue->dql, HZ);
10078#endif
10079}
10080
10081static void netif_free_tx_queues(struct net_device *dev)
10082{
10083 kvfree(dev->_tx);
10084}
10085
10086static int netif_alloc_netdev_queues(struct net_device *dev)
10087{
10088 unsigned int count = dev->num_tx_queues;
10089 struct netdev_queue *tx;
10090 size_t sz = count * sizeof(*tx);
10091
10092 if (count < 1 || count > 0xffff)
10093 return -EINVAL;
10094
10095 tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10096 if (!tx)
10097 return -ENOMEM;
10098
10099 dev->_tx = tx;
10100
10101 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10102 spin_lock_init(&dev->tx_global_lock);
10103
10104 return 0;
10105}
10106
10107void netif_tx_stop_all_queues(struct net_device *dev)
10108{
10109 unsigned int i;
10110
10111 for (i = 0; i < dev->num_tx_queues; i++) {
10112 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10113
10114 netif_tx_stop_queue(txq);
10115 }
10116}
10117EXPORT_SYMBOL(netif_tx_stop_all_queues);
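/*
 * Illustrative usage sketch (not part of this file): quiescing transmit
 * around a hardware reset; my_hw_reset() and priv are hypothetical. The
 * counterpart netif_tx_wake_all_queues() restarts the queues afterwards.
 *
 *	netif_tx_stop_all_queues(dev);
 *	my_hw_reset(priv);
 *	netif_tx_wake_all_queues(dev);
 */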
10118
10119static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10120{
10121 void __percpu *v;
10122
10123 /* Drivers implementing ndo_get_peer_dev must support tstat
10124 * accounting, so that skb_do_redirect() can bump the dev's
10125 * RX stats upon network namespace switch.
10126 */
10127 if (dev->netdev_ops->ndo_get_peer_dev &&
10128 dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10129 return -EOPNOTSUPP;
10130
10131 switch (dev->pcpu_stat_type) {
10132 case NETDEV_PCPU_STAT_NONE:
10133 return 0;
10134 case NETDEV_PCPU_STAT_LSTATS:
10135 v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10136 break;
10137 case NETDEV_PCPU_STAT_TSTATS:
10138 v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10139 break;
10140 case NETDEV_PCPU_STAT_DSTATS:
10141 v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10142 break;
10143 default:
10144 return -EINVAL;
10145 }
10146
10147 return v ? 0 : -ENOMEM;
10148}
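/*
 * Illustrative usage sketch (not part of this file): a driver opting into
 * core-managed per-CPU tstats before registration, so the allocation and
 * freeing handled above apply to it. my_setup() and my_netdev_ops are
 * hypothetical.
 *
 *	static void my_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &my_netdev_ops;
 *		dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
 *	}
 */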
10149
10150static void netdev_do_free_pcpu_stats(struct net_device *dev)
10151{
10152 switch (dev->pcpu_stat_type) {
10153 case NETDEV_PCPU_STAT_NONE:
10154 return;
10155 case NETDEV_PCPU_STAT_LSTATS:
10156 free_percpu(dev->lstats);
10157 break;
10158 case NETDEV_PCPU_STAT_TSTATS:
10159 free_percpu(dev->tstats);
10160 break;
10161 case NETDEV_PCPU_STAT_DSTATS:
10162 free_percpu(dev->dstats);
10163 break;
10164 }
10165}
10166
10167/**
10168 * register_netdevice() - register a network device
10169 * @dev: device to register
10170 *
10171 * Take a prepared network device structure and make it externally accessible.
10172 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
10173 * Callers must hold the rtnl lock - you may want register_netdev()
10174 * instead of this.
10175 */
10176int register_netdevice(struct net_device *dev)
10177{
10178 int ret;
10179 struct net *net = dev_net(dev);
10180
10181 BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10182 NETDEV_FEATURE_COUNT);
10183 BUG_ON(dev_boot_phase);
10184 ASSERT_RTNL();
10185
10186 might_sleep();
10187
10188 /* When net_device's are persistent, this will be fatal. */
10189 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10190 BUG_ON(!net);
10191
10192 ret = ethtool_check_ops(dev->ethtool_ops);
10193 if (ret)
10194 return ret;
10195
10196 spin_lock_init(&dev->addr_list_lock);
10197 netdev_set_addr_lockdep_class(dev);
10198
10199 ret = dev_get_valid_name(net, dev, dev->name);
10200 if (ret < 0)
10201 goto out;
10202
10203 ret = -ENOMEM;
10204 dev->name_node = netdev_name_node_head_alloc(dev);
10205 if (!dev->name_node)
10206 goto out;
10207
10208 /* Init, if this function is available */
10209 if (dev->netdev_ops->ndo_init) {
10210 ret = dev->netdev_ops->ndo_init(dev);
10211 if (ret) {
10212 if (ret > 0)
10213 ret = -EIO;
10214 goto err_free_name;
10215 }
10216 }
10217
10218 if (((dev->hw_features | dev->features) &
10219 NETIF_F_HW_VLAN_CTAG_FILTER) &&
10220 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10221 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10222 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10223 ret = -EINVAL;
10224 goto err_uninit;
10225 }
10226
10227 ret = netdev_do_alloc_pcpu_stats(dev);
10228 if (ret)
10229 goto err_uninit;
10230
10231 ret = dev_index_reserve(net, dev->ifindex);
10232 if (ret < 0)
10233 goto err_free_pcpu;
10234 dev->ifindex = ret;
10235
10236 /* Transfer changeable features to wanted_features and enable
10237 * software offloads (GSO and GRO).
10238 */
10239 dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10240 dev->features |= NETIF_F_SOFT_FEATURES;
10241
10242 if (dev->udp_tunnel_nic_info) {
10243 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10244 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10245 }
10246
10247 dev->wanted_features = dev->features & dev->hw_features;
10248
10249 if (!(dev->flags & IFF_LOOPBACK))
10250 dev->hw_features |= NETIF_F_NOCACHE_COPY;
10251
10252 /* If IPv4 TCP segmentation offload is supported we should also
10253 * allow the device to enable segmenting the frame with the option
10254 * of ignoring a static IP ID value. This doesn't enable the
10255 * feature itself but allows the user to enable it later.
10256 */
10257 if (dev->hw_features & NETIF_F_TSO)
10258 dev->hw_features |= NETIF_F_TSO_MANGLEID;
10259 if (dev->vlan_features & NETIF_F_TSO)
10260 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10261 if (dev->mpls_features & NETIF_F_TSO)
10262 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10263 if (dev->hw_enc_features & NETIF_F_TSO)
10264 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10265
10266 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10267 */
10268 dev->vlan_features |= NETIF_F_HIGHDMA;
10269
10270 /* Make NETIF_F_SG inheritable to tunnel devices.
10271 */
10272 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10273
10274 /* Make NETIF_F_SG inheritable to MPLS.
10275 */
10276 dev->mpls_features |= NETIF_F_SG;
10277
10278 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10279 ret = notifier_to_errno(ret);
10280 if (ret)
10281 goto err_ifindex_release;
10282
10283 ret = netdev_register_kobject(dev);
10284
10285 WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
10286
10287 if (ret)
10288 goto err_uninit_notify;
10289
10290 __netdev_update_features(dev);
10291
10292 /*
10293 * Default initial state at registration is that the
10294 * device is present.
10295 */
10296
10297 set_bit(__LINK_STATE_PRESENT, &dev->state);
10298
10299 linkwatch_init_dev(dev);
10300
10301 dev_init_scheduler(dev);
10302
10303 netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10304 list_netdevice(dev);
10305
10306 add_device_randomness(dev->dev_addr, dev->addr_len);
10307
10308 /* If the device has a permanent device address, the driver should
10309 * set dev_addr, and addr_assign_type should be set to
10310 * NET_ADDR_PERM (the default value).
10311 */
10312 if (dev->addr_assign_type == NET_ADDR_PERM)
10313 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10314
10315 /* Notify protocols that a new device appeared. */
10316 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10317 ret = notifier_to_errno(ret);
10318 if (ret) {
10319 /* Expect explicit free_netdev() on failure */
10320 dev->needs_free_netdev = false;
10321 unregister_netdevice_queue(dev, NULL);
10322 goto out;
10323 }
10324 /*
10325 * Prevent userspace races by waiting until the network
10326 * device is fully setup before sending notifications.
10327 */
10328 if (!dev->rtnl_link_ops ||
10329 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10330 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10331
10332out:
10333 return ret;
10334
10335err_uninit_notify:
10336 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10337err_ifindex_release:
10338 dev_index_release(net, dev->ifindex);
10339err_free_pcpu:
10340 netdev_do_free_pcpu_stats(dev);
10341err_uninit:
10342 if (dev->netdev_ops->ndo_uninit)
10343 dev->netdev_ops->ndo_uninit(dev);
10344 if (dev->priv_destructor)
10345 dev->priv_destructor(dev);
10346err_free_name:
10347 netdev_name_node_free(dev->name_node);
10348 goto out;
10349}
10350EXPORT_SYMBOL(register_netdevice);
10351
10352/**
10353 * init_dummy_netdev - init a dummy network device for NAPI
10354 * @dev: device to init
10355 *
10356 * This takes a network device structure and initializes the minimum
10357 * number of fields so it can be used to schedule NAPI polls without
10358 * registering a full-blown interface. This is to be used by drivers
10359 * that need to tie several hardware interfaces to a single NAPI
10360 * poll scheduler due to HW limitations.
10361 */
10362void init_dummy_netdev(struct net_device *dev)
10363{
10364 /* Clear everything. Note we don't initialize spinlocks,
10365 * as they aren't supposed to be taken by any of the
10366 * NAPI code and this dummy netdev is supposed to be
10367 * only ever used for NAPI polls.
10368 */
10369 memset(dev, 0, sizeof(struct net_device));
10370
10371 /* make sure we BUG if trying to hit standard
10372 * register/unregister code path
10373 */
10374 dev->reg_state = NETREG_DUMMY;
10375
10376 /* NAPI wants this */
10377 INIT_LIST_HEAD(&dev->napi_list);
10378
10379 /* a dummy interface is started by default */
10380 set_bit(__LINK_STATE_PRESENT, &dev->state);
10381 set_bit(__LINK_STATE_START, &dev->state);
10382
10383 /* napi_busy_loop stats accounting wants this */
10384 dev_net_set(dev, &init_net);
10385
10386 /* Note: We don't allocate pcpu_refcnt for dummy devices,
10387 * because users of this 'device' don't need to change
10388 * its refcount.
10389 */
10390}
10391EXPORT_SYMBOL_GPL(init_dummy_netdev);
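/*
 * Illustrative usage sketch (not part of this file): hosting a NAPI context
 * on a dummy netdev because the hardware has no one-to-one netdev mapping.
 * struct my_hw, my_hw_init() and my_poll() are hypothetical.
 *
 *	struct my_hw {
 *		struct net_device napi_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	static void my_hw_init(struct my_hw *hw)
 *	{
 *		init_dummy_netdev(&hw->napi_dev);
 *		netif_napi_add(&hw->napi_dev, &hw->napi, my_poll);
 *		napi_enable(&hw->napi);
 *	}
 */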
10392
10393
10394/**
10395 * register_netdev - register a network device
10396 * @dev: device to register
10397 *
10398 * Take a completed network device structure and add it to the kernel
10399 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10400 * chain. 0 is returned on success. A negative errno code is returned
10401 * on a failure to set up the device, or if the name is a duplicate.
10402 *
10403 * This is a wrapper around register_netdevice that takes the rtnl semaphore
10404 * and expands the device name if you passed a format string to
10405 * alloc_netdev.
10406 */
10407int register_netdev(struct net_device *dev)
10408{
10409 int err;
10410
10411 if (rtnl_lock_killable())
10412 return -EINTR;
10413 err = register_netdevice(dev);
10414 rtnl_unlock();
10415 return err;
10416}
10417EXPORT_SYMBOL(register_netdev);
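/*
 * Illustrative usage sketch (not part of this file): the classic probe-time
 * pairing of alloc_etherdev(), register_netdev() and free_netdev(). struct
 * my_priv, my_probe() and my_netdev_ops are hypothetical.
 *
 *	static int my_probe(struct device *parent)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct my_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *
 *		SET_NETDEV_DEV(dev, parent);
 *		dev->netdev_ops = &my_netdev_ops;
 *
 *		err = register_netdev(dev);
 *		if (err)
 *			free_netdev(dev);
 *		return err;
 *	}
 */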
10418
10419int netdev_refcnt_read(const struct net_device *dev)
10420{
10421#ifdef CONFIG_PCPU_DEV_REFCNT
10422 int i, refcnt = 0;
10423
10424 for_each_possible_cpu(i)
10425 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10426 return refcnt;
10427#else
10428 return refcount_read(&dev->dev_refcnt);
10429#endif
10430}
10431EXPORT_SYMBOL(netdev_refcnt_read);
10432
10433int netdev_unregister_timeout_secs __read_mostly = 10;
10434
10435#define WAIT_REFS_MIN_MSECS 1
10436#define WAIT_REFS_MAX_MSECS 250
10437/**
10438 * netdev_wait_allrefs_any - wait until all references are gone.
10439 * @list: list of net_devices to wait on
10440 *
10441 * This is called when unregistering network devices.
10442 *
10443 * Any protocol or device that holds a reference should register
10444 * for netdevice notification, and clean up and put back the
10445 * reference if it receives an UNREGISTER event.
10446 * We can get stuck here if buggy protocols don't correctly
10447 * call dev_put.
10448 */
10449static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10450{
10451 unsigned long rebroadcast_time, warning_time;
10452 struct net_device *dev;
10453 int wait = 0;
10454
10455 rebroadcast_time = warning_time = jiffies;
10456
10457 list_for_each_entry(dev, list, todo_list)
10458 if (netdev_refcnt_read(dev) == 1)
10459 return dev;
10460
10461 while (true) {
10462 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10463 rtnl_lock();
10464
10465 /* Rebroadcast unregister notification */
10466 list_for_each_entry(dev, list, todo_list)
10467 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10468
10469 __rtnl_unlock();
10470 rcu_barrier();
10471 rtnl_lock();
10472
10473 list_for_each_entry(dev, list, todo_list)
10474 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10475 &dev->state)) {
10476 /* We must not have linkwatch events
10477 * pending on unregister. If this
10478 * happens, we simply run the queue
10479 * unscheduled, resulting in a noop
10480 * for this device.
10481 */
10482 linkwatch_run_queue();
10483 break;
10484 }
10485
10486 __rtnl_unlock();
10487
10488 rebroadcast_time = jiffies;
10489 }
10490
10491 rcu_barrier();
10492
10493 if (!wait) {
10494 wait = WAIT_REFS_MIN_MSECS;
10495 } else {
10496 msleep(wait);
10497 wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10498 }
10499
10500 list_for_each_entry(dev, list, todo_list)
10501 if (netdev_refcnt_read(dev) == 1)
10502 return dev;
10503
10504 if (time_after(jiffies, warning_time +
10505 READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10506 list_for_each_entry(dev, list, todo_list) {
10507 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10508 dev->name, netdev_refcnt_read(dev));
10509 ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10510 }
10511
10512 warning_time = jiffies;
10513 }
10514 }
10515}
10516
10517/* The sequence is:
10518 *
10519 * rtnl_lock();
10520 * ...
10521 * register_netdevice(x1);
10522 * register_netdevice(x2);
10523 * ...
10524 * unregister_netdevice(y1);
10525 * unregister_netdevice(y2);
10526 * ...
10527 * rtnl_unlock();
10528 * free_netdev(y1);
10529 * free_netdev(y2);
10530 *
10531 * We are invoked by rtnl_unlock().
10532 * This allows us to deal with problems:
10533 * 1) We can delete sysfs objects which invoke hotplug
10534 * without deadlocking with linkwatch via keventd.
10535 * 2) Since we run with the RTNL semaphore not held, we can sleep
10536 * safely in order to wait for the netdev refcnt to drop to zero.
10537 *
10538 * We must not return until all unregister events added during
10539 * the interval the lock was held have been completed.
10540 */
10541void netdev_run_todo(void)
10542{
10543 struct net_device *dev, *tmp;
10544 struct list_head list;
10545 int cnt;
10546#ifdef CONFIG_LOCKDEP
10547 struct list_head unlink_list;
10548
10549 list_replace_init(&net_unlink_list, &unlink_list);
10550
10551 while (!list_empty(&unlink_list)) {
10552 struct net_device *dev = list_first_entry(&unlink_list,
10553 struct net_device,
10554 unlink_list);
10555 list_del_init(&dev->unlink_list);
10556 dev->nested_level = dev->lower_level - 1;
10557 }
10558#endif
10559
10560 /* Snapshot list, allow later requests */
10561 list_replace_init(&net_todo_list, &list);
10562
10563 __rtnl_unlock();
10564
10565 /* Wait for rcu callbacks to finish before next phase */
10566 if (!list_empty(&list))
10567 rcu_barrier();
10568
10569 list_for_each_entry_safe(dev, tmp, &list, todo_list) {
10570 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10571 netdev_WARN(dev, "run_todo but not unregistering\n");
10572 list_del(&dev->todo_list);
10573 continue;
10574 }
10575
10576 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
10577 linkwatch_sync_dev(dev);
10578 }
10579
10580 cnt = 0;
10581 while (!list_empty(&list)) {
10582 dev = netdev_wait_allrefs_any(&list);
10583 list_del(&dev->todo_list);
10584
10585 /* paranoia */
10586 BUG_ON(netdev_refcnt_read(dev) != 1);
10587 BUG_ON(!list_empty(&dev->ptype_all));
10588 BUG_ON(!list_empty(&dev->ptype_specific));
10589 WARN_ON(rcu_access_pointer(dev->ip_ptr));
10590 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10591
10592 netdev_do_free_pcpu_stats(dev);
10593 if (dev->priv_destructor)
10594 dev->priv_destructor(dev);
10595 if (dev->needs_free_netdev)
10596 free_netdev(dev);
10597
10598 cnt++;
10599
10600 /* Free network device */
10601 kobject_put(&dev->dev.kobj);
10602 }
10603 if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
10604 wake_up(&netdev_unregistering_wq);
10605}
10606
10607/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10608 * all the same fields in the same order as net_device_stats, with only
10609 * the type differing, but rtnl_link_stats64 may have additional fields
10610 * at the end for newer counters.
10611 */
10612void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10613 const struct net_device_stats *netdev_stats)
10614{
10615 size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10616 const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10617 u64 *dst = (u64 *)stats64;
10618
10619 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10620 for (i = 0; i < n; i++)
10621 dst[i] = (unsigned long)atomic_long_read(&src[i]);
10622 /* zero out counters that only exist in rtnl_link_stats64 */
10623 memset((char *)stats64 + n * sizeof(u64), 0,
10624 sizeof(*stats64) - n * sizeof(u64));
10625}
10626EXPORT_SYMBOL(netdev_stats_to_stats64);
10627
10628static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
10629 struct net_device *dev)
10630{
10631 struct net_device_core_stats __percpu *p;
10632
10633 p = alloc_percpu_gfp(struct net_device_core_stats,
10634 GFP_ATOMIC | __GFP_NOWARN);
10635
10636 if (p && cmpxchg(&dev->core_stats, NULL, p))
10637 free_percpu(p);
10638
10639 /* This READ_ONCE() pairs with the cmpxchg() above */
10640 return READ_ONCE(dev->core_stats);
10641}
10642
10643noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
10644{
10645 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10646 struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
10647 unsigned long __percpu *field;
10648
10649 if (unlikely(!p)) {
10650 p = netdev_core_stats_alloc(dev);
10651 if (!p)
10652 return;
10653 }
10654
10655 field = (__force unsigned long __percpu *)((__force void *)p + offset);
10656 this_cpu_inc(*field);
10657}
10658EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
10659
10660/**
10661 * dev_get_stats - get network device statistics
10662 * @dev: device to get statistics from
10663 * @storage: place to store stats
10664 *
10665 * Get network statistics from device. Return @storage.
10666 * The device driver may provide its own method by setting
10667 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10668 * otherwise the internal statistics structure is used.
10669 */
10670struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10671 struct rtnl_link_stats64 *storage)
10672{
10673 const struct net_device_ops *ops = dev->netdev_ops;
10674 const struct net_device_core_stats __percpu *p;
10675
10676 if (ops->ndo_get_stats64) {
10677 memset(storage, 0, sizeof(*storage));
10678 ops->ndo_get_stats64(dev, storage);
10679 } else if (ops->ndo_get_stats) {
10680 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10681 } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
10682 dev_get_tstats64(dev, storage);
10683 } else {
10684 netdev_stats_to_stats64(storage, &dev->stats);
10685 }
10686
10687 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10688 p = READ_ONCE(dev->core_stats);
10689 if (p) {
10690 const struct net_device_core_stats *core_stats;
10691 int i;
10692
10693 for_each_possible_cpu(i) {
10694 core_stats = per_cpu_ptr(p, i);
10695 storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10696 storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10697 storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10698 storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10699 }
10700 }
10701 return storage;
10702}
10703EXPORT_SYMBOL(dev_get_stats);
10704
10705/**
10706 * dev_fetch_sw_netstats - get per-cpu network device statistics
10707 * @s: place to store stats
10708 * @netstats: per-cpu network stats to read from
10709 *
10710 * Read per-cpu network statistics and populate the related fields in @s.
10711 */
10712void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10713 const struct pcpu_sw_netstats __percpu *netstats)
10714{
10715 int cpu;
10716
10717 for_each_possible_cpu(cpu) {
10718 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10719 const struct pcpu_sw_netstats *stats;
10720 unsigned int start;
10721
10722 stats = per_cpu_ptr(netstats, cpu);
10723 do {
10724 start = u64_stats_fetch_begin(&stats->syncp);
10725 rx_packets = u64_stats_read(&stats->rx_packets);
10726 rx_bytes = u64_stats_read(&stats->rx_bytes);
10727 tx_packets = u64_stats_read(&stats->tx_packets);
10728 tx_bytes = u64_stats_read(&stats->tx_bytes);
10729 } while (u64_stats_fetch_retry(&stats->syncp, start));
10730
10731 s->rx_packets += rx_packets;
10732 s->rx_bytes += rx_bytes;
10733 s->tx_packets += tx_packets;
10734 s->tx_bytes += tx_bytes;
10735 }
10736}
10737EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
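/*
 * Illustrative usage sketch (not part of this file): a driver that keeps its
 * own per-CPU pcpu_sw_netstats pointer (priv->sw_stats here, a hypothetical
 * field, as is struct my_priv) can fold it into its ndo_get_stats64()
 * callback, mirroring dev_get_tstats64() below.
 *
 *	static void my_get_stats64(struct net_device *dev,
 *				   struct rtnl_link_stats64 *s)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		netdev_stats_to_stats64(s, &dev->stats);
 *		dev_fetch_sw_netstats(s, priv->sw_stats);
 *	}
 */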
10738
10739/**
10740 * dev_get_tstats64 - ndo_get_stats64 implementation
10741 * @dev: device to get statistics from
10742 * @s: place to store stats
10743 *
10744 * Populate @s from dev->stats and dev->tstats. Can be used as
10745 * ndo_get_stats64() callback.
10746 */
10747void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10748{
10749 netdev_stats_to_stats64(s, &dev->stats);
10750 dev_fetch_sw_netstats(s, dev->tstats);
10751}
10752EXPORT_SYMBOL_GPL(dev_get_tstats64);
10753
10754struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10755{
10756 struct netdev_queue *queue = dev_ingress_queue(dev);
10757
10758#ifdef CONFIG_NET_CLS_ACT
10759 if (queue)
10760 return queue;
10761 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10762 if (!queue)
10763 return NULL;
10764 netdev_init_one_queue(dev, queue, NULL);
10765 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10766 RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
10767 rcu_assign_pointer(dev->ingress_queue, queue);
10768#endif
10769 return queue;
10770}
10771
10772static const struct ethtool_ops default_ethtool_ops;
10773
10774void netdev_set_default_ethtool_ops(struct net_device *dev,
10775 const struct ethtool_ops *ops)
10776{
10777 if (dev->ethtool_ops == &default_ethtool_ops)
10778 dev->ethtool_ops = ops;
10779}
10780EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10781
10782/**
10783 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10784 * @dev: netdev to enable the IRQ coalescing on
10785 *
10786 * Sets a conservative default for SW IRQ coalescing. Users can use
10787 * sysfs attributes to override the default values.
10788 */
10789void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10790{
10791 WARN_ON(dev->reg_state == NETREG_REGISTERED);
10792
10793 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
10794 dev->gro_flush_timeout = 20000;
10795 dev->napi_defer_hard_irqs = 1;
10796 }
10797}
10798EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
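/*
 * Illustrative usage sketch (not part of this file): the helper above is
 * meant to run between allocation and registration, e.g. in a hypothetical
 * probe path (struct my_priv is hypothetical):
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	netdev_sw_irq_coalesce_default_on(dev);
 *	err = register_netdev(dev);
 */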
10799
10800void netdev_freemem(struct net_device *dev)
10801{
10802 char *addr = (char *)dev - dev->padded;
10803
10804 kvfree(addr);
10805}
10806
10807/**
10808 * alloc_netdev_mqs - allocate network device
10809 * @sizeof_priv: size of private data to allocate space for
10810 * @name: device name format string
10811 * @name_assign_type: origin of device name
10812 * @setup: callback to initialize device
10813 * @txqs: the number of TX subqueues to allocate
10814 * @rxqs: the number of RX subqueues to allocate
10815 *
10816 * Allocates a struct net_device with private data area for driver use
10817 * and performs basic initialization. Also allocates subqueue structs
10818 * for each queue on the device.
10819 */
10820struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10821 unsigned char name_assign_type,
10822 void (*setup)(struct net_device *),
10823 unsigned int txqs, unsigned int rxqs)
10824{
10825 struct net_device *dev;
10826 unsigned int alloc_size;
10827 struct net_device *p;
10828
10829 BUG_ON(strlen(name) >= sizeof(dev->name));
10830
10831 if (txqs < 1) {
10832 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10833 return NULL;
10834 }
10835
10836 if (rxqs < 1) {
10837 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10838 return NULL;
10839 }
10840
10841 alloc_size = sizeof(struct net_device);
10842 if (sizeof_priv) {
10843 /* ensure 32-byte alignment of private area */
10844 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10845 alloc_size += sizeof_priv;
10846 }
10847 /* ensure 32-byte alignment of whole construct */
10848 alloc_size += NETDEV_ALIGN - 1;
10849
10850 p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10851 if (!p)
10852 return NULL;
10853
10854 dev = PTR_ALIGN(p, NETDEV_ALIGN);
10855 dev->padded = (char *)dev - (char *)p;
10856
10857 ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
10858#ifdef CONFIG_PCPU_DEV_REFCNT
10859 dev->pcpu_refcnt = alloc_percpu(int);
10860 if (!dev->pcpu_refcnt)
10861 goto free_dev;
10862 __dev_hold(dev);
10863#else
10864 refcount_set(&dev->dev_refcnt, 1);
10865#endif
10866
10867 if (dev_addr_init(dev))
10868 goto free_pcpu;
10869
10870 dev_mc_init(dev);
10871 dev_uc_init(dev);
10872
10873 dev_net_set(dev, &init_net);
10874
10875 dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
10876 dev->xdp_zc_max_segs = 1;
10877 dev->gso_max_segs = GSO_MAX_SEGS;
10878 dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
10879 dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
10880 dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
10881 dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
10882 dev->tso_max_segs = TSO_MAX_SEGS;
10883 dev->upper_level = 1;
10884 dev->lower_level = 1;
10885#ifdef CONFIG_LOCKDEP
10886 dev->nested_level = 0;
10887 INIT_LIST_HEAD(&dev->unlink_list);
10888#endif
10889
10890 INIT_LIST_HEAD(&dev->napi_list);
10891 INIT_LIST_HEAD(&dev->unreg_list);
10892 INIT_LIST_HEAD(&dev->close_list);
10893 INIT_LIST_HEAD(&dev->link_watch_list);
10894 INIT_LIST_HEAD(&dev->adj_list.upper);
10895 INIT_LIST_HEAD(&dev->adj_list.lower);
10896 INIT_LIST_HEAD(&dev->ptype_all);
10897 INIT_LIST_HEAD(&dev->ptype_specific);
10898 INIT_LIST_HEAD(&dev->net_notifier_list);
10899#ifdef CONFIG_NET_SCHED
10900 hash_init(dev->qdisc_hash);
10901#endif
10902 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10903 setup(dev);
10904
10905 if (!dev->tx_queue_len) {
10906 dev->priv_flags |= IFF_NO_QUEUE;
10907 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10908 }
10909
10910 dev->num_tx_queues = txqs;
10911 dev->real_num_tx_queues = txqs;
10912 if (netif_alloc_netdev_queues(dev))
10913 goto free_all;
10914
10915 dev->num_rx_queues = rxqs;
10916 dev->real_num_rx_queues = rxqs;
10917 if (netif_alloc_rx_queues(dev))
10918 goto free_all;
10919
10920 strcpy(dev->name, name);
10921 dev->name_assign_type = name_assign_type;
10922 dev->group = INIT_NETDEV_GROUP;
10923 if (!dev->ethtool_ops)
10924 dev->ethtool_ops = &default_ethtool_ops;
10925
10926 nf_hook_netdev_init(dev);
10927
10928 return dev;
10929
10930free_all:
10931 free_netdev(dev);
10932 return NULL;
10933
10934free_pcpu:
10935#ifdef CONFIG_PCPU_DEV_REFCNT
10936 free_percpu(dev->pcpu_refcnt);
10937free_dev:
10938#endif
10939 netdev_freemem(dev);
10940 return NULL;
10941}
10942EXPORT_SYMBOL(alloc_netdev_mqs);
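/*
 * Illustrative usage sketch (not part of this file): allocating a multiqueue
 * Ethernet-style device directly with alloc_netdev_mqs(); ether_setup() is
 * the usual setup callback, while struct my_priv, the "my%d" name template
 * and the queue counts are hypothetical.
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "my%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */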
10943
10944/**
10945 * free_netdev - free network device
10946 * @dev: device
10947 *
10948 * This function does the last stage of destroying an allocated device
10949 * interface. The reference to the device object is released. If this
10950 * is the last reference then it will be freed.Must be called in process
10951 * is the last reference then it will be freed. Must be called in process
10952 */
10953void free_netdev(struct net_device *dev)
10954{
10955 struct napi_struct *p, *n;
10956
10957 might_sleep();
10958
10959 /* When called immediately after register_netdevice() has failed, the unwind
10960 * handling may still be dismantling the device. Handle that case by
10961 * deferring the free.
10962 */
10963 if (dev->reg_state == NETREG_UNREGISTERING) {
10964 ASSERT_RTNL();
10965 dev->needs_free_netdev = true;
10966 return;
10967 }
10968
10969 netif_free_tx_queues(dev);
10970 netif_free_rx_queues(dev);
10971
10972 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10973
10974 /* Flush device addresses */
10975 dev_addr_flush(dev);
10976
10977 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10978 netif_napi_del(p);
10979
10980 ref_tracker_dir_exit(&dev->refcnt_tracker);
10981#ifdef CONFIG_PCPU_DEV_REFCNT
10982 free_percpu(dev->pcpu_refcnt);
10983 dev->pcpu_refcnt = NULL;
10984#endif
10985 free_percpu(dev->core_stats);
10986 dev->core_stats = NULL;
10987 free_percpu(dev->xdp_bulkq);
10988 dev->xdp_bulkq = NULL;
10989
10990 /* Compatibility with error handling in drivers */
10991 if (dev->reg_state == NETREG_UNINITIALIZED) {
10992 netdev_freemem(dev);
10993 return;
10994 }
10995
10996 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10997 WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
10998
10999 /* will free via device release */
11000 put_device(&dev->dev);
11001}
11002EXPORT_SYMBOL(free_netdev);
11003
11004/**
11005 * synchronize_net - Synchronize with packet receive processing
11006 *
11007 * Wait for packets currently being received to be done.
11008 * Does not block later packets from starting.
11009 */
11010void synchronize_net(void)
11011{
11012 might_sleep();
11013 if (rtnl_is_locked())
11014 synchronize_rcu_expedited();
11015 else
11016 synchronize_rcu();
11017}
11018EXPORT_SYMBOL(synchronize_net);
11019
11020/**
11021 * unregister_netdevice_queue - remove device from the kernel
11022 * @dev: device
11023 * @head: list
11024 *
11025 * This function shuts down a device interface and removes it
11026 * from the kernel tables.
11027 * If @head is not NULL, the device is queued to be unregistered later.
11028 *
11029 * Callers must hold the rtnl semaphore. You may want
11030 * unregister_netdev() instead of this.
11031 */
11032
11033void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
11034{
11035 ASSERT_RTNL();
11036
11037 if (head) {
11038 list_move_tail(&dev->unreg_list, head);
11039 } else {
11040 LIST_HEAD(single);
11041
11042 list_add(&dev->unreg_list, &single);
11043 unregister_netdevice_many(&single);
11044 }
11045}
11046EXPORT_SYMBOL(unregister_netdevice_queue);
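/*
 * Illustrative usage sketch (not part of this file): batching several
 * teardowns under one rtnl section so the heavy synchronization in
 * unregister_netdevice_many() is paid once. my_dev_list and priv->dev are
 * hypothetical.
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	list_for_each_entry(priv, &my_dev_list, node)
 *		unregister_netdevice_queue(priv->dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */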
11047
11048void unregister_netdevice_many_notify(struct list_head *head,
11049 u32 portid, const struct nlmsghdr *nlh)
11050{
11051 struct net_device *dev, *tmp;
11052 LIST_HEAD(close_head);
11053 int cnt = 0;
11054
11055 BUG_ON(dev_boot_phase);
11056 ASSERT_RTNL();
11057
11058 if (list_empty(head))
11059 return;
11060
11061 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11062 /* Some devices call here without having been registered,
11063 * as part of initialization unwind. Remove those
11064 * devices and proceed with the remaining ones.
11065 */
11066 if (dev->reg_state == NETREG_UNINITIALIZED) {
11067 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11068 dev->name, dev);
11069
11070 WARN_ON(1);
11071 list_del(&dev->unreg_list);
11072 continue;
11073 }
11074 dev->dismantle = true;
11075 BUG_ON(dev->reg_state != NETREG_REGISTERED);
11076 }
11077
11078 /* If device is running, close it first. */
11079 list_for_each_entry(dev, head, unreg_list)
11080 list_add_tail(&dev->close_list, &close_head);
11081 dev_close_many(&close_head, true);
11082
11083 list_for_each_entry(dev, head, unreg_list) {
11084 /* And unlink it from device chain. */
11085 unlist_netdevice(dev);
11086 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
11087 }
11088 flush_all_backlogs();
11089
11090 synchronize_net();
11091
11092 list_for_each_entry(dev, head, unreg_list) {
11093 struct sk_buff *skb = NULL;
11094
11095 /* Shutdown queueing discipline. */
11096 dev_shutdown(dev);
11097 dev_tcx_uninstall(dev);
11098 dev_xdp_uninstall(dev);
11099 bpf_dev_bound_netdev_unregister(dev);
11100
11101 netdev_offload_xstats_disable_all(dev);
11102
11103 /* Notify protocols that we are about to destroy
11104 * this device. They should clean all the things.
11105 */
11106 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11107
11108 if (!dev->rtnl_link_ops ||
11109 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11110 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11111 GFP_KERNEL, NULL, 0,
11112 portid, nlh);
11113
11114 /*
11115 * Flush the unicast and multicast chains
11116 */
11117 dev_uc_flush(dev);
11118 dev_mc_flush(dev);
11119
11120 netdev_name_node_alt_flush(dev);
11121 netdev_name_node_free(dev->name_node);
11122
11123 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11124
11125 if (dev->netdev_ops->ndo_uninit)
11126 dev->netdev_ops->ndo_uninit(dev);
11127
11128 if (skb)
11129 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11130
11131 /* Notifier chain MUST detach us all upper devices. */
11132 WARN_ON(netdev_has_any_upper_dev(dev));
11133 WARN_ON(netdev_has_any_lower_dev(dev));
11134
11135 /* Remove entries from kobject tree */
11136 netdev_unregister_kobject(dev);
11137#ifdef CONFIG_XPS
11138 /* Remove XPS queueing entries */
11139 netif_reset_xps_queues_gt(dev, 0);
11140#endif
11141 }
11142
11143 synchronize_net();
11144
11145 list_for_each_entry(dev, head, unreg_list) {
11146 netdev_put(dev, &dev->dev_registered_tracker);
11147 net_set_todo(dev);
11148 cnt++;
11149 }
11150 atomic_add(cnt, &dev_unreg_count);
11151
11152 list_del(head);
11153}
11154
11155/**
11156 * unregister_netdevice_many - unregister many devices
11157 * @head: list of devices
11158 *
11159 * Note: As most callers use a stack allocated list_head,
11160 * we force a list_del() to make sure the stack won't be corrupted later.
11161 */
11162void unregister_netdevice_many(struct list_head *head)
11163{
11164 unregister_netdevice_many_notify(head, 0, NULL);
11165}
11166EXPORT_SYMBOL(unregister_netdevice_many);
11167
11168/**
11169 * unregister_netdev - remove device from the kernel
11170 * @dev: device
11171 *
11172 * This function shuts down a device interface and removes it
11173 * from the kernel tables.
11174 *
11175 * This is just a wrapper for unregister_netdevice that takes
11176 * the rtnl semaphore. In general you want to use this and not
11177 * unregister_netdevice.
11178 */
11179void unregister_netdev(struct net_device *dev)
11180{
11181 rtnl_lock();
11182 unregister_netdevice(dev);
11183 rtnl_unlock();
11184}
11185EXPORT_SYMBOL(unregister_netdev);
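/*
 * Illustrative usage sketch (not part of this file): the usual driver remove
 * path pairs unregister_netdev() with free_netdev() once the core has
 * dropped its references. my_remove() is hypothetical.
 *
 *	static void my_remove(struct net_device *dev)
 *	{
 *		unregister_netdev(dev);
 *		free_netdev(dev);
 *	}
 */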
11186
11187/**
11188 * __dev_change_net_namespace - move device to a different network namespace
11189 * @dev: device
11190 * @net: network namespace
11191 * @pat: If not NULL name pattern to try if the current device name
11192 * is already taken in the destination network namespace.
11193 * @new_ifindex: If not zero, specifies device index in the target
11194 * namespace.
11195 *
11196 * This function shuts down a device interface and moves it
11197 * to a new network namespace. On success 0 is returned, on
11198 * a failure a negative errno code is returned.
11199 *
11200 * Callers must hold the rtnl semaphore.
11201 */
11202
11203int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11204 const char *pat, int new_ifindex)
11205{
11206 struct netdev_name_node *name_node;
11207 struct net *net_old = dev_net(dev);
11208 char new_name[IFNAMSIZ] = {};
11209 int err, new_nsid;
11210
11211 ASSERT_RTNL();
11212
11213 /* Don't allow namespace local devices to be moved. */
11214 err = -EINVAL;
11215 if (dev->features & NETIF_F_NETNS_LOCAL)
11216 goto out;
11217
11218 /* Ensure the device has been registered */
11219 if (dev->reg_state != NETREG_REGISTERED)
11220 goto out;
11221
11222 /* Get out if there is nothing to do */
11223 err = 0;
11224 if (net_eq(net_old, net))
11225 goto out;
11226
11227 /* Pick the destination device name, and ensure
11228 * we can use it in the destination network namespace.
11229 */
11230 err = -EEXIST;
11231 if (netdev_name_in_use(net, dev->name)) {
11232 /* We get here if we can't use the current device name */
11233 if (!pat)
11234 goto out;
11235 err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
11236 if (err < 0)
11237 goto out;
11238 }
11239 /* Check that none of the altnames conflicts. */
11240 err = -EEXIST;
11241 netdev_for_each_altname(dev, name_node)
11242 if (netdev_name_in_use(net, name_node->name))
11243 goto out;
11244
11245 /* Check that new_ifindex isn't used yet. */
11246 if (new_ifindex) {
11247 err = dev_index_reserve(net, new_ifindex);
11248 if (err < 0)
11249 goto out;
11250 } else {
11251 /* If there is an ifindex conflict assign a new one */
11252 err = dev_index_reserve(net, dev->ifindex);
11253 if (err == -EBUSY)
11254 err = dev_index_reserve(net, 0);
11255 if (err < 0)
11256 goto out;
11257 new_ifindex = err;
11258 }
11259
11260 /*
11261 * And now a mini version of register_netdevice()/unregister_netdevice().
11262 */
11263
11264 /* If device is running close it first. */
11265 dev_close(dev);
11266
11267 /* And unlink it from device chain */
11268 unlist_netdevice(dev);
11269
11270 synchronize_net();
11271
11272 /* Shutdown queueing discipline. */
11273 dev_shutdown(dev);
11274
11275 /* Notify protocols that we are about to destroy
11276 * this device. They should clean all the things.
11277 *
11278 * Note that dev->reg_state stays at NETREG_REGISTERED.
11279 * This is wanted because this way 8021q and macvlan know
11280 * the device is just moving and can keep their slaves up.
11281 */
11282 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11283 rcu_barrier();
11284
11285 new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11286
11287 rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11288 new_ifindex);
11289
11290 /*
11291 * Flush the unicast and multicast chains
11292 */
11293 dev_uc_flush(dev);
11294 dev_mc_flush(dev);
11295
11296 /* Send a netdev-removed uevent to the old namespace */
11297 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11298 netdev_adjacent_del_links(dev);
11299
11300 /* Move per-net netdevice notifiers that are following the netdevice */
11301 move_netdevice_notifiers_dev_net(dev, net);
11302
11303 /* Actually switch the network namespace */
11304 dev_net_set(dev, net);
11305 dev->ifindex = new_ifindex;
11306
11307 if (new_name[0]) /* Rename the netdev to prepared name */
11308 strscpy(dev->name, new_name, IFNAMSIZ);
11309
11310 /* Fixup kobjects */
11311 dev_set_uevent_suppress(&dev->dev, 1);
11312 err = device_rename(&dev->dev, dev->name);
11313 dev_set_uevent_suppress(&dev->dev, 0);
11314 WARN_ON(err);
11315
11316 /* Send a netdev-add uevent to the new namespace */
11317 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11318 netdev_adjacent_add_links(dev);
11319
11320 /* Adapt owner in case owning user namespace of target network
11321 * namespace is different from the original one.
11322 */
11323 err = netdev_change_owner(dev, net_old, net);
11324 WARN_ON(err);
11325
11326 /* Add the device back in the hashes */
11327 list_netdevice(dev);
11328
11329 /* Notify protocols that a new device appeared. */
11330 call_netdevice_notifiers(NETDEV_REGISTER, dev);
11331
11332 /*
11333 * Prevent userspace races by waiting until the network
11334 * device is fully setup before sending notifications.
11335 */
11336 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11337
11338 synchronize_net();
11339 err = 0;
11340out:
11341 return err;
11342}
11343EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11344
11345static int dev_cpu_dead(unsigned int oldcpu)
11346{
11347 struct sk_buff **list_skb;
11348 struct sk_buff *skb;
11349 unsigned int cpu;
11350 struct softnet_data *sd, *oldsd, *remsd = NULL;
11351
11352 local_irq_disable();
11353 cpu = smp_processor_id();
11354 sd = &per_cpu(softnet_data, cpu);
11355 oldsd = &per_cpu(softnet_data, oldcpu);
11356
11357 /* Find end of our completion_queue. */
11358 list_skb = &sd->completion_queue;
11359 while (*list_skb)
11360 list_skb = &(*list_skb)->next;
11361 /* Append completion queue from offline CPU. */
11362 *list_skb = oldsd->completion_queue;
11363 oldsd->completion_queue = NULL;
11364
11365 /* Append output queue from offline CPU. */
11366 if (oldsd->output_queue) {
11367 *sd->output_queue_tailp = oldsd->output_queue;
11368 sd->output_queue_tailp = oldsd->output_queue_tailp;
11369 oldsd->output_queue = NULL;
11370 oldsd->output_queue_tailp = &oldsd->output_queue;
11371 }
11372 /* Append NAPI poll list from offline CPU, with one exception:
11373 * process_backlog() must be called by the CPU owning the percpu backlog.
11374 * We properly handle process_queue & input_pkt_queue later.
11375 */
11376 while (!list_empty(&oldsd->poll_list)) {
11377 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11378 struct napi_struct,
11379 poll_list);
11380
11381 list_del_init(&napi->poll_list);
11382 if (napi->poll == process_backlog)
11383 napi->state = 0;
11384 else
11385 ____napi_schedule(sd, napi);
11386 }
11387
11388 raise_softirq_irqoff(NET_TX_SOFTIRQ);
11389 local_irq_enable();
11390
11391#ifdef CONFIG_RPS
11392 remsd = oldsd->rps_ipi_list;
11393 oldsd->rps_ipi_list = NULL;
11394#endif
11395 /* send out pending IPI's on offline CPU */
11396 net_rps_send_ipi(remsd);
11397
11398 /* Process offline CPU's input_pkt_queue */
11399 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11400 netif_rx(skb);
11401 input_queue_head_incr(oldsd);
11402 }
11403 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11404 netif_rx(skb);
11405 input_queue_head_incr(oldsd);
11406 }
11407
11408 return 0;
11409}
11410
11411/**
11412 * netdev_increment_features - increment feature set by one
11413 * @all: current feature set
11414 * @one: new feature set
11415 * @mask: mask feature set
11416 *
11417 * Computes a new feature set after adding a device with feature set
11418 * @one to the master device with current feature set @all. Will not
11419 * enable anything that is off in @mask. Returns the new feature set.
11420 */
11421netdev_features_t netdev_increment_features(netdev_features_t all,
11422 netdev_features_t one, netdev_features_t mask)
11423{
11424 if (mask & NETIF_F_HW_CSUM)
11425 mask |= NETIF_F_CSUM_MASK;
11426 mask |= NETIF_F_VLAN_CHALLENGED;
11427
11428 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11429 all &= one | ~NETIF_F_ALL_FOR_ALL;
11430
11431 /* If one device supports hw checksumming, set for all. */
11432 if (all & NETIF_F_HW_CSUM)
11433 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11434
11435 return all;
11436}
11437EXPORT_SYMBOL(netdev_increment_features);
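/*
 * Illustrative usage sketch (not part of this file): a bonding/team-style
 * master folding the feature sets of its lower devices into one, following
 * the accumulator pattern those drivers use. my_compute_features() is a
 * hypothetical helper and @mask would be the master's supported feature set.
 *
 *	static netdev_features_t my_compute_features(struct net_device *master,
 *						     netdev_features_t mask)
 *	{
 *		netdev_features_t all = mask & NETIF_F_ALL_FOR_ALL;
 *		struct net_device *lower;
 *		struct list_head *iter;
 *
 *		netdev_for_each_lower_dev(master, lower, iter)
 *			all = netdev_increment_features(all, lower->features,
 *							mask);
 *		return all;
 *	}
 */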
11438
11439static struct hlist_head * __net_init netdev_create_hash(void)
11440{
11441 int i;
11442 struct hlist_head *hash;
11443
11444 hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11445 if (hash != NULL)
11446 for (i = 0; i < NETDEV_HASHENTRIES; i++)
11447 INIT_HLIST_HEAD(&hash[i]);
11448
11449 return hash;
11450}
11451
11452/* Initialize per network namespace state */
11453static int __net_init netdev_init(struct net *net)
11454{
11455 BUILD_BUG_ON(GRO_HASH_BUCKETS >
11456 8 * sizeof_field(struct napi_struct, gro_bitmask));
11457
11458 INIT_LIST_HEAD(&net->dev_base_head);
11459
11460 net->dev_name_head = netdev_create_hash();
11461 if (net->dev_name_head == NULL)
11462 goto err_name;
11463
11464 net->dev_index_head = netdev_create_hash();
11465 if (net->dev_index_head == NULL)
11466 goto err_idx;
11467
11468 xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11469
11470 RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11471
11472 return 0;
11473
11474err_idx:
11475 kfree(net->dev_name_head);
11476err_name:
11477 return -ENOMEM;
11478}
11479
11480/**
11481 * netdev_drivername - network driver for the device
11482 * @dev: network device
11483 *
11484 * Determine network driver for device.
11485 */
11486const char *netdev_drivername(const struct net_device *dev)
11487{
11488 const struct device_driver *driver;
11489 const struct device *parent;
11490 const char *empty = "";
11491
11492 parent = dev->dev.parent;
11493 if (!parent)
11494 return empty;
11495
11496 driver = parent->driver;
11497 if (driver && driver->name)
11498 return driver->name;
11499 return empty;
11500}
11501
11502static void __netdev_printk(const char *level, const struct net_device *dev,
11503 struct va_format *vaf)
11504{
11505 if (dev && dev->dev.parent) {
11506 dev_printk_emit(level[1] - '0',
11507 dev->dev.parent,
11508 "%s %s %s%s: %pV",
11509 dev_driver_string(dev->dev.parent),
11510 dev_name(dev->dev.parent),
11511 netdev_name(dev), netdev_reg_state(dev),
11512 vaf);
11513 } else if (dev) {
11514 printk("%s%s%s: %pV",
11515 level, netdev_name(dev), netdev_reg_state(dev), vaf);
11516 } else {
11517 printk("%s(NULL net_device): %pV", level, vaf);
11518 }
11519}
11520
11521void netdev_printk(const char *level, const struct net_device *dev,
11522 const char *format, ...)
11523{
11524 struct va_format vaf;
11525 va_list args;
11526
11527 va_start(args, format);
11528
11529 vaf.fmt = format;
11530 vaf.va = &args;
11531
11532 __netdev_printk(level, dev, &vaf);
11533
11534 va_end(args);
11535}
11536EXPORT_SYMBOL(netdev_printk);
11537
11538#define define_netdev_printk_level(func, level) \
11539void func(const struct net_device *dev, const char *fmt, ...) \
11540{ \
11541 struct va_format vaf; \
11542 va_list args; \
11543 \
11544 va_start(args, fmt); \
11545 \
11546 vaf.fmt = fmt; \
11547 vaf.va = &args; \
11548 \
11549 __netdev_printk(level, dev, &vaf); \
11550 \
11551 va_end(args); \
11552} \
11553EXPORT_SYMBOL(func);
11554
11555define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11556define_netdev_printk_level(netdev_alert, KERN_ALERT);
11557define_netdev_printk_level(netdev_crit, KERN_CRIT);
11558define_netdev_printk_level(netdev_err, KERN_ERR);
11559define_netdev_printk_level(netdev_warn, KERN_WARNING);
11560define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11561define_netdev_printk_level(netdev_info, KERN_INFO);
11562
11563static void __net_exit netdev_exit(struct net *net)
11564{
11565 kfree(net->dev_name_head);
11566 kfree(net->dev_index_head);
11567 xa_destroy(&net->dev_by_index);
11568 if (net != &init_net)
11569 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11570}
11571
11572static struct pernet_operations __net_initdata netdev_net_ops = {
11573 .init = netdev_init,
11574 .exit = netdev_exit,
11575};
11576
11577static void __net_exit default_device_exit_net(struct net *net)
11578{
11579 struct netdev_name_node *name_node, *tmp;
11580 struct net_device *dev, *aux;
11581 /*
11582 * Push all migratable network devices back to the
11583 * initial network namespace
11584 */
11585 ASSERT_RTNL();
11586 for_each_netdev_safe(net, dev, aux) {
11587 int err;
11588 char fb_name[IFNAMSIZ];
11589
11590 /* Ignore unmoveable devices (i.e. loopback) */
11591 if (dev->features & NETIF_F_NETNS_LOCAL)
11592 continue;
11593
11594 /* Leave virtual devices for the generic cleanup */
11595 if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11596 continue;
11597
11598 /* Push remaining network devices to init_net */
11599 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11600 if (netdev_name_in_use(&init_net, fb_name))
11601 snprintf(fb_name, IFNAMSIZ, "dev%%d");
11602
11603 netdev_for_each_altname_safe(dev, name_node, tmp)
11604 if (netdev_name_in_use(&init_net, name_node->name))
11605 __netdev_name_node_alt_destroy(name_node);
11606
11607 err = dev_change_net_namespace(dev, &init_net, fb_name);
11608 if (err) {
11609 pr_emerg("%s: failed to move %s to init_net: %d\n",
11610 __func__, dev->name, err);
11611 BUG();
11612 }
11613 }
11614}
11615
11616static void __net_exit default_device_exit_batch(struct list_head *net_list)
11617{
11618 /* At exit all network devices must be removed from a network
11619 * namespace. Do this in the reverse order of registration.
11620 * Do this across as many network namespaces as possible to
11621 * improve batching efficiency.
11622 */
11623 struct net_device *dev;
11624 struct net *net;
11625 LIST_HEAD(dev_kill_list);
11626
11627 rtnl_lock();
11628 list_for_each_entry(net, net_list, exit_list) {
11629 default_device_exit_net(net);
11630 cond_resched();
11631 }
11632
11633 list_for_each_entry(net, net_list, exit_list) {
11634 for_each_netdev_reverse(net, dev) {
11635 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11636 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11637 else
11638 unregister_netdevice_queue(dev, &dev_kill_list);
11639 }
11640 }
11641 unregister_netdevice_many(&dev_kill_list);
11642 rtnl_unlock();
11643}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit_batch = default_device_exit_batch,
};

static void __init net_dev_struct_check(void)
{
	/* TX read-mostly hotpath */
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
#ifdef CONFIG_XPS
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
#endif
#ifdef CONFIG_NETFILTER_EGRESS
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
#endif
#ifdef CONFIG_NET_XGRESS
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
#endif
	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);

	/* TXRX read-mostly hotpath */
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);

	/* RX read-mostly hotpath */
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
#ifdef CONFIG_NETPOLL
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
#endif
#ifdef CONFIG_NET_XGRESS
	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
}
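
/*
 * Rough sketch of what net_dev_struct_check() enforces (simplified, not the
 * real field layout): the hot read-mostly members of struct net_device are
 * wrapped in cacheline groups in <linux/netdevice.h>, roughly like:
 *
 *	struct net_device {
 *		...
 *		__cacheline_group_begin(net_device_read_tx);
 *		const struct net_device_ops *netdev_ops;
 *		const struct header_ops *header_ops;
 *		...
 *		__cacheline_group_end(net_device_read_tx);
 *		...
 *	};
 *
 * The CACHELINE_ASSERT_GROUP_MEMBER()/CACHELINE_ASSERT_GROUP_SIZE() checks
 * above then break the build if a field drifts out of its group or a group
 * grows past the size budgeted for it.
 */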

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 */

/* Reserve 1 MiB of pages per CPU: 256 pages when PAGE_SHIFT is 12 (4 KiB pages) */
#define SYSTEM_PERCPU_PAGE_POOL_SIZE	((1 << 20) / PAGE_SIZE)

static int net_page_pool_create(int cpuid)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
	struct page_pool_params page_pool_params = {
		.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
		.flags = PP_FLAG_SYSTEM_POOL,
		.nid = NUMA_NO_NODE,
	};
	struct page_pool *pp_ptr;

	pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
	if (IS_ERR(pp_ptr))
		return -ENOMEM;

	per_cpu(system_page_pool, cpuid) = pp_ptr;
#endif
	return 0;
}
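
/*
 * Illustrative only: a consumer of the per-CPU system pool (for instance the
 * generic XDP path) grabs the pool for the local CPU and recycles pages
 * through it, along these lines:
 *
 *	struct page_pool *pool = this_cpu_read(system_page_pool);
 *	struct page *page = page_pool_dev_alloc_pages(pool);
 *
 *	if (page) {
 *		... use the page for a frame ...
 *		page_pool_put_full_page(pool, page, false);
 *	}
 *
 * The pools created above carry PP_FLAG_SYSTEM_POOL and are not pinned to a
 * NUMA node (NUMA_NO_NODE).
 */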

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	net_dev_struct_check();

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
		skb_queue_head_init(&sd->xfrm_backlog);
#endif
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
		sd->cpu = i;
#endif
		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
		spin_lock_init(&sd->defer_lock);

		init_gro_hash(&sd->backlog);
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;

		if (net_page_pool_create(i))
			goto out;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free
	 * the loopback device, maintain this invariant by keeping the
	 * loopback device first on the list of network devices, so
	 * that it is the first device that appears and the last
	 * network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
				       NULL, dev_cpu_dead);
	WARN_ON(rc < 0);
	rc = 0;
out:
	if (rc < 0) {
		for_each_possible_cpu(i) {
			struct page_pool *pp_ptr;

			pp_ptr = per_cpu(system_page_pool, i);
			if (!pp_ptr)
				continue;

			page_pool_destroy(pp_ptr);
			per_cpu(system_page_pool, i) = NULL;
		}
	}

	return rc;
}

subsys_initcall(net_dev_init);
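
/*
 * Purely illustrative, with hypothetical "foo" names: because net_dev_init()
 * runs at subsys_initcall time, a driver registering its netdev from
 * module_init()/device_initcall() already finds the core initialised
 * (softnet queues, NET_TX/NET_RX softirqs, the netdev sysfs class).
 * foo_start_xmit is assumed to be defined elsewhere:
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_start_xmit	= foo_start_xmit,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		struct net_device *dev = alloc_etherdev(0);
 *		int err;
 *
 *		if (!dev)
 *			return -ENOMEM;
 *		dev->netdev_ops = &foo_netdev_ops;
 *		err = register_netdev(dev);
 *		if (err)
 *			free_netdev(dev);
 *		return err;
 *	}
 *	module_init(foo_init);
 */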
75#include <asm/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
100#include <linux/proc_fs.h>
101#include <linux/seq_file.h>
102#include <linux/stat.h>
103#include <net/dst.h>
104#include <net/pkt_sched.h>
105#include <net/checksum.h>
106#include <net/xfrm.h>
107#include <linux/highmem.h>
108#include <linux/init.h>
109#include <linux/kmod.h>
110#include <linux/module.h>
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
114#include <net/wext.h>
115#include <net/iw_handler.h>
116#include <asm/current.h>
117#include <linux/audit.h>
118#include <linux/dmaengine.h>
119#include <linux/err.h>
120#include <linux/ctype.h>
121#include <linux/if_arp.h>
122#include <linux/if_vlan.h>
123#include <linux/ip.h>
124#include <net/ip.h>
125#include <linux/ipv6.h>
126#include <linux/in.h>
127#include <linux/jhash.h>
128#include <linux/random.h>
129#include <trace/events/napi.h>
130#include <trace/events/net.h>
131#include <trace/events/skb.h>
132#include <linux/pci.h>
133#include <linux/inetdevice.h>
134#include <linux/cpu_rmap.h>
135#include <linux/net_tstamp.h>
136#include <linux/static_key.h>
137#include <net/flow_keys.h>
138
139#include "net-sysfs.h"
140
141/* Instead of increasing this, you should create a hash table. */
142#define MAX_GRO_SKBS 8
143
144/* This should be increased if a protocol with a bigger head is added. */
145#define GRO_MAX_HEAD (MAX_HEADER + 128)
146
147/*
148 * The list of packet types we will receive (as opposed to discard)
149 * and the routines to invoke.
150 *
151 * Why 16. Because with 16 the only overlap we get on a hash of the
152 * low nibble of the protocol value is RARP/SNAP/X.25.
153 *
154 * NOTE: That is no longer true with the addition of VLAN tags. Not
155 * sure which should go first, but I bet it won't make much
156 * difference if we are running VLANs. The good news is that
157 * this protocol won't be in the list unless compiled in, so
158 * the average user (w/out VLANs) will not be adversely affected.
159 * --BLG
160 *
161 * 0800 IP
162 * 8100 802.1Q VLAN
163 * 0001 802.3
164 * 0002 AX.25
165 * 0004 802.2
166 * 8035 RARP
167 * 0005 SNAP
168 * 0805 X.25
169 * 0806 ARP
170 * 8137 IPX
171 * 0009 Localtalk
172 * 86DD IPv6
173 */
174
175#define PTYPE_HASH_SIZE (16)
176#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177
178static DEFINE_SPINLOCK(ptype_lock);
179static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180static struct list_head ptype_all __read_mostly; /* Taps */
181
182/*
183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184 * semaphore.
185 *
186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 *
188 * Writers must hold the rtnl semaphore while they loop through the
189 * dev_base_head list, and hold dev_base_lock for writing when they do the
190 * actual updates. This allows pure readers to access the list even
191 * while a writer is preparing to update it.
192 *
193 * To put it another way, dev_base_lock is held for writing only to
194 * protect against pure readers; the rtnl semaphore provides the
195 * protection against other writers.
196 *
197 * See, for example usages, register_netdevice() and
198 * unregister_netdevice(), which must be called with the rtnl
199 * semaphore held.
200 */
201DEFINE_RWLOCK(dev_base_lock);
202EXPORT_SYMBOL(dev_base_lock);
203
204static inline void dev_base_seq_inc(struct net *net)
205{
206 while (++net->dev_base_seq == 0);
207}
208
209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210{
211 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212
213 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214}
215
216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217{
218 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219}
220
221static inline void rps_lock(struct softnet_data *sd)
222{
223#ifdef CONFIG_RPS
224 spin_lock(&sd->input_pkt_queue.lock);
225#endif
226}
227
228static inline void rps_unlock(struct softnet_data *sd)
229{
230#ifdef CONFIG_RPS
231 spin_unlock(&sd->input_pkt_queue.lock);
232#endif
233}
234
235/* Device list insertion */
236static int list_netdevice(struct net_device *dev)
237{
238 struct net *net = dev_net(dev);
239
240 ASSERT_RTNL();
241
242 write_lock_bh(&dev_base_lock);
243 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
244 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
245 hlist_add_head_rcu(&dev->index_hlist,
246 dev_index_hash(net, dev->ifindex));
247 write_unlock_bh(&dev_base_lock);
248
249 dev_base_seq_inc(net);
250
251 return 0;
252}
253
254/* Device list removal
255 * caller must respect a RCU grace period before freeing/reusing dev
256 */
257static void unlist_netdevice(struct net_device *dev)
258{
259 ASSERT_RTNL();
260
261 /* Unlink dev from the device chain */
262 write_lock_bh(&dev_base_lock);
263 list_del_rcu(&dev->dev_list);
264 hlist_del_rcu(&dev->name_hlist);
265 hlist_del_rcu(&dev->index_hlist);
266 write_unlock_bh(&dev_base_lock);
267
268 dev_base_seq_inc(dev_net(dev));
269}
270
271/*
272 * Our notifier list
273 */
274
275static RAW_NOTIFIER_HEAD(netdev_chain);
276
277/*
278 * Device drivers call our routines to queue packets here. We empty the
279 * queue in the local softnet handler.
280 */
281
282DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
283EXPORT_PER_CPU_SYMBOL(softnet_data);
284
285#ifdef CONFIG_LOCKDEP
286/*
287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
288 * according to dev->type
289 */
290static const unsigned short netdev_lock_type[] =
291 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
292 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
293 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
294 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
295 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
296 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
297 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
298 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
299 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
300 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
301 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
302 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
303 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
304 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
305 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
306
307static const char *const netdev_lock_name[] =
308 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
309 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
310 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
311 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
312 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
313 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
314 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
315 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
316 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
317 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
318 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
319 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
320 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
321 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
322 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
323
324static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
325static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
326
327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328{
329 int i;
330
331 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332 if (netdev_lock_type[i] == dev_type)
333 return i;
334 /* the last key is used by default */
335 return ARRAY_SIZE(netdev_lock_type) - 1;
336}
337
338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 unsigned short dev_type)
340{
341 int i;
342
343 i = netdev_lock_pos(dev_type);
344 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345 netdev_lock_name[i]);
346}
347
348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349{
350 int i;
351
352 i = netdev_lock_pos(dev->type);
353 lockdep_set_class_and_name(&dev->addr_list_lock,
354 &netdev_addr_lock_key[i],
355 netdev_lock_name[i]);
356}
357#else
358static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
359 unsigned short dev_type)
360{
361}
362static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
363{
364}
365#endif
366
367/*******************************************************************************
368
369 Protocol management and registration routines
370
371*******************************************************************************/
372
373/*
374 * Add a protocol ID to the list. Now that the input handler is
375 * smarter we can dispense with all the messy stuff that used to be
376 * here.
377 *
378 * BEWARE!!! Protocol handlers, mangling input packets,
379 * MUST BE last in hash buckets and checking protocol handlers
380 * MUST start from promiscuous ptype_all chain in net_bh.
381 * It is true now, do not change it.
382 * Explanation follows: if protocol handler, mangling packet, will
383 * be the first on list, it is not able to sense, that packet
384 * is cloned and should be copied-on-write, so that it will
385 * change it and subsequent readers will get broken packet.
386 * --ANK (980803)
387 */
388
389static inline struct list_head *ptype_head(const struct packet_type *pt)
390{
391 if (pt->type == htons(ETH_P_ALL))
392 return &ptype_all;
393 else
394 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395}
396
397/**
398 * dev_add_pack - add packet handler
399 * @pt: packet type declaration
400 *
401 * Add a protocol handler to the networking stack. The passed &packet_type
402 * is linked into kernel lists and may not be freed until it has been
403 * removed from the kernel lists.
404 *
405 * This call does not sleep therefore it can not
406 * guarantee all CPU's that are in middle of receiving packets
407 * will see the new packet type (until the next received packet).
408 */
409
410void dev_add_pack(struct packet_type *pt)
411{
412 struct list_head *head = ptype_head(pt);
413
414 spin_lock(&ptype_lock);
415 list_add_rcu(&pt->list, head);
416 spin_unlock(&ptype_lock);
417}
418EXPORT_SYMBOL(dev_add_pack);
419
420/**
421 * __dev_remove_pack - remove packet handler
422 * @pt: packet type declaration
423 *
424 * Remove a protocol handler that was previously added to the kernel
425 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
426 * from the kernel lists and can be freed or reused once this function
427 * returns.
428 *
429 * The packet type might still be in use by receivers
430 * and must not be freed until after all the CPU's have gone
431 * through a quiescent state.
432 */
433void __dev_remove_pack(struct packet_type *pt)
434{
435 struct list_head *head = ptype_head(pt);
436 struct packet_type *pt1;
437
438 spin_lock(&ptype_lock);
439
440 list_for_each_entry(pt1, head, list) {
441 if (pt == pt1) {
442 list_del_rcu(&pt->list);
443 goto out;
444 }
445 }
446
447 pr_warn("dev_remove_pack: %p not found\n", pt);
448out:
449 spin_unlock(&ptype_lock);
450}
451EXPORT_SYMBOL(__dev_remove_pack);
452
453/**
454 * dev_remove_pack - remove packet handler
455 * @pt: packet type declaration
456 *
457 * Remove a protocol handler that was previously added to the kernel
458 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
459 * from the kernel lists and can be freed or reused once this function
460 * returns.
461 *
462 * This call sleeps to guarantee that no CPU is looking at the packet
463 * type after return.
464 */
465void dev_remove_pack(struct packet_type *pt)
466{
467 __dev_remove_pack(pt);
468
469 synchronize_net();
470}
471EXPORT_SYMBOL(dev_remove_pack);
472
473/******************************************************************************
474
475 Device Boot-time Settings Routines
476
477*******************************************************************************/
478
479/* Boot time configuration table */
480static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481
482/**
483 * netdev_boot_setup_add - add new setup entry
484 * @name: name of the device
485 * @map: configured settings for the device
486 *
487 * Adds new setup entry to the dev_boot_setup list. The function
488 * returns 0 on error and 1 on success. This is a generic routine to
489 * all netdevices.
490 */
491static int netdev_boot_setup_add(char *name, struct ifmap *map)
492{
493 struct netdev_boot_setup *s;
494 int i;
495
496 s = dev_boot_setup;
497 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499 memset(s[i].name, 0, sizeof(s[i].name));
500 strlcpy(s[i].name, name, IFNAMSIZ);
501 memcpy(&s[i].map, map, sizeof(s[i].map));
502 break;
503 }
504 }
505
506 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507}
508
509/**
510 * netdev_boot_setup_check - check boot time settings
511 * @dev: the netdevice
512 *
513 * Check boot time settings for the device.
514 * The found settings are set for the device to be used
515 * later in the device probing.
516 * Returns 0 if no settings found, 1 if they are.
517 */
518int netdev_boot_setup_check(struct net_device *dev)
519{
520 struct netdev_boot_setup *s = dev_boot_setup;
521 int i;
522
523 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
525 !strcmp(dev->name, s[i].name)) {
526 dev->irq = s[i].map.irq;
527 dev->base_addr = s[i].map.base_addr;
528 dev->mem_start = s[i].map.mem_start;
529 dev->mem_end = s[i].map.mem_end;
530 return 1;
531 }
532 }
533 return 0;
534}
535EXPORT_SYMBOL(netdev_boot_setup_check);
536
537
538/**
539 * netdev_boot_base - get address from boot time settings
540 * @prefix: prefix for network device
541 * @unit: id for network device
542 *
543 * Check boot time settings for the base address of device.
544 * The found settings are set for the device to be used
545 * later in the device probing.
546 * Returns 0 if no settings found.
547 */
548unsigned long netdev_boot_base(const char *prefix, int unit)
549{
550 const struct netdev_boot_setup *s = dev_boot_setup;
551 char name[IFNAMSIZ];
552 int i;
553
554 sprintf(name, "%s%d", prefix, unit);
555
556 /*
557 * If device already registered then return base of 1
558 * to indicate not to probe for this interface
559 */
560 if (__dev_get_by_name(&init_net, name))
561 return 1;
562
563 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564 if (!strcmp(name, s[i].name))
565 return s[i].map.base_addr;
566 return 0;
567}
568
569/*
570 * Saves at boot time configured settings for any netdevice.
571 */
572int __init netdev_boot_setup(char *str)
573{
574 int ints[5];
575 struct ifmap map;
576
577 str = get_options(str, ARRAY_SIZE(ints), ints);
578 if (!str || !*str)
579 return 0;
580
581 /* Save settings */
582 memset(&map, 0, sizeof(map));
583 if (ints[0] > 0)
584 map.irq = ints[1];
585 if (ints[0] > 1)
586 map.base_addr = ints[2];
587 if (ints[0] > 2)
588 map.mem_start = ints[3];
589 if (ints[0] > 3)
590 map.mem_end = ints[4];
591
592 /* Add new entry to the list */
593 return netdev_boot_setup_add(str, &map);
594}
595
596__setup("netdev=", netdev_boot_setup);
597
598/*******************************************************************************
599
600 Device Interface Subroutines
601
602*******************************************************************************/
603
604/**
605 * __dev_get_by_name - find a device by its name
606 * @net: the applicable net namespace
607 * @name: name to find
608 *
609 * Find an interface by name. Must be called under RTNL semaphore
610 * or @dev_base_lock. If the name is found a pointer to the device
611 * is returned. If the name is not found then %NULL is returned. The
612 * reference counters are not incremented so the caller must be
613 * careful with locks.
614 */
615
616struct net_device *__dev_get_by_name(struct net *net, const char *name)
617{
618 struct hlist_node *p;
619 struct net_device *dev;
620 struct hlist_head *head = dev_name_hash(net, name);
621
622 hlist_for_each_entry(dev, p, head, name_hlist)
623 if (!strncmp(dev->name, name, IFNAMSIZ))
624 return dev;
625
626 return NULL;
627}
628EXPORT_SYMBOL(__dev_get_by_name);
629
630/**
631 * dev_get_by_name_rcu - find a device by its name
632 * @net: the applicable net namespace
633 * @name: name to find
634 *
635 * Find an interface by name.
636 * If the name is found a pointer to the device is returned.
637 * If the name is not found then %NULL is returned.
638 * The reference counters are not incremented so the caller must be
639 * careful with locks. The caller must hold RCU lock.
640 */
641
642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643{
644 struct hlist_node *p;
645 struct net_device *dev;
646 struct hlist_head *head = dev_name_hash(net, name);
647
648 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649 if (!strncmp(dev->name, name, IFNAMSIZ))
650 return dev;
651
652 return NULL;
653}
654EXPORT_SYMBOL(dev_get_by_name_rcu);
655
656/**
657 * dev_get_by_name - find a device by its name
658 * @net: the applicable net namespace
659 * @name: name to find
660 *
661 * Find an interface by name. This can be called from any
662 * context and does its own locking. The returned handle has
663 * the usage count incremented and the caller must use dev_put() to
664 * release it when it is no longer needed. %NULL is returned if no
665 * matching device is found.
666 */
667
668struct net_device *dev_get_by_name(struct net *net, const char *name)
669{
670 struct net_device *dev;
671
672 rcu_read_lock();
673 dev = dev_get_by_name_rcu(net, name);
674 if (dev)
675 dev_hold(dev);
676 rcu_read_unlock();
677 return dev;
678}
679EXPORT_SYMBOL(dev_get_by_name);
680
681/**
682 * __dev_get_by_index - find a device by its ifindex
683 * @net: the applicable net namespace
684 * @ifindex: index of device
685 *
686 * Search for an interface by index. Returns %NULL if the device
687 * is not found or a pointer to the device. The device has not
688 * had its reference counter increased so the caller must be careful
689 * about locking. The caller must hold either the RTNL semaphore
690 * or @dev_base_lock.
691 */
692
693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
694{
695 struct hlist_node *p;
696 struct net_device *dev;
697 struct hlist_head *head = dev_index_hash(net, ifindex);
698
699 hlist_for_each_entry(dev, p, head, index_hlist)
700 if (dev->ifindex == ifindex)
701 return dev;
702
703 return NULL;
704}
705EXPORT_SYMBOL(__dev_get_by_index);
706
707/**
708 * dev_get_by_index_rcu - find a device by its ifindex
709 * @net: the applicable net namespace
710 * @ifindex: index of device
711 *
712 * Search for an interface by index. Returns %NULL if the device
713 * is not found or a pointer to the device. The device has not
714 * had its reference counter increased so the caller must be careful
715 * about locking. The caller must hold RCU lock.
716 */
717
718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719{
720 struct hlist_node *p;
721 struct net_device *dev;
722 struct hlist_head *head = dev_index_hash(net, ifindex);
723
724 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725 if (dev->ifindex == ifindex)
726 return dev;
727
728 return NULL;
729}
730EXPORT_SYMBOL(dev_get_by_index_rcu);
731
732
733/**
734 * dev_get_by_index - find a device by its ifindex
735 * @net: the applicable net namespace
736 * @ifindex: index of device
737 *
738 * Search for an interface by index. Returns NULL if the device
739 * is not found or a pointer to the device. The device returned has
740 * had a reference added and the pointer is safe until the user calls
741 * dev_put to indicate they have finished with it.
742 */
743
744struct net_device *dev_get_by_index(struct net *net, int ifindex)
745{
746 struct net_device *dev;
747
748 rcu_read_lock();
749 dev = dev_get_by_index_rcu(net, ifindex);
750 if (dev)
751 dev_hold(dev);
752 rcu_read_unlock();
753 return dev;
754}
755EXPORT_SYMBOL(dev_get_by_index);
756
757/**
758 * dev_getbyhwaddr_rcu - find a device by its hardware address
759 * @net: the applicable net namespace
760 * @type: media type of device
761 * @ha: hardware address
762 *
763 * Search for an interface by MAC address. Returns NULL if the device
764 * is not found or a pointer to the device.
765 * The caller must hold RCU or RTNL.
766 * The returned device has not had its ref count increased
767 * and the caller must therefore be careful about locking
768 *
769 */
770
771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772 const char *ha)
773{
774 struct net_device *dev;
775
776 for_each_netdev_rcu(net, dev)
777 if (dev->type == type &&
778 !memcmp(dev->dev_addr, ha, dev->addr_len))
779 return dev;
780
781 return NULL;
782}
783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
784
785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
786{
787 struct net_device *dev;
788
789 ASSERT_RTNL();
790 for_each_netdev(net, dev)
791 if (dev->type == type)
792 return dev;
793
794 return NULL;
795}
796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
797
798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
799{
800 struct net_device *dev, *ret = NULL;
801
802 rcu_read_lock();
803 for_each_netdev_rcu(net, dev)
804 if (dev->type == type) {
805 dev_hold(dev);
806 ret = dev;
807 break;
808 }
809 rcu_read_unlock();
810 return ret;
811}
812EXPORT_SYMBOL(dev_getfirstbyhwtype);
813
814/**
815 * dev_get_by_flags_rcu - find any device with given flags
816 * @net: the applicable net namespace
817 * @if_flags: IFF_* values
818 * @mask: bitmask of bits in if_flags to check
819 *
820 * Search for any interface with the given flags. Returns NULL if a device
821 * is not found or a pointer to the device. Must be called inside
822 * rcu_read_lock(), and result refcount is unchanged.
823 */
824
825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
826 unsigned short mask)
827{
828 struct net_device *dev, *ret;
829
830 ret = NULL;
831 for_each_netdev_rcu(net, dev) {
832 if (((dev->flags ^ if_flags) & mask) == 0) {
833 ret = dev;
834 break;
835 }
836 }
837 return ret;
838}
839EXPORT_SYMBOL(dev_get_by_flags_rcu);
840
841/**
842 * dev_valid_name - check if name is okay for network device
843 * @name: name string
844 *
845 * Network device names need to be valid file names to
846 * to allow sysfs to work. We also disallow any kind of
847 * whitespace.
848 */
849bool dev_valid_name(const char *name)
850{
851 if (*name == '\0')
852 return false;
853 if (strlen(name) >= IFNAMSIZ)
854 return false;
855 if (!strcmp(name, ".") || !strcmp(name, ".."))
856 return false;
857
858 while (*name) {
859 if (*name == '/' || isspace(*name))
860 return false;
861 name++;
862 }
863 return true;
864}
865EXPORT_SYMBOL(dev_valid_name);
866
867/**
868 * __dev_alloc_name - allocate a name for a device
869 * @net: network namespace to allocate the device name in
870 * @name: name format string
871 * @buf: scratch buffer and result name string
872 *
873 * Passed a format string - eg "lt%d" it will try and find a suitable
874 * id. It scans list of devices to build up a free map, then chooses
875 * the first empty slot. The caller must hold the dev_base or rtnl lock
876 * while allocating the name and adding the device in order to avoid
877 * duplicates.
878 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
879 * Returns the number of the unit assigned or a negative errno code.
880 */
881
882static int __dev_alloc_name(struct net *net, const char *name, char *buf)
883{
884 int i = 0;
885 const char *p;
886 const int max_netdevices = 8*PAGE_SIZE;
887 unsigned long *inuse;
888 struct net_device *d;
889
890 p = strnchr(name, IFNAMSIZ-1, '%');
891 if (p) {
892 /*
893 * Verify the string as this thing may have come from
894 * the user. There must be either one "%d" and no other "%"
895 * characters.
896 */
897 if (p[1] != 'd' || strchr(p + 2, '%'))
898 return -EINVAL;
899
900 /* Use one page as a bit array of possible slots */
901 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
902 if (!inuse)
903 return -ENOMEM;
904
905 for_each_netdev(net, d) {
906 if (!sscanf(d->name, name, &i))
907 continue;
908 if (i < 0 || i >= max_netdevices)
909 continue;
910
911 /* avoid cases where sscanf is not exact inverse of printf */
912 snprintf(buf, IFNAMSIZ, name, i);
913 if (!strncmp(buf, d->name, IFNAMSIZ))
914 set_bit(i, inuse);
915 }
916
917 i = find_first_zero_bit(inuse, max_netdevices);
918 free_page((unsigned long) inuse);
919 }
920
921 if (buf != name)
922 snprintf(buf, IFNAMSIZ, name, i);
923 if (!__dev_get_by_name(net, buf))
924 return i;
925
926 /* It is possible to run out of possible slots
927 * when the name is long and there isn't enough space left
928 * for the digits, or if all bits are used.
929 */
930 return -ENFILE;
931}
932
933/**
934 * dev_alloc_name - allocate a name for a device
935 * @dev: device
936 * @name: name format string
937 *
938 * Passed a format string - eg "lt%d" it will try and find a suitable
939 * id. It scans list of devices to build up a free map, then chooses
940 * the first empty slot. The caller must hold the dev_base or rtnl lock
941 * while allocating the name and adding the device in order to avoid
942 * duplicates.
943 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
944 * Returns the number of the unit assigned or a negative errno code.
945 */
946
947int dev_alloc_name(struct net_device *dev, const char *name)
948{
949 char buf[IFNAMSIZ];
950 struct net *net;
951 int ret;
952
953 BUG_ON(!dev_net(dev));
954 net = dev_net(dev);
955 ret = __dev_alloc_name(net, name, buf);
956 if (ret >= 0)
957 strlcpy(dev->name, buf, IFNAMSIZ);
958 return ret;
959}
960EXPORT_SYMBOL(dev_alloc_name);
961
962static int dev_get_valid_name(struct net_device *dev, const char *name)
963{
964 struct net *net;
965
966 BUG_ON(!dev_net(dev));
967 net = dev_net(dev);
968
969 if (!dev_valid_name(name))
970 return -EINVAL;
971
972 if (strchr(name, '%'))
973 return dev_alloc_name(dev, name);
974 else if (__dev_get_by_name(net, name))
975 return -EEXIST;
976 else if (dev->name != name)
977 strlcpy(dev->name, name, IFNAMSIZ);
978
979 return 0;
980}
981
982/**
983 * dev_change_name - change name of a device
984 * @dev: device
985 * @newname: name (or format string) must be at least IFNAMSIZ
986 *
987 * Change name of a device, can pass format strings "eth%d".
988 * for wildcarding.
989 */
990int dev_change_name(struct net_device *dev, const char *newname)
991{
992 char oldname[IFNAMSIZ];
993 int err = 0;
994 int ret;
995 struct net *net;
996
997 ASSERT_RTNL();
998 BUG_ON(!dev_net(dev));
999
1000 net = dev_net(dev);
1001 if (dev->flags & IFF_UP)
1002 return -EBUSY;
1003
1004 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005 return 0;
1006
1007 memcpy(oldname, dev->name, IFNAMSIZ);
1008
1009 err = dev_get_valid_name(dev, newname);
1010 if (err < 0)
1011 return err;
1012
1013rollback:
1014 ret = device_rename(&dev->dev, dev->name);
1015 if (ret) {
1016 memcpy(dev->name, oldname, IFNAMSIZ);
1017 return ret;
1018 }
1019
1020 write_lock_bh(&dev_base_lock);
1021 hlist_del_rcu(&dev->name_hlist);
1022 write_unlock_bh(&dev_base_lock);
1023
1024 synchronize_rcu();
1025
1026 write_lock_bh(&dev_base_lock);
1027 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028 write_unlock_bh(&dev_base_lock);
1029
1030 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031 ret = notifier_to_errno(ret);
1032
1033 if (ret) {
1034 /* err >= 0 after dev_alloc_name() or stores the first errno */
1035 if (err >= 0) {
1036 err = ret;
1037 memcpy(dev->name, oldname, IFNAMSIZ);
1038 goto rollback;
1039 } else {
1040 pr_err("%s: name change rollback failed: %d\n",
1041 dev->name, ret);
1042 }
1043 }
1044
1045 return err;
1046}
1047
1048/**
1049 * dev_set_alias - change ifalias of a device
1050 * @dev: device
1051 * @alias: name up to IFALIASZ
1052 * @len: limit of bytes to copy from info
1053 *
1054 * Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058 char *new_ifalias;
1059
1060 ASSERT_RTNL();
1061
1062 if (len >= IFALIASZ)
1063 return -EINVAL;
1064
1065 if (!len) {
1066 if (dev->ifalias) {
1067 kfree(dev->ifalias);
1068 dev->ifalias = NULL;
1069 }
1070 return 0;
1071 }
1072
1073 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074 if (!new_ifalias)
1075 return -ENOMEM;
1076 dev->ifalias = new_ifalias;
1077
1078 strlcpy(dev->ifalias, alias, len+1);
1079 return len;
1080}
1081
1082
1083/**
1084 * netdev_features_change - device changes features
1085 * @dev: device to cause notification
1086 *
1087 * Called to indicate a device has changed features.
1088 */
1089void netdev_features_change(struct net_device *dev)
1090{
1091 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092}
1093EXPORT_SYMBOL(netdev_features_change);
1094
1095/**
1096 * netdev_state_change - device changes state
1097 * @dev: device to cause notification
1098 *
1099 * Called to indicate a device has changed state. This function calls
1100 * the notifier chains for netdev_chain and sends a NEWLINK message
1101 * to the routing socket.
1102 */
1103void netdev_state_change(struct net_device *dev)
1104{
1105 if (dev->flags & IFF_UP) {
1106 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1108 }
1109}
1110EXPORT_SYMBOL(netdev_state_change);
1111
1112int netdev_bonding_change(struct net_device *dev, unsigned long event)
1113{
1114 return call_netdevice_notifiers(event, dev);
1115}
1116EXPORT_SYMBOL(netdev_bonding_change);
1117
1118/**
1119 * dev_load - load a network module
1120 * @net: the applicable net namespace
1121 * @name: name of interface
1122 *
1123 * If a network interface is not present and the process has suitable
1124 * privileges this function loads the module. If module loading is not
1125 * available in this kernel then it becomes a nop.
1126 */
1127
1128void dev_load(struct net *net, const char *name)
1129{
1130 struct net_device *dev;
1131 int no_module;
1132
1133 rcu_read_lock();
1134 dev = dev_get_by_name_rcu(net, name);
1135 rcu_read_unlock();
1136
1137 no_module = !dev;
1138 if (no_module && capable(CAP_NET_ADMIN))
1139 no_module = request_module("netdev-%s", name);
1140 if (no_module && capable(CAP_SYS_MODULE)) {
1141 if (!request_module("%s", name))
1142 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1143 name);
1144 }
1145}
1146EXPORT_SYMBOL(dev_load);
1147
1148static int __dev_open(struct net_device *dev)
1149{
1150 const struct net_device_ops *ops = dev->netdev_ops;
1151 int ret;
1152
1153 ASSERT_RTNL();
1154
1155 if (!netif_device_present(dev))
1156 return -ENODEV;
1157
1158 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1159 ret = notifier_to_errno(ret);
1160 if (ret)
1161 return ret;
1162
1163 set_bit(__LINK_STATE_START, &dev->state);
1164
1165 if (ops->ndo_validate_addr)
1166 ret = ops->ndo_validate_addr(dev);
1167
1168 if (!ret && ops->ndo_open)
1169 ret = ops->ndo_open(dev);
1170
1171 if (ret)
1172 clear_bit(__LINK_STATE_START, &dev->state);
1173 else {
1174 dev->flags |= IFF_UP;
1175 net_dmaengine_get();
1176 dev_set_rx_mode(dev);
1177 dev_activate(dev);
1178 add_device_randomness(dev->dev_addr, dev->addr_len);
1179 }
1180
1181 return ret;
1182}
1183
1184/**
1185 * dev_open - prepare an interface for use.
1186 * @dev: device to open
1187 *
1188 * Takes a device from down to up state. The device's private open
1189 * function is invoked and then the multicast lists are loaded. Finally
1190 * the device is moved into the up state and a %NETDEV_UP message is
1191 * sent to the netdev notifier chain.
1192 *
1193 * Calling this function on an active interface is a nop. On a failure
1194 * a negative errno code is returned.
1195 */
1196int dev_open(struct net_device *dev)
1197{
1198 int ret;
1199
1200 if (dev->flags & IFF_UP)
1201 return 0;
1202
1203 ret = __dev_open(dev);
1204 if (ret < 0)
1205 return ret;
1206
1207 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208 call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210 return ret;
1211}
1212EXPORT_SYMBOL(dev_open);
1213
1214static int __dev_close_many(struct list_head *head)
1215{
1216 struct net_device *dev;
1217
1218 ASSERT_RTNL();
1219 might_sleep();
1220
1221 list_for_each_entry(dev, head, unreg_list) {
1222 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1223
1224 clear_bit(__LINK_STATE_START, &dev->state);
1225
1226 /* Synchronize to scheduled poll. We cannot touch poll list, it
1227 * can be even on different cpu. So just clear netif_running().
1228 *
1229 * dev->stop() will invoke napi_disable() on all of it's
1230 * napi_struct instances on this device.
1231 */
1232 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1233 }
1234
1235 dev_deactivate_many(head);
1236
1237 list_for_each_entry(dev, head, unreg_list) {
1238 const struct net_device_ops *ops = dev->netdev_ops;
1239
1240 /*
1241 * Call the device specific close. This cannot fail.
1242 * Only if device is UP
1243 *
1244 * We allow it to be called even after a DETACH hot-plug
1245 * event.
1246 */
1247 if (ops->ndo_stop)
1248 ops->ndo_stop(dev);
1249
1250 dev->flags &= ~IFF_UP;
1251 net_dmaengine_put();
1252 }
1253
1254 return 0;
1255}
1256
1257static int __dev_close(struct net_device *dev)
1258{
1259 int retval;
1260 LIST_HEAD(single);
1261
1262 list_add(&dev->unreg_list, &single);
1263 retval = __dev_close_many(&single);
1264 list_del(&single);
1265 return retval;
1266}
1267
1268static int dev_close_many(struct list_head *head)
1269{
1270 struct net_device *dev, *tmp;
1271 LIST_HEAD(tmp_list);
1272
1273 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1274 if (!(dev->flags & IFF_UP))
1275 list_move(&dev->unreg_list, &tmp_list);
1276
1277 __dev_close_many(head);
1278
1279 list_for_each_entry(dev, head, unreg_list) {
1280 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281 call_netdevice_notifiers(NETDEV_DOWN, dev);
1282 }
1283
1284 /* rollback_registered_many needs the complete original list */
1285 list_splice(&tmp_list, head);
1286 return 0;
1287}
1288
1289/**
1290 * dev_close - shutdown an interface.
1291 * @dev: device to shutdown
1292 *
1293 * This function moves an active device into down state. A
1294 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296 * chain.
1297 */
1298int dev_close(struct net_device *dev)
1299{
1300 if (dev->flags & IFF_UP) {
1301 LIST_HEAD(single);
1302
1303 list_add(&dev->unreg_list, &single);
1304 dev_close_many(&single);
1305 list_del(&single);
1306 }
1307 return 0;
1308}
1309EXPORT_SYMBOL(dev_close);
1310
1311
1312/**
1313 * dev_disable_lro - disable Large Receive Offload on a device
1314 * @dev: device
1315 *
1316 * Disable Large Receive Offload (LRO) on a net device. Must be
1317 * called under RTNL. This is needed if received packets may be
1318 * forwarded to another interface.
1319 */
1320void dev_disable_lro(struct net_device *dev)
1321{
1322 /*
1323 * If we're trying to disable lro on a vlan device
1324 * use the underlying physical device instead
1325 */
1326 if (is_vlan_dev(dev))
1327 dev = vlan_dev_real_dev(dev);
1328
1329 dev->wanted_features &= ~NETIF_F_LRO;
1330 netdev_update_features(dev);
1331
1332 if (unlikely(dev->features & NETIF_F_LRO))
1333 netdev_WARN(dev, "failed to disable LRO!\n");
1334}
1335EXPORT_SYMBOL(dev_disable_lro);
1336
1337
1338static int dev_boot_phase = 1;
1339
1340/**
1341 * register_netdevice_notifier - register a network notifier block
1342 * @nb: notifier
1343 *
1344 * Register a notifier to be called when network device events occur.
1345 * The notifier passed is linked into the kernel structures and must
1346 * not be reused until it has been unregistered. A negative errno code
1347 * is returned on a failure.
1348 *
1349 * When registered all registration and up events are replayed
1350 * to the new notifier to allow device to have a race free
1351 * view of the network device list.
1352 */
1353
1354int register_netdevice_notifier(struct notifier_block *nb)
1355{
1356 struct net_device *dev;
1357 struct net_device *last;
1358 struct net *net;
1359 int err;
1360
1361 rtnl_lock();
1362 err = raw_notifier_chain_register(&netdev_chain, nb);
1363 if (err)
1364 goto unlock;
1365 if (dev_boot_phase)
1366 goto unlock;
1367 for_each_net(net) {
1368 for_each_netdev(net, dev) {
1369 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370 err = notifier_to_errno(err);
1371 if (err)
1372 goto rollback;
1373
1374 if (!(dev->flags & IFF_UP))
1375 continue;
1376
1377 nb->notifier_call(nb, NETDEV_UP, dev);
1378 }
1379 }
1380
1381unlock:
1382 rtnl_unlock();
1383 return err;
1384
1385rollback:
1386 last = dev;
1387 for_each_net(net) {
1388 for_each_netdev(net, dev) {
1389 if (dev == last)
1390 goto outroll;
1391
1392 if (dev->flags & IFF_UP) {
1393 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394 nb->notifier_call(nb, NETDEV_DOWN, dev);
1395 }
1396 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1397 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1398 }
1399 }
1400
1401outroll:
1402 raw_notifier_chain_unregister(&netdev_chain, nb);
1403 goto unlock;
1404}
1405EXPORT_SYMBOL(register_netdevice_notifier);
1406
1407/**
1408 * unregister_netdevice_notifier - unregister a network notifier block
1409 * @nb: notifier
1410 *
1411 * Unregister a notifier previously registered by
1412 * register_netdevice_notifier(). The notifier is unlinked into the
1413 * kernel structures and may then be reused. A negative errno code
1414 * is returned on a failure.
1415 *
1416 * After unregistering unregister and down device events are synthesized
1417 * for all devices on the device list to the removed notifier to remove
1418 * the need for special case cleanup code.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423 struct net_device *dev;
1424 struct net *net;
1425 int err;
1426
1427 rtnl_lock();
1428 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1429 if (err)
1430 goto unlock;
1431
1432 for_each_net(net) {
1433 for_each_netdev(net, dev) {
1434 if (dev->flags & IFF_UP) {
1435 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1436 nb->notifier_call(nb, NETDEV_DOWN, dev);
1437 }
1438 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1439 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1440 }
1441 }
1442unlock:
1443 rtnl_unlock();
1444 return err;
1445}
1446EXPORT_SYMBOL(unregister_netdevice_notifier);
1447
1448/**
1449 * call_netdevice_notifiers - call all network notifier blocks
1450 * @val: value passed unmodified to notifier function
1451 * @dev: net_device pointer passed unmodified to notifier function
1452 *
1453 * Call all network notifier blocks. Parameters and return value
1454 * are as for raw_notifier_call_chain().
1455 */
1456
1457int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1458{
1459 ASSERT_RTNL();
1460 return raw_notifier_call_chain(&netdev_chain, val, dev);
1461}
1462EXPORT_SYMBOL(call_netdevice_notifiers);
1463
1464static struct static_key netstamp_needed __read_mostly;
1465#ifdef HAVE_JUMP_LABEL
1466/* We are not allowed to call static_key_slow_dec() from irq context
1467 * If net_disable_timestamp() is called from irq context, defer the
1468 * static_key_slow_dec() calls.
1469 */
1470static atomic_t netstamp_needed_deferred;
1471#endif
1472
1473void net_enable_timestamp(void)
1474{
1475#ifdef HAVE_JUMP_LABEL
1476 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1477
1478 if (deferred) {
1479 while (--deferred)
1480 static_key_slow_dec(&netstamp_needed);
1481 return;
1482 }
1483#endif
1484 WARN_ON(in_interrupt());
1485 static_key_slow_inc(&netstamp_needed);
1486}
1487EXPORT_SYMBOL(net_enable_timestamp);
1488
1489void net_disable_timestamp(void)
1490{
1491#ifdef HAVE_JUMP_LABEL
1492 if (in_interrupt()) {
1493 atomic_inc(&netstamp_needed_deferred);
1494 return;
1495 }
1496#endif
1497 static_key_slow_dec(&netstamp_needed);
1498}
1499EXPORT_SYMBOL(net_disable_timestamp);
1500
1501static inline void net_timestamp_set(struct sk_buff *skb)
1502{
1503 skb->tstamp.tv64 = 0;
1504 if (static_key_false(&netstamp_needed))
1505 __net_timestamp(skb);
1506}
1507
1508#define net_timestamp_check(COND, SKB) \
1509 if (static_key_false(&netstamp_needed)) { \
1510 if ((COND) && !(SKB)->tstamp.tv64) \
1511 __net_timestamp(SKB); \
1512 } \
1513
1514static int net_hwtstamp_validate(struct ifreq *ifr)
1515{
1516 struct hwtstamp_config cfg;
1517 enum hwtstamp_tx_types tx_type;
1518 enum hwtstamp_rx_filters rx_filter;
1519 int tx_type_valid = 0;
1520 int rx_filter_valid = 0;
1521
1522 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1523 return -EFAULT;
1524
1525 if (cfg.flags) /* reserved for future extensions */
1526 return -EINVAL;
1527
1528 tx_type = cfg.tx_type;
1529 rx_filter = cfg.rx_filter;
1530
1531 switch (tx_type) {
1532 case HWTSTAMP_TX_OFF:
1533 case HWTSTAMP_TX_ON:
1534 case HWTSTAMP_TX_ONESTEP_SYNC:
1535 tx_type_valid = 1;
1536 break;
1537 }
1538
1539 switch (rx_filter) {
1540 case HWTSTAMP_FILTER_NONE:
1541 case HWTSTAMP_FILTER_ALL:
1542 case HWTSTAMP_FILTER_SOME:
1543 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1544 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1545 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1546 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1547 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1548 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1549 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1550 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1551 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1552 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1553 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1554 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1555 rx_filter_valid = 1;
1556 break;
1557 }
1558
1559 if (!tx_type_valid || !rx_filter_valid)
1560 return -ERANGE;
1561
1562 return 0;
1563}
1564
1565static inline bool is_skb_forwardable(struct net_device *dev,
1566 struct sk_buff *skb)
1567{
1568 unsigned int len;
1569
1570 if (!(dev->flags & IFF_UP))
1571 return false;
1572
1573 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1574 if (skb->len <= len)
1575 return true;
1576
1577 /* if TSO is enabled, we don't care about the length as the packet
1578 * could be forwarded without being segmented before
1579 */
1580 if (skb_is_gso(skb))
1581 return true;
1582
1583 return false;
1584}
1585
1586/**
1587 * dev_forward_skb - loopback an skb to another netif
1588 *
1589 * @dev: destination network device
1590 * @skb: buffer to forward
1591 *
1592 * return values:
1593 * NET_RX_SUCCESS (no congestion)
1594 * NET_RX_DROP (packet was dropped, but freed)
1595 *
1596 * dev_forward_skb can be used for injecting an skb from the
1597 * start_xmit function of one device into the receive queue
1598 * of another device.
1599 *
1600 * The receiving device may be in another namespace, so
1601 * we have to clear all information in the skb that could
1602 * impact namespace isolation.
1603 */
1604int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1605{
1606 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1607 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1608 atomic_long_inc(&dev->rx_dropped);
1609 kfree_skb(skb);
1610 return NET_RX_DROP;
1611 }
1612 }
1613
1614 skb_orphan(skb);
1615 nf_reset(skb);
1616
1617 if (unlikely(!is_skb_forwardable(dev, skb))) {
1618 atomic_long_inc(&dev->rx_dropped);
1619 kfree_skb(skb);
1620 return NET_RX_DROP;
1621 }
1622 skb->skb_iif = 0;
1623 skb->dev = dev;
1624 skb_dst_drop(skb);
1625 skb->tstamp.tv64 = 0;
1626 skb->pkt_type = PACKET_HOST;
1627 skb->protocol = eth_type_trans(skb, dev);
1628 skb->mark = 0;
1629 secpath_reset(skb);
1630 nf_reset(skb);
1631 return netif_rx(skb);
1632}
1633EXPORT_SYMBOL_GPL(dev_forward_skb);
1634
1635static inline int deliver_skb(struct sk_buff *skb,
1636 struct packet_type *pt_prev,
1637 struct net_device *orig_dev)
1638{
1639 atomic_inc(&skb->users);
1640 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1641}
1642
1643static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1644{
1645 if (ptype->af_packet_priv == NULL)
1646 return false;
1647
1648 if (ptype->id_match)
1649 return ptype->id_match(ptype, skb->sk);
1650 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1651 return true;
1652
1653 return false;
1654}
1655
1656/*
1657 * Support routine. Sends outgoing frames to any network
1658 * taps currently in use.
1659 */
1660
1661static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1662{
1663 struct packet_type *ptype;
1664 struct sk_buff *skb2 = NULL;
1665 struct packet_type *pt_prev = NULL;
1666
1667 rcu_read_lock();
1668 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1669 /* Never send packets back to the socket
1670 * they originated from - MvS (miquels@drinkel.ow.org)
1671 */
1672 if ((ptype->dev == dev || !ptype->dev) &&
1673 (!skb_loop_sk(ptype, skb))) {
1674 if (pt_prev) {
1675 deliver_skb(skb2, pt_prev, skb->dev);
1676 pt_prev = ptype;
1677 continue;
1678 }
1679
1680 skb2 = skb_clone(skb, GFP_ATOMIC);
1681 if (!skb2)
1682 break;
1683
1684 net_timestamp_set(skb2);
1685
1686 /* skb->nh should be correctly
1687 set by sender, so that the second statement is
1688 just protection against buggy protocols.
1689 */
1690 skb_reset_mac_header(skb2);
1691
1692 if (skb_network_header(skb2) < skb2->data ||
1693 skb2->network_header > skb2->tail) {
1694 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1695 ntohs(skb2->protocol),
1696 dev->name);
1697 skb_reset_network_header(skb2);
1698 }
1699
1700 skb2->transport_header = skb2->network_header;
1701 skb2->pkt_type = PACKET_OUTGOING;
1702 pt_prev = ptype;
1703 }
1704 }
1705 if (pt_prev)
1706 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1707 rcu_read_unlock();
1708}
1709
1710/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1711 * @dev: Network device
1712 * @txq: number of queues available
1713 *
1714 * If real_num_tx_queues is changed the tc mappings may no longer be
1715 * valid. To resolve this verify the tc mapping remains valid and if
1716 * not NULL the mapping. With no priorities mapping to this
1717 * offset/count pair it will no longer be used. In the worst case TC0
1718 * is invalid nothing can be done so disable priority mappings. If is
1719 * expected that drivers will fix this mapping if they can before
1720 * calling netif_set_real_num_tx_queues.
1721 */
1722static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1723{
1724 int i;
1725 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1726
1727 /* If TC0 is invalidated disable TC mapping */
1728 if (tc->offset + tc->count > txq) {
1729 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1730 dev->num_tc = 0;
1731 return;
1732 }
1733
1734 /* Invalidated prio to tc mappings set to TC0 */
1735 for (i = 1; i < TC_BITMASK + 1; i++) {
1736 int q = netdev_get_prio_tc_map(dev, i);
1737
1738 tc = &dev->tc_to_txq[q];
1739 if (tc->offset + tc->count > txq) {
1740 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1741 i, q);
1742 netdev_set_prio_tc_map(dev, i, 0);
1743 }
1744 }
1745}
1746
1747/*
1748 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1749 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1750 */
1751int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1752{
1753 int rc;
1754
1755 if (txq < 1 || txq > dev->num_tx_queues)
1756 return -EINVAL;
1757
1758 if (dev->reg_state == NETREG_REGISTERED ||
1759 dev->reg_state == NETREG_UNREGISTERING) {
1760 ASSERT_RTNL();
1761
1762 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1763 txq);
1764 if (rc)
1765 return rc;
1766
1767 if (dev->num_tc)
1768 netif_setup_tc(dev, txq);
1769
1770 if (txq < dev->real_num_tx_queues)
1771 qdisc_reset_all_tx_gt(dev, txq);
1772 }
1773
1774 dev->real_num_tx_queues = txq;
1775 return 0;
1776}
1777EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1778
1779#ifdef CONFIG_RPS
1780/**
1781 * netif_set_real_num_rx_queues - set actual number of RX queues used
1782 * @dev: Network device
1783 * @rxq: Actual number of RX queues
1784 *
1785 * This must be called either with the rtnl_lock held or before
1786 * registration of the net device. Returns 0 on success, or a
1787 * negative error code. If called before registration, it always
1788 * succeeds.
1789 */
1790int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1791{
1792 int rc;
1793
1794 if (rxq < 1 || rxq > dev->num_rx_queues)
1795 return -EINVAL;
1796
1797 if (dev->reg_state == NETREG_REGISTERED) {
1798 ASSERT_RTNL();
1799
1800 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1801 rxq);
1802 if (rc)
1803 return rc;
1804 }
1805
1806 dev->real_num_rx_queues = rxq;
1807 return 0;
1808}
1809EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1810#endif
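
/*
 * Illustrative sketch only (not used by this file): how a hypothetical
 * multiqueue driver might shrink its active queue counts at run time with
 * the two helpers above.  The function name and the assumption that the
 * caller does not already hold the RTNL lock are inventions for this
 * example.
 */
static int example_set_channel_count(struct net_device *dev,
				     unsigned int count)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, count);
	if (!err)
		err = netif_set_real_num_rx_queues(dev, count);
	rtnl_unlock();

	return err;
}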
1811
1812static inline void __netif_reschedule(struct Qdisc *q)
1813{
1814 struct softnet_data *sd;
1815 unsigned long flags;
1816
1817 local_irq_save(flags);
1818 sd = &__get_cpu_var(softnet_data);
1819 q->next_sched = NULL;
1820 *sd->output_queue_tailp = q;
1821 sd->output_queue_tailp = &q->next_sched;
1822 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1823 local_irq_restore(flags);
1824}
1825
1826void __netif_schedule(struct Qdisc *q)
1827{
1828 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1829 __netif_reschedule(q);
1830}
1831EXPORT_SYMBOL(__netif_schedule);
1832
1833void dev_kfree_skb_irq(struct sk_buff *skb)
1834{
1835 if (atomic_dec_and_test(&skb->users)) {
1836 struct softnet_data *sd;
1837 unsigned long flags;
1838
1839 local_irq_save(flags);
1840 sd = &__get_cpu_var(softnet_data);
1841 skb->next = sd->completion_queue;
1842 sd->completion_queue = skb;
1843 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1844 local_irq_restore(flags);
1845 }
1846}
1847EXPORT_SYMBOL(dev_kfree_skb_irq);
1848
1849void dev_kfree_skb_any(struct sk_buff *skb)
1850{
1851 if (in_irq() || irqs_disabled())
1852 dev_kfree_skb_irq(skb);
1853 else
1854 dev_kfree_skb(skb);
1855}
1856EXPORT_SYMBOL(dev_kfree_skb_any);
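
/*
 * Illustrative sketch only: a driver completion routine that may run from
 * hardirq, softirq or process context should free skbs with
 * dev_kfree_skb_any(), which picks dev_kfree_skb_irq() or dev_kfree_skb()
 * as appropriate.  The function below is a hypothetical example.
 */
static void example_tx_complete(struct net_device *dev, struct sk_buff *skb)
{
	dev->stats.tx_packets++;
	dev->stats.tx_bytes += skb->len;
	dev_kfree_skb_any(skb);		/* safe in any context */
}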
1857
1858
1859/**
1860 * netif_device_detach - mark device as removed
1861 * @dev: network device
1862 *
1863 * Mark device as removed from system and therefore no longer available.
1864 */
1865void netif_device_detach(struct net_device *dev)
1866{
1867 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1868 netif_running(dev)) {
1869 netif_tx_stop_all_queues(dev);
1870 }
1871}
1872EXPORT_SYMBOL(netif_device_detach);
1873
1874/**
1875 * netif_device_attach - mark device as attached
1876 * @dev: network device
1877 *
 * Mark device as attached to the system and restart if needed.
1879 */
1880void netif_device_attach(struct net_device *dev)
1881{
1882 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1883 netif_running(dev)) {
1884 netif_tx_wake_all_queues(dev);
1885 __netdev_watchdog_up(dev);
1886 }
1887}
1888EXPORT_SYMBOL(netif_device_attach);
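
/*
 * Illustrative sketch only: the usual pairing of netif_device_detach() and
 * netif_device_attach() in a driver's power-management callbacks.  The two
 * functions below are hypothetical; a real driver would also quiesce and
 * re-program its hardware around these calls.
 */
static int example_net_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stop all TX queues if running */
	return 0;
}

static int example_net_resume(struct net_device *dev)
{
	netif_device_attach(dev);	/* wake queues and re-arm the watchdog */
	return 0;
}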
1889
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892 static const netdev_features_t null_features = 0;
1893 struct net_device *dev = skb->dev;
1894 const char *driver = "";
1895
1896 if (dev && dev->dev.parent)
1897 driver = dev_driver_string(dev->dev.parent);
1898
1899 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900 "gso_type=%d ip_summed=%d\n",
1901 driver, dev ? &dev->features : &null_features,
1902 skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904 skb_shinfo(skb)->gso_type, skb->ip_summed);
1905}
1906
1907/*
1908 * Invalidate hardware checksum when packet is to be mangled, and
1909 * complete checksum manually on outgoing path.
1910 */
1911int skb_checksum_help(struct sk_buff *skb)
1912{
1913 __wsum csum;
1914 int ret = 0, offset;
1915
1916 if (skb->ip_summed == CHECKSUM_COMPLETE)
1917 goto out_set_summed;
1918
1919 if (unlikely(skb_shinfo(skb)->gso_size)) {
1920 skb_warn_bad_offload(skb);
1921 return -EINVAL;
1922 }
1923
1924 offset = skb_checksum_start_offset(skb);
1925 BUG_ON(offset >= skb_headlen(skb));
1926 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1927
1928 offset += skb->csum_offset;
1929 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1930
1931 if (skb_cloned(skb) &&
1932 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1933 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1934 if (ret)
1935 goto out;
1936 }
1937
1938 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1939out_set_summed:
1940 skb->ip_summed = CHECKSUM_NONE;
1941out:
1942 return ret;
1943}
1944EXPORT_SYMBOL(skb_checksum_help);
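
/*
 * Illustrative sketch only: a transmit path that falls back to
 * skb_checksum_help() when the hardware cannot checksum this packet.
 * The function is hypothetical; the "hand the skb to the hardware" step
 * is intentionally omitted.
 */
static netdev_tx_t example_xmit_csum_fallback(struct sk_buff *skb,
					      struct net_device *dev)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !(dev->features & NETIF_F_ALL_CSUM) &&
	    skb_checksum_help(skb)) {
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* ... queue the (now fully checksummed) skb on the hardware ... */
	return NETDEV_TX_OK;
}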
1945
1946/**
1947 * skb_gso_segment - Perform segmentation on skb.
1948 * @skb: buffer to segment
1949 * @features: features for the output path (see dev->features)
1950 *
1951 * This function segments the given skb and returns a list of segments.
1952 *
1953 * It may return NULL if the skb requires no segmentation. This is
1954 * only possible when GSO is used for verifying header integrity.
1955 */
1956struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1957 netdev_features_t features)
1958{
1959 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1960 struct packet_type *ptype;
1961 __be16 type = skb->protocol;
1962 int vlan_depth = ETH_HLEN;
1963 int err;
1964
1965 while (type == htons(ETH_P_8021Q)) {
1966 struct vlan_hdr *vh;
1967
1968 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1969 return ERR_PTR(-EINVAL);
1970
1971 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1972 type = vh->h_vlan_encapsulated_proto;
1973 vlan_depth += VLAN_HLEN;
1974 }
1975
1976 skb_reset_mac_header(skb);
1977 skb->mac_len = skb->network_header - skb->mac_header;
1978 __skb_pull(skb, skb->mac_len);
1979
1980 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1981 skb_warn_bad_offload(skb);
1982
1983 if (skb_header_cloned(skb) &&
1984 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1985 return ERR_PTR(err);
1986 }
1987
1988 rcu_read_lock();
1989 list_for_each_entry_rcu(ptype,
1990 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1991 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1992 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1993 err = ptype->gso_send_check(skb);
1994 segs = ERR_PTR(err);
1995 if (err || skb_gso_ok(skb, features))
1996 break;
1997 __skb_push(skb, (skb->data -
1998 skb_network_header(skb)));
1999 }
2000 segs = ptype->gso_segment(skb, features);
2001 break;
2002 }
2003 }
2004 rcu_read_unlock();
2005
2006 __skb_push(skb, skb->data - skb_mac_header(skb));
2007
2008 return segs;
2009}
2010EXPORT_SYMBOL(skb_gso_segment);
2011
2012/* Take action when hardware reception checksum errors are detected. */
2013#ifdef CONFIG_BUG
2014void netdev_rx_csum_fault(struct net_device *dev)
2015{
2016 if (net_ratelimit()) {
2017 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2018 dump_stack();
2019 }
2020}
2021EXPORT_SYMBOL(netdev_rx_csum_fault);
2022#endif
2023
/* Actually, we should eliminate this check as soon as we know that:
 * 1. An IOMMU is present and can map all of the memory.
 * 2. No high memory really exists on this machine.
 */
2028
2029static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2030{
2031#ifdef CONFIG_HIGHMEM
2032 int i;
2033 if (!(dev->features & NETIF_F_HIGHDMA)) {
2034 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2035 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2036 if (PageHighMem(skb_frag_page(frag)))
2037 return 1;
2038 }
2039 }
2040
2041 if (PCI_DMA_BUS_IS_PHYS) {
2042 struct device *pdev = dev->dev.parent;
2043
2044 if (!pdev)
2045 return 0;
2046 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2047 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2048 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2049 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2050 return 1;
2051 }
2052 }
2053#endif
2054 return 0;
2055}
2056
2057struct dev_gso_cb {
2058 void (*destructor)(struct sk_buff *skb);
2059};
2060
2061#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2062
2063static void dev_gso_skb_destructor(struct sk_buff *skb)
2064{
2065 struct dev_gso_cb *cb;
2066
2067 do {
2068 struct sk_buff *nskb = skb->next;
2069
2070 skb->next = nskb->next;
2071 nskb->next = NULL;
2072 kfree_skb(nskb);
2073 } while (skb->next);
2074
2075 cb = DEV_GSO_CB(skb);
2076 if (cb->destructor)
2077 cb->destructor(skb);
2078}
2079
2080/**
2081 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2082 * @skb: buffer to segment
2083 * @features: device features as applicable to this skb
2084 *
2085 * This function segments the given skb and stores the list of segments
2086 * in skb->next.
2087 */
2088static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2089{
2090 struct sk_buff *segs;
2091
2092 segs = skb_gso_segment(skb, features);
2093
2094 /* Verifying header integrity only. */
2095 if (!segs)
2096 return 0;
2097
2098 if (IS_ERR(segs))
2099 return PTR_ERR(segs);
2100
2101 skb->next = segs;
2102 DEV_GSO_CB(skb)->destructor = skb->destructor;
2103 skb->destructor = dev_gso_skb_destructor;
2104
2105 return 0;
2106}
2107
2108static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2109{
2110 return ((features & NETIF_F_GEN_CSUM) ||
2111 ((features & NETIF_F_V4_CSUM) &&
2112 protocol == htons(ETH_P_IP)) ||
2113 ((features & NETIF_F_V6_CSUM) &&
2114 protocol == htons(ETH_P_IPV6)) ||
2115 ((features & NETIF_F_FCOE_CRC) &&
2116 protocol == htons(ETH_P_FCOE)));
2117}
2118
2119static netdev_features_t harmonize_features(struct sk_buff *skb,
2120 __be16 protocol, netdev_features_t features)
2121{
2122 if (!can_checksum_protocol(features, protocol)) {
2123 features &= ~NETIF_F_ALL_CSUM;
2124 features &= ~NETIF_F_SG;
2125 } else if (illegal_highdma(skb->dev, skb)) {
2126 features &= ~NETIF_F_SG;
2127 }
2128
2129 return features;
2130}
2131
2132netdev_features_t netif_skb_features(struct sk_buff *skb)
2133{
2134 __be16 protocol = skb->protocol;
2135 netdev_features_t features = skb->dev->features;
2136
2137 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2138 features &= ~NETIF_F_GSO_MASK;
2139
2140 if (protocol == htons(ETH_P_8021Q)) {
2141 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142 protocol = veh->h_vlan_encapsulated_proto;
2143 } else if (!vlan_tx_tag_present(skb)) {
2144 return harmonize_features(skb, protocol, features);
2145 }
2146
2147 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2148
2149 if (protocol != htons(ETH_P_8021Q)) {
2150 return harmonize_features(skb, protocol, features);
2151 } else {
2152 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154 return harmonize_features(skb, protocol, features);
2155 }
2156}
2157EXPORT_SYMBOL(netif_skb_features);
2158
/*
 * Returns true if either:
 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
 * 2. skb is fragmented and the device does not support SG, or if
 *    at least one of the fragments is in highmem and the device does
 *    not support DMA from it.
 */
2166static inline int skb_needs_linearize(struct sk_buff *skb,
2167 int features)
2168{
2169 return skb_is_nonlinear(skb) &&
2170 ((skb_has_frag_list(skb) &&
2171 !(features & NETIF_F_FRAGLIST)) ||
2172 (skb_shinfo(skb)->nr_frags &&
2173 !(features & NETIF_F_SG)));
2174}
2175
2176int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2177 struct netdev_queue *txq)
2178{
2179 const struct net_device_ops *ops = dev->netdev_ops;
2180 int rc = NETDEV_TX_OK;
2181 unsigned int skb_len;
2182
2183 if (likely(!skb->next)) {
2184 netdev_features_t features;
2185
		/*
		 * If the device doesn't need skb->dst, release it right now
		 * while it's hot in this cpu's cache.
		 */
2190 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2191 skb_dst_drop(skb);
2192
2193 if (!list_empty(&ptype_all))
2194 dev_queue_xmit_nit(skb, dev);
2195
2196 features = netif_skb_features(skb);
2197
2198 if (vlan_tx_tag_present(skb) &&
2199 !(features & NETIF_F_HW_VLAN_TX)) {
2200 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2201 if (unlikely(!skb))
2202 goto out;
2203
2204 skb->vlan_tci = 0;
2205 }
2206
2207 if (netif_needs_gso(skb, features)) {
2208 if (unlikely(dev_gso_segment(skb, features)))
2209 goto out_kfree_skb;
2210 if (skb->next)
2211 goto gso;
2212 } else {
2213 if (skb_needs_linearize(skb, features) &&
2214 __skb_linearize(skb))
2215 goto out_kfree_skb;
2216
2217 /* If packet is not checksummed and device does not
2218 * support checksumming for this protocol, complete
2219 * checksumming here.
2220 */
2221 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2222 skb_set_transport_header(skb,
2223 skb_checksum_start_offset(skb));
2224 if (!(features & NETIF_F_ALL_CSUM) &&
2225 skb_checksum_help(skb))
2226 goto out_kfree_skb;
2227 }
2228 }
2229
2230 skb_len = skb->len;
2231 rc = ops->ndo_start_xmit(skb, dev);
2232 trace_net_dev_xmit(skb, rc, dev, skb_len);
2233 if (rc == NETDEV_TX_OK)
2234 txq_trans_update(txq);
2235 return rc;
2236 }
2237
2238gso:
2239 do {
2240 struct sk_buff *nskb = skb->next;
2241
2242 skb->next = nskb->next;
2243 nskb->next = NULL;
2244
		/*
		 * If the device doesn't need nskb->dst, release it right now
		 * while it's hot in this cpu's cache.
		 */
2249 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2250 skb_dst_drop(nskb);
2251
2252 skb_len = nskb->len;
2253 rc = ops->ndo_start_xmit(nskb, dev);
2254 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2255 if (unlikely(rc != NETDEV_TX_OK)) {
2256 if (rc & ~NETDEV_TX_MASK)
2257 goto out_kfree_gso_skb;
2258 nskb->next = skb->next;
2259 skb->next = nskb;
2260 return rc;
2261 }
2262 txq_trans_update(txq);
2263 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2264 return NETDEV_TX_BUSY;
2265 } while (skb->next);
2266
2267out_kfree_gso_skb:
2268 if (likely(skb->next == NULL))
2269 skb->destructor = DEV_GSO_CB(skb)->destructor;
2270out_kfree_skb:
2271 kfree_skb(skb);
2272out:
2273 return rc;
2274}
2275
2276static u32 hashrnd __read_mostly;
2277
/*
 * Returns a Tx hash based on the given packet descriptor and the number
 * of Tx queues to be used as a distribution range.
 */
2282u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2283 unsigned int num_tx_queues)
2284{
2285 u32 hash;
2286 u16 qoffset = 0;
2287 u16 qcount = num_tx_queues;
2288
2289 if (skb_rx_queue_recorded(skb)) {
2290 hash = skb_get_rx_queue(skb);
2291 while (unlikely(hash >= num_tx_queues))
2292 hash -= num_tx_queues;
2293 return hash;
2294 }
2295
2296 if (dev->num_tc) {
2297 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2298 qoffset = dev->tc_to_txq[tc].offset;
2299 qcount = dev->tc_to_txq[tc].count;
2300 }
2301
2302 if (skb->sk && skb->sk->sk_hash)
2303 hash = skb->sk->sk_hash;
2304 else
2305 hash = (__force u16) skb->protocol;
2306 hash = jhash_1word(hash, hashrnd);
2307
2308 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2309}
2310EXPORT_SYMBOL(__skb_tx_hash);
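
/*
 * Note on the multiply-shift above: ((u64)hash * qcount) >> 32 maps a
 * 32-bit hash uniformly onto [0, qcount) without a modulo.  Worked example
 * (numbers invented for illustration): with qcount = 8 and hash =
 * 0x80000000, ((u64)0x80000000 * 8) >> 32 = 4, so the midpoint of the hash
 * space lands on the middle queue of the range before qoffset is added.
 */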
2311
2312static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2313{
2314 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2315 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2316 dev->name, queue_index,
2317 dev->real_num_tx_queues);
2318 return 0;
2319 }
2320 return queue_index;
2321}
2322
2323static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2324{
2325#ifdef CONFIG_XPS
2326 struct xps_dev_maps *dev_maps;
2327 struct xps_map *map;
2328 int queue_index = -1;
2329
2330 rcu_read_lock();
2331 dev_maps = rcu_dereference(dev->xps_maps);
2332 if (dev_maps) {
2333 map = rcu_dereference(
2334 dev_maps->cpu_map[raw_smp_processor_id()]);
2335 if (map) {
2336 if (map->len == 1)
2337 queue_index = map->queues[0];
2338 else {
2339 u32 hash;
2340 if (skb->sk && skb->sk->sk_hash)
2341 hash = skb->sk->sk_hash;
2342 else
2343 hash = (__force u16) skb->protocol ^
2344 skb->rxhash;
2345 hash = jhash_1word(hash, hashrnd);
2346 queue_index = map->queues[
2347 ((u64)hash * map->len) >> 32];
2348 }
2349 if (unlikely(queue_index >= dev->real_num_tx_queues))
2350 queue_index = -1;
2351 }
2352 }
2353 rcu_read_unlock();
2354
2355 return queue_index;
2356#else
2357 return -1;
2358#endif
2359}
2360
2361static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2362 struct sk_buff *skb)
2363{
2364 int queue_index;
2365 const struct net_device_ops *ops = dev->netdev_ops;
2366
2367 if (dev->real_num_tx_queues == 1)
2368 queue_index = 0;
2369 else if (ops->ndo_select_queue) {
2370 queue_index = ops->ndo_select_queue(dev, skb);
2371 queue_index = dev_cap_txqueue(dev, queue_index);
2372 } else {
2373 struct sock *sk = skb->sk;
2374 queue_index = sk_tx_queue_get(sk);
2375
2376 if (queue_index < 0 || skb->ooo_okay ||
2377 queue_index >= dev->real_num_tx_queues) {
2378 int old_index = queue_index;
2379
2380 queue_index = get_xps_queue(dev, skb);
2381 if (queue_index < 0)
2382 queue_index = skb_tx_hash(dev, skb);
2383
2384 if (queue_index != old_index && sk) {
2385 struct dst_entry *dst =
2386 rcu_dereference_check(sk->sk_dst_cache, 1);
2387
2388 if (dst && skb_dst(skb) == dst)
2389 sk_tx_queue_set(sk, queue_index);
2390 }
2391 }
2392 }
2393
2394 skb_set_queue_mapping(skb, queue_index);
2395 return netdev_get_tx_queue(dev, queue_index);
2396}
2397
2398static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2399 struct net_device *dev,
2400 struct netdev_queue *txq)
2401{
2402 spinlock_t *root_lock = qdisc_lock(q);
2403 bool contended;
2404 int rc;
2405
2406 qdisc_skb_cb(skb)->pkt_len = skb->len;
2407 qdisc_calculate_pkt_len(skb, q);
2408 /*
2409 * Heuristic to force contended enqueues to serialize on a
2410 * separate lock before trying to get qdisc main lock.
2411 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2412 * and dequeue packets faster.
2413 */
2414 contended = qdisc_is_running(q);
2415 if (unlikely(contended))
2416 spin_lock(&q->busylock);
2417
2418 spin_lock(root_lock);
2419 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2420 kfree_skb(skb);
2421 rc = NET_XMIT_DROP;
2422 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2423 qdisc_run_begin(q)) {
2424 /*
2425 * This is a work-conserving queue; there are no old skbs
2426 * waiting to be sent out; and the qdisc is not running -
2427 * xmit the skb directly.
2428 */
2429 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2430 skb_dst_force(skb);
2431
2432 qdisc_bstats_update(q, skb);
2433
2434 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2435 if (unlikely(contended)) {
2436 spin_unlock(&q->busylock);
2437 contended = false;
2438 }
2439 __qdisc_run(q);
2440 } else
2441 qdisc_run_end(q);
2442
2443 rc = NET_XMIT_SUCCESS;
2444 } else {
2445 skb_dst_force(skb);
2446 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2447 if (qdisc_run_begin(q)) {
2448 if (unlikely(contended)) {
2449 spin_unlock(&q->busylock);
2450 contended = false;
2451 }
2452 __qdisc_run(q);
2453 }
2454 }
2455 spin_unlock(root_lock);
2456 if (unlikely(contended))
2457 spin_unlock(&q->busylock);
2458 return rc;
2459}
2460
2461#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2462static void skb_update_prio(struct sk_buff *skb)
2463{
2464 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2465
2466 if (!skb->priority && skb->sk && map) {
2467 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2468
2469 if (prioidx < map->priomap_len)
2470 skb->priority = map->priomap[prioidx];
2471 }
2472}
2473#else
2474#define skb_update_prio(skb)
2475#endif
2476
2477static DEFINE_PER_CPU(int, xmit_recursion);
2478#define RECURSION_LIMIT 10
2479
2480/**
2481 * dev_queue_xmit - transmit a buffer
2482 * @skb: buffer to transmit
2483 *
2484 * Queue a buffer for transmission to a network device. The caller must
2485 * have set the device and priority and built the buffer before calling
2486 * this function. The function can be called from an interrupt.
2487 *
2488 * A negative errno code is returned on a failure. A success does not
2489 * guarantee the frame will be transmitted as it may be dropped due
2490 * to congestion or traffic shaping.
2491 *
2492 * -----------------------------------------------------------------------------------
2493 * I notice this method can also return errors from the queue disciplines,
2494 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2495 * be positive.
2496 *
2497 * Regardless of the return value, the skb is consumed, so it is currently
2498 * difficult to retry a send to this method. (You can bump the ref count
2499 * before sending to hold a reference for retry if you are careful.)
2500 *
2501 * When calling this method, interrupts MUST be enabled. This is because
2502 * the BH enable code must have IRQs enabled so that it will not deadlock.
2503 * --BLG
2504 */
2505int dev_queue_xmit(struct sk_buff *skb)
2506{
2507 struct net_device *dev = skb->dev;
2508 struct netdev_queue *txq;
2509 struct Qdisc *q;
2510 int rc = -ENOMEM;
2511
2512 /* Disable soft irqs for various locks below. Also
2513 * stops preemption for RCU.
2514 */
2515 rcu_read_lock_bh();
2516
2517 skb_update_prio(skb);
2518
2519 txq = dev_pick_tx(dev, skb);
2520 q = rcu_dereference_bh(txq->qdisc);
2521
2522#ifdef CONFIG_NET_CLS_ACT
2523 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2524#endif
2525 trace_net_dev_queue(skb);
2526 if (q->enqueue) {
2527 rc = __dev_xmit_skb(skb, q, dev, txq);
2528 goto out;
2529 }
2530
	/* The device has no queue. Common case for software devices:
	 * loopback, all sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here. (f.e. loopback and IP tunnels are clean, ignoring statistics
	 * counters.)
	 * However, it is possible that they rely on the protection
	 * we take here.
	 *
	 * Check this and take the lock; it is not prone to deadlocks.
	 * Either that, or shortcut the noqueue qdisc - it is even simpler 8)
	 */
2543 if (dev->flags & IFF_UP) {
2544 int cpu = smp_processor_id(); /* ok because BHs are off */
2545
2546 if (txq->xmit_lock_owner != cpu) {
2547
2548 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2549 goto recursion_alert;
2550
2551 HARD_TX_LOCK(dev, txq, cpu);
2552
2553 if (!netif_xmit_stopped(txq)) {
2554 __this_cpu_inc(xmit_recursion);
2555 rc = dev_hard_start_xmit(skb, dev, txq);
2556 __this_cpu_dec(xmit_recursion);
2557 if (dev_xmit_complete(rc)) {
2558 HARD_TX_UNLOCK(dev, txq);
2559 goto out;
2560 }
2561 }
2562 HARD_TX_UNLOCK(dev, txq);
2563 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2564 dev->name);
2565 } else {
2566 /* Recursion is detected! It is possible,
2567 * unfortunately
2568 */
2569recursion_alert:
2570 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2571 dev->name);
2572 }
2573 }
2574
2575 rc = -ENETDOWN;
2576 rcu_read_unlock_bh();
2577
2578 kfree_skb(skb);
2579 return rc;
2580out:
2581 rcu_read_unlock_bh();
2582 return rc;
2583}
2584EXPORT_SYMBOL(dev_queue_xmit);
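
/*
 * Illustrative sketch only: transmitting a self-built frame from kernel
 * code.  dev_queue_xmit() consumes the skb whatever it returns (even a
 * positive code such as NET_XMIT_DROP), so the buffer must not be touched
 * afterwards.  The function, the zero payload and the use of ETH_P_802_3
 * as the type are all inventions for this example.
 */
static int example_send_frame(struct net_device *dev, const u8 *dst_mac)
{
	unsigned int len = 64;		/* illustrative payload size */
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memset(skb_put(skb, len), 0, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_3);

	if (dev_hard_header(skb, dev, ETH_P_802_3, dst_mac,
			    dev->dev_addr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dev_queue_xmit(skb);
}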
2585
2586
2587/*=======================================================================
2588 Receiver routines
2589 =======================================================================*/
2590
2591int netdev_max_backlog __read_mostly = 1000;
2592int netdev_tstamp_prequeue __read_mostly = 1;
2593int netdev_budget __read_mostly = 300;
2594int weight_p __read_mostly = 64; /* old backlog weight */
2595
2596/* Called with irq disabled */
2597static inline void ____napi_schedule(struct softnet_data *sd,
2598 struct napi_struct *napi)
2599{
2600 list_add_tail(&napi->poll_list, &sd->poll_list);
2601 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2602}
2603
/*
 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
 * and src/dst port numbers. Sets rxhash in skb to a non-zero hash value
 * on success; zero indicates no valid hash. Also sets l4_rxhash in skb
 * if the hash is a canonical 4-tuple hash over transport ports.
 */
2610void __skb_get_rxhash(struct sk_buff *skb)
2611{
2612 struct flow_keys keys;
2613 u32 hash;
2614
2615 if (!skb_flow_dissect(skb, &keys))
2616 return;
2617
2618 if (keys.ports) {
2619 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2620 swap(keys.port16[0], keys.port16[1]);
2621 skb->l4_rxhash = 1;
2622 }
2623
2624 /* get a consistent hash (same value on both flow directions) */
2625 if ((__force u32)keys.dst < (__force u32)keys.src)
2626 swap(keys.dst, keys.src);
2627
2628 hash = jhash_3words((__force u32)keys.dst,
2629 (__force u32)keys.src,
2630 (__force u32)keys.ports, hashrnd);
2631 if (!hash)
2632 hash = 1;
2633
2634 skb->rxhash = hash;
2635}
2636EXPORT_SYMBOL(__skb_get_rxhash);
2637
2638#ifdef CONFIG_RPS
2639
2640/* One global table that all flow-based protocols share. */
2641struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2642EXPORT_SYMBOL(rps_sock_flow_table);
2643
2644struct static_key rps_needed __read_mostly;
2645
2646static struct rps_dev_flow *
2647set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2648 struct rps_dev_flow *rflow, u16 next_cpu)
2649{
2650 if (next_cpu != RPS_NO_CPU) {
2651#ifdef CONFIG_RFS_ACCEL
2652 struct netdev_rx_queue *rxqueue;
2653 struct rps_dev_flow_table *flow_table;
2654 struct rps_dev_flow *old_rflow;
2655 u32 flow_id;
2656 u16 rxq_index;
2657 int rc;
2658
2659 /* Should we steer this flow to a different hardware queue? */
2660 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2661 !(dev->features & NETIF_F_NTUPLE))
2662 goto out;
2663 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2664 if (rxq_index == skb_get_rx_queue(skb))
2665 goto out;
2666
2667 rxqueue = dev->_rx + rxq_index;
2668 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669 if (!flow_table)
2670 goto out;
2671 flow_id = skb->rxhash & flow_table->mask;
2672 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2673 rxq_index, flow_id);
2674 if (rc < 0)
2675 goto out;
2676 old_rflow = rflow;
2677 rflow = &flow_table->flows[flow_id];
2678 rflow->filter = rc;
2679 if (old_rflow->filter == rflow->filter)
2680 old_rflow->filter = RPS_NO_FILTER;
2681 out:
2682#endif
2683 rflow->last_qtail =
2684 per_cpu(softnet_data, next_cpu).input_queue_head;
2685 }
2686
2687 rflow->cpu = next_cpu;
2688 return rflow;
2689}
2690
2691/*
2692 * get_rps_cpu is called from netif_receive_skb and returns the target
2693 * CPU from the RPS map of the receiving queue for a given skb.
2694 * rcu_read_lock must be held on entry.
2695 */
2696static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2697 struct rps_dev_flow **rflowp)
2698{
2699 struct netdev_rx_queue *rxqueue;
2700 struct rps_map *map;
2701 struct rps_dev_flow_table *flow_table;
2702 struct rps_sock_flow_table *sock_flow_table;
2703 int cpu = -1;
2704 u16 tcpu;
2705
2706 if (skb_rx_queue_recorded(skb)) {
2707 u16 index = skb_get_rx_queue(skb);
2708 if (unlikely(index >= dev->real_num_rx_queues)) {
2709 WARN_ONCE(dev->real_num_rx_queues > 1,
2710 "%s received packet on queue %u, but number "
2711 "of RX queues is %u\n",
2712 dev->name, index, dev->real_num_rx_queues);
2713 goto done;
2714 }
2715 rxqueue = dev->_rx + index;
2716 } else
2717 rxqueue = dev->_rx;
2718
2719 map = rcu_dereference(rxqueue->rps_map);
2720 if (map) {
2721 if (map->len == 1 &&
2722 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2723 tcpu = map->cpus[0];
2724 if (cpu_online(tcpu))
2725 cpu = tcpu;
2726 goto done;
2727 }
2728 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2729 goto done;
2730 }
2731
2732 skb_reset_network_header(skb);
2733 if (!skb_get_rxhash(skb))
2734 goto done;
2735
2736 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2738 if (flow_table && sock_flow_table) {
2739 u16 next_cpu;
2740 struct rps_dev_flow *rflow;
2741
2742 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2743 tcpu = rflow->cpu;
2744
2745 next_cpu = sock_flow_table->ents[skb->rxhash &
2746 sock_flow_table->mask];
2747
2748 /*
2749 * If the desired CPU (where last recvmsg was done) is
2750 * different from current CPU (one in the rx-queue flow
2751 * table entry), switch if one of the following holds:
2752 * - Current CPU is unset (equal to RPS_NO_CPU).
2753 * - Current CPU is offline.
2754 * - The current CPU's queue tail has advanced beyond the
2755 * last packet that was enqueued using this table entry.
2756 * This guarantees that all previous packets for the flow
2757 * have been dequeued, thus preserving in order delivery.
2758 */
2759 if (unlikely(tcpu != next_cpu) &&
2760 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2761 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2762 rflow->last_qtail)) >= 0))
2763 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2764
2765 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2766 *rflowp = rflow;
2767 cpu = tcpu;
2768 goto done;
2769 }
2770 }
2771
2772 if (map) {
2773 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2774
2775 if (cpu_online(tcpu)) {
2776 cpu = tcpu;
2777 goto done;
2778 }
2779 }
2780
2781done:
2782 return cpu;
2783}
2784
2785#ifdef CONFIG_RFS_ACCEL
2786
2787/**
2788 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2789 * @dev: Device on which the filter was set
2790 * @rxq_index: RX queue index
2791 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2792 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2793 *
2794 * Drivers that implement ndo_rx_flow_steer() should periodically call
2795 * this function for each installed filter and remove the filters for
2796 * which it returns %true.
2797 */
2798bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2799 u32 flow_id, u16 filter_id)
2800{
2801 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2802 struct rps_dev_flow_table *flow_table;
2803 struct rps_dev_flow *rflow;
2804 bool expire = true;
2805 int cpu;
2806
2807 rcu_read_lock();
2808 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2809 if (flow_table && flow_id <= flow_table->mask) {
2810 rflow = &flow_table->flows[flow_id];
2811 cpu = ACCESS_ONCE(rflow->cpu);
2812 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2813 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2814 rflow->last_qtail) <
2815 (int)(10 * flow_table->mask)))
2816 expire = false;
2817 }
2818 rcu_read_unlock();
2819 return expire;
2820}
2821EXPORT_SYMBOL(rps_may_expire_flow);
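
/*
 * Illustrative sketch only: how a driver that implements
 * ndo_rx_flow_steer() might scan its installed filters from a periodic
 * worker.  The flat flow_ids/filter_ids arrays stand in for the driver's
 * own bookkeeping and are hypothetical; the hardware removal step is only
 * indicated by a comment.
 */
static void example_expire_flow_filters(struct net_device *dev,
					u16 rxq_index, const u32 *flow_ids,
					const u16 *filter_ids, int count)
{
	int i;

	for (i = 0; i < count; i++) {
		if (rps_may_expire_flow(dev, rxq_index,
					flow_ids[i], filter_ids[i])) {
			/* remove the steering rule from the hardware and
			 * from the driver's own table here */
		}
	}
}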
2822
2823#endif /* CONFIG_RFS_ACCEL */
2824
2825/* Called from hardirq (IPI) context */
2826static void rps_trigger_softirq(void *data)
2827{
2828 struct softnet_data *sd = data;
2829
2830 ____napi_schedule(sd, &sd->backlog);
2831 sd->received_rps++;
2832}
2833
2834#endif /* CONFIG_RPS */
2835
/*
 * Check if this softnet_data structure belongs to another cpu.
 * If yes, queue it to our IPI list and return 1.
 * If no, return 0.
 */
2841static int rps_ipi_queued(struct softnet_data *sd)
2842{
2843#ifdef CONFIG_RPS
2844 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2845
2846 if (sd != mysd) {
2847 sd->rps_ipi_next = mysd->rps_ipi_list;
2848 mysd->rps_ipi_list = sd;
2849
2850 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2851 return 1;
2852 }
2853#endif /* CONFIG_RPS */
2854 return 0;
2855}
2856
2857/*
2858 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2859 * queue (may be a remote CPU queue).
2860 */
2861static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2862 unsigned int *qtail)
2863{
2864 struct softnet_data *sd;
2865 unsigned long flags;
2866
2867 sd = &per_cpu(softnet_data, cpu);
2868
2869 local_irq_save(flags);
2870
2871 rps_lock(sd);
2872 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2873 if (skb_queue_len(&sd->input_pkt_queue)) {
2874enqueue:
2875 __skb_queue_tail(&sd->input_pkt_queue, skb);
2876 input_queue_tail_incr_save(sd, qtail);
2877 rps_unlock(sd);
2878 local_irq_restore(flags);
2879 return NET_RX_SUCCESS;
2880 }
2881
		/* Schedule NAPI for the backlog device.
		 * We can use a non-atomic operation since we own the queue lock.
		 */
2885 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2886 if (!rps_ipi_queued(sd))
2887 ____napi_schedule(sd, &sd->backlog);
2888 }
2889 goto enqueue;
2890 }
2891
2892 sd->dropped++;
2893 rps_unlock(sd);
2894
2895 local_irq_restore(flags);
2896
2897 atomic_long_inc(&skb->dev->rx_dropped);
2898 kfree_skb(skb);
2899 return NET_RX_DROP;
2900}
2901
2902/**
2903 * netif_rx - post buffer to the network code
2904 * @skb: buffer to post
2905 *
2906 * This function receives a packet from a device driver and queues it for
2907 * the upper (protocol) levels to process. It always succeeds. The buffer
2908 * may be dropped during processing for congestion control or by the
2909 * protocol layers.
2910 *
2911 * return values:
2912 * NET_RX_SUCCESS (no congestion)
2913 * NET_RX_DROP (packet was dropped)
2914 *
2915 */
2916
2917int netif_rx(struct sk_buff *skb)
2918{
2919 int ret;
2920
2921 /* if netpoll wants it, pretend we never saw it */
2922 if (netpoll_rx(skb))
2923 return NET_RX_DROP;
2924
2925 net_timestamp_check(netdev_tstamp_prequeue, skb);
2926
2927 trace_netif_rx(skb);
2928#ifdef CONFIG_RPS
2929 if (static_key_false(&rps_needed)) {
2930 struct rps_dev_flow voidflow, *rflow = &voidflow;
2931 int cpu;
2932
2933 preempt_disable();
2934 rcu_read_lock();
2935
2936 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2937 if (cpu < 0)
2938 cpu = smp_processor_id();
2939
2940 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2941
2942 rcu_read_unlock();
2943 preempt_enable();
2944 } else
2945#endif
2946 {
2947 unsigned int qtail;
2948 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2949 put_cpu();
2950 }
2951 return ret;
2952}
2953EXPORT_SYMBOL(netif_rx);
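
/*
 * Illustrative sketch only: the classic non-NAPI receive path.  A driver
 * copies the received frame into a fresh skb from its interrupt handler,
 * sets the protocol and hands the buffer to netif_rx(), which only queues
 * it on the per-cpu backlog.  The function and its buf/len parameters are
 * hypothetical stand-ins for the hardware-provided frame.
 */
static void example_isr_rx(struct net_device *dev, const void *buf,
			   unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), buf, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);			/* may be called from hardirq context */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}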
2954
2955int netif_rx_ni(struct sk_buff *skb)
2956{
2957 int err;
2958
2959 preempt_disable();
2960 err = netif_rx(skb);
2961 if (local_softirq_pending())
2962 do_softirq();
2963 preempt_enable();
2964
2965 return err;
2966}
2967EXPORT_SYMBOL(netif_rx_ni);
2968
2969static void net_tx_action(struct softirq_action *h)
2970{
2971 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2972
2973 if (sd->completion_queue) {
2974 struct sk_buff *clist;
2975
2976 local_irq_disable();
2977 clist = sd->completion_queue;
2978 sd->completion_queue = NULL;
2979 local_irq_enable();
2980
2981 while (clist) {
2982 struct sk_buff *skb = clist;
2983 clist = clist->next;
2984
2985 WARN_ON(atomic_read(&skb->users));
2986 trace_kfree_skb(skb, net_tx_action);
2987 __kfree_skb(skb);
2988 }
2989 }
2990
2991 if (sd->output_queue) {
2992 struct Qdisc *head;
2993
2994 local_irq_disable();
2995 head = sd->output_queue;
2996 sd->output_queue = NULL;
2997 sd->output_queue_tailp = &sd->output_queue;
2998 local_irq_enable();
2999
3000 while (head) {
3001 struct Qdisc *q = head;
3002 spinlock_t *root_lock;
3003
3004 head = head->next_sched;
3005
3006 root_lock = qdisc_lock(q);
3007 if (spin_trylock(root_lock)) {
3008 smp_mb__before_clear_bit();
3009 clear_bit(__QDISC_STATE_SCHED,
3010 &q->state);
3011 qdisc_run(q);
3012 spin_unlock(root_lock);
3013 } else {
3014 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3015 &q->state)) {
3016 __netif_reschedule(q);
3017 } else {
3018 smp_mb__before_clear_bit();
3019 clear_bit(__QDISC_STATE_SCHED,
3020 &q->state);
3021 }
3022 }
3023 }
3024 }
3025}
3026
3027#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3028 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3029/* This hook is defined here for ATM LANE */
3030int (*br_fdb_test_addr_hook)(struct net_device *dev,
3031 unsigned char *addr) __read_mostly;
3032EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3033#endif
3034
3035#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 * instructions (a compare and two stores) right now if we don't have
 * it enabled but do have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
3044static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3045{
3046 struct net_device *dev = skb->dev;
3047 u32 ttl = G_TC_RTTL(skb->tc_verd);
3048 int result = TC_ACT_OK;
3049 struct Qdisc *q;
3050
3051 if (unlikely(MAX_RED_LOOP < ttl++)) {
3052 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3053 skb->skb_iif, dev->ifindex);
3054 return TC_ACT_SHOT;
3055 }
3056
3057 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3058 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3059
3060 q = rxq->qdisc;
3061 if (q != &noop_qdisc) {
3062 spin_lock(qdisc_lock(q));
3063 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3064 result = qdisc_enqueue_root(skb, q);
3065 spin_unlock(qdisc_lock(q));
3066 }
3067
3068 return result;
3069}
3070
3071static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3072 struct packet_type **pt_prev,
3073 int *ret, struct net_device *orig_dev)
3074{
3075 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3076
3077 if (!rxq || rxq->qdisc == &noop_qdisc)
3078 goto out;
3079
3080 if (*pt_prev) {
3081 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3082 *pt_prev = NULL;
3083 }
3084
3085 switch (ing_filter(skb, rxq)) {
3086 case TC_ACT_SHOT:
3087 case TC_ACT_STOLEN:
3088 kfree_skb(skb);
3089 return NULL;
3090 }
3091
3092out:
3093 skb->tc_verd = 0;
3094 return skb;
3095}
3096#endif
3097
3098/**
3099 * netdev_rx_handler_register - register receive handler
3100 * @dev: device to register a handler for
3101 * @rx_handler: receive handler to register
3102 * @rx_handler_data: data pointer that is used by rx handler
3103 *
 * Register a receive handler for a device. This handler will then be
3105 * called from __netif_receive_skb. A negative errno code is returned
3106 * on a failure.
3107 *
3108 * The caller must hold the rtnl_mutex.
3109 *
3110 * For a general description of rx_handler, see enum rx_handler_result.
3111 */
3112int netdev_rx_handler_register(struct net_device *dev,
3113 rx_handler_func_t *rx_handler,
3114 void *rx_handler_data)
3115{
3116 ASSERT_RTNL();
3117
3118 if (dev->rx_handler)
3119 return -EBUSY;
3120
3121 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3122 rcu_assign_pointer(dev->rx_handler, rx_handler);
3123
3124 return 0;
3125}
3126EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
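
/*
 * Illustrative sketch only: the general shape of an rx_handler such as the
 * ones installed by bridge or macvlan.  This hypothetical handler merely
 * counts frames (the counter is passed as rx_handler_data) and lets them
 * continue up the stack; consuming or re-injecting the skb would use the
 * other rx_handler_result values described in netdevice.h.
 */
static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	atomic_long_t *counter = rcu_dereference(skb->dev->rx_handler_data);

	if (counter)
		atomic_long_inc(counter);

	return RX_HANDLER_PASS;
}

static int example_attach_rx_handler(struct net_device *dev,
				     atomic_long_t *counter)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_rx_handler, counter);
	rtnl_unlock();

	return err;
}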
3127
3128/**
3129 * netdev_rx_handler_unregister - unregister receive handler
3130 * @dev: device to unregister a handler from
3131 *
 * Unregister a receive handler from a device.
3133 *
3134 * The caller must hold the rtnl_mutex.
3135 */
3136void netdev_rx_handler_unregister(struct net_device *dev)
3137{
3138
3139 ASSERT_RTNL();
3140 RCU_INIT_POINTER(dev->rx_handler, NULL);
3141 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3142}
3143EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3144
3145static int __netif_receive_skb(struct sk_buff *skb)
3146{
3147 struct packet_type *ptype, *pt_prev;
3148 rx_handler_func_t *rx_handler;
3149 struct net_device *orig_dev;
3150 struct net_device *null_or_dev;
3151 bool deliver_exact = false;
3152 int ret = NET_RX_DROP;
3153 __be16 type;
3154
3155 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3156
3157 trace_netif_receive_skb(skb);
3158
3159 /* if we've gotten here through NAPI, check netpoll */
3160 if (netpoll_receive_skb(skb))
3161 return NET_RX_DROP;
3162
3163 if (!skb->skb_iif)
3164 skb->skb_iif = skb->dev->ifindex;
3165 orig_dev = skb->dev;
3166
3167 skb_reset_network_header(skb);
3168 skb_reset_transport_header(skb);
3169 skb_reset_mac_len(skb);
3170
3171 pt_prev = NULL;
3172
3173 rcu_read_lock();
3174
3175another_round:
3176
3177 __this_cpu_inc(softnet_data.processed);
3178
3179 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3180 skb = vlan_untag(skb);
3181 if (unlikely(!skb))
3182 goto out;
3183 }
3184
3185#ifdef CONFIG_NET_CLS_ACT
3186 if (skb->tc_verd & TC_NCLS) {
3187 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3188 goto ncls;
3189 }
3190#endif
3191
3192 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3193 if (!ptype->dev || ptype->dev == skb->dev) {
3194 if (pt_prev)
3195 ret = deliver_skb(skb, pt_prev, orig_dev);
3196 pt_prev = ptype;
3197 }
3198 }
3199
3200#ifdef CONFIG_NET_CLS_ACT
3201 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3202 if (!skb)
3203 goto out;
3204ncls:
3205#endif
3206
3207 rx_handler = rcu_dereference(skb->dev->rx_handler);
3208 if (vlan_tx_tag_present(skb)) {
3209 if (pt_prev) {
3210 ret = deliver_skb(skb, pt_prev, orig_dev);
3211 pt_prev = NULL;
3212 }
3213 if (vlan_do_receive(&skb, !rx_handler))
3214 goto another_round;
3215 else if (unlikely(!skb))
3216 goto out;
3217 }
3218
3219 if (rx_handler) {
3220 if (pt_prev) {
3221 ret = deliver_skb(skb, pt_prev, orig_dev);
3222 pt_prev = NULL;
3223 }
3224 switch (rx_handler(&skb)) {
3225 case RX_HANDLER_CONSUMED:
3226 goto out;
3227 case RX_HANDLER_ANOTHER:
3228 goto another_round;
3229 case RX_HANDLER_EXACT:
3230 deliver_exact = true;
3231 case RX_HANDLER_PASS:
3232 break;
3233 default:
3234 BUG();
3235 }
3236 }
3237
3238 /* deliver only exact match when indicated */
3239 null_or_dev = deliver_exact ? skb->dev : NULL;
3240
3241 type = skb->protocol;
3242 list_for_each_entry_rcu(ptype,
3243 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3244 if (ptype->type == type &&
3245 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3246 ptype->dev == orig_dev)) {
3247 if (pt_prev)
3248 ret = deliver_skb(skb, pt_prev, orig_dev);
3249 pt_prev = ptype;
3250 }
3251 }
3252
3253 if (pt_prev) {
3254 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3255 } else {
3256 atomic_long_inc(&skb->dev->rx_dropped);
3257 kfree_skb(skb);
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
		 */
3261 ret = NET_RX_DROP;
3262 }
3263
3264out:
3265 rcu_read_unlock();
3266 return ret;
3267}
3268
3269/**
3270 * netif_receive_skb - process receive buffer from network
3271 * @skb: buffer to process
3272 *
3273 * netif_receive_skb() is the main receive data processing function.
3274 * It always succeeds. The buffer may be dropped during processing
3275 * for congestion control or by the protocol layers.
3276 *
3277 * This function may only be called from softirq context and interrupts
3278 * should be enabled.
3279 *
3280 * Return values (usually ignored):
3281 * NET_RX_SUCCESS: no congestion
3282 * NET_RX_DROP: packet was dropped
3283 */
3284int netif_receive_skb(struct sk_buff *skb)
3285{
3286 net_timestamp_check(netdev_tstamp_prequeue, skb);
3287
3288 if (skb_defer_rx_timestamp(skb))
3289 return NET_RX_SUCCESS;
3290
3291#ifdef CONFIG_RPS
3292 if (static_key_false(&rps_needed)) {
3293 struct rps_dev_flow voidflow, *rflow = &voidflow;
3294 int cpu, ret;
3295
3296 rcu_read_lock();
3297
3298 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3299
3300 if (cpu >= 0) {
3301 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3302 rcu_read_unlock();
3303 return ret;
3304 }
3305 rcu_read_unlock();
3306 }
3307#endif
3308 return __netif_receive_skb(skb);
3309}
3310EXPORT_SYMBOL(netif_receive_skb);
3311
/* Network device is going away; flush any packets still pending.
 * Called with irqs disabled.
 */
3315static void flush_backlog(void *arg)
3316{
3317 struct net_device *dev = arg;
3318 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3319 struct sk_buff *skb, *tmp;
3320
3321 rps_lock(sd);
3322 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3323 if (skb->dev == dev) {
3324 __skb_unlink(skb, &sd->input_pkt_queue);
3325 kfree_skb(skb);
3326 input_queue_head_incr(sd);
3327 }
3328 }
3329 rps_unlock(sd);
3330
3331 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3332 if (skb->dev == dev) {
3333 __skb_unlink(skb, &sd->process_queue);
3334 kfree_skb(skb);
3335 input_queue_head_incr(sd);
3336 }
3337 }
3338}
3339
3340static int napi_gro_complete(struct sk_buff *skb)
3341{
3342 struct packet_type *ptype;
3343 __be16 type = skb->protocol;
3344 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3345 int err = -ENOENT;
3346
3347 if (NAPI_GRO_CB(skb)->count == 1) {
3348 skb_shinfo(skb)->gso_size = 0;
3349 goto out;
3350 }
3351
3352 rcu_read_lock();
3353 list_for_each_entry_rcu(ptype, head, list) {
3354 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3355 continue;
3356
3357 err = ptype->gro_complete(skb);
3358 break;
3359 }
3360 rcu_read_unlock();
3361
3362 if (err) {
3363 WARN_ON(&ptype->list == head);
3364 kfree_skb(skb);
3365 return NET_RX_SUCCESS;
3366 }
3367
3368out:
3369 return netif_receive_skb(skb);
3370}
3371
3372inline void napi_gro_flush(struct napi_struct *napi)
3373{
3374 struct sk_buff *skb, *next;
3375
3376 for (skb = napi->gro_list; skb; skb = next) {
3377 next = skb->next;
3378 skb->next = NULL;
3379 napi_gro_complete(skb);
3380 }
3381
3382 napi->gro_count = 0;
3383 napi->gro_list = NULL;
3384}
3385EXPORT_SYMBOL(napi_gro_flush);
3386
3387enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3388{
3389 struct sk_buff **pp = NULL;
3390 struct packet_type *ptype;
3391 __be16 type = skb->protocol;
3392 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3393 int same_flow;
3394 int mac_len;
3395 enum gro_result ret;
3396
3397 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3398 goto normal;
3399
3400 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3401 goto normal;
3402
3403 rcu_read_lock();
3404 list_for_each_entry_rcu(ptype, head, list) {
3405 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3406 continue;
3407
3408 skb_set_network_header(skb, skb_gro_offset(skb));
3409 mac_len = skb->network_header - skb->mac_header;
3410 skb->mac_len = mac_len;
3411 NAPI_GRO_CB(skb)->same_flow = 0;
3412 NAPI_GRO_CB(skb)->flush = 0;
3413 NAPI_GRO_CB(skb)->free = 0;
3414
3415 pp = ptype->gro_receive(&napi->gro_list, skb);
3416 break;
3417 }
3418 rcu_read_unlock();
3419
3420 if (&ptype->list == head)
3421 goto normal;
3422
3423 same_flow = NAPI_GRO_CB(skb)->same_flow;
3424 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3425
3426 if (pp) {
3427 struct sk_buff *nskb = *pp;
3428
3429 *pp = nskb->next;
3430 nskb->next = NULL;
3431 napi_gro_complete(nskb);
3432 napi->gro_count--;
3433 }
3434
3435 if (same_flow)
3436 goto ok;
3437
3438 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3439 goto normal;
3440
3441 napi->gro_count++;
3442 NAPI_GRO_CB(skb)->count = 1;
3443 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3444 skb->next = napi->gro_list;
3445 napi->gro_list = skb;
3446 ret = GRO_HELD;
3447
3448pull:
3449 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3450 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3451
3452 BUG_ON(skb->end - skb->tail < grow);
3453
3454 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3455
3456 skb->tail += grow;
3457 skb->data_len -= grow;
3458
3459 skb_shinfo(skb)->frags[0].page_offset += grow;
3460 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3461
3462 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3463 skb_frag_unref(skb, 0);
3464 memmove(skb_shinfo(skb)->frags,
3465 skb_shinfo(skb)->frags + 1,
3466 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3467 }
3468 }
3469
3470ok:
3471 return ret;
3472
3473normal:
3474 ret = GRO_NORMAL;
3475 goto pull;
3476}
3477EXPORT_SYMBOL(dev_gro_receive);
3478
3479static inline gro_result_t
3480__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3481{
3482 struct sk_buff *p;
3483 unsigned int maclen = skb->dev->hard_header_len;
3484
3485 for (p = napi->gro_list; p; p = p->next) {
3486 unsigned long diffs;
3487
3488 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3489 diffs |= p->vlan_tci ^ skb->vlan_tci;
3490 if (maclen == ETH_HLEN)
3491 diffs |= compare_ether_header(skb_mac_header(p),
3492 skb_gro_mac_header(skb));
3493 else if (!diffs)
3494 diffs = memcmp(skb_mac_header(p),
3495 skb_gro_mac_header(skb),
3496 maclen);
3497 NAPI_GRO_CB(p)->same_flow = !diffs;
3498 NAPI_GRO_CB(p)->flush = 0;
3499 }
3500
3501 return dev_gro_receive(napi, skb);
3502}
3503
3504gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3505{
3506 switch (ret) {
3507 case GRO_NORMAL:
3508 if (netif_receive_skb(skb))
3509 ret = GRO_DROP;
3510 break;
3511
3512 case GRO_DROP:
3513 kfree_skb(skb);
3514 break;
3515
3516 case GRO_MERGED_FREE:
3517 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3518 kmem_cache_free(skbuff_head_cache, skb);
3519 else
3520 __kfree_skb(skb);
3521 break;
3522
3523 case GRO_HELD:
3524 case GRO_MERGED:
3525 break;
3526 }
3527
3528 return ret;
3529}
3530EXPORT_SYMBOL(napi_skb_finish);
3531
3532void skb_gro_reset_offset(struct sk_buff *skb)
3533{
3534 NAPI_GRO_CB(skb)->data_offset = 0;
3535 NAPI_GRO_CB(skb)->frag0 = NULL;
3536 NAPI_GRO_CB(skb)->frag0_len = 0;
3537
3538 if (skb->mac_header == skb->tail &&
3539 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3540 NAPI_GRO_CB(skb)->frag0 =
3541 skb_frag_address(&skb_shinfo(skb)->frags[0]);
3542 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3543 }
3544}
3545EXPORT_SYMBOL(skb_gro_reset_offset);
3546
3547gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3548{
3549 skb_gro_reset_offset(skb);
3550
3551 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3552}
3553EXPORT_SYMBOL(napi_gro_receive);
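
/*
 * Illustrative sketch only: the receive half of a NAPI ->poll() callback
 * feeding frames through GRO.  example_rx_frame() is a hypothetical
 * stand-in for the driver-specific "take the next completed frame off the
 * RX ring" step; here it simply returns NULL so the sketch stays
 * self-contained.
 */
static struct sk_buff *example_rx_frame(struct net_device *dev)
{
	return NULL;		/* ring handling omitted in this sketch */
}

static int example_napi_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;
	struct sk_buff *skb;

	while (work_done < budget &&
	       (skb = example_rx_frame(napi->dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);
		work_done++;
	}

	if (work_done < budget) {
		napi_complete(napi);
		/* re-enable the device's RX interrupt here */
	}

	return work_done;
}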
3554
3555static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3556{
3557 __skb_pull(skb, skb_headlen(skb));
3558 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3559 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3560 skb->vlan_tci = 0;
3561 skb->dev = napi->dev;
3562 skb->skb_iif = 0;
3563
3564 napi->skb = skb;
3565}
3566
3567struct sk_buff *napi_get_frags(struct napi_struct *napi)
3568{
3569 struct sk_buff *skb = napi->skb;
3570
3571 if (!skb) {
3572 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3573 if (skb)
3574 napi->skb = skb;
3575 }
3576 return skb;
3577}
3578EXPORT_SYMBOL(napi_get_frags);
3579
3580gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3581 gro_result_t ret)
3582{
3583 switch (ret) {
3584 case GRO_NORMAL:
3585 case GRO_HELD:
3586 skb->protocol = eth_type_trans(skb, skb->dev);
3587
3588 if (ret == GRO_HELD)
3589 skb_gro_pull(skb, -ETH_HLEN);
3590 else if (netif_receive_skb(skb))
3591 ret = GRO_DROP;
3592 break;
3593
3594 case GRO_DROP:
3595 case GRO_MERGED_FREE:
3596 napi_reuse_skb(napi, skb);
3597 break;
3598
3599 case GRO_MERGED:
3600 break;
3601 }
3602
3603 return ret;
3604}
3605EXPORT_SYMBOL(napi_frags_finish);
3606
3607static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3608{
3609 struct sk_buff *skb = napi->skb;
3610 struct ethhdr *eth;
3611 unsigned int hlen;
3612 unsigned int off;
3613
3614 napi->skb = NULL;
3615
3616 skb_reset_mac_header(skb);
3617 skb_gro_reset_offset(skb);
3618
3619 off = skb_gro_offset(skb);
3620 hlen = off + sizeof(*eth);
3621 eth = skb_gro_header_fast(skb, off);
3622 if (skb_gro_header_hard(skb, hlen)) {
3623 eth = skb_gro_header_slow(skb, hlen, off);
3624 if (unlikely(!eth)) {
3625 napi_reuse_skb(napi, skb);
3626 skb = NULL;
3627 goto out;
3628 }
3629 }
3630
3631 skb_gro_pull(skb, sizeof(*eth));
3632
3633 /*
3634 * This works because the only protocols we care about don't require
3635 * special handling. We'll fix it up properly at the end.
3636 */
3637 skb->protocol = eth->h_proto;
3638
3639out:
3640 return skb;
3641}
3642
3643gro_result_t napi_gro_frags(struct napi_struct *napi)
3644{
3645 struct sk_buff *skb = napi_frags_skb(napi);
3646
3647 if (!skb)
3648 return GRO_DROP;
3649
3650 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3651}
3652EXPORT_SYMBOL(napi_gro_frags);
3653
3654/*
3655 * net_rps_action sends any pending IPI's for rps.
3656 * Note: called with local irq disabled, but exits with local irq enabled.
3657 */
3658static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3659{
3660#ifdef CONFIG_RPS
3661 struct softnet_data *remsd = sd->rps_ipi_list;
3662
3663 if (remsd) {
3664 sd->rps_ipi_list = NULL;
3665
3666 local_irq_enable();
3667
3668 /* Send pending IPI's to kick RPS processing on remote cpus. */
3669 while (remsd) {
3670 struct softnet_data *next = remsd->rps_ipi_next;
3671
3672 if (cpu_online(remsd->cpu))
3673 __smp_call_function_single(remsd->cpu,
3674 &remsd->csd, 0);
3675 remsd = next;
3676 }
3677 } else
3678#endif
3679 local_irq_enable();
3680}
3681
3682static int process_backlog(struct napi_struct *napi, int quota)
3683{
3684 int work = 0;
3685 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3686
3687#ifdef CONFIG_RPS
	/* Check if we have pending IPIs; it is better to send them now
	 * rather than waiting for net_rx_action() to end.
	 */
3691 if (sd->rps_ipi_list) {
3692 local_irq_disable();
3693 net_rps_action_and_irq_enable(sd);
3694 }
3695#endif
3696 napi->weight = weight_p;
3697 local_irq_disable();
3698 while (work < quota) {
3699 struct sk_buff *skb;
3700 unsigned int qlen;
3701
3702 while ((skb = __skb_dequeue(&sd->process_queue))) {
3703 local_irq_enable();
3704 __netif_receive_skb(skb);
3705 local_irq_disable();
3706 input_queue_head_incr(sd);
3707 if (++work >= quota) {
3708 local_irq_enable();
3709 return work;
3710 }
3711 }
3712
3713 rps_lock(sd);
3714 qlen = skb_queue_len(&sd->input_pkt_queue);
3715 if (qlen)
3716 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3717 &sd->process_queue);
3718
3719 if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * Only the current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on the backlog, so we can use a plain write instead
			 * of clear_bit() and we don't need an smp_mb() memory
			 * barrier.
			 */
3727 list_del(&napi->poll_list);
3728 napi->state = 0;
3729
3730 quota = work + qlen;
3731 }
3732 rps_unlock(sd);
3733 }
3734 local_irq_enable();
3735
3736 return work;
3737}
3738
3739/**
3740 * __napi_schedule - schedule for receive
3741 * @n: entry to schedule
3742 *
3743 * The entry's receive function will be scheduled to run
3744 */
3745void __napi_schedule(struct napi_struct *n)
3746{
3747 unsigned long flags;
3748
3749 local_irq_save(flags);
3750 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3751 local_irq_restore(flags);
3752}
3753EXPORT_SYMBOL(__napi_schedule);
3754
3755void __napi_complete(struct napi_struct *n)
3756{
3757 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3758 BUG_ON(n->gro_list);
3759
3760 list_del(&n->poll_list);
3761 smp_mb__before_clear_bit();
3762 clear_bit(NAPI_STATE_SCHED, &n->state);
3763}
3764EXPORT_SYMBOL(__napi_complete);
3765
3766void napi_complete(struct napi_struct *n)
3767{
3768 unsigned long flags;
3769
	/*
	 * Don't let napi dequeue from the cpu poll list
	 * just in case it's running on a different cpu.
	 */
3774 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3775 return;
3776
3777 napi_gro_flush(n);
3778 local_irq_save(flags);
3779 __napi_complete(n);
3780 local_irq_restore(flags);
3781}
3782EXPORT_SYMBOL(napi_complete);
3783
3784void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3785 int (*poll)(struct napi_struct *, int), int weight)
3786{
3787 INIT_LIST_HEAD(&napi->poll_list);
3788 napi->gro_count = 0;
3789 napi->gro_list = NULL;
3790 napi->skb = NULL;
3791 napi->poll = poll;
3792 napi->weight = weight;
3793 list_add(&napi->dev_list, &dev->napi_list);
3794 napi->dev = dev;
3795#ifdef CONFIG_NETPOLL
3796 spin_lock_init(&napi->poll_lock);
3797 napi->poll_owner = -1;
3798#endif
3799 set_bit(NAPI_STATE_SCHED, &napi->state);
3800}
3801EXPORT_SYMBOL(netif_napi_add);
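
/*
 * Illustrative sketch only: wiring up a NAPI context.  netif_napi_add() is
 * typically called once at probe time (before register_netdev()) and
 * napi_enable() from ndo_open().  The napi_struct would normally live in
 * the driver's private data; here it is passed in, and the poll callback
 * refers to the hypothetical example_napi_poll() sketched earlier.
 */
static void example_setup_napi(struct net_device *dev,
			       struct napi_struct *napi)
{
	netif_napi_add(dev, napi, example_napi_poll, 64);

	/* later, from the driver's ndo_open(): */
	napi_enable(napi);
}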
3802
3803void netif_napi_del(struct napi_struct *napi)
3804{
3805 struct sk_buff *skb, *next;
3806
3807 list_del_init(&napi->dev_list);
3808 napi_free_frags(napi);
3809
3810 for (skb = napi->gro_list; skb; skb = next) {
3811 next = skb->next;
3812 skb->next = NULL;
3813 kfree_skb(skb);
3814 }
3815
3816 napi->gro_list = NULL;
3817 napi->gro_count = 0;
3818}
3819EXPORT_SYMBOL(netif_napi_del);
3820
3821static void net_rx_action(struct softirq_action *h)
3822{
3823 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3824 unsigned long time_limit = jiffies + 2;
3825 int budget = netdev_budget;
3826 void *have;
3827
3828 local_irq_disable();
3829
3830 while (!list_empty(&sd->poll_list)) {
3831 struct napi_struct *n;
3832 int work, weight;
3833
		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which gives an
		 * average latency of 1.5/HZ.
		 */
3838 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3839 goto softnet_break;
3840
3841 local_irq_enable();
3842
3843 /* Even though interrupts have been re-enabled, this
3844 * access is safe because interrupts can only add new
3845 * entries to the tail of this list, and only ->poll()
3846 * calls can remove this head entry from the list.
3847 */
3848 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3849
3850 have = netpoll_poll_lock(n);
3851
3852 weight = n->weight;
3853
3854 /* This NAPI_STATE_SCHED test is for avoiding a race
3855 * with netpoll's poll_napi(). Only the entity which
3856 * obtains the lock and sees NAPI_STATE_SCHED set will
3857 * actually make the ->poll() call. Therefore we avoid
3858 * accidentally calling ->poll() when NAPI is not scheduled.
3859 */
3860 work = 0;
3861 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3862 work = n->poll(n, weight);
3863 trace_napi_poll(n);
3864 }
3865
3866 WARN_ON_ONCE(work > weight);
3867
3868 budget -= work;
3869
3870 local_irq_disable();
3871
3872 /* Drivers must not modify the NAPI state if they
3873 * consume the entire weight. In such cases this code
3874 * still "owns" the NAPI instance and therefore can
3875 * move the instance around on the list at-will.
3876 */
3877 if (unlikely(work == weight)) {
3878 if (unlikely(napi_disable_pending(n))) {
3879 local_irq_enable();
3880 napi_complete(n);
3881 local_irq_disable();
3882 } else
3883 list_move_tail(&n->poll_list, &sd->poll_list);
3884 }
3885
3886 netpoll_poll_unlock(have);
3887 }
3888out:
3889 net_rps_action_and_irq_enable(sd);
3890
3891#ifdef CONFIG_NET_DMA
3892 /*
3893 * There may not be any more sk_buffs coming right now, so push
3894 * any pending DMA copies to hardware
3895 */
3896 dma_issue_pending_all();
3897#endif
3898
3899 return;
3900
3901softnet_break:
3902 sd->time_squeeze++;
3903 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3904 goto out;
3905}
3906
3907static gifconf_func_t *gifconf_list[NPROTO];
3908
3909/**
3910 * register_gifconf - register a SIOCGIF handler
3911 * @family: Address family
3912 * @gifconf: Function handler
3913 *
3914 * Register protocol dependent address dumping routines. The handler
3915 * that is passed must not be freed or reused until it has been replaced
3916 * by another handler.
3917 */
3918int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3919{
3920 if (family >= NPROTO)
3921 return -EINVAL;
3922 gifconf_list[family] = gifconf;
3923 return 0;
3924}
3925EXPORT_SYMBOL(register_gifconf);
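/*
 * Illustrative usage sketch (editorial addition): an address family installs
 * its SIOCGIFCONF dumper once at init time; inet_gifconf here stands for the
 * kind of handler the IPv4 code registers:
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 */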
3926
3927
3928/*
3929 * Map an interface index to its name (SIOCGIFNAME)
3930 */
3931
3932/*
3933 * We need this ioctl for efficient implementation of the
3934 * if_indextoname() function required by the IPv6 API. Without
3935 * it, we would have to search all the interfaces to find a
3936 * match. --pb
3937 */
3938
3939static int dev_ifname(struct net *net, struct ifreq __user *arg)
3940{
3941 struct net_device *dev;
3942 struct ifreq ifr;
3943
3944 /*
3945 * Fetch the caller's info block.
3946 */
3947
3948 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3949 return -EFAULT;
3950
3951 rcu_read_lock();
3952 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3953 if (!dev) {
3954 rcu_read_unlock();
3955 return -ENODEV;
3956 }
3957
3958 strcpy(ifr.ifr_name, dev->name);
3959 rcu_read_unlock();
3960
3961 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3962 return -EFAULT;
3963 return 0;
3964}
3965
3966/*
3967 * Perform a SIOCGIFCONF call. This structure will change
3968 * size eventually, and there is nothing I can do about it.
3969 * Thus we will need a 'compatibility mode'.
3970 */
3971
3972static int dev_ifconf(struct net *net, char __user *arg)
3973{
3974 struct ifconf ifc;
3975 struct net_device *dev;
3976 char __user *pos;
3977 int len;
3978 int total;
3979 int i;
3980
3981 /*
3982 * Fetch the caller's info block.
3983 */
3984
3985 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3986 return -EFAULT;
3987
3988 pos = ifc.ifc_buf;
3989 len = ifc.ifc_len;
3990
3991 /*
3992 * Loop over the interfaces, and write an info block for each.
3993 */
3994
3995 total = 0;
3996 for_each_netdev(net, dev) {
3997 for (i = 0; i < NPROTO; i++) {
3998 if (gifconf_list[i]) {
3999 int done;
4000 if (!pos)
4001 done = gifconf_list[i](dev, NULL, 0);
4002 else
4003 done = gifconf_list[i](dev, pos + total,
4004 len - total);
4005 if (done < 0)
4006 return -EFAULT;
4007 total += done;
4008 }
4009 }
4010 }
4011
4012 /*
4013 * All done. Write the updated control block back to the caller.
4014 */
4015 ifc.ifc_len = total;
4016
4017 /*
4018 * Both BSD and Solaris return 0 here, so we do too.
4019 */
4020 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4021}
4022
4023#ifdef CONFIG_PROC_FS
4024
4025#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4026
4027#define get_bucket(x) ((x) >> BUCKET_SPACE)
4028#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4029#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4030
4031static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4032{
4033 struct net *net = seq_file_net(seq);
4034 struct net_device *dev;
4035 struct hlist_node *p;
4036 struct hlist_head *h;
4037 unsigned int count = 0, offset = get_offset(*pos);
4038
4039 h = &net->dev_name_head[get_bucket(*pos)];
4040 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4041 if (++count == offset)
4042 return dev;
4043 }
4044
4045 return NULL;
4046}
4047
4048static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4049{
4050 struct net_device *dev;
4051 unsigned int bucket;
4052
4053 do {
4054 dev = dev_from_same_bucket(seq, pos);
4055 if (dev)
4056 return dev;
4057
4058 bucket = get_bucket(*pos) + 1;
4059 *pos = set_bucket_offset(bucket, 1);
4060 } while (bucket < NETDEV_HASHENTRIES);
4061
4062 return NULL;
4063}
4064
4065/*
4066 * This is invoked by the /proc filesystem handler to display a device
4067 * in detail.
4068 */
4069void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4070 __acquires(RCU)
4071{
4072 rcu_read_lock();
4073 if (!*pos)
4074 return SEQ_START_TOKEN;
4075
4076 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4077 return NULL;
4078
4079 return dev_from_bucket(seq, pos);
4080}
4081
4082void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4083{
4084 ++*pos;
4085 return dev_from_bucket(seq, pos);
4086}
4087
4088void dev_seq_stop(struct seq_file *seq, void *v)
4089 __releases(RCU)
4090{
4091 rcu_read_unlock();
4092}
4093
4094static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4095{
4096 struct rtnl_link_stats64 temp;
4097 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4098
4099 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4100 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4101 dev->name, stats->rx_bytes, stats->rx_packets,
4102 stats->rx_errors,
4103 stats->rx_dropped + stats->rx_missed_errors,
4104 stats->rx_fifo_errors,
4105 stats->rx_length_errors + stats->rx_over_errors +
4106 stats->rx_crc_errors + stats->rx_frame_errors,
4107 stats->rx_compressed, stats->multicast,
4108 stats->tx_bytes, stats->tx_packets,
4109 stats->tx_errors, stats->tx_dropped,
4110 stats->tx_fifo_errors, stats->collisions,
4111 stats->tx_carrier_errors +
4112 stats->tx_aborted_errors +
4113 stats->tx_window_errors +
4114 stats->tx_heartbeat_errors,
4115 stats->tx_compressed);
4116}
4117
4118/*
4119 * Called from the PROCfs module. This now uses the new arbitrary sized
4120 * /proc/net interface to create /proc/net/dev
4121 */
4122static int dev_seq_show(struct seq_file *seq, void *v)
4123{
4124 if (v == SEQ_START_TOKEN)
4125 seq_puts(seq, "Inter-| Receive "
4126 " | Transmit\n"
4127 " face |bytes packets errs drop fifo frame "
4128 "compressed multicast|bytes packets errs "
4129 "drop fifo colls carrier compressed\n");
4130 else
4131 dev_seq_printf_stats(seq, v);
4132 return 0;
4133}
4134
4135static struct softnet_data *softnet_get_online(loff_t *pos)
4136{
4137 struct softnet_data *sd = NULL;
4138
4139 while (*pos < nr_cpu_ids)
4140 if (cpu_online(*pos)) {
4141 sd = &per_cpu(softnet_data, *pos);
4142 break;
4143 } else
4144 ++*pos;
4145 return sd;
4146}
4147
4148static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4149{
4150 return softnet_get_online(pos);
4151}
4152
4153static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4154{
4155 ++*pos;
4156 return softnet_get_online(pos);
4157}
4158
4159static void softnet_seq_stop(struct seq_file *seq, void *v)
4160{
4161}
4162
4163static int softnet_seq_show(struct seq_file *seq, void *v)
4164{
4165 struct softnet_data *sd = v;
4166
4167 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4168 sd->processed, sd->dropped, sd->time_squeeze, 0,
4169 0, 0, 0, 0, /* was fastroute */
4170 sd->cpu_collision, sd->received_rps);
4171 return 0;
4172}
4173
4174static const struct seq_operations dev_seq_ops = {
4175 .start = dev_seq_start,
4176 .next = dev_seq_next,
4177 .stop = dev_seq_stop,
4178 .show = dev_seq_show,
4179};
4180
4181static int dev_seq_open(struct inode *inode, struct file *file)
4182{
4183 return seq_open_net(inode, file, &dev_seq_ops,
4184 sizeof(struct seq_net_private));
4185}
4186
4187static const struct file_operations dev_seq_fops = {
4188 .owner = THIS_MODULE,
4189 .open = dev_seq_open,
4190 .read = seq_read,
4191 .llseek = seq_lseek,
4192 .release = seq_release_net,
4193};
4194
4195static const struct seq_operations softnet_seq_ops = {
4196 .start = softnet_seq_start,
4197 .next = softnet_seq_next,
4198 .stop = softnet_seq_stop,
4199 .show = softnet_seq_show,
4200};
4201
4202static int softnet_seq_open(struct inode *inode, struct file *file)
4203{
4204 return seq_open(file, &softnet_seq_ops);
4205}
4206
4207static const struct file_operations softnet_seq_fops = {
4208 .owner = THIS_MODULE,
4209 .open = softnet_seq_open,
4210 .read = seq_read,
4211 .llseek = seq_lseek,
4212 .release = seq_release,
4213};
4214
4215static void *ptype_get_idx(loff_t pos)
4216{
4217 struct packet_type *pt = NULL;
4218 loff_t i = 0;
4219 int t;
4220
4221 list_for_each_entry_rcu(pt, &ptype_all, list) {
4222 if (i == pos)
4223 return pt;
4224 ++i;
4225 }
4226
4227 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4228 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4229 if (i == pos)
4230 return pt;
4231 ++i;
4232 }
4233 }
4234 return NULL;
4235}
4236
4237static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4238 __acquires(RCU)
4239{
4240 rcu_read_lock();
4241 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4242}
4243
4244static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4245{
4246 struct packet_type *pt;
4247 struct list_head *nxt;
4248 int hash;
4249
4250 ++*pos;
4251 if (v == SEQ_START_TOKEN)
4252 return ptype_get_idx(0);
4253
4254 pt = v;
4255 nxt = pt->list.next;
4256 if (pt->type == htons(ETH_P_ALL)) {
4257 if (nxt != &ptype_all)
4258 goto found;
4259 hash = 0;
4260 nxt = ptype_base[0].next;
4261 } else
4262 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4263
4264 while (nxt == &ptype_base[hash]) {
4265 if (++hash >= PTYPE_HASH_SIZE)
4266 return NULL;
4267 nxt = ptype_base[hash].next;
4268 }
4269found:
4270 return list_entry(nxt, struct packet_type, list);
4271}
4272
4273static void ptype_seq_stop(struct seq_file *seq, void *v)
4274 __releases(RCU)
4275{
4276 rcu_read_unlock();
4277}
4278
4279static int ptype_seq_show(struct seq_file *seq, void *v)
4280{
4281 struct packet_type *pt = v;
4282
4283 if (v == SEQ_START_TOKEN)
4284 seq_puts(seq, "Type Device Function\n");
4285 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4286 if (pt->type == htons(ETH_P_ALL))
4287 seq_puts(seq, "ALL ");
4288 else
4289 seq_printf(seq, "%04x", ntohs(pt->type));
4290
4291 seq_printf(seq, " %-8s %pF\n",
4292 pt->dev ? pt->dev->name : "", pt->func);
4293 }
4294
4295 return 0;
4296}
4297
4298static const struct seq_operations ptype_seq_ops = {
4299 .start = ptype_seq_start,
4300 .next = ptype_seq_next,
4301 .stop = ptype_seq_stop,
4302 .show = ptype_seq_show,
4303};
4304
4305static int ptype_seq_open(struct inode *inode, struct file *file)
4306{
4307 return seq_open_net(inode, file, &ptype_seq_ops,
4308 sizeof(struct seq_net_private));
4309}
4310
4311static const struct file_operations ptype_seq_fops = {
4312 .owner = THIS_MODULE,
4313 .open = ptype_seq_open,
4314 .read = seq_read,
4315 .llseek = seq_lseek,
4316 .release = seq_release_net,
4317};
4318
4319
4320static int __net_init dev_proc_net_init(struct net *net)
4321{
4322 int rc = -ENOMEM;
4323
4324 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4325 goto out;
4326 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4327 goto out_dev;
4328 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4329 goto out_softnet;
4330
4331 if (wext_proc_init(net))
4332 goto out_ptype;
4333 rc = 0;
4334out:
4335 return rc;
4336out_ptype:
4337 proc_net_remove(net, "ptype");
4338out_softnet:
4339 proc_net_remove(net, "softnet_stat");
4340out_dev:
4341 proc_net_remove(net, "dev");
4342 goto out;
4343}
4344
4345static void __net_exit dev_proc_net_exit(struct net *net)
4346{
4347 wext_proc_exit(net);
4348
4349 proc_net_remove(net, "ptype");
4350 proc_net_remove(net, "softnet_stat");
4351 proc_net_remove(net, "dev");
4352}
4353
4354static struct pernet_operations __net_initdata dev_proc_ops = {
4355 .init = dev_proc_net_init,
4356 .exit = dev_proc_net_exit,
4357};
4358
4359static int __init dev_proc_init(void)
4360{
4361 return register_pernet_subsys(&dev_proc_ops);
4362}
4363#else
4364#define dev_proc_init() 0
4365#endif /* CONFIG_PROC_FS */
4366
4367
4368/**
4369 * netdev_set_master - set up master pointer
4370 * @slave: slave device
4371 * @master: new master device
4372 *
4373 * Changes the master device of the slave. Pass %NULL to break the
4374 * bonding. The caller must hold the RTNL semaphore. On a failure
4375 * a negative errno code is returned. On success the reference counts
4376 * are adjusted and the function returns zero.
4377 */
4378int netdev_set_master(struct net_device *slave, struct net_device *master)
4379{
4380 struct net_device *old = slave->master;
4381
4382 ASSERT_RTNL();
4383
4384 if (master) {
4385 if (old)
4386 return -EBUSY;
4387 dev_hold(master);
4388 }
4389
4390 slave->master = master;
4391
4392 if (old)
4393 dev_put(old);
4394 return 0;
4395}
4396EXPORT_SYMBOL(netdev_set_master);
4397
4398/**
4399 * netdev_set_bond_master - set up bonding master/slave pair
4400 * @slave: slave device
4401 * @master: new master device
4402 *
4403 * Changes the master device of the slave. Pass %NULL to break the
4404 * bonding. The caller must hold the RTNL semaphore. On a failure
4405 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4406 * to the routing socket and the function returns zero.
4407 */
4408int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4409{
4410 int err;
4411
4412 ASSERT_RTNL();
4413
4414 err = netdev_set_master(slave, master);
4415 if (err)
4416 return err;
4417 if (master)
4418 slave->flags |= IFF_SLAVE;
4419 else
4420 slave->flags &= ~IFF_SLAVE;
4421
4422 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4423 return 0;
4424}
4425EXPORT_SYMBOL(netdev_set_bond_master);
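/*
 * Illustrative usage sketch (editorial addition): a bonding-style driver
 * enslaves a lower device and later releases it, both under RTNL.
 * bond_dev and slave_dev are hypothetical net_device pointers.
 *
 *	rtnl_lock();
 *	err = netdev_set_bond_master(slave_dev, bond_dev);
 *	...
 *	netdev_set_bond_master(slave_dev, NULL);
 *	rtnl_unlock();
 */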
4426
4427static void dev_change_rx_flags(struct net_device *dev, int flags)
4428{
4429 const struct net_device_ops *ops = dev->netdev_ops;
4430
4431 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4432 ops->ndo_change_rx_flags(dev, flags);
4433}
4434
4435static int __dev_set_promiscuity(struct net_device *dev, int inc)
4436{
4437 unsigned int old_flags = dev->flags;
4438 uid_t uid;
4439 gid_t gid;
4440
4441 ASSERT_RTNL();
4442
4443 dev->flags |= IFF_PROMISC;
4444 dev->promiscuity += inc;
4445 if (dev->promiscuity == 0) {
4446 /*
4447 * Avoid overflow.
4448 * If inc causes overflow, untouch promisc and return error.
4449 */
4450 if (inc < 0)
4451 dev->flags &= ~IFF_PROMISC;
4452 else {
4453 dev->promiscuity -= inc;
4454 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4455 dev->name);
4456 return -EOVERFLOW;
4457 }
4458 }
4459 if (dev->flags != old_flags) {
4460 pr_info("device %s %s promiscuous mode\n",
4461 dev->name,
4462 dev->flags & IFF_PROMISC ? "entered" : "left");
4463 if (audit_enabled) {
4464 current_uid_gid(&uid, &gid);
4465 audit_log(current->audit_context, GFP_ATOMIC,
4466 AUDIT_ANOM_PROMISCUOUS,
4467 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4468 dev->name, (dev->flags & IFF_PROMISC),
4469 (old_flags & IFF_PROMISC),
4470 audit_get_loginuid(current),
4471 uid, gid,
4472 audit_get_sessionid(current));
4473 }
4474
4475 dev_change_rx_flags(dev, IFF_PROMISC);
4476 }
4477 return 0;
4478}
4479
4480/**
4481 * dev_set_promiscuity - update promiscuity count on a device
4482 * @dev: device
4483 * @inc: modifier
4484 *
4485 * Add or remove promiscuity from a device. While the count in the device
4486 * remains above zero the interface remains promiscuous. Once it hits zero
4487 * the device reverts to normal filtering operation. A negative inc
4488 * value is used to drop promiscuity on the device.
4489 * Return 0 if successful or a negative errno code on error.
4490 */
4491int dev_set_promiscuity(struct net_device *dev, int inc)
4492{
4493 unsigned int old_flags = dev->flags;
4494 int err;
4495
4496 err = __dev_set_promiscuity(dev, inc);
4497 if (err < 0)
4498 return err;
4499 if (dev->flags != old_flags)
4500 dev_set_rx_mode(dev);
4501 return err;
4502}
4503EXPORT_SYMBOL(dev_set_promiscuity);
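/*
 * Illustrative usage sketch (editorial addition): a packet-capture style
 * user bumps the promiscuity count while it needs to see all traffic and
 * drops it again when done, under RTNL. Checking the return value matters
 * because the counter can overflow.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	if (!err) {
 *		...
 *		dev_set_promiscuity(dev, -1);
 *	}
 *	rtnl_unlock();
 */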
4504
4505/**
4506 * dev_set_allmulti - update allmulti count on a device
4507 * @dev: device
4508 * @inc: modifier
4509 *
4510 * Add or remove reception of all multicast frames on a device. While the
4511 * count in the device remains above zero the interface keeps listening
4512 * to all multicast frames. Once it hits zero the device reverts to normal
4513 * filtering operation. A negative @inc value is used to drop the counter
4514 * when releasing a resource needing all multicasts.
4515 * Return 0 if successful or a negative errno code on error.
4516 */
4517
4518int dev_set_allmulti(struct net_device *dev, int inc)
4519{
4520 unsigned int old_flags = dev->flags;
4521
4522 ASSERT_RTNL();
4523
4524 dev->flags |= IFF_ALLMULTI;
4525 dev->allmulti += inc;
4526 if (dev->allmulti == 0) {
4527 /*
4528 * Avoid overflow.
4529 * If inc causes overflow, untouch allmulti and return error.
4530 */
4531 if (inc < 0)
4532 dev->flags &= ~IFF_ALLMULTI;
4533 else {
4534 dev->allmulti -= inc;
4535 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4536 dev->name);
4537 return -EOVERFLOW;
4538 }
4539 }
4540 if (dev->flags ^ old_flags) {
4541 dev_change_rx_flags(dev, IFF_ALLMULTI);
4542 dev_set_rx_mode(dev);
4543 }
4544 return 0;
4545}
4546EXPORT_SYMBOL(dev_set_allmulti);
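/*
 * Illustrative usage sketch (editorial addition): a protocol that needs
 * every multicast frame takes and releases the allmulti count the same way,
 * under RTNL:
 *
 *	dev_set_allmulti(dev, 1);
 *	...
 *	dev_set_allmulti(dev, -1);
 */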
4547
4548/*
4549 * Upload unicast and multicast address lists to device and
4550 * configure RX filtering. When the device doesn't support unicast
4551 * filtering it is put in promiscuous mode while unicast addresses
4552 * are present.
4553 */
4554void __dev_set_rx_mode(struct net_device *dev)
4555{
4556 const struct net_device_ops *ops = dev->netdev_ops;
4557
4558 /* dev_open will call this function so the list will stay sane. */
4559 if (!(dev->flags&IFF_UP))
4560 return;
4561
4562 if (!netif_device_present(dev))
4563 return;
4564
4565 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4566		/* Unicast address changes may only happen under the rtnl,
4567 * therefore calling __dev_set_promiscuity here is safe.
4568 */
4569 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4570 __dev_set_promiscuity(dev, 1);
4571 dev->uc_promisc = true;
4572 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4573 __dev_set_promiscuity(dev, -1);
4574 dev->uc_promisc = false;
4575 }
4576 }
4577
4578 if (ops->ndo_set_rx_mode)
4579 ops->ndo_set_rx_mode(dev);
4580}
4581
4582void dev_set_rx_mode(struct net_device *dev)
4583{
4584 netif_addr_lock_bh(dev);
4585 __dev_set_rx_mode(dev);
4586 netif_addr_unlock_bh(dev);
4587}
4588
4589/**
4590 * dev_get_flags - get flags reported to userspace
4591 * @dev: device
4592 *
4593 * Get the combination of flag bits exported through APIs to userspace.
4594 */
4595unsigned int dev_get_flags(const struct net_device *dev)
4596{
4597 unsigned int flags;
4598
4599 flags = (dev->flags & ~(IFF_PROMISC |
4600 IFF_ALLMULTI |
4601 IFF_RUNNING |
4602 IFF_LOWER_UP |
4603 IFF_DORMANT)) |
4604 (dev->gflags & (IFF_PROMISC |
4605 IFF_ALLMULTI));
4606
4607 if (netif_running(dev)) {
4608 if (netif_oper_up(dev))
4609 flags |= IFF_RUNNING;
4610 if (netif_carrier_ok(dev))
4611 flags |= IFF_LOWER_UP;
4612 if (netif_dormant(dev))
4613 flags |= IFF_DORMANT;
4614 }
4615
4616 return flags;
4617}
4618EXPORT_SYMBOL(dev_get_flags);
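/*
 * Illustrative usage sketch (editorial addition): the combined flag word is
 * what SIOCGIFFLAGS reports, so in-kernel callers can test it the same way
 * userspace does:
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *		pr_debug("%s is up and running\n", dev->name);
 */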
4619
4620int __dev_change_flags(struct net_device *dev, unsigned int flags)
4621{
4622 unsigned int old_flags = dev->flags;
4623 int ret;
4624
4625 ASSERT_RTNL();
4626
4627 /*
4628 * Set the flags on our device.
4629 */
4630
4631 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4632 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4633 IFF_AUTOMEDIA)) |
4634 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4635 IFF_ALLMULTI));
4636
4637 /*
4638 * Load in the correct multicast list now the flags have changed.
4639 */
4640
4641 if ((old_flags ^ flags) & IFF_MULTICAST)
4642 dev_change_rx_flags(dev, IFF_MULTICAST);
4643
4644 dev_set_rx_mode(dev);
4645
4646 /*
4647	 * Have we downed the interface? We handle IFF_UP ourselves
4648 * according to user attempts to set it, rather than blindly
4649 * setting it.
4650 */
4651
4652 ret = 0;
4653 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4654 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4655
4656 if (!ret)
4657 dev_set_rx_mode(dev);
4658 }
4659
4660 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4661 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4662
4663 dev->gflags ^= IFF_PROMISC;
4664 dev_set_promiscuity(dev, inc);
4665 }
4666
4667 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4668	   is important. Some (broken) drivers set IFF_PROMISC when
4669	   IFF_ALLMULTI is requested, without asking us and without reporting.
4670 */
4671 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4672 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4673
4674 dev->gflags ^= IFF_ALLMULTI;
4675 dev_set_allmulti(dev, inc);
4676 }
4677
4678 return ret;
4679}
4680
4681void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4682{
4683 unsigned int changes = dev->flags ^ old_flags;
4684
4685 if (changes & IFF_UP) {
4686 if (dev->flags & IFF_UP)
4687 call_netdevice_notifiers(NETDEV_UP, dev);
4688 else
4689 call_netdevice_notifiers(NETDEV_DOWN, dev);
4690 }
4691
4692 if (dev->flags & IFF_UP &&
4693 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4694 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4695}
4696
4697/**
4698 * dev_change_flags - change device settings
4699 * @dev: device
4700 * @flags: device state flags
4701 *
4702 * Change settings on device based state flags. The flags are
4703 * in the userspace exported format.
4704 */
4705int dev_change_flags(struct net_device *dev, unsigned int flags)
4706{
4707 int ret;
4708 unsigned int changes, old_flags = dev->flags;
4709
4710 ret = __dev_change_flags(dev, flags);
4711 if (ret < 0)
4712 return ret;
4713
4714 changes = old_flags ^ dev->flags;
4715 if (changes)
4716 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4717
4718 __dev_notify_flags(dev, old_flags);
4719 return ret;
4720}
4721EXPORT_SYMBOL(dev_change_flags);
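/*
 * Illustrative usage sketch (editorial addition): bringing an interface up
 * from inside the kernel mirrors what SIOCSIFFLAGS does and must run under
 * RTNL:
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */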
4722
4723/**
4724 * dev_set_mtu - Change maximum transfer unit
4725 * @dev: device
4726 * @new_mtu: new transfer unit
4727 *
4728 * Change the maximum transfer size of the network device.
4729 */
4730int dev_set_mtu(struct net_device *dev, int new_mtu)
4731{
4732 const struct net_device_ops *ops = dev->netdev_ops;
4733 int err;
4734
4735 if (new_mtu == dev->mtu)
4736 return 0;
4737
4738 /* MTU must be positive. */
4739 if (new_mtu < 0)
4740 return -EINVAL;
4741
4742 if (!netif_device_present(dev))
4743 return -ENODEV;
4744
4745 err = 0;
4746 if (ops->ndo_change_mtu)
4747 err = ops->ndo_change_mtu(dev, new_mtu);
4748 else
4749 dev->mtu = new_mtu;
4750
4751 if (!err && dev->flags & IFF_UP)
4752 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4753 return err;
4754}
4755EXPORT_SYMBOL(dev_set_mtu);
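/*
 * Illustrative usage sketch (editorial addition): a stacked device such as
 * a VLAN or tunnel typically clamps its own MTU to the lower device, under
 * RTNL; "overhead" stands for the per-packet encapsulation cost.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(upper_dev, lower_dev->mtu - overhead);
 *	rtnl_unlock();
 */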
4756
4757/**
4758 * dev_set_group - Change group this device belongs to
4759 * @dev: device
4760 * @new_group: group this device should belong to
4761 */
4762void dev_set_group(struct net_device *dev, int new_group)
4763{
4764 dev->group = new_group;
4765}
4766EXPORT_SYMBOL(dev_set_group);
4767
4768/**
4769 * dev_set_mac_address - Change Media Access Control Address
4770 * @dev: device
4771 * @sa: new address
4772 *
4773 * Change the hardware (MAC) address of the device
4774 */
4775int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4776{
4777 const struct net_device_ops *ops = dev->netdev_ops;
4778 int err;
4779
4780 if (!ops->ndo_set_mac_address)
4781 return -EOPNOTSUPP;
4782 if (sa->sa_family != dev->type)
4783 return -EINVAL;
4784 if (!netif_device_present(dev))
4785 return -ENODEV;
4786 err = ops->ndo_set_mac_address(dev, sa);
4787 if (!err)
4788 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4789 add_device_randomness(dev->dev_addr, dev->addr_len);
4790 return err;
4791}
4792EXPORT_SYMBOL(dev_set_mac_address);
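/*
 * Illustrative usage sketch (editorial addition): an in-kernel caller builds
 * a sockaddr of the device's own type, which is exactly what SIOCSIFHWADDR
 * ends up doing; new_addr is a hypothetical buffer of dev->addr_len bytes.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */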
4793
4794/*
4795 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4796 */
4797static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4798{
4799 int err;
4800 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4801
4802 if (!dev)
4803 return -ENODEV;
4804
4805 switch (cmd) {
4806 case SIOCGIFFLAGS: /* Get interface flags */
4807 ifr->ifr_flags = (short) dev_get_flags(dev);
4808 return 0;
4809
4810 case SIOCGIFMETRIC: /* Get the metric on the interface
4811 (currently unused) */
4812 ifr->ifr_metric = 0;
4813 return 0;
4814
4815 case SIOCGIFMTU: /* Get the MTU of a device */
4816 ifr->ifr_mtu = dev->mtu;
4817 return 0;
4818
4819 case SIOCGIFHWADDR:
4820 if (!dev->addr_len)
4821 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4822 else
4823 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4824 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4825 ifr->ifr_hwaddr.sa_family = dev->type;
4826 return 0;
4827
4828 case SIOCGIFSLAVE:
4829 err = -EINVAL;
4830 break;
4831
4832 case SIOCGIFMAP:
4833 ifr->ifr_map.mem_start = dev->mem_start;
4834 ifr->ifr_map.mem_end = dev->mem_end;
4835 ifr->ifr_map.base_addr = dev->base_addr;
4836 ifr->ifr_map.irq = dev->irq;
4837 ifr->ifr_map.dma = dev->dma;
4838 ifr->ifr_map.port = dev->if_port;
4839 return 0;
4840
4841 case SIOCGIFINDEX:
4842 ifr->ifr_ifindex = dev->ifindex;
4843 return 0;
4844
4845 case SIOCGIFTXQLEN:
4846 ifr->ifr_qlen = dev->tx_queue_len;
4847 return 0;
4848
4849 default:
4850 /* dev_ioctl() should ensure this case
4851 * is never reached
4852 */
4853 WARN_ON(1);
4854 err = -ENOTTY;
4855 break;
4856
4857 }
4858 return err;
4859}
4860
4861/*
4862 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4863 */
4864static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4865{
4866 int err;
4867 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4868 const struct net_device_ops *ops;
4869
4870 if (!dev)
4871 return -ENODEV;
4872
4873 ops = dev->netdev_ops;
4874
4875 switch (cmd) {
4876 case SIOCSIFFLAGS: /* Set interface flags */
4877 return dev_change_flags(dev, ifr->ifr_flags);
4878
4879 case SIOCSIFMETRIC: /* Set the metric on the interface
4880 (currently unused) */
4881 return -EOPNOTSUPP;
4882
4883 case SIOCSIFMTU: /* Set the MTU of a device */
4884 return dev_set_mtu(dev, ifr->ifr_mtu);
4885
4886 case SIOCSIFHWADDR:
4887 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4888
4889 case SIOCSIFHWBROADCAST:
4890 if (ifr->ifr_hwaddr.sa_family != dev->type)
4891 return -EINVAL;
4892 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4893 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4894 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4895 return 0;
4896
4897 case SIOCSIFMAP:
4898 if (ops->ndo_set_config) {
4899 if (!netif_device_present(dev))
4900 return -ENODEV;
4901 return ops->ndo_set_config(dev, &ifr->ifr_map);
4902 }
4903 return -EOPNOTSUPP;
4904
4905 case SIOCADDMULTI:
4906 if (!ops->ndo_set_rx_mode ||
4907 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4908 return -EINVAL;
4909 if (!netif_device_present(dev))
4910 return -ENODEV;
4911 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4912
4913 case SIOCDELMULTI:
4914 if (!ops->ndo_set_rx_mode ||
4915 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4916 return -EINVAL;
4917 if (!netif_device_present(dev))
4918 return -ENODEV;
4919 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4920
4921 case SIOCSIFTXQLEN:
4922 if (ifr->ifr_qlen < 0)
4923 return -EINVAL;
4924 dev->tx_queue_len = ifr->ifr_qlen;
4925 return 0;
4926
4927 case SIOCSIFNAME:
4928 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4929 return dev_change_name(dev, ifr->ifr_newname);
4930
4931 case SIOCSHWTSTAMP:
4932 err = net_hwtstamp_validate(ifr);
4933 if (err)
4934 return err;
4935 /* fall through */
4936
4937 /*
4938 * Unknown or private ioctl
4939 */
4940 default:
4941 if ((cmd >= SIOCDEVPRIVATE &&
4942 cmd <= SIOCDEVPRIVATE + 15) ||
4943 cmd == SIOCBONDENSLAVE ||
4944 cmd == SIOCBONDRELEASE ||
4945 cmd == SIOCBONDSETHWADDR ||
4946 cmd == SIOCBONDSLAVEINFOQUERY ||
4947 cmd == SIOCBONDINFOQUERY ||
4948 cmd == SIOCBONDCHANGEACTIVE ||
4949 cmd == SIOCGMIIPHY ||
4950 cmd == SIOCGMIIREG ||
4951 cmd == SIOCSMIIREG ||
4952 cmd == SIOCBRADDIF ||
4953 cmd == SIOCBRDELIF ||
4954 cmd == SIOCSHWTSTAMP ||
4955 cmd == SIOCWANDEV) {
4956 err = -EOPNOTSUPP;
4957 if (ops->ndo_do_ioctl) {
4958 if (netif_device_present(dev))
4959 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4960 else
4961 err = -ENODEV;
4962 }
4963 } else
4964 err = -EINVAL;
4965
4966 }
4967 return err;
4968}
4969
4970/*
4971 * This function handles all "interface"-type I/O control requests. The actual
4972 * 'doing' part of this is dev_ifsioc above.
4973 */
4974
4975/**
4976 * dev_ioctl - network device ioctl
4977 * @net: the applicable net namespace
4978 * @cmd: command to issue
4979 * @arg: pointer to a struct ifreq in user space
4980 *
4981 * Issue ioctl functions to devices. This is normally called by the
4982 * user space syscall interfaces but can sometimes be useful for
4983 * other purposes. The return value is the return from the syscall if
4984 * positive or a negative errno code on error.
4985 */
4986
4987int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4988{
4989 struct ifreq ifr;
4990 int ret;
4991 char *colon;
4992
4993	/* One special case: SIOCGIFCONF takes an ifconf argument
4994	   and requires a shared lock, because it sleeps writing
4995 to user space.
4996 */
4997
4998 if (cmd == SIOCGIFCONF) {
4999 rtnl_lock();
5000 ret = dev_ifconf(net, (char __user *) arg);
5001 rtnl_unlock();
5002 return ret;
5003 }
5004 if (cmd == SIOCGIFNAME)
5005 return dev_ifname(net, (struct ifreq __user *)arg);
5006
5007 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5008 return -EFAULT;
5009
5010 ifr.ifr_name[IFNAMSIZ-1] = 0;
5011
5012 colon = strchr(ifr.ifr_name, ':');
5013 if (colon)
5014 *colon = 0;
5015
5016 /*
5017 * See which interface the caller is talking about.
5018 */
5019
5020 switch (cmd) {
5021 /*
5022 * These ioctl calls:
5023 * - can be done by all.
5024 * - atomic and do not require locking.
5025 * - return a value
5026 */
5027 case SIOCGIFFLAGS:
5028 case SIOCGIFMETRIC:
5029 case SIOCGIFMTU:
5030 case SIOCGIFHWADDR:
5031 case SIOCGIFSLAVE:
5032 case SIOCGIFMAP:
5033 case SIOCGIFINDEX:
5034 case SIOCGIFTXQLEN:
5035 dev_load(net, ifr.ifr_name);
5036 rcu_read_lock();
5037 ret = dev_ifsioc_locked(net, &ifr, cmd);
5038 rcu_read_unlock();
5039 if (!ret) {
5040 if (colon)
5041 *colon = ':';
5042 if (copy_to_user(arg, &ifr,
5043 sizeof(struct ifreq)))
5044 ret = -EFAULT;
5045 }
5046 return ret;
5047
5048 case SIOCETHTOOL:
5049 dev_load(net, ifr.ifr_name);
5050 rtnl_lock();
5051 ret = dev_ethtool(net, &ifr);
5052 rtnl_unlock();
5053 if (!ret) {
5054 if (colon)
5055 *colon = ':';
5056 if (copy_to_user(arg, &ifr,
5057 sizeof(struct ifreq)))
5058 ret = -EFAULT;
5059 }
5060 return ret;
5061
5062 /*
5063 * These ioctl calls:
5064 * - require superuser power.
5065 * - require strict serialization.
5066 * - return a value
5067 */
5068 case SIOCGMIIPHY:
5069 case SIOCGMIIREG:
5070 case SIOCSIFNAME:
5071 if (!capable(CAP_NET_ADMIN))
5072 return -EPERM;
5073 dev_load(net, ifr.ifr_name);
5074 rtnl_lock();
5075 ret = dev_ifsioc(net, &ifr, cmd);
5076 rtnl_unlock();
5077 if (!ret) {
5078 if (colon)
5079 *colon = ':';
5080 if (copy_to_user(arg, &ifr,
5081 sizeof(struct ifreq)))
5082 ret = -EFAULT;
5083 }
5084 return ret;
5085
5086 /*
5087 * These ioctl calls:
5088 * - require superuser power.
5089 * - require strict serialization.
5090 * - do not return a value
5091 */
5092 case SIOCSIFFLAGS:
5093 case SIOCSIFMETRIC:
5094 case SIOCSIFMTU:
5095 case SIOCSIFMAP:
5096 case SIOCSIFHWADDR:
5097 case SIOCSIFSLAVE:
5098 case SIOCADDMULTI:
5099 case SIOCDELMULTI:
5100 case SIOCSIFHWBROADCAST:
5101 case SIOCSIFTXQLEN:
5102 case SIOCSMIIREG:
5103 case SIOCBONDENSLAVE:
5104 case SIOCBONDRELEASE:
5105 case SIOCBONDSETHWADDR:
5106 case SIOCBONDCHANGEACTIVE:
5107 case SIOCBRADDIF:
5108 case SIOCBRDELIF:
5109 case SIOCSHWTSTAMP:
5110 if (!capable(CAP_NET_ADMIN))
5111 return -EPERM;
5112 /* fall through */
5113 case SIOCBONDSLAVEINFOQUERY:
5114 case SIOCBONDINFOQUERY:
5115 dev_load(net, ifr.ifr_name);
5116 rtnl_lock();
5117 ret = dev_ifsioc(net, &ifr, cmd);
5118 rtnl_unlock();
5119 return ret;
5120
5121 case SIOCGIFMEM:
5122 /* Get the per device memory space. We can add this but
5123 * currently do not support it */
5124 case SIOCSIFMEM:
5125 /* Set the per device memory buffer space.
5126 * Not applicable in our case */
5127 case SIOCSIFLINK:
5128 return -ENOTTY;
5129
5130 /*
5131 * Unknown or private ioctl.
5132 */
5133 default:
5134 if (cmd == SIOCWANDEV ||
5135 (cmd >= SIOCDEVPRIVATE &&
5136 cmd <= SIOCDEVPRIVATE + 15)) {
5137 dev_load(net, ifr.ifr_name);
5138 rtnl_lock();
5139 ret = dev_ifsioc(net, &ifr, cmd);
5140 rtnl_unlock();
5141 if (!ret && copy_to_user(arg, &ifr,
5142 sizeof(struct ifreq)))
5143 ret = -EFAULT;
5144 return ret;
5145 }
5146 /* Take care of Wireless Extensions */
5147 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5148 return wext_handle_ioctl(net, &ifr, cmd, arg);
5149 return -ENOTTY;
5150 }
5151}
5152
5153
5154/**
5155 * dev_new_index - allocate an ifindex
5156 * @net: the applicable net namespace
5157 *
5158 * Returns a suitable unique value for a new device interface
5159 * number. The caller must hold the rtnl semaphore or the
5160 * dev_base_lock to be sure it remains unique.
5161 */
5162static int dev_new_index(struct net *net)
5163{
5164 static int ifindex;
5165 for (;;) {
5166 if (++ifindex <= 0)
5167 ifindex = 1;
5168 if (!__dev_get_by_index(net, ifindex))
5169 return ifindex;
5170 }
5171}
5172
5173/* Delayed registration/unregisteration */
5174static LIST_HEAD(net_todo_list);
5175
5176static void net_set_todo(struct net_device *dev)
5177{
5178 list_add_tail(&dev->todo_list, &net_todo_list);
5179}
5180
5181static void rollback_registered_many(struct list_head *head)
5182{
5183 struct net_device *dev, *tmp;
5184
5185 BUG_ON(dev_boot_phase);
5186 ASSERT_RTNL();
5187
5188 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5189 /* Some devices call without registering
5190 * for initialization unwind. Remove those
5191 * devices and proceed with the remaining.
5192 */
5193 if (dev->reg_state == NETREG_UNINITIALIZED) {
5194 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5195 dev->name, dev);
5196
5197 WARN_ON(1);
5198 list_del(&dev->unreg_list);
5199 continue;
5200 }
5201 dev->dismantle = true;
5202 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5203 }
5204
5205 /* If device is running, close it first. */
5206 dev_close_many(head);
5207
5208 list_for_each_entry(dev, head, unreg_list) {
5209 /* And unlink it from device chain. */
5210 unlist_netdevice(dev);
5211
5212 dev->reg_state = NETREG_UNREGISTERING;
5213 }
5214
5215 synchronize_net();
5216
5217 list_for_each_entry(dev, head, unreg_list) {
5218 /* Shutdown queueing discipline. */
5219 dev_shutdown(dev);
5220
5221
5222		/* Notify protocols that we are about to destroy
5223		   this device. They should clean up all their state.
5224 */
5225 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5226
5227 if (!dev->rtnl_link_ops ||
5228 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5229 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5230
5231 /*
5232 * Flush the unicast and multicast chains
5233 */
5234 dev_uc_flush(dev);
5235 dev_mc_flush(dev);
5236
5237 if (dev->netdev_ops->ndo_uninit)
5238 dev->netdev_ops->ndo_uninit(dev);
5239
5240 /* Notifier chain MUST detach us from master device. */
5241 WARN_ON(dev->master);
5242
5243 /* Remove entries from kobject tree */
5244 netdev_unregister_kobject(dev);
5245 }
5246
5247 /* Process any work delayed until the end of the batch */
5248 dev = list_first_entry(head, struct net_device, unreg_list);
5249 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5250
5251 synchronize_net();
5252
5253 list_for_each_entry(dev, head, unreg_list)
5254 dev_put(dev);
5255}
5256
5257static void rollback_registered(struct net_device *dev)
5258{
5259 LIST_HEAD(single);
5260
5261 list_add(&dev->unreg_list, &single);
5262 rollback_registered_many(&single);
5263 list_del(&single);
5264}
5265
5266static netdev_features_t netdev_fix_features(struct net_device *dev,
5267 netdev_features_t features)
5268{
5269 /* Fix illegal checksum combinations */
5270 if ((features & NETIF_F_HW_CSUM) &&
5271 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5272 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5273 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5274 }
5275
5276 /* Fix illegal SG+CSUM combinations. */
5277 if ((features & NETIF_F_SG) &&
5278 !(features & NETIF_F_ALL_CSUM)) {
5279 netdev_dbg(dev,
5280 "Dropping NETIF_F_SG since no checksum feature.\n");
5281 features &= ~NETIF_F_SG;
5282 }
5283
5284 /* TSO requires that SG is present as well. */
5285 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5286 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5287 features &= ~NETIF_F_ALL_TSO;
5288 }
5289
5290 /* TSO ECN requires that TSO is present as well. */
5291 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5292 features &= ~NETIF_F_TSO_ECN;
5293
5294 /* Software GSO depends on SG. */
5295 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5296 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5297 features &= ~NETIF_F_GSO;
5298 }
5299
5300 /* UFO needs SG and checksumming */
5301 if (features & NETIF_F_UFO) {
5302 /* maybe split UFO into V4 and V6? */
5303 if (!((features & NETIF_F_GEN_CSUM) ||
5304 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5305 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5306 netdev_dbg(dev,
5307 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5308 features &= ~NETIF_F_UFO;
5309 }
5310
5311 if (!(features & NETIF_F_SG)) {
5312 netdev_dbg(dev,
5313 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5314 features &= ~NETIF_F_UFO;
5315 }
5316 }
5317
5318 return features;
5319}
5320
5321int __netdev_update_features(struct net_device *dev)
5322{
5323 netdev_features_t features;
5324 int err = 0;
5325
5326 ASSERT_RTNL();
5327
5328 features = netdev_get_wanted_features(dev);
5329
5330 if (dev->netdev_ops->ndo_fix_features)
5331 features = dev->netdev_ops->ndo_fix_features(dev, features);
5332
5333 /* driver might be less strict about feature dependencies */
5334 features = netdev_fix_features(dev, features);
5335
5336 if (dev->features == features)
5337 return 0;
5338
5339 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5340 &dev->features, &features);
5341
5342 if (dev->netdev_ops->ndo_set_features)
5343 err = dev->netdev_ops->ndo_set_features(dev, features);
5344
5345 if (unlikely(err < 0)) {
5346 netdev_err(dev,
5347 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5348 err, &features, &dev->features);
5349 return -1;
5350 }
5351
5352 if (!err)
5353 dev->features = features;
5354
5355 return 1;
5356}
5357
5358/**
5359 * netdev_update_features - recalculate device features
5360 * @dev: the device to check
5361 *
5362 * Recalculate the dev->features set and send notifications if it
5363 * has changed. Should be called after driver or hardware dependent
5364 * conditions that influence the features might have changed.
5365 */
5366void netdev_update_features(struct net_device *dev)
5367{
5368 if (__netdev_update_features(dev))
5369 netdev_features_change(dev);
5370}
5371EXPORT_SYMBOL(netdev_update_features);
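/*
 * Illustrative usage sketch (editorial addition): a driver that finds at
 * runtime that an offload is no longer usable clears it from hw_features
 * and asks the core to recompute and renegotiate, under RTNL. Clearing
 * NETIF_F_TSO is just an example trigger.
 *
 *	rtnl_lock();
 *	dev->hw_features &= ~NETIF_F_TSO;
 *	netdev_update_features(dev);
 *	rtnl_unlock();
 */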
5372
5373/**
5374 * netdev_change_features - recalculate device features
5375 * @dev: the device to check
5376 *
5377 * Recalculate the dev->features set and send notifications even
5378 * if they have not changed. Should be called instead of
5379 * netdev_update_features() if dev->vlan_features might also
5380 * have changed, so that the changes are propagated to stacked
5381 * VLAN devices.
5382 */
5383void netdev_change_features(struct net_device *dev)
5384{
5385 __netdev_update_features(dev);
5386 netdev_features_change(dev);
5387}
5388EXPORT_SYMBOL(netdev_change_features);
5389
5390/**
5391 * netif_stacked_transfer_operstate - transfer operstate
5392 * @rootdev: the root or lower level device to transfer state from
5393 * @dev: the device to transfer operstate to
5394 *
5395 * Transfer operational state from root to device. This is normally
5396 * called when a stacking relationship exists between the root
5397 * device and the device (a leaf device).
5398 */
5399void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5400 struct net_device *dev)
5401{
5402 if (rootdev->operstate == IF_OPER_DORMANT)
5403 netif_dormant_on(dev);
5404 else
5405 netif_dormant_off(dev);
5406
5407 if (netif_carrier_ok(rootdev)) {
5408 if (!netif_carrier_ok(dev))
5409 netif_carrier_on(dev);
5410 } else {
5411 if (netif_carrier_ok(dev))
5412 netif_carrier_off(dev);
5413 }
5414}
5415EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5416
5417#ifdef CONFIG_RPS
5418static int netif_alloc_rx_queues(struct net_device *dev)
5419{
5420 unsigned int i, count = dev->num_rx_queues;
5421 struct netdev_rx_queue *rx;
5422
5423 BUG_ON(count < 1);
5424
5425 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5426 if (!rx) {
5427 pr_err("netdev: Unable to allocate %u rx queues\n", count);
5428 return -ENOMEM;
5429 }
5430 dev->_rx = rx;
5431
5432 for (i = 0; i < count; i++)
5433 rx[i].dev = dev;
5434 return 0;
5435}
5436#endif
5437
5438static void netdev_init_one_queue(struct net_device *dev,
5439 struct netdev_queue *queue, void *_unused)
5440{
5441 /* Initialize queue lock */
5442 spin_lock_init(&queue->_xmit_lock);
5443 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5444 queue->xmit_lock_owner = -1;
5445 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5446 queue->dev = dev;
5447#ifdef CONFIG_BQL
5448 dql_init(&queue->dql, HZ);
5449#endif
5450}
5451
5452static int netif_alloc_netdev_queues(struct net_device *dev)
5453{
5454 unsigned int count = dev->num_tx_queues;
5455 struct netdev_queue *tx;
5456
5457 BUG_ON(count < 1);
5458
5459 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5460 if (!tx) {
5461 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5462 return -ENOMEM;
5463 }
5464 dev->_tx = tx;
5465
5466 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5467 spin_lock_init(&dev->tx_global_lock);
5468
5469 return 0;
5470}
5471
5472/**
5473 * register_netdevice - register a network device
5474 * @dev: device to register
5475 *
5476 * Take a completed network device structure and add it to the kernel
5477 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5478 * chain. 0 is returned on success. A negative errno code is returned
5479 * on a failure to set up the device, or if the name is a duplicate.
5480 *
5481 * Callers must hold the rtnl semaphore. You may want
5482 * register_netdev() instead of this.
5483 *
5484 * BUGS:
5485 * The locking appears insufficient to guarantee two parallel registers
5486 * will not get the same name.
5487 */
5488
5489int register_netdevice(struct net_device *dev)
5490{
5491 int ret;
5492 struct net *net = dev_net(dev);
5493
5494 BUG_ON(dev_boot_phase);
5495 ASSERT_RTNL();
5496
5497 might_sleep();
5498
5499 /* When net_device's are persistent, this will be fatal. */
5500 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5501 BUG_ON(!net);
5502
5503 spin_lock_init(&dev->addr_list_lock);
5504 netdev_set_addr_lockdep_class(dev);
5505
5506 dev->iflink = -1;
5507
5508 ret = dev_get_valid_name(dev, dev->name);
5509 if (ret < 0)
5510 goto out;
5511
5512 /* Init, if this function is available */
5513 if (dev->netdev_ops->ndo_init) {
5514 ret = dev->netdev_ops->ndo_init(dev);
5515 if (ret) {
5516 if (ret > 0)
5517 ret = -EIO;
5518 goto out;
5519 }
5520 }
5521
5522 dev->ifindex = dev_new_index(net);
5523 if (dev->iflink == -1)
5524 dev->iflink = dev->ifindex;
5525
5526 /* Transfer changeable features to wanted_features and enable
5527 * software offloads (GSO and GRO).
5528 */
5529 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5530 dev->features |= NETIF_F_SOFT_FEATURES;
5531 dev->wanted_features = dev->features & dev->hw_features;
5532
5533 /* Turn on no cache copy if HW is doing checksum */
5534 if (!(dev->flags & IFF_LOOPBACK)) {
5535 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5536 if (dev->features & NETIF_F_ALL_CSUM) {
5537 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5538 dev->features |= NETIF_F_NOCACHE_COPY;
5539 }
5540 }
5541
5542 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5543 */
5544 dev->vlan_features |= NETIF_F_HIGHDMA;
5545
5546 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5547 ret = notifier_to_errno(ret);
5548 if (ret)
5549 goto err_uninit;
5550
5551 ret = netdev_register_kobject(dev);
5552 if (ret)
5553 goto err_uninit;
5554 dev->reg_state = NETREG_REGISTERED;
5555
5556 __netdev_update_features(dev);
5557
5558 /*
5559 * Default initial state at registry is that the
5560 * device is present.
5561 */
5562
5563 set_bit(__LINK_STATE_PRESENT, &dev->state);
5564
5565 dev_init_scheduler(dev);
5566 dev_hold(dev);
5567 list_netdevice(dev);
5568 add_device_randomness(dev->dev_addr, dev->addr_len);
5569
5570	/* Notify protocols that a new device appeared. */
5571 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5572 ret = notifier_to_errno(ret);
5573 if (ret) {
5574 rollback_registered(dev);
5575 dev->reg_state = NETREG_UNREGISTERED;
5576 }
5577 /*
5578 * Prevent userspace races by waiting until the network
5579 * device is fully setup before sending notifications.
5580 */
5581 if (!dev->rtnl_link_ops ||
5582 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5583 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5584
5585out:
5586 return ret;
5587
5588err_uninit:
5589 if (dev->netdev_ops->ndo_uninit)
5590 dev->netdev_ops->ndo_uninit(dev);
5591 goto out;
5592}
5593EXPORT_SYMBOL(register_netdevice);
5594
5595/**
5596 * init_dummy_netdev - init a dummy network device for NAPI
5597 * @dev: device to init
5598 *
5599 * This takes a network device structure and initializes the minimum
5600 * number of fields so it can be used to schedule NAPI polls without
5601 * registering a full-blown interface. This is to be used by drivers
5602 * that need to tie several hardware interfaces to a single NAPI
5603 * poll scheduler due to HW limitations.
5604 */
5605int init_dummy_netdev(struct net_device *dev)
5606{
5607 /* Clear everything. Note we don't initialize spinlocks
5608	 * as they aren't supposed to be taken by any of the
5609 * NAPI code and this dummy netdev is supposed to be
5610 * only ever used for NAPI polls
5611 */
5612 memset(dev, 0, sizeof(struct net_device));
5613
5614 /* make sure we BUG if trying to hit standard
5615 * register/unregister code path
5616 */
5617 dev->reg_state = NETREG_DUMMY;
5618
5619 /* NAPI wants this */
5620 INIT_LIST_HEAD(&dev->napi_list);
5621
5622 /* a dummy interface is started by default */
5623 set_bit(__LINK_STATE_PRESENT, &dev->state);
5624 set_bit(__LINK_STATE_START, &dev->state);
5625
5626	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5627	 * because users of this 'device' don't need to change
5628 * its refcount.
5629 */
5630
5631 return 0;
5632}
5633EXPORT_SYMBOL_GPL(init_dummy_netdev);
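/*
 * Illustrative usage sketch (editorial addition): a driver with several
 * hardware channels but no real net_device of its own can still use NAPI by
 * embedding a dummy device; dummy_dev, napi and my_poll are hypothetical.
 *
 *	init_dummy_netdev(&priv->dummy_dev);
 *	netif_napi_add(&priv->dummy_dev, &priv->napi, my_poll, 64);
 */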
5634
5635
5636/**
5637 * register_netdev - register a network device
5638 * @dev: device to register
5639 *
5640 * Take a completed network device structure and add it to the kernel
5641 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5642 * chain. 0 is returned on success. A negative errno code is returned
5643 * on a failure to set up the device, or if the name is a duplicate.
5644 *
5645 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5646 * and expands the device name if you passed a format string to
5647 * alloc_netdev.
5648 */
5649int register_netdev(struct net_device *dev)
5650{
5651 int err;
5652
5653 rtnl_lock();
5654 err = register_netdevice(dev);
5655 rtnl_unlock();
5656 return err;
5657}
5658EXPORT_SYMBOL(register_netdev);
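/*
 * Illustrative usage sketch (editorial addition): the usual probe-time
 * sequence around register_netdev(); my_priv and my_setup are hypothetical.
 *
 *	dev = alloc_netdev(sizeof(struct my_priv), "myeth%d", my_setup);
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */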
5659
5660int netdev_refcnt_read(const struct net_device *dev)
5661{
5662 int i, refcnt = 0;
5663
5664 for_each_possible_cpu(i)
5665 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5666 return refcnt;
5667}
5668EXPORT_SYMBOL(netdev_refcnt_read);
5669
5670/*
5671 * netdev_wait_allrefs - wait until all references are gone.
5672 *
5673 * This is called when unregistering network devices.
5674 *
5675 * Any protocol or device that holds a reference should register
5676 * for netdevice notification, and clean up and put back the
5677 * reference if they receive an UNREGISTER event.
5678 * We can get stuck here if buggy protocols don't correctly
5679 * call dev_put.
5680 */
5681static void netdev_wait_allrefs(struct net_device *dev)
5682{
5683 unsigned long rebroadcast_time, warning_time;
5684 int refcnt;
5685
5686 linkwatch_forget_dev(dev);
5687
5688 rebroadcast_time = warning_time = jiffies;
5689 refcnt = netdev_refcnt_read(dev);
5690
5691 while (refcnt != 0) {
5692 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5693 rtnl_lock();
5694
5695 /* Rebroadcast unregister notification */
5696 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5697 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5698			 * should have already handled it the first time */
5699
5700 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5701 &dev->state)) {
5702 /* We must not have linkwatch events
5703 * pending on unregister. If this
5704 * happens, we simply run the queue
5705 * unscheduled, resulting in a noop
5706 * for this device.
5707 */
5708 linkwatch_run_queue();
5709 }
5710
5711 __rtnl_unlock();
5712
5713 rebroadcast_time = jiffies;
5714 }
5715
5716 msleep(250);
5717
5718 refcnt = netdev_refcnt_read(dev);
5719
5720 if (time_after(jiffies, warning_time + 10 * HZ)) {
5721 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5722 dev->name, refcnt);
5723 warning_time = jiffies;
5724 }
5725 }
5726}
5727
5728/* The sequence is:
5729 *
5730 * rtnl_lock();
5731 * ...
5732 * register_netdevice(x1);
5733 * register_netdevice(x2);
5734 * ...
5735 * unregister_netdevice(y1);
5736 * unregister_netdevice(y2);
5737 * ...
5738 * rtnl_unlock();
5739 * free_netdev(y1);
5740 * free_netdev(y2);
5741 *
5742 * We are invoked by rtnl_unlock().
5743 * This allows us to deal with problems:
5744 * 1) We can delete sysfs objects which invoke hotplug
5745 * without deadlocking with linkwatch via keventd.
5746 * 2) Since we run with the RTNL semaphore not held, we can sleep
5747 * safely in order to wait for the netdev refcnt to drop to zero.
5748 *
5749 * We must not return until all unregister events added during
5750 * the interval the lock was held have been completed.
5751 */
5752void netdev_run_todo(void)
5753{
5754 struct list_head list;
5755
5756 /* Snapshot list, allow later requests */
5757 list_replace_init(&net_todo_list, &list);
5758
5759 __rtnl_unlock();
5760
5761 /* Wait for rcu callbacks to finish before attempting to drain
5762 * the device list. This usually avoids a 250ms wait.
5763 */
5764 if (!list_empty(&list))
5765 rcu_barrier();
5766
5767 while (!list_empty(&list)) {
5768 struct net_device *dev
5769 = list_first_entry(&list, struct net_device, todo_list);
5770 list_del(&dev->todo_list);
5771
5772 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5773 pr_err("network todo '%s' but state %d\n",
5774 dev->name, dev->reg_state);
5775 dump_stack();
5776 continue;
5777 }
5778
5779 dev->reg_state = NETREG_UNREGISTERED;
5780
5781 on_each_cpu(flush_backlog, dev, 1);
5782
5783 netdev_wait_allrefs(dev);
5784
5785 /* paranoia */
5786 BUG_ON(netdev_refcnt_read(dev));
5787 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5788 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5789 WARN_ON(dev->dn_ptr);
5790
5791 if (dev->destructor)
5792 dev->destructor(dev);
5793
5794 /* Free network device */
5795 kobject_put(&dev->dev.kobj);
5796 }
5797}
5798
5799/* Convert net_device_stats to rtnl_link_stats64. They have the same
5800 * fields in the same order, with only the type differing.
5801 */
5802void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5803 const struct net_device_stats *netdev_stats)
5804{
5805#if BITS_PER_LONG == 64
5806 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5807 memcpy(stats64, netdev_stats, sizeof(*stats64));
5808#else
5809 size_t i, n = sizeof(*stats64) / sizeof(u64);
5810 const unsigned long *src = (const unsigned long *)netdev_stats;
5811 u64 *dst = (u64 *)stats64;
5812
5813 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5814 sizeof(*stats64) / sizeof(u64));
5815 for (i = 0; i < n; i++)
5816 dst[i] = src[i];
5817#endif
5818}
5819EXPORT_SYMBOL(netdev_stats_to_stats64);
5820
5821/**
5822 * dev_get_stats - get network device statistics
5823 * @dev: device to get statistics from
5824 * @storage: place to store stats
5825 *
5826 * Get network statistics from device. Return @storage.
5827 * The device driver may provide its own method by setting
5828 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5829 * otherwise the internal statistics structure is used.
5830 */
5831struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5832 struct rtnl_link_stats64 *storage)
5833{
5834 const struct net_device_ops *ops = dev->netdev_ops;
5835
5836 if (ops->ndo_get_stats64) {
5837 memset(storage, 0, sizeof(*storage));
5838 ops->ndo_get_stats64(dev, storage);
5839 } else if (ops->ndo_get_stats) {
5840 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5841 } else {
5842 netdev_stats_to_stats64(storage, &dev->stats);
5843 }
5844 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5845 return storage;
5846}
5847EXPORT_SYMBOL(dev_get_stats);
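/*
 * Illustrative usage sketch (editorial addition): callers pass scratch
 * storage and use the returned pointer, which may or may not point at that
 * storage (see dev_seq_printf_stats() above for an in-tree example):
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *	pr_debug("%s: %llu packets received\n", dev->name, stats->rx_packets);
 */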
5848
5849struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5850{
5851 struct netdev_queue *queue = dev_ingress_queue(dev);
5852
5853#ifdef CONFIG_NET_CLS_ACT
5854 if (queue)
5855 return queue;
5856 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5857 if (!queue)
5858 return NULL;
5859 netdev_init_one_queue(dev, queue, NULL);
5860 queue->qdisc = &noop_qdisc;
5861 queue->qdisc_sleeping = &noop_qdisc;
5862 rcu_assign_pointer(dev->ingress_queue, queue);
5863#endif
5864 return queue;
5865}
5866
5867/**
5868 * alloc_netdev_mqs - allocate network device
5869 * @sizeof_priv: size of private data to allocate space for
5870 * @name: device name format string
5871 * @setup: callback to initialize device
5872 * @txqs: the number of TX subqueues to allocate
5873 * @rxqs: the number of RX subqueues to allocate
5874 *
5875 * Allocates a struct net_device with private data area for driver use
5876 * and performs basic initialization. Also allocates subquue structs
5877 * and performs basic initialization. Also allocates subqueue structs
5878 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                void (*setup)(struct net_device *),
                unsigned int txqs, unsigned int rxqs)
{
        struct net_device *dev;
        size_t alloc_size;
        struct net_device *p;

        BUG_ON(strlen(name) >= sizeof(dev->name));

        if (txqs < 1) {
                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
                return NULL;
        }

#ifdef CONFIG_RPS
        if (rxqs < 1) {
                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
                return NULL;
        }
#endif

        alloc_size = sizeof(struct net_device);
        if (sizeof_priv) {
                /* ensure 32-byte alignment of private area */
                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
                alloc_size += sizeof_priv;
        }
        /* ensure 32-byte alignment of whole construct */
        alloc_size += NETDEV_ALIGN - 1;

        p = kzalloc(alloc_size, GFP_KERNEL);
        if (!p) {
                pr_err("alloc_netdev: Unable to allocate device\n");
                return NULL;
        }

        dev = PTR_ALIGN(p, NETDEV_ALIGN);
        dev->padded = (char *)dev - (char *)p;

        dev->pcpu_refcnt = alloc_percpu(int);
        if (!dev->pcpu_refcnt)
                goto free_p;

        if (dev_addr_init(dev))
                goto free_pcpu;

        dev_mc_init(dev);
        dev_uc_init(dev);

        dev_net_set(dev, &init_net);

        dev->gso_max_size = GSO_MAX_SIZE;
        dev->gso_max_segs = GSO_MAX_SEGS;

        INIT_LIST_HEAD(&dev->napi_list);
        INIT_LIST_HEAD(&dev->unreg_list);
        INIT_LIST_HEAD(&dev->link_watch_list);
        dev->priv_flags = IFF_XMIT_DST_RELEASE;
        setup(dev);

        dev->num_tx_queues = txqs;
        dev->real_num_tx_queues = txqs;
        if (netif_alloc_netdev_queues(dev))
                goto free_all;

#ifdef CONFIG_RPS
        dev->num_rx_queues = rxqs;
        dev->real_num_rx_queues = rxqs;
        if (netif_alloc_rx_queues(dev))
                goto free_all;
#endif

        strcpy(dev->name, name);
        dev->group = INIT_NETDEV_GROUP;
        return dev;

free_all:
        free_netdev(dev);
        return NULL;

free_pcpu:
        free_percpu(dev->pcpu_refcnt);
        kfree(dev->_tx);
#ifdef CONFIG_RPS
        kfree(dev->_rx);
#endif

free_p:
        kfree(p);
        return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);

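/*
 * Example (illustrative sketch): a multiqueue driver typically reserves
 * room for its private state and lets a setup callback do the
 * type-specific initialization. "struct foo_priv" and "foo_setup" are
 * hypothetical names; the "%d" in the name is resolved later, e.g. by
 * dev_get_valid_name() during registration.
 *
 *      struct net_device *dev;
 *      struct foo_priv *priv;
 *
 *      dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *                             foo_setup, 8, 8);
 *      if (!dev)
 *              return -ENOMEM;
 *      priv = netdev_priv(dev);
 */
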
/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released.
 * If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
        struct napi_struct *p, *n;

        release_net(dev_net(dev));

        kfree(dev->_tx);
#ifdef CONFIG_RPS
        kfree(dev->_rx);
#endif

        kfree(rcu_dereference_protected(dev->ingress_queue, 1));

        /* Flush device addresses */
        dev_addr_flush(dev);

        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
                netif_napi_del(p);

        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;

        /* Compatibility with error handling in drivers */
        if (dev->reg_state == NETREG_UNINITIALIZED) {
                kfree((char *)dev - dev->padded);
                return;
        }

        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
        dev->reg_state = NETREG_RELEASED;

        /* will free via device release */
        put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

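/*
 * Example (illustrative): free_netdev() is the counterpart of
 * alloc_netdev_mqs() on error paths taken before registration succeeded,
 * as in this probe-style sketch (run under rtnl_lock()):
 *
 *      err = register_netdevice(dev);
 *      if (err) {
 *              free_netdev(dev);
 *              return err;
 *      }
 */
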
/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 */
void synchronize_net(void)
{
        might_sleep();
        if (rtnl_is_locked())
                synchronize_rcu_expedited();
        else
                synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);

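/*
 * Example (illustrative): synchronize_net() is commonly used as a grace
 * period after unpublishing an RCU-protected pointer that receive paths
 * may still be walking. "priv->cfg", "new_cfg" and "old_cfg" are
 * hypothetical driver-private objects.
 *
 *      old_cfg = rtnl_dereference(priv->cfg);
 *      rcu_assign_pointer(priv->cfg, new_cfg);
 *      synchronize_net();
 *      kfree(old_cfg);
 */
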
/**
 * unregister_netdevice_queue - remove device from the kernel
 * @dev: device
 * @head: list
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 * If @head is not NULL, the device is queued to be unregistered later.
 *
 * Callers must hold the rtnl semaphore. You may want
 * unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
        ASSERT_RTNL();

        if (head) {
                list_move_tail(&dev->unreg_list, head);
        } else {
                rollback_registered(dev);
                /* Finish processing unregister after unlock */
                net_set_todo(dev);
        }
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 * unregister_netdevice_many - unregister many devices
 * @head: list of devices
 */
void unregister_netdevice_many(struct list_head *head)
{
        struct net_device *dev;

        if (!list_empty(head)) {
                rollback_registered_many(head);
                list_for_each_entry(dev, head, unreg_list)
                        net_set_todo(dev);
        }
}
EXPORT_SYMBOL(unregister_netdevice_many);

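/*
 * Example (illustrative): batching several unregistrations under one
 * rtnl_lock() amortizes the expensive synchronization done per batch.
 * The "victims" list and its member name are hypothetical.
 *
 *      LIST_HEAD(unreg_list);
 *
 *      rtnl_lock();
 *      list_for_each_entry(dev, &victims, victim_list)
 *              unregister_netdevice_queue(dev, &unreg_list);
 *      unregister_netdevice_many(&unreg_list);
 *      rtnl_unlock();
 */
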
/**
 * unregister_netdev - remove device from the kernel
 * @dev: device
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 *
 * This is just a wrapper for unregister_netdevice that takes
 * the rtnl semaphore. In general you want to use this and not
 * unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);

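/*
 * Example (illustrative): the usual module-exit pairing for a device that
 * was registered with register_netdev(). unregister_netdev() takes and
 * releases the rtnl lock itself, so the caller must not hold it.
 *
 *      unregister_netdev(dev);
 *      free_netdev(dev);
 */
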
/**
 * dev_change_net_namespace - move device to a different network namespace
 * @dev: device
 * @net: network namespace
 * @pat: If not NULL name pattern to try if the current device name
 *       is already taken in the destination network namespace.
 *
 * This function shuts down a device interface and moves it
 * to a new network namespace. On success 0 is returned, on
 * a failure a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
        int err;

        ASSERT_RTNL();

        /* Don't allow namespace local devices to be moved. */
        err = -EINVAL;
        if (dev->features & NETIF_F_NETNS_LOCAL)
                goto out;

        /* Ensure the device has been registered */
        err = -EINVAL;
        if (dev->reg_state != NETREG_REGISTERED)
                goto out;

        /* Get out if there is nothing to do */
        err = 0;
        if (net_eq(dev_net(dev), net))
                goto out;

        /* Pick the destination device name, and ensure
         * we can use it in the destination network namespace.
         */
        err = -EEXIST;
        if (__dev_get_by_name(net, dev->name)) {
                /* We get here if we can't use the current device name */
                if (!pat)
                        goto out;
                if (dev_get_valid_name(dev, pat) < 0)
                        goto out;
        }

        /*
         * And now a mini version of register_netdevice and unregister_netdevice.
         */

        /* If the device is running, close it first. */
        dev_close(dev);

        /* And unlink it from the device chain */
        err = -ENODEV;
        unlist_netdevice(dev);

        synchronize_net();

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);

        /* Notify protocols that we are about to destroy this device.
         * They should clean all of their state.
         *
         * Note that dev->reg_state stays at NETREG_REGISTERED.
         * This is wanted because this way 8021q and macvlan know
         * the device is just moving and can keep their slaves up.
         */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);

        /*
         * Flush the unicast and multicast chains
         */
        dev_uc_flush(dev);
        dev_mc_flush(dev);

        /* Actually switch the network namespace */
        dev_net_set(dev, net);

        /* If there is an ifindex conflict assign a new one */
        if (__dev_get_by_index(net, dev->ifindex)) {
                int iflink = (dev->iflink == dev->ifindex);
                dev->ifindex = dev_new_index(net);
                if (iflink)
                        dev->iflink = dev->ifindex;
        }

        /* Fixup kobjects */
        err = device_rename(&dev->dev, dev->name);
        WARN_ON(err);

        /* Add the device back in the hashes */
        list_netdevice(dev);

        /* Notify protocols that a new device appeared. */
        call_netdevice_notifiers(NETDEV_REGISTER, dev);

        /*
         * Prevent userspace races by waiting until the network
         * device is fully setup before sending notifications.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

        synchronize_net();
        err = 0;
out:
        return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);

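/*
 * Example (illustrative): moving a device into another namespace is done
 * under the rtnl lock; "target_net" is a struct net pointer the caller
 * obtained and holds by some other means.
 *
 *      rtnl_lock();
 *      err = dev_change_net_namespace(dev, target_net, "eth%d");
 *      rtnl_unlock();
 */
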
static int dev_cpu_callback(struct notifier_block *nfb,
                            unsigned long action,
                            void *ocpu)
{
        struct sk_buff **list_skb;
        struct sk_buff *skb;
        unsigned int cpu, oldcpu = (unsigned long)ocpu;
        struct softnet_data *sd, *oldsd;

        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;

        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Append output queue from offline CPU. */
        if (oldsd->output_queue) {
                *sd->output_queue_tailp = oldsd->output_queue;
                sd->output_queue_tailp = oldsd->output_queue_tailp;
                oldsd->output_queue = NULL;
                oldsd->output_queue_tailp = &oldsd->output_queue;
        }
        /* Append NAPI poll list from offline CPU. */
        if (!list_empty(&oldsd->poll_list)) {
                list_splice_init(&oldsd->poll_list, &sd->poll_list);
                raise_softirq_irqoff(NET_RX_SOFTIRQ);
        }

        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        /* Process offline CPU's input_pkt_queue */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                netif_rx(skb);
                input_queue_head_incr(oldsd);
        }
        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
                input_queue_head_incr(oldsd);
        }

        return NOTIFY_OK;
}

/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes a new feature set after adding a device with feature set
 * @one to the master device with current feature set @all. Will not
 * enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask)
{
        if (mask & NETIF_F_GEN_CSUM)
                mask |= NETIF_F_ALL_CSUM;
        mask |= NETIF_F_VLAN_CHALLENGED;

        all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
        all &= one | ~NETIF_F_ALL_FOR_ALL;

        /* If one device supports hw checksumming, set for all. */
        if (all & NETIF_F_GEN_CSUM)
                all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

        return all;
}
EXPORT_SYMBOL(netdev_increment_features);

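/*
 * Example (illustrative): a master driver (bonding/bridge style) can fold
 * its slaves' feature sets together by starting from the candidate mask
 * and narrowing it slave by slave. The slave list iteration and names are
 * hypothetical.
 *
 *      netdev_features_t features = FOO_MASTER_FEATURES;
 *
 *      list_for_each_entry(slave, &priv->slave_list, list)
 *              features = netdev_increment_features(features,
 *                                                   slave->dev->features,
 *                                                   FOO_MASTER_FEATURES);
 */
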
static struct hlist_head *netdev_create_hash(void)
{
        int i;
        struct hlist_head *hash;

        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
        if (hash != NULL)
                for (i = 0; i < NETDEV_HASHENTRIES; i++)
                        INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
        if (net != &init_net)
                INIT_LIST_HEAD(&net->dev_base_head);

        net->dev_name_head = netdev_create_hash();
        if (net->dev_name_head == NULL)
                goto err_name;

        net->dev_index_head = netdev_create_hash();
        if (net->dev_index_head == NULL)
                goto err_idx;

        return 0;

err_idx:
        kfree(net->dev_name_head);
err_name:
        return -ENOMEM;
}

/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 *
 * Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
        const struct device_driver *driver;
        const struct device *parent;
        const char *empty = "";

        parent = dev->dev.parent;
        if (!parent)
                return empty;

        driver = parent->driver;
        if (driver && driver->name)
                return driver->name;
        return empty;
}

int __netdev_printk(const char *level, const struct net_device *dev,
                    struct va_format *vaf)
{
        int r;

        if (dev && dev->dev.parent)
                r = dev_printk(level, dev->dev.parent, "%s: %pV",
                               netdev_name(dev), vaf);
        else if (dev)
                r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
        else
                r = printk("%s(NULL net_device): %pV", level, vaf);

        return r;
}
EXPORT_SYMBOL(__netdev_printk);

int netdev_printk(const char *level, const struct net_device *dev,
                  const char *format, ...)
{
        struct va_format vaf;
        va_list args;
        int r;

        va_start(args, format);

        vaf.fmt = format;
        vaf.va = &args;

        r = __netdev_printk(level, dev, &vaf);
        va_end(args);

        return r;
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)                 \
int func(const struct net_device *dev, const char *fmt, ...)    \
{                                                               \
        int r;                                                  \
        struct va_format vaf;                                   \
        va_list args;                                           \
                                                                \
        va_start(args, fmt);                                    \
                                                                \
        vaf.fmt = fmt;                                          \
        vaf.va = &args;                                         \
                                                                \
        r = __netdev_printk(level, dev, &vaf);                  \
        va_end(args);                                           \
                                                                \
        return r;                                               \
}                                                               \
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

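/*
 * Example (illustrative): drivers use these helpers instead of raw
 * printk() so that messages are consistently prefixed with the driver and
 * device name.
 *
 *      netdev_info(dev, "link up, %u Mb/s\n", speed);
 *      netdev_err(dev, "DMA mapping failed\n");
 */
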
static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
        struct net_device *dev, *aux;
        /*
         * Push all migratable network devices back to the
         * initial network namespace
         */
        rtnl_lock();
        for_each_netdev_safe(net, dev, aux) {
                int err;
                char fb_name[IFNAMSIZ];

                /* Ignore unmovable devices (i.e. the loopback device) */
                if (dev->features & NETIF_F_NETNS_LOCAL)
                        continue;

                /* Leave virtual devices for the generic cleanup */
                if (dev->rtnl_link_ops)
                        continue;

                /* Push remaining network devices to init_net */
                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (err) {
                        pr_emerg("%s: failed to move %s to init_net: %d\n",
                                 __func__, dev->name, err);
                        BUG();
                }
        }
        rtnl_unlock();
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
        /* At exit all network devices must be removed from a network
         * namespace. Do this in the reverse order of registration.
         * Do this across as many network namespaces as possible to
         * improve batching efficiency.
         */
        struct net_device *dev;
        struct net *net;
        LIST_HEAD(dev_kill_list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (dev->rtnl_link_ops)
                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
                        else
                                unregister_netdevice_queue(dev, &dev_kill_list);
                }
        }
        unregister_netdevice_many(&dev_kill_list);
        list_del(&dev_kill_list);
        rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         * Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                memset(sd, 0, sizeof(*sd));
                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
                sd->completion_queue = NULL;
                INIT_LIST_HEAD(&sd->poll_list);
                sd->output_queue = NULL;
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                sd->csd.func = rps_trigger_softirq;
                sd->csd.info = sd;
                sd->csd.flags = 0;
                sd->cpu = i;
#endif

                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
                sd->backlog.gro_list = NULL;
                sd->backlog.gro_count = 0;
        }

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace, the loopback device must
         * be present too. Since we now dynamically allocate and free
         * the loopback device, ensure this invariant is maintained by
         * keeping the loopback device as the first device on the list
         * of network devices, so that it is the first device that
         * appears and the last network device that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        dev_mcast_init();
        rc = 0;
out:
        return rc;
}

subsys_initcall(net_dev_init);

static int __init initialize_hashrnd(void)
{
        get_random_bytes(&hashrnd, sizeof(hashrnd));
        return 0;
}

late_initcall_sync(initialize_hashrnd);
