1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * NET3 Protocol independent device support routines.
4 *
5 * Derived from the non IP parts of dev.c 1.0.19
6 * Authors: Ross Biro
7 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8 * Mark Evans, <evansmp@uhura.aston.ac.uk>
9 *
10 * Additional Authors:
11 * Florian la Roche <rzsfl@rz.uni-sb.de>
12 * Alan Cox <gw4pts@gw4pts.ampr.org>
13 * David Hinds <dahinds@users.sourceforge.net>
14 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15 * Adam Sulmicki <adam@cfar.umd.edu>
16 * Pekka Riikonen <priikone@poesidon.pspt.fi>
17 *
18 * Changes:
19 * D.J. Barrow : Fixed bug where dev->refcnt gets set
20 * to 2 if register_netdev gets called
21 * before net_dev_init & also removed a
22 * few lines of code in the process.
23 * Alan Cox : device private ioctl copies fields back.
24 * Alan Cox : Transmit queue code does relevant
25 * stunts to keep the queue safe.
26 * Alan Cox : Fixed double lock.
27 * Alan Cox : Fixed promisc NULL pointer trap
28 * ???????? : Support the full private ioctl range
29 * Alan Cox : Moved ioctl permission check into
30 * drivers
31 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
32 * Alan Cox : 100 backlog just doesn't cut it when
33 * you start doing multicast video 8)
34 * Alan Cox : Rewrote net_bh and list manager.
35 * Alan Cox : Fix ETH_P_ALL echoback lengths.
36 * Alan Cox : Took out transmit every packet pass
37 * Saved a few bytes in the ioctl handler
38 * Alan Cox : Network driver sets packet type before
39 * calling netif_rx. Saves a function
40 * call a packet.
41 * Alan Cox : Hashed net_bh()
42 * Richard Kooijman: Timestamp fixes.
43 * Alan Cox : Wrong field in SIOCGIFDSTADDR
44 * Alan Cox : Device lock protection.
45 * Alan Cox : Fixed nasty side effect of device close
46 * changes.
47 * Rudi Cilibrasi : Pass the right thing to
48 * set_mac_address()
49 * Dave Miller : 32bit quantity for the device lock to
50 * make it work out on a Sparc.
51 * Bjorn Ekwall : Added KERNELD hack.
52 * Alan Cox : Cleaned up the backlog initialise.
53 * Craig Metz : SIOCGIFCONF fix if space for under
54 * 1 device.
55 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
56 * is no device open function.
57 * Andi Kleen : Fix error reporting for SIOCGIFCONF
58 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
59 * Cyrus Durgin : Cleaned for KMOD
60 * Adam Sulmicki : Bug Fix : Network Device Unload
61 * A network device unload needs to purge
62 * the backlog queue.
63 * Paul Rusty Russell : SIOCSIFNAME
64 * Pekka Riikonen : Netdev boot-time settings code
65 * Andrew Morton : Make unregister_netdevice wait
66 * indefinitely on dev->refcnt
67 * J Hadi Salim : - Backlog queue sampling
68 * - netif_rx() feedback
69 */
70
71#include <linux/uaccess.h>
72#include <linux/bitmap.h>
73#include <linux/capability.h>
74#include <linux/cpu.h>
75#include <linux/types.h>
76#include <linux/kernel.h>
77#include <linux/hash.h>
78#include <linux/slab.h>
79#include <linux/sched.h>
80#include <linux/sched/mm.h>
81#include <linux/mutex.h>
82#include <linux/rwsem.h>
83#include <linux/string.h>
84#include <linux/mm.h>
85#include <linux/socket.h>
86#include <linux/sockios.h>
87#include <linux/errno.h>
88#include <linux/interrupt.h>
89#include <linux/if_ether.h>
90#include <linux/netdevice.h>
91#include <linux/etherdevice.h>
92#include <linux/ethtool.h>
93#include <linux/skbuff.h>
94#include <linux/kthread.h>
95#include <linux/bpf.h>
96#include <linux/bpf_trace.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <net/busy_poll.h>
100#include <linux/rtnetlink.h>
101#include <linux/stat.h>
102#include <net/dsa.h>
103#include <net/dst.h>
104#include <net/dst_metadata.h>
105#include <net/gro.h>
106#include <net/pkt_sched.h>
107#include <net/pkt_cls.h>
108#include <net/checksum.h>
109#include <net/xfrm.h>
110#include <net/tcx.h>
111#include <linux/highmem.h>
112#include <linux/init.h>
113#include <linux/module.h>
114#include <linux/netpoll.h>
115#include <linux/rcupdate.h>
116#include <linux/delay.h>
117#include <net/iw_handler.h>
118#include <asm/current.h>
119#include <linux/audit.h>
120#include <linux/dmaengine.h>
121#include <linux/err.h>
122#include <linux/ctype.h>
123#include <linux/if_arp.h>
124#include <linux/if_vlan.h>
125#include <linux/ip.h>
126#include <net/ip.h>
127#include <net/mpls.h>
128#include <linux/ipv6.h>
129#include <linux/in.h>
130#include <linux/jhash.h>
131#include <linux/random.h>
132#include <trace/events/napi.h>
133#include <trace/events/net.h>
134#include <trace/events/skb.h>
135#include <trace/events/qdisc.h>
136#include <trace/events/xdp.h>
137#include <linux/inetdevice.h>
138#include <linux/cpu_rmap.h>
139#include <linux/static_key.h>
140#include <linux/hashtable.h>
141#include <linux/vmalloc.h>
142#include <linux/if_macvlan.h>
143#include <linux/errqueue.h>
144#include <linux/hrtimer.h>
145#include <linux/netfilter_netdev.h>
146#include <linux/crash_dump.h>
147#include <linux/sctp.h>
148#include <net/udp_tunnel.h>
149#include <linux/net_namespace.h>
150#include <linux/indirect_call_wrapper.h>
151#include <net/devlink.h>
152#include <linux/pm_runtime.h>
153#include <linux/prandom.h>
154#include <linux/once_lite.h>
155#include <net/netdev_rx_queue.h>
156#include <net/page_pool/types.h>
157#include <net/page_pool/helpers.h>
158#include <net/rps.h>
159
160#include "dev.h"
161#include "net-sysfs.h"
162
163static DEFINE_SPINLOCK(ptype_lock);
164struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
165
166static int netif_rx_internal(struct sk_buff *skb);
167static int call_netdevice_notifiers_extack(unsigned long val,
168 struct net_device *dev,
169 struct netlink_ext_ack *extack);
170
171static DEFINE_MUTEX(ifalias_mutex);
172
173/* protects napi_hash addition/deletion and napi_gen_id */
174static DEFINE_SPINLOCK(napi_hash_lock);
175
176static unsigned int napi_gen_id = NR_CPUS;
177static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
178
179static DECLARE_RWSEM(devnet_rename_sem);
180
181static inline void dev_base_seq_inc(struct net *net)
182{
183 unsigned int val = net->dev_base_seq + 1;
184
185 WRITE_ONCE(net->dev_base_seq, val ?: 1);
186}
187
188static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
189{
190 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
191
192 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
193}
194
195static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
196{
197 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
198}
199
200static inline void rps_lock_irqsave(struct softnet_data *sd,
201 unsigned long *flags)
202{
203 if (IS_ENABLED(CONFIG_RPS))
204 spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
205 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
206 local_irq_save(*flags);
207}
208
209static inline void rps_lock_irq_disable(struct softnet_data *sd)
210{
211 if (IS_ENABLED(CONFIG_RPS))
212 spin_lock_irq(&sd->input_pkt_queue.lock);
213 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
214 local_irq_disable();
215}
216
217static inline void rps_unlock_irq_restore(struct softnet_data *sd,
218 unsigned long *flags)
219{
220 if (IS_ENABLED(CONFIG_RPS))
221 spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
222 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
223 local_irq_restore(*flags);
224}
225
226static inline void rps_unlock_irq_enable(struct softnet_data *sd)
227{
228 if (IS_ENABLED(CONFIG_RPS))
229 spin_unlock_irq(&sd->input_pkt_queue.lock);
230 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
231 local_irq_enable();
232}
233
234static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
235 const char *name)
236{
237 struct netdev_name_node *name_node;
238
239 name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
240 if (!name_node)
241 return NULL;
242 INIT_HLIST_NODE(&name_node->hlist);
243 name_node->dev = dev;
244 name_node->name = name;
245 return name_node;
246}
247
248static struct netdev_name_node *
249netdev_name_node_head_alloc(struct net_device *dev)
250{
251 struct netdev_name_node *name_node;
252
253 name_node = netdev_name_node_alloc(dev, dev->name);
254 if (!name_node)
255 return NULL;
256 INIT_LIST_HEAD(&name_node->list);
257 return name_node;
258}
259
260static void netdev_name_node_free(struct netdev_name_node *name_node)
261{
262 kfree(name_node);
263}
264
265static void netdev_name_node_add(struct net *net,
266 struct netdev_name_node *name_node)
267{
268 hlist_add_head_rcu(&name_node->hlist,
269 dev_name_hash(net, name_node->name));
270}
271
272static void netdev_name_node_del(struct netdev_name_node *name_node)
273{
274 hlist_del_rcu(&name_node->hlist);
275}
276
277static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
278 const char *name)
279{
280 struct hlist_head *head = dev_name_hash(net, name);
281 struct netdev_name_node *name_node;
282
283 hlist_for_each_entry(name_node, head, hlist)
284 if (!strcmp(name_node->name, name))
285 return name_node;
286 return NULL;
287}
288
289static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
290 const char *name)
291{
292 struct hlist_head *head = dev_name_hash(net, name);
293 struct netdev_name_node *name_node;
294
295 hlist_for_each_entry_rcu(name_node, head, hlist)
296 if (!strcmp(name_node->name, name))
297 return name_node;
298 return NULL;
299}
300
301bool netdev_name_in_use(struct net *net, const char *name)
302{
303 return netdev_name_node_lookup(net, name);
304}
305EXPORT_SYMBOL(netdev_name_in_use);
306
307int netdev_name_node_alt_create(struct net_device *dev, const char *name)
308{
309 struct netdev_name_node *name_node;
310 struct net *net = dev_net(dev);
311
312 name_node = netdev_name_node_lookup(net, name);
313 if (name_node)
314 return -EEXIST;
315 name_node = netdev_name_node_alloc(dev, name);
316 if (!name_node)
317 return -ENOMEM;
318 netdev_name_node_add(net, name_node);
319 /* The node that holds dev->name acts as a head of per-device list. */
320 list_add_tail_rcu(&name_node->list, &dev->name_node->list);
321
322 return 0;
323}
324
325static void netdev_name_node_alt_free(struct rcu_head *head)
326{
327 struct netdev_name_node *name_node =
328 container_of(head, struct netdev_name_node, rcu);
329
330 kfree(name_node->name);
331 netdev_name_node_free(name_node);
332}
333
334static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
335{
336 netdev_name_node_del(name_node);
337 list_del(&name_node->list);
338 call_rcu(&name_node->rcu, netdev_name_node_alt_free);
339}
340
341int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
342{
343 struct netdev_name_node *name_node;
344 struct net *net = dev_net(dev);
345
346 name_node = netdev_name_node_lookup(net, name);
347 if (!name_node)
348 return -ENOENT;
349 /* lookup might have found our primary name or a name belonging
350 * to another device.
351 */
352 if (name_node == dev->name_node || name_node->dev != dev)
353 return -EINVAL;
354
355 __netdev_name_node_alt_destroy(name_node);
356 return 0;
357}
358
359static void netdev_name_node_alt_flush(struct net_device *dev)
360{
361 struct netdev_name_node *name_node, *tmp;
362
363 list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
364 list_del(&name_node->list);
365 netdev_name_node_alt_free(&name_node->rcu);
366 }
367}
368
369/* Device list insertion */
370static void list_netdevice(struct net_device *dev)
371{
372 struct netdev_name_node *name_node;
373 struct net *net = dev_net(dev);
374
375 ASSERT_RTNL();
376
377 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
378 netdev_name_node_add(net, dev->name_node);
379 hlist_add_head_rcu(&dev->index_hlist,
380 dev_index_hash(net, dev->ifindex));
381
382 netdev_for_each_altname(dev, name_node)
383 netdev_name_node_add(net, name_node);
384
385 /* We reserved the ifindex, this can't fail */
386 WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
387
388 dev_base_seq_inc(net);
389}
390
391/* Device list removal
392 * caller must respect a RCU grace period before freeing/reusing dev
393 */
394static void unlist_netdevice(struct net_device *dev)
395{
396 struct netdev_name_node *name_node;
397 struct net *net = dev_net(dev);
398
399 ASSERT_RTNL();
400
401 xa_erase(&net->dev_by_index, dev->ifindex);
402
403 netdev_for_each_altname(dev, name_node)
404 netdev_name_node_del(name_node);
405
406 /* Unlink dev from the device chain */
407 list_del_rcu(&dev->dev_list);
408 netdev_name_node_del(dev->name_node);
409 hlist_del_rcu(&dev->index_hlist);
410
411 dev_base_seq_inc(dev_net(dev));
412}
413
414/*
415 * Our notifier list
416 */
417
418static RAW_NOTIFIER_HEAD(netdev_chain);
419
420/*
421 * Device drivers call our routines to queue packets here. We empty the
422 * queue in the local softnet handler.
423 */
424
425DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
426EXPORT_PER_CPU_SYMBOL(softnet_data);
427
428/* Page_pool has a lockless array/stack to alloc/recycle pages.
429 * PP consumers must take care to call these APIs in the appropriate context
430 * (e.g. NAPI context).
431 */
432static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
433
434#ifdef CONFIG_LOCKDEP
435/*
436 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
437 * according to dev->type
438 */
439static const unsigned short netdev_lock_type[] = {
440 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
441 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
442 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
443 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
444 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
445 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
446 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
447 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
448 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
449 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
450 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
451 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
452 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
453 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
454 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
455
456static const char *const netdev_lock_name[] = {
457 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
458 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
459 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
460 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
461 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
462 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
463 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
464 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
465 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
466 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
467 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
468 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
469 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
470 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
471 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
472
473static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
474static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
475
476static inline unsigned short netdev_lock_pos(unsigned short dev_type)
477{
478 int i;
479
480 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
481 if (netdev_lock_type[i] == dev_type)
482 return i;
483 /* the last key is used by default */
484 return ARRAY_SIZE(netdev_lock_type) - 1;
485}
486
487static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
488 unsigned short dev_type)
489{
490 int i;
491
492 i = netdev_lock_pos(dev_type);
493 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
494 netdev_lock_name[i]);
495}
496
497static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
498{
499 int i;
500
501 i = netdev_lock_pos(dev->type);
502 lockdep_set_class_and_name(&dev->addr_list_lock,
503 &netdev_addr_lock_key[i],
504 netdev_lock_name[i]);
505}
506#else
507static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
508 unsigned short dev_type)
509{
510}
511
512static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
513{
514}
515#endif
516
517/*******************************************************************************
518 *
519 * Protocol management and registration routines
520 *
521 *******************************************************************************/
522
523
524/*
525 * Add a protocol ID to the list. Now that the input handler is
526 * smarter we can dispense with all the messy stuff that used to be
527 * here.
528 *
529 * BEWARE!!! Protocol handlers, mangling input packets,
530 * MUST BE last in hash buckets and checking protocol handlers
531 * MUST start from promiscuous ptype_all chain in net_bh.
532 * It is true now, do not change it.
533 * Explanation follows: if protocol handler, mangling packet, will
534 * be the first on list, it is not able to sense, that packet
535 * is cloned and should be copied-on-write, so that it will
536 * change it and subsequent readers will get broken packet.
537 * --ANK (980803)
538 */
539
540static inline struct list_head *ptype_head(const struct packet_type *pt)
541{
542 if (pt->type == htons(ETH_P_ALL))
543 return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
544 else
545 return pt->dev ? &pt->dev->ptype_specific :
546 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
547}
548
549/**
550 * dev_add_pack - add packet handler
551 * @pt: packet type declaration
552 *
553 * Add a protocol handler to the networking stack. The passed &packet_type
554 * is linked into kernel lists and may not be freed until it has been
555 * removed from the kernel lists.
556 *
557 * This call does not sleep, therefore it cannot guarantee that all
558 * CPUs that are in the middle of receiving packets will see the new
559 * packet type (until the next received packet).
560 */
561
562void dev_add_pack(struct packet_type *pt)
563{
564 struct list_head *head = ptype_head(pt);
565
566 spin_lock(&ptype_lock);
567 list_add_rcu(&pt->list, head);
568 spin_unlock(&ptype_lock);
569}
570EXPORT_SYMBOL(dev_add_pack);
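
/*
 * Illustrative sketch (not part of the original file): how a module might
 * register and later remove an ETH_P_ALL tap with dev_add_pack() and
 * dev_remove_pack(). The handler name and the decision to simply drop the
 * packet are hypothetical; a tap handler must free the skb it is given.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* A real handler would inspect the packet here. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __maybe_unused = {
	.type = cpu_to_be16(ETH_P_ALL),	/* see every packet, like af_packet */
	.func = example_tap_rcv,
};

static void __maybe_unused example_tap_register(bool add)
{
	if (add)
		dev_add_pack(&example_tap);	/* published via RCU list */
	else
		dev_remove_pack(&example_tap);	/* waits for readers */
}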
571
572/**
573 * __dev_remove_pack - remove packet handler
574 * @pt: packet type declaration
575 *
576 * Remove a protocol handler that was previously added to the kernel
577 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
578 * from the kernel lists and can be freed or reused once this function
579 * returns.
580 *
581 * The packet type might still be in use by receivers
582 * and must not be freed until after all the CPU's have gone
583 * through a quiescent state.
584 */
585void __dev_remove_pack(struct packet_type *pt)
586{
587 struct list_head *head = ptype_head(pt);
588 struct packet_type *pt1;
589
590 spin_lock(&ptype_lock);
591
592 list_for_each_entry(pt1, head, list) {
593 if (pt == pt1) {
594 list_del_rcu(&pt->list);
595 goto out;
596 }
597 }
598
599 pr_warn("dev_remove_pack: %p not found\n", pt);
600out:
601 spin_unlock(&ptype_lock);
602}
603EXPORT_SYMBOL(__dev_remove_pack);
604
605/**
606 * dev_remove_pack - remove packet handler
607 * @pt: packet type declaration
608 *
609 * Remove a protocol handler that was previously added to the kernel
610 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
611 * from the kernel lists and can be freed or reused once this function
612 * returns.
613 *
614 * This call sleeps to guarantee that no CPU is looking at the packet
615 * type after return.
616 */
617void dev_remove_pack(struct packet_type *pt)
618{
619 __dev_remove_pack(pt);
620
621 synchronize_net();
622}
623EXPORT_SYMBOL(dev_remove_pack);
624
625
626/*******************************************************************************
627 *
628 * Device Interface Subroutines
629 *
630 *******************************************************************************/
631
632/**
633 * dev_get_iflink - get 'iflink' value of an interface
634 * @dev: targeted interface
635 *
636 * Indicates the ifindex the interface is linked to.
637 * Physical interfaces have the same 'ifindex' and 'iflink' values.
638 */
639
640int dev_get_iflink(const struct net_device *dev)
641{
642 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
643 return dev->netdev_ops->ndo_get_iflink(dev);
644
645 return READ_ONCE(dev->ifindex);
646}
647EXPORT_SYMBOL(dev_get_iflink);
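
/*
 * Illustrative sketch (not part of the original file): how a stacked
 * device driver might implement ndo_get_iflink so that dev_get_iflink()
 * reports the lower device. "example_priv" and its "lowerdev" member are
 * hypothetical names.
 */
struct example_priv {
	struct net_device *lowerdev;
};

static int __maybe_unused example_get_iflink(const struct net_device *dev)
{
	const struct example_priv *priv = netdev_priv(dev);

	/* 'iflink' points at the device we transmit through. */
	return READ_ONCE(priv->lowerdev->ifindex);
}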
648
649/**
650 * dev_fill_metadata_dst - Retrieve tunnel egress information.
651 * @dev: targeted interface
652 * @skb: The packet.
653 *
654 * For better visibility of tunnel traffic, OVS needs to retrieve
655 * egress tunnel information for a packet. The following API allows
656 * the caller to get this info.
657 */
658int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
659{
660 struct ip_tunnel_info *info;
661
662 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
663 return -EINVAL;
664
665 info = skb_tunnel_info_unclone(skb);
666 if (!info)
667 return -ENOMEM;
668 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
669 return -EINVAL;
670
671 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
672}
673EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
674
675static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
676{
677 int k = stack->num_paths++;
678
679 if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
680 return NULL;
681
682 return &stack->path[k];
683}
684
685int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
686 struct net_device_path_stack *stack)
687{
688 const struct net_device *last_dev;
689 struct net_device_path_ctx ctx = {
690 .dev = dev,
691 };
692 struct net_device_path *path;
693 int ret = 0;
694
695 memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
696 stack->num_paths = 0;
697 while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
698 last_dev = ctx.dev;
699 path = dev_fwd_path(stack);
700 if (!path)
701 return -1;
702
703 memset(path, 0, sizeof(struct net_device_path));
704 ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
705 if (ret < 0)
706 return -1;
707
708 if (WARN_ON_ONCE(last_dev == ctx.dev))
709 return -1;
710 }
711
712 if (!ctx.dev)
713 return ret;
714
715 path = dev_fwd_path(stack);
716 if (!path)
717 return -1;
718 path->type = DEV_PATH_ETHERNET;
719 path->dev = ctx.dev;
720
721 return ret;
722}
723EXPORT_SYMBOL_GPL(dev_fill_forward_path);
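
/*
 * Illustrative sketch (not part of the original file): how a caller (e.g.
 * flowtable offload code) might resolve and walk the forwarding path that
 * dev_fill_forward_path() builds. The debug printout is purely for
 * illustration.
 */
static void __maybe_unused example_dump_forward_path(const struct net_device *dev,
						     const u8 *daddr)
{
	struct net_device_path_stack stack;
	int i;

	if (dev_fill_forward_path(dev, daddr, &stack) < 0)
		return;

	/* stack.path[0] is closest to @dev; the final entry is the real
	 * transmit device (DEV_PATH_ETHERNET).
	 */
	for (i = 0; i < stack.num_paths; i++)
		pr_debug("path[%d]: type %d dev %s\n",
			 i, stack.path[i].type,
			 stack.path[i].dev ? stack.path[i].dev->name : "none");
}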
724
725/**
726 * __dev_get_by_name - find a device by its name
727 * @net: the applicable net namespace
728 * @name: name to find
729 *
730 * Find an interface by name. Must be called under RTNL semaphore.
731 * If the name is found a pointer to the device is returned.
732 * If the name is not found then %NULL is returned. The
733 * reference counters are not incremented so the caller must be
734 * careful with locks.
735 */
736
737struct net_device *__dev_get_by_name(struct net *net, const char *name)
738{
739 struct netdev_name_node *node_name;
740
741 node_name = netdev_name_node_lookup(net, name);
742 return node_name ? node_name->dev : NULL;
743}
744EXPORT_SYMBOL(__dev_get_by_name);
745
746/**
747 * dev_get_by_name_rcu - find a device by its name
748 * @net: the applicable net namespace
749 * @name: name to find
750 *
751 * Find an interface by name.
752 * If the name is found a pointer to the device is returned.
753 * If the name is not found then %NULL is returned.
754 * The reference counters are not incremented so the caller must be
755 * careful with locks. The caller must hold RCU lock.
756 */
757
758struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
759{
760 struct netdev_name_node *node_name;
761
762 node_name = netdev_name_node_lookup_rcu(net, name);
763 return node_name ? node_name->dev : NULL;
764}
765EXPORT_SYMBOL(dev_get_by_name_rcu);
766
767/* Deprecated for new users, call netdev_get_by_name() instead */
768struct net_device *dev_get_by_name(struct net *net, const char *name)
769{
770 struct net_device *dev;
771
772 rcu_read_lock();
773 dev = dev_get_by_name_rcu(net, name);
774 dev_hold(dev);
775 rcu_read_unlock();
776 return dev;
777}
778EXPORT_SYMBOL(dev_get_by_name);
779
780/**
781 * netdev_get_by_name() - find a device by its name
782 * @net: the applicable net namespace
783 * @name: name to find
784 * @tracker: tracking object for the acquired reference
785 * @gfp: allocation flags for the tracker
786 *
787 * Find an interface by name. This can be called from any
788 * context and does its own locking. The returned handle has
789 * the usage count incremented and the caller must use netdev_put() to
790 * release it when it is no longer needed. %NULL is returned if no
791 * matching device is found.
792 */
793struct net_device *netdev_get_by_name(struct net *net, const char *name,
794 netdevice_tracker *tracker, gfp_t gfp)
795{
796 struct net_device *dev;
797
798 dev = dev_get_by_name(net, name);
799 if (dev)
800 netdev_tracker_alloc(dev, tracker, gfp);
801 return dev;
802}
803EXPORT_SYMBOL(netdev_get_by_name);
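
/*
 * Illustrative sketch (not part of the original file): taking and releasing
 * a tracked reference with netdev_get_by_name()/netdev_put(). The device
 * name "lo" and the work done while holding the reference are only examples.
 */
static void __maybe_unused example_use_tracked_ref(struct net *net)
{
	netdevice_tracker tracker;
	struct net_device *dev;

	dev = netdev_get_by_name(net, "lo", &tracker, GFP_KERNEL);
	if (!dev)
		return;

	pr_debug("%s: mtu %u\n", dev->name, READ_ONCE(dev->mtu));

	/* Pair every netdev_get_by_*() with netdev_put(). */
	netdev_put(dev, &tracker);
}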
804
805/**
806 * __dev_get_by_index - find a device by its ifindex
807 * @net: the applicable net namespace
808 * @ifindex: index of device
809 *
810 * Search for an interface by index. Returns a pointer to the device,
811 * or %NULL if the device is not found. The device has not
812 * had its reference counter increased so the caller must be careful
813 * about locking. The caller must hold the RTNL semaphore.
814 */
815
816struct net_device *__dev_get_by_index(struct net *net, int ifindex)
817{
818 struct net_device *dev;
819 struct hlist_head *head = dev_index_hash(net, ifindex);
820
821 hlist_for_each_entry(dev, head, index_hlist)
822 if (dev->ifindex == ifindex)
823 return dev;
824
825 return NULL;
826}
827EXPORT_SYMBOL(__dev_get_by_index);
828
829/**
830 * dev_get_by_index_rcu - find a device by its ifindex
831 * @net: the applicable net namespace
832 * @ifindex: index of device
833 *
834 * Search for an interface by index. Returns %NULL if the device
835 * is not found or a pointer to the device. The device has not
836 * had its reference counter increased so the caller must be careful
837 * about locking. The caller must hold RCU lock.
838 */
839
840struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
841{
842 struct net_device *dev;
843 struct hlist_head *head = dev_index_hash(net, ifindex);
844
845 hlist_for_each_entry_rcu(dev, head, index_hlist)
846 if (dev->ifindex == ifindex)
847 return dev;
848
849 return NULL;
850}
851EXPORT_SYMBOL(dev_get_by_index_rcu);
852
853/* Deprecated for new users, call netdev_get_by_index() instead */
854struct net_device *dev_get_by_index(struct net *net, int ifindex)
855{
856 struct net_device *dev;
857
858 rcu_read_lock();
859 dev = dev_get_by_index_rcu(net, ifindex);
860 dev_hold(dev);
861 rcu_read_unlock();
862 return dev;
863}
864EXPORT_SYMBOL(dev_get_by_index);
865
866/**
867 * netdev_get_by_index() - find a device by its ifindex
868 * @net: the applicable net namespace
869 * @ifindex: index of device
870 * @tracker: tracking object for the acquired reference
871 * @gfp: allocation flags for the tracker
872 *
873 * Search for an interface by index. Returns a pointer to the device,
874 * or NULL if the device is not found. The device returned has
875 * had a reference added and the pointer is safe until the user calls
876 * netdev_put() to indicate they have finished with it.
877 */
878struct net_device *netdev_get_by_index(struct net *net, int ifindex,
879 netdevice_tracker *tracker, gfp_t gfp)
880{
881 struct net_device *dev;
882
883 dev = dev_get_by_index(net, ifindex);
884 if (dev)
885 netdev_tracker_alloc(dev, tracker, gfp);
886 return dev;
887}
888EXPORT_SYMBOL(netdev_get_by_index);
889
890/**
891 * dev_get_by_napi_id - find a device by napi_id
892 * @napi_id: ID of the NAPI struct
893 *
894 * Search for an interface by NAPI ID. Returns %NULL if the device
895 * is not found or a pointer to the device. The device has not had
896 * its reference counter increased so the caller must be careful
897 * about locking. The caller must hold RCU lock.
898 */
899
900struct net_device *dev_get_by_napi_id(unsigned int napi_id)
901{
902 struct napi_struct *napi;
903
904 WARN_ON_ONCE(!rcu_read_lock_held());
905
906 if (napi_id < MIN_NAPI_ID)
907 return NULL;
908
909 napi = napi_by_id(napi_id);
910
911 return napi ? napi->dev : NULL;
912}
913EXPORT_SYMBOL(dev_get_by_napi_id);
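
/*
 * Illustrative sketch (not part of the original file): dev_get_by_napi_id()
 * must run under rcu_read_lock(), and the result may only be used inside
 * the same RCU section unless a reference is taken.
 */
static int __maybe_unused example_ifindex_from_napi_id(unsigned int napi_id)
{
	struct net_device *dev;
	int ifindex = 0;

	rcu_read_lock();
	dev = dev_get_by_napi_id(napi_id);
	if (dev)
		ifindex = READ_ONCE(dev->ifindex);
	rcu_read_unlock();

	return ifindex;
}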
914
915/**
916 * netdev_get_name - get a netdevice name, knowing its ifindex.
917 * @net: network namespace
918 * @name: a pointer to the buffer where the name will be stored.
919 * @ifindex: the ifindex of the interface to get the name from.
920 */
921int netdev_get_name(struct net *net, char *name, int ifindex)
922{
923 struct net_device *dev;
924 int ret;
925
926 down_read(&devnet_rename_sem);
927 rcu_read_lock();
928
929 dev = dev_get_by_index_rcu(net, ifindex);
930 if (!dev) {
931 ret = -ENODEV;
932 goto out;
933 }
934
935 strcpy(name, dev->name);
936
937 ret = 0;
938out:
939 rcu_read_unlock();
940 up_read(&devnet_rename_sem);
941 return ret;
942}
943
944/**
945 * dev_getbyhwaddr_rcu - find a device by its hardware address
946 * @net: the applicable net namespace
947 * @type: media type of device
948 * @ha: hardware address
949 *
950 * Search for an interface by MAC address. Returns NULL if the device
951 * is not found or a pointer to the device.
952 * The caller must hold RCU or RTNL.
953 * The returned device has not had its ref count increased
954 * and the caller must therefore be careful about locking
955 *
956 */
957
958struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
959 const char *ha)
960{
961 struct net_device *dev;
962
963 for_each_netdev_rcu(net, dev)
964 if (dev->type == type &&
965 !memcmp(dev->dev_addr, ha, dev->addr_len))
966 return dev;
967
968 return NULL;
969}
970EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
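
/*
 * Illustrative sketch (not part of the original file): looking up an
 * Ethernet device by MAC address. The address below is a made-up,
 * locally administered example.
 */
static bool __maybe_unused example_mac_is_local(struct net *net)
{
	static const char mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
	bool found;

	rcu_read_lock();
	found = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac) != NULL;
	rcu_read_unlock();

	return found;
}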
971
972struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
973{
974 struct net_device *dev, *ret = NULL;
975
976 rcu_read_lock();
977 for_each_netdev_rcu(net, dev)
978 if (dev->type == type) {
979 dev_hold(dev);
980 ret = dev;
981 break;
982 }
983 rcu_read_unlock();
984 return ret;
985}
986EXPORT_SYMBOL(dev_getfirstbyhwtype);
987
988/**
989 * __dev_get_by_flags - find any device with given flags
990 * @net: the applicable net namespace
991 * @if_flags: IFF_* values
992 * @mask: bitmask of bits in if_flags to check
993 *
994 * Search for any interface with the given flags. Returns a pointer to
995 * the device, or NULL if no matching device is found. Must be called inside
996 * rtnl_lock(), and result refcount is unchanged.
997 */
998
999struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1000 unsigned short mask)
1001{
1002 struct net_device *dev, *ret;
1003
1004 ASSERT_RTNL();
1005
1006 ret = NULL;
1007 for_each_netdev(net, dev) {
1008 if (((dev->flags ^ if_flags) & mask) == 0) {
1009 ret = dev;
1010 break;
1011 }
1012 }
1013 return ret;
1014}
1015EXPORT_SYMBOL(__dev_get_by_flags);
1016
1017/**
1018 * dev_valid_name - check if name is okay for network device
1019 * @name: name string
1020 *
1021 * Network device names need to be valid file names to
1022 * allow sysfs to work. We also disallow any kind of
1023 * whitespace.
1024 */
1025bool dev_valid_name(const char *name)
1026{
1027 if (*name == '\0')
1028 return false;
1029 if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1030 return false;
1031 if (!strcmp(name, ".") || !strcmp(name, ".."))
1032 return false;
1033
1034 while (*name) {
1035 if (*name == '/' || *name == ':' || isspace(*name))
1036 return false;
1037 name++;
1038 }
1039 return true;
1040}
1041EXPORT_SYMBOL(dev_valid_name);
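
/*
 * Illustrative sketch (not part of the original file): the rules above in
 * practice. Ordinary names pass; empty names, ".", "..", names containing
 * '/', ':' or whitespace, and names of IFNAMSIZ or more characters fail.
 */
static void __maybe_unused example_check_names(void)
{
	WARN_ON(!dev_valid_name("eth0"));
	WARN_ON(!dev_valid_name("uplink0"));
	WARN_ON(dev_valid_name(""));
	WARN_ON(dev_valid_name("a b"));		/* whitespace not allowed */
	WARN_ON(dev_valid_name("eth/0"));	/* '/' not allowed */
}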
1042
1043/**
1044 * __dev_alloc_name - allocate a name for a device
1045 * @net: network namespace to allocate the device name in
1046 * @name: name format string
1047 * @res: result name string
1048 *
1049 * Passed a format string - eg "lt%d" - it will try to find a suitable
1050 * id. It scans the list of devices to build up a free map, then chooses
1051 * the first empty slot. The caller must hold the dev_base or rtnl lock
1052 * while allocating the name and adding the device in order to avoid
1053 * duplicates.
1054 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1055 * Returns the number of the unit assigned or a negative errno code.
1056 */
1057
1058static int __dev_alloc_name(struct net *net, const char *name, char *res)
1059{
1060 int i = 0;
1061 const char *p;
1062 const int max_netdevices = 8*PAGE_SIZE;
1063 unsigned long *inuse;
1064 struct net_device *d;
1065 char buf[IFNAMSIZ];
1066
1067 /* Verify the string as this thing may have come from the user.
1068 * There must be one "%d" and no other "%" characters.
1069 */
1070 p = strchr(name, '%');
1071 if (!p || p[1] != 'd' || strchr(p + 2, '%'))
1072 return -EINVAL;
1073
1074 /* Use one page as a bit array of possible slots */
1075 inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
1076 if (!inuse)
1077 return -ENOMEM;
1078
1079 for_each_netdev(net, d) {
1080 struct netdev_name_node *name_node;
1081
1082 netdev_for_each_altname(d, name_node) {
1083 if (!sscanf(name_node->name, name, &i))
1084 continue;
1085 if (i < 0 || i >= max_netdevices)
1086 continue;
1087
1088 /* avoid cases where sscanf is not exact inverse of printf */
1089 snprintf(buf, IFNAMSIZ, name, i);
1090 if (!strncmp(buf, name_node->name, IFNAMSIZ))
1091 __set_bit(i, inuse);
1092 }
1093 if (!sscanf(d->name, name, &i))
1094 continue;
1095 if (i < 0 || i >= max_netdevices)
1096 continue;
1097
1098 /* avoid cases where sscanf is not exact inverse of printf */
1099 snprintf(buf, IFNAMSIZ, name, i);
1100 if (!strncmp(buf, d->name, IFNAMSIZ))
1101 __set_bit(i, inuse);
1102 }
1103
1104 i = find_first_zero_bit(inuse, max_netdevices);
1105 bitmap_free(inuse);
1106 if (i == max_netdevices)
1107 return -ENFILE;
1108
1109 /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
1110 strscpy(buf, name, IFNAMSIZ);
1111 snprintf(res, IFNAMSIZ, buf, i);
1112 return i;
1113}
1114
1115/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
1116static int dev_prep_valid_name(struct net *net, struct net_device *dev,
1117 const char *want_name, char *out_name,
1118 int dup_errno)
1119{
1120 if (!dev_valid_name(want_name))
1121 return -EINVAL;
1122
1123 if (strchr(want_name, '%'))
1124 return __dev_alloc_name(net, want_name, out_name);
1125
1126 if (netdev_name_in_use(net, want_name))
1127 return -dup_errno;
1128 if (out_name != want_name)
1129 strscpy(out_name, want_name, IFNAMSIZ);
1130 return 0;
1131}
1132
1133/**
1134 * dev_alloc_name - allocate a name for a device
1135 * @dev: device
1136 * @name: name format string
1137 *
1138 * Passed a format string - eg "lt%d" - it will try to find a suitable
1139 * id. It scans the list of devices to build up a free map, then chooses
1140 * the first empty slot. The caller must hold the dev_base or rtnl lock
1141 * while allocating the name and adding the device in order to avoid
1142 * duplicates.
1143 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1144 * Returns the number of the unit assigned or a negative errno code.
1145 */
1146
1147int dev_alloc_name(struct net_device *dev, const char *name)
1148{
1149 return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
1150}
1151EXPORT_SYMBOL(dev_alloc_name);
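
/*
 * Illustrative sketch (not part of the original file): a driver picking a
 * unit number at registration time. Passing "eth%d" makes dev_alloc_name()
 * scan existing names and fill in the first free unit.
 */
static int __maybe_unused example_pick_name(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();

	unit = dev_alloc_name(dev, "eth%d");	/* e.g. writes "eth3" to dev->name */
	if (unit < 0)
		return unit;			/* -EINVAL, -ENFILE, -ENOMEM, ... */

	return 0;
}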
1152
1153static int dev_get_valid_name(struct net *net, struct net_device *dev,
1154 const char *name)
1155{
1156 int ret;
1157
1158 ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
1159 return ret < 0 ? ret : 0;
1160}
1161
1162/**
1163 * dev_change_name - change name of a device
1164 * @dev: device
1165 * @newname: name (or format string) must be at least IFNAMSIZ
1166 *
1167 * Change the name of a device. A format string such as "eth%d" can be
1168 * passed for wildcarding.
1169 */
1170int dev_change_name(struct net_device *dev, const char *newname)
1171{
1172 unsigned char old_assign_type;
1173 char oldname[IFNAMSIZ];
1174 int err = 0;
1175 int ret;
1176 struct net *net;
1177
1178 ASSERT_RTNL();
1179 BUG_ON(!dev_net(dev));
1180
1181 net = dev_net(dev);
1182
1183 down_write(&devnet_rename_sem);
1184
1185 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1186 up_write(&devnet_rename_sem);
1187 return 0;
1188 }
1189
1190 memcpy(oldname, dev->name, IFNAMSIZ);
1191
1192 err = dev_get_valid_name(net, dev, newname);
1193 if (err < 0) {
1194 up_write(&devnet_rename_sem);
1195 return err;
1196 }
1197
1198 if (oldname[0] && !strchr(oldname, '%'))
1199 netdev_info(dev, "renamed from %s%s\n", oldname,
1200 dev->flags & IFF_UP ? " (while UP)" : "");
1201
1202 old_assign_type = dev->name_assign_type;
1203 WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
1204
1205rollback:
1206 ret = device_rename(&dev->dev, dev->name);
1207 if (ret) {
1208 memcpy(dev->name, oldname, IFNAMSIZ);
1209 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1210 up_write(&devnet_rename_sem);
1211 return ret;
1212 }
1213
1214 up_write(&devnet_rename_sem);
1215
1216 netdev_adjacent_rename_links(dev, oldname);
1217
1218 netdev_name_node_del(dev->name_node);
1219
1220 synchronize_net();
1221
1222 netdev_name_node_add(net, dev->name_node);
1223
1224 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1225 ret = notifier_to_errno(ret);
1226
1227 if (ret) {
1228 /* err >= 0 after dev_alloc_name() or stores the first errno */
1229 if (err >= 0) {
1230 err = ret;
1231 down_write(&devnet_rename_sem);
1232 memcpy(dev->name, oldname, IFNAMSIZ);
1233 memcpy(oldname, newname, IFNAMSIZ);
1234 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1235 old_assign_type = NET_NAME_RENAMED;
1236 goto rollback;
1237 } else {
1238 netdev_err(dev, "name change rollback failed: %d\n",
1239 ret);
1240 }
1241 }
1242
1243 return err;
1244}
1245
1246/**
1247 * dev_set_alias - change ifalias of a device
1248 * @dev: device
1249 * @alias: name up to IFALIASZ
1250 * @len: limit of bytes to copy from info
1251 *
1252 * Set ifalias for a device.
1253 */
1254int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1255{
1256 struct dev_ifalias *new_alias = NULL;
1257
1258 if (len >= IFALIASZ)
1259 return -EINVAL;
1260
1261 if (len) {
1262 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1263 if (!new_alias)
1264 return -ENOMEM;
1265
1266 memcpy(new_alias->ifalias, alias, len);
1267 new_alias->ifalias[len] = 0;
1268 }
1269
1270 mutex_lock(&ifalias_mutex);
1271 new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1272 mutex_is_locked(&ifalias_mutex));
1273 mutex_unlock(&ifalias_mutex);
1274
1275 if (new_alias)
1276 kfree_rcu(new_alias, rcuhead);
1277
1278 return len;
1279}
1280EXPORT_SYMBOL(dev_set_alias);
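
/*
 * Illustrative sketch (not part of the original file): setting and reading
 * back an interface alias. The alias text is arbitrary.
 */
static void __maybe_unused example_set_alias(struct net_device *dev)
{
	static const char alias[] = "uplink to core switch";
	char buf[IFALIASZ];

	if (dev_set_alias(dev, alias, strlen(alias)) < 0)
		return;

	/* Safe while we hold a reference on @dev (or RCU). */
	if (dev_get_alias(dev, buf, sizeof(buf)))
		pr_debug("%s alias: %s\n", dev->name, buf);
}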
1281
1282/**
1283 * dev_get_alias - get ifalias of a device
1284 * @dev: device
1285 * @name: buffer to store name of ifalias
1286 * @len: size of buffer
1287 *
1288 * Get ifalias for a device. The caller must make sure dev cannot go
1289 * away, e.g. by holding the RCU read lock or a reference count on the device.
1290 */
1291int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1292{
1293 const struct dev_ifalias *alias;
1294 int ret = 0;
1295
1296 rcu_read_lock();
1297 alias = rcu_dereference(dev->ifalias);
1298 if (alias)
1299 ret = snprintf(name, len, "%s", alias->ifalias);
1300 rcu_read_unlock();
1301
1302 return ret;
1303}
1304
1305/**
1306 * netdev_features_change - device changes features
1307 * @dev: device to cause notification
1308 *
1309 * Called to indicate a device has changed features.
1310 */
1311void netdev_features_change(struct net_device *dev)
1312{
1313 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1314}
1315EXPORT_SYMBOL(netdev_features_change);
1316
1317/**
1318 * netdev_state_change - device changes state
1319 * @dev: device to cause notification
1320 *
1321 * Called to indicate a device has changed state. This function calls
1322 * the notifier chains for netdev_chain and sends a NEWLINK message
1323 * to the routing socket.
1324 */
1325void netdev_state_change(struct net_device *dev)
1326{
1327 if (dev->flags & IFF_UP) {
1328 struct netdev_notifier_change_info change_info = {
1329 .info.dev = dev,
1330 };
1331
1332 call_netdevice_notifiers_info(NETDEV_CHANGE,
1333 &change_info.info);
1334 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1335 }
1336}
1337EXPORT_SYMBOL(netdev_state_change);
1338
1339/**
1340 * __netdev_notify_peers - notify network peers about existence of @dev,
1341 * to be called when rtnl lock is already held.
1342 * @dev: network device
1343 *
1344 * Generate traffic such that interested network peers are aware of
1345 * @dev, such as by generating a gratuitous ARP. This may be used when
1346 * a device wants to inform the rest of the network about some sort of
1347 * reconfiguration such as a failover event or virtual machine
1348 * migration.
1349 */
1350void __netdev_notify_peers(struct net_device *dev)
1351{
1352 ASSERT_RTNL();
1353 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1354 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1355}
1356EXPORT_SYMBOL(__netdev_notify_peers);
1357
1358/**
1359 * netdev_notify_peers - notify network peers about existence of @dev
1360 * @dev: network device
1361 *
1362 * Generate traffic such that interested network peers are aware of
1363 * @dev, such as by generating a gratuitous ARP. This may be used when
1364 * a device wants to inform the rest of the network about some sort of
1365 * reconfiguration such as a failover event or virtual machine
1366 * migration.
1367 */
1368void netdev_notify_peers(struct net_device *dev)
1369{
1370 rtnl_lock();
1371 __netdev_notify_peers(dev);
1372 rtnl_unlock();
1373}
1374EXPORT_SYMBOL(netdev_notify_peers);
1375
1376static int napi_threaded_poll(void *data);
1377
1378static int napi_kthread_create(struct napi_struct *n)
1379{
1380 int err = 0;
1381
1382 /* Create and wake up the kthread once to put it in
1383 * TASK_INTERRUPTIBLE mode to avoid the blocked task
1384 * warning and work with loadavg.
1385 */
1386 n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1387 n->dev->name, n->napi_id);
1388 if (IS_ERR(n->thread)) {
1389 err = PTR_ERR(n->thread);
1390 pr_err("kthread_run failed with err %d\n", err);
1391 n->thread = NULL;
1392 }
1393
1394 return err;
1395}
1396
1397static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1398{
1399 const struct net_device_ops *ops = dev->netdev_ops;
1400 int ret;
1401
1402 ASSERT_RTNL();
1403 dev_addr_check(dev);
1404
1405 if (!netif_device_present(dev)) {
1406 /* may be detached because parent is runtime-suspended */
1407 if (dev->dev.parent)
1408 pm_runtime_resume(dev->dev.parent);
1409 if (!netif_device_present(dev))
1410 return -ENODEV;
1411 }
1412
1413 /* Block netpoll from trying to do any rx path servicing.
1414 * If we don't do this there is a chance ndo_poll_controller
1415 * or ndo_poll may be running while we open the device
1416 */
1417 netpoll_poll_disable(dev);
1418
1419 ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1420 ret = notifier_to_errno(ret);
1421 if (ret)
1422 return ret;
1423
1424 set_bit(__LINK_STATE_START, &dev->state);
1425
1426 if (ops->ndo_validate_addr)
1427 ret = ops->ndo_validate_addr(dev);
1428
1429 if (!ret && ops->ndo_open)
1430 ret = ops->ndo_open(dev);
1431
1432 netpoll_poll_enable(dev);
1433
1434 if (ret)
1435 clear_bit(__LINK_STATE_START, &dev->state);
1436 else {
1437 dev->flags |= IFF_UP;
1438 dev_set_rx_mode(dev);
1439 dev_activate(dev);
1440 add_device_randomness(dev->dev_addr, dev->addr_len);
1441 }
1442
1443 return ret;
1444}
1445
1446/**
1447 * dev_open - prepare an interface for use.
1448 * @dev: device to open
1449 * @extack: netlink extended ack
1450 *
1451 * Takes a device from down to up state. The device's private open
1452 * function is invoked and then the multicast lists are loaded. Finally
1453 * the device is moved into the up state and a %NETDEV_UP message is
1454 * sent to the netdev notifier chain.
1455 *
1456 * Calling this function on an active interface is a no-op. On failure,
1457 * a negative errno code is returned.
1458 */
1459int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1460{
1461 int ret;
1462
1463 if (dev->flags & IFF_UP)
1464 return 0;
1465
1466 ret = __dev_open(dev, extack);
1467 if (ret < 0)
1468 return ret;
1469
1470 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1471 call_netdevice_notifiers(NETDEV_UP, dev);
1472
1473 return ret;
1474}
1475EXPORT_SYMBOL(dev_open);
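
/*
 * Illustrative sketch (not part of the original file): bringing an
 * interface up from process context. dev_open() must run under RTNL; the
 * NULL extack simply means no extended error reporting is requested.
 */
static int __maybe_unused example_bring_up(struct net *net, const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);	/* RTNL protects the pointer */
	if (dev)
		err = dev_open(dev, NULL);	/* no-op if already IFF_UP */
	rtnl_unlock();

	return err;
}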
1476
1477static void __dev_close_many(struct list_head *head)
1478{
1479 struct net_device *dev;
1480
1481 ASSERT_RTNL();
1482 might_sleep();
1483
1484 list_for_each_entry(dev, head, close_list) {
1485 /* Temporarily disable netpoll until the interface is down */
1486 netpoll_poll_disable(dev);
1487
1488 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1489
1490 clear_bit(__LINK_STATE_START, &dev->state);
1491
1492 /* Synchronize to scheduled poll. We cannot touch the poll list, it
1493 * can even be on a different cpu. So just clear netif_running().
1494 *
1495 * dev->stop() will invoke napi_disable() on all of its
1496 * napi_struct instances on this device.
1497 */
1498 smp_mb__after_atomic(); /* Commit netif_running(). */
1499 }
1500
1501 dev_deactivate_many(head);
1502
1503 list_for_each_entry(dev, head, close_list) {
1504 const struct net_device_ops *ops = dev->netdev_ops;
1505
1506 /*
1507 * Call the device-specific close. This cannot fail and is
1508 * only done if the device is UP.
1509 *
1510 * We allow it to be called even after a DETACH hot-plug
1511 * event.
1512 */
1513 if (ops->ndo_stop)
1514 ops->ndo_stop(dev);
1515
1516 dev->flags &= ~IFF_UP;
1517 netpoll_poll_enable(dev);
1518 }
1519}
1520
1521static void __dev_close(struct net_device *dev)
1522{
1523 LIST_HEAD(single);
1524
1525 list_add(&dev->close_list, &single);
1526 __dev_close_many(&single);
1527 list_del(&single);
1528}
1529
1530void dev_close_many(struct list_head *head, bool unlink)
1531{
1532 struct net_device *dev, *tmp;
1533
1534 /* Remove the devices that don't need to be closed */
1535 list_for_each_entry_safe(dev, tmp, head, close_list)
1536 if (!(dev->flags & IFF_UP))
1537 list_del_init(&dev->close_list);
1538
1539 __dev_close_many(head);
1540
1541 list_for_each_entry_safe(dev, tmp, head, close_list) {
1542 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1543 call_netdevice_notifiers(NETDEV_DOWN, dev);
1544 if (unlink)
1545 list_del_init(&dev->close_list);
1546 }
1547}
1548EXPORT_SYMBOL(dev_close_many);
1549
1550/**
1551 * dev_close - shutdown an interface.
1552 * @dev: device to shutdown
1553 *
1554 * This function moves an active device into down state. A
1555 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1556 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1557 * chain.
1558 */
1559void dev_close(struct net_device *dev)
1560{
1561 if (dev->flags & IFF_UP) {
1562 LIST_HEAD(single);
1563
1564 list_add(&dev->close_list, &single);
1565 dev_close_many(&single, true);
1566 list_del(&single);
1567 }
1568}
1569EXPORT_SYMBOL(dev_close);
1570
1571
1572/**
1573 * dev_disable_lro - disable Large Receive Offload on a device
1574 * @dev: device
1575 *
1576 * Disable Large Receive Offload (LRO) on a net device. Must be
1577 * called under RTNL. This is needed if received packets may be
1578 * forwarded to another interface.
1579 */
1580void dev_disable_lro(struct net_device *dev)
1581{
1582 struct net_device *lower_dev;
1583 struct list_head *iter;
1584
1585 dev->wanted_features &= ~NETIF_F_LRO;
1586 netdev_update_features(dev);
1587
1588 if (unlikely(dev->features & NETIF_F_LRO))
1589 netdev_WARN(dev, "failed to disable LRO!\n");
1590
1591 netdev_for_each_lower_dev(dev, lower_dev, iter)
1592 dev_disable_lro(lower_dev);
1593}
1594EXPORT_SYMBOL(dev_disable_lro);
1595
1596/**
1597 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1598 * @dev: device
1599 *
1600 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1601 * called under RTNL. This is needed if Generic XDP is installed on
1602 * the device.
1603 */
1604static void dev_disable_gro_hw(struct net_device *dev)
1605{
1606 dev->wanted_features &= ~NETIF_F_GRO_HW;
1607 netdev_update_features(dev);
1608
1609 if (unlikely(dev->features & NETIF_F_GRO_HW))
1610 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1611}
1612
1613const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1614{
1615#define N(val) \
1616 case NETDEV_##val: \
1617 return "NETDEV_" __stringify(val);
1618 switch (cmd) {
1619 N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1620 N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1621 N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1622 N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1623 N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1624 N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1625 N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1626 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1627 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1628 N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1629 N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1630 N(XDP_FEAT_CHANGE)
1631 }
1632#undef N
1633 return "UNKNOWN_NETDEV_EVENT";
1634}
1635EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1636
1637static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1638 struct net_device *dev)
1639{
1640 struct netdev_notifier_info info = {
1641 .dev = dev,
1642 };
1643
1644 return nb->notifier_call(nb, val, &info);
1645}
1646
1647static int call_netdevice_register_notifiers(struct notifier_block *nb,
1648 struct net_device *dev)
1649{
1650 int err;
1651
1652 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1653 err = notifier_to_errno(err);
1654 if (err)
1655 return err;
1656
1657 if (!(dev->flags & IFF_UP))
1658 return 0;
1659
1660 call_netdevice_notifier(nb, NETDEV_UP, dev);
1661 return 0;
1662}
1663
1664static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1665 struct net_device *dev)
1666{
1667 if (dev->flags & IFF_UP) {
1668 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1669 dev);
1670 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1671 }
1672 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1673}
1674
1675static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1676 struct net *net)
1677{
1678 struct net_device *dev;
1679 int err;
1680
1681 for_each_netdev(net, dev) {
1682 err = call_netdevice_register_notifiers(nb, dev);
1683 if (err)
1684 goto rollback;
1685 }
1686 return 0;
1687
1688rollback:
1689 for_each_netdev_continue_reverse(net, dev)
1690 call_netdevice_unregister_notifiers(nb, dev);
1691 return err;
1692}
1693
1694static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1695 struct net *net)
1696{
1697 struct net_device *dev;
1698
1699 for_each_netdev(net, dev)
1700 call_netdevice_unregister_notifiers(nb, dev);
1701}
1702
1703static int dev_boot_phase = 1;
1704
1705/**
1706 * register_netdevice_notifier - register a network notifier block
1707 * @nb: notifier
1708 *
1709 * Register a notifier to be called when network device events occur.
1710 * The notifier passed is linked into the kernel structures and must
1711 * not be reused until it has been unregistered. A negative errno code
1712 * is returned on a failure.
1713 *
1714 * When registered, all registration and up events are replayed
1715 * to the new notifier to allow the caller to have a race-free
1716 * view of the network device list.
1717 */
1718
1719int register_netdevice_notifier(struct notifier_block *nb)
1720{
1721 struct net *net;
1722 int err;
1723
1724 /* Close race with setup_net() and cleanup_net() */
1725 down_write(&pernet_ops_rwsem);
1726 rtnl_lock();
1727 err = raw_notifier_chain_register(&netdev_chain, nb);
1728 if (err)
1729 goto unlock;
1730 if (dev_boot_phase)
1731 goto unlock;
1732 for_each_net(net) {
1733 err = call_netdevice_register_net_notifiers(nb, net);
1734 if (err)
1735 goto rollback;
1736 }
1737
1738unlock:
1739 rtnl_unlock();
1740 up_write(&pernet_ops_rwsem);
1741 return err;
1742
1743rollback:
1744 for_each_net_continue_reverse(net)
1745 call_netdevice_unregister_net_notifiers(nb, net);
1746
1747 raw_notifier_chain_unregister(&netdev_chain, nb);
1748 goto unlock;
1749}
1750EXPORT_SYMBOL(register_netdevice_notifier);
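
/*
 * Illustrative sketch (not part of the original file): a minimal notifier.
 * Because registration replays NETDEV_REGISTER/NETDEV_UP for existing
 * devices, the callback sees every device even if it is registered after
 * the devices were created. All names below are hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_REGISTER:
	case NETDEV_UP:
		pr_debug("%s: %s\n", netdev_cmd_to_name(event), dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier __maybe_unused = {
	.notifier_call = example_netdev_event,
};

/* Typically called from module init/exit:
 *	register_netdevice_notifier(&example_netdev_notifier);
 *	unregister_netdevice_notifier(&example_netdev_notifier);
 */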
1751
1752/**
1753 * unregister_netdevice_notifier - unregister a network notifier block
1754 * @nb: notifier
1755 *
1756 * Unregister a notifier previously registered by
1757 * register_netdevice_notifier(). The notifier is unlinked from the
1758 * kernel structures and may then be reused. A negative errno code
1759 * is returned on a failure.
1760 *
1761 * After unregistering, unregister and down device events are synthesized
1762 * for all devices on the device list to the removed notifier to remove
1763 * the need for special case cleanup code.
1764 */
1765
1766int unregister_netdevice_notifier(struct notifier_block *nb)
1767{
1768 struct net *net;
1769 int err;
1770
1771 /* Close race with setup_net() and cleanup_net() */
1772 down_write(&pernet_ops_rwsem);
1773 rtnl_lock();
1774 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1775 if (err)
1776 goto unlock;
1777
1778 for_each_net(net)
1779 call_netdevice_unregister_net_notifiers(nb, net);
1780
1781unlock:
1782 rtnl_unlock();
1783 up_write(&pernet_ops_rwsem);
1784 return err;
1785}
1786EXPORT_SYMBOL(unregister_netdevice_notifier);
1787
1788static int __register_netdevice_notifier_net(struct net *net,
1789 struct notifier_block *nb,
1790 bool ignore_call_fail)
1791{
1792 int err;
1793
1794 err = raw_notifier_chain_register(&net->netdev_chain, nb);
1795 if (err)
1796 return err;
1797 if (dev_boot_phase)
1798 return 0;
1799
1800 err = call_netdevice_register_net_notifiers(nb, net);
1801 if (err && !ignore_call_fail)
1802 goto chain_unregister;
1803
1804 return 0;
1805
1806chain_unregister:
1807 raw_notifier_chain_unregister(&net->netdev_chain, nb);
1808 return err;
1809}
1810
1811static int __unregister_netdevice_notifier_net(struct net *net,
1812 struct notifier_block *nb)
1813{
1814 int err;
1815
1816 err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1817 if (err)
1818 return err;
1819
1820 call_netdevice_unregister_net_notifiers(nb, net);
1821 return 0;
1822}
1823
1824/**
1825 * register_netdevice_notifier_net - register a per-netns network notifier block
1826 * @net: network namespace
1827 * @nb: notifier
1828 *
1829 * Register a notifier to be called when network device events occur.
1830 * The notifier passed is linked into the kernel structures and must
1831 * not be reused until it has been unregistered. A negative errno code
1832 * is returned on a failure.
1833 *
1834 * When registered, all registration and up events are replayed
1835 * to the new notifier to allow the caller to have a race-free
1836 * view of the network device list.
1837 */
1838
1839int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1840{
1841 int err;
1842
1843 rtnl_lock();
1844 err = __register_netdevice_notifier_net(net, nb, false);
1845 rtnl_unlock();
1846 return err;
1847}
1848EXPORT_SYMBOL(register_netdevice_notifier_net);
1849
1850/**
1851 * unregister_netdevice_notifier_net - unregister a per-netns
1852 * network notifier block
1853 * @net: network namespace
1854 * @nb: notifier
1855 *
1856 * Unregister a notifier previously registered by
1857 * register_netdevice_notifier_net(). The notifier is unlinked from the
1858 * kernel structures and may then be reused. A negative errno code
1859 * is returned on a failure.
1860 *
1861 * After unregistering, unregister and down device events are synthesized
1862 * for all devices on the device list to the removed notifier to remove
1863 * the need for special case cleanup code.
1864 */
1865
1866int unregister_netdevice_notifier_net(struct net *net,
1867 struct notifier_block *nb)
1868{
1869 int err;
1870
1871 rtnl_lock();
1872 err = __unregister_netdevice_notifier_net(net, nb);
1873 rtnl_unlock();
1874 return err;
1875}
1876EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1877
1878static void __move_netdevice_notifier_net(struct net *src_net,
1879 struct net *dst_net,
1880 struct notifier_block *nb)
1881{
1882 __unregister_netdevice_notifier_net(src_net, nb);
1883 __register_netdevice_notifier_net(dst_net, nb, true);
1884}
1885
1886int register_netdevice_notifier_dev_net(struct net_device *dev,
1887 struct notifier_block *nb,
1888 struct netdev_net_notifier *nn)
1889{
1890 int err;
1891
1892 rtnl_lock();
1893 err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1894 if (!err) {
1895 nn->nb = nb;
1896 list_add(&nn->list, &dev->net_notifier_list);
1897 }
1898 rtnl_unlock();
1899 return err;
1900}
1901EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1902
1903int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1904 struct notifier_block *nb,
1905 struct netdev_net_notifier *nn)
1906{
1907 int err;
1908
1909 rtnl_lock();
1910 list_del(&nn->list);
1911 err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1912 rtnl_unlock();
1913 return err;
1914}
1915EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1916
1917static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1918 struct net *net)
1919{
1920 struct netdev_net_notifier *nn;
1921
1922 list_for_each_entry(nn, &dev->net_notifier_list, list)
1923 __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
1924}
1925
1926/**
1927 * call_netdevice_notifiers_info - call all network notifier blocks
1928 * @val: value passed unmodified to notifier function
1929 * @info: notifier information data
1930 *
1931 * Call all network notifier blocks. Parameters and return value
1932 * are as for raw_notifier_call_chain().
1933 */
1934
1935int call_netdevice_notifiers_info(unsigned long val,
1936 struct netdev_notifier_info *info)
1937{
1938 struct net *net = dev_net(info->dev);
1939 int ret;
1940
1941 ASSERT_RTNL();
1942
1943 /* Run per-netns notifier block chain first, then run the global one.
1944 * Hopefully, one day, the global one is going to be removed after
 * all notifier block registrants get converted to be per-netns.
1946 */
1947 ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1948 if (ret & NOTIFY_STOP_MASK)
1949 return ret;
1950 return raw_notifier_call_chain(&netdev_chain, val, info);
1951}
1952
1953/**
 * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 *                                        and roll back on error
1956 * @val_up: value passed unmodified to notifier function
1957 * @val_down: value passed unmodified to the notifier function when
1958 * recovering from an error on @val_up
1959 * @info: notifier information data
1960 *
1961 * Call all per-netns network notifier blocks, but not notifier blocks on
1962 * the global notifier chain. Parameters and return value are as for
1963 * raw_notifier_call_chain_robust().
1964 */
1965
1966static int
1967call_netdevice_notifiers_info_robust(unsigned long val_up,
1968 unsigned long val_down,
1969 struct netdev_notifier_info *info)
1970{
1971 struct net *net = dev_net(info->dev);
1972
1973 ASSERT_RTNL();
1974
1975 return raw_notifier_call_chain_robust(&net->netdev_chain,
1976 val_up, val_down, info);
1977}
1978
1979static int call_netdevice_notifiers_extack(unsigned long val,
1980 struct net_device *dev,
1981 struct netlink_ext_ack *extack)
1982{
1983 struct netdev_notifier_info info = {
1984 .dev = dev,
1985 .extack = extack,
1986 };
1987
1988 return call_netdevice_notifiers_info(val, &info);
1989}
1990
1991/**
1992 * call_netdevice_notifiers - call all network notifier blocks
1993 * @val: value passed unmodified to notifier function
1994 * @dev: net_device pointer passed unmodified to notifier function
1995 *
1996 * Call all network notifier blocks. Parameters and return value
1997 * are as for raw_notifier_call_chain().
1998 */
1999
2000int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2001{
2002 return call_netdevice_notifiers_extack(val, dev, NULL);
2003}
2004EXPORT_SYMBOL(call_netdevice_notifiers);
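
/*
 * Illustrative sketch only: callers must hold RTNL. Code that has just
 * changed device state might notify listeners with one of the standard
 * NETDEV_* events, e.g. (the event chosen here is only an example):
 *
 *	ASSERT_RTNL();
 *	call_netdevice_notifiers(NETDEV_CHANGE, dev);
 */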
2005
2006/**
2007 * call_netdevice_notifiers_mtu - call all network notifier blocks
2008 * @val: value passed unmodified to notifier function
2009 * @dev: net_device pointer passed unmodified to notifier function
2010 * @arg: additional u32 argument passed to the notifier function
2011 *
2012 * Call all network notifier blocks. Parameters and return value
2013 * are as for raw_notifier_call_chain().
2014 */
2015static int call_netdevice_notifiers_mtu(unsigned long val,
2016 struct net_device *dev, u32 arg)
2017{
2018 struct netdev_notifier_info_ext info = {
2019 .info.dev = dev,
2020 .ext.mtu = arg,
2021 };
2022
2023 BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2024
2025 return call_netdevice_notifiers_info(val, &info.info);
2026}
2027
2028#ifdef CONFIG_NET_INGRESS
2029static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2030
2031void net_inc_ingress_queue(void)
2032{
2033 static_branch_inc(&ingress_needed_key);
2034}
2035EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2036
2037void net_dec_ingress_queue(void)
2038{
2039 static_branch_dec(&ingress_needed_key);
2040}
2041EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2042#endif
2043
2044#ifdef CONFIG_NET_EGRESS
2045static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2046
2047void net_inc_egress_queue(void)
2048{
2049 static_branch_inc(&egress_needed_key);
2050}
2051EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2052
2053void net_dec_egress_queue(void)
2054{
2055 static_branch_dec(&egress_needed_key);
2056}
2057EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2058#endif
2059
2060DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2061EXPORT_SYMBOL(netstamp_needed_key);
2062#ifdef CONFIG_JUMP_LABEL
2063static atomic_t netstamp_needed_deferred;
2064static atomic_t netstamp_wanted;
2065static void netstamp_clear(struct work_struct *work)
2066{
2067 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2068 int wanted;
2069
2070 wanted = atomic_add_return(deferred, &netstamp_wanted);
2071 if (wanted > 0)
2072 static_branch_enable(&netstamp_needed_key);
2073 else
2074 static_branch_disable(&netstamp_needed_key);
2075}
2076static DECLARE_WORK(netstamp_work, netstamp_clear);
2077#endif
2078
2079void net_enable_timestamp(void)
2080{
2081#ifdef CONFIG_JUMP_LABEL
2082 int wanted = atomic_read(&netstamp_wanted);
2083
2084 while (wanted > 0) {
2085 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2086 return;
2087 }
2088 atomic_inc(&netstamp_needed_deferred);
2089 schedule_work(&netstamp_work);
2090#else
2091 static_branch_inc(&netstamp_needed_key);
2092#endif
2093}
2094EXPORT_SYMBOL(net_enable_timestamp);
2095
2096void net_disable_timestamp(void)
2097{
2098#ifdef CONFIG_JUMP_LABEL
2099 int wanted = atomic_read(&netstamp_wanted);
2100
2101 while (wanted > 1) {
2102 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2103 return;
2104 }
2105 atomic_dec(&netstamp_needed_deferred);
2106 schedule_work(&netstamp_work);
2107#else
2108 static_branch_dec(&netstamp_needed_key);
2109#endif
2110}
2111EXPORT_SYMBOL(net_disable_timestamp);
2112
2113static inline void net_timestamp_set(struct sk_buff *skb)
2114{
2115 skb->tstamp = 0;
2116 skb->mono_delivery_time = 0;
2117 if (static_branch_unlikely(&netstamp_needed_key))
2118 skb->tstamp = ktime_get_real();
2119}
2120
2121#define net_timestamp_check(COND, SKB) \
2122 if (static_branch_unlikely(&netstamp_needed_key)) { \
2123 if ((COND) && !(SKB)->tstamp) \
2124 (SKB)->tstamp = ktime_get_real(); \
2125 } \
2126
2127bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2128{
2129 return __is_skb_forwardable(dev, skb, true);
2130}
2131EXPORT_SYMBOL_GPL(is_skb_forwardable);
2132
2133static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2134 bool check_mtu)
2135{
2136 int ret = ____dev_forward_skb(dev, skb, check_mtu);
2137
2138 if (likely(!ret)) {
2139 skb->protocol = eth_type_trans(skb, dev);
2140 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2141 }
2142
2143 return ret;
2144}
2145
2146int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2147{
2148 return __dev_forward_skb2(dev, skb, true);
2149}
2150EXPORT_SYMBOL_GPL(__dev_forward_skb);
2151
2152/**
2153 * dev_forward_skb - loopback an skb to another netif
2154 *
2155 * @dev: destination network device
2156 * @skb: buffer to forward
2157 *
2158 * return values:
2159 * NET_RX_SUCCESS (no congestion)
2160 * NET_RX_DROP (packet was dropped, but freed)
2161 *
2162 * dev_forward_skb can be used for injecting an skb from the
2163 * start_xmit function of one device into the receive queue
2164 * of another device.
2165 *
2166 * The receiving device may be in another namespace, so
2167 * we have to clear all information in the skb that could
2168 * impact namespace isolation.
2169 */
2170int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2171{
2172 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2173}
2174EXPORT_SYMBOL_GPL(dev_forward_skb);
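
/*
 * Illustrative sketch only: a virtual driver's ndo_start_xmit() could hand
 * an skb to a peer device like this ("peer" and the stats fields are
 * hypothetical; on NET_RX_DROP the skb has already been freed):
 *
 *	unsigned int len = skb->len;
 *
 *	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *		priv->tx_bytes += len;
 *	else
 *		priv->tx_dropped++;
 */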
2175
2176int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2177{
2178 return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2179}
2180
2181static inline int deliver_skb(struct sk_buff *skb,
2182 struct packet_type *pt_prev,
2183 struct net_device *orig_dev)
2184{
2185 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2186 return -ENOMEM;
2187 refcount_inc(&skb->users);
2188 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2189}
2190
2191static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2192 struct packet_type **pt,
2193 struct net_device *orig_dev,
2194 __be16 type,
2195 struct list_head *ptype_list)
2196{
2197 struct packet_type *ptype, *pt_prev = *pt;
2198
2199 list_for_each_entry_rcu(ptype, ptype_list, list) {
2200 if (ptype->type != type)
2201 continue;
2202 if (pt_prev)
2203 deliver_skb(skb, pt_prev, orig_dev);
2204 pt_prev = ptype;
2205 }
2206 *pt = pt_prev;
2207}
2208
2209static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2210{
2211 if (!ptype->af_packet_priv || !skb->sk)
2212 return false;
2213
2214 if (ptype->id_match)
2215 return ptype->id_match(ptype, skb->sk);
2216 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2217 return true;
2218
2219 return false;
2220}
2221
2222/**
2223 * dev_nit_active - return true if any network interface taps are in use
2224 *
2225 * @dev: network device to check for the presence of taps
2226 */
2227bool dev_nit_active(struct net_device *dev)
2228{
2229 return !list_empty(&net_hotdata.ptype_all) ||
2230 !list_empty(&dev->ptype_all);
2231}
2232EXPORT_SYMBOL_GPL(dev_nit_active);
2233
2234/*
2235 * Support routine. Sends outgoing frames to any network
2236 * taps currently in use.
2237 */
2238
2239void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2240{
2241 struct list_head *ptype_list = &net_hotdata.ptype_all;
2242 struct packet_type *ptype, *pt_prev = NULL;
2243 struct sk_buff *skb2 = NULL;
2244
2245 rcu_read_lock();
2246again:
2247 list_for_each_entry_rcu(ptype, ptype_list, list) {
2248 if (READ_ONCE(ptype->ignore_outgoing))
2249 continue;
2250
2251 /* Never send packets back to the socket
2252 * they originated from - MvS (miquels@drinkel.ow.org)
2253 */
2254 if (skb_loop_sk(ptype, skb))
2255 continue;
2256
2257 if (pt_prev) {
2258 deliver_skb(skb2, pt_prev, skb->dev);
2259 pt_prev = ptype;
2260 continue;
2261 }
2262
2263 /* need to clone skb, done only once */
2264 skb2 = skb_clone(skb, GFP_ATOMIC);
2265 if (!skb2)
2266 goto out_unlock;
2267
2268 net_timestamp_set(skb2);
2269
2270 /* skb->nh should be correctly
2271 * set by sender, so that the second statement is
2272 * just protection against buggy protocols.
2273 */
2274 skb_reset_mac_header(skb2);
2275
2276 if (skb_network_header(skb2) < skb2->data ||
2277 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2278 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2279 ntohs(skb2->protocol),
2280 dev->name);
2281 skb_reset_network_header(skb2);
2282 }
2283
2284 skb2->transport_header = skb2->network_header;
2285 skb2->pkt_type = PACKET_OUTGOING;
2286 pt_prev = ptype;
2287 }
2288
2289 if (ptype_list == &net_hotdata.ptype_all) {
2290 ptype_list = &dev->ptype_all;
2291 goto again;
2292 }
2293out_unlock:
2294 if (pt_prev) {
2295 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2296 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2297 else
2298 kfree_skb(skb2);
2299 }
2300 rcu_read_unlock();
2301}
2302EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2303
2304/**
2305 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2306 * @dev: Network device
2307 * @txq: number of queues available
2308 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify that each tc mapping remains valid and,
 * if not, zero the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case, if
 * TC0 is invalid, nothing can be done, so priority mappings are disabled
 * entirely. It is expected that drivers will fix this mapping if they
 * can before calling netif_set_real_num_tx_queues.
2316 */
2317static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2318{
2319 int i;
2320 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2321
2322 /* If TC0 is invalidated disable TC mapping */
2323 if (tc->offset + tc->count > txq) {
2324 netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2325 dev->num_tc = 0;
2326 return;
2327 }
2328
2329 /* Invalidated prio to tc mappings set to TC0 */
2330 for (i = 1; i < TC_BITMASK + 1; i++) {
2331 int q = netdev_get_prio_tc_map(dev, i);
2332
2333 tc = &dev->tc_to_txq[q];
2334 if (tc->offset + tc->count > txq) {
2335 netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2336 i, q);
2337 netdev_set_prio_tc_map(dev, i, 0);
2338 }
2339 }
2340}
2341
2342int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2343{
2344 if (dev->num_tc) {
2345 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2346 int i;
2347
2348 /* walk through the TCs and see if it falls into any of them */
2349 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2350 if ((txq - tc->offset) < tc->count)
2351 return i;
2352 }
2353
2354 /* didn't find it, just return -1 to indicate no match */
2355 return -1;
2356 }
2357
2358 return 0;
2359}
2360EXPORT_SYMBOL(netdev_txq_to_tc);
2361
2362#ifdef CONFIG_XPS
2363static struct static_key xps_needed __read_mostly;
2364static struct static_key xps_rxqs_needed __read_mostly;
2365static DEFINE_MUTEX(xps_map_mutex);
2366#define xmap_dereference(P) \
2367 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2368
2369static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2370 struct xps_dev_maps *old_maps, int tci, u16 index)
2371{
2372 struct xps_map *map = NULL;
2373 int pos;
2374
2375 map = xmap_dereference(dev_maps->attr_map[tci]);
2376 if (!map)
2377 return false;
2378
2379 for (pos = map->len; pos--;) {
2380 if (map->queues[pos] != index)
2381 continue;
2382
2383 if (map->len > 1) {
2384 map->queues[pos] = map->queues[--map->len];
2385 break;
2386 }
2387
2388 if (old_maps)
2389 RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2390 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2391 kfree_rcu(map, rcu);
2392 return false;
2393 }
2394
2395 return true;
2396}
2397
2398static bool remove_xps_queue_cpu(struct net_device *dev,
2399 struct xps_dev_maps *dev_maps,
2400 int cpu, u16 offset, u16 count)
2401{
2402 int num_tc = dev_maps->num_tc;
2403 bool active = false;
2404 int tci;
2405
2406 for (tci = cpu * num_tc; num_tc--; tci++) {
2407 int i, j;
2408
2409 for (i = count, j = offset; i--; j++) {
2410 if (!remove_xps_queue(dev_maps, NULL, tci, j))
2411 break;
2412 }
2413
2414 active |= i < 0;
2415 }
2416
2417 return active;
2418}
2419
2420static void reset_xps_maps(struct net_device *dev,
2421 struct xps_dev_maps *dev_maps,
2422 enum xps_map_type type)
2423{
2424 static_key_slow_dec_cpuslocked(&xps_needed);
2425 if (type == XPS_RXQS)
2426 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2427
2428 RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2429
2430 kfree_rcu(dev_maps, rcu);
2431}
2432
2433static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2434 u16 offset, u16 count)
2435{
2436 struct xps_dev_maps *dev_maps;
2437 bool active = false;
2438 int i, j;
2439
2440 dev_maps = xmap_dereference(dev->xps_maps[type]);
2441 if (!dev_maps)
2442 return;
2443
2444 for (j = 0; j < dev_maps->nr_ids; j++)
2445 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2446 if (!active)
2447 reset_xps_maps(dev, dev_maps, type);
2448
2449 if (type == XPS_CPUS) {
2450 for (i = offset + (count - 1); count--; i--)
2451 netdev_queue_numa_node_write(
2452 netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2453 }
2454}
2455
2456static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2457 u16 count)
2458{
2459 if (!static_key_false(&xps_needed))
2460 return;
2461
2462 cpus_read_lock();
2463 mutex_lock(&xps_map_mutex);
2464
2465 if (static_key_false(&xps_rxqs_needed))
2466 clean_xps_maps(dev, XPS_RXQS, offset, count);
2467
2468 clean_xps_maps(dev, XPS_CPUS, offset, count);
2469
2470 mutex_unlock(&xps_map_mutex);
2471 cpus_read_unlock();
2472}
2473
2474static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2475{
2476 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2477}
2478
2479static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2480 u16 index, bool is_rxqs_map)
2481{
2482 struct xps_map *new_map;
2483 int alloc_len = XPS_MIN_MAP_ALLOC;
2484 int i, pos;
2485
2486 for (pos = 0; map && pos < map->len; pos++) {
2487 if (map->queues[pos] != index)
2488 continue;
2489 return map;
2490 }
2491
2492 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2493 if (map) {
2494 if (pos < map->alloc_len)
2495 return map;
2496
2497 alloc_len = map->alloc_len * 2;
2498 }
2499
2500 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2501 * map
2502 */
2503 if (is_rxqs_map)
2504 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2505 else
2506 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2507 cpu_to_node(attr_index));
2508 if (!new_map)
2509 return NULL;
2510
2511 for (i = 0; i < pos; i++)
2512 new_map->queues[i] = map->queues[i];
2513 new_map->alloc_len = alloc_len;
2514 new_map->len = pos;
2515
2516 return new_map;
2517}
2518
2519/* Copy xps maps at a given index */
2520static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2521 struct xps_dev_maps *new_dev_maps, int index,
2522 int tc, bool skip_tc)
2523{
2524 int i, tci = index * dev_maps->num_tc;
2525 struct xps_map *map;
2526
2527 /* copy maps belonging to foreign traffic classes */
2528 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2529 if (i == tc && skip_tc)
2530 continue;
2531
2532 /* fill in the new device map from the old device map */
2533 map = xmap_dereference(dev_maps->attr_map[tci]);
2534 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2535 }
2536}
2537
2538/* Must be called under cpus_read_lock */
2539int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2540 u16 index, enum xps_map_type type)
2541{
2542 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2543 const unsigned long *online_mask = NULL;
2544 bool active = false, copy = false;
2545 int i, j, tci, numa_node_id = -2;
2546 int maps_sz, num_tc = 1, tc = 0;
2547 struct xps_map *map, *new_map;
2548 unsigned int nr_ids;
2549
2550 WARN_ON_ONCE(index >= dev->num_tx_queues);
2551
2552 if (dev->num_tc) {
2553 /* Do not allow XPS on subordinate device directly */
2554 num_tc = dev->num_tc;
2555 if (num_tc < 0)
2556 return -EINVAL;
2557
2558 /* If queue belongs to subordinate dev use its map */
2559 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2560
2561 tc = netdev_txq_to_tc(dev, index);
2562 if (tc < 0)
2563 return -EINVAL;
2564 }
2565
2566 mutex_lock(&xps_map_mutex);
2567
2568 dev_maps = xmap_dereference(dev->xps_maps[type]);
2569 if (type == XPS_RXQS) {
2570 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2571 nr_ids = dev->num_rx_queues;
2572 } else {
2573 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2574 if (num_possible_cpus() > 1)
2575 online_mask = cpumask_bits(cpu_online_mask);
2576 nr_ids = nr_cpu_ids;
2577 }
2578
2579 if (maps_sz < L1_CACHE_BYTES)
2580 maps_sz = L1_CACHE_BYTES;
2581
2582 /* The old dev_maps could be larger or smaller than the one we're
2583 * setting up now, as dev->num_tc or nr_ids could have been updated in
2584 * between. We could try to be smart, but let's be safe instead and only
2585 * copy foreign traffic classes if the two map sizes match.
2586 */
2587 if (dev_maps &&
2588 dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2589 copy = true;
2590
2591 /* allocate memory for queue storage */
2592 for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2593 j < nr_ids;) {
2594 if (!new_dev_maps) {
2595 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2596 if (!new_dev_maps) {
2597 mutex_unlock(&xps_map_mutex);
2598 return -ENOMEM;
2599 }
2600
2601 new_dev_maps->nr_ids = nr_ids;
2602 new_dev_maps->num_tc = num_tc;
2603 }
2604
2605 tci = j * num_tc + tc;
2606 map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2607
2608 map = expand_xps_map(map, j, index, type == XPS_RXQS);
2609 if (!map)
2610 goto error;
2611
2612 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2613 }
2614
2615 if (!new_dev_maps)
2616 goto out_no_new_maps;
2617
2618 if (!dev_maps) {
2619 /* Increment static keys at most once per type */
2620 static_key_slow_inc_cpuslocked(&xps_needed);
2621 if (type == XPS_RXQS)
2622 static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2623 }
2624
2625 for (j = 0; j < nr_ids; j++) {
2626 bool skip_tc = false;
2627
2628 tci = j * num_tc + tc;
2629 if (netif_attr_test_mask(j, mask, nr_ids) &&
2630 netif_attr_test_online(j, online_mask, nr_ids)) {
2631 /* add tx-queue to CPU/rx-queue maps */
2632 int pos = 0;
2633
2634 skip_tc = true;
2635
2636 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2637 while ((pos < map->len) && (map->queues[pos] != index))
2638 pos++;
2639
2640 if (pos == map->len)
2641 map->queues[map->len++] = index;
2642#ifdef CONFIG_NUMA
2643 if (type == XPS_CPUS) {
2644 if (numa_node_id == -2)
2645 numa_node_id = cpu_to_node(j);
2646 else if (numa_node_id != cpu_to_node(j))
2647 numa_node_id = -1;
2648 }
2649#endif
2650 }
2651
2652 if (copy)
2653 xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2654 skip_tc);
2655 }
2656
2657 rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2658
2659 /* Cleanup old maps */
2660 if (!dev_maps)
2661 goto out_no_old_maps;
2662
2663 for (j = 0; j < dev_maps->nr_ids; j++) {
2664 for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2665 map = xmap_dereference(dev_maps->attr_map[tci]);
2666 if (!map)
2667 continue;
2668
2669 if (copy) {
2670 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2671 if (map == new_map)
2672 continue;
2673 }
2674
2675 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2676 kfree_rcu(map, rcu);
2677 }
2678 }
2679
2680 old_dev_maps = dev_maps;
2681
2682out_no_old_maps:
2683 dev_maps = new_dev_maps;
2684 active = true;
2685
2686out_no_new_maps:
2687 if (type == XPS_CPUS)
2688 /* update Tx queue numa node */
2689 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2690 (numa_node_id >= 0) ?
2691 numa_node_id : NUMA_NO_NODE);
2692
2693 if (!dev_maps)
2694 goto out_no_maps;
2695
2696 /* removes tx-queue from unused CPUs/rx-queues */
2697 for (j = 0; j < dev_maps->nr_ids; j++) {
2698 tci = j * dev_maps->num_tc;
2699
2700 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2701 if (i == tc &&
2702 netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2703 netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2704 continue;
2705
2706 active |= remove_xps_queue(dev_maps,
2707 copy ? old_dev_maps : NULL,
2708 tci, index);
2709 }
2710 }
2711
2712 if (old_dev_maps)
2713 kfree_rcu(old_dev_maps, rcu);
2714
2715 /* free map if not active */
2716 if (!active)
2717 reset_xps_maps(dev, dev_maps, type);
2718
2719out_no_maps:
2720 mutex_unlock(&xps_map_mutex);
2721
2722 return 0;
2723error:
2724 /* remove any maps that we added */
2725 for (j = 0; j < nr_ids; j++) {
2726 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2727 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2728 map = copy ?
2729 xmap_dereference(dev_maps->attr_map[tci]) :
2730 NULL;
2731 if (new_map && new_map != map)
2732 kfree(new_map);
2733 }
2734 }
2735
2736 mutex_unlock(&xps_map_mutex);
2737
2738 kfree(new_dev_maps);
2739 return -ENOMEM;
2740}
2741EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2742
2743int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2744 u16 index)
2745{
2746 int ret;
2747
2748 cpus_read_lock();
2749 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2750 cpus_read_unlock();
2751
2752 return ret;
2753}
2754EXPORT_SYMBOL(netif_set_xps_queue);
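
/*
 * Illustrative sketch only: a multiqueue driver that pairs each Tx queue
 * with a CPU could publish an XPS hint per queue (the 1:1 pairing below is
 * just an example policy):
 *
 *	for (i = 0; i < dev->real_num_tx_queues; i++)
 *		netif_set_xps_queue(dev, cpumask_of(i % nr_cpu_ids), i);
 */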
2755
2756#endif
2757static void netdev_unbind_all_sb_channels(struct net_device *dev)
2758{
2759 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2760
2761 /* Unbind any subordinate channels */
2762 while (txq-- != &dev->_tx[0]) {
2763 if (txq->sb_dev)
2764 netdev_unbind_sb_channel(dev, txq->sb_dev);
2765 }
2766}
2767
2768void netdev_reset_tc(struct net_device *dev)
2769{
2770#ifdef CONFIG_XPS
2771 netif_reset_xps_queues_gt(dev, 0);
2772#endif
2773 netdev_unbind_all_sb_channels(dev);
2774
2775 /* Reset TC configuration of device */
2776 dev->num_tc = 0;
2777 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2778 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2779}
2780EXPORT_SYMBOL(netdev_reset_tc);
2781
2782int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2783{
2784 if (tc >= dev->num_tc)
2785 return -EINVAL;
2786
2787#ifdef CONFIG_XPS
2788 netif_reset_xps_queues(dev, offset, count);
2789#endif
2790 dev->tc_to_txq[tc].count = count;
2791 dev->tc_to_txq[tc].offset = offset;
2792 return 0;
2793}
2794EXPORT_SYMBOL(netdev_set_tc_queue);
2795
2796int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2797{
2798 if (num_tc > TC_MAX_QUEUE)
2799 return -EINVAL;
2800
2801#ifdef CONFIG_XPS
2802 netif_reset_xps_queues_gt(dev, 0);
2803#endif
2804 netdev_unbind_all_sb_channels(dev);
2805
2806 dev->num_tc = num_tc;
2807 return 0;
2808}
2809EXPORT_SYMBOL(netdev_set_num_tc);
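
/*
 * Illustrative sketch only: a driver exposing two traffic classes over
 * eight Tx queues might set up its tc_to_txq and prio maps as below
 * (the queue split and priority mapping are example values):
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	(TC0: queues 0-3)
 *	netdev_set_tc_queue(dev, 1, 4, 4);	(TC1: queues 4-7)
 *	for (prio = 0; prio <= TC_BITMASK; prio++)
 *		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
 */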
2810
2811void netdev_unbind_sb_channel(struct net_device *dev,
2812 struct net_device *sb_dev)
2813{
2814 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2815
2816#ifdef CONFIG_XPS
2817 netif_reset_xps_queues_gt(sb_dev, 0);
2818#endif
2819 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2820 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2821
2822 while (txq-- != &dev->_tx[0]) {
2823 if (txq->sb_dev == sb_dev)
2824 txq->sb_dev = NULL;
2825 }
2826}
2827EXPORT_SYMBOL(netdev_unbind_sb_channel);
2828
2829int netdev_bind_sb_channel_queue(struct net_device *dev,
2830 struct net_device *sb_dev,
2831 u8 tc, u16 count, u16 offset)
2832{
2833 /* Make certain the sb_dev and dev are already configured */
2834 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2835 return -EINVAL;
2836
2837 /* We cannot hand out queues we don't have */
2838 if ((offset + count) > dev->real_num_tx_queues)
2839 return -EINVAL;
2840
2841 /* Record the mapping */
2842 sb_dev->tc_to_txq[tc].count = count;
2843 sb_dev->tc_to_txq[tc].offset = offset;
2844
2845 /* Provide a way for Tx queue to find the tc_to_txq map or
2846 * XPS map for itself.
2847 */
2848 while (count--)
2849 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2850
2851 return 0;
2852}
2853EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2854
2855int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2856{
2857 /* Do not use a multiqueue device to represent a subordinate channel */
2858 if (netif_is_multiqueue(dev))
2859 return -ENODEV;
2860
2861 /* We allow channels 1 - 32767 to be used for subordinate channels.
2862 * Channel 0 is meant to be "native" mode and used only to represent
2863 * the main root device. We allow writing 0 to reset the device back
2864 * to normal mode after being used as a subordinate channel.
2865 */
2866 if (channel > S16_MAX)
2867 return -EINVAL;
2868
2869 dev->num_tc = -channel;
2870
2871 return 0;
2872}
2873EXPORT_SYMBOL(netdev_set_sb_channel);
2874
2875/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2878 */
2879int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2880{
2881 bool disabling;
2882 int rc;
2883
2884 disabling = txq < dev->real_num_tx_queues;
2885
2886 if (txq < 1 || txq > dev->num_tx_queues)
2887 return -EINVAL;
2888
2889 if (dev->reg_state == NETREG_REGISTERED ||
2890 dev->reg_state == NETREG_UNREGISTERING) {
2891 ASSERT_RTNL();
2892
2893 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2894 txq);
2895 if (rc)
2896 return rc;
2897
2898 if (dev->num_tc)
2899 netif_setup_tc(dev, txq);
2900
2901 dev_qdisc_change_real_num_tx(dev, txq);
2902
2903 dev->real_num_tx_queues = txq;
2904
2905 if (disabling) {
2906 synchronize_net();
2907 qdisc_reset_all_tx_gt(dev, txq);
2908#ifdef CONFIG_XPS
2909 netif_reset_xps_queues_gt(dev, txq);
2910#endif
2911 }
2912 } else {
2913 dev->real_num_tx_queues = txq;
2914 }
2915
2916 return 0;
2917}
2918EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2919
2920#ifdef CONFIG_SYSFS
2921/**
2922 * netif_set_real_num_rx_queues - set actual number of RX queues used
2923 * @dev: Network device
2924 * @rxq: Actual number of RX queues
2925 *
2926 * This must be called either with the rtnl_lock held or before
2927 * registration of the net device. Returns 0 on success, or a
2928 * negative error code. If called before registration, it always
2929 * succeeds.
2930 */
2931int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2932{
2933 int rc;
2934
2935 if (rxq < 1 || rxq > dev->num_rx_queues)
2936 return -EINVAL;
2937
2938 if (dev->reg_state == NETREG_REGISTERED) {
2939 ASSERT_RTNL();
2940
2941 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2942 rxq);
2943 if (rc)
2944 return rc;
2945 }
2946
2947 dev->real_num_rx_queues = rxq;
2948 return 0;
2949}
2950EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2951#endif
2952
2953/**
2954 * netif_set_real_num_queues - set actual number of RX and TX queues used
2955 * @dev: Network device
2956 * @txq: Actual number of TX queues
2957 * @rxq: Actual number of RX queues
2958 *
2959 * Set the real number of both TX and RX queues.
2960 * Does nothing if the number of queues is already correct.
2961 */
2962int netif_set_real_num_queues(struct net_device *dev,
2963 unsigned int txq, unsigned int rxq)
2964{
2965 unsigned int old_rxq = dev->real_num_rx_queues;
2966 int err;
2967
2968 if (txq < 1 || txq > dev->num_tx_queues ||
2969 rxq < 1 || rxq > dev->num_rx_queues)
2970 return -EINVAL;
2971
2972 /* Start from increases, so the error path only does decreases -
2973 * decreases can't fail.
2974 */
2975 if (rxq > dev->real_num_rx_queues) {
2976 err = netif_set_real_num_rx_queues(dev, rxq);
2977 if (err)
2978 return err;
2979 }
2980 if (txq > dev->real_num_tx_queues) {
2981 err = netif_set_real_num_tx_queues(dev, txq);
2982 if (err)
2983 goto undo_rx;
2984 }
2985 if (rxq < dev->real_num_rx_queues)
2986 WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
2987 if (txq < dev->real_num_tx_queues)
2988 WARN_ON(netif_set_real_num_tx_queues(dev, txq));
2989
2990 return 0;
2991undo_rx:
2992 WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
2993 return err;
2994}
2995EXPORT_SYMBOL(netif_set_real_num_queues);
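
/*
 * Illustrative sketch only: after renegotiating channel counts, a driver
 * holding RTNL could resize both directions in one call; "channels" is a
 * hypothetical value obtained from the hardware:
 *
 *	err = netif_set_real_num_queues(dev, channels, channels);
 *	if (err)
 *		goto err_restore_channels;
 */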
2996
2997/**
2998 * netif_set_tso_max_size() - set the max size of TSO frames supported
2999 * @dev: netdev to update
3000 * @size: max skb->len of a TSO frame
3001 *
3002 * Set the limit on the size of TSO super-frames the device can handle.
3003 * Unless explicitly set the stack will assume the value of
3004 * %GSO_LEGACY_MAX_SIZE.
3005 */
3006void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3007{
3008 dev->tso_max_size = min(GSO_MAX_SIZE, size);
3009 if (size < READ_ONCE(dev->gso_max_size))
3010 netif_set_gso_max_size(dev, size);
3011 if (size < READ_ONCE(dev->gso_ipv4_max_size))
3012 netif_set_gso_ipv4_max_size(dev, size);
3013}
3014EXPORT_SYMBOL(netif_set_tso_max_size);
3015
3016/**
3017 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3018 * @dev: netdev to update
3019 * @segs: max number of TCP segments
3020 *
3021 * Set the limit on the number of TCP segments the device can generate from
3022 * a single TSO super-frame.
3023 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3024 */
3025void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3026{
3027 dev->tso_max_segs = segs;
3028 if (segs < READ_ONCE(dev->gso_max_segs))
3029 netif_set_gso_max_segs(dev, segs);
3030}
3031EXPORT_SYMBOL(netif_set_tso_max_segs);
3032
3033/**
3034 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3035 * @to: netdev to update
3036 * @from: netdev from which to copy the limits
3037 */
3038void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3039{
3040 netif_set_tso_max_size(to, from->tso_max_size);
3041 netif_set_tso_max_segs(to, from->tso_max_segs);
3042}
3043EXPORT_SYMBOL(netif_inherit_tso_max);
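
/*
 * Illustrative sketch only: a hardware driver limited to, say, 64 kB
 * super-frames of at most 32 segments would set explicit caps, while a
 * stacked device typically just inherits the limits of its lower device
 * (numbers and device roles here are examples):
 *
 *	netif_set_tso_max_size(dev, 65536);
 *	netif_set_tso_max_segs(dev, 32);
 *
 *	netif_inherit_tso_max(upper_dev, lower_dev);
 */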
3044
3045/**
3046 * netif_get_num_default_rss_queues - default number of RSS queues
3047 *
 * The default value is the number of physical cores if there are only 1 or 2,
 * or that number divided by 2 if there are more.
3050 */
3051int netif_get_num_default_rss_queues(void)
3052{
3053 cpumask_var_t cpus;
3054 int cpu, count = 0;
3055
3056 if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3057 return 1;
3058
3059 cpumask_copy(cpus, cpu_online_mask);
3060 for_each_cpu(cpu, cpus) {
3061 ++count;
3062 cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3063 }
3064 free_cpumask_var(cpus);
3065
3066 return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3067}
3068EXPORT_SYMBOL(netif_get_num_default_rss_queues);
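
/*
 * Illustrative sketch only: drivers commonly clamp the stack's suggestion
 * to whatever the hardware supports (MY_HW_MAX_RSS_QUEUES is a made-up
 * limit for this example):
 *
 *	nr_rss = min_t(int, netif_get_num_default_rss_queues(),
 *		       MY_HW_MAX_RSS_QUEUES);
 */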
3069
3070static void __netif_reschedule(struct Qdisc *q)
3071{
3072 struct softnet_data *sd;
3073 unsigned long flags;
3074
3075 local_irq_save(flags);
3076 sd = this_cpu_ptr(&softnet_data);
3077 q->next_sched = NULL;
3078 *sd->output_queue_tailp = q;
3079 sd->output_queue_tailp = &q->next_sched;
3080 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3081 local_irq_restore(flags);
3082}
3083
3084void __netif_schedule(struct Qdisc *q)
3085{
3086 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3087 __netif_reschedule(q);
3088}
3089EXPORT_SYMBOL(__netif_schedule);
3090
3091struct dev_kfree_skb_cb {
3092 enum skb_drop_reason reason;
3093};
3094
3095static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3096{
3097 return (struct dev_kfree_skb_cb *)skb->cb;
3098}
3099
3100void netif_schedule_queue(struct netdev_queue *txq)
3101{
3102 rcu_read_lock();
3103 if (!netif_xmit_stopped(txq)) {
3104 struct Qdisc *q = rcu_dereference(txq->qdisc);
3105
3106 __netif_schedule(q);
3107 }
3108 rcu_read_unlock();
3109}
3110EXPORT_SYMBOL(netif_schedule_queue);
3111
3112void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3113{
3114 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3115 struct Qdisc *q;
3116
3117 rcu_read_lock();
3118 q = rcu_dereference(dev_queue->qdisc);
3119 __netif_schedule(q);
3120 rcu_read_unlock();
3121 }
3122}
3123EXPORT_SYMBOL(netif_tx_wake_queue);
3124
3125void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3126{
3127 unsigned long flags;
3128
3129 if (unlikely(!skb))
3130 return;
3131
3132 if (likely(refcount_read(&skb->users) == 1)) {
3133 smp_rmb();
3134 refcount_set(&skb->users, 0);
3135 } else if (likely(!refcount_dec_and_test(&skb->users))) {
3136 return;
3137 }
3138 get_kfree_skb_cb(skb)->reason = reason;
3139 local_irq_save(flags);
3140 skb->next = __this_cpu_read(softnet_data.completion_queue);
3141 __this_cpu_write(softnet_data.completion_queue, skb);
3142 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3143 local_irq_restore(flags);
3144}
3145EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3146
3147void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3148{
3149 if (in_hardirq() || irqs_disabled())
3150 dev_kfree_skb_irq_reason(skb, reason);
3151 else
3152 kfree_skb_reason(skb, reason);
3153}
3154EXPORT_SYMBOL(dev_kfree_skb_any_reason);
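
/*
 * Illustrative sketch only: a driver path that may run in either hard IRQ
 * or process context (for instance an error or shutdown path) should free
 * skbs through the context-agnostic dev_kfree_skb_any() wrapper, which
 * ends up here with a generic drop reason:
 *
 *	if (unlikely(!netif_device_present(dev))) {
 *		dev_kfree_skb_any(skb);
 *		return NETDEV_TX_OK;
 *	}
 */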
3155
3156
3157/**
3158 * netif_device_detach - mark device as removed
3159 * @dev: network device
3160 *
 * Mark device as removed from the system and therefore no longer available.
3162 */
3163void netif_device_detach(struct net_device *dev)
3164{
3165 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3166 netif_running(dev)) {
3167 netif_tx_stop_all_queues(dev);
3168 }
3169}
3170EXPORT_SYMBOL(netif_device_detach);
3171
3172/**
3173 * netif_device_attach - mark device as attached
3174 * @dev: network device
3175 *
 * Mark device as attached to the system and restart it if needed.
3177 */
3178void netif_device_attach(struct net_device *dev)
3179{
3180 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3181 netif_running(dev)) {
3182 netif_tx_wake_all_queues(dev);
3183 __netdev_watchdog_up(dev);
3184 }
3185}
3186EXPORT_SYMBOL(netif_device_attach);
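
/*
 * Illustrative sketch only: drivers commonly pair these helpers in their
 * suspend/resume callbacks so the stack stops handing them packets while
 * the hardware is away (the callback names below are hypothetical):
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */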
3187
3188/*
 * Returns a Tx hash based on the given packet descriptor and the number of
 * Tx queues to be used as a distribution range.
3191 */
3192static u16 skb_tx_hash(const struct net_device *dev,
3193 const struct net_device *sb_dev,
3194 struct sk_buff *skb)
3195{
3196 u32 hash;
3197 u16 qoffset = 0;
3198 u16 qcount = dev->real_num_tx_queues;
3199
3200 if (dev->num_tc) {
3201 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3202
3203 qoffset = sb_dev->tc_to_txq[tc].offset;
3204 qcount = sb_dev->tc_to_txq[tc].count;
3205 if (unlikely(!qcount)) {
3206 net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3207 sb_dev->name, qoffset, tc);
3208 qoffset = 0;
3209 qcount = dev->real_num_tx_queues;
3210 }
3211 }
3212
3213 if (skb_rx_queue_recorded(skb)) {
3214 DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3215 hash = skb_get_rx_queue(skb);
3216 if (hash >= qoffset)
3217 hash -= qoffset;
3218 while (unlikely(hash >= qcount))
3219 hash -= qcount;
3220 return hash + qoffset;
3221 }
3222
3223 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3224}
3225
3226void skb_warn_bad_offload(const struct sk_buff *skb)
3227{
3228 static const netdev_features_t null_features;
3229 struct net_device *dev = skb->dev;
3230 const char *name = "";
3231
3232 if (!net_ratelimit())
3233 return;
3234
3235 if (dev) {
3236 if (dev->dev.parent)
3237 name = dev_driver_string(dev->dev.parent);
3238 else
3239 name = netdev_name(dev);
3240 }
3241 skb_dump(KERN_WARNING, skb, false);
3242 WARN(1, "%s: caps=(%pNF, %pNF)\n",
3243 name, dev ? &dev->features : &null_features,
3244 skb->sk ? &skb->sk->sk_route_caps : &null_features);
3245}
3246
3247/*
 * Invalidate the hardware checksum when a packet is to be mangled, and
 * complete the checksum manually on the outgoing path.
3250 */
3251int skb_checksum_help(struct sk_buff *skb)
3252{
3253 __wsum csum;
3254 int ret = 0, offset;
3255
3256 if (skb->ip_summed == CHECKSUM_COMPLETE)
3257 goto out_set_summed;
3258
3259 if (unlikely(skb_is_gso(skb))) {
3260 skb_warn_bad_offload(skb);
3261 return -EINVAL;
3262 }
3263
3264 /* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity: checksum could be wrong.
3266 */
3267 if (skb_has_shared_frag(skb)) {
3268 ret = __skb_linearize(skb);
3269 if (ret)
3270 goto out;
3271 }
3272
3273 offset = skb_checksum_start_offset(skb);
3274 ret = -EINVAL;
3275 if (unlikely(offset >= skb_headlen(skb))) {
3276 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3277 WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3278 offset, skb_headlen(skb));
3279 goto out;
3280 }
3281 csum = skb_checksum(skb, offset, skb->len - offset, 0);
3282
3283 offset += skb->csum_offset;
3284 if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3285 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3286 WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3287 offset + sizeof(__sum16), skb_headlen(skb));
3288 goto out;
3289 }
3290 ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3291 if (ret)
3292 goto out;
3293
3294 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3295out_set_summed:
3296 skb->ip_summed = CHECKSUM_NONE;
3297out:
3298 return ret;
3299}
3300EXPORT_SYMBOL(skb_checksum_help);
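
/*
 * Illustrative sketch only: a driver whose hardware cannot checksum a
 * particular packet can fall back to software completion before queueing
 * it (the surrounding xmit context and the "drop" label are assumed):
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
 *		goto drop;
 */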
3301
3302int skb_crc32c_csum_help(struct sk_buff *skb)
3303{
3304 __le32 crc32c_csum;
3305 int ret = 0, offset, start;
3306
3307 if (skb->ip_summed != CHECKSUM_PARTIAL)
3308 goto out;
3309
3310 if (unlikely(skb_is_gso(skb)))
3311 goto out;
3312
3313 /* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity: checksum could be wrong.
3315 */
3316 if (unlikely(skb_has_shared_frag(skb))) {
3317 ret = __skb_linearize(skb);
3318 if (ret)
3319 goto out;
3320 }
3321 start = skb_checksum_start_offset(skb);
3322 offset = start + offsetof(struct sctphdr, checksum);
3323 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3324 ret = -EINVAL;
3325 goto out;
3326 }
3327
3328 ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3329 if (ret)
3330 goto out;
3331
3332 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3333 skb->len - start, ~(__u32)0,
3334 crc32c_csum_stub));
3335 *(__le32 *)(skb->data + offset) = crc32c_csum;
3336 skb_reset_csum_not_inet(skb);
3337out:
3338 return ret;
3339}
3340
3341__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3342{
3343 __be16 type = skb->protocol;
3344
3345 /* Tunnel gso handlers can set protocol to ethernet. */
3346 if (type == htons(ETH_P_TEB)) {
3347 struct ethhdr *eth;
3348
3349 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3350 return 0;
3351
3352 eth = (struct ethhdr *)skb->data;
3353 type = eth->h_proto;
3354 }
3355
3356 return vlan_get_protocol_and_depth(skb, type, depth);
3357}
3358
3359
3360/* Take action when hardware reception checksum errors are detected. */
3361#ifdef CONFIG_BUG
3362static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3363{
3364 netdev_err(dev, "hw csum failure\n");
3365 skb_dump(KERN_ERR, skb, true);
3366 dump_stack();
3367}
3368
3369void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3370{
3371 DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3372}
3373EXPORT_SYMBOL(netdev_rx_csum_fault);
3374#endif
3375
3376/* XXX: check that highmem exists at all on the given machine. */
3377static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3378{
3379#ifdef CONFIG_HIGHMEM
3380 int i;
3381
3382 if (!(dev->features & NETIF_F_HIGHDMA)) {
3383 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3384 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3385
3386 if (PageHighMem(skb_frag_page(frag)))
3387 return 1;
3388 }
3389 }
3390#endif
3391 return 0;
3392}
3393
/* If this is an MPLS offload request, verify we are testing hardware MPLS
 * features instead of the standard features for the netdev.
3396 */
3397#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3398static netdev_features_t net_mpls_features(struct sk_buff *skb,
3399 netdev_features_t features,
3400 __be16 type)
3401{
3402 if (eth_p_mpls(type))
3403 features &= skb->dev->mpls_features;
3404
3405 return features;
3406}
3407#else
3408static netdev_features_t net_mpls_features(struct sk_buff *skb,
3409 netdev_features_t features,
3410 __be16 type)
3411{
3412 return features;
3413}
3414#endif
3415
3416static netdev_features_t harmonize_features(struct sk_buff *skb,
3417 netdev_features_t features)
3418{
3419 __be16 type;
3420
3421 type = skb_network_protocol(skb, NULL);
3422 features = net_mpls_features(skb, features, type);
3423
3424 if (skb->ip_summed != CHECKSUM_NONE &&
3425 !can_checksum_protocol(features, type)) {
3426 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3427 }
3428 if (illegal_highdma(skb->dev, skb))
3429 features &= ~NETIF_F_SG;
3430
3431 return features;
3432}
3433
3434netdev_features_t passthru_features_check(struct sk_buff *skb,
3435 struct net_device *dev,
3436 netdev_features_t features)
3437{
3438 return features;
3439}
3440EXPORT_SYMBOL(passthru_features_check);
3441
3442static netdev_features_t dflt_features_check(struct sk_buff *skb,
3443 struct net_device *dev,
3444 netdev_features_t features)
3445{
3446 return vlan_features_check(skb, features);
3447}
3448
3449static netdev_features_t gso_features_check(const struct sk_buff *skb,
3450 struct net_device *dev,
3451 netdev_features_t features)
3452{
3453 u16 gso_segs = skb_shinfo(skb)->gso_segs;
3454
3455 if (gso_segs > READ_ONCE(dev->gso_max_segs))
3456 return features & ~NETIF_F_GSO_MASK;
3457
3458 if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
3459 return features & ~NETIF_F_GSO_MASK;
3460
3461 if (!skb_shinfo(skb)->gso_type) {
3462 skb_warn_bad_offload(skb);
3463 return features & ~NETIF_F_GSO_MASK;
3464 }
3465
	/* Support for GSO partial features requires software
	 * intervention before we can actually process the packets,
	 * so we need to strip support for any partial features now;
	 * we can pull them back in after we have partially
	 * segmented the frame.
3471 */
3472 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3473 features &= ~dev->gso_partial_features;
3474
3475 /* Make sure to clear the IPv4 ID mangling feature if the
3476 * IPv4 header has the potential to be fragmented.
3477 */
3478 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3479 struct iphdr *iph = skb->encapsulation ?
3480 inner_ip_hdr(skb) : ip_hdr(skb);
3481
3482 if (!(iph->frag_off & htons(IP_DF)))
3483 features &= ~NETIF_F_TSO_MANGLEID;
3484 }
3485
3486 return features;
3487}
3488
3489netdev_features_t netif_skb_features(struct sk_buff *skb)
3490{
3491 struct net_device *dev = skb->dev;
3492 netdev_features_t features = dev->features;
3493
3494 if (skb_is_gso(skb))
3495 features = gso_features_check(skb, dev, features);
3496
	/* If this is an encapsulation offload request, verify we are
	 * testing hardware encapsulation features instead of the
	 * standard features for the netdev.
3500 */
3501 if (skb->encapsulation)
3502 features &= dev->hw_enc_features;
3503
3504 if (skb_vlan_tagged(skb))
3505 features = netdev_intersect_features(features,
3506 dev->vlan_features |
3507 NETIF_F_HW_VLAN_CTAG_TX |
3508 NETIF_F_HW_VLAN_STAG_TX);
3509
3510 if (dev->netdev_ops->ndo_features_check)
3511 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3512 features);
3513 else
3514 features &= dflt_features_check(skb, dev, features);
3515
3516 return harmonize_features(skb, features);
3517}
3518EXPORT_SYMBOL(netif_skb_features);
3519
3520static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3521 struct netdev_queue *txq, bool more)
3522{
3523 unsigned int len;
3524 int rc;
3525
3526 if (dev_nit_active(dev))
3527 dev_queue_xmit_nit(skb, dev);
3528
3529 len = skb->len;
3530 trace_net_dev_start_xmit(skb, dev);
3531 rc = netdev_start_xmit(skb, dev, txq, more);
3532 trace_net_dev_xmit(skb, rc, dev, len);
3533
3534 return rc;
3535}
3536
3537struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3538 struct netdev_queue *txq, int *ret)
3539{
3540 struct sk_buff *skb = first;
3541 int rc = NETDEV_TX_OK;
3542
3543 while (skb) {
3544 struct sk_buff *next = skb->next;
3545
3546 skb_mark_not_on_list(skb);
3547 rc = xmit_one(skb, dev, txq, next != NULL);
3548 if (unlikely(!dev_xmit_complete(rc))) {
3549 skb->next = next;
3550 goto out;
3551 }
3552
3553 skb = next;
3554 if (netif_tx_queue_stopped(txq) && skb) {
3555 rc = NETDEV_TX_BUSY;
3556 break;
3557 }
3558 }
3559
3560out:
3561 *ret = rc;
3562 return skb;
3563}
3564
3565static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3566 netdev_features_t features)
3567{
3568 if (skb_vlan_tag_present(skb) &&
3569 !vlan_hw_offload_capable(features, skb->vlan_proto))
3570 skb = __vlan_hwaccel_push_inside(skb);
3571 return skb;
3572}
3573
3574int skb_csum_hwoffload_help(struct sk_buff *skb,
3575 const netdev_features_t features)
3576{
3577 if (unlikely(skb_csum_is_sctp(skb)))
3578 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3579 skb_crc32c_csum_help(skb);
3580
3581 if (features & NETIF_F_HW_CSUM)
3582 return 0;
3583
3584 if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3585 switch (skb->csum_offset) {
3586 case offsetof(struct tcphdr, check):
3587 case offsetof(struct udphdr, check):
3588 return 0;
3589 }
3590 }
3591
3592 return skb_checksum_help(skb);
3593}
3594EXPORT_SYMBOL(skb_csum_hwoffload_help);
3595
3596static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3597{
3598 netdev_features_t features;
3599
3600 features = netif_skb_features(skb);
3601 skb = validate_xmit_vlan(skb, features);
3602 if (unlikely(!skb))
3603 goto out_null;
3604
3605 skb = sk_validate_xmit_skb(skb, dev);
3606 if (unlikely(!skb))
3607 goto out_null;
3608
3609 if (netif_needs_gso(skb, features)) {
3610 struct sk_buff *segs;
3611
3612 segs = skb_gso_segment(skb, features);
3613 if (IS_ERR(segs)) {
3614 goto out_kfree_skb;
3615 } else if (segs) {
3616 consume_skb(skb);
3617 skb = segs;
3618 }
3619 } else {
3620 if (skb_needs_linearize(skb, features) &&
3621 __skb_linearize(skb))
3622 goto out_kfree_skb;
3623
3624 /* If packet is not checksummed and device does not
3625 * support checksumming for this protocol, complete
3626 * checksumming here.
3627 */
3628 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3629 if (skb->encapsulation)
3630 skb_set_inner_transport_header(skb,
3631 skb_checksum_start_offset(skb));
3632 else
3633 skb_set_transport_header(skb,
3634 skb_checksum_start_offset(skb));
3635 if (skb_csum_hwoffload_help(skb, features))
3636 goto out_kfree_skb;
3637 }
3638 }
3639
3640 skb = validate_xmit_xfrm(skb, features, again);
3641
3642 return skb;
3643
3644out_kfree_skb:
3645 kfree_skb(skb);
3646out_null:
3647 dev_core_stats_tx_dropped_inc(dev);
3648 return NULL;
3649}
3650
3651struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3652{
3653 struct sk_buff *next, *head = NULL, *tail;
3654
3655 for (; skb != NULL; skb = next) {
3656 next = skb->next;
3657 skb_mark_not_on_list(skb);
3658
		/* in case skb won't be segmented, point to itself */
3660 skb->prev = skb;
3661
3662 skb = validate_xmit_skb(skb, dev, again);
3663 if (!skb)
3664 continue;
3665
3666 if (!head)
3667 head = skb;
3668 else
3669 tail->next = skb;
3670 /* If skb was segmented, skb->prev points to
3671 * the last segment. If not, it still contains skb.
3672 */
3673 tail = skb->prev;
3674 }
3675 return head;
3676}
3677EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3678
3679static void qdisc_pkt_len_init(struct sk_buff *skb)
3680{
3681 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3682
3683 qdisc_skb_cb(skb)->pkt_len = skb->len;
3684
	/* To get a more precise estimate of bytes sent on the wire,
	 * we add to pkt_len the header size of all segments
3687 */
3688 if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3689 u16 gso_segs = shinfo->gso_segs;
3690 unsigned int hdr_len;
3691
3692 /* mac layer + network layer */
3693 hdr_len = skb_transport_offset(skb);
3694
3695 /* + transport layer */
3696 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3697 const struct tcphdr *th;
3698 struct tcphdr _tcphdr;
3699
3700 th = skb_header_pointer(skb, hdr_len,
3701 sizeof(_tcphdr), &_tcphdr);
3702 if (likely(th))
3703 hdr_len += __tcp_hdrlen(th);
3704 } else {
3705 struct udphdr _udphdr;
3706
3707 if (skb_header_pointer(skb, hdr_len,
3708 sizeof(_udphdr), &_udphdr))
3709 hdr_len += sizeof(struct udphdr);
3710 }
3711
3712 if (shinfo->gso_type & SKB_GSO_DODGY)
3713 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3714 shinfo->gso_size);
3715
3716 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3717 }
3718}
3719
3720static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
3721 struct sk_buff **to_free,
3722 struct netdev_queue *txq)
3723{
3724 int rc;
3725
3726 rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
3727 if (rc == NET_XMIT_SUCCESS)
3728 trace_qdisc_enqueue(q, txq, skb);
3729 return rc;
3730}
3731
3732static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3733 struct net_device *dev,
3734 struct netdev_queue *txq)
3735{
3736 spinlock_t *root_lock = qdisc_lock(q);
3737 struct sk_buff *to_free = NULL;
3738 bool contended;
3739 int rc;
3740
3741 qdisc_calculate_pkt_len(skb, q);
3742
3743 tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
3744
3745 if (q->flags & TCQ_F_NOLOCK) {
3746 if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
3747 qdisc_run_begin(q)) {
3748 /* Retest nolock_qdisc_is_empty() within the protection
3749 * of q->seqlock to protect from racing with requeuing.
3750 */
3751 if (unlikely(!nolock_qdisc_is_empty(q))) {
3752 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3753 __qdisc_run(q);
3754 qdisc_run_end(q);
3755
3756 goto no_lock_out;
3757 }
3758
3759 qdisc_bstats_cpu_update(q, skb);
3760 if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
3761 !nolock_qdisc_is_empty(q))
3762 __qdisc_run(q);
3763
3764 qdisc_run_end(q);
3765 return NET_XMIT_SUCCESS;
3766 }
3767
3768 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3769 qdisc_run(q);
3770
3771no_lock_out:
3772 if (unlikely(to_free))
3773 kfree_skb_list_reason(to_free,
3774 tcf_get_drop_reason(to_free));
3775 return rc;
3776 }
3777
3778 if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
3779 kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
3780 return NET_XMIT_DROP;
3781 }
3782 /*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get the qdisc main lock.
	 * This permits the qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
	 * and then other tasks will only enqueue packets. The packets will be
	 * sent after the qdisc owner is scheduled again. To prevent this
	 * scenario the task always serializes on the lock.
3791 */
3792 contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
3793 if (unlikely(contended))
3794 spin_lock(&q->busylock);
3795
3796 spin_lock(root_lock);
3797 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3798 __qdisc_drop(skb, &to_free);
3799 rc = NET_XMIT_DROP;
3800 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3801 qdisc_run_begin(q)) {
3802 /*
3803 * This is a work-conserving queue; there are no old skbs
3804 * waiting to be sent out; and the qdisc is not running -
3805 * xmit the skb directly.
3806 */
3807
3808 qdisc_bstats_update(q, skb);
3809
3810 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3811 if (unlikely(contended)) {
3812 spin_unlock(&q->busylock);
3813 contended = false;
3814 }
3815 __qdisc_run(q);
3816 }
3817
3818 qdisc_run_end(q);
3819 rc = NET_XMIT_SUCCESS;
3820 } else {
3821 WRITE_ONCE(q->owner, smp_processor_id());
3822 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3823 WRITE_ONCE(q->owner, -1);
3824 if (qdisc_run_begin(q)) {
3825 if (unlikely(contended)) {
3826 spin_unlock(&q->busylock);
3827 contended = false;
3828 }
3829 __qdisc_run(q);
3830 qdisc_run_end(q);
3831 }
3832 }
3833 spin_unlock(root_lock);
3834 if (unlikely(to_free))
3835 kfree_skb_list_reason(to_free,
3836 tcf_get_drop_reason(to_free));
3837 if (unlikely(contended))
3838 spin_unlock(&q->busylock);
3839 return rc;
3840}
3841
3842#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3843static void skb_update_prio(struct sk_buff *skb)
3844{
3845 const struct netprio_map *map;
3846 const struct sock *sk;
3847 unsigned int prioidx;
3848
3849 if (skb->priority)
3850 return;
3851 map = rcu_dereference_bh(skb->dev->priomap);
3852 if (!map)
3853 return;
3854 sk = skb_to_full_sk(skb);
3855 if (!sk)
3856 return;
3857
3858 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3859
3860 if (prioidx < map->priomap_len)
3861 skb->priority = map->priomap[prioidx];
3862}
3863#else
3864#define skb_update_prio(skb)
3865#endif
3866
3867/**
3868 * dev_loopback_xmit - loop back @skb
3869 * @net: network namespace this loopback is happening in
3870 * @sk: sk needed to be a netfilter okfn
3871 * @skb: buffer to transmit
3872 */
3873int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3874{
3875 skb_reset_mac_header(skb);
3876 __skb_pull(skb, skb_network_offset(skb));
3877 skb->pkt_type = PACKET_LOOPBACK;
3878 if (skb->ip_summed == CHECKSUM_NONE)
3879 skb->ip_summed = CHECKSUM_UNNECESSARY;
3880 DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
3881 skb_dst_force(skb);
3882 netif_rx(skb);
3883 return 0;
3884}
3885EXPORT_SYMBOL(dev_loopback_xmit);
3886
3887#ifdef CONFIG_NET_EGRESS
3888static struct netdev_queue *
3889netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
3890{
3891 int qm = skb_get_queue_mapping(skb);
3892
3893 return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
3894}
3895
3896static bool netdev_xmit_txqueue_skipped(void)
3897{
3898 return __this_cpu_read(softnet_data.xmit.skip_txqueue);
3899}
3900
3901void netdev_xmit_skip_txqueue(bool skip)
3902{
3903 __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
3904}
3905EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
3906#endif /* CONFIG_NET_EGRESS */
3907
3908#ifdef CONFIG_NET_XGRESS
3909static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
3910 enum skb_drop_reason *drop_reason)
3911{
3912 int ret = TC_ACT_UNSPEC;
3913#ifdef CONFIG_NET_CLS_ACT
3914 struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
3915 struct tcf_result res;
3916
3917 if (!miniq)
3918 return ret;
3919
3920 tc_skb_cb(skb)->mru = 0;
3921 tc_skb_cb(skb)->post_ct = false;
3922 tcf_set_drop_reason(skb, *drop_reason);
3923
3924 mini_qdisc_bstats_cpu_update(miniq, skb);
3925 ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
3926 /* Only tcf related quirks below. */
3927 switch (ret) {
3928 case TC_ACT_SHOT:
3929 *drop_reason = tcf_get_drop_reason(skb);
3930 mini_qdisc_qstats_cpu_drop(miniq);
3931 break;
3932 case TC_ACT_OK:
3933 case TC_ACT_RECLASSIFY:
3934 skb->tc_index = TC_H_MIN(res.classid);
3935 break;
3936 }
3937#endif /* CONFIG_NET_CLS_ACT */
3938 return ret;
3939}
3940
3941static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
3942
3943void tcx_inc(void)
3944{
3945 static_branch_inc(&tcx_needed_key);
3946}
3947
3948void tcx_dec(void)
3949{
3950 static_branch_dec(&tcx_needed_key);
3951}
3952
3953static __always_inline enum tcx_action_base
3954tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
3955 const bool needs_mac)
3956{
3957 const struct bpf_mprog_fp *fp;
3958 const struct bpf_prog *prog;
3959 int ret = TCX_NEXT;
3960
3961 if (needs_mac)
3962 __skb_push(skb, skb->mac_len);
3963 bpf_mprog_foreach_prog(entry, fp, prog) {
3964 bpf_compute_data_pointers(skb);
3965 ret = bpf_prog_run(prog, skb);
3966 if (ret != TCX_NEXT)
3967 break;
3968 }
3969 if (needs_mac)
3970 __skb_pull(skb, skb->mac_len);
3971 return tcx_action_code(skb, ret);
3972}
3973
3974static __always_inline struct sk_buff *
3975sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3976 struct net_device *orig_dev, bool *another)
3977{
3978 struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
3979 enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
3980 int sch_ret;
3981
3982 if (!entry)
3983 return skb;
3984 if (*pt_prev) {
3985 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3986 *pt_prev = NULL;
3987 }
3988
3989 qdisc_skb_cb(skb)->pkt_len = skb->len;
3990 tcx_set_ingress(skb, true);
3991
3992 if (static_branch_unlikely(&tcx_needed_key)) {
3993 sch_ret = tcx_run(entry, skb, true);
3994 if (sch_ret != TC_ACT_UNSPEC)
3995 goto ingress_verdict;
3996 }
3997 sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
3998ingress_verdict:
3999 switch (sch_ret) {
4000 case TC_ACT_REDIRECT:
4001 /* skb_mac_header check was done by BPF, so we can safely
4002 * push the L2 header back before redirecting to another
4003 * netdev.
4004 */
4005 __skb_push(skb, skb->mac_len);
4006 if (skb_do_redirect(skb) == -EAGAIN) {
4007 __skb_pull(skb, skb->mac_len);
4008 *another = true;
4009 break;
4010 }
4011 *ret = NET_RX_SUCCESS;
4012 return NULL;
4013 case TC_ACT_SHOT:
4014 kfree_skb_reason(skb, drop_reason);
4015 *ret = NET_RX_DROP;
4016 return NULL;
4017 /* used by tc_run */
4018 case TC_ACT_STOLEN:
4019 case TC_ACT_QUEUED:
4020 case TC_ACT_TRAP:
4021 consume_skb(skb);
4022 fallthrough;
4023 case TC_ACT_CONSUMED:
4024 *ret = NET_RX_SUCCESS;
4025 return NULL;
4026 }
4027
4028 return skb;
4029}
4030
4031static __always_inline struct sk_buff *
4032sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4033{
4034 struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
4035 enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
4036 int sch_ret;
4037
4038 if (!entry)
4039 return skb;
4040
4041 /* qdisc_skb_cb(skb)->pkt_len and tcx_set_ingress() were
4042 * already set up by the caller.
4043 */
4044 if (static_branch_unlikely(&tcx_needed_key)) {
4045 sch_ret = tcx_run(entry, skb, false);
4046 if (sch_ret != TC_ACT_UNSPEC)
4047 goto egress_verdict;
4048 }
4049 sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4050egress_verdict:
4051 switch (sch_ret) {
4052 case TC_ACT_REDIRECT:
4053 /* No need to push/pop skb's mac_header here on egress! */
4054 skb_do_redirect(skb);
4055 *ret = NET_XMIT_SUCCESS;
4056 return NULL;
4057 case TC_ACT_SHOT:
4058 kfree_skb_reason(skb, drop_reason);
4059 *ret = NET_XMIT_DROP;
4060 return NULL;
4061 /* used by tc_run */
4062 case TC_ACT_STOLEN:
4063 case TC_ACT_QUEUED:
4064 case TC_ACT_TRAP:
4065 consume_skb(skb);
4066 fallthrough;
4067 case TC_ACT_CONSUMED:
4068 *ret = NET_XMIT_SUCCESS;
4069 return NULL;
4070 }
4071
4072 return skb;
4073}
4074#else
4075static __always_inline struct sk_buff *
4076sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4077 struct net_device *orig_dev, bool *another)
4078{
4079 return skb;
4080}
4081
4082static __always_inline struct sk_buff *
4083sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4084{
4085 return skb;
4086}
4087#endif /* CONFIG_NET_XGRESS */
4088
4089#ifdef CONFIG_XPS
4090static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
4091 struct xps_dev_maps *dev_maps, unsigned int tci)
4092{
4093 int tc = netdev_get_prio_tc_map(dev, skb->priority);
4094 struct xps_map *map;
4095 int queue_index = -1;
4096
4097 if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
4098 return queue_index;
4099
4100 tci *= dev_maps->num_tc;
4101 tci += tc;
4102
4103 map = rcu_dereference(dev_maps->attr_map[tci]);
4104 if (map) {
4105 if (map->len == 1)
4106 queue_index = map->queues[0];
4107 else
4108 queue_index = map->queues[reciprocal_scale(
4109 skb_get_hash(skb), map->len)];
4110 if (unlikely(queue_index >= dev->real_num_tx_queues))
4111 queue_index = -1;
4112 }
4113 return queue_index;
4114}
4115#endif
4116
4117static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
4118 struct sk_buff *skb)
4119{
4120#ifdef CONFIG_XPS
4121 struct xps_dev_maps *dev_maps;
4122 struct sock *sk = skb->sk;
4123 int queue_index = -1;
4124
4125 if (!static_key_false(&xps_needed))
4126 return -1;
4127
4128 rcu_read_lock();
4129 if (!static_key_false(&xps_rxqs_needed))
4130 goto get_cpus_map;
4131
4132 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
4133 if (dev_maps) {
4134 int tci = sk_rx_queue_get(sk);
4135
4136 if (tci >= 0)
4137 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4138 tci);
4139 }
4140
4141get_cpus_map:
4142 if (queue_index < 0) {
4143 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
4144 if (dev_maps) {
4145 unsigned int tci = skb->sender_cpu - 1;
4146
4147 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4148 tci);
4149 }
4150 }
4151 rcu_read_unlock();
4152
4153 return queue_index;
4154#else
4155 return -1;
4156#endif
4157}
4158
4159u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4160 struct net_device *sb_dev)
4161{
4162 return 0;
4163}
4164EXPORT_SYMBOL(dev_pick_tx_zero);
4165
4166u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4167 struct net_device *sb_dev)
4168{
4169 return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4170}
4171EXPORT_SYMBOL(dev_pick_tx_cpu_id);
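
/* Illustrative sketch: these two helpers exist so that drivers which need no
 * smarter policy can wire them straight into ndo_select_queue. The ops
 * structure and example_start_xmit below are hypothetical:
 *
 *	static const struct net_device_ops example_netdev_ops = {
 *		.ndo_start_xmit		= example_start_xmit,
 *		.ndo_select_queue	= dev_pick_tx_cpu_id,
 *	};
 */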
4172
4173u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4174 struct net_device *sb_dev)
4175{
4176 struct sock *sk = skb->sk;
4177 int queue_index = sk_tx_queue_get(sk);
4178
4179 sb_dev = sb_dev ? : dev;
4180
4181 if (queue_index < 0 || skb->ooo_okay ||
4182 queue_index >= dev->real_num_tx_queues) {
4183 int new_index = get_xps_queue(dev, sb_dev, skb);
4184
4185 if (new_index < 0)
4186 new_index = skb_tx_hash(dev, sb_dev, skb);
4187
4188 if (queue_index != new_index && sk &&
4189 sk_fullsock(sk) &&
4190 rcu_access_pointer(sk->sk_dst_cache))
4191 sk_tx_queue_set(sk, new_index);
4192
4193 queue_index = new_index;
4194 }
4195
4196 return queue_index;
4197}
4198EXPORT_SYMBOL(netdev_pick_tx);
4199
4200struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4201 struct sk_buff *skb,
4202 struct net_device *sb_dev)
4203{
4204 int queue_index = 0;
4205
4206#ifdef CONFIG_XPS
4207 u32 sender_cpu = skb->sender_cpu - 1;
4208
4209 if (sender_cpu >= (u32)NR_CPUS)
4210 skb->sender_cpu = raw_smp_processor_id() + 1;
4211#endif
4212
4213 if (dev->real_num_tx_queues != 1) {
4214 const struct net_device_ops *ops = dev->netdev_ops;
4215
4216 if (ops->ndo_select_queue)
4217 queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4218 else
4219 queue_index = netdev_pick_tx(dev, skb, sb_dev);
4220
4221 queue_index = netdev_cap_txqueue(dev, queue_index);
4222 }
4223
4224 skb_set_queue_mapping(skb, queue_index);
4225 return netdev_get_tx_queue(dev, queue_index);
4226}
4227
4228/**
4229 * __dev_queue_xmit() - transmit a buffer
4230 * @skb: buffer to transmit
4231 * @sb_dev: subordinate device used for L2 forwarding offload
4232 *
4233 * Queue a buffer for transmission to a network device. The caller must
4234 * have set the device and priority and built the buffer before calling
4235 * this function. The function can be called from an interrupt.
4236 *
4237 * When calling this method, interrupts MUST be enabled. This is because
4238 * the BH enable code must have IRQs enabled so that it will not deadlock.
4239 *
4240 * Regardless of the return value, the skb is consumed, so it is currently
4241 * difficult to retry a send to this method. (You can bump the ref count
4242 * before sending to hold a reference for retry if you are careful.)
4243 *
4244 * Return:
4245 * * 0 - buffer successfully transmitted
4246 * * positive qdisc return code - NET_XMIT_DROP etc.
4247 * * negative errno - other errors
4248 */
4249int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4250{
4251 struct net_device *dev = skb->dev;
4252 struct netdev_queue *txq = NULL;
4253 struct Qdisc *q;
4254 int rc = -ENOMEM;
4255 bool again = false;
4256
4257 skb_reset_mac_header(skb);
4258 skb_assert_len(skb);
4259
4260 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4261 __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4262
4263 /* Disable soft irqs for various locks below. Also
4264 * stops preemption for RCU.
4265 */
4266 rcu_read_lock_bh();
4267
4268 skb_update_prio(skb);
4269
4270 qdisc_pkt_len_init(skb);
4271 tcx_set_ingress(skb, false);
4272#ifdef CONFIG_NET_EGRESS
4273 if (static_branch_unlikely(&egress_needed_key)) {
4274 if (nf_hook_egress_active()) {
4275 skb = nf_hook_egress(skb, &rc, dev);
4276 if (!skb)
4277 goto out;
4278 }
4279
4280 netdev_xmit_skip_txqueue(false);
4281
4282 nf_skip_egress(skb, true);
4283 skb = sch_handle_egress(skb, &rc, dev);
4284 if (!skb)
4285 goto out;
4286 nf_skip_egress(skb, false);
4287
4288 if (netdev_xmit_txqueue_skipped())
4289 txq = netdev_tx_queue_mapping(dev, skb);
4290 }
4291#endif
4292 /* If device/qdisc don't need skb->dst, release it right now while
4293 * it's hot in this cpu cache.
4294 */
4295 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4296 skb_dst_drop(skb);
4297 else
4298 skb_dst_force(skb);
4299
4300 if (!txq)
4301 txq = netdev_core_pick_tx(dev, skb, sb_dev);
4302
4303 q = rcu_dereference_bh(txq->qdisc);
4304
4305 trace_net_dev_queue(skb);
4306 if (q->enqueue) {
4307 rc = __dev_xmit_skb(skb, q, dev, txq);
4308 goto out;
4309 }
4310
4311 /* The device has no queue. Common case for software devices:
4312 * loopback, all the sorts of tunnels...
4313 *
4314 * Really, it is unlikely that netif_tx_lock protection is necessary
4315 * here. (f.e. loopback and IP tunnels are clean ignoring statistics
4316 * counters.)
4317 * However, it is possible that they rely on the protection
4318 * made by us here.
4319 *
4320 * Check this and take the lock. It is not prone to deadlocks.
4321 * Either that, or use the noqueue qdisc - it is even simpler 8)
4322 */
4323 if (dev->flags & IFF_UP) {
4324 int cpu = smp_processor_id(); /* ok because BHs are off */
4325
4326 /* Other cpus might concurrently change txq->xmit_lock_owner
4327 * to -1 or to their cpu id, but not to our id.
4328 */
4329 if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4330 if (dev_xmit_recursion())
4331 goto recursion_alert;
4332
4333 skb = validate_xmit_skb(skb, dev, &again);
4334 if (!skb)
4335 goto out;
4336
4337 HARD_TX_LOCK(dev, txq, cpu);
4338
4339 if (!netif_xmit_stopped(txq)) {
4340 dev_xmit_recursion_inc();
4341 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4342 dev_xmit_recursion_dec();
4343 if (dev_xmit_complete(rc)) {
4344 HARD_TX_UNLOCK(dev, txq);
4345 goto out;
4346 }
4347 }
4348 HARD_TX_UNLOCK(dev, txq);
4349 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4350 dev->name);
4351 } else {
4352 /* Recursion is detected! It is possible,
4353 * unfortunately
4354 */
4355recursion_alert:
4356 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4357 dev->name);
4358 }
4359 }
4360
4361 rc = -ENETDOWN;
4362 rcu_read_unlock_bh();
4363
4364 dev_core_stats_tx_dropped_inc(dev);
4365 kfree_skb_list(skb);
4366 return rc;
4367out:
4368 rcu_read_unlock_bh();
4369 return rc;
4370}
4371EXPORT_SYMBOL(__dev_queue_xmit);
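
/* Usage sketch: callers normally reach __dev_queue_xmit() through the
 * dev_queue_xmit() wrapper, after setting skb->dev and building the full L2
 * frame. A minimal transmit of a prebuilt payload might look roughly like
 * this (example_send() and its arguments are hypothetical, and the skb is
 * assumed to have been allocated with enough headroom for the link header):
 *
 *	static int example_send(struct net_device *dev, struct sk_buff *skb,
 *				const u8 *dest)
 *	{
 *		skb->dev = dev;
 *		skb->protocol = htons(ETH_P_IP);
 *		if (dev_hard_header(skb, dev, ETH_P_IP, dest,
 *				    dev->dev_addr, skb->len) < 0) {
 *			kfree_skb(skb);
 *			return -EINVAL;
 *		}
 *		return dev_queue_xmit(skb);
 *	}
 *
 * Interrupts must be enabled when this is called, as documented above.
 */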
4372
4373int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4374{
4375 struct net_device *dev = skb->dev;
4376 struct sk_buff *orig_skb = skb;
4377 struct netdev_queue *txq;
4378 int ret = NETDEV_TX_BUSY;
4379 bool again = false;
4380
4381 if (unlikely(!netif_running(dev) ||
4382 !netif_carrier_ok(dev)))
4383 goto drop;
4384
4385 skb = validate_xmit_skb_list(skb, dev, &again);
4386 if (skb != orig_skb)
4387 goto drop;
4388
4389 skb_set_queue_mapping(skb, queue_id);
4390 txq = skb_get_tx_queue(dev, skb);
4391
4392 local_bh_disable();
4393
4394 dev_xmit_recursion_inc();
4395 HARD_TX_LOCK(dev, txq, smp_processor_id());
4396 if (!netif_xmit_frozen_or_drv_stopped(txq))
4397 ret = netdev_start_xmit(skb, dev, txq, false);
4398 HARD_TX_UNLOCK(dev, txq);
4399 dev_xmit_recursion_dec();
4400
4401 local_bh_enable();
4402 return ret;
4403drop:
4404 dev_core_stats_tx_dropped_inc(dev);
4405 kfree_skb_list(skb);
4406 return NET_XMIT_DROP;
4407}
4408EXPORT_SYMBOL(__dev_direct_xmit);
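
/* Usage sketch: AF_XDP style callers do not use this function directly but
 * go through the dev_direct_xmit() wrapper in netdevice.h, which frees the
 * skb when the transmit did not complete, roughly:
 *
 *	ret = __dev_direct_xmit(skb, queue_id);
 *	if (!dev_xmit_complete(ret))
 *		kfree_skb(skb);
 */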
4409
4410/*************************************************************************
4411 * Receiver routines
4412 *************************************************************************/
4413
4414unsigned int sysctl_skb_defer_max __read_mostly = 64;
4415int weight_p __read_mostly = 64; /* old backlog weight */
4416int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
4417int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
4418
4419/* Called with irq disabled */
4420static inline void ____napi_schedule(struct softnet_data *sd,
4421 struct napi_struct *napi)
4422{
4423 struct task_struct *thread;
4424
4425 lockdep_assert_irqs_disabled();
4426
4427 if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4428 /* Paired with smp_mb__before_atomic() in
4429 * napi_enable()/dev_set_threaded().
4430 * Use READ_ONCE() to guarantee a complete
4431 * read on napi->thread. Only call
4432 * wake_up_process() when it's not NULL.
4433 */
4434 thread = READ_ONCE(napi->thread);
4435 if (thread) {
4436 /* Avoid doing set_bit() if the thread is in
4437 * INTERRUPTIBLE state, because napi_thread_wait()
4438 * makes sure to proceed with napi polling
4439 * if the thread is explicitly woken from here.
4440 */
4441 if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
4442 set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4443 wake_up_process(thread);
4444 return;
4445 }
4446 }
4447
4448 list_add_tail(&napi->poll_list, &sd->poll_list);
4449 WRITE_ONCE(napi->list_owner, smp_processor_id());
4450 /* If not called from net_rx_action()
4451 * we have to raise NET_RX_SOFTIRQ.
4452 */
4453 if (!sd->in_net_rx_action)
4454 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4455}
4456
4457#ifdef CONFIG_RPS
4458
4459struct static_key_false rps_needed __read_mostly;
4460EXPORT_SYMBOL(rps_needed);
4461struct static_key_false rfs_needed __read_mostly;
4462EXPORT_SYMBOL(rfs_needed);
4463
4464static struct rps_dev_flow *
4465set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4466 struct rps_dev_flow *rflow, u16 next_cpu)
4467{
4468 if (next_cpu < nr_cpu_ids) {
4469#ifdef CONFIG_RFS_ACCEL
4470 struct netdev_rx_queue *rxqueue;
4471 struct rps_dev_flow_table *flow_table;
4472 struct rps_dev_flow *old_rflow;
4473 u32 flow_id;
4474 u16 rxq_index;
4475 int rc;
4476
4477 /* Should we steer this flow to a different hardware queue? */
4478 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4479 !(dev->features & NETIF_F_NTUPLE))
4480 goto out;
4481 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4482 if (rxq_index == skb_get_rx_queue(skb))
4483 goto out;
4484
4485 rxqueue = dev->_rx + rxq_index;
4486 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4487 if (!flow_table)
4488 goto out;
4489 flow_id = skb_get_hash(skb) & flow_table->mask;
4490 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4491 rxq_index, flow_id);
4492 if (rc < 0)
4493 goto out;
4494 old_rflow = rflow;
4495 rflow = &flow_table->flows[flow_id];
4496 rflow->filter = rc;
4497 if (old_rflow->filter == rflow->filter)
4498 old_rflow->filter = RPS_NO_FILTER;
4499 out:
4500#endif
4501 rflow->last_qtail =
4502 per_cpu(softnet_data, next_cpu).input_queue_head;
4503 }
4504
4505 rflow->cpu = next_cpu;
4506 return rflow;
4507}
4508
4509/*
4510 * get_rps_cpu is called from netif_receive_skb and returns the target
4511 * CPU from the RPS map of the receiving queue for a given skb.
4512 * rcu_read_lock must be held on entry.
4513 */
4514static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4515 struct rps_dev_flow **rflowp)
4516{
4517 const struct rps_sock_flow_table *sock_flow_table;
4518 struct netdev_rx_queue *rxqueue = dev->_rx;
4519 struct rps_dev_flow_table *flow_table;
4520 struct rps_map *map;
4521 int cpu = -1;
4522 u32 tcpu;
4523 u32 hash;
4524
4525 if (skb_rx_queue_recorded(skb)) {
4526 u16 index = skb_get_rx_queue(skb);
4527
4528 if (unlikely(index >= dev->real_num_rx_queues)) {
4529 WARN_ONCE(dev->real_num_rx_queues > 1,
4530 "%s received packet on queue %u, but number "
4531 "of RX queues is %u\n",
4532 dev->name, index, dev->real_num_rx_queues);
4533 goto done;
4534 }
4535 rxqueue += index;
4536 }
4537
4538 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4539
4540 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4541 map = rcu_dereference(rxqueue->rps_map);
4542 if (!flow_table && !map)
4543 goto done;
4544
4545 skb_reset_network_header(skb);
4546 hash = skb_get_hash(skb);
4547 if (!hash)
4548 goto done;
4549
4550 sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
4551 if (flow_table && sock_flow_table) {
4552 struct rps_dev_flow *rflow;
4553 u32 next_cpu;
4554 u32 ident;
4555
4556 /* First, check the global flow table for a match.
4557 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4558 */
4559 ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4560 if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
4561 goto try_rps;
4562
4563 next_cpu = ident & net_hotdata.rps_cpu_mask;
4564
4565 /* OK, now we know there is a match,
4566 * we can look at the local (per receive queue) flow table
4567 */
4568 rflow = &flow_table->flows[hash & flow_table->mask];
4569 tcpu = rflow->cpu;
4570
4571 /*
4572 * If the desired CPU (where last recvmsg was done) is
4573 * different from current CPU (one in the rx-queue flow
4574 * table entry), switch if one of the following holds:
4575 * - Current CPU is unset (>= nr_cpu_ids).
4576 * - Current CPU is offline.
4577 * - The current CPU's queue tail has advanced beyond the
4578 * last packet that was enqueued using this table entry.
4579 * This guarantees that all previous packets for the flow
4580 * have been dequeued, thus preserving in-order delivery.
4581 */
4582 if (unlikely(tcpu != next_cpu) &&
4583 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4584 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4585 rflow->last_qtail)) >= 0)) {
4586 tcpu = next_cpu;
4587 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4588 }
4589
4590 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4591 *rflowp = rflow;
4592 cpu = tcpu;
4593 goto done;
4594 }
4595 }
4596
4597try_rps:
4598
4599 if (map) {
4600 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4601 if (cpu_online(tcpu)) {
4602 cpu = tcpu;
4603 goto done;
4604 }
4605 }
4606
4607done:
4608 return cpu;
4609}
4610
4611#ifdef CONFIG_RFS_ACCEL
4612
4613/**
4614 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4615 * @dev: Device on which the filter was set
4616 * @rxq_index: RX queue index
4617 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4618 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4619 *
4620 * Drivers that implement ndo_rx_flow_steer() should periodically call
4621 * this function for each installed filter and remove the filters for
4622 * which it returns %true.
4623 */
4624bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4625 u32 flow_id, u16 filter_id)
4626{
4627 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4628 struct rps_dev_flow_table *flow_table;
4629 struct rps_dev_flow *rflow;
4630 bool expire = true;
4631 unsigned int cpu;
4632
4633 rcu_read_lock();
4634 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4635 if (flow_table && flow_id <= flow_table->mask) {
4636 rflow = &flow_table->flows[flow_id];
4637 cpu = READ_ONCE(rflow->cpu);
4638 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4639 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4640 rflow->last_qtail) <
4641 (int)(10 * flow_table->mask)))
4642 expire = false;
4643 }
4644 rcu_read_unlock();
4645 return expire;
4646}
4647EXPORT_SYMBOL(rps_may_expire_flow);
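
/* Driver-side sketch: a driver that implements ndo_rx_flow_steer() usually
 * remembers the rxq_index, flow_id and filter_id of every filter it
 * programmed and periodically asks whether each one may be removed. All of
 * the example_* names and the priv layout below are hypothetical:
 *
 *	for (i = 0; i < priv->n_arfs_filters; i++) {
 *		struct example_arfs_filter *f = &priv->arfs_filters[i];
 *
 *		if (!f->in_use)
 *			continue;
 *		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, f->filter_id)) {
 *			example_remove_hw_filter(priv, f);
 *			f->in_use = false;
 *		}
 *	}
 */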
4648
4649#endif /* CONFIG_RFS_ACCEL */
4650
4651/* Called from hardirq (IPI) context */
4652static void rps_trigger_softirq(void *data)
4653{
4654 struct softnet_data *sd = data;
4655
4656 ____napi_schedule(sd, &sd->backlog);
4657 sd->received_rps++;
4658}
4659
4660#endif /* CONFIG_RPS */
4661
4662/* Called from hardirq (IPI) context */
4663static void trigger_rx_softirq(void *data)
4664{
4665 struct softnet_data *sd = data;
4666
4667 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4668 smp_store_release(&sd->defer_ipi_scheduled, 0);
4669}
4670
4671/*
4672 * After we have queued a packet into sd->input_pkt_queue,
4673 * we need to make sure this queue is serviced soon.
4674 *
4675 * - If this is another cpu's queue, link it to our rps_ipi_list,
4676 * and make sure we will process rps_ipi_list from net_rx_action().
4677 *
4678 * - If this is our own queue, NAPI schedule our backlog.
4679 * Note that this also raises NET_RX_SOFTIRQ.
4680 */
4681static void napi_schedule_rps(struct softnet_data *sd)
4682{
4683 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4684
4685#ifdef CONFIG_RPS
4686 if (sd != mysd) {
4687 sd->rps_ipi_next = mysd->rps_ipi_list;
4688 mysd->rps_ipi_list = sd;
4689
4690 /* If not called from net_rx_action() or napi_threaded_poll()
4691 * we have to raise NET_RX_SOFTIRQ.
4692 */
4693 if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
4694 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4695 return;
4696 }
4697#endif /* CONFIG_RPS */
4698 __napi_schedule_irqoff(&mysd->backlog);
4699}
4700
4701#ifdef CONFIG_NET_FLOW_LIMIT
4702int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4703#endif
4704
4705static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4706{
4707#ifdef CONFIG_NET_FLOW_LIMIT
4708 struct sd_flow_limit *fl;
4709 struct softnet_data *sd;
4710 unsigned int old_flow, new_flow;
4711
4712 if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
4713 return false;
4714
4715 sd = this_cpu_ptr(&softnet_data);
4716
4717 rcu_read_lock();
4718 fl = rcu_dereference(sd->flow_limit);
4719 if (fl) {
4720 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4721 old_flow = fl->history[fl->history_head];
4722 fl->history[fl->history_head] = new_flow;
4723
4724 fl->history_head++;
4725 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4726
4727 if (likely(fl->buckets[old_flow]))
4728 fl->buckets[old_flow]--;
4729
4730 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4731 fl->count++;
4732 rcu_read_unlock();
4733 return true;
4734 }
4735 }
4736 rcu_read_unlock();
4737#endif
4738 return false;
4739}
4740
4741/*
4742 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4743 * queue (may be a remote CPU queue).
4744 */
4745static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4746 unsigned int *qtail)
4747{
4748 enum skb_drop_reason reason;
4749 struct softnet_data *sd;
4750 unsigned long flags;
4751 unsigned int qlen;
4752
4753 reason = SKB_DROP_REASON_NOT_SPECIFIED;
4754 sd = &per_cpu(softnet_data, cpu);
4755
4756 rps_lock_irqsave(sd, &flags);
4757 if (!netif_running(skb->dev))
4758 goto drop;
4759 qlen = skb_queue_len(&sd->input_pkt_queue);
4760 if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
4761 !skb_flow_limit(skb, qlen)) {
4762 if (qlen) {
4763enqueue:
4764 __skb_queue_tail(&sd->input_pkt_queue, skb);
4765 input_queue_tail_incr_save(sd, qtail);
4766 rps_unlock_irq_restore(sd, &flags);
4767 return NET_RX_SUCCESS;
4768 }
4769
4770 /* Schedule NAPI for backlog device
4771 * We can use a non-atomic operation since we own the queue lock
4772 */
4773 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
4774 napi_schedule_rps(sd);
4775 goto enqueue;
4776 }
4777 reason = SKB_DROP_REASON_CPU_BACKLOG;
4778
4779drop:
4780 sd->dropped++;
4781 rps_unlock_irq_restore(sd, &flags);
4782
4783 dev_core_stats_rx_dropped_inc(skb->dev);
4784 kfree_skb_reason(skb, reason);
4785 return NET_RX_DROP;
4786}
4787
4788static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4789{
4790 struct net_device *dev = skb->dev;
4791 struct netdev_rx_queue *rxqueue;
4792
4793 rxqueue = dev->_rx;
4794
4795 if (skb_rx_queue_recorded(skb)) {
4796 u16 index = skb_get_rx_queue(skb);
4797
4798 if (unlikely(index >= dev->real_num_rx_queues)) {
4799 WARN_ONCE(dev->real_num_rx_queues > 1,
4800 "%s received packet on queue %u, but number "
4801 "of RX queues is %u\n",
4802 dev->name, index, dev->real_num_rx_queues);
4803
4804 return rxqueue; /* Return first rxqueue */
4805 }
4806 rxqueue += index;
4807 }
4808 return rxqueue;
4809}
4810
4811u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
4812 struct bpf_prog *xdp_prog)
4813{
4814 void *orig_data, *orig_data_end, *hard_start;
4815 struct netdev_rx_queue *rxqueue;
4816 bool orig_bcast, orig_host;
4817 u32 mac_len, frame_sz;
4818 __be16 orig_eth_type;
4819 struct ethhdr *eth;
4820 u32 metalen, act;
4821 int off;
4822
4823 /* The XDP program wants to see the packet starting at the MAC
4824 * header.
4825 */
4826 mac_len = skb->data - skb_mac_header(skb);
4827 hard_start = skb->data - skb_headroom(skb);
4828
4829 /* SKB "head" area always have tailroom for skb_shared_info */
4830 frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4831 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4832
4833 rxqueue = netif_get_rxqueue(skb);
4834 xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4835 xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4836 skb_headlen(skb) + mac_len, true);
4837 if (skb_is_nonlinear(skb)) {
4838 skb_shinfo(skb)->xdp_frags_size = skb->data_len;
4839 xdp_buff_set_frags_flag(xdp);
4840 } else {
4841 xdp_buff_clear_frags_flag(xdp);
4842 }
4843
4844 orig_data_end = xdp->data_end;
4845 orig_data = xdp->data;
4846 eth = (struct ethhdr *)xdp->data;
4847 orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
4848 orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4849 orig_eth_type = eth->h_proto;
4850
4851 act = bpf_prog_run_xdp(xdp_prog, xdp);
4852
4853 /* check if bpf_xdp_adjust_head was used */
4854 off = xdp->data - orig_data;
4855 if (off) {
4856 if (off > 0)
4857 __skb_pull(skb, off);
4858 else if (off < 0)
4859 __skb_push(skb, -off);
4860
4861 skb->mac_header += off;
4862 skb_reset_network_header(skb);
4863 }
4864
4865 /* check if bpf_xdp_adjust_tail was used */
4866 off = xdp->data_end - orig_data_end;
4867 if (off != 0) {
4868 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4869 skb->len += off; /* positive on grow, negative on shrink */
4870 }
4871
4872 /* XDP frag metadata (e.g. nr_frags) is updated in eBPF helpers
4873 * (e.g. bpf_xdp_adjust_tail), so we need to update data_len here.
4874 */
4875 if (xdp_buff_has_frags(xdp))
4876 skb->data_len = skb_shinfo(skb)->xdp_frags_size;
4877 else
4878 skb->data_len = 0;
4879
4880 /* check if XDP changed eth hdr such SKB needs update */
4881 eth = (struct ethhdr *)xdp->data;
4882 if ((orig_eth_type != eth->h_proto) ||
4883 (orig_host != ether_addr_equal_64bits(eth->h_dest,
4884 skb->dev->dev_addr)) ||
4885 (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4886 __skb_push(skb, ETH_HLEN);
4887 skb->pkt_type = PACKET_HOST;
4888 skb->protocol = eth_type_trans(skb, skb->dev);
4889 }
4890
4891 /* Redirect/Tx gives an L2 packet; code that will reuse the skb must
4892 * __skb_pull it before calling us again on the redirect path. We do not
4893 * call do_redirect here, as we leave that up to the caller.
4894 *
4895 * Caller is responsible for managing lifetime of skb (i.e. calling
4896 * kfree_skb in response to actions it cannot handle/XDP_DROP).
4897 */
4898 switch (act) {
4899 case XDP_REDIRECT:
4900 case XDP_TX:
4901 __skb_push(skb, mac_len);
4902 break;
4903 case XDP_PASS:
4904 metalen = xdp->data - xdp->data_meta;
4905 if (metalen)
4906 skb_metadata_set(skb, metalen);
4907 break;
4908 }
4909
4910 return act;
4911}
4912
4913static int
4914netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
4915{
4916 struct sk_buff *skb = *pskb;
4917 int err, hroom, troom;
4918
4919 if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
4920 return 0;
4921
4922 /* In case we have to go down this path and also linearize,
4923 * let's do the pskb_expand_head() work just once here.
4924 */
4925 hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4926 troom = skb->tail + skb->data_len - skb->end;
4927 err = pskb_expand_head(skb,
4928 hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4929 troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
4930 if (err)
4931 return err;
4932
4933 return skb_linearize(skb);
4934}
4935
4936static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
4937 struct xdp_buff *xdp,
4938 struct bpf_prog *xdp_prog)
4939{
4940 struct sk_buff *skb = *pskb;
4941 u32 mac_len, act = XDP_DROP;
4942
4943 /* Reinjected packets coming from act_mirred or similar should
4944 * not get XDP generic processing.
4945 */
4946 if (skb_is_redirected(skb))
4947 return XDP_PASS;
4948
4949 /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
4950 * bytes. This is the guarantee that native XDP also provides,
4951 * thus we need to do it here as well.
4952 */
4953 mac_len = skb->data - skb_mac_header(skb);
4954 __skb_push(skb, mac_len);
4955
4956 if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4957 skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4958 if (netif_skb_check_for_xdp(pskb, xdp_prog))
4959 goto do_drop;
4960 }
4961
4962 __skb_pull(*pskb, mac_len);
4963
4964 act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
4965 switch (act) {
4966 case XDP_REDIRECT:
4967 case XDP_TX:
4968 case XDP_PASS:
4969 break;
4970 default:
4971 bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
4972 fallthrough;
4973 case XDP_ABORTED:
4974 trace_xdp_exception((*pskb)->dev, xdp_prog, act);
4975 fallthrough;
4976 case XDP_DROP:
4977 do_drop:
4978 kfree_skb(*pskb);
4979 break;
4980 }
4981
4982 return act;
4983}
4984
4985/* When doing generic XDP we have to bypass the qdisc layer and the
4986 * network taps in order to match in-driver-XDP behavior. This also means
4987 * that XDP packets are able to starve other packets going through a qdisc,
4988 * and DDOS attacks will be more effective. In-driver XDP uses dedicated TX
4989 * queues, so it does not have this starvation issue.
4990 */
4991void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4992{
4993 struct net_device *dev = skb->dev;
4994 struct netdev_queue *txq;
4995 bool free_skb = true;
4996 int cpu, rc;
4997
4998 txq = netdev_core_pick_tx(dev, skb, NULL);
4999 cpu = smp_processor_id();
5000 HARD_TX_LOCK(dev, txq, cpu);
5001 if (!netif_xmit_frozen_or_drv_stopped(txq)) {
5002 rc = netdev_start_xmit(skb, dev, txq, 0);
5003 if (dev_xmit_complete(rc))
5004 free_skb = false;
5005 }
5006 HARD_TX_UNLOCK(dev, txq);
5007 if (free_skb) {
5008 trace_xdp_exception(dev, xdp_prog, XDP_TX);
5009 dev_core_stats_tx_dropped_inc(dev);
5010 kfree_skb(skb);
5011 }
5012}
5013
5014static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
5015
5016int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
5017{
5018 if (xdp_prog) {
5019 struct xdp_buff xdp;
5020 u32 act;
5021 int err;
5022
5023 act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
5024 if (act != XDP_PASS) {
5025 switch (act) {
5026 case XDP_REDIRECT:
5027 err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
5028 &xdp, xdp_prog);
5029 if (err)
5030 goto out_redir;
5031 break;
5032 case XDP_TX:
5033 generic_xdp_tx(*pskb, xdp_prog);
5034 break;
5035 }
5036 return XDP_DROP;
5037 }
5038 }
5039 return XDP_PASS;
5040out_redir:
5041 kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
5042 return XDP_DROP;
5043}
5044EXPORT_SYMBOL_GPL(do_xdp_generic);
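
/* Caller-side sketch: a device that attaches a generic XDP program (for
 * instance a tun-like driver) runs it on a freshly built skb under RCU with
 * BHs disabled. Anything other than XDP_PASS means the skb was already
 * dropped, redirected or transmitted and must not be touched again. The
 * priv->xdp_prog field below is hypothetical:
 *
 *	local_bh_disable();
 *	rcu_read_lock();
 *	xdp_prog = rcu_dereference(priv->xdp_prog);
 *	if (xdp_prog && do_xdp_generic(xdp_prog, &skb) != XDP_PASS) {
 *		rcu_read_unlock();
 *		local_bh_enable();
 *		return 0;
 *	}
 *	rcu_read_unlock();
 *	local_bh_enable();
 */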
5045
5046static int netif_rx_internal(struct sk_buff *skb)
5047{
5048 int ret;
5049
5050 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5051
5052 trace_netif_rx(skb);
5053
5054#ifdef CONFIG_RPS
5055 if (static_branch_unlikely(&rps_needed)) {
5056 struct rps_dev_flow voidflow, *rflow = &voidflow;
5057 int cpu;
5058
5059 rcu_read_lock();
5060
5061 cpu = get_rps_cpu(skb->dev, skb, &rflow);
5062 if (cpu < 0)
5063 cpu = smp_processor_id();
5064
5065 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5066
5067 rcu_read_unlock();
5068 } else
5069#endif
5070 {
5071 unsigned int qtail;
5072
5073 ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
5074 }
5075 return ret;
5076}
5077
5078/**
5079 * __netif_rx - Slightly optimized version of netif_rx
5080 * @skb: buffer to post
5081 *
5082 * This behaves as netif_rx except that it does not disable bottom halves.
5083 * As a result this function may only be invoked from the interrupt context
5084 * (either hard or soft interrupt).
5085 */
5086int __netif_rx(struct sk_buff *skb)
5087{
5088 int ret;
5089
5090 lockdep_assert_once(hardirq_count() | softirq_count());
5091
5092 trace_netif_rx_entry(skb);
5093 ret = netif_rx_internal(skb);
5094 trace_netif_rx_exit(ret);
5095 return ret;
5096}
5097EXPORT_SYMBOL(__netif_rx);
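
/* Usage sketch: a software device whose ndo_start_xmit already runs with BHs
 * disabled can hand packets straight back to the stack with __netif_rx(),
 * much like the loopback driver does (the function below is illustrative):
 *
 *	static netdev_tx_t example_loop_xmit(struct sk_buff *skb,
 *					     struct net_device *dev)
 *	{
 *		skb_orphan(skb);
 *		skb_dst_force(skb);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		__netif_rx(skb);
 *		return NETDEV_TX_OK;
 *	}
 */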
5098
5099/**
5100 * netif_rx - post buffer to the network code
5101 * @skb: buffer to post
5102 *
5103 * This function receives a packet from a device driver and queues it for
5104 * the upper (protocol) levels to process via the backlog NAPI device. It
5105 * always succeeds. The buffer may be dropped during processing for
5106 * congestion control or by the protocol layers.
5107 * The network buffer is passed via the backlog NAPI device. Modern NIC
5108 * drivers should use NAPI and GRO.
5109 * This function can be used from interrupt and from process context. The
5110 * caller from process context must not disable interrupts before invoking
5111 * this function.
5112 *
5113 * return values:
5114 * NET_RX_SUCCESS (no congestion)
5115 * NET_RX_DROP (packet was dropped)
5116 *
5117 */
5118int netif_rx(struct sk_buff *skb)
5119{
5120 bool need_bh_off = !(hardirq_count() | softirq_count());
5121 int ret;
5122
5123 if (need_bh_off)
5124 local_bh_disable();
5125 trace_netif_rx_entry(skb);
5126 ret = netif_rx_internal(skb);
5127 trace_netif_rx_exit(ret);
5128 if (need_bh_off)
5129 local_bh_enable();
5130 return ret;
5131}
5132EXPORT_SYMBOL(netif_rx);
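
/* Usage sketch: the classic receive path of a legacy, non-NAPI driver builds
 * an skb for each received frame and posts it with netif_rx(). rx_buf and
 * pkt_len below are hypothetical driver state:
 *
 *	skb = netdev_alloc_skb(dev, pkt_len + NET_IP_ALIGN);
 *	if (!skb)
 *		return;
 *	skb_reserve(skb, NET_IP_ALIGN);
 *	skb_put_data(skb, rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */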
5133
5134static __latent_entropy void net_tx_action(struct softirq_action *h)
5135{
5136 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5137
5138 if (sd->completion_queue) {
5139 struct sk_buff *clist;
5140
5141 local_irq_disable();
5142 clist = sd->completion_queue;
5143 sd->completion_queue = NULL;
5144 local_irq_enable();
5145
5146 while (clist) {
5147 struct sk_buff *skb = clist;
5148
5149 clist = clist->next;
5150
5151 WARN_ON(refcount_read(&skb->users));
5152 if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
5153 trace_consume_skb(skb, net_tx_action);
5154 else
5155 trace_kfree_skb(skb, net_tx_action,
5156 get_kfree_skb_cb(skb)->reason);
5157
5158 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5159 __kfree_skb(skb);
5160 else
5161 __napi_kfree_skb(skb,
5162 get_kfree_skb_cb(skb)->reason);
5163 }
5164 }
5165
5166 if (sd->output_queue) {
5167 struct Qdisc *head;
5168
5169 local_irq_disable();
5170 head = sd->output_queue;
5171 sd->output_queue = NULL;
5172 sd->output_queue_tailp = &sd->output_queue;
5173 local_irq_enable();
5174
5175 rcu_read_lock();
5176
5177 while (head) {
5178 struct Qdisc *q = head;
5179 spinlock_t *root_lock = NULL;
5180
5181 head = head->next_sched;
5182
5183 /* We need to make sure head->next_sched is read
5184 * before clearing __QDISC_STATE_SCHED
5185 */
5186 smp_mb__before_atomic();
5187
5188 if (!(q->flags & TCQ_F_NOLOCK)) {
5189 root_lock = qdisc_lock(q);
5190 spin_lock(root_lock);
5191 } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5192 &q->state))) {
5193 /* There is a synchronize_net() between
5194 * STATE_DEACTIVATED flag being set and
5195 * qdisc_reset()/some_qdisc_is_busy() in
5196 * dev_deactivate(), so we can safely bail out
5197 * early here to avoid data race between
5198 * qdisc_deactivate() and some_qdisc_is_busy()
5199 * for lockless qdisc.
5200 */
5201 clear_bit(__QDISC_STATE_SCHED, &q->state);
5202 continue;
5203 }
5204
5205 clear_bit(__QDISC_STATE_SCHED, &q->state);
5206 qdisc_run(q);
5207 if (root_lock)
5208 spin_unlock(root_lock);
5209 }
5210
5211 rcu_read_unlock();
5212 }
5213
5214 xfrm_dev_backlog(sd);
5215}
5216
5217#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5218/* This hook is defined here for ATM LANE */
5219int (*br_fdb_test_addr_hook)(struct net_device *dev,
5220 unsigned char *addr) __read_mostly;
5221EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5222#endif
5223
5224/**
5225 * netdev_is_rx_handler_busy - check if receive handler is registered
5226 * @dev: device to check
5227 *
5228 * Check if a receive handler is already registered for a given device.
5229 * Return true if there is one.
5230 *
5231 * The caller must hold the rtnl_mutex.
5232 */
5233bool netdev_is_rx_handler_busy(struct net_device *dev)
5234{
5235 ASSERT_RTNL();
5236 return dev && rtnl_dereference(dev->rx_handler);
5237}
5238EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5239
5240/**
5241 * netdev_rx_handler_register - register receive handler
5242 * @dev: device to register a handler for
5243 * @rx_handler: receive handler to register
5244 * @rx_handler_data: data pointer that is used by rx handler
5245 *
5246 * Register a receive handler for a device. This handler will then be
5247 * called from __netif_receive_skb. A negative errno code is returned
5248 * on a failure.
5249 *
5250 * The caller must hold the rtnl_mutex.
5251 *
5252 * For a general description of rx_handler, see enum rx_handler_result.
5253 */
5254int netdev_rx_handler_register(struct net_device *dev,
5255 rx_handler_func_t *rx_handler,
5256 void *rx_handler_data)
5257{
5258 if (netdev_is_rx_handler_busy(dev))
5259 return -EBUSY;
5260
5261 if (dev->priv_flags & IFF_NO_RX_HANDLER)
5262 return -EINVAL;
5263
5264 /* Note: rx_handler_data must be set before rx_handler */
5265 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5266 rcu_assign_pointer(dev->rx_handler, rx_handler);
5267
5268 return 0;
5269}
5270EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
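
/* Usage sketch: an upper device (bridge, bonding, team, etc.) claims a lower
 * device by registering an rx_handler under rtnl_lock. The handler, the
 * example_port structure, and the lower_dev/port/err names below are all
 * hypothetical:
 *
 *	static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct example_port *port =
 *			rcu_dereference(skb->dev->rx_handler_data);
 *
 *		skb->dev = port->upper_dev;
 *		return RX_HANDLER_ANOTHER;
 *	}
 *
 *	ASSERT_RTNL();
 *	err = netdev_rx_handler_register(lower_dev, example_handle_frame, port);
 */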
5271
5272/**
5273 * netdev_rx_handler_unregister - unregister receive handler
5274 * @dev: device to unregister a handler from
5275 *
5276 * Unregister a receive handler from a device.
5277 *
5278 * The caller must hold the rtnl_mutex.
5279 */
5280void netdev_rx_handler_unregister(struct net_device *dev)
5281{
5283 ASSERT_RTNL();
5284 RCU_INIT_POINTER(dev->rx_handler, NULL);
5285 /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
5286 * section is guaranteed to see a non-NULL rx_handler_data
5287 * as well.
5288 */
5289 synchronize_net();
5290 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5291}
5292EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5293
5294/*
5295 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5296 * the special handling of PFMEMALLOC skbs.
5297 */
5298static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5299{
5300 switch (skb->protocol) {
5301 case htons(ETH_P_ARP):
5302 case htons(ETH_P_IP):
5303 case htons(ETH_P_IPV6):
5304 case htons(ETH_P_8021Q):
5305 case htons(ETH_P_8021AD):
5306 return true;
5307 default:
5308 return false;
5309 }
5310}
5311
5312static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5313 int *ret, struct net_device *orig_dev)
5314{
5315 if (nf_hook_ingress_active(skb)) {
5316 int ingress_retval;
5317
5318 if (*pt_prev) {
5319 *ret = deliver_skb(skb, *pt_prev, orig_dev);
5320 *pt_prev = NULL;
5321 }
5322
5323 rcu_read_lock();
5324 ingress_retval = nf_hook_ingress(skb);
5325 rcu_read_unlock();
5326 return ingress_retval;
5327 }
5328 return 0;
5329}
5330
5331static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5332 struct packet_type **ppt_prev)
5333{
5334 struct packet_type *ptype, *pt_prev;
5335 rx_handler_func_t *rx_handler;
5336 struct sk_buff *skb = *pskb;
5337 struct net_device *orig_dev;
5338 bool deliver_exact = false;
5339 int ret = NET_RX_DROP;
5340 __be16 type;
5341
5342 net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5343
5344 trace_netif_receive_skb(skb);
5345
5346 orig_dev = skb->dev;
5347
5348 skb_reset_network_header(skb);
5349 if (!skb_transport_header_was_set(skb))
5350 skb_reset_transport_header(skb);
5351 skb_reset_mac_len(skb);
5352
5353 pt_prev = NULL;
5354
5355another_round:
5356 skb->skb_iif = skb->dev->ifindex;
5357
5358 __this_cpu_inc(softnet_data.processed);
5359
5360 if (static_branch_unlikely(&generic_xdp_needed_key)) {
5361 int ret2;
5362
5363 migrate_disable();
5364 ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
5365 &skb);
5366 migrate_enable();
5367
5368 if (ret2 != XDP_PASS) {
5369 ret = NET_RX_DROP;
5370 goto out;
5371 }
5372 }
5373
5374 if (eth_type_vlan(skb->protocol)) {
5375 skb = skb_vlan_untag(skb);
5376 if (unlikely(!skb))
5377 goto out;
5378 }
5379
5380 if (skb_skip_tc_classify(skb))
5381 goto skip_classify;
5382
5383 if (pfmemalloc)
5384 goto skip_taps;
5385
5386 list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
5387 if (pt_prev)
5388 ret = deliver_skb(skb, pt_prev, orig_dev);
5389 pt_prev = ptype;
5390 }
5391
5392 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5393 if (pt_prev)
5394 ret = deliver_skb(skb, pt_prev, orig_dev);
5395 pt_prev = ptype;
5396 }
5397
5398skip_taps:
5399#ifdef CONFIG_NET_INGRESS
5400 if (static_branch_unlikely(&ingress_needed_key)) {
5401 bool another = false;
5402
5403 nf_skip_egress(skb, true);
5404 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5405 &another);
5406 if (another)
5407 goto another_round;
5408 if (!skb)
5409 goto out;
5410
5411 nf_skip_egress(skb, false);
5412 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5413 goto out;
5414 }
5415#endif
5416 skb_reset_redirect(skb);
5417skip_classify:
5418 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5419 goto drop;
5420
5421 if (skb_vlan_tag_present(skb)) {
5422 if (pt_prev) {
5423 ret = deliver_skb(skb, pt_prev, orig_dev);
5424 pt_prev = NULL;
5425 }
5426 if (vlan_do_receive(&skb))
5427 goto another_round;
5428 else if (unlikely(!skb))
5429 goto out;
5430 }
5431
5432 rx_handler = rcu_dereference(skb->dev->rx_handler);
5433 if (rx_handler) {
5434 if (pt_prev) {
5435 ret = deliver_skb(skb, pt_prev, orig_dev);
5436 pt_prev = NULL;
5437 }
5438 switch (rx_handler(&skb)) {
5439 case RX_HANDLER_CONSUMED:
5440 ret = NET_RX_SUCCESS;
5441 goto out;
5442 case RX_HANDLER_ANOTHER:
5443 goto another_round;
5444 case RX_HANDLER_EXACT:
5445 deliver_exact = true;
5446 break;
5447 case RX_HANDLER_PASS:
5448 break;
5449 default:
5450 BUG();
5451 }
5452 }
5453
5454 if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5455check_vlan_id:
5456 if (skb_vlan_tag_get_id(skb)) {
5457 /* The vlan id is non-zero and vlan_do_receive() above couldn't
5458 * find the vlan device.
5459 */
5460 skb->pkt_type = PACKET_OTHERHOST;
5461 } else if (eth_type_vlan(skb->protocol)) {
5462 /* Outer header is 802.1P with vlan 0, inner header is
5463 * 802.1Q or 802.1AD and vlan_do_receive() above could
5464 * not find vlan dev for vlan id 0.
5465 */
5466 __vlan_hwaccel_clear_tag(skb);
5467 skb = skb_vlan_untag(skb);
5468 if (unlikely(!skb))
5469 goto out;
5470 if (vlan_do_receive(&skb))
5471 /* After stripping off 802.1P header with vlan 0
5472 * vlan dev is found for inner header.
5473 */
5474 goto another_round;
5475 else if (unlikely(!skb))
5476 goto out;
5477 else
5478 /* We have stripped the outer 802.1P vlan 0 header,
5479 * but could not find a vlan dev.
5480 * Check again for a vlan id to set OTHERHOST.
5481 */
5482 goto check_vlan_id;
5483 }
5484 /* Note: we might in the future use prio bits
5485 * and set skb->priority like in vlan_do_receive()
5486 * For the time being, just ignore Priority Code Point
5487 */
5488 __vlan_hwaccel_clear_tag(skb);
5489 }
5490
5491 type = skb->protocol;
5492
5493 /* deliver only exact match when indicated */
5494 if (likely(!deliver_exact)) {
5495 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5496 &ptype_base[ntohs(type) &
5497 PTYPE_HASH_MASK]);
5498 }
5499
5500 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5501 &orig_dev->ptype_specific);
5502
5503 if (unlikely(skb->dev != orig_dev)) {
5504 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5505 &skb->dev->ptype_specific);
5506 }
5507
5508 if (pt_prev) {
5509 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5510 goto drop;
5511 *ppt_prev = pt_prev;
5512 } else {
5513drop:
5514 if (!deliver_exact)
5515 dev_core_stats_rx_dropped_inc(skb->dev);
5516 else
5517 dev_core_stats_rx_nohandler_inc(skb->dev);
5518 kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
5519 /* Jamal, now you will not be able to escape explaining
5520 * to me how you were going to use this. :-)
5521 */
5522 ret = NET_RX_DROP;
5523 }
5524
5525out:
5526 /* The invariant here is that if *ppt_prev is not NULL
5527 * then skb should also be non-NULL.
5528 *
5529 * Apparently the *ppt_prev assignment above holds this invariant due to
5530 * the skb dereference near it.
5531 */
5532 *pskb = skb;
5533 return ret;
5534}
5535
5536static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5537{
5538 struct net_device *orig_dev = skb->dev;
5539 struct packet_type *pt_prev = NULL;
5540 int ret;
5541
5542 ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5543 if (pt_prev)
5544 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5545 skb->dev, pt_prev, orig_dev);
5546 return ret;
5547}
5548
5549/**
5550 * netif_receive_skb_core - special purpose version of netif_receive_skb
5551 * @skb: buffer to process
5552 *
5553 * More direct receive version of netif_receive_skb(). It should
5554 * only be used by callers that have a need to skip RPS and Generic XDP.
5555 * Caller must also take care of handling if the skb is ``(page_is_)pfmemalloc``.
5556 *
5557 * This function may only be called from softirq context and interrupts
5558 * should be enabled.
5559 *
5560 * Return values (usually ignored):
5561 * NET_RX_SUCCESS: no congestion
5562 * NET_RX_DROP: packet was dropped
5563 */
5564int netif_receive_skb_core(struct sk_buff *skb)
5565{
5566 int ret;
5567
5568 rcu_read_lock();
5569 ret = __netif_receive_skb_one_core(skb, false);
5570 rcu_read_unlock();
5571
5572 return ret;
5573}
5574EXPORT_SYMBOL(netif_receive_skb_core);
5575
5576static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5577 struct packet_type *pt_prev,
5578 struct net_device *orig_dev)
5579{
5580 struct sk_buff *skb, *next;
5581
5582 if (!pt_prev)
5583 return;
5584 if (list_empty(head))
5585 return;
5586 if (pt_prev->list_func != NULL)
5587 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5588 ip_list_rcv, head, pt_prev, orig_dev);
5589 else
5590 list_for_each_entry_safe(skb, next, head, list) {
5591 skb_list_del_init(skb);
5592 pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5593 }
5594}
5595
5596static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5597{
5598 /* Fast-path assumptions:
5599 * - There is no RX handler.
5600 * - Only one packet_type matches.
5601 * If either of these fails, we will end up doing some per-packet
5602 * processing in-line, then handling the 'last ptype' for the whole
5603 * sublist. This can't cause out-of-order delivery to any single ptype,
5604 * because the 'last ptype' must be constant across the sublist, and all
5605 * other ptypes are handled per-packet.
5606 */
5607 /* Current (common) ptype of sublist */
5608 struct packet_type *pt_curr = NULL;
5609 /* Current (common) orig_dev of sublist */
5610 struct net_device *od_curr = NULL;
5611 struct list_head sublist;
5612 struct sk_buff *skb, *next;
5613
5614 INIT_LIST_HEAD(&sublist);
5615 list_for_each_entry_safe(skb, next, head, list) {
5616 struct net_device *orig_dev = skb->dev;
5617 struct packet_type *pt_prev = NULL;
5618
5619 skb_list_del_init(skb);
5620 __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5621 if (!pt_prev)
5622 continue;
5623 if (pt_curr != pt_prev || od_curr != orig_dev) {
5624 /* dispatch old sublist */
5625 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5626 /* start new sublist */
5627 INIT_LIST_HEAD(&sublist);
5628 pt_curr = pt_prev;
5629 od_curr = orig_dev;
5630 }
5631 list_add_tail(&skb->list, &sublist);
5632 }
5633
5634 /* dispatch final sublist */
5635 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5636}
5637
5638static int __netif_receive_skb(struct sk_buff *skb)
5639{
5640 int ret;
5641
5642 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5643 unsigned int noreclaim_flag;
5644
5645 /*
5646 * PFMEMALLOC skbs are special, they should
5647 * - be delivered to SOCK_MEMALLOC sockets only
5648 * - stay away from userspace
5649 * - have bounded memory usage
5650 *
5651 * Use PF_MEMALLOC as this saves us from propagating the allocation
5652 * context down to all allocation sites.
5653 */
5654 noreclaim_flag = memalloc_noreclaim_save();
5655 ret = __netif_receive_skb_one_core(skb, true);
5656 memalloc_noreclaim_restore(noreclaim_flag);
5657 } else
5658 ret = __netif_receive_skb_one_core(skb, false);
5659
5660 return ret;
5661}
5662
5663static void __netif_receive_skb_list(struct list_head *head)
5664{
5665 unsigned long noreclaim_flag = 0;
5666 struct sk_buff *skb, *next;
5667 bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5668
5669 list_for_each_entry_safe(skb, next, head, list) {
5670 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5671 struct list_head sublist;
5672
5673 /* Handle the previous sublist */
5674 list_cut_before(&sublist, head, &skb->list);
5675 if (!list_empty(&sublist))
5676 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5677 pfmemalloc = !pfmemalloc;
5678 /* See comments in __netif_receive_skb */
5679 if (pfmemalloc)
5680 noreclaim_flag = memalloc_noreclaim_save();
5681 else
5682 memalloc_noreclaim_restore(noreclaim_flag);
5683 }
5684 }
5685 /* Handle the remaining sublist */
5686 if (!list_empty(head))
5687 __netif_receive_skb_list_core(head, pfmemalloc);
5688 /* Restore pflags */
5689 if (pfmemalloc)
5690 memalloc_noreclaim_restore(noreclaim_flag);
5691}
5692
5693static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5694{
5695 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5696 struct bpf_prog *new = xdp->prog;
5697 int ret = 0;
5698
5699 switch (xdp->command) {
5700 case XDP_SETUP_PROG:
5701 rcu_assign_pointer(dev->xdp_prog, new);
5702 if (old)
5703 bpf_prog_put(old);
5704
5705 if (old && !new) {
5706 static_branch_dec(&generic_xdp_needed_key);
5707 } else if (new && !old) {
5708 static_branch_inc(&generic_xdp_needed_key);
5709 dev_disable_lro(dev);
5710 dev_disable_gro_hw(dev);
5711 }
5712 break;
5713
5714 default:
5715 ret = -EINVAL;
5716 break;
5717 }
5718
5719 return ret;
5720}
5721
5722static int netif_receive_skb_internal(struct sk_buff *skb)
5723{
5724 int ret;
5725
5726 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5727
5728 if (skb_defer_rx_timestamp(skb))
5729 return NET_RX_SUCCESS;
5730
5731 rcu_read_lock();
5732#ifdef CONFIG_RPS
5733 if (static_branch_unlikely(&rps_needed)) {
5734 struct rps_dev_flow voidflow, *rflow = &voidflow;
5735 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5736
5737 if (cpu >= 0) {
5738 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5739 rcu_read_unlock();
5740 return ret;
5741 }
5742 }
5743#endif
5744 ret = __netif_receive_skb(skb);
5745 rcu_read_unlock();
5746 return ret;
5747}
5748
5749void netif_receive_skb_list_internal(struct list_head *head)
5750{
5751 struct sk_buff *skb, *next;
5752 struct list_head sublist;
5753
5754 INIT_LIST_HEAD(&sublist);
5755 list_for_each_entry_safe(skb, next, head, list) {
5756 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
5757 skb);
5758 skb_list_del_init(skb);
5759 if (!skb_defer_rx_timestamp(skb))
5760 list_add_tail(&skb->list, &sublist);
5761 }
5762 list_splice_init(&sublist, head);
5763
5764 rcu_read_lock();
5765#ifdef CONFIG_RPS
5766 if (static_branch_unlikely(&rps_needed)) {
5767 list_for_each_entry_safe(skb, next, head, list) {
5768 struct rps_dev_flow voidflow, *rflow = &voidflow;
5769 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5770
5771 if (cpu >= 0) {
5772 /* Will be handled, remove from list */
5773 skb_list_del_init(skb);
5774 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5775 }
5776 }
5777 }
5778#endif
5779 __netif_receive_skb_list(head);
5780 rcu_read_unlock();
5781}
5782
5783/**
5784 * netif_receive_skb - process receive buffer from network
5785 * @skb: buffer to process
5786 *
5787 * netif_receive_skb() is the main receive data processing function.
5788 * It always succeeds. The buffer may be dropped during processing
5789 * for congestion control or by the protocol layers.
5790 *
5791 * This function may only be called from softirq context and interrupts
5792 * should be enabled.
5793 *
5794 * Return values (usually ignored):
5795 * NET_RX_SUCCESS: no congestion
5796 * NET_RX_DROP: packet was dropped
5797 */
5798int netif_receive_skb(struct sk_buff *skb)
5799{
5800 int ret;
5801
5802 trace_netif_receive_skb_entry(skb);
5803
5804 ret = netif_receive_skb_internal(skb);
5805 trace_netif_receive_skb_exit(ret);
5806
5807 return ret;
5808}
5809EXPORT_SYMBOL(netif_receive_skb);
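
/* Usage sketch: a NAPI poll routine may deliver each completed frame with
 * netif_receive_skb() (drivers wanting GRO would use napi_gro_receive()
 * instead). The example_priv layout and example_next_rx_skb() helper are
 * hypothetical:
 *
 *	static int example_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct example_priv *priv =
 *			container_of(napi, struct example_priv, napi);
 *		int work = 0;
 *
 *		while (work < budget) {
 *			struct sk_buff *skb = example_next_rx_skb(priv);
 *
 *			if (!skb)
 *				break;
 *			skb->protocol = eth_type_trans(skb, priv->netdev);
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete_done(napi, work);
 *		return work;
 *	}
 */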
5810
5811/**
5812 * netif_receive_skb_list - process many receive buffers from network
5813 * @head: list of skbs to process.
5814 *
5815 * Since the return value of netif_receive_skb() is normally ignored, and
5816 * wouldn't be meaningful for a list, this function returns void.
5817 *
5818 * This function may only be called from softirq context and interrupts
5819 * should be enabled.
5820 */
5821void netif_receive_skb_list(struct list_head *head)
5822{
5823 struct sk_buff *skb;
5824
5825 if (list_empty(head))
5826 return;
5827 if (trace_netif_receive_skb_list_entry_enabled()) {
5828 list_for_each_entry(skb, head, list)
5829 trace_netif_receive_skb_list_entry(skb);
5830 }
5831 netif_receive_skb_list_internal(head);
5832 trace_netif_receive_skb_list_exit(0);
5833}
5834EXPORT_SYMBOL(netif_receive_skb_list);
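
/* Usage sketch: callers can batch completed skbs on a list and hand them over
 * in one go, so that packets sharing the same ptype and device are delivered
 * as a sublist (e.g. via ip_list_rcv()). The example_* names are
 * hypothetical:
 *
 *	LIST_HEAD(rx_list);
 *
 *	while ((skb = example_next_rx_skb(priv)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, priv->netdev);
 *		list_add_tail(&skb->list, &rx_list);
 *	}
 *	netif_receive_skb_list(&rx_list);
 */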
5835
5836static DEFINE_PER_CPU(struct work_struct, flush_works);
5837
5838/* Network device is going away, flush any packets still pending */
5839static void flush_backlog(struct work_struct *work)
5840{
5841 struct sk_buff *skb, *tmp;
5842 struct softnet_data *sd;
5843
5844 local_bh_disable();
5845 sd = this_cpu_ptr(&softnet_data);
5846
5847 rps_lock_irq_disable(sd);
5848 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5849 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5850 __skb_unlink(skb, &sd->input_pkt_queue);
5851 dev_kfree_skb_irq(skb);
5852 input_queue_head_incr(sd);
5853 }
5854 }
5855 rps_unlock_irq_enable(sd);
5856
5857 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5858 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5859 __skb_unlink(skb, &sd->process_queue);
5860 kfree_skb(skb);
5861 input_queue_head_incr(sd);
5862 }
5863 }
5864 local_bh_enable();
5865}
5866
5867static bool flush_required(int cpu)
5868{
5869#if IS_ENABLED(CONFIG_RPS)
5870 struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5871 bool do_flush;
5872
5873 rps_lock_irq_disable(sd);
5874
5875 /* as insertion into process_queue happens with the rps lock held,
5876 * process_queue access may race only with dequeue
5877 */
5878 do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5879 !skb_queue_empty_lockless(&sd->process_queue);
5880 rps_unlock_irq_enable(sd);
5881
5882 return do_flush;
5883#endif
5884 /* without RPS we can't safely check input_pkt_queue: during a
5885	 * concurrent remote skb_queue_splice() we can see both
5886	 * input_pkt_queue and process_queue as empty even if the latter could
5887	 * end up containing a lot of packets.
5888 */
5889 return true;
5890}
5891
5892static void flush_all_backlogs(void)
5893{
5894 static cpumask_t flush_cpus;
5895 unsigned int cpu;
5896
5897 /* since we are under rtnl lock protection we can use static data
5898	 * for the cpumask and avoid allocating the possibly large mask
5899	 * on the stack
5900 */
5901 ASSERT_RTNL();
5902
5903 cpus_read_lock();
5904
5905 cpumask_clear(&flush_cpus);
5906 for_each_online_cpu(cpu) {
5907 if (flush_required(cpu)) {
5908 queue_work_on(cpu, system_highpri_wq,
5909 per_cpu_ptr(&flush_works, cpu));
5910 cpumask_set_cpu(cpu, &flush_cpus);
5911 }
5912 }
5913
5914	/* we can have in-flight packets on the cpus we are not flushing;
5915	 * synchronize_net() in unregister_netdevice_many() will take care of
5916	 * them
5917 */
5918 for_each_cpu(cpu, &flush_cpus)
5919 flush_work(per_cpu_ptr(&flush_works, cpu));
5920
5921 cpus_read_unlock();
5922}
5923
5924static void net_rps_send_ipi(struct softnet_data *remsd)
5925{
5926#ifdef CONFIG_RPS
5927 while (remsd) {
5928 struct softnet_data *next = remsd->rps_ipi_next;
5929
5930 if (cpu_online(remsd->cpu))
5931 smp_call_function_single_async(remsd->cpu, &remsd->csd);
5932 remsd = next;
5933 }
5934#endif
5935}
5936
5937/*
5938 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
5939 * Note: called with local irq disabled, but exits with local irq enabled.
5940 */
5941static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5942{
5943#ifdef CONFIG_RPS
5944 struct softnet_data *remsd = sd->rps_ipi_list;
5945
5946 if (remsd) {
5947 sd->rps_ipi_list = NULL;
5948
5949 local_irq_enable();
5950
5951 /* Send pending IPI's to kick RPS processing on remote cpus. */
5952 net_rps_send_ipi(remsd);
5953 } else
5954#endif
5955 local_irq_enable();
5956}
5957
5958static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5959{
5960#ifdef CONFIG_RPS
5961 return sd->rps_ipi_list != NULL;
5962#else
5963 return false;
5964#endif
5965}
5966
5967static int process_backlog(struct napi_struct *napi, int quota)
5968{
5969 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5970 bool again = true;
5971 int work = 0;
5972
5973	/* Check if we have pending IPIs; it's better to send them now
5974	 * rather than waiting for net_rx_action() to end.
5975	 */
5976 if (sd_has_rps_ipi_waiting(sd)) {
5977 local_irq_disable();
5978 net_rps_action_and_irq_enable(sd);
5979 }
5980
5981 napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
5982 while (again) {
5983 struct sk_buff *skb;
5984
5985 while ((skb = __skb_dequeue(&sd->process_queue))) {
5986 rcu_read_lock();
5987 __netif_receive_skb(skb);
5988 rcu_read_unlock();
5989 input_queue_head_incr(sd);
5990 if (++work >= quota)
5991 return work;
5992
5993 }
5994
5995 rps_lock_irq_disable(sd);
5996 if (skb_queue_empty(&sd->input_pkt_queue)) {
5997 /*
5998 * Inline a custom version of __napi_complete().
5999			 * only the current cpu owns and manipulates this napi,
6000 * and NAPI_STATE_SCHED is the only possible flag set
6001 * on backlog.
6002 * We can use a plain write instead of clear_bit(),
6003			 * and we don't need an smp_mb() memory barrier.
6004 */
6005 napi->state = 0;
6006 again = false;
6007 } else {
6008 skb_queue_splice_tail_init(&sd->input_pkt_queue,
6009 &sd->process_queue);
6010 }
6011 rps_unlock_irq_enable(sd);
6012 }
6013
6014 return work;
6015}
6016
6017/**
6018 * __napi_schedule - schedule for receive
6019 * @n: entry to schedule
6020 *
6021 * The entry's receive function will be scheduled to run.
6022 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6023 */
6024void __napi_schedule(struct napi_struct *n)
6025{
6026 unsigned long flags;
6027
6028 local_irq_save(flags);
6029 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6030 local_irq_restore(flags);
6031}
6032EXPORT_SYMBOL(__napi_schedule);
6033
6034/**
6035 * napi_schedule_prep - check if napi can be scheduled
6036 * @n: napi context
6037 *
6038 * Test if the NAPI routine is already running, and if not, mark
6039 * it as running. This is used as a condition variable to
6040 * ensure only one NAPI poll instance runs. We also make
6041 * sure there is no pending NAPI disable.
6042 */
6043bool napi_schedule_prep(struct napi_struct *n)
6044{
6045 unsigned long new, val = READ_ONCE(n->state);
6046
6047 do {
6048 if (unlikely(val & NAPIF_STATE_DISABLE))
6049 return false;
6050 new = val | NAPIF_STATE_SCHED;
6051
6052		/* Sets the STATE_MISSED bit if STATE_SCHED was already set.
6053		 * This was suggested by Alexander Duyck, as the compiler
6054		 * emits better code than:
6055 * if (val & NAPIF_STATE_SCHED)
6056 * new |= NAPIF_STATE_MISSED;
6057 */
6058 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6059 NAPIF_STATE_MISSED;
6060 } while (!try_cmpxchg(&n->state, &val, new));
6061
6062 return !(val & NAPIF_STATE_SCHED);
6063}
6064EXPORT_SYMBOL(napi_schedule_prep);
6065
6066/**
6067 * __napi_schedule_irqoff - schedule for receive
6068 * @n: entry to schedule
6069 *
6070 * Variant of __napi_schedule() assuming hard irqs are masked.
6071 *
6072 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6073 * because the interrupt disabled assumption might not be true
6074 * due to force-threaded interrupts and spinlock substitution.
6075 */
6076void __napi_schedule_irqoff(struct napi_struct *n)
6077{
6078 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6079 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6080 else
6081 __napi_schedule(n);
6082}
6083EXPORT_SYMBOL(__napi_schedule_irqoff);
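
/*
 * Illustrative sketch, not part of the original file: a hard irq handler
 * pairing napi_schedule_prep() with __napi_schedule_irqoff(). The mydrv_*
 * names (including mydrv_irq_disable()) are hypothetical.
 */
#if 0
struct mydrv_priv {
	struct napi_struct napi;
};

static irqreturn_t mydrv_interrupt(int irq, void *dev_id)
{
	struct mydrv_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		/* Mask device interrupts until the poll loop has finished. */
		mydrv_irq_disable(priv);
		/* Hard irqs are masked here, so the _irqoff variant is safe. */
		__napi_schedule_irqoff(&priv->napi);
	}

	return IRQ_HANDLED;
}
#endif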
6084
6085bool napi_complete_done(struct napi_struct *n, int work_done)
6086{
6087 unsigned long flags, val, new, timeout = 0;
6088 bool ret = true;
6089
6090 /*
6091 * 1) Don't let napi dequeue from the cpu poll list
6092	 * just in case it's running on a different cpu.
6093	 * 2) If we are busy polling, do nothing here; we have
6094 * the guarantee we will be called later.
6095 */
6096 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6097 NAPIF_STATE_IN_BUSY_POLL)))
6098 return false;
6099
6100 if (work_done) {
6101 if (n->gro_bitmask)
6102 timeout = READ_ONCE(n->dev->gro_flush_timeout);
6103 n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6104 }
6105 if (n->defer_hard_irqs_count > 0) {
6106 n->defer_hard_irqs_count--;
6107 timeout = READ_ONCE(n->dev->gro_flush_timeout);
6108 if (timeout)
6109 ret = false;
6110 }
6111 if (n->gro_bitmask) {
6112 /* When the NAPI instance uses a timeout and keeps postponing
6113		 * it, we need to somehow bound the time packets are kept in
6114		 * the GRO layer.
6115 */
6116 napi_gro_flush(n, !!timeout);
6117 }
6118
6119 gro_normal_list(n);
6120
6121 if (unlikely(!list_empty(&n->poll_list))) {
6122 /* If n->poll_list is not empty, we need to mask irqs */
6123 local_irq_save(flags);
6124 list_del_init(&n->poll_list);
6125 local_irq_restore(flags);
6126 }
6127 WRITE_ONCE(n->list_owner, -1);
6128
6129 val = READ_ONCE(n->state);
6130 do {
6131 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6132
6133 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6134 NAPIF_STATE_SCHED_THREADED |
6135 NAPIF_STATE_PREFER_BUSY_POLL);
6136
6137 /* If STATE_MISSED was set, leave STATE_SCHED set,
6138 * because we will call napi->poll() one more time.
6139 * This C code was suggested by Alexander Duyck to help gcc.
6140 */
6141 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6142 NAPIF_STATE_SCHED;
6143 } while (!try_cmpxchg(&n->state, &val, new));
6144
6145 if (unlikely(val & NAPIF_STATE_MISSED)) {
6146 __napi_schedule(n);
6147 return false;
6148 }
6149
6150 if (timeout)
6151 hrtimer_start(&n->timer, ns_to_ktime(timeout),
6152 HRTIMER_MODE_REL_PINNED);
6153 return ret;
6154}
6155EXPORT_SYMBOL(napi_complete_done);
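
/*
 * Illustrative sketch, not part of the original file: a minimal NAPI poll
 * callback built around napi_complete_done(). mydrv_clean_rx() and
 * mydrv_irq_enable() are hypothetical driver helpers.
 */
#if 0
static int mydrv_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
	int work_done;

	/* Process up to @budget received packets. */
	work_done = mydrv_clean_rx(priv, budget);

	/* Budget exhausted: stay scheduled, net_rx_action() calls us again. */
	if (work_done == budget)
		return budget;

	/* Only re-arm device interrupts if NAPI really completed; the
	 * gro_flush_timeout / napi_defer_hard_irqs logic above may keep
	 * the instance scheduled and return false here.
	 */
	if (napi_complete_done(napi, work_done))
		mydrv_irq_enable(priv);

	return work_done;
}
#endif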
6156
6157/* must be called under rcu_read_lock(), as we don't take a reference */
6158struct napi_struct *napi_by_id(unsigned int napi_id)
6159{
6160 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6161 struct napi_struct *napi;
6162
6163 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6164 if (napi->napi_id == napi_id)
6165 return napi;
6166
6167 return NULL;
6168}
6169
6170static void skb_defer_free_flush(struct softnet_data *sd)
6171{
6172 struct sk_buff *skb, *next;
6173
6174 /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
6175 if (!READ_ONCE(sd->defer_list))
6176 return;
6177
6178 spin_lock(&sd->defer_lock);
6179 skb = sd->defer_list;
6180 sd->defer_list = NULL;
6181 sd->defer_count = 0;
6182 spin_unlock(&sd->defer_lock);
6183
6184 while (skb != NULL) {
6185 next = skb->next;
6186 napi_consume_skb(skb, 1);
6187 skb = next;
6188 }
6189}
6190
6191#if defined(CONFIG_NET_RX_BUSY_POLL)
6192
6193static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6194{
6195 if (!skip_schedule) {
6196 gro_normal_list(napi);
6197 __napi_schedule(napi);
6198 return;
6199 }
6200
6201 if (napi->gro_bitmask) {
6202		/* flush packets that are too old.
6203 * If HZ < 1000, flush all packets.
6204 */
6205 napi_gro_flush(napi, HZ >= 1000);
6206 }
6207
6208 gro_normal_list(napi);
6209 clear_bit(NAPI_STATE_SCHED, &napi->state);
6210}
6211
6212enum {
6213 NAPI_F_PREFER_BUSY_POLL = 1,
6214 NAPI_F_END_ON_RESCHED = 2,
6215};
6216
6217static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
6218 unsigned flags, u16 budget)
6219{
6220 bool skip_schedule = false;
6221 unsigned long timeout;
6222 int rc;
6223
6224	/* Busy polling means there is a high chance the device driver hard irq
6225 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6226 * set in napi_schedule_prep().
6227 * Since we are about to call napi->poll() once more, we can safely
6228 * clear NAPI_STATE_MISSED.
6229 *
6230 * Note: x86 could use a single "lock and ..." instruction
6231	 * to perform these two clear_bit() calls.
6232 */
6233 clear_bit(NAPI_STATE_MISSED, &napi->state);
6234 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6235
6236 local_bh_disable();
6237
6238 if (flags & NAPI_F_PREFER_BUSY_POLL) {
6239 napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
6240 timeout = READ_ONCE(napi->dev->gro_flush_timeout);
6241 if (napi->defer_hard_irqs_count && timeout) {
6242 hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6243 skip_schedule = true;
6244 }
6245 }
6246
6247 /* All we really want here is to re-enable device interrupts.
6248 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6249 */
6250 rc = napi->poll(napi, budget);
6251 /* We can't gro_normal_list() here, because napi->poll() might have
6252 * rearmed the napi (napi_complete_done()) in which case it could
6253 * already be running on another CPU.
6254 */
6255 trace_napi_poll(napi, rc, budget);
6256 netpoll_poll_unlock(have_poll_lock);
6257 if (rc == budget)
6258 __busy_poll_stop(napi, skip_schedule);
6259 local_bh_enable();
6260}
6261
6262static void __napi_busy_loop(unsigned int napi_id,
6263 bool (*loop_end)(void *, unsigned long),
6264 void *loop_end_arg, unsigned flags, u16 budget)
6265{
6266 unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6267 int (*napi_poll)(struct napi_struct *napi, int budget);
6268 void *have_poll_lock = NULL;
6269 struct napi_struct *napi;
6270
6271 WARN_ON_ONCE(!rcu_read_lock_held());
6272
6273restart:
6274 napi_poll = NULL;
6275
6276 napi = napi_by_id(napi_id);
6277 if (!napi)
6278 return;
6279
6280 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6281 preempt_disable();
6282 for (;;) {
6283 int work = 0;
6284
6285 local_bh_disable();
6286 if (!napi_poll) {
6287 unsigned long val = READ_ONCE(napi->state);
6288
6289 /* If multiple threads are competing for this napi,
6290 * we avoid dirtying napi->state as much as we can.
6291 */
6292 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6293 NAPIF_STATE_IN_BUSY_POLL)) {
6294 if (flags & NAPI_F_PREFER_BUSY_POLL)
6295 set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6296 goto count;
6297 }
6298 if (cmpxchg(&napi->state, val,
6299 val | NAPIF_STATE_IN_BUSY_POLL |
6300 NAPIF_STATE_SCHED) != val) {
6301 if (flags & NAPI_F_PREFER_BUSY_POLL)
6302 set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6303 goto count;
6304 }
6305 have_poll_lock = netpoll_poll_lock(napi);
6306 napi_poll = napi->poll;
6307 }
6308 work = napi_poll(napi, budget);
6309 trace_napi_poll(napi, work, budget);
6310 gro_normal_list(napi);
6311count:
6312 if (work > 0)
6313 __NET_ADD_STATS(dev_net(napi->dev),
6314 LINUX_MIB_BUSYPOLLRXPACKETS, work);
6315 skb_defer_free_flush(this_cpu_ptr(&softnet_data));
6316 local_bh_enable();
6317
6318 if (!loop_end || loop_end(loop_end_arg, start_time))
6319 break;
6320
6321 if (unlikely(need_resched())) {
6322 if (flags & NAPI_F_END_ON_RESCHED)
6323 break;
6324 if (napi_poll)
6325 busy_poll_stop(napi, have_poll_lock, flags, budget);
6326 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6327 preempt_enable();
6328 rcu_read_unlock();
6329 cond_resched();
6330 rcu_read_lock();
6331 if (loop_end(loop_end_arg, start_time))
6332 return;
6333 goto restart;
6334 }
6335 cpu_relax();
6336 }
6337 if (napi_poll)
6338 busy_poll_stop(napi, have_poll_lock, flags, budget);
6339 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6340 preempt_enable();
6341}
6342
6343void napi_busy_loop_rcu(unsigned int napi_id,
6344 bool (*loop_end)(void *, unsigned long),
6345 void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6346{
6347 unsigned flags = NAPI_F_END_ON_RESCHED;
6348
6349 if (prefer_busy_poll)
6350 flags |= NAPI_F_PREFER_BUSY_POLL;
6351
6352 __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6353}
6354
6355void napi_busy_loop(unsigned int napi_id,
6356 bool (*loop_end)(void *, unsigned long),
6357 void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6358{
6359 unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
6360
6361 rcu_read_lock();
6362 __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6363 rcu_read_unlock();
6364}
6365EXPORT_SYMBOL(napi_busy_loop);
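
/*
 * Illustrative sketch, not part of the original file: how a caller such as
 * the socket layer might drive napi_busy_loop(). The loop_end callback
 * returns true once the caller is satisfied; my_ctx and the budget of 8
 * are hypothetical.
 */
#if 0
struct my_ctx {
	bool data_ready;
};

static bool my_loop_end(void *arg, unsigned long start_time)
{
	struct my_ctx *ctx = arg;

	return ctx->data_ready || busy_loop_timeout(start_time);
}

static void my_busy_poll(unsigned int napi_id, struct my_ctx *ctx)
{
	/* Spins on the NAPI instance until my_loop_end() returns true. */
	napi_busy_loop(napi_id, my_loop_end, ctx, false, 8);
}
#endif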
6366
6367#endif /* CONFIG_NET_RX_BUSY_POLL */
6368
6369static void napi_hash_add(struct napi_struct *napi)
6370{
6371 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6372 return;
6373
6374 spin_lock(&napi_hash_lock);
6375
6376 /* 0..NR_CPUS range is reserved for sender_cpu use */
6377 do {
6378 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6379 napi_gen_id = MIN_NAPI_ID;
6380 } while (napi_by_id(napi_gen_id));
6381 napi->napi_id = napi_gen_id;
6382
6383 hlist_add_head_rcu(&napi->napi_hash_node,
6384 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6385
6386 spin_unlock(&napi_hash_lock);
6387}
6388
6389/* Warning: the caller is responsible for making sure an rcu grace period
6390 * is respected before freeing the memory containing @napi
6391 */
6392static void napi_hash_del(struct napi_struct *napi)
6393{
6394 spin_lock(&napi_hash_lock);
6395
6396 hlist_del_init_rcu(&napi->napi_hash_node);
6397
6398 spin_unlock(&napi_hash_lock);
6399}
6400
6401static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6402{
6403 struct napi_struct *napi;
6404
6405 napi = container_of(timer, struct napi_struct, timer);
6406
6407	/* Note: we use a relaxed variant of napi_schedule_prep(), not setting
6408 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6409 */
6410 if (!napi_disable_pending(napi) &&
6411 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6412 clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6413 __napi_schedule_irqoff(napi);
6414 }
6415
6416 return HRTIMER_NORESTART;
6417}
6418
6419static void init_gro_hash(struct napi_struct *napi)
6420{
6421 int i;
6422
6423 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6424 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6425 napi->gro_hash[i].count = 0;
6426 }
6427 napi->gro_bitmask = 0;
6428}
6429
6430int dev_set_threaded(struct net_device *dev, bool threaded)
6431{
6432 struct napi_struct *napi;
6433 int err = 0;
6434
6435 if (dev->threaded == threaded)
6436 return 0;
6437
6438 if (threaded) {
6439 list_for_each_entry(napi, &dev->napi_list, dev_list) {
6440 if (!napi->thread) {
6441 err = napi_kthread_create(napi);
6442 if (err) {
6443 threaded = false;
6444 break;
6445 }
6446 }
6447 }
6448 }
6449
6450 dev->threaded = threaded;
6451
6452 /* Make sure kthread is created before THREADED bit
6453 * is set.
6454 */
6455 smp_mb__before_atomic();
6456
6457 /* Setting/unsetting threaded mode on a napi might not immediately
6458 * take effect, if the current napi instance is actively being
6459 * polled. In this case, the switch between threaded mode and
6460 * softirq mode will happen in the next round of napi_schedule().
6461	 * This should not cause hiccups or stalls in live traffic.
6462 */
6463 list_for_each_entry(napi, &dev->napi_list, dev_list)
6464 assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
6465
6466 return err;
6467}
6468EXPORT_SYMBOL(dev_set_threaded);
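
/*
 * Illustrative sketch, not part of the original file: switching a device to
 * threaded NAPI from kernel code, mirroring what a write of "1" to
 * /sys/class/net/<iface>/threaded does. Taking rtnl_lock() here matches the
 * sysfs path; mydrv_enable_threaded_napi() is a hypothetical helper.
 */
#if 0
static int mydrv_enable_threaded_napi(struct net_device *netdev)
{
	int err;

	rtnl_lock();
	err = dev_set_threaded(netdev, true);
	rtnl_unlock();

	return err;
}
#endif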
6469
6470/**
6471 * netif_queue_set_napi - Associate queue with the napi
6472 * @dev: device to which NAPI and queue belong
6473 * @queue_index: Index of queue
6474 * @type: queue type as RX or TX
6475 * @napi: NAPI context, pass NULL to clear previously set NAPI
6476 *
6477 * Associate the queue with its corresponding napi context. This should be
6478 * done after registering the NAPI handler for the queue vector and after the
6479 * queues have been mapped to the corresponding interrupt vector.
6480 */
6481void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
6482 enum netdev_queue_type type, struct napi_struct *napi)
6483{
6484 struct netdev_rx_queue *rxq;
6485 struct netdev_queue *txq;
6486
6487 if (WARN_ON_ONCE(napi && !napi->dev))
6488 return;
6489 if (dev->reg_state >= NETREG_REGISTERED)
6490 ASSERT_RTNL();
6491
6492 switch (type) {
6493 case NETDEV_QUEUE_TYPE_RX:
6494 rxq = __netif_get_rx_queue(dev, queue_index);
6495 rxq->napi = napi;
6496 return;
6497 case NETDEV_QUEUE_TYPE_TX:
6498 txq = netdev_get_tx_queue(dev, queue_index);
6499 txq->napi = napi;
6500 return;
6501 default:
6502 return;
6503 }
6504}
6505EXPORT_SYMBOL(netif_queue_set_napi);
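
/*
 * Illustrative sketch, not part of the original file: a driver associating
 * one combined RX/TX queue pair with the NAPI instance that services it.
 * The combined-queue layout and mydrv_set_queue_napi() are assumptions.
 */
#if 0
static void mydrv_set_queue_napi(struct net_device *netdev, unsigned int qid,
				 struct napi_struct *napi)
{
	netif_queue_set_napi(netdev, qid, NETDEV_QUEUE_TYPE_RX, napi);
	netif_queue_set_napi(netdev, qid, NETDEV_QUEUE_TYPE_TX, napi);
}
#endif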
6506
6507void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
6508 int (*poll)(struct napi_struct *, int), int weight)
6509{
6510 if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6511 return;
6512
6513 INIT_LIST_HEAD(&napi->poll_list);
6514 INIT_HLIST_NODE(&napi->napi_hash_node);
6515 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6516 napi->timer.function = napi_watchdog;
6517 init_gro_hash(napi);
6518 napi->skb = NULL;
6519 INIT_LIST_HEAD(&napi->rx_list);
6520 napi->rx_count = 0;
6521 napi->poll = poll;
6522 if (weight > NAPI_POLL_WEIGHT)
6523 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6524 weight);
6525 napi->weight = weight;
6526 napi->dev = dev;
6527#ifdef CONFIG_NETPOLL
6528 napi->poll_owner = -1;
6529#endif
6530 napi->list_owner = -1;
6531 set_bit(NAPI_STATE_SCHED, &napi->state);
6532 set_bit(NAPI_STATE_NPSVC, &napi->state);
6533 list_add_rcu(&napi->dev_list, &dev->napi_list);
6534 napi_hash_add(napi);
6535 napi_get_frags_check(napi);
6536 /* Create kthread for this napi if dev->threaded is set.
6537 * Clear dev->threaded if kthread creation failed so that
6538 * threaded mode will not be enabled in napi_enable().
6539 */
6540 if (dev->threaded && napi_kthread_create(napi))
6541 dev->threaded = 0;
6542 netif_napi_set_irq(napi, -1);
6543}
6544EXPORT_SYMBOL(netif_napi_add_weight);
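
/*
 * Illustrative sketch, not part of the original file: registering and
 * enabling a NAPI context at ndo_open time. netif_napi_add() is the usual
 * wrapper that ends up in netif_napi_add_weight() with the default weight;
 * mydrv_poll() is the hypothetical poll callback sketched earlier.
 */
#if 0
static int mydrv_open(struct net_device *netdev)
{
	struct mydrv_priv *priv = netdev_priv(netdev);

	/* The instance starts with SCHED/NPSVC set, i.e. effectively disabled. */
	netif_napi_add(netdev, &priv->napi, mydrv_poll);

	/* Clear SCHED/NPSVC so the instance can actually be scheduled. */
	napi_enable(&priv->napi);

	return 0;
}
#endif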
6545
6546void napi_disable(struct napi_struct *n)
6547{
6548 unsigned long val, new;
6549
6550 might_sleep();
6551 set_bit(NAPI_STATE_DISABLE, &n->state);
6552
6553 val = READ_ONCE(n->state);
6554 do {
6555 while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
6556 usleep_range(20, 200);
6557 val = READ_ONCE(n->state);
6558 }
6559
6560 new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
6561 new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
6562 } while (!try_cmpxchg(&n->state, &val, new));
6563
6564 hrtimer_cancel(&n->timer);
6565
6566 clear_bit(NAPI_STATE_DISABLE, &n->state);
6567}
6568EXPORT_SYMBOL(napi_disable);
6569
6570/**
6571 * napi_enable - enable NAPI scheduling
6572 * @n: NAPI context
6573 *
6574 * Resume scheduling of NAPI on this context.
6575 * Must be paired with napi_disable().
6576 */
6577void napi_enable(struct napi_struct *n)
6578{
6579 unsigned long new, val = READ_ONCE(n->state);
6580
6581 do {
6582 BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
6583
6584 new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
6585 if (n->dev->threaded && n->thread)
6586 new |= NAPIF_STATE_THREADED;
6587 } while (!try_cmpxchg(&n->state, &val, new));
6588}
6589EXPORT_SYMBOL(napi_enable);
6590
6591static void flush_gro_hash(struct napi_struct *napi)
6592{
6593 int i;
6594
6595 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6596 struct sk_buff *skb, *n;
6597
6598 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6599 kfree_skb(skb);
6600 napi->gro_hash[i].count = 0;
6601 }
6602}
6603
6604/* Must be called in process context */
6605void __netif_napi_del(struct napi_struct *napi)
6606{
6607 if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6608 return;
6609
6610 napi_hash_del(napi);
6611 list_del_rcu(&napi->dev_list);
6612 napi_free_frags(napi);
6613
6614 flush_gro_hash(napi);
6615 napi->gro_bitmask = 0;
6616
6617 if (napi->thread) {
6618 kthread_stop(napi->thread);
6619 napi->thread = NULL;
6620 }
6621}
6622EXPORT_SYMBOL(__netif_napi_del);
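
/*
 * Illustrative sketch, not part of the original file: the teardown matching
 * the open-time sketch above. napi_disable() waits for an in-progress poll
 * to finish; netif_napi_del() unhashes the instance and stops its kthread
 * if threaded mode was in use.
 */
#if 0
static int mydrv_stop(struct net_device *netdev)
{
	struct mydrv_priv *priv = netdev_priv(netdev);

	napi_disable(&priv->napi);
	netif_napi_del(&priv->napi);

	return 0;
}
#endif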
6623
6624static int __napi_poll(struct napi_struct *n, bool *repoll)
6625{
6626 int work, weight;
6627
6628 weight = n->weight;
6629
6630 /* This NAPI_STATE_SCHED test is for avoiding a race
6631 * with netpoll's poll_napi(). Only the entity which
6632 * obtains the lock and sees NAPI_STATE_SCHED set will
6633 * actually make the ->poll() call. Therefore we avoid
6634 * accidentally calling ->poll() when NAPI is not scheduled.
6635 */
6636 work = 0;
6637 if (napi_is_scheduled(n)) {
6638 work = n->poll(n, weight);
6639 trace_napi_poll(n, work, weight);
6640
6641 xdp_do_check_flushed(n);
6642 }
6643
6644 if (unlikely(work > weight))
6645 netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6646 n->poll, work, weight);
6647
6648 if (likely(work < weight))
6649 return work;
6650
6651 /* Drivers must not modify the NAPI state if they
6652 * consume the entire weight. In such cases this code
6653 * still "owns" the NAPI instance and therefore can
6654 * move the instance around on the list at-will.
6655	 * move the instance around on the list at will.
6656 if (unlikely(napi_disable_pending(n))) {
6657 napi_complete(n);
6658 return work;
6659 }
6660
6661 /* The NAPI context has more processing work, but busy-polling
6662 * is preferred. Exit early.
6663 */
6664 if (napi_prefer_busy_poll(n)) {
6665 if (napi_complete_done(n, work)) {
6666 /* If timeout is not set, we need to make sure
6667 * that the NAPI is re-scheduled.
6668 */
6669 napi_schedule(n);
6670 }
6671 return work;
6672 }
6673
6674 if (n->gro_bitmask) {
6675		/* flush packets that are too old.
6676 * If HZ < 1000, flush all packets.
6677 */
6678 napi_gro_flush(n, HZ >= 1000);
6679 }
6680
6681 gro_normal_list(n);
6682
6683 /* Some drivers may have called napi_schedule
6684 * prior to exhausting their budget.
6685 */
6686 if (unlikely(!list_empty(&n->poll_list))) {
6687 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6688 n->dev ? n->dev->name : "backlog");
6689 return work;
6690 }
6691
6692 *repoll = true;
6693
6694 return work;
6695}
6696
6697static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6698{
6699 bool do_repoll = false;
6700 void *have;
6701 int work;
6702
6703 list_del_init(&n->poll_list);
6704
6705 have = netpoll_poll_lock(n);
6706
6707 work = __napi_poll(n, &do_repoll);
6708
6709 if (do_repoll)
6710 list_add_tail(&n->poll_list, repoll);
6711
6712 netpoll_poll_unlock(have);
6713
6714 return work;
6715}
6716
6717static int napi_thread_wait(struct napi_struct *napi)
6718{
6719 bool woken = false;
6720
6721 set_current_state(TASK_INTERRUPTIBLE);
6722
6723 while (!kthread_should_stop()) {
6724 /* Testing SCHED_THREADED bit here to make sure the current
6725		 * kthread owns this napi and is allowed to poll it.
6726 * Testing SCHED bit is not enough because SCHED bit might be
6727 * set by some other busy poll thread or by napi_disable().
6728 */
6729 if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
6730 WARN_ON(!list_empty(&napi->poll_list));
6731 __set_current_state(TASK_RUNNING);
6732 return 0;
6733 }
6734
6735 schedule();
6736 /* woken being true indicates this thread owns this napi. */
6737 woken = true;
6738 set_current_state(TASK_INTERRUPTIBLE);
6739 }
6740 __set_current_state(TASK_RUNNING);
6741
6742 return -1;
6743}
6744
6745static int napi_threaded_poll(void *data)
6746{
6747 struct napi_struct *napi = data;
6748 struct softnet_data *sd;
6749 void *have;
6750
6751 while (!napi_thread_wait(napi)) {
6752 unsigned long last_qs = jiffies;
6753
6754 for (;;) {
6755 bool repoll = false;
6756
6757 local_bh_disable();
6758 sd = this_cpu_ptr(&softnet_data);
6759 sd->in_napi_threaded_poll = true;
6760
6761 have = netpoll_poll_lock(napi);
6762 __napi_poll(napi, &repoll);
6763 netpoll_poll_unlock(have);
6764
6765 sd->in_napi_threaded_poll = false;
6766 barrier();
6767
6768 if (sd_has_rps_ipi_waiting(sd)) {
6769 local_irq_disable();
6770 net_rps_action_and_irq_enable(sd);
6771 }
6772 skb_defer_free_flush(sd);
6773 local_bh_enable();
6774
6775 if (!repoll)
6776 break;
6777
6778 rcu_softirq_qs_periodic(last_qs);
6779 cond_resched();
6780 }
6781 }
6782 return 0;
6783}
6784
6785static __latent_entropy void net_rx_action(struct softirq_action *h)
6786{
6787 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6788 unsigned long time_limit = jiffies +
6789 usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
6790 int budget = READ_ONCE(net_hotdata.netdev_budget);
6791 LIST_HEAD(list);
6792 LIST_HEAD(repoll);
6793
6794start:
6795 sd->in_net_rx_action = true;
6796 local_irq_disable();
6797 list_splice_init(&sd->poll_list, &list);
6798 local_irq_enable();
6799
6800 for (;;) {
6801 struct napi_struct *n;
6802
6803 skb_defer_free_flush(sd);
6804
6805 if (list_empty(&list)) {
6806 if (list_empty(&repoll)) {
6807 sd->in_net_rx_action = false;
6808 barrier();
6809 /* We need to check if ____napi_schedule()
6810 * had refilled poll_list while
6811 * sd->in_net_rx_action was true.
6812 */
6813 if (!list_empty(&sd->poll_list))
6814 goto start;
6815 if (!sd_has_rps_ipi_waiting(sd))
6816 goto end;
6817 }
6818 break;
6819 }
6820
6821 n = list_first_entry(&list, struct napi_struct, poll_list);
6822 budget -= napi_poll(n, &repoll);
6823
6824 /* If softirq window is exhausted then punt.
6825		 * Allow this to run for 2 jiffies, which will allow
6826 * an average latency of 1.5/HZ.
6827 */
6828 if (unlikely(budget <= 0 ||
6829 time_after_eq(jiffies, time_limit))) {
6830 sd->time_squeeze++;
6831 break;
6832 }
6833 }
6834
6835 local_irq_disable();
6836
6837 list_splice_tail_init(&sd->poll_list, &list);
6838 list_splice_tail(&repoll, &list);
6839 list_splice(&list, &sd->poll_list);
6840 if (!list_empty(&sd->poll_list))
6841 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6842 else
6843 sd->in_net_rx_action = false;
6844
6845 net_rps_action_and_irq_enable(sd);
6846end:;
6847}
6848
6849struct netdev_adjacent {
6850 struct net_device *dev;
6851 netdevice_tracker dev_tracker;
6852
6853	/* upper master flag; there can only be one master device per list */
6854 bool master;
6855
6856 /* lookup ignore flag */
6857 bool ignore;
6858
6859 /* counter for the number of times this device was added to us */
6860 u16 ref_nr;
6861
6862 /* private field for the users */
6863 void *private;
6864
6865 struct list_head list;
6866 struct rcu_head rcu;
6867};
6868
6869static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6870 struct list_head *adj_list)
6871{
6872 struct netdev_adjacent *adj;
6873
6874 list_for_each_entry(adj, adj_list, list) {
6875 if (adj->dev == adj_dev)
6876 return adj;
6877 }
6878 return NULL;
6879}
6880
6881static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6882 struct netdev_nested_priv *priv)
6883{
6884 struct net_device *dev = (struct net_device *)priv->data;
6885
6886 return upper_dev == dev;
6887}
6888
6889/**
6890 * netdev_has_upper_dev - Check if device is linked to an upper device
6891 * @dev: device
6892 * @upper_dev: upper device to check
6893 *
6894 * Find out if a device is linked to the specified upper device and return
6895 * true in case it is. Note that this checks only the immediate upper device,
6896 * not the complete stack of devices. The caller must hold the RTNL lock.
6897 */
6898bool netdev_has_upper_dev(struct net_device *dev,
6899 struct net_device *upper_dev)
6900{
6901 struct netdev_nested_priv priv = {
6902 .data = (void *)upper_dev,
6903 };
6904
6905 ASSERT_RTNL();
6906
6907 return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6908 &priv);
6909}
6910EXPORT_SYMBOL(netdev_has_upper_dev);
6911
6912/**
6913 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6914 * @dev: device
6915 * @upper_dev: upper device to check
6916 *
6917 * Find out if a device is linked to the specified upper device and return
6918 * true in case it is. Note that this checks the entire upper device chain.
6919 * The caller must hold the RCU read lock.
6920 */
6921
6922bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6923 struct net_device *upper_dev)
6924{
6925 struct netdev_nested_priv priv = {
6926 .data = (void *)upper_dev,
6927 };
6928
6929 return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6930 &priv);
6931}
6932EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6933
6934/**
6935 * netdev_has_any_upper_dev - Check if device is linked to some device
6936 * @dev: device
6937 *
6938 * Find out if a device is linked to an upper device and return true in case
6939 * it is. The caller must hold the RTNL lock.
6940 */
6941bool netdev_has_any_upper_dev(struct net_device *dev)
6942{
6943 ASSERT_RTNL();
6944
6945 return !list_empty(&dev->adj_list.upper);
6946}
6947EXPORT_SYMBOL(netdev_has_any_upper_dev);
6948
6949/**
6950 * netdev_master_upper_dev_get - Get master upper device
6951 * @dev: device
6952 *
6953 * Find a master upper device and return a pointer to it or NULL in case
6954 * it's not there. The caller must hold the RTNL lock.
6955 */
6956struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6957{
6958 struct netdev_adjacent *upper;
6959
6960 ASSERT_RTNL();
6961
6962 if (list_empty(&dev->adj_list.upper))
6963 return NULL;
6964
6965 upper = list_first_entry(&dev->adj_list.upper,
6966 struct netdev_adjacent, list);
6967 if (likely(upper->master))
6968 return upper->dev;
6969 return NULL;
6970}
6971EXPORT_SYMBOL(netdev_master_upper_dev_get);
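
/*
 * Illustrative sketch, not part of the original file: using
 * netdev_master_upper_dev_get() under RTNL to find out whether a device is
 * currently enslaved to a master (bond, team, bridge, ...).
 */
#if 0
static bool my_dev_is_enslaved(struct net_device *dev)
{
	struct net_device *master;

	ASSERT_RTNL();

	master = netdev_master_upper_dev_get(dev);
	if (master)
		netdev_dbg(dev, "enslaved to %s\n", master->name);

	return master != NULL;
}
#endif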
6972
6973static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6974{
6975 struct netdev_adjacent *upper;
6976
6977 ASSERT_RTNL();
6978
6979 if (list_empty(&dev->adj_list.upper))
6980 return NULL;
6981
6982 upper = list_first_entry(&dev->adj_list.upper,
6983 struct netdev_adjacent, list);
6984 if (likely(upper->master) && !upper->ignore)
6985 return upper->dev;
6986 return NULL;
6987}
6988
6989/**
6990 * netdev_has_any_lower_dev - Check if device is linked to some device
6991 * @dev: device
6992 *
6993 * Find out if a device is linked to a lower device and return true in case
6994 * it is. The caller must hold the RTNL lock.
6995 */
6996static bool netdev_has_any_lower_dev(struct net_device *dev)
6997{
6998 ASSERT_RTNL();
6999
7000 return !list_empty(&dev->adj_list.lower);
7001}
7002
7003void *netdev_adjacent_get_private(struct list_head *adj_list)
7004{
7005 struct netdev_adjacent *adj;
7006
7007 adj = list_entry(adj_list, struct netdev_adjacent, list);
7008
7009 return adj->private;
7010}
7011EXPORT_SYMBOL(netdev_adjacent_get_private);
7012
7013/**
7014 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
7015 * @dev: device
7016 * @iter: list_head ** of the current position
7017 *
7018 * Gets the next device from the dev's upper list, starting from iter
7019 * position. The caller must hold RCU read lock.
7020 */
7021struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7022 struct list_head **iter)
7023{
7024 struct netdev_adjacent *upper;
7025
7026 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7027
7028 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7029
7030 if (&upper->list == &dev->adj_list.upper)
7031 return NULL;
7032
7033 *iter = &upper->list;
7034
7035 return upper->dev;
7036}
7037EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
7038
7039static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7040 struct list_head **iter,
7041 bool *ignore)
7042{
7043 struct netdev_adjacent *upper;
7044
7045 upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7046
7047 if (&upper->list == &dev->adj_list.upper)
7048 return NULL;
7049
7050 *iter = &upper->list;
7051 *ignore = upper->ignore;
7052
7053 return upper->dev;
7054}
7055
7056static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7057 struct list_head **iter)
7058{
7059 struct netdev_adjacent *upper;
7060
7061 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7062
7063 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7064
7065 if (&upper->list == &dev->adj_list.upper)
7066 return NULL;
7067
7068 *iter = &upper->list;
7069
7070 return upper->dev;
7071}
7072
7073static int __netdev_walk_all_upper_dev(struct net_device *dev,
7074 int (*fn)(struct net_device *dev,
7075 struct netdev_nested_priv *priv),
7076 struct netdev_nested_priv *priv)
7077{
7078 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7079 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7080 int ret, cur = 0;
7081 bool ignore;
7082
7083 now = dev;
7084 iter = &dev->adj_list.upper;
7085
7086 while (1) {
7087 if (now != dev) {
7088 ret = fn(now, priv);
7089 if (ret)
7090 return ret;
7091 }
7092
7093 next = NULL;
7094 while (1) {
7095 udev = __netdev_next_upper_dev(now, &iter, &ignore);
7096 if (!udev)
7097 break;
7098 if (ignore)
7099 continue;
7100
7101 next = udev;
7102 niter = &udev->adj_list.upper;
7103 dev_stack[cur] = now;
7104 iter_stack[cur++] = iter;
7105 break;
7106 }
7107
7108 if (!next) {
7109 if (!cur)
7110 return 0;
7111 next = dev_stack[--cur];
7112 niter = iter_stack[cur];
7113 }
7114
7115 now = next;
7116 iter = niter;
7117 }
7118
7119 return 0;
7120}
7121
7122int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7123 int (*fn)(struct net_device *dev,
7124 struct netdev_nested_priv *priv),
7125 struct netdev_nested_priv *priv)
7126{
7127 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7128 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7129 int ret, cur = 0;
7130
7131 now = dev;
7132 iter = &dev->adj_list.upper;
7133
7134 while (1) {
7135 if (now != dev) {
7136 ret = fn(now, priv);
7137 if (ret)
7138 return ret;
7139 }
7140
7141 next = NULL;
7142 while (1) {
7143 udev = netdev_next_upper_dev_rcu(now, &iter);
7144 if (!udev)
7145 break;
7146
7147 next = udev;
7148 niter = &udev->adj_list.upper;
7149 dev_stack[cur] = now;
7150 iter_stack[cur++] = iter;
7151 break;
7152 }
7153
7154 if (!next) {
7155 if (!cur)
7156 return 0;
7157 next = dev_stack[--cur];
7158 niter = iter_stack[cur];
7159 }
7160
7161 now = next;
7162 iter = niter;
7163 }
7164
7165 return 0;
7166}
7167EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
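
/*
 * Illustrative sketch, not part of the original file: counting every upper
 * device reachable from @dev with netdev_walk_all_upper_dev_rcu(). A zero
 * return from the callback keeps walking; a non-zero return stops the walk
 * and is propagated to the caller. The my_* names are hypothetical.
 */
#if 0
static int my_count_upper(struct net_device *upper,
			  struct netdev_nested_priv *priv)
{
	unsigned int *count = (unsigned int *)priv->data;

	(*count)++;
	return 0;
}

static unsigned int my_upper_dev_count(struct net_device *dev)
{
	unsigned int count = 0;
	struct netdev_nested_priv priv = {
		.data = (void *)&count,
	};

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, my_count_upper, &priv);
	rcu_read_unlock();

	return count;
}
#endif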
7168
7169static bool __netdev_has_upper_dev(struct net_device *dev,
7170 struct net_device *upper_dev)
7171{
7172 struct netdev_nested_priv priv = {
7173 .flags = 0,
7174 .data = (void *)upper_dev,
7175 };
7176
7177 ASSERT_RTNL();
7178
7179 return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7180 &priv);
7181}
7182
7183/**
7184 * netdev_lower_get_next_private - Get the next ->private from the
7185 * lower neighbour list
7186 * @dev: device
7187 * @iter: list_head ** of the current position
7188 *
7189 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7190 * list, starting from iter position. The caller must either hold the
7191 * RTNL lock or use its own locking that guarantees that the neighbour lower
7192 * list will remain unchanged.
7193 */
7194void *netdev_lower_get_next_private(struct net_device *dev,
7195 struct list_head **iter)
7196{
7197 struct netdev_adjacent *lower;
7198
7199 lower = list_entry(*iter, struct netdev_adjacent, list);
7200
7201 if (&lower->list == &dev->adj_list.lower)
7202 return NULL;
7203
7204 *iter = lower->list.next;
7205
7206 return lower->private;
7207}
7208EXPORT_SYMBOL(netdev_lower_get_next_private);
7209
7210/**
7211 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7212 * lower neighbour list, RCU
7213 * variant
7214 * @dev: device
7215 * @iter: list_head ** of the current position
7216 *
7217 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7218 * list, starting from iter position. The caller must hold RCU read lock.
7219 */
7220void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7221 struct list_head **iter)
7222{
7223 struct netdev_adjacent *lower;
7224
7225 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
7226
7227 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7228
7229 if (&lower->list == &dev->adj_list.lower)
7230 return NULL;
7231
7232 *iter = &lower->list;
7233
7234 return lower->private;
7235}
7236EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7237
7238/**
7239 * netdev_lower_get_next - Get the next device from the lower neighbour
7240 * list
7241 * @dev: device
7242 * @iter: list_head ** of the current position
7243 *
7244 * Gets the next device from the dev's lower neighbour
7245 * list, starting from iter position. The caller must hold RTNL lock or
7246 * its own locking that guarantees that the neighbour lower
7247 * list will remain unchanged.
7248 */
7249void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7250{
7251 struct netdev_adjacent *lower;
7252
7253 lower = list_entry(*iter, struct netdev_adjacent, list);
7254
7255 if (&lower->list == &dev->adj_list.lower)
7256 return NULL;
7257
7258 *iter = lower->list.next;
7259
7260 return lower->dev;
7261}
7262EXPORT_SYMBOL(netdev_lower_get_next);
7263
7264static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7265 struct list_head **iter)
7266{
7267 struct netdev_adjacent *lower;
7268
7269 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7270
7271 if (&lower->list == &dev->adj_list.lower)
7272 return NULL;
7273
7274 *iter = &lower->list;
7275
7276 return lower->dev;
7277}
7278
7279static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7280 struct list_head **iter,
7281 bool *ignore)
7282{
7283 struct netdev_adjacent *lower;
7284
7285 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7286
7287 if (&lower->list == &dev->adj_list.lower)
7288 return NULL;
7289
7290 *iter = &lower->list;
7291 *ignore = lower->ignore;
7292
7293 return lower->dev;
7294}
7295
7296int netdev_walk_all_lower_dev(struct net_device *dev,
7297 int (*fn)(struct net_device *dev,
7298 struct netdev_nested_priv *priv),
7299 struct netdev_nested_priv *priv)
7300{
7301 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7302 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7303 int ret, cur = 0;
7304
7305 now = dev;
7306 iter = &dev->adj_list.lower;
7307
7308 while (1) {
7309 if (now != dev) {
7310 ret = fn(now, priv);
7311 if (ret)
7312 return ret;
7313 }
7314
7315 next = NULL;
7316 while (1) {
7317 ldev = netdev_next_lower_dev(now, &iter);
7318 if (!ldev)
7319 break;
7320
7321 next = ldev;
7322 niter = &ldev->adj_list.lower;
7323 dev_stack[cur] = now;
7324 iter_stack[cur++] = iter;
7325 break;
7326 }
7327
7328 if (!next) {
7329 if (!cur)
7330 return 0;
7331 next = dev_stack[--cur];
7332 niter = iter_stack[cur];
7333 }
7334
7335 now = next;
7336 iter = niter;
7337 }
7338
7339 return 0;
7340}
7341EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7342
7343static int __netdev_walk_all_lower_dev(struct net_device *dev,
7344 int (*fn)(struct net_device *dev,
7345 struct netdev_nested_priv *priv),
7346 struct netdev_nested_priv *priv)
7347{
7348 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7349 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7350 int ret, cur = 0;
7351 bool ignore;
7352
7353 now = dev;
7354 iter = &dev->adj_list.lower;
7355
7356 while (1) {
7357 if (now != dev) {
7358 ret = fn(now, priv);
7359 if (ret)
7360 return ret;
7361 }
7362
7363 next = NULL;
7364 while (1) {
7365 ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7366 if (!ldev)
7367 break;
7368 if (ignore)
7369 continue;
7370
7371 next = ldev;
7372 niter = &ldev->adj_list.lower;
7373 dev_stack[cur] = now;
7374 iter_stack[cur++] = iter;
7375 break;
7376 }
7377
7378 if (!next) {
7379 if (!cur)
7380 return 0;
7381 next = dev_stack[--cur];
7382 niter = iter_stack[cur];
7383 }
7384
7385 now = next;
7386 iter = niter;
7387 }
7388
7389 return 0;
7390}
7391
7392struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7393 struct list_head **iter)
7394{
7395 struct netdev_adjacent *lower;
7396
7397 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7398 if (&lower->list == &dev->adj_list.lower)
7399 return NULL;
7400
7401 *iter = &lower->list;
7402
7403 return lower->dev;
7404}
7405EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7406
7407static u8 __netdev_upper_depth(struct net_device *dev)
7408{
7409 struct net_device *udev;
7410 struct list_head *iter;
7411 u8 max_depth = 0;
7412 bool ignore;
7413
7414 for (iter = &dev->adj_list.upper,
7415 udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7416 udev;
7417 udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7418 if (ignore)
7419 continue;
7420 if (max_depth < udev->upper_level)
7421 max_depth = udev->upper_level;
7422 }
7423
7424 return max_depth;
7425}
7426
7427static u8 __netdev_lower_depth(struct net_device *dev)
7428{
7429 struct net_device *ldev;
7430 struct list_head *iter;
7431 u8 max_depth = 0;
7432 bool ignore;
7433
7434 for (iter = &dev->adj_list.lower,
7435 ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7436 ldev;
7437 ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7438 if (ignore)
7439 continue;
7440 if (max_depth < ldev->lower_level)
7441 max_depth = ldev->lower_level;
7442 }
7443
7444 return max_depth;
7445}
7446
7447static int __netdev_update_upper_level(struct net_device *dev,
7448 struct netdev_nested_priv *__unused)
7449{
7450 dev->upper_level = __netdev_upper_depth(dev) + 1;
7451 return 0;
7452}
7453
7454#ifdef CONFIG_LOCKDEP
7455static LIST_HEAD(net_unlink_list);
7456
7457static void net_unlink_todo(struct net_device *dev)
7458{
7459 if (list_empty(&dev->unlink_list))
7460 list_add_tail(&dev->unlink_list, &net_unlink_list);
7461}
7462#endif
7463
7464static int __netdev_update_lower_level(struct net_device *dev,
7465 struct netdev_nested_priv *priv)
7466{
7467 dev->lower_level = __netdev_lower_depth(dev) + 1;
7468
7469#ifdef CONFIG_LOCKDEP
7470 if (!priv)
7471 return 0;
7472
7473 if (priv->flags & NESTED_SYNC_IMM)
7474 dev->nested_level = dev->lower_level - 1;
7475 if (priv->flags & NESTED_SYNC_TODO)
7476 net_unlink_todo(dev);
7477#endif
7478 return 0;
7479}
7480
7481int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7482 int (*fn)(struct net_device *dev,
7483 struct netdev_nested_priv *priv),
7484 struct netdev_nested_priv *priv)
7485{
7486 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7487 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7488 int ret, cur = 0;
7489
7490 now = dev;
7491 iter = &dev->adj_list.lower;
7492
7493 while (1) {
7494 if (now != dev) {
7495 ret = fn(now, priv);
7496 if (ret)
7497 return ret;
7498 }
7499
7500 next = NULL;
7501 while (1) {
7502 ldev = netdev_next_lower_dev_rcu(now, &iter);
7503 if (!ldev)
7504 break;
7505
7506 next = ldev;
7507 niter = &ldev->adj_list.lower;
7508 dev_stack[cur] = now;
7509 iter_stack[cur++] = iter;
7510 break;
7511 }
7512
7513 if (!next) {
7514 if (!cur)
7515 return 0;
7516 next = dev_stack[--cur];
7517 niter = iter_stack[cur];
7518 }
7519
7520 now = next;
7521 iter = niter;
7522 }
7523
7524 return 0;
7525}
7526EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7527
7528/**
7529 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7530 * lower neighbour list, RCU
7531 * variant
7532 * @dev: device
7533 *
7534 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7535 * list. The caller must hold RCU read lock.
7536 */
7537void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7538{
7539 struct netdev_adjacent *lower;
7540
7541 lower = list_first_or_null_rcu(&dev->adj_list.lower,
7542 struct netdev_adjacent, list);
7543 if (lower)
7544 return lower->private;
7545 return NULL;
7546}
7547EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7548
7549/**
7550 * netdev_master_upper_dev_get_rcu - Get master upper device
7551 * @dev: device
7552 *
7553 * Find a master upper device and return a pointer to it or NULL in case
7554 * it's not there. The caller must hold the RCU read lock.
7555 */
7556struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7557{
7558 struct netdev_adjacent *upper;
7559
7560 upper = list_first_or_null_rcu(&dev->adj_list.upper,
7561 struct netdev_adjacent, list);
7562 if (upper && likely(upper->master))
7563 return upper->dev;
7564 return NULL;
7565}
7566EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7567
7568static int netdev_adjacent_sysfs_add(struct net_device *dev,
7569 struct net_device *adj_dev,
7570 struct list_head *dev_list)
7571{
7572 char linkname[IFNAMSIZ+7];
7573
7574 sprintf(linkname, dev_list == &dev->adj_list.upper ?
7575 "upper_%s" : "lower_%s", adj_dev->name);
7576 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7577 linkname);
7578}
7579static void netdev_adjacent_sysfs_del(struct net_device *dev,
7580 char *name,
7581 struct list_head *dev_list)
7582{
7583 char linkname[IFNAMSIZ+7];
7584
7585 sprintf(linkname, dev_list == &dev->adj_list.upper ?
7586 "upper_%s" : "lower_%s", name);
7587 sysfs_remove_link(&(dev->dev.kobj), linkname);
7588}
7589
7590static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7591 struct net_device *adj_dev,
7592 struct list_head *dev_list)
7593{
7594 return (dev_list == &dev->adj_list.upper ||
7595 dev_list == &dev->adj_list.lower) &&
7596 net_eq(dev_net(dev), dev_net(adj_dev));
7597}
7598
7599static int __netdev_adjacent_dev_insert(struct net_device *dev,
7600 struct net_device *adj_dev,
7601 struct list_head *dev_list,
7602 void *private, bool master)
7603{
7604 struct netdev_adjacent *adj;
7605 int ret;
7606
7607 adj = __netdev_find_adj(adj_dev, dev_list);
7608
7609 if (adj) {
7610 adj->ref_nr += 1;
7611 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7612 dev->name, adj_dev->name, adj->ref_nr);
7613
7614 return 0;
7615 }
7616
7617 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7618 if (!adj)
7619 return -ENOMEM;
7620
7621 adj->dev = adj_dev;
7622 adj->master = master;
7623 adj->ref_nr = 1;
7624 adj->private = private;
7625 adj->ignore = false;
7626 netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
7627
7628 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7629 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7630
7631 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7632 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7633 if (ret)
7634 goto free_adj;
7635 }
7636
7637	/* Ensure that the master link is always the first item in the list. */
7638 if (master) {
7639 ret = sysfs_create_link(&(dev->dev.kobj),
7640 &(adj_dev->dev.kobj), "master");
7641 if (ret)
7642 goto remove_symlinks;
7643
7644 list_add_rcu(&adj->list, dev_list);
7645 } else {
7646 list_add_tail_rcu(&adj->list, dev_list);
7647 }
7648
7649 return 0;
7650
7651remove_symlinks:
7652 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7653 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7654free_adj:
7655 netdev_put(adj_dev, &adj->dev_tracker);
7656 kfree(adj);
7657
7658 return ret;
7659}
7660
7661static void __netdev_adjacent_dev_remove(struct net_device *dev,
7662 struct net_device *adj_dev,
7663 u16 ref_nr,
7664 struct list_head *dev_list)
7665{
7666 struct netdev_adjacent *adj;
7667
7668 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7669 dev->name, adj_dev->name, ref_nr);
7670
7671 adj = __netdev_find_adj(adj_dev, dev_list);
7672
7673 if (!adj) {
7674 pr_err("Adjacency does not exist for device %s from %s\n",
7675 dev->name, adj_dev->name);
7676 WARN_ON(1);
7677 return;
7678 }
7679
7680 if (adj->ref_nr > ref_nr) {
7681 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7682 dev->name, adj_dev->name, ref_nr,
7683 adj->ref_nr - ref_nr);
7684 adj->ref_nr -= ref_nr;
7685 return;
7686 }
7687
7688 if (adj->master)
7689 sysfs_remove_link(&(dev->dev.kobj), "master");
7690
7691 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7692 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7693
7694 list_del_rcu(&adj->list);
7695 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7696 adj_dev->name, dev->name, adj_dev->name);
7697 netdev_put(adj_dev, &adj->dev_tracker);
7698 kfree_rcu(adj, rcu);
7699}
7700
7701static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7702 struct net_device *upper_dev,
7703 struct list_head *up_list,
7704 struct list_head *down_list,
7705 void *private, bool master)
7706{
7707 int ret;
7708
7709 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7710 private, master);
7711 if (ret)
7712 return ret;
7713
7714 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7715 private, false);
7716 if (ret) {
7717 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7718 return ret;
7719 }
7720
7721 return 0;
7722}
7723
7724static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7725 struct net_device *upper_dev,
7726 u16 ref_nr,
7727 struct list_head *up_list,
7728 struct list_head *down_list)
7729{
7730 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7731 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7732}
7733
7734static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7735 struct net_device *upper_dev,
7736 void *private, bool master)
7737{
7738 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7739 &dev->adj_list.upper,
7740 &upper_dev->adj_list.lower,
7741 private, master);
7742}
7743
7744static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7745 struct net_device *upper_dev)
7746{
7747 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7748 &dev->adj_list.upper,
7749 &upper_dev->adj_list.lower);
7750}
7751
7752static int __netdev_upper_dev_link(struct net_device *dev,
7753 struct net_device *upper_dev, bool master,
7754 void *upper_priv, void *upper_info,
7755 struct netdev_nested_priv *priv,
7756 struct netlink_ext_ack *extack)
7757{
7758 struct netdev_notifier_changeupper_info changeupper_info = {
7759 .info = {
7760 .dev = dev,
7761 .extack = extack,
7762 },
7763 .upper_dev = upper_dev,
7764 .master = master,
7765 .linking = true,
7766 .upper_info = upper_info,
7767 };
7768 struct net_device *master_dev;
7769 int ret = 0;
7770
7771 ASSERT_RTNL();
7772
7773 if (dev == upper_dev)
7774 return -EBUSY;
7775
7776	/* To prevent loops, check that dev is not an upper device of upper_dev. */
7777 if (__netdev_has_upper_dev(upper_dev, dev))
7778 return -EBUSY;
7779
7780 if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7781 return -EMLINK;
7782
7783 if (!master) {
7784 if (__netdev_has_upper_dev(dev, upper_dev))
7785 return -EEXIST;
7786 } else {
7787 master_dev = __netdev_master_upper_dev_get(dev);
7788 if (master_dev)
7789 return master_dev == upper_dev ? -EEXIST : -EBUSY;
7790 }
7791
7792 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7793 &changeupper_info.info);
7794 ret = notifier_to_errno(ret);
7795 if (ret)
7796 return ret;
7797
7798 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7799 master);
7800 if (ret)
7801 return ret;
7802
7803 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7804 &changeupper_info.info);
7805 ret = notifier_to_errno(ret);
7806 if (ret)
7807 goto rollback;
7808
7809 __netdev_update_upper_level(dev, NULL);
7810 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7811
7812 __netdev_update_lower_level(upper_dev, priv);
7813 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7814 priv);
7815
7816 return 0;
7817
7818rollback:
7819 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7820
7821 return ret;
7822}
7823
7824/**
7825 * netdev_upper_dev_link - Add a link to the upper device
7826 * @dev: device
7827 * @upper_dev: new upper device
7828 * @extack: netlink extended ack
7829 *
7830 * Adds a link to device which is upper to this one. The caller must hold
7831 * Adds a link to a device which is upper to this one. The caller must hold
7832 * the RTNL lock. On failure a negative errno code is returned.
7833 * returns zero.
7834 */
7835int netdev_upper_dev_link(struct net_device *dev,
7836 struct net_device *upper_dev,
7837 struct netlink_ext_ack *extack)
7838{
7839 struct netdev_nested_priv priv = {
7840 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7841 .data = NULL,
7842 };
7843
7844 return __netdev_upper_dev_link(dev, upper_dev, false,
7845 NULL, NULL, &priv, extack);
7846}
7847EXPORT_SYMBOL(netdev_upper_dev_link);
7848
7849/**
7850 * netdev_master_upper_dev_link - Add a master link to the upper device
7851 * @dev: device
7852 * @upper_dev: new upper device
7853 * @upper_priv: upper device private
7854 * @upper_info: upper info to be passed down via notifier
7855 * @extack: netlink extended ack
7856 *
7857 * Adds a link to device which is upper to this one. In this case, only
7858 * Adds a link to a device which is upper to this one. In this case, only
7859 * might be linked as well. The caller must hold the RTNL lock.
7860 * On a failure a negative errno code is returned. On success the reference
7861 * On failure a negative errno code is returned. On success the reference
7862 */
7863int netdev_master_upper_dev_link(struct net_device *dev,
7864 struct net_device *upper_dev,
7865 void *upper_priv, void *upper_info,
7866 struct netlink_ext_ack *extack)
7867{
7868 struct netdev_nested_priv priv = {
7869 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7870 .data = NULL,
7871 };
7872
7873 return __netdev_upper_dev_link(dev, upper_dev, true,
7874 upper_priv, upper_info, &priv, extack);
7875}
7876EXPORT_SYMBOL(netdev_master_upper_dev_link);
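
/*
 * Illustrative sketch, not part of the original file: how a bonding-like
 * master might link a new slave under itself. Real masters pass their own
 * upper_priv/upper_info and do much more (rx_handler registration, MTU and
 * address handling); my_master_enslave() is a hypothetical helper.
 */
#if 0
static int my_master_enslave(struct net_device *master_dev,
			     struct net_device *slave_dev,
			     struct netlink_ext_ack *extack)
{
	int err;

	ASSERT_RTNL();

	err = netdev_master_upper_dev_link(slave_dev, master_dev,
					   NULL, NULL, extack);
	if (err)
		return err;

	/* On later failure, undo with netdev_upper_dev_unlink(slave_dev, master_dev). */
	return 0;
}
#endif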
7877
7878static void __netdev_upper_dev_unlink(struct net_device *dev,
7879 struct net_device *upper_dev,
7880 struct netdev_nested_priv *priv)
7881{
7882 struct netdev_notifier_changeupper_info changeupper_info = {
7883 .info = {
7884 .dev = dev,
7885 },
7886 .upper_dev = upper_dev,
7887 .linking = false,
7888 };
7889
7890 ASSERT_RTNL();
7891
7892 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7893
7894 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7895 &changeupper_info.info);
7896
7897 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7898
7899 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7900 &changeupper_info.info);
7901
7902 __netdev_update_upper_level(dev, NULL);
7903 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7904
7905 __netdev_update_lower_level(upper_dev, priv);
7906 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7907 priv);
7908}
7909
7910/**
7911 * netdev_upper_dev_unlink - Removes a link to upper device
7912 * @dev: device
7913 * @upper_dev: upper device to unlink
7914 *
7915 * Removes a link to a device which is upper to this one. The caller must hold
7916 * the RTNL lock.
7917 */
7918void netdev_upper_dev_unlink(struct net_device *dev,
7919 struct net_device *upper_dev)
7920{
7921 struct netdev_nested_priv priv = {
7922 .flags = NESTED_SYNC_TODO,
7923 .data = NULL,
7924 };
7925
7926 __netdev_upper_dev_unlink(dev, upper_dev, &priv);
7927}
7928EXPORT_SYMBOL(netdev_upper_dev_unlink);
7929
7930static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7931 struct net_device *lower_dev,
7932 bool val)
7933{
7934 struct netdev_adjacent *adj;
7935
7936 adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7937 if (adj)
7938 adj->ignore = val;
7939
7940 adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7941 if (adj)
7942 adj->ignore = val;
7943}
7944
7945static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7946 struct net_device *lower_dev)
7947{
7948 __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7949}
7950
7951static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7952 struct net_device *lower_dev)
7953{
7954 __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7955}
7956
7957int netdev_adjacent_change_prepare(struct net_device *old_dev,
7958 struct net_device *new_dev,
7959 struct net_device *dev,
7960 struct netlink_ext_ack *extack)
7961{
7962 struct netdev_nested_priv priv = {
7963 .flags = 0,
7964 .data = NULL,
7965 };
7966 int err;
7967
7968 if (!new_dev)
7969 return 0;
7970
7971 if (old_dev && new_dev != old_dev)
7972 netdev_adjacent_dev_disable(dev, old_dev);
7973 err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
7974 extack);
7975 if (err) {
7976 if (old_dev && new_dev != old_dev)
7977 netdev_adjacent_dev_enable(dev, old_dev);
7978 return err;
7979 }
7980
7981 return 0;
7982}
7983EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7984
7985void netdev_adjacent_change_commit(struct net_device *old_dev,
7986 struct net_device *new_dev,
7987 struct net_device *dev)
7988{
7989 struct netdev_nested_priv priv = {
7990 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7991 .data = NULL,
7992 };
7993
7994 if (!new_dev || !old_dev)
7995 return;
7996
7997 if (new_dev == old_dev)
7998 return;
7999
8000 netdev_adjacent_dev_enable(dev, old_dev);
8001 __netdev_upper_dev_unlink(old_dev, dev, &priv);
8002}
8003EXPORT_SYMBOL(netdev_adjacent_change_commit);
8004
8005void netdev_adjacent_change_abort(struct net_device *old_dev,
8006 struct net_device *new_dev,
8007 struct net_device *dev)
8008{
8009 struct netdev_nested_priv priv = {
8010 .flags = 0,
8011 .data = NULL,
8012 };
8013
8014 if (!new_dev)
8015 return;
8016
8017 if (old_dev && new_dev != old_dev)
8018 netdev_adjacent_dev_enable(dev, old_dev);
8019
8020 __netdev_upper_dev_unlink(new_dev, dev, &priv);
8021}
8022EXPORT_SYMBOL(netdev_adjacent_change_abort);
8023
8024/**
8025 * netdev_bonding_info_change - Dispatch event about slave change
8026 * @dev: device
8027 * @bonding_info: info to dispatch
8028 *
8029 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8030 * The caller must hold the RTNL lock.
8031 */
8032void netdev_bonding_info_change(struct net_device *dev,
8033 struct netdev_bonding_info *bonding_info)
8034{
8035 struct netdev_notifier_bonding_info info = {
8036 .info.dev = dev,
8037 };
8038
8039 memcpy(&info.bonding_info, bonding_info,
8040 sizeof(struct netdev_bonding_info));
8041 call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8042 &info.info);
8043}
8044EXPORT_SYMBOL(netdev_bonding_info_change);
8045
8046static int netdev_offload_xstats_enable_l3(struct net_device *dev,
8047 struct netlink_ext_ack *extack)
8048{
8049 struct netdev_notifier_offload_xstats_info info = {
8050 .info.dev = dev,
8051 .info.extack = extack,
8052 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8053 };
8054 int err;
8055 int rc;
8056
8057 dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
8058 GFP_KERNEL);
8059 if (!dev->offload_xstats_l3)
8060 return -ENOMEM;
8061
8062 rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
8063 NETDEV_OFFLOAD_XSTATS_DISABLE,
8064 &info.info);
8065 err = notifier_to_errno(rc);
8066 if (err)
8067 goto free_stats;
8068
8069 return 0;
8070
8071free_stats:
8072 kfree(dev->offload_xstats_l3);
8073 dev->offload_xstats_l3 = NULL;
8074 return err;
8075}
8076
8077int netdev_offload_xstats_enable(struct net_device *dev,
8078 enum netdev_offload_xstats_type type,
8079 struct netlink_ext_ack *extack)
8080{
8081 ASSERT_RTNL();
8082
8083 if (netdev_offload_xstats_enabled(dev, type))
8084 return -EALREADY;
8085
8086 switch (type) {
8087 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8088 return netdev_offload_xstats_enable_l3(dev, extack);
8089 }
8090
8091 WARN_ON(1);
8092 return -EINVAL;
8093}
8094EXPORT_SYMBOL(netdev_offload_xstats_enable);
8095
8096static void netdev_offload_xstats_disable_l3(struct net_device *dev)
8097{
8098 struct netdev_notifier_offload_xstats_info info = {
8099 .info.dev = dev,
8100 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8101 };
8102
8103 call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
8104 &info.info);
8105 kfree(dev->offload_xstats_l3);
8106 dev->offload_xstats_l3 = NULL;
8107}
8108
8109int netdev_offload_xstats_disable(struct net_device *dev,
8110 enum netdev_offload_xstats_type type)
8111{
8112 ASSERT_RTNL();
8113
8114 if (!netdev_offload_xstats_enabled(dev, type))
8115 return -EALREADY;
8116
8117 switch (type) {
8118 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8119 netdev_offload_xstats_disable_l3(dev);
8120 return 0;
8121 }
8122
8123 WARN_ON(1);
8124 return -EINVAL;
8125}
8126EXPORT_SYMBOL(netdev_offload_xstats_disable);
8127
8128static void netdev_offload_xstats_disable_all(struct net_device *dev)
8129{
8130 netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
8131}
8132
8133static struct rtnl_hw_stats64 *
8134netdev_offload_xstats_get_ptr(const struct net_device *dev,
8135 enum netdev_offload_xstats_type type)
8136{
8137 switch (type) {
8138 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8139 return dev->offload_xstats_l3;
8140 }
8141
8142 WARN_ON(1);
8143 return NULL;
8144}
8145
8146bool netdev_offload_xstats_enabled(const struct net_device *dev,
8147 enum netdev_offload_xstats_type type)
8148{
8149 ASSERT_RTNL();
8150
8151 return netdev_offload_xstats_get_ptr(dev, type);
8152}
8153EXPORT_SYMBOL(netdev_offload_xstats_enabled);
8154
8155struct netdev_notifier_offload_xstats_ru {
8156 bool used;
8157};
8158
8159struct netdev_notifier_offload_xstats_rd {
8160 struct rtnl_hw_stats64 stats;
8161 bool used;
8162};
8163
8164static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
8165 const struct rtnl_hw_stats64 *src)
8166{
8167 dest->rx_packets += src->rx_packets;
8168 dest->tx_packets += src->tx_packets;
8169 dest->rx_bytes += src->rx_bytes;
8170 dest->tx_bytes += src->tx_bytes;
8171 dest->rx_errors += src->rx_errors;
8172 dest->tx_errors += src->tx_errors;
8173 dest->rx_dropped += src->rx_dropped;
8174 dest->tx_dropped += src->tx_dropped;
8175 dest->multicast += src->multicast;
8176}
8177
8178static int netdev_offload_xstats_get_used(struct net_device *dev,
8179 enum netdev_offload_xstats_type type,
8180 bool *p_used,
8181 struct netlink_ext_ack *extack)
8182{
8183 struct netdev_notifier_offload_xstats_ru report_used = {};
8184 struct netdev_notifier_offload_xstats_info info = {
8185 .info.dev = dev,
8186 .info.extack = extack,
8187 .type = type,
8188 .report_used = &report_used,
8189 };
8190 int rc;
8191
8192 WARN_ON(!netdev_offload_xstats_enabled(dev, type));
8193 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
8194 &info.info);
8195 *p_used = report_used.used;
8196 return notifier_to_errno(rc);
8197}
8198
8199static int netdev_offload_xstats_get_stats(struct net_device *dev,
8200 enum netdev_offload_xstats_type type,
8201 struct rtnl_hw_stats64 *p_stats,
8202 bool *p_used,
8203 struct netlink_ext_ack *extack)
8204{
8205 struct netdev_notifier_offload_xstats_rd report_delta = {};
8206 struct netdev_notifier_offload_xstats_info info = {
8207 .info.dev = dev,
8208 .info.extack = extack,
8209 .type = type,
8210 .report_delta = &report_delta,
8211 };
8212 struct rtnl_hw_stats64 *stats;
8213 int rc;
8214
8215 stats = netdev_offload_xstats_get_ptr(dev, type);
8216 if (WARN_ON(!stats))
8217 return -EINVAL;
8218
8219 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
8220 &info.info);
8221
8222 /* Cache whatever we got, even if there was an error, otherwise the
8223 * successful stats retrievals would get lost.
8224 */
8225 netdev_hw_stats64_add(stats, &report_delta.stats);
8226
8227 if (p_stats)
8228 *p_stats = *stats;
8229 *p_used = report_delta.used;
8230
8231 return notifier_to_errno(rc);
8232}
8233
8234int netdev_offload_xstats_get(struct net_device *dev,
8235 enum netdev_offload_xstats_type type,
8236 struct rtnl_hw_stats64 *p_stats, bool *p_used,
8237 struct netlink_ext_ack *extack)
8238{
8239 ASSERT_RTNL();
8240
8241 if (p_stats)
8242 return netdev_offload_xstats_get_stats(dev, type, p_stats,
8243 p_used, extack);
8244 else
8245 return netdev_offload_xstats_get_used(dev, type, p_used,
8246 extack);
8247}
8248EXPORT_SYMBOL(netdev_offload_xstats_get);
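
/*
 * Illustrative sketch (an assumption, not taken from an in-tree caller): the
 * expected control-path flow for the L3 offload-xstats API above, all under
 * the RTNL lock.  foo_collect() is a hypothetical helper name.
 *
 *	static int foo_collect(struct net_device *dev,
 *			       struct netlink_ext_ack *extack)
 *	{
 *		struct rtnl_hw_stats64 stats;
 *		bool used;
 *		int err;
 *
 *		ASSERT_RTNL();
 *		err = netdev_offload_xstats_enable(dev,
 *						   NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 *						   extack);
 *		if (err && err != -EALREADY)
 *			return err;
 *
 *		err = netdev_offload_xstats_get(dev,
 *						NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 *						&stats, &used, extack);
 *		if (err)
 *			return err;
 *
 *		netdev_offload_xstats_disable(dev,
 *					      NETDEV_OFFLOAD_XSTATS_TYPE_L3);
 *		return 0;
 *	}
 */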
8249
8250void
8251netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
8252 const struct rtnl_hw_stats64 *stats)
8253{
8254 report_delta->used = true;
8255 netdev_hw_stats64_add(&report_delta->stats, stats);
8256}
8257EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
8258
8259void
8260netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
8261{
8262 report_used->used = true;
8263}
8264EXPORT_SYMBOL(netdev_offload_xstats_report_used);
8265
8266void netdev_offload_xstats_push_delta(struct net_device *dev,
8267 enum netdev_offload_xstats_type type,
8268 const struct rtnl_hw_stats64 *p_stats)
8269{
8270 struct rtnl_hw_stats64 *stats;
8271
8272 ASSERT_RTNL();
8273
8274 stats = netdev_offload_xstats_get_ptr(dev, type);
8275 if (WARN_ON(!stats))
8276 return;
8277
8278 netdev_hw_stats64_add(stats, p_stats);
8279}
8280EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
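
/*
 * Illustrative sketch of the driver/notifier side (an assumption, not a
 * specific in-tree handler): a netdevice notifier reacting to the
 * NETDEV_OFFLOAD_XSTATS_REPORT_* events dispatched above reports back
 * through the report_used/report_delta cookies.  foo_hw_read_l3_stats()
 * is a hypothetical helper.
 *
 *	static int foo_netdevice_event(struct notifier_block *nb,
 *				       unsigned long event, void *ptr)
 *	{
 *		struct netdev_notifier_offload_xstats_info *info = ptr;
 *		struct rtnl_hw_stats64 delta = {};
 *
 *		if (info->type != NETDEV_OFFLOAD_XSTATS_TYPE_L3)
 *			return NOTIFY_DONE;
 *
 *		switch (event) {
 *		case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
 *			netdev_offload_xstats_report_used(info->report_used);
 *			return NOTIFY_OK;
 *		case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
 *			foo_hw_read_l3_stats(&delta);
 *			netdev_offload_xstats_report_delta(info->report_delta,
 *							   &delta);
 *			return NOTIFY_OK;
 *		}
 *		return NOTIFY_DONE;
 *	}
 */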
8281
8282/**
8283 * netdev_get_xmit_slave - Get the xmit slave of master device
8284 * @dev: device
8285 * @skb: The packet
8286 * @all_slaves: assume all the slaves are active
8287 *
8288 * The reference counters are not incremented so the caller must be
8289 * careful with locks. The caller must hold the RCU read lock.
8290 * %NULL is returned if no slave is found.
8291 */
8292
8293struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8294 struct sk_buff *skb,
8295 bool all_slaves)
8296{
8297 const struct net_device_ops *ops = dev->netdev_ops;
8298
8299 if (!ops->ndo_get_xmit_slave)
8300 return NULL;
8301 return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8302}
8303EXPORT_SYMBOL(netdev_get_xmit_slave);
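
/*
 * Illustrative sketch (assumed caller, not from this file): picking the
 * egress slave of a LAG/bond-like master for a given skb.  The bond_dev,
 * skb and slave variables are assumed to exist in the caller.
 *
 *	rcu_read_lock();
 *	slave = netdev_get_xmit_slave(bond_dev, skb, false);
 *	if (slave)
 *		netdev_dbg(bond_dev, "skb would egress via %s\n", slave->name);
 *	rcu_read_unlock();
 *
 * No reference is taken on the returned device, so it must not be used
 * after rcu_read_unlock().
 */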
8304
8305static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8306 struct sock *sk)
8307{
8308 const struct net_device_ops *ops = dev->netdev_ops;
8309
8310 if (!ops->ndo_sk_get_lower_dev)
8311 return NULL;
8312 return ops->ndo_sk_get_lower_dev(dev, sk);
8313}
8314
8315/**
8316 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8317 * @dev: device
8318 * @sk: the socket
8319 *
8320 * %NULL is returned if no lower device is found.
8321 */
8322
8323struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8324 struct sock *sk)
8325{
8326 struct net_device *lower;
8327
8328 lower = netdev_sk_get_lower_dev(dev, sk);
8329 while (lower) {
8330 dev = lower;
8331 lower = netdev_sk_get_lower_dev(dev, sk);
8332 }
8333
8334 return dev;
8335}
8336EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8337
8338static void netdev_adjacent_add_links(struct net_device *dev)
8339{
8340 struct netdev_adjacent *iter;
8341
8342 struct net *net = dev_net(dev);
8343
8344 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8345 if (!net_eq(net, dev_net(iter->dev)))
8346 continue;
8347 netdev_adjacent_sysfs_add(iter->dev, dev,
8348 &iter->dev->adj_list.lower);
8349 netdev_adjacent_sysfs_add(dev, iter->dev,
8350 &dev->adj_list.upper);
8351 }
8352
8353 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8354 if (!net_eq(net, dev_net(iter->dev)))
8355 continue;
8356 netdev_adjacent_sysfs_add(iter->dev, dev,
8357 &iter->dev->adj_list.upper);
8358 netdev_adjacent_sysfs_add(dev, iter->dev,
8359 &dev->adj_list.lower);
8360 }
8361}
8362
8363static void netdev_adjacent_del_links(struct net_device *dev)
8364{
8365 struct netdev_adjacent *iter;
8366
8367 struct net *net = dev_net(dev);
8368
8369 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8370 if (!net_eq(net, dev_net(iter->dev)))
8371 continue;
8372 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8373 &iter->dev->adj_list.lower);
8374 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8375 &dev->adj_list.upper);
8376 }
8377
8378 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8379 if (!net_eq(net, dev_net(iter->dev)))
8380 continue;
8381 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8382 &iter->dev->adj_list.upper);
8383 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8384 &dev->adj_list.lower);
8385 }
8386}
8387
8388void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8389{
8390 struct netdev_adjacent *iter;
8391
8392 struct net *net = dev_net(dev);
8393
8394 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8395 if (!net_eq(net, dev_net(iter->dev)))
8396 continue;
8397 netdev_adjacent_sysfs_del(iter->dev, oldname,
8398 &iter->dev->adj_list.lower);
8399 netdev_adjacent_sysfs_add(iter->dev, dev,
8400 &iter->dev->adj_list.lower);
8401 }
8402
8403 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8404 if (!net_eq(net, dev_net(iter->dev)))
8405 continue;
8406 netdev_adjacent_sysfs_del(iter->dev, oldname,
8407 &iter->dev->adj_list.upper);
8408 netdev_adjacent_sysfs_add(iter->dev, dev,
8409 &iter->dev->adj_list.upper);
8410 }
8411}
8412
8413void *netdev_lower_dev_get_private(struct net_device *dev,
8414 struct net_device *lower_dev)
8415{
8416 struct netdev_adjacent *lower;
8417
8418 if (!lower_dev)
8419 return NULL;
8420 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8421 if (!lower)
8422 return NULL;
8423
8424 return lower->private;
8425}
8426EXPORT_SYMBOL(netdev_lower_dev_get_private);
8427
8428
8429/**
8430 * netdev_lower_state_changed - Dispatch event about lower device state change
8431 * @lower_dev: device
8432 * @lower_state_info: state to dispatch
8433 *
8434 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8435 * The caller must hold the RTNL lock.
8436 */
8437void netdev_lower_state_changed(struct net_device *lower_dev,
8438 void *lower_state_info)
8439{
8440 struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8441 .info.dev = lower_dev,
8442 };
8443
8444 ASSERT_RTNL();
8445 changelowerstate_info.lower_state_info = lower_state_info;
8446 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8447 &changelowerstate_info.info);
8448}
8449EXPORT_SYMBOL(netdev_lower_state_changed);
8450
8451static void dev_change_rx_flags(struct net_device *dev, int flags)
8452{
8453 const struct net_device_ops *ops = dev->netdev_ops;
8454
8455 if (ops->ndo_change_rx_flags)
8456 ops->ndo_change_rx_flags(dev, flags);
8457}
8458
8459static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8460{
8461 unsigned int old_flags = dev->flags;
8462 kuid_t uid;
8463 kgid_t gid;
8464
8465 ASSERT_RTNL();
8466
8467 dev->flags |= IFF_PROMISC;
8468 dev->promiscuity += inc;
8469 if (dev->promiscuity == 0) {
8470 /*
8471 * Avoid overflow.
8472 * If inc causes overflow, leave promisc untouched and return an error.
8473 */
8474 if (inc < 0)
8475 dev->flags &= ~IFF_PROMISC;
8476 else {
8477 dev->promiscuity -= inc;
8478 netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
8479 return -EOVERFLOW;
8480 }
8481 }
8482 if (dev->flags != old_flags) {
8483 netdev_info(dev, "%s promiscuous mode\n",
8484 dev->flags & IFF_PROMISC ? "entered" : "left");
8485 if (audit_enabled) {
8486 current_uid_gid(&uid, &gid);
8487 audit_log(audit_context(), GFP_ATOMIC,
8488 AUDIT_ANOM_PROMISCUOUS,
8489 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8490 dev->name, (dev->flags & IFF_PROMISC),
8491 (old_flags & IFF_PROMISC),
8492 from_kuid(&init_user_ns, audit_get_loginuid(current)),
8493 from_kuid(&init_user_ns, uid),
8494 from_kgid(&init_user_ns, gid),
8495 audit_get_sessionid(current));
8496 }
8497
8498 dev_change_rx_flags(dev, IFF_PROMISC);
8499 }
8500 if (notify)
8501 __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
8502 return 0;
8503}
8504
8505/**
8506 * dev_set_promiscuity - update promiscuity count on a device
8507 * @dev: device
8508 * @inc: modifier
8509 *
8510 * Add or remove promiscuity from a device. While the count in the device
8511 * remains above zero the interface remains promiscuous. Once it hits zero
8512 * the device reverts to normal filtering operation. A negative @inc
8513 * value is used to drop promiscuity on the device.
8514 * Return 0 if successful or a negative errno code on error.
8515 */
8516int dev_set_promiscuity(struct net_device *dev, int inc)
8517{
8518 unsigned int old_flags = dev->flags;
8519 int err;
8520
8521 err = __dev_set_promiscuity(dev, inc, true);
8522 if (err < 0)
8523 return err;
8524 if (dev->flags != old_flags)
8525 dev_set_rx_mode(dev);
8526 return err;
8527}
8528EXPORT_SYMBOL(dev_set_promiscuity);
8529
8530static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8531{
8532 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8533
8534 ASSERT_RTNL();
8535
8536 dev->flags |= IFF_ALLMULTI;
8537 dev->allmulti += inc;
8538 if (dev->allmulti == 0) {
8539 /*
8540 * Avoid overflow.
8541 * If inc causes overflow, leave allmulti untouched and return an error.
8542 */
8543 if (inc < 0)
8544 dev->flags &= ~IFF_ALLMULTI;
8545 else {
8546 dev->allmulti -= inc;
8547 netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
8548 return -EOVERFLOW;
8549 }
8550 }
8551 if (dev->flags ^ old_flags) {
8552 netdev_info(dev, "%s allmulticast mode\n",
8553 dev->flags & IFF_ALLMULTI ? "entered" : "left");
8554 dev_change_rx_flags(dev, IFF_ALLMULTI);
8555 dev_set_rx_mode(dev);
8556 if (notify)
8557 __dev_notify_flags(dev, old_flags,
8558 dev->gflags ^ old_gflags, 0, NULL);
8559 }
8560 return 0;
8561}
8562
8563/**
8564 * dev_set_allmulti - update allmulti count on a device
8565 * @dev: device
8566 * @inc: modifier
8567 *
8568 * Add or remove reception of all multicast frames to a device. While the
8569 * count in the device remains above zero the interface remains listening
8570 * to all multicast frames. Once it hits zero the device reverts to normal
8571 * filtering operation. A negative @inc value is used to drop the counter
8572 * when releasing a resource needing all multicasts.
8573 * Return 0 if successful or a negative errno code on error.
8574 */
8575
8576int dev_set_allmulti(struct net_device *dev, int inc)
8577{
8578 return __dev_set_allmulti(dev, inc, true);
8579}
8580EXPORT_SYMBOL(dev_set_allmulti);
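
/*
 * Illustrative sketch (assumed caller, not from this file): a feature that
 * needs to see all traffic while active bumps the counters on setup and
 * drops them symmetrically on teardown, under the RTNL lock.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	if (err)
 *		return err;
 *	err = dev_set_allmulti(dev, 1);		// and receive all multicast
 *	if (err) {
 *		dev_set_promiscuity(dev, -1);
 *		return err;
 *	}
 *	...
 *	// on teardown:
 *	dev_set_allmulti(dev, -1);
 *	dev_set_promiscuity(dev, -1);
 */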
8581
8582/*
8583 * Upload unicast and multicast address lists to device and
8584 * configure RX filtering. When the device doesn't support unicast
8585 * filtering it is put in promiscuous mode while unicast addresses
8586 * are present.
8587 */
8588void __dev_set_rx_mode(struct net_device *dev)
8589{
8590 const struct net_device_ops *ops = dev->netdev_ops;
8591
8592 /* dev_open will call this function so the list will stay sane. */
8593 if (!(dev->flags&IFF_UP))
8594 return;
8595
8596 if (!netif_device_present(dev))
8597 return;
8598
8599 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8600 /* Unicast address changes may only happen under the rtnl,
8601 * therefore calling __dev_set_promiscuity here is safe.
8602 */
8603 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8604 __dev_set_promiscuity(dev, 1, false);
8605 dev->uc_promisc = true;
8606 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8607 __dev_set_promiscuity(dev, -1, false);
8608 dev->uc_promisc = false;
8609 }
8610 }
8611
8612 if (ops->ndo_set_rx_mode)
8613 ops->ndo_set_rx_mode(dev);
8614}
8615
8616void dev_set_rx_mode(struct net_device *dev)
8617{
8618 netif_addr_lock_bh(dev);
8619 __dev_set_rx_mode(dev);
8620 netif_addr_unlock_bh(dev);
8621}
8622
8623/**
8624 * dev_get_flags - get flags reported to userspace
8625 * @dev: device
8626 *
8627 * Get the combination of flag bits exported through APIs to userspace.
8628 */
8629unsigned int dev_get_flags(const struct net_device *dev)
8630{
8631 unsigned int flags;
8632
8633 flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
8634 IFF_ALLMULTI |
8635 IFF_RUNNING |
8636 IFF_LOWER_UP |
8637 IFF_DORMANT)) |
8638 (READ_ONCE(dev->gflags) & (IFF_PROMISC |
8639 IFF_ALLMULTI));
8640
8641 if (netif_running(dev)) {
8642 if (netif_oper_up(dev))
8643 flags |= IFF_RUNNING;
8644 if (netif_carrier_ok(dev))
8645 flags |= IFF_LOWER_UP;
8646 if (netif_dormant(dev))
8647 flags |= IFF_DORMANT;
8648 }
8649
8650 return flags;
8651}
8652EXPORT_SYMBOL(dev_get_flags);
8653
8654int __dev_change_flags(struct net_device *dev, unsigned int flags,
8655 struct netlink_ext_ack *extack)
8656{
8657 unsigned int old_flags = dev->flags;
8658 int ret;
8659
8660 ASSERT_RTNL();
8661
8662 /*
8663 * Set the flags on our device.
8664 */
8665
8666 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8667 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8668 IFF_AUTOMEDIA)) |
8669 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8670 IFF_ALLMULTI));
8671
8672 /*
8673 * Load in the correct multicast list now the flags have changed.
8674 */
8675
8676 if ((old_flags ^ flags) & IFF_MULTICAST)
8677 dev_change_rx_flags(dev, IFF_MULTICAST);
8678
8679 dev_set_rx_mode(dev);
8680
8681 /*
8682 * Have we downed the interface? We handle IFF_UP ourselves
8683 * according to user attempts to set it, rather than blindly
8684 * setting it.
8685 */
8686
8687 ret = 0;
8688 if ((old_flags ^ flags) & IFF_UP) {
8689 if (old_flags & IFF_UP)
8690 __dev_close(dev);
8691 else
8692 ret = __dev_open(dev, extack);
8693 }
8694
8695 if ((flags ^ dev->gflags) & IFF_PROMISC) {
8696 int inc = (flags & IFF_PROMISC) ? 1 : -1;
8697 unsigned int old_flags = dev->flags;
8698
8699 dev->gflags ^= IFF_PROMISC;
8700
8701 if (__dev_set_promiscuity(dev, inc, false) >= 0)
8702 if (dev->flags != old_flags)
8703 dev_set_rx_mode(dev);
8704 }
8705
8706 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8707 * is important. Some (broken) drivers set IFF_PROMISC when IFF_ALLMULTI
8708 * is requested, without asking us and without reporting it.
8709 */
8710 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8711 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8712
8713 dev->gflags ^= IFF_ALLMULTI;
8714 __dev_set_allmulti(dev, inc, false);
8715 }
8716
8717 return ret;
8718}
8719
8720void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8721 unsigned int gchanges, u32 portid,
8722 const struct nlmsghdr *nlh)
8723{
8724 unsigned int changes = dev->flags ^ old_flags;
8725
8726 if (gchanges)
8727 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
8728
8729 if (changes & IFF_UP) {
8730 if (dev->flags & IFF_UP)
8731 call_netdevice_notifiers(NETDEV_UP, dev);
8732 else
8733 call_netdevice_notifiers(NETDEV_DOWN, dev);
8734 }
8735
8736 if (dev->flags & IFF_UP &&
8737 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8738 struct netdev_notifier_change_info change_info = {
8739 .info = {
8740 .dev = dev,
8741 },
8742 .flags_changed = changes,
8743 };
8744
8745 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8746 }
8747}
8748
8749/**
8750 * dev_change_flags - change device settings
8751 * @dev: device
8752 * @flags: device state flags
8753 * @extack: netlink extended ack
8754 *
8755 * Change settings on device based state flags. The flags are
8756 * in the userspace exported format.
8757 */
8758int dev_change_flags(struct net_device *dev, unsigned int flags,
8759 struct netlink_ext_ack *extack)
8760{
8761 int ret;
8762 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8763
8764 ret = __dev_change_flags(dev, flags, extack);
8765 if (ret < 0)
8766 return ret;
8767
8768 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8769 __dev_notify_flags(dev, old_flags, changes, 0, NULL);
8770 return ret;
8771}
8772EXPORT_SYMBOL(dev_change_flags);
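
/*
 * Illustrative sketch (an assumption, mirroring typical in-kernel callers):
 * administratively bringing an interface up from kernel code by OR-ing
 * IFF_UP into the userspace-visible flags, under the RTNL lock.
 *
 *	ASSERT_RTNL();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
 *	if (err)
 *		netdev_err(dev, "failed to bring device up: %d\n", err);
 */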
8773
8774int __dev_set_mtu(struct net_device *dev, int new_mtu)
8775{
8776 const struct net_device_ops *ops = dev->netdev_ops;
8777
8778 if (ops->ndo_change_mtu)
8779 return ops->ndo_change_mtu(dev, new_mtu);
8780
8781 /* Pairs with all the lockless reads of dev->mtu in the stack */
8782 WRITE_ONCE(dev->mtu, new_mtu);
8783 return 0;
8784}
8785EXPORT_SYMBOL(__dev_set_mtu);
8786
8787int dev_validate_mtu(struct net_device *dev, int new_mtu,
8788 struct netlink_ext_ack *extack)
8789{
8790 /* MTU must be positive, and in range */
8791 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8792 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8793 return -EINVAL;
8794 }
8795
8796 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8797 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8798 return -EINVAL;
8799 }
8800 return 0;
8801}
8802
8803/**
8804 * dev_set_mtu_ext - Change maximum transfer unit
8805 * @dev: device
8806 * @new_mtu: new transfer unit
8807 * @extack: netlink extended ack
8808 *
8809 * Change the maximum transfer size of the network device.
8810 */
8811int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8812 struct netlink_ext_ack *extack)
8813{
8814 int err, orig_mtu;
8815
8816 if (new_mtu == dev->mtu)
8817 return 0;
8818
8819 err = dev_validate_mtu(dev, new_mtu, extack);
8820 if (err)
8821 return err;
8822
8823 if (!netif_device_present(dev))
8824 return -ENODEV;
8825
8826 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8827 err = notifier_to_errno(err);
8828 if (err)
8829 return err;
8830
8831 orig_mtu = dev->mtu;
8832 err = __dev_set_mtu(dev, new_mtu);
8833
8834 if (!err) {
8835 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8836 orig_mtu);
8837 err = notifier_to_errno(err);
8838 if (err) {
8839 /* setting mtu back and notifying everyone again,
8840 * so that they have a chance to revert changes.
8841 */
8842 __dev_set_mtu(dev, orig_mtu);
8843 call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8844 new_mtu);
8845 }
8846 }
8847 return err;
8848}
8849
8850int dev_set_mtu(struct net_device *dev, int new_mtu)
8851{
8852 struct netlink_ext_ack extack;
8853 int err;
8854
8855 memset(&extack, 0, sizeof(extack));
8856 err = dev_set_mtu_ext(dev, new_mtu, &extack);
8857 if (err && extack._msg)
8858 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8859 return err;
8860}
8861EXPORT_SYMBOL(dev_set_mtu);
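
/*
 * Illustrative sketch (assumed caller): changing the MTU from kernel code.
 * dev_set_mtu() validates the new value against dev->min_mtu/max_mtu and
 * notifies NETDEV_CHANGEMTU listeners, so the caller only needs RTNL.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_mtu(dev, 9000);		// e.g. switch to jumbo frames
 *	if (err)
 *		netdev_warn(dev, "cannot set MTU to 9000: %d\n", err);
 */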
8862
8863/**
8864 * dev_change_tx_queue_len - Change TX queue length of a netdevice
8865 * @dev: device
8866 * @new_len: new tx queue length
8867 */
8868int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8869{
8870 unsigned int orig_len = dev->tx_queue_len;
8871 int res;
8872
8873 if (new_len != (unsigned int)new_len)
8874 return -ERANGE;
8875
8876 if (new_len != orig_len) {
8877 dev->tx_queue_len = new_len;
8878 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8879 res = notifier_to_errno(res);
8880 if (res)
8881 goto err_rollback;
8882 res = dev_qdisc_change_tx_queue_len(dev);
8883 if (res)
8884 goto err_rollback;
8885 }
8886
8887 return 0;
8888
8889err_rollback:
8890 netdev_err(dev, "refused to change device tx_queue_len\n");
8891 dev->tx_queue_len = orig_len;
8892 return res;
8893}
8894
8895/**
8896 * dev_set_group - Change group this device belongs to
8897 * @dev: device
8898 * @new_group: group this device should belong to
8899 */
8900void dev_set_group(struct net_device *dev, int new_group)
8901{
8902 dev->group = new_group;
8903}
8904
8905/**
8906 * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8907 * @dev: device
8908 * @addr: new address
8909 * @extack: netlink extended ack
8910 */
8911int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8912 struct netlink_ext_ack *extack)
8913{
8914 struct netdev_notifier_pre_changeaddr_info info = {
8915 .info.dev = dev,
8916 .info.extack = extack,
8917 .dev_addr = addr,
8918 };
8919 int rc;
8920
8921 rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8922 return notifier_to_errno(rc);
8923}
8924EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8925
8926/**
8927 * dev_set_mac_address - Change Media Access Control Address
8928 * @dev: device
8929 * @sa: new address
8930 * @extack: netlink extended ack
8931 *
8932 * Change the hardware (MAC) address of the device
8933 */
8934int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8935 struct netlink_ext_ack *extack)
8936{
8937 const struct net_device_ops *ops = dev->netdev_ops;
8938 int err;
8939
8940 if (!ops->ndo_set_mac_address)
8941 return -EOPNOTSUPP;
8942 if (sa->sa_family != dev->type)
8943 return -EINVAL;
8944 if (!netif_device_present(dev))
8945 return -ENODEV;
8946 err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8947 if (err)
8948 return err;
8949 if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
8950 err = ops->ndo_set_mac_address(dev, sa);
8951 if (err)
8952 return err;
8953 }
8954 dev->addr_assign_type = NET_ADDR_SET;
8955 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8956 add_device_randomness(dev->dev_addr, dev->addr_len);
8957 return 0;
8958}
8959EXPORT_SYMBOL(dev_set_mac_address);
8960
8961DECLARE_RWSEM(dev_addr_sem);
8962
8963int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8964 struct netlink_ext_ack *extack)
8965{
8966 int ret;
8967
8968 down_write(&dev_addr_sem);
8969 ret = dev_set_mac_address(dev, sa, extack);
8970 up_write(&dev_addr_sem);
8971 return ret;
8972}
8973EXPORT_SYMBOL(dev_set_mac_address_user);
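
/*
 * Illustrative sketch (an assumption, not a specific in-tree caller):
 * changing a device's MAC address from kernel code.  The new_addr buffer
 * is assumed to hold dev->addr_len bytes and to fit in sockaddr sa_data
 * (true for Ethernet-style devices).
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *
 *	ASSERT_RTNL();
 *	err = dev_set_mac_address(dev, &sa, NULL);
 *
 * dev_set_mac_address_user() does the same but additionally serializes
 * against concurrent readers of dev_addr_sem, which is how the ioctl and
 * rtnetlink paths call it.
 */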
8974
8975int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8976{
8977 size_t size = sizeof(sa->sa_data_min);
8978 struct net_device *dev;
8979 int ret = 0;
8980
8981 down_read(&dev_addr_sem);
8982 rcu_read_lock();
8983
8984 dev = dev_get_by_name_rcu(net, dev_name);
8985 if (!dev) {
8986 ret = -ENODEV;
8987 goto unlock;
8988 }
8989 if (!dev->addr_len)
8990 memset(sa->sa_data, 0, size);
8991 else
8992 memcpy(sa->sa_data, dev->dev_addr,
8993 min_t(size_t, size, dev->addr_len));
8994 sa->sa_family = dev->type;
8995
8996unlock:
8997 rcu_read_unlock();
8998 up_read(&dev_addr_sem);
8999 return ret;
9000}
9001EXPORT_SYMBOL(dev_get_mac_address);
9002
9003/**
9004 * dev_change_carrier - Change device carrier
9005 * @dev: device
9006 * @new_carrier: new value
9007 *
9008 * Change device carrier
9009 */
9010int dev_change_carrier(struct net_device *dev, bool new_carrier)
9011{
9012 const struct net_device_ops *ops = dev->netdev_ops;
9013
9014 if (!ops->ndo_change_carrier)
9015 return -EOPNOTSUPP;
9016 if (!netif_device_present(dev))
9017 return -ENODEV;
9018 return ops->ndo_change_carrier(dev, new_carrier);
9019}
9020
9021/**
9022 * dev_get_phys_port_id - Get device physical port ID
9023 * @dev: device
9024 * @ppid: port ID
9025 *
9026 * Get device physical port ID
9027 */
9028int dev_get_phys_port_id(struct net_device *dev,
9029 struct netdev_phys_item_id *ppid)
9030{
9031 const struct net_device_ops *ops = dev->netdev_ops;
9032
9033 if (!ops->ndo_get_phys_port_id)
9034 return -EOPNOTSUPP;
9035 return ops->ndo_get_phys_port_id(dev, ppid);
9036}
9037
9038/**
9039 * dev_get_phys_port_name - Get device physical port name
9040 * @dev: device
9041 * @name: port name
9042 * @len: limit of bytes to copy to name
9043 *
9044 * Get device physical port name
9045 */
9046int dev_get_phys_port_name(struct net_device *dev,
9047 char *name, size_t len)
9048{
9049 const struct net_device_ops *ops = dev->netdev_ops;
9050 int err;
9051
9052 if (ops->ndo_get_phys_port_name) {
9053 err = ops->ndo_get_phys_port_name(dev, name, len);
9054 if (err != -EOPNOTSUPP)
9055 return err;
9056 }
9057 return devlink_compat_phys_port_name_get(dev, name, len);
9058}
9059
9060/**
9061 * dev_get_port_parent_id - Get the device's port parent identifier
9062 * @dev: network device
9063 * @ppid: pointer to a storage for the port's parent identifier
9064 * @recurse: allow/disallow recursion to lower devices
9065 *
9066 * Get the device's port parent identifier
9067 */
9068int dev_get_port_parent_id(struct net_device *dev,
9069 struct netdev_phys_item_id *ppid,
9070 bool recurse)
9071{
9072 const struct net_device_ops *ops = dev->netdev_ops;
9073 struct netdev_phys_item_id first = { };
9074 struct net_device *lower_dev;
9075 struct list_head *iter;
9076 int err;
9077
9078 if (ops->ndo_get_port_parent_id) {
9079 err = ops->ndo_get_port_parent_id(dev, ppid);
9080 if (err != -EOPNOTSUPP)
9081 return err;
9082 }
9083
9084 err = devlink_compat_switch_id_get(dev, ppid);
9085 if (!recurse || err != -EOPNOTSUPP)
9086 return err;
9087
9088 netdev_for_each_lower_dev(dev, lower_dev, iter) {
9089 err = dev_get_port_parent_id(lower_dev, ppid, true);
9090 if (err)
9091 break;
9092 if (!first.id_len)
9093 first = *ppid;
9094 else if (memcmp(&first, ppid, sizeof(*ppid)))
9095 return -EOPNOTSUPP;
9096 }
9097
9098 return err;
9099}
9100EXPORT_SYMBOL(dev_get_port_parent_id);
9101
9102/**
9103 * netdev_port_same_parent_id - Indicate if two network devices have
9104 * the same port parent identifier
9105 * @a: first network device
9106 * @b: second network device
9107 */
9108bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
9109{
9110 struct netdev_phys_item_id a_id = { };
9111 struct netdev_phys_item_id b_id = { };
9112
9113 if (dev_get_port_parent_id(a, &a_id, true) ||
9114 dev_get_port_parent_id(b, &b_id, true))
9115 return false;
9116
9117 return netdev_phys_item_id_same(&a_id, &b_id);
9118}
9119EXPORT_SYMBOL(netdev_port_same_parent_id);
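
/*
 * Illustrative sketch (assumed caller): a switchdev-style driver deciding
 * whether two ports can be offloaded together, e.g. when one is added to a
 * bridge that already contains the other.  port_dev and other_port_dev are
 * assumed to exist in the caller.
 *
 *	if (!netdev_port_same_parent_id(port_dev, other_port_dev))
 *		return -EOPNOTSUPP;	// ports sit behind different switches
 */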
9120
9121/**
9122 * dev_change_proto_down - set carrier according to proto_down.
9123 *
9124 * @dev: device
9125 * @proto_down: new value
9126 */
9127int dev_change_proto_down(struct net_device *dev, bool proto_down)
9128{
9129 if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
9130 return -EOPNOTSUPP;
9131 if (!netif_device_present(dev))
9132 return -ENODEV;
9133 if (proto_down)
9134 netif_carrier_off(dev);
9135 else
9136 netif_carrier_on(dev);
9137 dev->proto_down = proto_down;
9138 return 0;
9139}
9140
9141/**
9142 * dev_change_proto_down_reason - update proto down reason bits
9143 *
9144 * @dev: device
9145 * @mask: mask of reason bits to update; 0 means replace the whole value
9146 * @value: new value for the selected reason bits
9147 */
9148void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
9149 u32 value)
9150{
9151 int b;
9152
9153 if (!mask) {
9154 dev->proto_down_reason = value;
9155 } else {
9156 for_each_set_bit(b, &mask, 32) {
9157 if (value & (1 << b))
9158 dev->proto_down_reason |= BIT(b);
9159 else
9160 dev->proto_down_reason &= ~BIT(b);
9161 }
9162 }
9163}
9164
9165struct bpf_xdp_link {
9166 struct bpf_link link;
9167 struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9168 int flags;
9169};
9170
9171static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9172{
9173 if (flags & XDP_FLAGS_HW_MODE)
9174 return XDP_MODE_HW;
9175 if (flags & XDP_FLAGS_DRV_MODE)
9176 return XDP_MODE_DRV;
9177 if (flags & XDP_FLAGS_SKB_MODE)
9178 return XDP_MODE_SKB;
9179 return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9180}
9181
9182static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9183{
9184 switch (mode) {
9185 case XDP_MODE_SKB:
9186 return generic_xdp_install;
9187 case XDP_MODE_DRV:
9188 case XDP_MODE_HW:
9189 return dev->netdev_ops->ndo_bpf;
9190 default:
9191 return NULL;
9192 }
9193}
9194
9195static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9196 enum bpf_xdp_mode mode)
9197{
9198 return dev->xdp_state[mode].link;
9199}
9200
9201static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9202 enum bpf_xdp_mode mode)
9203{
9204 struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9205
9206 if (link)
9207 return link->link.prog;
9208 return dev->xdp_state[mode].prog;
9209}
9210
9211u8 dev_xdp_prog_count(struct net_device *dev)
9212{
9213 u8 count = 0;
9214 int i;
9215
9216 for (i = 0; i < __MAX_XDP_MODE; i++)
9217 if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9218 count++;
9219 return count;
9220}
9221EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
9222
9223u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9224{
9225 struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9226
9227 return prog ? prog->aux->id : 0;
9228}
9229
9230static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9231 struct bpf_xdp_link *link)
9232{
9233 dev->xdp_state[mode].link = link;
9234 dev->xdp_state[mode].prog = NULL;
9235}
9236
9237static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9238 struct bpf_prog *prog)
9239{
9240 dev->xdp_state[mode].link = NULL;
9241 dev->xdp_state[mode].prog = prog;
9242}
9243
9244static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9245 bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9246 u32 flags, struct bpf_prog *prog)
9247{
9248 struct netdev_bpf xdp;
9249 int err;
9250
9251 memset(&xdp, 0, sizeof(xdp));
9252 xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9253 xdp.extack = extack;
9254 xdp.flags = flags;
9255 xdp.prog = prog;
9256
9257 /* Drivers assume refcnt is already incremented (i.e., prog pointer is
9258 * "moved" into driver), so they don't increment it on their own, but
9259 * they do decrement refcnt when program is detached or replaced.
9260 * Given net_device also owns link/prog, we need to bump refcnt here
9261 * to prevent drivers from underflowing it.
9262 */
9263 if (prog)
9264 bpf_prog_inc(prog);
9265 err = bpf_op(dev, &xdp);
9266 if (err) {
9267 if (prog)
9268 bpf_prog_put(prog);
9269 return err;
9270 }
9271
9272 if (mode != XDP_MODE_HW)
9273 bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9274
9275 return 0;
9276}
9277
9278static void dev_xdp_uninstall(struct net_device *dev)
9279{
9280 struct bpf_xdp_link *link;
9281 struct bpf_prog *prog;
9282 enum bpf_xdp_mode mode;
9283 bpf_op_t bpf_op;
9284
9285 ASSERT_RTNL();
9286
9287 for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9288 prog = dev_xdp_prog(dev, mode);
9289 if (!prog)
9290 continue;
9291
9292 bpf_op = dev_xdp_bpf_op(dev, mode);
9293 if (!bpf_op)
9294 continue;
9295
9296 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9297
9298 /* auto-detach link from net device */
9299 link = dev_xdp_link(dev, mode);
9300 if (link)
9301 link->dev = NULL;
9302 else
9303 bpf_prog_put(prog);
9304
9305 dev_xdp_set_link(dev, mode, NULL);
9306 }
9307}
9308
9309static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9310 struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9311 struct bpf_prog *old_prog, u32 flags)
9312{
9313 unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9314 struct bpf_prog *cur_prog;
9315 struct net_device *upper;
9316 struct list_head *iter;
9317 enum bpf_xdp_mode mode;
9318 bpf_op_t bpf_op;
9319 int err;
9320
9321 ASSERT_RTNL();
9322
9323 /* either link or prog attachment, never both */
9324 if (link && (new_prog || old_prog))
9325 return -EINVAL;
9326 /* link supports only XDP mode flags */
9327 if (link && (flags & ~XDP_FLAGS_MODES)) {
9328 NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9329 return -EINVAL;
9330 }
9331 /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9332 if (num_modes > 1) {
9333 NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9334 return -EINVAL;
9335 }
9336 /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9337 if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9338 NL_SET_ERR_MSG(extack,
9339 "More than one program loaded, unset mode is ambiguous");
9340 return -EINVAL;
9341 }
9342 /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9343 if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9344 NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9345 return -EINVAL;
9346 }
9347
9348 mode = dev_xdp_mode(dev, flags);
9349 /* can't replace attached link */
9350 if (dev_xdp_link(dev, mode)) {
9351 NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9352 return -EBUSY;
9353 }
9354
9355 /* don't allow if an upper device already has a program */
9356 netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9357 if (dev_xdp_prog_count(upper) > 0) {
9358 NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9359 return -EEXIST;
9360 }
9361 }
9362
9363 cur_prog = dev_xdp_prog(dev, mode);
9364 /* can't replace attached prog with link */
9365 if (link && cur_prog) {
9366 NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9367 return -EBUSY;
9368 }
9369 if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9370 NL_SET_ERR_MSG(extack, "Active program does not match expected");
9371 return -EEXIST;
9372 }
9373
9374 /* put effective new program into new_prog */
9375 if (link)
9376 new_prog = link->link.prog;
9377
9378 if (new_prog) {
9379 bool offload = mode == XDP_MODE_HW;
9380 enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9381 ? XDP_MODE_DRV : XDP_MODE_SKB;
9382
9383 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9384 NL_SET_ERR_MSG(extack, "XDP program already attached");
9385 return -EBUSY;
9386 }
9387 if (!offload && dev_xdp_prog(dev, other_mode)) {
9388 NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9389 return -EEXIST;
9390 }
9391 if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
9392 NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
9393 return -EINVAL;
9394 }
9395 if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
9396 NL_SET_ERR_MSG(extack, "Program bound to different device");
9397 return -EINVAL;
9398 }
9399 if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9400 NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9401 return -EINVAL;
9402 }
9403 if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9404 NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9405 return -EINVAL;
9406 }
9407 }
9408
9409 /* don't call drivers if the effective program didn't change */
9410 if (new_prog != cur_prog) {
9411 bpf_op = dev_xdp_bpf_op(dev, mode);
9412 if (!bpf_op) {
9413 NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9414 return -EOPNOTSUPP;
9415 }
9416
9417 err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9418 if (err)
9419 return err;
9420 }
9421
9422 if (link)
9423 dev_xdp_set_link(dev, mode, link);
9424 else
9425 dev_xdp_set_prog(dev, mode, new_prog);
9426 if (cur_prog)
9427 bpf_prog_put(cur_prog);
9428
9429 return 0;
9430}
9431
9432static int dev_xdp_attach_link(struct net_device *dev,
9433 struct netlink_ext_ack *extack,
9434 struct bpf_xdp_link *link)
9435{
9436 return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9437}
9438
9439static int dev_xdp_detach_link(struct net_device *dev,
9440 struct netlink_ext_ack *extack,
9441 struct bpf_xdp_link *link)
9442{
9443 enum bpf_xdp_mode mode;
9444 bpf_op_t bpf_op;
9445
9446 ASSERT_RTNL();
9447
9448 mode = dev_xdp_mode(dev, link->flags);
9449 if (dev_xdp_link(dev, mode) != link)
9450 return -EINVAL;
9451
9452 bpf_op = dev_xdp_bpf_op(dev, mode);
9453 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9454 dev_xdp_set_link(dev, mode, NULL);
9455 return 0;
9456}
9457
9458static void bpf_xdp_link_release(struct bpf_link *link)
9459{
9460 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9461
9462 rtnl_lock();
9463
9464 /* if racing with net_device's teardown, xdp_link->dev might be
9465 * already NULL, in which case the link was already auto-detached
9466 */
9467 if (xdp_link->dev) {
9468 WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9469 xdp_link->dev = NULL;
9470 }
9471
9472 rtnl_unlock();
9473}
9474
9475static int bpf_xdp_link_detach(struct bpf_link *link)
9476{
9477 bpf_xdp_link_release(link);
9478 return 0;
9479}
9480
9481static void bpf_xdp_link_dealloc(struct bpf_link *link)
9482{
9483 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9484
9485 kfree(xdp_link);
9486}
9487
9488static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9489 struct seq_file *seq)
9490{
9491 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9492 u32 ifindex = 0;
9493
9494 rtnl_lock();
9495 if (xdp_link->dev)
9496 ifindex = xdp_link->dev->ifindex;
9497 rtnl_unlock();
9498
9499 seq_printf(seq, "ifindex:\t%u\n", ifindex);
9500}
9501
9502static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9503 struct bpf_link_info *info)
9504{
9505 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9506 u32 ifindex = 0;
9507
9508 rtnl_lock();
9509 if (xdp_link->dev)
9510 ifindex = xdp_link->dev->ifindex;
9511 rtnl_unlock();
9512
9513 info->xdp.ifindex = ifindex;
9514 return 0;
9515}
9516
9517static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9518 struct bpf_prog *old_prog)
9519{
9520 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9521 enum bpf_xdp_mode mode;
9522 bpf_op_t bpf_op;
9523 int err = 0;
9524
9525 rtnl_lock();
9526
9527 /* link might have been auto-released already, so fail */
9528 if (!xdp_link->dev) {
9529 err = -ENOLINK;
9530 goto out_unlock;
9531 }
9532
9533 if (old_prog && link->prog != old_prog) {
9534 err = -EPERM;
9535 goto out_unlock;
9536 }
9537 old_prog = link->prog;
9538 if (old_prog->type != new_prog->type ||
9539 old_prog->expected_attach_type != new_prog->expected_attach_type) {
9540 err = -EINVAL;
9541 goto out_unlock;
9542 }
9543
9544 if (old_prog == new_prog) {
9545 /* no-op, don't disturb drivers */
9546 bpf_prog_put(new_prog);
9547 goto out_unlock;
9548 }
9549
9550 mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9551 bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9552 err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9553 xdp_link->flags, new_prog);
9554 if (err)
9555 goto out_unlock;
9556
9557 old_prog = xchg(&link->prog, new_prog);
9558 bpf_prog_put(old_prog);
9559
9560out_unlock:
9561 rtnl_unlock();
9562 return err;
9563}
9564
9565static const struct bpf_link_ops bpf_xdp_link_lops = {
9566 .release = bpf_xdp_link_release,
9567 .dealloc = bpf_xdp_link_dealloc,
9568 .detach = bpf_xdp_link_detach,
9569 .show_fdinfo = bpf_xdp_link_show_fdinfo,
9570 .fill_link_info = bpf_xdp_link_fill_link_info,
9571 .update_prog = bpf_xdp_link_update,
9572};
9573
9574int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9575{
9576 struct net *net = current->nsproxy->net_ns;
9577 struct bpf_link_primer link_primer;
9578 struct netlink_ext_ack extack = {};
9579 struct bpf_xdp_link *link;
9580 struct net_device *dev;
9581 int err, fd;
9582
9583 rtnl_lock();
9584 dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9585 if (!dev) {
9586 rtnl_unlock();
9587 return -EINVAL;
9588 }
9589
9590 link = kzalloc(sizeof(*link), GFP_USER);
9591 if (!link) {
9592 err = -ENOMEM;
9593 goto unlock;
9594 }
9595
9596 bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9597 link->dev = dev;
9598 link->flags = attr->link_create.flags;
9599
9600 err = bpf_link_prime(&link->link, &link_primer);
9601 if (err) {
9602 kfree(link);
9603 goto unlock;
9604 }
9605
9606 err = dev_xdp_attach_link(dev, &extack, link);
9607 rtnl_unlock();
9608
9609 if (err) {
9610 link->dev = NULL;
9611 bpf_link_cleanup(&link_primer);
9612 trace_bpf_xdp_link_attach_failed(extack._msg);
9613 goto out_put_dev;
9614 }
9615
9616 fd = bpf_link_settle(&link_primer);
9617 /* the link itself doesn't hold dev's refcnt, so as not to complicate shutdown */
9618 dev_put(dev);
9619 return fd;
9620
9621unlock:
9622 rtnl_unlock();
9623
9624out_put_dev:
9625 dev_put(dev);
9626 return err;
9627}
9628
9629/**
9630 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
9631 * @dev: device
9632 * @extack: netlink extended ack
9633 * @fd: new program fd or negative value to clear
9634 * @expected_fd: old program fd that userspace expects to replace or clear
9635 * @flags: xdp-related flags
9636 *
9637 * Set or clear a bpf program for a device
9638 */
9639int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9640 int fd, int expected_fd, u32 flags)
9641{
9642 enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9643 struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9644 int err;
9645
9646 ASSERT_RTNL();
9647
9648 if (fd >= 0) {
9649 new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9650 mode != XDP_MODE_SKB);
9651 if (IS_ERR(new_prog))
9652 return PTR_ERR(new_prog);
9653 }
9654
9655 if (expected_fd >= 0) {
9656 old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9657 mode != XDP_MODE_SKB);
9658 if (IS_ERR(old_prog)) {
9659 err = PTR_ERR(old_prog);
9660 old_prog = NULL;
9661 goto err_out;
9662 }
9663 }
9664
9665 err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9666
9667err_out:
9668 if (err && new_prog)
9669 bpf_prog_put(new_prog);
9670 if (old_prog)
9671 bpf_prog_put(old_prog);
9672 return err;
9673}
9674
9675/**
9676 * dev_index_reserve() - allocate an ifindex in a namespace
9677 * @net: the applicable net namespace
9678 * @ifindex: requested ifindex, pass %0 to get one allocated
9679 *
9680 * Allocate an ifindex for a new device. Caller must either use the ifindex
9681 * to store the device (via list_netdevice()) or call dev_index_release()
9682 * to give the index up.
9683 *
9684 * Return: a suitable unique value for a new device interface number or -errno.
9685 */
9686static int dev_index_reserve(struct net *net, u32 ifindex)
9687{
9688 int err;
9689
9690 if (ifindex > INT_MAX) {
9691 DEBUG_NET_WARN_ON_ONCE(1);
9692 return -EINVAL;
9693 }
9694
9695 if (!ifindex)
9696 err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
9697 xa_limit_31b, &net->ifindex, GFP_KERNEL);
9698 else
9699 err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
9700 if (err < 0)
9701 return err;
9702
9703 return ifindex;
9704}
9705
9706static void dev_index_release(struct net *net, int ifindex)
9707{
9708 /* Expect only unused indexes; unlist_netdevice() removes the used ones */
9709 WARN_ON(xa_erase(&net->dev_by_index, ifindex));
9710}
9711
9712 /* Delayed registration/unregistration */
9713LIST_HEAD(net_todo_list);
9714DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9715atomic_t dev_unreg_count = ATOMIC_INIT(0);
9716
9717static void net_set_todo(struct net_device *dev)
9718{
9719 list_add_tail(&dev->todo_list, &net_todo_list);
9720}
9721
9722static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9723 struct net_device *upper, netdev_features_t features)
9724{
9725 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9726 netdev_features_t feature;
9727 int feature_bit;
9728
9729 for_each_netdev_feature(upper_disables, feature_bit) {
9730 feature = __NETIF_F_BIT(feature_bit);
9731 if (!(upper->wanted_features & feature)
9732 && (features & feature)) {
9733 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9734 &feature, upper->name);
9735 features &= ~feature;
9736 }
9737 }
9738
9739 return features;
9740}
9741
9742static void netdev_sync_lower_features(struct net_device *upper,
9743 struct net_device *lower, netdev_features_t features)
9744{
9745 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9746 netdev_features_t feature;
9747 int feature_bit;
9748
9749 for_each_netdev_feature(upper_disables, feature_bit) {
9750 feature = __NETIF_F_BIT(feature_bit);
9751 if (!(features & feature) && (lower->features & feature)) {
9752 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9753 &feature, lower->name);
9754 lower->wanted_features &= ~feature;
9755 __netdev_update_features(lower);
9756
9757 if (unlikely(lower->features & feature))
9758 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9759 &feature, lower->name);
9760 else
9761 netdev_features_change(lower);
9762 }
9763 }
9764}
9765
9766static netdev_features_t netdev_fix_features(struct net_device *dev,
9767 netdev_features_t features)
9768{
9769 /* Fix illegal checksum combinations */
9770 if ((features & NETIF_F_HW_CSUM) &&
9771 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9772 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9773 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9774 }
9775
9776 /* TSO requires that SG is present as well. */
9777 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9778 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9779 features &= ~NETIF_F_ALL_TSO;
9780 }
9781
9782 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9783 !(features & NETIF_F_IP_CSUM)) {
9784 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9785 features &= ~NETIF_F_TSO;
9786 features &= ~NETIF_F_TSO_ECN;
9787 }
9788
9789 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9790 !(features & NETIF_F_IPV6_CSUM)) {
9791 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9792 features &= ~NETIF_F_TSO6;
9793 }
9794
9795 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9796 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9797 features &= ~NETIF_F_TSO_MANGLEID;
9798
9799 /* TSO ECN requires that TSO is present as well. */
9800 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9801 features &= ~NETIF_F_TSO_ECN;
9802
9803 /* Software GSO depends on SG. */
9804 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9805 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9806 features &= ~NETIF_F_GSO;
9807 }
9808
9809 /* GSO partial features require GSO partial be set */
9810 if ((features & dev->gso_partial_features) &&
9811 !(features & NETIF_F_GSO_PARTIAL)) {
9812 netdev_dbg(dev,
9813 "Dropping partially supported GSO features since no GSO partial.\n");
9814 features &= ~dev->gso_partial_features;
9815 }
9816
9817 if (!(features & NETIF_F_RXCSUM)) {
9818 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9819 * successfully merged by hardware must also have the
9820 * checksum verified by hardware. If the user does not
9821 * want to enable RXCSUM, logically, we should disable GRO_HW.
9822 */
9823 if (features & NETIF_F_GRO_HW) {
9824 netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9825 features &= ~NETIF_F_GRO_HW;
9826 }
9827 }
9828
9829 /* LRO/HW-GRO features cannot be combined with RX-FCS */
9830 if (features & NETIF_F_RXFCS) {
9831 if (features & NETIF_F_LRO) {
9832 netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9833 features &= ~NETIF_F_LRO;
9834 }
9835
9836 if (features & NETIF_F_GRO_HW) {
9837 netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9838 features &= ~NETIF_F_GRO_HW;
9839 }
9840 }
9841
9842 if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
9843 netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
9844 features &= ~NETIF_F_LRO;
9845 }
9846
9847 if (features & NETIF_F_HW_TLS_TX) {
9848 bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
9849 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
9850 bool hw_csum = features & NETIF_F_HW_CSUM;
9851
9852 if (!ip_csum && !hw_csum) {
9853 netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
9854 features &= ~NETIF_F_HW_TLS_TX;
9855 }
9856 }
9857
9858 if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9859 netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9860 features &= ~NETIF_F_HW_TLS_RX;
9861 }
9862
9863 return features;
9864}
9865
9866int __netdev_update_features(struct net_device *dev)
9867{
9868 struct net_device *upper, *lower;
9869 netdev_features_t features;
9870 struct list_head *iter;
9871 int err = -1;
9872
9873 ASSERT_RTNL();
9874
9875 features = netdev_get_wanted_features(dev);
9876
9877 if (dev->netdev_ops->ndo_fix_features)
9878 features = dev->netdev_ops->ndo_fix_features(dev, features);
9879
9880 /* driver might be less strict about feature dependencies */
9881 features = netdev_fix_features(dev, features);
9882
9883 /* some features can't be enabled if they're off on an upper device */
9884 netdev_for_each_upper_dev_rcu(dev, upper, iter)
9885 features = netdev_sync_upper_features(dev, upper, features);
9886
9887 if (dev->features == features)
9888 goto sync_lower;
9889
9890 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9891 &dev->features, &features);
9892
9893 if (dev->netdev_ops->ndo_set_features)
9894 err = dev->netdev_ops->ndo_set_features(dev, features);
9895 else
9896 err = 0;
9897
9898 if (unlikely(err < 0)) {
9899 netdev_err(dev,
9900 "set_features() failed (%d); wanted %pNF, left %pNF\n",
9901 err, &features, &dev->features);
9902 /* return non-0 since some features might have changed and
9903 * it's better to fire a spurious notification than miss it
9904 */
9905 return -1;
9906 }
9907
9908sync_lower:
9909 /* some features must be disabled on lower devices when disabled
9910 * on an upper device (think: bonding master or bridge)
9911 */
9912 netdev_for_each_lower_dev(dev, lower, iter)
9913 netdev_sync_lower_features(dev, lower, features);
9914
9915 if (!err) {
9916 netdev_features_t diff = features ^ dev->features;
9917
9918 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9919 /* udp_tunnel_{get,drop}_rx_info both need
9920 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9921 * device, or they won't do anything.
9922 * Thus we need to update dev->features
9923 * *before* calling udp_tunnel_get_rx_info,
9924 * but *after* calling udp_tunnel_drop_rx_info.
9925 */
9926 if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9927 dev->features = features;
9928 udp_tunnel_get_rx_info(dev);
9929 } else {
9930 udp_tunnel_drop_rx_info(dev);
9931 }
9932 }
9933
9934 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9935 if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9936 dev->features = features;
9937 err |= vlan_get_rx_ctag_filter_info(dev);
9938 } else {
9939 vlan_drop_rx_ctag_filter_info(dev);
9940 }
9941 }
9942
9943 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9944 if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9945 dev->features = features;
9946 err |= vlan_get_rx_stag_filter_info(dev);
9947 } else {
9948 vlan_drop_rx_stag_filter_info(dev);
9949 }
9950 }
9951
9952 dev->features = features;
9953 }
9954
9955 return err < 0 ? 0 : 1;
9956}
9957
9958/**
9959 * netdev_update_features - recalculate device features
9960 * @dev: the device to check
9961 *
9962 * Recalculate the dev->features set and send notifications if it
9963 * has changed. Should be called after driver- or hardware-dependent
9964 * conditions that influence the features might have changed.
9965 */
9966void netdev_update_features(struct net_device *dev)
9967{
9968 if (__netdev_update_features(dev))
9969 netdev_features_change(dev);
9970}
9971EXPORT_SYMBOL(netdev_update_features);
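/*
 * Illustrative sketch, not part of this file (the "foo_*" names and the
 * loopback_test_running field are hypothetical): a driver vetoes feature
 * combinations it cannot support from its ndo_fix_features hook and asks
 * the core to re-evaluate dev->features with netdev_update_features()
 * once its hardware state changes:
 *
 *	static netdev_features_t foo_fix_features(struct net_device *dev,
 *						  netdev_features_t features)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		if (priv->loopback_test_running)
 *			features &= ~NETIF_F_ALL_TSO;
 *		return features;
 *	}
 *
 *	static void foo_fw_reconfigured(struct net_device *dev)
 *	{
 *		rtnl_lock();
 *		netdev_update_features(dev);
 *		rtnl_unlock();
 *	}
 */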
9972
9973/**
9974 * netdev_change_features - recalculate device features
9975 * @dev: the device to check
9976 *
9977 * Recalculate dev->features set and send notifications even
9978 * if they have not changed. Should be called instead of
9979 * netdev_update_features() if also dev->vlan_features might
9980 * have changed to allow the changes to be propagated to stacked
9981 * VLAN devices.
9982 */
9983void netdev_change_features(struct net_device *dev)
9984{
9985 __netdev_update_features(dev);
9986 netdev_features_change(dev);
9987}
9988EXPORT_SYMBOL(netdev_change_features);
9989
9990/**
9991 * netif_stacked_transfer_operstate - transfer operstate
9992 * @rootdev: the root or lower level device to transfer state from
9993 * @dev: the device to transfer operstate to
9994 *
9995 * Transfer operational state from root to device. This is normally
9996 * called when a stacking relationship exists between the root
9997 * device and the device (a leaf device).
9998 */
9999void netif_stacked_transfer_operstate(const struct net_device *rootdev,
10000 struct net_device *dev)
10001{
10002 if (rootdev->operstate == IF_OPER_DORMANT)
10003 netif_dormant_on(dev);
10004 else
10005 netif_dormant_off(dev);
10006
10007 if (rootdev->operstate == IF_OPER_TESTING)
10008 netif_testing_on(dev);
10009 else
10010 netif_testing_off(dev);
10011
10012 if (netif_carrier_ok(rootdev))
10013 netif_carrier_on(dev);
10014 else
10015 netif_carrier_off(dev);
10016}
10017EXPORT_SYMBOL(netif_stacked_transfer_operstate);
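/*
 * Illustrative sketch, not part of this file (foo_get_upper() and the
 * notifier wiring are hypothetical): a stacking driver typically mirrors
 * the operstate of its lower device from a netdevice notifier:
 *
 *	static int foo_device_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *lower = netdev_notifier_info_to_dev(ptr);
 *		struct net_device *upper = foo_get_upper(lower);
 *
 *		if (upper && (event == NETDEV_UP || event == NETDEV_CHANGE))
 *			netif_stacked_transfer_operstate(lower, upper);
 *		return NOTIFY_DONE;
 *	}
 */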
10018
10019static int netif_alloc_rx_queues(struct net_device *dev)
10020{
10021 unsigned int i, count = dev->num_rx_queues;
10022 struct netdev_rx_queue *rx;
10023 size_t sz = count * sizeof(*rx);
10024 int err = 0;
10025
10026 BUG_ON(count < 1);
10027
10028 rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10029 if (!rx)
10030 return -ENOMEM;
10031
10032 dev->_rx = rx;
10033
10034 for (i = 0; i < count; i++) {
10035 rx[i].dev = dev;
10036
10037 /* XDP RX-queue setup */
10038 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
10039 if (err < 0)
10040 goto err_rxq_info;
10041 }
10042 return 0;
10043
10044err_rxq_info:
10045 /* Roll back successful registrations and free other resources */
10046 while (i--)
10047 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10048 kvfree(dev->_rx);
10049 dev->_rx = NULL;
10050 return err;
10051}
10052
10053static void netif_free_rx_queues(struct net_device *dev)
10054{
10055 unsigned int i, count = dev->num_rx_queues;
10056
10057 /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
10058 if (!dev->_rx)
10059 return;
10060
10061 for (i = 0; i < count; i++)
10062 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10063
10064 kvfree(dev->_rx);
10065}
10066
10067static void netdev_init_one_queue(struct net_device *dev,
10068 struct netdev_queue *queue, void *_unused)
10069{
10070 /* Initialize queue lock */
10071 spin_lock_init(&queue->_xmit_lock);
10072 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10073 queue->xmit_lock_owner = -1;
10074 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10075 queue->dev = dev;
10076#ifdef CONFIG_BQL
10077 dql_init(&queue->dql, HZ);
10078#endif
10079}
10080
10081static void netif_free_tx_queues(struct net_device *dev)
10082{
10083 kvfree(dev->_tx);
10084}
10085
10086static int netif_alloc_netdev_queues(struct net_device *dev)
10087{
10088 unsigned int count = dev->num_tx_queues;
10089 struct netdev_queue *tx;
10090 size_t sz = count * sizeof(*tx);
10091
10092 if (count < 1 || count > 0xffff)
10093 return -EINVAL;
10094
10095 tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10096 if (!tx)
10097 return -ENOMEM;
10098
10099 dev->_tx = tx;
10100
10101 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10102 spin_lock_init(&dev->tx_global_lock);
10103
10104 return 0;
10105}
10106
10107void netif_tx_stop_all_queues(struct net_device *dev)
10108{
10109 unsigned int i;
10110
10111 for (i = 0; i < dev->num_tx_queues; i++) {
10112 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10113
10114 netif_tx_stop_queue(txq);
10115 }
10116}
10117EXPORT_SYMBOL(netif_tx_stop_all_queues);
10118
10119static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10120{
10121 void __percpu *v;
10122
10123 /* Drivers implementing ndo_get_peer_dev must support tstat
10124 * accounting, so that skb_do_redirect() can bump the dev's
10125 * RX stats upon network namespace switch.
10126 */
10127 if (dev->netdev_ops->ndo_get_peer_dev &&
10128 dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10129 return -EOPNOTSUPP;
10130
10131 switch (dev->pcpu_stat_type) {
10132 case NETDEV_PCPU_STAT_NONE:
10133 return 0;
10134 case NETDEV_PCPU_STAT_LSTATS:
10135 v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10136 break;
10137 case NETDEV_PCPU_STAT_TSTATS:
10138 v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10139 break;
10140 case NETDEV_PCPU_STAT_DSTATS:
10141 v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10142 break;
10143 default:
10144 return -EINVAL;
10145 }
10146
10147 return v ? 0 : -ENOMEM;
10148}
10149
10150static void netdev_do_free_pcpu_stats(struct net_device *dev)
10151{
10152 switch (dev->pcpu_stat_type) {
10153 case NETDEV_PCPU_STAT_NONE:
10154 return;
10155 case NETDEV_PCPU_STAT_LSTATS:
10156 free_percpu(dev->lstats);
10157 break;
10158 case NETDEV_PCPU_STAT_TSTATS:
10159 free_percpu(dev->tstats);
10160 break;
10161 case NETDEV_PCPU_STAT_DSTATS:
10162 free_percpu(dev->dstats);
10163 break;
10164 }
10165}
10166
10167/**
10168 * register_netdevice() - register a network device
10169 * @dev: device to register
10170 *
10171 * Take a prepared network device structure and make it externally accessible.
10172 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
10173 * Callers must hold the rtnl lock - you may want register_netdev()
10174 * instead of this.
10175 */
10176int register_netdevice(struct net_device *dev)
10177{
10178 int ret;
10179 struct net *net = dev_net(dev);
10180
10181 BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10182 NETDEV_FEATURE_COUNT);
10183 BUG_ON(dev_boot_phase);
10184 ASSERT_RTNL();
10185
10186 might_sleep();
10187
10188 /* When net_device's are persistent, this will be fatal. */
10189 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10190 BUG_ON(!net);
10191
10192 ret = ethtool_check_ops(dev->ethtool_ops);
10193 if (ret)
10194 return ret;
10195
10196 spin_lock_init(&dev->addr_list_lock);
10197 netdev_set_addr_lockdep_class(dev);
10198
10199 ret = dev_get_valid_name(net, dev, dev->name);
10200 if (ret < 0)
10201 goto out;
10202
10203 ret = -ENOMEM;
10204 dev->name_node = netdev_name_node_head_alloc(dev);
10205 if (!dev->name_node)
10206 goto out;
10207
10208 /* Init, if this function is available */
10209 if (dev->netdev_ops->ndo_init) {
10210 ret = dev->netdev_ops->ndo_init(dev);
10211 if (ret) {
10212 if (ret > 0)
10213 ret = -EIO;
10214 goto err_free_name;
10215 }
10216 }
10217
10218 if (((dev->hw_features | dev->features) &
10219 NETIF_F_HW_VLAN_CTAG_FILTER) &&
10220 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10221 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10222 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10223 ret = -EINVAL;
10224 goto err_uninit;
10225 }
10226
10227 ret = netdev_do_alloc_pcpu_stats(dev);
10228 if (ret)
10229 goto err_uninit;
10230
10231 ret = dev_index_reserve(net, dev->ifindex);
10232 if (ret < 0)
10233 goto err_free_pcpu;
10234 dev->ifindex = ret;
10235
10236 /* Transfer changeable features to wanted_features and enable
10237 * software offloads (GSO and GRO).
10238 */
10239 dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10240 dev->features |= NETIF_F_SOFT_FEATURES;
10241
10242 if (dev->udp_tunnel_nic_info) {
10243 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10244 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10245 }
10246
10247 dev->wanted_features = dev->features & dev->hw_features;
10248
10249 if (!(dev->flags & IFF_LOOPBACK))
10250 dev->hw_features |= NETIF_F_NOCACHE_COPY;
10251
10252 /* If IPv4 TCP segmentation offload is supported we should also
10253 * allow the device to enable segmenting the frame with the option
10254 * of ignoring a static IP ID value. This doesn't enable the
10255 * feature itself but allows the user to enable it later.
10256 */
10257 if (dev->hw_features & NETIF_F_TSO)
10258 dev->hw_features |= NETIF_F_TSO_MANGLEID;
10259 if (dev->vlan_features & NETIF_F_TSO)
10260 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10261 if (dev->mpls_features & NETIF_F_TSO)
10262 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10263 if (dev->hw_enc_features & NETIF_F_TSO)
10264 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10265
10266 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10267 */
10268 dev->vlan_features |= NETIF_F_HIGHDMA;
10269
10270 /* Make NETIF_F_SG inheritable to tunnel devices.
10271 */
10272 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10273
10274 /* Make NETIF_F_SG inheritable to MPLS.
10275 */
10276 dev->mpls_features |= NETIF_F_SG;
10277
10278 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10279 ret = notifier_to_errno(ret);
10280 if (ret)
10281 goto err_ifindex_release;
10282
10283 ret = netdev_register_kobject(dev);
10284
10285 WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
10286
10287 if (ret)
10288 goto err_uninit_notify;
10289
10290 __netdev_update_features(dev);
10291
10292 /*
10293 * Default initial state at registration is that the
10294 * device is present.
10295 */
10296
10297 set_bit(__LINK_STATE_PRESENT, &dev->state);
10298
10299 linkwatch_init_dev(dev);
10300
10301 dev_init_scheduler(dev);
10302
10303 netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10304 list_netdevice(dev);
10305
10306 add_device_randomness(dev->dev_addr, dev->addr_len);
10307
10308 /* If the device has a permanent device address, the driver should
10309 * set dev_addr and leave addr_assign_type set to
10310 * NET_ADDR_PERM (the default value).
10311 */
10312 if (dev->addr_assign_type == NET_ADDR_PERM)
10313 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10314
10315 /* Notify protocols, that a new device appeared. */
10316 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10317 ret = notifier_to_errno(ret);
10318 if (ret) {
10319 /* Expect explicit free_netdev() on failure */
10320 dev->needs_free_netdev = false;
10321 unregister_netdevice_queue(dev, NULL);
10322 goto out;
10323 }
10324 /*
10325 * Prevent userspace races by waiting until the network
10326 * device is fully setup before sending notifications.
10327 */
10328 if (!dev->rtnl_link_ops ||
10329 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10330 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10331
10332out:
10333 return ret;
10334
10335err_uninit_notify:
10336 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10337err_ifindex_release:
10338 dev_index_release(net, dev->ifindex);
10339err_free_pcpu:
10340 netdev_do_free_pcpu_stats(dev);
10341err_uninit:
10342 if (dev->netdev_ops->ndo_uninit)
10343 dev->netdev_ops->ndo_uninit(dev);
10344 if (dev->priv_destructor)
10345 dev->priv_destructor(dev);
10346err_free_name:
10347 netdev_name_node_free(dev->name_node);
10348 goto out;
10349}
10350EXPORT_SYMBOL(register_netdevice);
10351
10352/**
10353 * init_dummy_netdev - init a dummy network device for NAPI
10354 * @dev: device to init
10355 *
10356 * This takes a network device structure and initializes the minimum
10357 * amount of fields so it can be used to schedule NAPI polls without
10358 * registering a full-blown interface. This is to be used by drivers
10359 * that need to tie several hardware interfaces to a single NAPI
10360 * poll scheduler due to HW limitations.
10361 */
10362void init_dummy_netdev(struct net_device *dev)
10363{
10364 /* Clear everything. Note we don't initialize spinlocks
10365 * as they aren't supposed to be taken by any of the
10366 * NAPI code and this dummy netdev is supposed to be
10367 * used only for NAPI polls.
10368 */
10369 memset(dev, 0, sizeof(struct net_device));
10370
10371 /* make sure we BUG if trying to hit standard
10372 * register/unregister code path
10373 */
10374 dev->reg_state = NETREG_DUMMY;
10375
10376 /* NAPI wants this */
10377 INIT_LIST_HEAD(&dev->napi_list);
10378
10379 /* a dummy interface is started by default */
10380 set_bit(__LINK_STATE_PRESENT, &dev->state);
10381 set_bit(__LINK_STATE_START, &dev->state);
10382
10383 /* napi_busy_loop stats accounting wants this */
10384 dev_net_set(dev, &init_net);
10385
10386 /* Note: We don't allocate pcpu_refcnt for dummy devices,
10387 * because users of this 'device' don't need to change
10388 * its refcount.
10389 */
10390}
10391EXPORT_SYMBOL_GPL(init_dummy_netdev);
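/*
 * Illustrative sketch, not part of this file (the foo_* names are
 * hypothetical): a driver whose hardware exposes several DMA channels but
 * no natural net_device of its own can embed one dummy netdev in its
 * private state and hang all of its NAPI contexts off it:
 *
 *	static int foo_init_napi(struct foo_priv *priv)
 *	{
 *		int i;
 *
 *		init_dummy_netdev(&priv->napi_dev);
 *		for (i = 0; i < priv->num_channels; i++)
 *			netif_napi_add(&priv->napi_dev,
 *				       &priv->chan[i].napi, foo_poll);
 *		return 0;
 *	}
 */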
10392
10393
10394/**
10395 * register_netdev - register a network device
10396 * @dev: device to register
10397 *
10398 * Take a completed network device structure and add it to the kernel
10399 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10400 * chain. 0 is returned on success. A negative errno code is returned
10401 * on a failure to set up the device, or if the name is a duplicate.
10402 *
10403 * This is a wrapper around register_netdevice that takes the rtnl semaphore
10404 * and expands the device name if you passed a format string to
10405 * alloc_netdev.
10406 */
10407int register_netdev(struct net_device *dev)
10408{
10409 int err;
10410
10411 if (rtnl_lock_killable())
10412 return -EINTR;
10413 err = register_netdevice(dev);
10414 rtnl_unlock();
10415 return err;
10416}
10417EXPORT_SYMBOL(register_netdev);
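/*
 * Illustrative sketch, not part of this file (foo_probe(), foo_priv and
 * foo_netdev_ops are hypothetical): the usual driver-side pairing of
 * allocation and registration. Any failure after a successful allocation
 * must release the device with free_netdev():
 *
 *	static int foo_probe(struct foo_hw *hw)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct foo_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *
 *		dev->netdev_ops = &foo_netdev_ops;
 *		err = register_netdev(dev);	// takes and drops the rtnl lock
 *		if (err) {
 *			free_netdev(dev);
 *			return err;
 *		}
 *		return 0;
 *	}
 */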
10418
10419int netdev_refcnt_read(const struct net_device *dev)
10420{
10421#ifdef CONFIG_PCPU_DEV_REFCNT
10422 int i, refcnt = 0;
10423
10424 for_each_possible_cpu(i)
10425 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10426 return refcnt;
10427#else
10428 return refcount_read(&dev->dev_refcnt);
10429#endif
10430}
10431EXPORT_SYMBOL(netdev_refcnt_read);
10432
10433int netdev_unregister_timeout_secs __read_mostly = 10;
10434
10435#define WAIT_REFS_MIN_MSECS 1
10436#define WAIT_REFS_MAX_MSECS 250
10437/**
10438 * netdev_wait_allrefs_any - wait until all references are gone.
10439 * @list: list of net_devices to wait on
10440 *
10441 * This is called when unregistering network devices.
10442 *
10443 * Any protocol or device that holds a reference should register
10444 * for netdevice notification, and clean up and put back the
10445 * reference when it receives an UNREGISTER event.
10446 * We can get stuck here if buggy protocols don't correctly
10447 * call dev_put.
10448 */
10449static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10450{
10451 unsigned long rebroadcast_time, warning_time;
10452 struct net_device *dev;
10453 int wait = 0;
10454
10455 rebroadcast_time = warning_time = jiffies;
10456
10457 list_for_each_entry(dev, list, todo_list)
10458 if (netdev_refcnt_read(dev) == 1)
10459 return dev;
10460
10461 while (true) {
10462 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10463 rtnl_lock();
10464
10465 /* Rebroadcast unregister notification */
10466 list_for_each_entry(dev, list, todo_list)
10467 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10468
10469 __rtnl_unlock();
10470 rcu_barrier();
10471 rtnl_lock();
10472
10473 list_for_each_entry(dev, list, todo_list)
10474 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10475 &dev->state)) {
10476 /* We must not have linkwatch events
10477 * pending on unregister. If this
10478 * happens, we simply run the queue
10479 * unscheduled, resulting in a noop
10480 * for this device.
10481 */
10482 linkwatch_run_queue();
10483 break;
10484 }
10485
10486 __rtnl_unlock();
10487
10488 rebroadcast_time = jiffies;
10489 }
10490
10491 rcu_barrier();
10492
10493 if (!wait) {
10494 wait = WAIT_REFS_MIN_MSECS;
10495 } else {
10496 msleep(wait);
10497 wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10498 }
10499
10500 list_for_each_entry(dev, list, todo_list)
10501 if (netdev_refcnt_read(dev) == 1)
10502 return dev;
10503
10504 if (time_after(jiffies, warning_time +
10505 READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10506 list_for_each_entry(dev, list, todo_list) {
10507 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10508 dev->name, netdev_refcnt_read(dev));
10509 ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10510 }
10511
10512 warning_time = jiffies;
10513 }
10514 }
10515}
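/*
 * Illustrative sketch, not part of this file (foo_lookup()/foo_detach()
 * and the dev_tracker field are hypothetical): the kind of cleanup the
 * wait loop above relies on - a subsystem that took a reference with
 * netdev_hold() drops it from its netdevice notifier when the UNREGISTER
 * event is rebroadcast:
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *		struct foo_state *st = foo_lookup(dev);
 *
 *		if (st && event == NETDEV_UNREGISTER) {
 *			foo_detach(st);
 *			netdev_put(dev, &st->dev_tracker);
 *		}
 *		return NOTIFY_DONE;
 *	}
 */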
10516
10517/* The sequence is:
10518 *
10519 * rtnl_lock();
10520 * ...
10521 * register_netdevice(x1);
10522 * register_netdevice(x2);
10523 * ...
10524 * unregister_netdevice(y1);
10525 * unregister_netdevice(y2);
10526 * ...
10527 * rtnl_unlock();
10528 * free_netdev(y1);
10529 * free_netdev(y2);
10530 *
10531 * We are invoked by rtnl_unlock().
10532 * This allows us to deal with problems:
10533 * 1) We can delete sysfs objects which invoke hotplug
10534 * without deadlocking with linkwatch via keventd.
10535 * 2) Since we run with the RTNL semaphore not held, we can sleep
10536 * safely in order to wait for the netdev refcnt to drop to zero.
10537 *
10538 * We must not return until all unregister events added during
10539 * the interval the lock was held have been completed.
10540 */
10541void netdev_run_todo(void)
10542{
10543 struct net_device *dev, *tmp;
10544 struct list_head list;
10545 int cnt;
10546#ifdef CONFIG_LOCKDEP
10547 struct list_head unlink_list;
10548
10549 list_replace_init(&net_unlink_list, &unlink_list);
10550
10551 while (!list_empty(&unlink_list)) {
10552 struct net_device *dev = list_first_entry(&unlink_list,
10553 struct net_device,
10554 unlink_list);
10555 list_del_init(&dev->unlink_list);
10556 dev->nested_level = dev->lower_level - 1;
10557 }
10558#endif
10559
10560 /* Snapshot list, allow later requests */
10561 list_replace_init(&net_todo_list, &list);
10562
10563 __rtnl_unlock();
10564
10565 /* Wait for rcu callbacks to finish before next phase */
10566 if (!list_empty(&list))
10567 rcu_barrier();
10568
10569 list_for_each_entry_safe(dev, tmp, &list, todo_list) {
10570 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10571 netdev_WARN(dev, "run_todo but not unregistering\n");
10572 list_del(&dev->todo_list);
10573 continue;
10574 }
10575
10576 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
10577 linkwatch_sync_dev(dev);
10578 }
10579
10580 cnt = 0;
10581 while (!list_empty(&list)) {
10582 dev = netdev_wait_allrefs_any(&list);
10583 list_del(&dev->todo_list);
10584
10585 /* paranoia */
10586 BUG_ON(netdev_refcnt_read(dev) != 1);
10587 BUG_ON(!list_empty(&dev->ptype_all));
10588 BUG_ON(!list_empty(&dev->ptype_specific));
10589 WARN_ON(rcu_access_pointer(dev->ip_ptr));
10590 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10591
10592 netdev_do_free_pcpu_stats(dev);
10593 if (dev->priv_destructor)
10594 dev->priv_destructor(dev);
10595 if (dev->needs_free_netdev)
10596 free_netdev(dev);
10597
10598 cnt++;
10599
10600 /* Free network device */
10601 kobject_put(&dev->dev.kobj);
10602 }
10603 if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
10604 wake_up(&netdev_unregistering_wq);
10605}
10606
10607/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10608 * all the same fields in the same order as net_device_stats, with only
10609 * the type differing, but rtnl_link_stats64 may have additional fields
10610 * at the end for newer counters.
10611 */
10612void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10613 const struct net_device_stats *netdev_stats)
10614{
10615 size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10616 const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10617 u64 *dst = (u64 *)stats64;
10618
10619 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10620 for (i = 0; i < n; i++)
10621 dst[i] = (unsigned long)atomic_long_read(&src[i]);
10622 /* zero out counters that only exist in rtnl_link_stats64 */
10623 memset((char *)stats64 + n * sizeof(u64), 0,
10624 sizeof(*stats64) - n * sizeof(u64));
10625}
10626EXPORT_SYMBOL(netdev_stats_to_stats64);
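/*
 * Illustrative sketch, not part of this file (foo_get_stats64() and the
 * hardware drop counter are hypothetical): a driver that only maintains
 * the legacy dev->stats counters can still provide 64-bit statistics by
 * converting them in its ndo_get_stats64 callback:
 *
 *	static void foo_get_stats64(struct net_device *dev,
 *				    struct rtnl_link_stats64 *stats)
 *	{
 *		netdev_stats_to_stats64(stats, &dev->stats);
 *		stats->rx_dropped += foo_read_hw_rx_drops(dev);
 *	}
 */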
10627
10628static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
10629 struct net_device *dev)
10630{
10631 struct net_device_core_stats __percpu *p;
10632
10633 p = alloc_percpu_gfp(struct net_device_core_stats,
10634 GFP_ATOMIC | __GFP_NOWARN);
10635
10636 if (p && cmpxchg(&dev->core_stats, NULL, p))
10637 free_percpu(p);
10638
10639 /* This READ_ONCE() pairs with the cmpxchg() above */
10640 return READ_ONCE(dev->core_stats);
10641}
10642
10643noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
10644{
10645 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10646 struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
10647 unsigned long __percpu *field;
10648
10649 if (unlikely(!p)) {
10650 p = netdev_core_stats_alloc(dev);
10651 if (!p)
10652 return;
10653 }
10654
10655 field = (__force unsigned long __percpu *)((__force void *)p + offset);
10656 this_cpu_inc(*field);
10657}
10658EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
10659
10660/**
10661 * dev_get_stats - get network device statistics
10662 * @dev: device to get statistics from
10663 * @storage: place to store stats
10664 *
10665 * Get network statistics from device. Return @storage.
10666 * The device driver may provide its own method by setting
10667 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10668 * otherwise the internal statistics structure is used.
10669 */
10670struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10671 struct rtnl_link_stats64 *storage)
10672{
10673 const struct net_device_ops *ops = dev->netdev_ops;
10674 const struct net_device_core_stats __percpu *p;
10675
10676 if (ops->ndo_get_stats64) {
10677 memset(storage, 0, sizeof(*storage));
10678 ops->ndo_get_stats64(dev, storage);
10679 } else if (ops->ndo_get_stats) {
10680 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10681 } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
10682 dev_get_tstats64(dev, storage);
10683 } else {
10684 netdev_stats_to_stats64(storage, &dev->stats);
10685 }
10686
10687 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10688 p = READ_ONCE(dev->core_stats);
10689 if (p) {
10690 const struct net_device_core_stats *core_stats;
10691 int i;
10692
10693 for_each_possible_cpu(i) {
10694 core_stats = per_cpu_ptr(p, i);
10695 storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10696 storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10697 storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10698 storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10699 }
10700 }
10701 return storage;
10702}
10703EXPORT_SYMBOL(dev_get_stats);
10704
10705/**
10706 * dev_fetch_sw_netstats - get per-cpu network device statistics
10707 * @s: place to store stats
10708 * @netstats: per-cpu network stats to read from
10709 *
10710 * Read per-cpu network statistics and populate the related fields in @s.
10711 */
10712void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10713 const struct pcpu_sw_netstats __percpu *netstats)
10714{
10715 int cpu;
10716
10717 for_each_possible_cpu(cpu) {
10718 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10719 const struct pcpu_sw_netstats *stats;
10720 unsigned int start;
10721
10722 stats = per_cpu_ptr(netstats, cpu);
10723 do {
10724 start = u64_stats_fetch_begin(&stats->syncp);
10725 rx_packets = u64_stats_read(&stats->rx_packets);
10726 rx_bytes = u64_stats_read(&stats->rx_bytes);
10727 tx_packets = u64_stats_read(&stats->tx_packets);
10728 tx_bytes = u64_stats_read(&stats->tx_bytes);
10729 } while (u64_stats_fetch_retry(&stats->syncp, start));
10730
10731 s->rx_packets += rx_packets;
10732 s->rx_bytes += rx_bytes;
10733 s->tx_packets += tx_packets;
10734 s->tx_bytes += tx_bytes;
10735 }
10736}
10737EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10738
10739/**
10740 * dev_get_tstats64 - ndo_get_stats64 implementation
10741 * @dev: device to get statistics from
10742 * @s: place to store stats
10743 *
10744 * Populate @s from dev->stats and dev->tstats. Can be used as
10745 * ndo_get_stats64() callback.
10746 */
10747void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10748{
10749 netdev_stats_to_stats64(s, &dev->stats);
10750 dev_fetch_sw_netstats(s, dev->tstats);
10751}
10752EXPORT_SYMBOL_GPL(dev_get_tstats64);
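/*
 * Illustrative sketch, not part of this file (foo_xmit() and foo_setup()
 * are hypothetical): drivers that opt in to per-CPU tstats accounting can
 * use dev_get_tstats64 directly as their stats callback; the core then
 * allocates and frees dev->tstats around register/unregister:
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_start_xmit	 = foo_xmit,
 *		.ndo_get_stats64 = dev_get_tstats64,
 *	};
 *
 *	static void foo_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &foo_netdev_ops;
 *		dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
 *	}
 */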
10753
10754struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10755{
10756 struct netdev_queue *queue = dev_ingress_queue(dev);
10757
10758#ifdef CONFIG_NET_CLS_ACT
10759 if (queue)
10760 return queue;
10761 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10762 if (!queue)
10763 return NULL;
10764 netdev_init_one_queue(dev, queue, NULL);
10765 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10766 RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
10767 rcu_assign_pointer(dev->ingress_queue, queue);
10768#endif
10769 return queue;
10770}
10771
10772static const struct ethtool_ops default_ethtool_ops;
10773
10774void netdev_set_default_ethtool_ops(struct net_device *dev,
10775 const struct ethtool_ops *ops)
10776{
10777 if (dev->ethtool_ops == &default_ethtool_ops)
10778 dev->ethtool_ops = ops;
10779}
10780EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10781
10782/**
10783 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10784 * @dev: netdev to enable the IRQ coalescing on
10785 *
10786 * Sets a conservative default for SW IRQ coalescing. Users can use
10787 * sysfs attributes to override the default values.
10788 */
10789void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10790{
10791 WARN_ON(dev->reg_state == NETREG_REGISTERED);
10792
10793 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
10794 dev->gro_flush_timeout = 20000;
10795 dev->napi_defer_hard_irqs = 1;
10796 }
10797}
10798EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
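/*
 * Illustrative sketch, not part of this file (foo_priv is hypothetical):
 * the helper above is meant to be called between allocation and
 * registration, since it WARNs when invoked on an already registered
 * device:
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	netdev_sw_irq_coalesce_default_on(dev);
 *	err = register_netdev(dev);
 */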
10799
10800void netdev_freemem(struct net_device *dev)
10801{
10802 char *addr = (char *)dev - dev->padded;
10803
10804 kvfree(addr);
10805}
10806
10807/**
10808 * alloc_netdev_mqs - allocate network device
10809 * @sizeof_priv: size of private data to allocate space for
10810 * @name: device name format string
10811 * @name_assign_type: origin of device name
10812 * @setup: callback to initialize device
10813 * @txqs: the number of TX subqueues to allocate
10814 * @rxqs: the number of RX subqueues to allocate
10815 *
10816 * Allocates a struct net_device with private data area for driver use
10817 * and performs basic initialization. Also allocates subqueue structs
10818 * for each queue on the device.
10819 */
10820struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10821 unsigned char name_assign_type,
10822 void (*setup)(struct net_device *),
10823 unsigned int txqs, unsigned int rxqs)
10824{
10825 struct net_device *dev;
10826 unsigned int alloc_size;
10827 struct net_device *p;
10828
10829 BUG_ON(strlen(name) >= sizeof(dev->name));
10830
10831 if (txqs < 1) {
10832 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10833 return NULL;
10834 }
10835
10836 if (rxqs < 1) {
10837 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10838 return NULL;
10839 }
10840
10841 alloc_size = sizeof(struct net_device);
10842 if (sizeof_priv) {
10843 /* ensure 32-byte alignment of private area */
10844 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10845 alloc_size += sizeof_priv;
10846 }
10847 /* ensure 32-byte alignment of whole construct */
10848 alloc_size += NETDEV_ALIGN - 1;
10849
10850 p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10851 if (!p)
10852 return NULL;
10853
10854 dev = PTR_ALIGN(p, NETDEV_ALIGN);
10855 dev->padded = (char *)dev - (char *)p;
10856
10857 ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
10858#ifdef CONFIG_PCPU_DEV_REFCNT
10859 dev->pcpu_refcnt = alloc_percpu(int);
10860 if (!dev->pcpu_refcnt)
10861 goto free_dev;
10862 __dev_hold(dev);
10863#else
10864 refcount_set(&dev->dev_refcnt, 1);
10865#endif
10866
10867 if (dev_addr_init(dev))
10868 goto free_pcpu;
10869
10870 dev_mc_init(dev);
10871 dev_uc_init(dev);
10872
10873 dev_net_set(dev, &init_net);
10874
10875 dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
10876 dev->xdp_zc_max_segs = 1;
10877 dev->gso_max_segs = GSO_MAX_SEGS;
10878 dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
10879 dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
10880 dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
10881 dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
10882 dev->tso_max_segs = TSO_MAX_SEGS;
10883 dev->upper_level = 1;
10884 dev->lower_level = 1;
10885#ifdef CONFIG_LOCKDEP
10886 dev->nested_level = 0;
10887 INIT_LIST_HEAD(&dev->unlink_list);
10888#endif
10889
10890 INIT_LIST_HEAD(&dev->napi_list);
10891 INIT_LIST_HEAD(&dev->unreg_list);
10892 INIT_LIST_HEAD(&dev->close_list);
10893 INIT_LIST_HEAD(&dev->link_watch_list);
10894 INIT_LIST_HEAD(&dev->adj_list.upper);
10895 INIT_LIST_HEAD(&dev->adj_list.lower);
10896 INIT_LIST_HEAD(&dev->ptype_all);
10897 INIT_LIST_HEAD(&dev->ptype_specific);
10898 INIT_LIST_HEAD(&dev->net_notifier_list);
10899#ifdef CONFIG_NET_SCHED
10900 hash_init(dev->qdisc_hash);
10901#endif
10902 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10903 setup(dev);
10904
10905 if (!dev->tx_queue_len) {
10906 dev->priv_flags |= IFF_NO_QUEUE;
10907 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10908 }
10909
10910 dev->num_tx_queues = txqs;
10911 dev->real_num_tx_queues = txqs;
10912 if (netif_alloc_netdev_queues(dev))
10913 goto free_all;
10914
10915 dev->num_rx_queues = rxqs;
10916 dev->real_num_rx_queues = rxqs;
10917 if (netif_alloc_rx_queues(dev))
10918 goto free_all;
10919
10920 strcpy(dev->name, name);
10921 dev->name_assign_type = name_assign_type;
10922 dev->group = INIT_NETDEV_GROUP;
10923 if (!dev->ethtool_ops)
10924 dev->ethtool_ops = &default_ethtool_ops;
10925
10926 nf_hook_netdev_init(dev);
10927
10928 return dev;
10929
10930free_all:
10931 free_netdev(dev);
10932 return NULL;
10933
10934free_pcpu:
10935#ifdef CONFIG_PCPU_DEV_REFCNT
10936 free_percpu(dev->pcpu_refcnt);
10937free_dev:
10938#endif
10939 netdev_freemem(dev);
10940 return NULL;
10941}
10942EXPORT_SYMBOL(alloc_netdev_mqs);
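/*
 * Illustrative sketch, not part of this file (foo_priv and the queue
 * counts are hypothetical): allocating an Ethernet-style device with
 * eight TX and eight RX queues; the setup callback (ether_setup() here)
 * runs before the per-queue arrays are allocated:
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */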
10943
10944/**
10945 * free_netdev - free network device
10946 * @dev: device
10947 *
10948 * This function does the last stage of destroying an allocated device
10949 * interface. The reference to the device object is released. If this
10950 * is the last reference then it will be freed. Must be called in
10951 * process context.
10952 */
10953void free_netdev(struct net_device *dev)
10954{
10955 struct napi_struct *p, *n;
10956
10957 might_sleep();
10958
10959 /* When called immediately after register_netdevice() has failed, the unwind
10960 * handling may still be dismantling the device. Handle that case by
10961 * deferring the free.
10962 */
10963 if (dev->reg_state == NETREG_UNREGISTERING) {
10964 ASSERT_RTNL();
10965 dev->needs_free_netdev = true;
10966 return;
10967 }
10968
10969 netif_free_tx_queues(dev);
10970 netif_free_rx_queues(dev);
10971
10972 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10973
10974 /* Flush device addresses */
10975 dev_addr_flush(dev);
10976
10977 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10978 netif_napi_del(p);
10979
10980 ref_tracker_dir_exit(&dev->refcnt_tracker);
10981#ifdef CONFIG_PCPU_DEV_REFCNT
10982 free_percpu(dev->pcpu_refcnt);
10983 dev->pcpu_refcnt = NULL;
10984#endif
10985 free_percpu(dev->core_stats);
10986 dev->core_stats = NULL;
10987 free_percpu(dev->xdp_bulkq);
10988 dev->xdp_bulkq = NULL;
10989
10990 /* Compatibility with error handling in drivers */
10991 if (dev->reg_state == NETREG_UNINITIALIZED) {
10992 netdev_freemem(dev);
10993 return;
10994 }
10995
10996 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10997 WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
10998
10999 /* will free via device release */
11000 put_device(&dev->dev);
11001}
11002EXPORT_SYMBOL(free_netdev);
11003
11004/**
11005 * synchronize_net - Synchronize with packet receive processing
11006 *
11007 * Wait for packets currently being received to be done.
11008 * Does not block later packets from starting.
11009 */
11010void synchronize_net(void)
11011{
11012 might_sleep();
11013 if (rtnl_is_locked())
11014 synchronize_rcu_expedited();
11015 else
11016 synchronize_rcu();
11017}
11018EXPORT_SYMBOL(synchronize_net);
11019
11020/**
11021 * unregister_netdevice_queue - remove device from the kernel
11022 * @dev: device
11023 * @head: list
11024 *
11025 * This function shuts down a device interface and removes it
11026 * from the kernel tables.
11027 * If head is not NULL, the device is queued to be unregistered later.
11028 *
11029 * Callers must hold the rtnl semaphore. You may want
11030 * unregister_netdev() instead of this.
11031 */
11032
11033void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
11034{
11035 ASSERT_RTNL();
11036
11037 if (head) {
11038 list_move_tail(&dev->unreg_list, head);
11039 } else {
11040 LIST_HEAD(single);
11041
11042 list_add(&dev->unreg_list, &single);
11043 unregister_netdevice_many(&single);
11044 }
11045}
11046EXPORT_SYMBOL(unregister_netdevice_queue);
11047
11048void unregister_netdevice_many_notify(struct list_head *head,
11049 u32 portid, const struct nlmsghdr *nlh)
11050{
11051 struct net_device *dev, *tmp;
11052 LIST_HEAD(close_head);
11053 int cnt = 0;
11054
11055 BUG_ON(dev_boot_phase);
11056 ASSERT_RTNL();
11057
11058 if (list_empty(head))
11059 return;
11060
11061 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11062 /* Some devices call without registering
11063 * for initialization unwind. Remove those
11064 * devices and proceed with the remaining.
11065 */
11066 if (dev->reg_state == NETREG_UNINITIALIZED) {
11067 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11068 dev->name, dev);
11069
11070 WARN_ON(1);
11071 list_del(&dev->unreg_list);
11072 continue;
11073 }
11074 dev->dismantle = true;
11075 BUG_ON(dev->reg_state != NETREG_REGISTERED);
11076 }
11077
11078 /* If device is running, close it first. */
11079 list_for_each_entry(dev, head, unreg_list)
11080 list_add_tail(&dev->close_list, &close_head);
11081 dev_close_many(&close_head, true);
11082
11083 list_for_each_entry(dev, head, unreg_list) {
11084 /* And unlink it from device chain. */
11085 unlist_netdevice(dev);
11086 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
11087 }
11088 flush_all_backlogs();
11089
11090 synchronize_net();
11091
11092 list_for_each_entry(dev, head, unreg_list) {
11093 struct sk_buff *skb = NULL;
11094
11095 /* Shutdown queueing discipline. */
11096 dev_shutdown(dev);
11097 dev_tcx_uninstall(dev);
11098 dev_xdp_uninstall(dev);
11099 bpf_dev_bound_netdev_unregister(dev);
11100
11101 netdev_offload_xstats_disable_all(dev);
11102
11103 /* Notify protocols, that we are about to destroy
11104 * this device. They should clean all the things.
11105 */
11106 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11107
11108 if (!dev->rtnl_link_ops ||
11109 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11110 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11111 GFP_KERNEL, NULL, 0,
11112 portid, nlh);
11113
11114 /*
11115 * Flush the unicast and multicast chains
11116 */
11117 dev_uc_flush(dev);
11118 dev_mc_flush(dev);
11119
11120 netdev_name_node_alt_flush(dev);
11121 netdev_name_node_free(dev->name_node);
11122
11123 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11124
11125 if (dev->netdev_ops->ndo_uninit)
11126 dev->netdev_ops->ndo_uninit(dev);
11127
11128 if (skb)
11129 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11130
11131 /* Notifier chain MUST detach us all upper devices. */
11132 WARN_ON(netdev_has_any_upper_dev(dev));
11133 WARN_ON(netdev_has_any_lower_dev(dev));
11134
11135 /* Remove entries from kobject tree */
11136 netdev_unregister_kobject(dev);
11137#ifdef CONFIG_XPS
11138 /* Remove XPS queueing entries */
11139 netif_reset_xps_queues_gt(dev, 0);
11140#endif
11141 }
11142
11143 synchronize_net();
11144
11145 list_for_each_entry(dev, head, unreg_list) {
11146 netdev_put(dev, &dev->dev_registered_tracker);
11147 net_set_todo(dev);
11148 cnt++;
11149 }
11150 atomic_add(cnt, &dev_unreg_count);
11151
11152 list_del(head);
11153}
11154
11155/**
11156 * unregister_netdevice_many - unregister many devices
11157 * @head: list of devices
11158 *
11159 * Note: As most callers use a stack-allocated list_head,
11160 * we force a list_del() to make sure the stack won't be corrupted later.
11161 */
11162void unregister_netdevice_many(struct list_head *head)
11163{
11164 unregister_netdevice_many_notify(head, 0, NULL);
11165}
11166EXPORT_SYMBOL(unregister_netdevice_many);
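/*
 * Illustrative sketch, not part of this file (the foo_ports[] array is
 * hypothetical): batching several unregistrations under a single rtnl
 * hold amortizes the RCU barriers and synchronize_net() calls done in
 * unregister_netdevice_many_notify():
 *
 *	LIST_HEAD(kill_list);
 *	int i;
 *
 *	rtnl_lock();
 *	for (i = 0; i < num_ports; i++)
 *		unregister_netdevice_queue(foo_ports[i], &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */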
11167
11168/**
11169 * unregister_netdev - remove device from the kernel
11170 * @dev: device
11171 *
11172 * This function shuts down a device interface and removes it
11173 * from the kernel tables.
11174 *
11175 * This is just a wrapper for unregister_netdevice that takes
11176 * the rtnl semaphore. In general you want to use this and not
11177 * unregister_netdevice.
11178 */
11179void unregister_netdev(struct net_device *dev)
11180{
11181 rtnl_lock();
11182 unregister_netdevice(dev);
11183 rtnl_unlock();
11184}
11185EXPORT_SYMBOL(unregister_netdev);
11186
11187/**
11188 * __dev_change_net_namespace - move device to a different network namespace
11189 * @dev: device
11190 * @net: network namespace
11191 * @pat: If not NULL name pattern to try if the current device name
11192 * is already taken in the destination network namespace.
11193 * @new_ifindex: If not zero, specifies device index in the target
11194 * namespace.
11195 *
11196 * This function shuts down a device interface and moves it
11197 * to a new network namespace. On success 0 is returned, on
11198 * a failure a negative errno code is returned.
11199 *
11200 * Callers must hold the rtnl semaphore.
11201 */
11202
11203int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11204 const char *pat, int new_ifindex)
11205{
11206 struct netdev_name_node *name_node;
11207 struct net *net_old = dev_net(dev);
11208 char new_name[IFNAMSIZ] = {};
11209 int err, new_nsid;
11210
11211 ASSERT_RTNL();
11212
11213 /* Don't allow namespace local devices to be moved. */
11214 err = -EINVAL;
11215 if (dev->features & NETIF_F_NETNS_LOCAL)
11216 goto out;
11217
11218 /* Ensure the device has been registered */
11219 if (dev->reg_state != NETREG_REGISTERED)
11220 goto out;
11221
11222 /* Get out if there is nothing to do */
11223 err = 0;
11224 if (net_eq(net_old, net))
11225 goto out;
11226
11227 /* Pick the destination device name, and ensure
11228 * we can use it in the destination network namespace.
11229 */
11230 err = -EEXIST;
11231 if (netdev_name_in_use(net, dev->name)) {
11232 /* We get here if we can't use the current device name */
11233 if (!pat)
11234 goto out;
11235 err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
11236 if (err < 0)
11237 goto out;
11238 }
11239 /* Check that none of the altnames conflicts. */
11240 err = -EEXIST;
11241 netdev_for_each_altname(dev, name_node)
11242 if (netdev_name_in_use(net, name_node->name))
11243 goto out;
11244
11245 /* Check that new_ifindex isn't used yet. */
11246 if (new_ifindex) {
11247 err = dev_index_reserve(net, new_ifindex);
11248 if (err < 0)
11249 goto out;
11250 } else {
11251 /* If there is an ifindex conflict assign a new one */
11252 err = dev_index_reserve(net, dev->ifindex);
11253 if (err == -EBUSY)
11254 err = dev_index_reserve(net, 0);
11255 if (err < 0)
11256 goto out;
11257 new_ifindex = err;
11258 }
11259
11260 /*
11261 * And now a mini version of register_netdevice and unregister_netdevice.
11262 */
11263
11264 /* If device is running close it first. */
11265 dev_close(dev);
11266
11267 /* And unlink it from device chain */
11268 unlist_netdevice(dev);
11269
11270 synchronize_net();
11271
11272 /* Shutdown queueing discipline. */
11273 dev_shutdown(dev);
11274
11275 /* Notify protocols, that we are about to destroy
11276 * this device. They should clean all the things.
11277 *
11278 * Note that dev->reg_state stays at NETREG_REGISTERED.
11279 * This is wanted because this way 8021q and macvlan know
11280 * the device is just moving and can keep their slaves up.
11281 */
11282 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11283 rcu_barrier();
11284
11285 new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11286
11287 rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11288 new_ifindex);
11289
11290 /*
11291 * Flush the unicast and multicast chains
11292 */
11293 dev_uc_flush(dev);
11294 dev_mc_flush(dev);
11295
11296 /* Send a netdev-removed uevent to the old namespace */
11297 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11298 netdev_adjacent_del_links(dev);
11299
11300 /* Move per-net netdevice notifiers that are following the netdevice */
11301 move_netdevice_notifiers_dev_net(dev, net);
11302
11303 /* Actually switch the network namespace */
11304 dev_net_set(dev, net);
11305 dev->ifindex = new_ifindex;
11306
11307 if (new_name[0]) /* Rename the netdev to prepared name */
11308 strscpy(dev->name, new_name, IFNAMSIZ);
11309
11310 /* Fixup kobjects */
11311 dev_set_uevent_suppress(&dev->dev, 1);
11312 err = device_rename(&dev->dev, dev->name);
11313 dev_set_uevent_suppress(&dev->dev, 0);
11314 WARN_ON(err);
11315
11316 /* Send a netdev-add uevent to the new namespace */
11317 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11318 netdev_adjacent_add_links(dev);
11319
11320 /* Adapt owner in case owning user namespace of target network
11321 * namespace is different from the original one.
11322 */
11323 err = netdev_change_owner(dev, net_old, net);
11324 WARN_ON(err);
11325
11326 /* Add the device back in the hashes */
11327 list_netdevice(dev);
11328
11329 /* Notify protocols, that a new device appeared. */
11330 call_netdevice_notifiers(NETDEV_REGISTER, dev);
11331
11332 /*
11333 * Prevent userspace races by waiting until the network
11334 * device is fully setup before sending notifications.
11335 */
11336 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11337
11338 synchronize_net();
11339 err = 0;
11340out:
11341 return err;
11342}
11343EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
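/*
 * Illustrative sketch, not part of this file (target_net is assumed to be
 * a held struct net reference): callers normally go through the
 * dev_change_net_namespace() wrapper while holding the rtnl lock, and
 * pass a name pattern in case the current name is already taken in the
 * destination namespace:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 */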
11344
11345static int dev_cpu_dead(unsigned int oldcpu)
11346{
11347 struct sk_buff **list_skb;
11348 struct sk_buff *skb;
11349 unsigned int cpu;
11350 struct softnet_data *sd, *oldsd, *remsd = NULL;
11351
11352 local_irq_disable();
11353 cpu = smp_processor_id();
11354 sd = &per_cpu(softnet_data, cpu);
11355 oldsd = &per_cpu(softnet_data, oldcpu);
11356
11357 /* Find end of our completion_queue. */
11358 list_skb = &sd->completion_queue;
11359 while (*list_skb)
11360 list_skb = &(*list_skb)->next;
11361 /* Append completion queue from offline CPU. */
11362 *list_skb = oldsd->completion_queue;
11363 oldsd->completion_queue = NULL;
11364
11365 /* Append output queue from offline CPU. */
11366 if (oldsd->output_queue) {
11367 *sd->output_queue_tailp = oldsd->output_queue;
11368 sd->output_queue_tailp = oldsd->output_queue_tailp;
11369 oldsd->output_queue = NULL;
11370 oldsd->output_queue_tailp = &oldsd->output_queue;
11371 }
11372 /* Append NAPI poll list from offline CPU, with one exception:
11373 * process_backlog() must be called by the CPU owning the percpu backlog.
11374 * We properly handle process_queue & input_pkt_queue later.
11375 */
11376 while (!list_empty(&oldsd->poll_list)) {
11377 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11378 struct napi_struct,
11379 poll_list);
11380
11381 list_del_init(&napi->poll_list);
11382 if (napi->poll == process_backlog)
11383 napi->state = 0;
11384 else
11385 ____napi_schedule(sd, napi);
11386 }
11387
11388 raise_softirq_irqoff(NET_TX_SOFTIRQ);
11389 local_irq_enable();
11390
11391#ifdef CONFIG_RPS
11392 remsd = oldsd->rps_ipi_list;
11393 oldsd->rps_ipi_list = NULL;
11394#endif
11395 /* send out pending IPI's on offline CPU */
11396 net_rps_send_ipi(remsd);
11397
11398 /* Process offline CPU's input_pkt_queue */
11399 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11400 netif_rx(skb);
11401 input_queue_head_incr(oldsd);
11402 }
11403 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11404 netif_rx(skb);
11405 input_queue_head_incr(oldsd);
11406 }
11407
11408 return 0;
11409}
11410
11411/**
11412 * netdev_increment_features - increment feature set by one
11413 * @all: current feature set
11414 * @one: new feature set
11415 * @mask: mask feature set
11416 *
11417 * Computes a new feature set after adding a device with feature set
11418 * @one to the master device with current feature set @all. Will not
11419 * enable anything that is off in @mask. Returns the new feature set.
11420 */
11421netdev_features_t netdev_increment_features(netdev_features_t all,
11422 netdev_features_t one, netdev_features_t mask)
11423{
11424 if (mask & NETIF_F_HW_CSUM)
11425 mask |= NETIF_F_CSUM_MASK;
11426 mask |= NETIF_F_VLAN_CHALLENGED;
11427
11428 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11429 all &= one | ~NETIF_F_ALL_FOR_ALL;
11430
11431 /* If one device supports hw checksumming, set for all. */
11432 if (all & NETIF_F_HW_CSUM)
11433 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11434
11435 return all;
11436}
11437EXPORT_SYMBOL(netdev_increment_features);
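/*
 * Illustrative sketch, not part of this file (the slave list and priv
 * layout are hypothetical): an aggregating device (bonding/team style)
 * seeds the accumulator and then folds in each lower device's features:
 *
 *	netdev_features_t mask = features;
 *	struct foo_slave *slave;
 *
 *	features &= ~NETIF_F_ONE_FOR_ALL;
 *	features |= NETIF_F_ALL_FOR_ALL;
 *
 *	list_for_each_entry(slave, &priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 */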
11438
11439static struct hlist_head * __net_init netdev_create_hash(void)
11440{
11441 int i;
11442 struct hlist_head *hash;
11443
11444 hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11445 if (hash != NULL)
11446 for (i = 0; i < NETDEV_HASHENTRIES; i++)
11447 INIT_HLIST_HEAD(&hash[i]);
11448
11449 return hash;
11450}
11451
11452/* Initialize per network namespace state */
11453static int __net_init netdev_init(struct net *net)
11454{
11455 BUILD_BUG_ON(GRO_HASH_BUCKETS >
11456 8 * sizeof_field(struct napi_struct, gro_bitmask));
11457
11458 INIT_LIST_HEAD(&net->dev_base_head);
11459
11460 net->dev_name_head = netdev_create_hash();
11461 if (net->dev_name_head == NULL)
11462 goto err_name;
11463
11464 net->dev_index_head = netdev_create_hash();
11465 if (net->dev_index_head == NULL)
11466 goto err_idx;
11467
11468 xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11469
11470 RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11471
11472 return 0;
11473
11474err_idx:
11475 kfree(net->dev_name_head);
11476err_name:
11477 return -ENOMEM;
11478}
11479
11480/**
11481 * netdev_drivername - network driver for the device
11482 * @dev: network device
11483 *
11484 * Determine network driver for device.
11485 */
11486const char *netdev_drivername(const struct net_device *dev)
11487{
11488 const struct device_driver *driver;
11489 const struct device *parent;
11490 const char *empty = "";
11491
11492 parent = dev->dev.parent;
11493 if (!parent)
11494 return empty;
11495
11496 driver = parent->driver;
11497 if (driver && driver->name)
11498 return driver->name;
11499 return empty;
11500}
11501
11502static void __netdev_printk(const char *level, const struct net_device *dev,
11503 struct va_format *vaf)
11504{
11505 if (dev && dev->dev.parent) {
11506 dev_printk_emit(level[1] - '0',
11507 dev->dev.parent,
11508 "%s %s %s%s: %pV",
11509 dev_driver_string(dev->dev.parent),
11510 dev_name(dev->dev.parent),
11511 netdev_name(dev), netdev_reg_state(dev),
11512 vaf);
11513 } else if (dev) {
11514 printk("%s%s%s: %pV",
11515 level, netdev_name(dev), netdev_reg_state(dev), vaf);
11516 } else {
11517 printk("%s(NULL net_device): %pV", level, vaf);
11518 }
11519}
11520
11521void netdev_printk(const char *level, const struct net_device *dev,
11522 const char *format, ...)
11523{
11524 struct va_format vaf;
11525 va_list args;
11526
11527 va_start(args, format);
11528
11529 vaf.fmt = format;
11530 vaf.va = &args;
11531
11532 __netdev_printk(level, dev, &vaf);
11533
11534 va_end(args);
11535}
11536EXPORT_SYMBOL(netdev_printk);
11537
11538#define define_netdev_printk_level(func, level) \
11539void func(const struct net_device *dev, const char *fmt, ...) \
11540{ \
11541 struct va_format vaf; \
11542 va_list args; \
11543 \
11544 va_start(args, fmt); \
11545 \
11546 vaf.fmt = fmt; \
11547 vaf.va = &args; \
11548 \
11549 __netdev_printk(level, dev, &vaf); \
11550 \
11551 va_end(args); \
11552} \
11553EXPORT_SYMBOL(func);
11554
11555define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11556define_netdev_printk_level(netdev_alert, KERN_ALERT);
11557define_netdev_printk_level(netdev_crit, KERN_CRIT);
11558define_netdev_printk_level(netdev_err, KERN_ERR);
11559define_netdev_printk_level(netdev_warn, KERN_WARNING);
11560define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11561define_netdev_printk_level(netdev_info, KERN_INFO);
11562
11563static void __net_exit netdev_exit(struct net *net)
11564{
11565 kfree(net->dev_name_head);
11566 kfree(net->dev_index_head);
11567 xa_destroy(&net->dev_by_index);
11568 if (net != &init_net)
11569 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11570}
11571
11572static struct pernet_operations __net_initdata netdev_net_ops = {
11573 .init = netdev_init,
11574 .exit = netdev_exit,
11575};
11576
11577static void __net_exit default_device_exit_net(struct net *net)
11578{
11579 struct netdev_name_node *name_node, *tmp;
11580 struct net_device *dev, *aux;
11581 /*
11582 * Push all migratable network devices back to the
11583 * initial network namespace
11584 */
11585 ASSERT_RTNL();
11586 for_each_netdev_safe(net, dev, aux) {
11587 int err;
11588 char fb_name[IFNAMSIZ];
11589
11590 /* Ignore unmoveable devices (i.e. loopback) */
11591 if (dev->features & NETIF_F_NETNS_LOCAL)
11592 continue;
11593
11594 /* Leave virtual devices for the generic cleanup */
11595 if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11596 continue;
11597
11598 /* Push remaining network devices to init_net */
11599 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11600 if (netdev_name_in_use(&init_net, fb_name))
11601 snprintf(fb_name, IFNAMSIZ, "dev%%d");
11602
11603 netdev_for_each_altname_safe(dev, name_node, tmp)
11604 if (netdev_name_in_use(&init_net, name_node->name))
11605 __netdev_name_node_alt_destroy(name_node);
11606
11607 err = dev_change_net_namespace(dev, &init_net, fb_name);
11608 if (err) {
11609 pr_emerg("%s: failed to move %s to init_net: %d\n",
11610 __func__, dev->name, err);
11611 BUG();
11612 }
11613 }
11614}
11615
11616static void __net_exit default_device_exit_batch(struct list_head *net_list)
11617{
11618 /* At exit all network devices must be removed from a network
11619 * namespace. Do this in the reverse order of registration.
11620 * Do this across as many network namespaces as possible to
11621 * improve batching efficiency.
11622 */
11623 struct net_device *dev;
11624 struct net *net;
11625 LIST_HEAD(dev_kill_list);
11626
11627 rtnl_lock();
11628 list_for_each_entry(net, net_list, exit_list) {
11629 default_device_exit_net(net);
11630 cond_resched();
11631 }
11632
11633 list_for_each_entry(net, net_list, exit_list) {
11634 for_each_netdev_reverse(net, dev) {
11635 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11636 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11637 else
11638 unregister_netdevice_queue(dev, &dev_kill_list);
11639 }
11640 }
11641 unregister_netdevice_many(&dev_kill_list);
11642 rtnl_unlock();
11643}
11644
11645static struct pernet_operations __net_initdata default_device_ops = {
11646 .exit_batch = default_device_exit_batch,
11647};
11648
11649static void __init net_dev_struct_check(void)
11650{
11651 /* TX read-mostly hotpath */
11652 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
11653 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
11654 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
11655 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
11656 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
11657 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
11658 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
11659 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
11660 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
11661 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
11662 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
11663 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
11664 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
11665#ifdef CONFIG_XPS
11666 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
11667#endif
11668#ifdef CONFIG_NETFILTER_EGRESS
11669 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
11670#endif
11671#ifdef CONFIG_NET_XGRESS
11672 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
11673#endif
11674 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
11675
11676 /* TXRX read-mostly hotpath */
11677 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
11678 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
11679 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
11680 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
11681 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
11682 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
11683 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
11684
11685 /* RX read-mostly hotpath */
11686 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
11687 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
11688 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
11689 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
11690 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
11691 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
11692 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
11693 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
11694 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
11695 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
11696 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
11697#ifdef CONFIG_NETPOLL
11698 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
11699#endif
11700#ifdef CONFIG_NET_XGRESS
11701 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
11702#endif
11703 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
11704}
11705
11706/*
11707 * Initialize the DEV module. At boot time this walks the device list and
11708 * unhooks any devices that fail to initialise (normally hardware not
11709 * present) and leaves us with a valid list of present and active devices.
11710 *
11711 */
11712
11713/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
11714#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)
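/* Editor's note (illustrative): (1 << 20) bytes is 1 MiB per CPU, so with
 * 4 KiB pages (PAGE_SHIFT == 12) the pool holds 1 MiB / 4 KiB = 256 pages;
 * larger page sizes yield proportionally fewer pages for the same 1 MiB.
 */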
11715
11716static int net_page_pool_create(int cpuid)
11717{
11718#if IS_ENABLED(CONFIG_PAGE_POOL)
11719 struct page_pool_params page_pool_params = {
11720 .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
11721 .flags = PP_FLAG_SYSTEM_POOL,
11722 .nid = NUMA_NO_NODE,
11723 };
11724 struct page_pool *pp_ptr;
11725
11726 pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
11727 if (IS_ERR(pp_ptr))
11728 return -ENOMEM;
11729
11730 per_cpu(system_page_pool, cpuid) = pp_ptr;
11731#endif
11732 return 0;
11733}
11734
11735/*
11736 * This is called single threaded during boot, so no need
11737 * to take the rtnl semaphore.
11738 */
11739static int __init net_dev_init(void)
11740{
11741 int i, rc = -ENOMEM;
11742
11743 BUG_ON(!dev_boot_phase);
11744
11745 net_dev_struct_check();
11746
11747 if (dev_proc_init())
11748 goto out;
11749
11750 if (netdev_kobject_init())
11751 goto out;
11752
11753 for (i = 0; i < PTYPE_HASH_SIZE; i++)
11754 INIT_LIST_HEAD(&ptype_base[i]);
11755
11756 if (register_pernet_subsys(&netdev_net_ops))
11757 goto out;
11758
11759 /*
11760 * Initialise the packet receive queues.
11761 */
11762
11763 for_each_possible_cpu(i) {
11764 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11765 struct softnet_data *sd = &per_cpu(softnet_data, i);
11766
11767 INIT_WORK(flush, flush_backlog);
11768
11769 skb_queue_head_init(&sd->input_pkt_queue);
11770 skb_queue_head_init(&sd->process_queue);
11771#ifdef CONFIG_XFRM_OFFLOAD
11772 skb_queue_head_init(&sd->xfrm_backlog);
11773#endif
11774 INIT_LIST_HEAD(&sd->poll_list);
11775 sd->output_queue_tailp = &sd->output_queue;
11776#ifdef CONFIG_RPS
11777 INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11778 sd->cpu = i;
11779#endif
11780 INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
11781 spin_lock_init(&sd->defer_lock);
11782
11783 init_gro_hash(&sd->backlog);
11784 sd->backlog.poll = process_backlog;
11785 sd->backlog.weight = weight_p;
11786
11787 if (net_page_pool_create(i))
11788 goto out;
11789 }
11790
11791 dev_boot_phase = 0;
11792
	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must be
	 * present too. Since we now dynamically allocate and free the
	 * loopback device, maintain this invariant by keeping the loopback
	 * device first on the list of network devices, so that it is the
	 * first device to appear and the last network device to disappear.
	 */
11802 if (register_pernet_device(&loopback_net_ops))
11803 goto out;
11804
11805 if (register_pernet_device(&default_device_ops))
11806 goto out;
11807
11808 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11809 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11810
11811 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11812 NULL, dev_cpu_dead);
11813 WARN_ON(rc < 0);
11814 rc = 0;
11815out:
11816 if (rc < 0) {
11817 for_each_possible_cpu(i) {
11818 struct page_pool *pp_ptr;
11819
11820 pp_ptr = per_cpu(system_page_pool, i);
11821 if (!pp_ptr)
11822 continue;
11823
11824 page_pool_destroy(pp_ptr);
11825 per_cpu(system_page_pool, i) = NULL;
11826 }
11827 }
11828
11829 return rc;
11830}
11831
11832subsys_initcall(net_dev_init);
75#include <asm/uaccess.h>
76#include <asm/system.h>
77#include <linux/bitops.h>
78#include <linux/capability.h>
79#include <linux/cpu.h>
80#include <linux/types.h>
81#include <linux/kernel.h>
82#include <linux/hash.h>
83#include <linux/slab.h>
84#include <linux/sched.h>
85#include <linux/mutex.h>
86#include <linux/string.h>
87#include <linux/mm.h>
88#include <linux/socket.h>
89#include <linux/sockios.h>
90#include <linux/errno.h>
91#include <linux/interrupt.h>
92#include <linux/if_ether.h>
93#include <linux/netdevice.h>
94#include <linux/etherdevice.h>
95#include <linux/ethtool.h>
96#include <linux/notifier.h>
97#include <linux/skbuff.h>
98#include <net/net_namespace.h>
99#include <net/sock.h>
100#include <linux/rtnetlink.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/stat.h>
104#include <net/dst.h>
105#include <net/pkt_sched.h>
106#include <net/checksum.h>
107#include <net/xfrm.h>
108#include <linux/highmem.h>
109#include <linux/init.h>
110#include <linux/kmod.h>
111#include <linux/module.h>
112#include <linux/netpoll.h>
113#include <linux/rcupdate.h>
114#include <linux/delay.h>
115#include <net/wext.h>
116#include <net/iw_handler.h>
117#include <asm/current.h>
118#include <linux/audit.h>
119#include <linux/dmaengine.h>
120#include <linux/err.h>
121#include <linux/ctype.h>
122#include <linux/if_arp.h>
123#include <linux/if_vlan.h>
124#include <linux/ip.h>
125#include <net/ip.h>
126#include <linux/ipv6.h>
127#include <linux/in.h>
128#include <linux/jhash.h>
129#include <linux/random.h>
130#include <trace/events/napi.h>
131#include <trace/events/net.h>
132#include <trace/events/skb.h>
133#include <linux/pci.h>
134#include <linux/inetdevice.h>
135#include <linux/cpu_rmap.h>
136
137#include "net-sysfs.h"
138
139/* Instead of increasing this, you should create a hash table. */
140#define MAX_GRO_SKBS 8
141
142/* This should be increased if a protocol with a bigger head is added. */
143#define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145/*
146 * The list of packet types we will receive (as opposed to discard)
147 * and the routines to invoke.
148 *
149 * Why 16. Because with 16 the only overlap we get on a hash of the
150 * low nibble of the protocol value is RARP/SNAP/X.25.
151 *
152 * NOTE: That is no longer true with the addition of VLAN tags. Not
153 * sure which should go first, but I bet it won't make much
154 * difference if we are running VLANs. The good news is that
155 * this protocol won't be in the list unless compiled in, so
156 * the average user (w/out VLANs) will not be adversely affected.
157 * --BLG
158 *
159 * 0800 IP
160 * 8100 802.1Q VLAN
161 * 0001 802.3
162 * 0002 AX.25
163 * 0004 802.2
164 * 8035 RARP
165 * 0005 SNAP
166 * 0805 X.25
167 * 0806 ARP
168 * 8137 IPX
169 * 0009 Localtalk
170 * 86DD IPv6
171 */
172
173#define PTYPE_HASH_SIZE (16)
174#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
175
176static DEFINE_SPINLOCK(ptype_lock);
177static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178static struct list_head ptype_all __read_mostly; /* Taps */
179
180/*
181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182 * semaphore.
183 *
184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185 *
186 * Writers must hold the rtnl semaphore while they loop through the
187 * dev_base_head list, and hold dev_base_lock for writing when they do the
188 * actual updates. This allows pure readers to access the list even
189 * while a writer is preparing to update it.
190 *
191 * To put it another way, dev_base_lock is held for writing only to
192 * protect against pure readers; the rtnl semaphore provides the
193 * protection against other writers.
194 *
195 * See, for example usages, register_netdevice() and
196 * unregister_netdevice(), which must be called with the rtnl
197 * semaphore held.
198 */
199DEFINE_RWLOCK(dev_base_lock);
200EXPORT_SYMBOL(dev_base_lock);
201
202static inline void dev_base_seq_inc(struct net *net)
203{
204 while (++net->dev_base_seq == 0);
205}
206
207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
208{
209 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
210 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
211}
212
213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
214{
215 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
216}
217
218static inline void rps_lock(struct softnet_data *sd)
219{
220#ifdef CONFIG_RPS
221 spin_lock(&sd->input_pkt_queue.lock);
222#endif
223}
224
225static inline void rps_unlock(struct softnet_data *sd)
226{
227#ifdef CONFIG_RPS
228 spin_unlock(&sd->input_pkt_queue.lock);
229#endif
230}
231
232/* Device list insertion */
233static int list_netdevice(struct net_device *dev)
234{
235 struct net *net = dev_net(dev);
236
237 ASSERT_RTNL();
238
239 write_lock_bh(&dev_base_lock);
240 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
241 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
242 hlist_add_head_rcu(&dev->index_hlist,
243 dev_index_hash(net, dev->ifindex));
244 write_unlock_bh(&dev_base_lock);
245
246 dev_base_seq_inc(net);
247
248 return 0;
249}
250
251/* Device list removal
252 * caller must respect a RCU grace period before freeing/reusing dev
253 */
254static void unlist_netdevice(struct net_device *dev)
255{
256 ASSERT_RTNL();
257
258 /* Unlink dev from the device chain */
259 write_lock_bh(&dev_base_lock);
260 list_del_rcu(&dev->dev_list);
261 hlist_del_rcu(&dev->name_hlist);
262 hlist_del_rcu(&dev->index_hlist);
263 write_unlock_bh(&dev_base_lock);
264
265 dev_base_seq_inc(dev_net(dev));
266}
267
268/*
269 * Our notifier list
270 */
271
272static RAW_NOTIFIER_HEAD(netdev_chain);
273
274/*
275 * Device drivers call our routines to queue packets here. We empty the
276 * queue in the local softnet handler.
277 */
278
279DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
280EXPORT_PER_CPU_SYMBOL(softnet_data);
281
282#ifdef CONFIG_LOCKDEP
283/*
284 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
285 * according to dev->type
286 */
287static const unsigned short netdev_lock_type[] =
288 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
289 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
290 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
291 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
292 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
293 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
294 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
295 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
296 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
297 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
298 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
299 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
300 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
301 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
302 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
303 ARPHRD_VOID, ARPHRD_NONE};
304
305static const char *const netdev_lock_name[] =
306 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
307 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
308 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
309 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
310 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
311 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
312 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
313 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
314 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
315 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
316 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
317 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
318 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
319 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
320 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
321 "_xmit_VOID", "_xmit_NONE"};
322
323static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
324static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
325
326static inline unsigned short netdev_lock_pos(unsigned short dev_type)
327{
328 int i;
329
330 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
331 if (netdev_lock_type[i] == dev_type)
332 return i;
333 /* the last key is used by default */
334 return ARRAY_SIZE(netdev_lock_type) - 1;
335}
336
337static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
338 unsigned short dev_type)
339{
340 int i;
341
342 i = netdev_lock_pos(dev_type);
343 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
344 netdev_lock_name[i]);
345}
346
347static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
348{
349 int i;
350
351 i = netdev_lock_pos(dev->type);
352 lockdep_set_class_and_name(&dev->addr_list_lock,
353 &netdev_addr_lock_key[i],
354 netdev_lock_name[i]);
355}
356#else
357static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
358 unsigned short dev_type)
359{
360}
361static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
362{
363}
364#endif
365
366/*******************************************************************************
367
368 Protocol management and registration routines
369
370*******************************************************************************/
371
372/*
373 * Add a protocol ID to the list. Now that the input handler is
374 * smarter we can dispense with all the messy stuff that used to be
375 * here.
376 *
377 * BEWARE!!! Protocol handlers, mangling input packets,
378 * MUST BE last in hash buckets and checking protocol handlers
379 * MUST start from promiscuous ptype_all chain in net_bh.
380 * It is true now, do not change it.
 * Explanation follows: if a protocol handler that mangles packets
 * were first on the list, it could not sense that the packet
 * is cloned and should be copied-on-write, so it would
 * change it and subsequent readers would get a broken packet.
385 * --ANK (980803)
386 */
387
388static inline struct list_head *ptype_head(const struct packet_type *pt)
389{
390 if (pt->type == htons(ETH_P_ALL))
391 return &ptype_all;
392 else
393 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
394}
395
396/**
397 * dev_add_pack - add packet handler
398 * @pt: packet type declaration
399 *
400 * Add a protocol handler to the networking stack. The passed &packet_type
401 * is linked into kernel lists and may not be freed until it has been
402 * removed from the kernel lists.
403 *
 * This call does not sleep, therefore it cannot
 * guarantee that all CPUs that are in the middle of receiving packets
 * will see the new packet type (until the next packet is received).
407 */
408
409void dev_add_pack(struct packet_type *pt)
410{
411 struct list_head *head = ptype_head(pt);
412
413 spin_lock(&ptype_lock);
414 list_add_rcu(&pt->list, head);
415 spin_unlock(&ptype_lock);
416}
417EXPORT_SYMBOL(dev_add_pack);
418
419/**
420 * __dev_remove_pack - remove packet handler
421 * @pt: packet type declaration
422 *
423 * Remove a protocol handler that was previously added to the kernel
424 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
425 * from the kernel lists and can be freed or reused once this function
426 * returns.
427 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPUs have gone
 * through a quiescent state.
431 */
432void __dev_remove_pack(struct packet_type *pt)
433{
434 struct list_head *head = ptype_head(pt);
435 struct packet_type *pt1;
436
437 spin_lock(&ptype_lock);
438
439 list_for_each_entry(pt1, head, list) {
440 if (pt == pt1) {
441 list_del_rcu(&pt->list);
442 goto out;
443 }
444 }
445
446 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
447out:
448 spin_unlock(&ptype_lock);
449}
450EXPORT_SYMBOL(__dev_remove_pack);
451
452/**
453 * dev_remove_pack - remove packet handler
454 * @pt: packet type declaration
455 *
456 * Remove a protocol handler that was previously added to the kernel
457 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
458 * from the kernel lists and can be freed or reused once this function
459 * returns.
460 *
461 * This call sleeps to guarantee that no CPU is looking at the packet
462 * type after return.
463 */
464void dev_remove_pack(struct packet_type *pt)
465{
466 __dev_remove_pack(pt);
467
468 synchronize_net();
469}
470EXPORT_SYMBOL(dev_remove_pack);
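/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how a module might tap all incoming IPv4 frames using the
 * dev_add_pack()/dev_remove_pack() API above.  The names example_rcv,
 * example_ip_tap, example_tap_start and example_tap_stop are invented
 * for this example.
 */
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* Inspect the frame here, then release our reference to it. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_ip_tap = {
	.type = cpu_to_be16(ETH_P_IP),	/* IPv4 frames only */
	.dev  = NULL,			/* match every device */
	.func = example_rcv,
};

static void example_tap_start(void)
{
	dev_add_pack(&example_ip_tap);
}

static void example_tap_stop(void)
{
	/* dev_remove_pack() sleeps until no CPU can still run example_rcv() */
	dev_remove_pack(&example_ip_tap);
}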
471
472/******************************************************************************
473
474 Device Boot-time Settings Routines
475
476*******************************************************************************/
477
478/* Boot time configuration table */
479static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
480
481/**
482 * netdev_boot_setup_add - add new setup entry
483 * @name: name of the device
484 * @map: configured settings for the device
485 *
 * Adds a new setup entry to the dev_boot_setup list.  The function
 * returns 0 on error and 1 on success.  This is a generic routine
 * for all netdevices.
489 */
490static int netdev_boot_setup_add(char *name, struct ifmap *map)
491{
492 struct netdev_boot_setup *s;
493 int i;
494
495 s = dev_boot_setup;
496 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
497 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
498 memset(s[i].name, 0, sizeof(s[i].name));
499 strlcpy(s[i].name, name, IFNAMSIZ);
500 memcpy(&s[i].map, map, sizeof(s[i].map));
501 break;
502 }
503 }
504
505 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
506}
507
508/**
509 * netdev_boot_setup_check - check boot time settings
510 * @dev: the netdevice
511 *
 * Check boot time settings for the device.
 * Any settings found are applied to the device so they can be
 * used later during device probing.
 * Returns 0 if no settings were found, 1 if they were.
516 */
517int netdev_boot_setup_check(struct net_device *dev)
518{
519 struct netdev_boot_setup *s = dev_boot_setup;
520 int i;
521
522 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
523 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
524 !strcmp(dev->name, s[i].name)) {
525 dev->irq = s[i].map.irq;
526 dev->base_addr = s[i].map.base_addr;
527 dev->mem_start = s[i].map.mem_start;
528 dev->mem_end = s[i].map.mem_end;
529 return 1;
530 }
531 }
532 return 0;
533}
534EXPORT_SYMBOL(netdev_boot_setup_check);
535
536
537/**
538 * netdev_boot_base - get address from boot time settings
539 * @prefix: prefix for network device
540 * @unit: id for network device
541 *
 * Check boot time settings for the base address of the device.
 * Any settings found are applied to the device so they can be
 * used later during device probing.
 * Returns 0 if no settings were found.
546 */
547unsigned long netdev_boot_base(const char *prefix, int unit)
548{
549 const struct netdev_boot_setup *s = dev_boot_setup;
550 char name[IFNAMSIZ];
551 int i;
552
553 sprintf(name, "%s%d", prefix, unit);
554
555 /*
556 * If device already registered then return base of 1
557 * to indicate not to probe for this interface
558 */
559 if (__dev_get_by_name(&init_net, name))
560 return 1;
561
562 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
563 if (!strcmp(name, s[i].name))
564 return s[i].map.base_addr;
565 return 0;
566}
567
568/*
569 * Saves at boot time configured settings for any netdevice.
570 */
571int __init netdev_boot_setup(char *str)
572{
573 int ints[5];
574 struct ifmap map;
575
576 str = get_options(str, ARRAY_SIZE(ints), ints);
577 if (!str || !*str)
578 return 0;
579
580 /* Save settings */
581 memset(&map, 0, sizeof(map));
582 if (ints[0] > 0)
583 map.irq = ints[1];
584 if (ints[0] > 1)
585 map.base_addr = ints[2];
586 if (ints[0] > 2)
587 map.mem_start = ints[3];
588 if (ints[0] > 3)
589 map.mem_end = ints[4];
590
591 /* Add new entry to the list */
592 return netdev_boot_setup_add(str, &map);
593}
594
595__setup("netdev=", netdev_boot_setup);
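/*
 * Editor's note (illustrative, not part of the original file): with the
 * __setup() hook above, a kernel command line entry such as
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * records irq=9 and base_addr=0x300 for "eth0" in dev_boot_setup[],
 * to be picked up later by netdev_boot_setup_check() during probing.
 */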
596
597/*******************************************************************************
598
599 Device Interface Subroutines
600
601*******************************************************************************/
602
603/**
604 * __dev_get_by_name - find a device by its name
605 * @net: the applicable net namespace
606 * @name: name to find
607 *
608 * Find an interface by name. Must be called under RTNL semaphore
609 * or @dev_base_lock. If the name is found a pointer to the device
610 * is returned. If the name is not found then %NULL is returned. The
611 * reference counters are not incremented so the caller must be
612 * careful with locks.
613 */
614
615struct net_device *__dev_get_by_name(struct net *net, const char *name)
616{
617 struct hlist_node *p;
618 struct net_device *dev;
619 struct hlist_head *head = dev_name_hash(net, name);
620
621 hlist_for_each_entry(dev, p, head, name_hlist)
622 if (!strncmp(dev->name, name, IFNAMSIZ))
623 return dev;
624
625 return NULL;
626}
627EXPORT_SYMBOL(__dev_get_by_name);
628
629/**
630 * dev_get_by_name_rcu - find a device by its name
631 * @net: the applicable net namespace
632 * @name: name to find
633 *
634 * Find an interface by name.
635 * If the name is found a pointer to the device is returned.
636 * If the name is not found then %NULL is returned.
637 * The reference counters are not incremented so the caller must be
638 * careful with locks. The caller must hold RCU lock.
639 */
640
641struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
642{
643 struct hlist_node *p;
644 struct net_device *dev;
645 struct hlist_head *head = dev_name_hash(net, name);
646
647 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
648 if (!strncmp(dev->name, name, IFNAMSIZ))
649 return dev;
650
651 return NULL;
652}
653EXPORT_SYMBOL(dev_get_by_name_rcu);
654
655/**
656 * dev_get_by_name - find a device by its name
657 * @net: the applicable net namespace
658 * @name: name to find
659 *
660 * Find an interface by name. This can be called from any
661 * context and does its own locking. The returned handle has
662 * the usage count incremented and the caller must use dev_put() to
663 * release it when it is no longer needed. %NULL is returned if no
664 * matching device is found.
665 */
666
667struct net_device *dev_get_by_name(struct net *net, const char *name)
668{
669 struct net_device *dev;
670
671 rcu_read_lock();
672 dev = dev_get_by_name_rcu(net, name);
673 if (dev)
674 dev_hold(dev);
675 rcu_read_unlock();
676 return dev;
677}
678EXPORT_SYMBOL(dev_get_by_name);
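/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the two lookup styles above.  example_mtu_by_name() and
 * example_mtu_by_name_rcu() are invented helpers.
 */
static int example_mtu_by_name(struct net *net, const char *name)
{
	struct net_device *dev = dev_get_by_name(net, name);
	int mtu;

	if (!dev)
		return -ENODEV;
	mtu = dev->mtu;
	dev_put(dev);			/* drop the reference taken for us */
	return mtu;
}

static int example_mtu_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	int mtu = -ENODEV;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		mtu = dev->mtu;		/* dev is only valid inside the RCU section */
	rcu_read_unlock();
	return mtu;
}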
679
680/**
681 * __dev_get_by_index - find a device by its ifindex
682 * @net: the applicable net namespace
683 * @ifindex: index of device
684 *
 * Search for an interface by index. Returns a pointer to the
 * device, or %NULL if it is not found. The device has not
687 * had its reference counter increased so the caller must be careful
688 * about locking. The caller must hold either the RTNL semaphore
689 * or @dev_base_lock.
690 */
691
692struct net_device *__dev_get_by_index(struct net *net, int ifindex)
693{
694 struct hlist_node *p;
695 struct net_device *dev;
696 struct hlist_head *head = dev_index_hash(net, ifindex);
697
698 hlist_for_each_entry(dev, p, head, index_hlist)
699 if (dev->ifindex == ifindex)
700 return dev;
701
702 return NULL;
703}
704EXPORT_SYMBOL(__dev_get_by_index);
705
706/**
707 * dev_get_by_index_rcu - find a device by its ifindex
708 * @net: the applicable net namespace
709 * @ifindex: index of device
710 *
 * Search for an interface by index. Returns a pointer to the
 * device, or %NULL if it is not found. The device has not
713 * had its reference counter increased so the caller must be careful
714 * about locking. The caller must hold RCU lock.
715 */
716
717struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
718{
719 struct hlist_node *p;
720 struct net_device *dev;
721 struct hlist_head *head = dev_index_hash(net, ifindex);
722
723 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
724 if (dev->ifindex == ifindex)
725 return dev;
726
727 return NULL;
728}
729EXPORT_SYMBOL(dev_get_by_index_rcu);
730
731
732/**
733 * dev_get_by_index - find a device by its ifindex
734 * @net: the applicable net namespace
735 * @ifindex: index of device
736 *
 * Search for an interface by index. Returns a pointer to the
 * device, or NULL if it is not found. The device returned has
739 * had a reference added and the pointer is safe until the user calls
740 * dev_put to indicate they have finished with it.
741 */
742
743struct net_device *dev_get_by_index(struct net *net, int ifindex)
744{
745 struct net_device *dev;
746
747 rcu_read_lock();
748 dev = dev_get_by_index_rcu(net, ifindex);
749 if (dev)
750 dev_hold(dev);
751 rcu_read_unlock();
752 return dev;
753}
754EXPORT_SYMBOL(dev_get_by_index);
755
756/**
757 * dev_getbyhwaddr_rcu - find a device by its hardware address
758 * @net: the applicable net namespace
759 * @type: media type of device
760 * @ha: hardware address
761 *
 * Search for an interface by MAC address. Returns a pointer to the
 * device, or NULL if it is not found.
 * The caller must hold RCU or RTNL.
 * The returned device has not had its reference count increased
 * and the caller must therefore be careful about locking.
767 *
768 */
769
770struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
771 const char *ha)
772{
773 struct net_device *dev;
774
775 for_each_netdev_rcu(net, dev)
776 if (dev->type == type &&
777 !memcmp(dev->dev_addr, ha, dev->addr_len))
778 return dev;
779
780 return NULL;
781}
782EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
783
784struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
785{
786 struct net_device *dev;
787
788 ASSERT_RTNL();
789 for_each_netdev(net, dev)
790 if (dev->type == type)
791 return dev;
792
793 return NULL;
794}
795EXPORT_SYMBOL(__dev_getfirstbyhwtype);
796
797struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
798{
799 struct net_device *dev, *ret = NULL;
800
801 rcu_read_lock();
802 for_each_netdev_rcu(net, dev)
803 if (dev->type == type) {
804 dev_hold(dev);
805 ret = dev;
806 break;
807 }
808 rcu_read_unlock();
809 return ret;
810}
811EXPORT_SYMBOL(dev_getfirstbyhwtype);
812
813/**
814 * dev_get_by_flags_rcu - find any device with given flags
815 * @net: the applicable net namespace
816 * @if_flags: IFF_* values
817 * @mask: bitmask of bits in if_flags to check
818 *
 * Search for any interface with the given flags. Returns a pointer to
 * the first matching device, or NULL if none is found. Must be called inside
821 * rcu_read_lock(), and result refcount is unchanged.
822 */
823
824struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
825 unsigned short mask)
826{
827 struct net_device *dev, *ret;
828
829 ret = NULL;
830 for_each_netdev_rcu(net, dev) {
831 if (((dev->flags ^ if_flags) & mask) == 0) {
832 ret = dev;
833 break;
834 }
835 }
836 return ret;
837}
838EXPORT_SYMBOL(dev_get_by_flags_rcu);
839
840/**
841 * dev_valid_name - check if name is okay for network device
842 * @name: name string
843 *
 * Network device names need to be valid file names
 * to allow sysfs to work.  We also disallow any kind of
846 * whitespace.
847 */
848int dev_valid_name(const char *name)
849{
850 if (*name == '\0')
851 return 0;
852 if (strlen(name) >= IFNAMSIZ)
853 return 0;
854 if (!strcmp(name, ".") || !strcmp(name, ".."))
855 return 0;
856
857 while (*name) {
858 if (*name == '/' || isspace(*name))
859 return 0;
860 name++;
861 }
862 return 1;
863}
864EXPORT_SYMBOL(dev_valid_name);
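/*
 * Editor's note (illustrative, not part of the original file): for example,
 * dev_valid_name("eth0") and dev_valid_name("br-lan") return 1, while
 * dev_valid_name(""), dev_valid_name("."), dev_valid_name("a/b") and
 * dev_valid_name("has space") return 0, as does any name that is
 * IFNAMSIZ characters or longer.
 */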
865
866/**
867 * __dev_alloc_name - allocate a name for a device
868 * @net: network namespace to allocate the device name in
869 * @name: name format string
870 * @buf: scratch buffer and result name string
871 *
 * Passed a format string - eg "lt%d" - it will try and find a suitable
 * id. It scans the list of devices to build up a free map, then chooses
874 * the first empty slot. The caller must hold the dev_base or rtnl lock
875 * while allocating the name and adding the device in order to avoid
876 * duplicates.
877 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
878 * Returns the number of the unit assigned or a negative errno code.
879 */
880
881static int __dev_alloc_name(struct net *net, const char *name, char *buf)
882{
883 int i = 0;
884 const char *p;
885 const int max_netdevices = 8*PAGE_SIZE;
886 unsigned long *inuse;
887 struct net_device *d;
888
889 p = strnchr(name, IFNAMSIZ-1, '%');
890 if (p) {
891 /*
892 * Verify the string as this thing may have come from
893 * the user. There must be either one "%d" and no other "%"
894 * characters.
895 */
896 if (p[1] != 'd' || strchr(p + 2, '%'))
897 return -EINVAL;
898
899 /* Use one page as a bit array of possible slots */
900 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
901 if (!inuse)
902 return -ENOMEM;
903
904 for_each_netdev(net, d) {
905 if (!sscanf(d->name, name, &i))
906 continue;
907 if (i < 0 || i >= max_netdevices)
908 continue;
909
910 /* avoid cases where sscanf is not exact inverse of printf */
911 snprintf(buf, IFNAMSIZ, name, i);
912 if (!strncmp(buf, d->name, IFNAMSIZ))
913 set_bit(i, inuse);
914 }
915
916 i = find_first_zero_bit(inuse, max_netdevices);
917 free_page((unsigned long) inuse);
918 }
919
920 if (buf != name)
921 snprintf(buf, IFNAMSIZ, name, i);
922 if (!__dev_get_by_name(net, buf))
923 return i;
924
925 /* It is possible to run out of possible slots
926 * when the name is long and there isn't enough space left
927 * for the digits, or if all bits are used.
928 */
929 return -ENFILE;
930}
931
932/**
933 * dev_alloc_name - allocate a name for a device
934 * @dev: device
935 * @name: name format string
936 *
 * Passed a format string - eg "lt%d" - it will try and find a suitable
 * id. It scans the list of devices to build up a free map, then chooses
939 * the first empty slot. The caller must hold the dev_base or rtnl lock
940 * while allocating the name and adding the device in order to avoid
941 * duplicates.
942 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
943 * Returns the number of the unit assigned or a negative errno code.
944 */
945
946int dev_alloc_name(struct net_device *dev, const char *name)
947{
948 char buf[IFNAMSIZ];
949 struct net *net;
950 int ret;
951
952 BUG_ON(!dev_net(dev));
953 net = dev_net(dev);
954 ret = __dev_alloc_name(net, name, buf);
955 if (ret >= 0)
956 strlcpy(dev->name, buf, IFNAMSIZ);
957 return ret;
958}
959EXPORT_SYMBOL(dev_alloc_name);
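/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a driver letting dev_alloc_name() pick the next free unit before
 * registering the device.  example_pick_name() is an invented helper and
 * is assumed to run with the rtnl lock held and dev_net(dev) already set.
 */
static int example_pick_name(struct net_device *dev)
{
	int unit;

	unit = dev_alloc_name(dev, "veth%d");	/* fills dev->name, e.g. "veth0" */
	if (unit < 0)
		return unit;			/* -EINVAL, -ENOMEM or -ENFILE */
	return 0;
}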
960
961static int dev_get_valid_name(struct net_device *dev, const char *name)
962{
963 struct net *net;
964
965 BUG_ON(!dev_net(dev));
966 net = dev_net(dev);
967
968 if (!dev_valid_name(name))
969 return -EINVAL;
970
971 if (strchr(name, '%'))
972 return dev_alloc_name(dev, name);
973 else if (__dev_get_by_name(net, name))
974 return -EEXIST;
975 else if (dev->name != name)
976 strlcpy(dev->name, name, IFNAMSIZ);
977
978 return 0;
979}
980
981/**
982 * dev_change_name - change name of a device
983 * @dev: device
984 * @newname: name (or format string) must be at least IFNAMSIZ
985 *
 * Change the name of a device. A format string such as "eth%d"
 * may be passed for wildcarding.
988 */
989int dev_change_name(struct net_device *dev, const char *newname)
990{
991 char oldname[IFNAMSIZ];
992 int err = 0;
993 int ret;
994 struct net *net;
995
996 ASSERT_RTNL();
997 BUG_ON(!dev_net(dev));
998
999 net = dev_net(dev);
1000 if (dev->flags & IFF_UP)
1001 return -EBUSY;
1002
1003 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1004 return 0;
1005
1006 memcpy(oldname, dev->name, IFNAMSIZ);
1007
1008 err = dev_get_valid_name(dev, newname);
1009 if (err < 0)
1010 return err;
1011
1012rollback:
1013 ret = device_rename(&dev->dev, dev->name);
1014 if (ret) {
1015 memcpy(dev->name, oldname, IFNAMSIZ);
1016 return ret;
1017 }
1018
1019 write_lock_bh(&dev_base_lock);
1020 hlist_del_rcu(&dev->name_hlist);
1021 write_unlock_bh(&dev_base_lock);
1022
1023 synchronize_rcu();
1024
1025 write_lock_bh(&dev_base_lock);
1026 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1027 write_unlock_bh(&dev_base_lock);
1028
1029 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1030 ret = notifier_to_errno(ret);
1031
1032 if (ret) {
1033 /* err >= 0 after dev_alloc_name() or stores the first errno */
1034 if (err >= 0) {
1035 err = ret;
1036 memcpy(dev->name, oldname, IFNAMSIZ);
1037 goto rollback;
1038 } else {
1039 printk(KERN_ERR
1040 "%s: name change rollback failed: %d.\n",
1041 dev->name, ret);
1042 }
1043 }
1044
1045 return err;
1046}
1047
1048/**
1049 * dev_set_alias - change ifalias of a device
1050 * @dev: device
1051 * @alias: name up to IFALIASZ
1052 * @len: limit of bytes to copy from info
1053 *
 * Set ifalias for a device.
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058 ASSERT_RTNL();
1059
1060 if (len >= IFALIASZ)
1061 return -EINVAL;
1062
1063 if (!len) {
1064 if (dev->ifalias) {
1065 kfree(dev->ifalias);
1066 dev->ifalias = NULL;
1067 }
1068 return 0;
1069 }
1070
1071 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072 if (!dev->ifalias)
1073 return -ENOMEM;
1074
1075 strlcpy(dev->ifalias, alias, len+1);
1076 return len;
1077}
1078
1079
1080/**
1081 * netdev_features_change - device changes features
1082 * @dev: device to cause notification
1083 *
1084 * Called to indicate a device has changed features.
1085 */
1086void netdev_features_change(struct net_device *dev)
1087{
1088 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1089}
1090EXPORT_SYMBOL(netdev_features_change);
1091
1092/**
1093 * netdev_state_change - device changes state
1094 * @dev: device to cause notification
1095 *
1096 * Called to indicate a device has changed state. This function calls
1097 * the notifier chains for netdev_chain and sends a NEWLINK message
1098 * to the routing socket.
1099 */
1100void netdev_state_change(struct net_device *dev)
1101{
1102 if (dev->flags & IFF_UP) {
1103 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105 }
1106}
1107EXPORT_SYMBOL(netdev_state_change);
1108
1109int netdev_bonding_change(struct net_device *dev, unsigned long event)
1110{
1111 return call_netdevice_notifiers(event, dev);
1112}
1113EXPORT_SYMBOL(netdev_bonding_change);
1114
1115/**
1116 * dev_load - load a network module
1117 * @net: the applicable net namespace
1118 * @name: name of interface
1119 *
1120 * If a network interface is not present and the process has suitable
1121 * privileges this function loads the module. If module loading is not
1122 * available in this kernel then it becomes a nop.
1123 */
1124
1125void dev_load(struct net *net, const char *name)
1126{
1127 struct net_device *dev;
1128 int no_module;
1129
1130 rcu_read_lock();
1131 dev = dev_get_by_name_rcu(net, name);
1132 rcu_read_unlock();
1133
1134 no_module = !dev;
1135 if (no_module && capable(CAP_NET_ADMIN))
1136 no_module = request_module("netdev-%s", name);
1137 if (no_module && capable(CAP_SYS_MODULE)) {
1138 if (!request_module("%s", name))
1139 pr_err("Loading kernel module for a network device "
1140"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
1141"instead\n", name);
1142 }
1143}
1144EXPORT_SYMBOL(dev_load);
1145
1146static int __dev_open(struct net_device *dev)
1147{
1148 const struct net_device_ops *ops = dev->netdev_ops;
1149 int ret;
1150
1151 ASSERT_RTNL();
1152
1153 if (!netif_device_present(dev))
1154 return -ENODEV;
1155
1156 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157 ret = notifier_to_errno(ret);
1158 if (ret)
1159 return ret;
1160
1161 set_bit(__LINK_STATE_START, &dev->state);
1162
1163 if (ops->ndo_validate_addr)
1164 ret = ops->ndo_validate_addr(dev);
1165
1166 if (!ret && ops->ndo_open)
1167 ret = ops->ndo_open(dev);
1168
1169 if (ret)
1170 clear_bit(__LINK_STATE_START, &dev->state);
1171 else {
1172 dev->flags |= IFF_UP;
1173 net_dmaengine_get();
1174 dev_set_rx_mode(dev);
1175 dev_activate(dev);
1176 }
1177
1178 return ret;
1179}
1180
1181/**
1182 * dev_open - prepare an interface for use.
1183 * @dev: device to open
1184 *
1185 * Takes a device from down to up state. The device's private open
1186 * function is invoked and then the multicast lists are loaded. Finally
1187 * the device is moved into the up state and a %NETDEV_UP message is
1188 * sent to the netdev notifier chain.
1189 *
1190 * Calling this function on an active interface is a nop. On a failure
1191 * a negative errno code is returned.
1192 */
1193int dev_open(struct net_device *dev)
1194{
1195 int ret;
1196
1197 if (dev->flags & IFF_UP)
1198 return 0;
1199
1200 ret = __dev_open(dev);
1201 if (ret < 0)
1202 return ret;
1203
1204 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205 call_netdevice_notifiers(NETDEV_UP, dev);
1206
1207 return ret;
1208}
1209EXPORT_SYMBOL(dev_open);
1210
1211static int __dev_close_many(struct list_head *head)
1212{
1213 struct net_device *dev;
1214
1215 ASSERT_RTNL();
1216 might_sleep();
1217
1218 list_for_each_entry(dev, head, unreg_list) {
1219 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1220
1221 clear_bit(__LINK_STATE_START, &dev->state);
1222
		/* Synchronize to the scheduled poll. We cannot touch the poll
		 * list; it may even be on a different cpu, so just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
1229 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230 }
1231
1232 dev_deactivate_many(head);
1233
1234 list_for_each_entry(dev, head, unreg_list) {
1235 const struct net_device_ops *ops = dev->netdev_ops;
1236
		/*
		 * Call the device-specific close. This cannot fail and is
		 * only done if the device is UP.
		 *
		 * We allow it to be called even after a DETACH hot-plug
		 * event.
		 */
1244 if (ops->ndo_stop)
1245 ops->ndo_stop(dev);
1246
1247 dev->flags &= ~IFF_UP;
1248 net_dmaengine_put();
1249 }
1250
1251 return 0;
1252}
1253
1254static int __dev_close(struct net_device *dev)
1255{
1256 int retval;
1257 LIST_HEAD(single);
1258
1259 list_add(&dev->unreg_list, &single);
1260 retval = __dev_close_many(&single);
1261 list_del(&single);
1262 return retval;
1263}
1264
1265static int dev_close_many(struct list_head *head)
1266{
1267 struct net_device *dev, *tmp;
1268 LIST_HEAD(tmp_list);
1269
1270 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271 if (!(dev->flags & IFF_UP))
1272 list_move(&dev->unreg_list, &tmp_list);
1273
1274 __dev_close_many(head);
1275
1276 list_for_each_entry(dev, head, unreg_list) {
1277 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278 call_netdevice_notifiers(NETDEV_DOWN, dev);
1279 }
1280
1281 /* rollback_registered_many needs the complete original list */
1282 list_splice(&tmp_list, head);
1283 return 0;
1284}
1285
1286/**
1287 * dev_close - shutdown an interface.
1288 * @dev: device to shutdown
1289 *
1290 * This function moves an active device into down state. A
1291 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293 * chain.
1294 */
1295int dev_close(struct net_device *dev)
1296{
1297 if (dev->flags & IFF_UP) {
1298 LIST_HEAD(single);
1299
1300 list_add(&dev->unreg_list, &single);
1301 dev_close_many(&single);
1302 list_del(&single);
1303 }
1304 return 0;
1305}
1306EXPORT_SYMBOL(dev_close);
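/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * cycling an interface with the helpers above.  Both dev_open() and
 * dev_close() expect the caller to hold the RTNL lock; dev_close() is a
 * nop on an interface that is already down, and dev_open() on one that
 * is already up.  example_bounce() is an invented helper.
 */
static int example_bounce(struct net_device *dev)
{
	int err;

	rtnl_lock();
	dev_close(dev);
	err = dev_open(dev);
	rtnl_unlock();
	return err;
}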
1307
1308
1309/**
1310 * dev_disable_lro - disable Large Receive Offload on a device
1311 * @dev: device
1312 *
1313 * Disable Large Receive Offload (LRO) on a net device. Must be
1314 * called under RTNL. This is needed if received packets may be
1315 * forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319 u32 flags;
1320
1321 /*
1322 * If we're trying to disable lro on a vlan device
1323 * use the underlying physical device instead
1324 */
1325 if (is_vlan_dev(dev))
1326 dev = vlan_dev_real_dev(dev);
1327
1328 if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1329 flags = dev->ethtool_ops->get_flags(dev);
1330 else
1331 flags = ethtool_op_get_flags(dev);
1332
1333 if (!(flags & ETH_FLAG_LRO))
1334 return;
1335
1336 __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1337 if (unlikely(dev->features & NETIF_F_LRO))
1338 netdev_WARN(dev, "failed to disable LRO!\n");
1339}
1340EXPORT_SYMBOL(dev_disable_lro);
1341
1342
1343static int dev_boot_phase = 1;
1344
1345/**
1346 * register_netdevice_notifier - register a network notifier block
1347 * @nb: notifier
1348 *
1349 * Register a notifier to be called when network device events occur.
1350 * The notifier passed is linked into the kernel structures and must
1351 * not be reused until it has been unregistered. A negative errno code
1352 * is returned on a failure.
1353 *
 * When registered, all registration and up events are replayed
 * to the new notifier so that it has a race-free
 * view of the network device list.
1357 */
1358
1359int register_netdevice_notifier(struct notifier_block *nb)
1360{
1361 struct net_device *dev;
1362 struct net_device *last;
1363 struct net *net;
1364 int err;
1365
1366 rtnl_lock();
1367 err = raw_notifier_chain_register(&netdev_chain, nb);
1368 if (err)
1369 goto unlock;
1370 if (dev_boot_phase)
1371 goto unlock;
1372 for_each_net(net) {
1373 for_each_netdev(net, dev) {
1374 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1375 err = notifier_to_errno(err);
1376 if (err)
1377 goto rollback;
1378
1379 if (!(dev->flags & IFF_UP))
1380 continue;
1381
1382 nb->notifier_call(nb, NETDEV_UP, dev);
1383 }
1384 }
1385
1386unlock:
1387 rtnl_unlock();
1388 return err;
1389
1390rollback:
1391 last = dev;
1392 for_each_net(net) {
1393 for_each_netdev(net, dev) {
1394 if (dev == last)
1395 break;
1396
1397 if (dev->flags & IFF_UP) {
1398 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1399 nb->notifier_call(nb, NETDEV_DOWN, dev);
1400 }
1401 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1402 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1403 }
1404 }
1405
1406 raw_notifier_chain_unregister(&netdev_chain, nb);
1407 goto unlock;
1408}
1409EXPORT_SYMBOL(register_netdevice_notifier);
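/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a minimal notifier block.  In this version of the file the notifier's
 * third argument is the struct net_device itself (see
 * call_netdevice_notifiers() below).  example_netdev_event() and
 * example_nb are invented names; a module would pass &example_nb to
 * register_netdevice_notifier() at init time and to
 * unregister_netdevice_notifier() on exit.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		printk(KERN_INFO "%s is up\n", dev->name);
		break;
	case NETDEV_DOWN:
		printk(KERN_INFO "%s is down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};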
1410
1411/**
1412 * unregister_netdevice_notifier - unregister a network notifier block
1413 * @nb: notifier
1414 *
1415 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
1418 * is returned on a failure.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423 int err;
1424
1425 rtnl_lock();
1426 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1427 rtnl_unlock();
1428 return err;
1429}
1430EXPORT_SYMBOL(unregister_netdevice_notifier);
1431
1432/**
1433 * call_netdevice_notifiers - call all network notifier blocks
1434 * @val: value passed unmodified to notifier function
1435 * @dev: net_device pointer passed unmodified to notifier function
1436 *
1437 * Call all network notifier blocks. Parameters and return value
1438 * are as for raw_notifier_call_chain().
1439 */
1440
1441int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1442{
1443 ASSERT_RTNL();
1444 return raw_notifier_call_chain(&netdev_chain, val, dev);
1445}
1446EXPORT_SYMBOL(call_netdevice_notifiers);
1447
1448/* When > 0 there are consumers of rx skb time stamps */
1449static atomic_t netstamp_needed = ATOMIC_INIT(0);
1450
1451void net_enable_timestamp(void)
1452{
1453 atomic_inc(&netstamp_needed);
1454}
1455EXPORT_SYMBOL(net_enable_timestamp);
1456
1457void net_disable_timestamp(void)
1458{
1459 atomic_dec(&netstamp_needed);
1460}
1461EXPORT_SYMBOL(net_disable_timestamp);
1462
1463static inline void net_timestamp_set(struct sk_buff *skb)
1464{
1465 if (atomic_read(&netstamp_needed))
1466 __net_timestamp(skb);
1467 else
1468 skb->tstamp.tv64 = 0;
1469}
1470
1471static inline void net_timestamp_check(struct sk_buff *skb)
1472{
1473 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1474 __net_timestamp(skb);
1475}
1476
1477static inline bool is_skb_forwardable(struct net_device *dev,
1478 struct sk_buff *skb)
1479{
1480 unsigned int len;
1481
1482 if (!(dev->flags & IFF_UP))
1483 return false;
1484
1485 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1486 if (skb->len <= len)
1487 return true;
1488
1489 /* if TSO is enabled, we don't care about the length as the packet
1490 * could be forwarded without being segmented before
1491 */
1492 if (skb_is_gso(skb))
1493 return true;
1494
1495 return false;
1496}
1497
1498/**
1499 * dev_forward_skb - loopback an skb to another netif
1500 *
1501 * @dev: destination network device
1502 * @skb: buffer to forward
1503 *
1504 * return values:
1505 * NET_RX_SUCCESS (no congestion)
1506 * NET_RX_DROP (packet was dropped, but freed)
1507 *
1508 * dev_forward_skb can be used for injecting an skb from the
1509 * start_xmit function of one device into the receive queue
1510 * of another device.
1511 *
1512 * The receiving device may be in another namespace, so
1513 * we have to clear all information in the skb that could
1514 * impact namespace isolation.
1515 */
1516int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1517{
1518 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1519 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1520 atomic_long_inc(&dev->rx_dropped);
1521 kfree_skb(skb);
1522 return NET_RX_DROP;
1523 }
1524 }
1525
1526 skb_orphan(skb);
1527 nf_reset(skb);
1528
1529 if (unlikely(!is_skb_forwardable(dev, skb))) {
1530 atomic_long_inc(&dev->rx_dropped);
1531 kfree_skb(skb);
1532 return NET_RX_DROP;
1533 }
1534 skb_set_dev(skb, dev);
1535 skb->tstamp.tv64 = 0;
1536 skb->pkt_type = PACKET_HOST;
1537 skb->protocol = eth_type_trans(skb, dev);
1538 return netif_rx(skb);
1539}
1540EXPORT_SYMBOL_GPL(dev_forward_skb);
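/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a veth-style pair device handing frames to its peer from its
 * ndo_start_xmit() handler.  example_xmit() is invented, and the peer
 * pointer is assumed to have been stashed in dev->ml_priv by the driver.
 */
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *peer = dev->ml_priv;

	/* dev_forward_skb() consumes the skb whether or not it is accepted */
	if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}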
1541
1542static inline int deliver_skb(struct sk_buff *skb,
1543 struct packet_type *pt_prev,
1544 struct net_device *orig_dev)
1545{
1546 atomic_inc(&skb->users);
1547 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1548}
1549
1550/*
1551 * Support routine. Sends outgoing frames to any network
1552 * taps currently in use.
1553 */
1554
1555static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1556{
1557 struct packet_type *ptype;
1558 struct sk_buff *skb2 = NULL;
1559 struct packet_type *pt_prev = NULL;
1560
1561 rcu_read_lock();
1562 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1563 /* Never send packets back to the socket
1564 * they originated from - MvS (miquels@drinkel.ow.org)
1565 */
1566 if ((ptype->dev == dev || !ptype->dev) &&
1567 (ptype->af_packet_priv == NULL ||
1568 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1569 if (pt_prev) {
1570 deliver_skb(skb2, pt_prev, skb->dev);
1571 pt_prev = ptype;
1572 continue;
1573 }
1574
1575 skb2 = skb_clone(skb, GFP_ATOMIC);
1576 if (!skb2)
1577 break;
1578
1579 net_timestamp_set(skb2);
1580
			/* skb->nh should be correctly
			 * set by sender, so that the second statement is
			 * just protection against buggy protocols.
			 */
1585 skb_reset_mac_header(skb2);
1586
1587 if (skb_network_header(skb2) < skb2->data ||
1588 skb2->network_header > skb2->tail) {
1589 if (net_ratelimit())
1590 printk(KERN_CRIT "protocol %04x is "
1591 "buggy, dev %s\n",
1592 ntohs(skb2->protocol),
1593 dev->name);
1594 skb_reset_network_header(skb2);
1595 }
1596
1597 skb2->transport_header = skb2->network_header;
1598 skb2->pkt_type = PACKET_OUTGOING;
1599 pt_prev = ptype;
1600 }
1601 }
1602 if (pt_prev)
1603 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1604 rcu_read_unlock();
1605}
1606
1607/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1608 * @dev: Network device
1609 * @txq: number of queues available
1610 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not, zero the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case, if
 * TC0 is invalid nothing can be done, so disable priority mappings. It
 * is expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
1618 */
1619static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1620{
1621 int i;
1622 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1623
1624 /* If TC0 is invalidated disable TC mapping */
1625 if (tc->offset + tc->count > txq) {
1626 pr_warning("Number of in use tx queues changed "
1627 "invalidating tc mappings. Priority "
1628 "traffic classification disabled!\n");
1629 dev->num_tc = 0;
1630 return;
1631 }
1632
1633 /* Invalidated prio to tc mappings set to TC0 */
1634 for (i = 1; i < TC_BITMASK + 1; i++) {
1635 int q = netdev_get_prio_tc_map(dev, i);
1636
1637 tc = &dev->tc_to_txq[q];
1638 if (tc->offset + tc->count > txq) {
1639 pr_warning("Number of in use tx queues "
1640 "changed. Priority %i to tc "
1641 "mapping %i is no longer valid "
1642 "setting map to 0\n",
1643 i, q);
1644 netdev_set_prio_tc_map(dev, i, 0);
1645 }
1646 }
1647}
1648
1649/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1652 */
1653int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1654{
1655 int rc;
1656
1657 if (txq < 1 || txq > dev->num_tx_queues)
1658 return -EINVAL;
1659
1660 if (dev->reg_state == NETREG_REGISTERED ||
1661 dev->reg_state == NETREG_UNREGISTERING) {
1662 ASSERT_RTNL();
1663
1664 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1665 txq);
1666 if (rc)
1667 return rc;
1668
1669 if (dev->num_tc)
1670 netif_setup_tc(dev, txq);
1671
1672 if (txq < dev->real_num_tx_queues)
1673 qdisc_reset_all_tx_gt(dev, txq);
1674 }
1675
1676 dev->real_num_tx_queues = txq;
1677 return 0;
1678}
1679EXPORT_SYMBOL(netif_set_real_num_tx_queues);
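/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a multiqueue driver shrinking or growing its active TX queue count at
 * runtime, for example from an ethtool channel handler.  The caller is
 * assumed to hold the RTNL lock; example_set_tx_channels() is invented.
 */
static int example_set_tx_channels(struct net_device *dev, unsigned int n)
{
	/* Stale skbs on queues >= n are flushed for a registered device. */
	return netif_set_real_num_tx_queues(dev, n);
}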
1680
1681#ifdef CONFIG_RPS
1682/**
1683 * netif_set_real_num_rx_queues - set actual number of RX queues used
1684 * @dev: Network device
1685 * @rxq: Actual number of RX queues
1686 *
1687 * This must be called either with the rtnl_lock held or before
1688 * registration of the net device. Returns 0 on success, or a
1689 * negative error code. If called before registration, it always
1690 * succeeds.
1691 */
1692int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1693{
1694 int rc;
1695
1696 if (rxq < 1 || rxq > dev->num_rx_queues)
1697 return -EINVAL;
1698
1699 if (dev->reg_state == NETREG_REGISTERED) {
1700 ASSERT_RTNL();
1701
1702 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1703 rxq);
1704 if (rc)
1705 return rc;
1706 }
1707
1708 dev->real_num_rx_queues = rxq;
1709 return 0;
1710}
1711EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1712#endif
1713
1714static inline void __netif_reschedule(struct Qdisc *q)
1715{
1716 struct softnet_data *sd;
1717 unsigned long flags;
1718
1719 local_irq_save(flags);
1720 sd = &__get_cpu_var(softnet_data);
1721 q->next_sched = NULL;
1722 *sd->output_queue_tailp = q;
1723 sd->output_queue_tailp = &q->next_sched;
1724 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1725 local_irq_restore(flags);
1726}
1727
1728void __netif_schedule(struct Qdisc *q)
1729{
1730 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1731 __netif_reschedule(q);
1732}
1733EXPORT_SYMBOL(__netif_schedule);
1734
1735void dev_kfree_skb_irq(struct sk_buff *skb)
1736{
1737 if (atomic_dec_and_test(&skb->users)) {
1738 struct softnet_data *sd;
1739 unsigned long flags;
1740
1741 local_irq_save(flags);
1742 sd = &__get_cpu_var(softnet_data);
1743 skb->next = sd->completion_queue;
1744 sd->completion_queue = skb;
1745 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1746 local_irq_restore(flags);
1747 }
1748}
1749EXPORT_SYMBOL(dev_kfree_skb_irq);
1750
1751void dev_kfree_skb_any(struct sk_buff *skb)
1752{
1753 if (in_irq() || irqs_disabled())
1754 dev_kfree_skb_irq(skb);
1755 else
1756 dev_kfree_skb(skb);
1757}
1758EXPORT_SYMBOL(dev_kfree_skb_any);
1759
1760
1761/**
1762 * netif_device_detach - mark device as removed
1763 * @dev: network device
1764 *
 * Mark the device as removed from the system and therefore no longer available.
1766 */
1767void netif_device_detach(struct net_device *dev)
1768{
1769 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1770 netif_running(dev)) {
1771 netif_tx_stop_all_queues(dev);
1772 }
1773}
1774EXPORT_SYMBOL(netif_device_detach);
1775
1776/**
1777 * netif_device_attach - mark device as attached
1778 * @dev: network device
1779 *
 * Mark the device as attached to the system and restart it if needed.
1781 */
1782void netif_device_attach(struct net_device *dev)
1783{
1784 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1785 netif_running(dev)) {
1786 netif_tx_wake_all_queues(dev);
1787 __netdev_watchdog_up(dev);
1788 }
1789}
1790EXPORT_SYMBOL(netif_device_attach);
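/*
 * Usage note: the detach/attach pair is typically used around power
 * transitions. A minimal, hypothetical suspend/resume sketch; names such
 * as my_suspend/my_resume and my_hw_up/my_hw_down are illustrative only:
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		my_hw_down(dev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		my_hw_up(dev);
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 *
 * netif_device_detach() stops all TX queues and clears __LINK_STATE_PRESENT;
 * netif_device_attach() wakes the queues and re-arms the watchdog.
 */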
1791
1792/**
1793 * skb_set_dev - assign a new device to a buffer
1794 * @skb: buffer for the new device
1795 * @dev: network device
1796 *
1797 * If an skb is already owned by a device, we have to reset
1798 * all data private to the network namespace that device belongs to
1799 * before assigning the skb to a new device.
1800 */
1801#ifdef CONFIG_NET_NS
1802void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1803{
1804 skb_dst_drop(skb);
1805 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1806 secpath_reset(skb);
1807 nf_reset(skb);
1808 skb_init_secmark(skb);
1809 skb->mark = 0;
1810 skb->priority = 0;
1811 skb->nf_trace = 0;
1812 skb->ipvs_property = 0;
1813#ifdef CONFIG_NET_SCHED
1814 skb->tc_index = 0;
1815#endif
1816 }
1817 skb->dev = dev;
1818}
1819EXPORT_SYMBOL(skb_set_dev);
1820#endif /* CONFIG_NET_NS */
1821
1822/*
1823 * Invalidate the hardware checksum when the packet is to be mangled, and
1824 * complete the checksum manually on the outgoing path.
1825 */
1826int skb_checksum_help(struct sk_buff *skb)
1827{
1828 __wsum csum;
1829 int ret = 0, offset;
1830
1831 if (skb->ip_summed == CHECKSUM_COMPLETE)
1832 goto out_set_summed;
1833
1834 if (unlikely(skb_shinfo(skb)->gso_size)) {
1835 /* Let GSO fix up the checksum. */
1836 goto out_set_summed;
1837 }
1838
1839 offset = skb_checksum_start_offset(skb);
1840 BUG_ON(offset >= skb_headlen(skb));
1841 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1842
1843 offset += skb->csum_offset;
1844 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1845
1846 if (skb_cloned(skb) &&
1847 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1848 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1849 if (ret)
1850 goto out;
1851 }
1852
1853 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1854out_set_summed:
1855 skb->ip_summed = CHECKSUM_NONE;
1856out:
1857 return ret;
1858}
1859EXPORT_SYMBOL(skb_checksum_help);
1860
1861/**
1862 * skb_gso_segment - Perform segmentation on skb.
1863 * @skb: buffer to segment
1864 * @features: features for the output path (see dev->features)
1865 *
1866 * This function segments the given skb and returns a list of segments.
1867 *
1868 * It may return NULL if the skb requires no segmentation. This is
1869 * only possible when GSO is used for verifying header integrity.
1870 */
1871struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1872{
1873 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1874 struct packet_type *ptype;
1875 __be16 type = skb->protocol;
1876 int vlan_depth = ETH_HLEN;
1877 int err;
1878
1879 while (type == htons(ETH_P_8021Q)) {
1880 struct vlan_hdr *vh;
1881
1882 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1883 return ERR_PTR(-EINVAL);
1884
1885 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1886 type = vh->h_vlan_encapsulated_proto;
1887 vlan_depth += VLAN_HLEN;
1888 }
1889
1890 skb_reset_mac_header(skb);
1891 skb->mac_len = skb->network_header - skb->mac_header;
1892 __skb_pull(skb, skb->mac_len);
1893
1894 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1895 struct net_device *dev = skb->dev;
1896 struct ethtool_drvinfo info = {};
1897
1898 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1899 dev->ethtool_ops->get_drvinfo(dev, &info);
1900
1901 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1902 info.driver, dev ? dev->features : 0L,
1903 skb->sk ? skb->sk->sk_route_caps : 0L,
1904 skb->len, skb->data_len, skb->ip_summed);
1905
1906 if (skb_header_cloned(skb) &&
1907 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1908 return ERR_PTR(err);
1909 }
1910
1911 rcu_read_lock();
1912 list_for_each_entry_rcu(ptype,
1913 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1914 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1915 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1916 err = ptype->gso_send_check(skb);
1917 segs = ERR_PTR(err);
1918 if (err || skb_gso_ok(skb, features))
1919 break;
1920 __skb_push(skb, (skb->data -
1921 skb_network_header(skb)));
1922 }
1923 segs = ptype->gso_segment(skb, features);
1924 break;
1925 }
1926 }
1927 rcu_read_unlock();
1928
1929 __skb_push(skb, skb->data - skb_mac_header(skb));
1930
1931 return segs;
1932}
1933EXPORT_SYMBOL(skb_gso_segment);
1934
1935/* Take action when hardware reception checksum errors are detected. */
1936#ifdef CONFIG_BUG
1937void netdev_rx_csum_fault(struct net_device *dev)
1938{
1939 if (net_ratelimit()) {
1940 printk(KERN_ERR "%s: hw csum failure.\n",
1941 dev ? dev->name : "<unknown>");
1942 dump_stack();
1943 }
1944}
1945EXPORT_SYMBOL(netdev_rx_csum_fault);
1946#endif
1947
1948/* Actually, we should eliminate this check as soon as we know that:
1949 * 1. an IOMMU is present and can map all of the memory, and
1950 * 2. no high memory really exists on this machine.
1951 */
1952
1953static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1954{
1955#ifdef CONFIG_HIGHMEM
1956 int i;
1957 if (!(dev->features & NETIF_F_HIGHDMA)) {
1958 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1959 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1960 return 1;
1961 }
1962
1963 if (PCI_DMA_BUS_IS_PHYS) {
1964 struct device *pdev = dev->dev.parent;
1965
1966 if (!pdev)
1967 return 0;
1968 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1969 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1970 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1971 return 1;
1972 }
1973 }
1974#endif
1975 return 0;
1976}
1977
1978struct dev_gso_cb {
1979 void (*destructor)(struct sk_buff *skb);
1980};
1981
1982#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1983
1984static void dev_gso_skb_destructor(struct sk_buff *skb)
1985{
1986 struct dev_gso_cb *cb;
1987
1988 do {
1989 struct sk_buff *nskb = skb->next;
1990
1991 skb->next = nskb->next;
1992 nskb->next = NULL;
1993 kfree_skb(nskb);
1994 } while (skb->next);
1995
1996 cb = DEV_GSO_CB(skb);
1997 if (cb->destructor)
1998 cb->destructor(skb);
1999}
2000
2001/**
2002 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2003 * @skb: buffer to segment
2004 * @features: device features as applicable to this skb
2005 *
2006 * This function segments the given skb and stores the list of segments
2007 * in skb->next.
2008 */
2009static int dev_gso_segment(struct sk_buff *skb, int features)
2010{
2011 struct sk_buff *segs;
2012
2013 segs = skb_gso_segment(skb, features);
2014
2015 /* Verifying header integrity only. */
2016 if (!segs)
2017 return 0;
2018
2019 if (IS_ERR(segs))
2020 return PTR_ERR(segs);
2021
2022 skb->next = segs;
2023 DEV_GSO_CB(skb)->destructor = skb->destructor;
2024 skb->destructor = dev_gso_skb_destructor;
2025
2026 return 0;
2027}
2028
2029/*
2030 * Try to orphan the skb early, right before transmission by the device.
2031 * We cannot orphan the skb if a tx timestamp is requested or the sk
2032 * reference is needed at the driver level for other reasons, e.g. see net/can/raw.c
2033 */
2034static inline void skb_orphan_try(struct sk_buff *skb)
2035{
2036 struct sock *sk = skb->sk;
2037
2038 if (sk && !skb_shinfo(skb)->tx_flags) {
2039 /* skb_tx_hash() won't be able to get the sk.
2040 * We copy sk_hash into skb->rxhash instead.
2041 */
2042 if (!skb->rxhash)
2043 skb->rxhash = sk->sk_hash;
2044 skb_orphan(skb);
2045 }
2046}
2047
2048static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2049{
2050 return ((features & NETIF_F_GEN_CSUM) ||
2051 ((features & NETIF_F_V4_CSUM) &&
2052 protocol == htons(ETH_P_IP)) ||
2053 ((features & NETIF_F_V6_CSUM) &&
2054 protocol == htons(ETH_P_IPV6)) ||
2055 ((features & NETIF_F_FCOE_CRC) &&
2056 protocol == htons(ETH_P_FCOE)));
2057}
2058
2059static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2060{
2061 if (!can_checksum_protocol(features, protocol)) {
2062 features &= ~NETIF_F_ALL_CSUM;
2063 features &= ~NETIF_F_SG;
2064 } else if (illegal_highdma(skb->dev, skb)) {
2065 features &= ~NETIF_F_SG;
2066 }
2067
2068 return features;
2069}
2070
2071u32 netif_skb_features(struct sk_buff *skb)
2072{
2073 __be16 protocol = skb->protocol;
2074 u32 features = skb->dev->features;
2075
2076 if (protocol == htons(ETH_P_8021Q)) {
2077 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2078 protocol = veh->h_vlan_encapsulated_proto;
2079 } else if (!vlan_tx_tag_present(skb)) {
2080 return harmonize_features(skb, protocol, features);
2081 }
2082
2083 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2084
2085 if (protocol != htons(ETH_P_8021Q)) {
2086 return harmonize_features(skb, protocol, features);
2087 } else {
2088 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2089 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2090 return harmonize_features(skb, protocol, features);
2091 }
2092}
2093EXPORT_SYMBOL(netif_skb_features);
2094
2095/*
2096 * Returns true if either:
2097 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2098 * 2. skb is fragmented and the device does not support SG, or
2099 * at least one of the fragments is in highmem and the device does
2100 * not support DMA from it.
2101 */
2102static inline int skb_needs_linearize(struct sk_buff *skb,
2103 int features)
2104{
2105 return skb_is_nonlinear(skb) &&
2106 ((skb_has_frag_list(skb) &&
2107 !(features & NETIF_F_FRAGLIST)) ||
2108 (skb_shinfo(skb)->nr_frags &&
2109 !(features & NETIF_F_SG)));
2110}
2111
2112int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2113 struct netdev_queue *txq)
2114{
2115 const struct net_device_ops *ops = dev->netdev_ops;
2116 int rc = NETDEV_TX_OK;
2117 unsigned int skb_len;
2118
2119 if (likely(!skb->next)) {
2120 u32 features;
2121
2122 /*
2123 * If the device doesn't need skb->dst, release it right now while
2124 * it's still hot in this CPU's cache.
2125 */
2126 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2127 skb_dst_drop(skb);
2128
2129 if (!list_empty(&ptype_all))
2130 dev_queue_xmit_nit(skb, dev);
2131
2132 skb_orphan_try(skb);
2133
2134 features = netif_skb_features(skb);
2135
2136 if (vlan_tx_tag_present(skb) &&
2137 !(features & NETIF_F_HW_VLAN_TX)) {
2138 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2139 if (unlikely(!skb))
2140 goto out;
2141
2142 skb->vlan_tci = 0;
2143 }
2144
2145 if (netif_needs_gso(skb, features)) {
2146 if (unlikely(dev_gso_segment(skb, features)))
2147 goto out_kfree_skb;
2148 if (skb->next)
2149 goto gso;
2150 } else {
2151 if (skb_needs_linearize(skb, features) &&
2152 __skb_linearize(skb))
2153 goto out_kfree_skb;
2154
2155 /* If packet is not checksummed and device does not
2156 * support checksumming for this protocol, complete
2157 * checksumming here.
2158 */
2159 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2160 skb_set_transport_header(skb,
2161 skb_checksum_start_offset(skb));
2162 if (!(features & NETIF_F_ALL_CSUM) &&
2163 skb_checksum_help(skb))
2164 goto out_kfree_skb;
2165 }
2166 }
2167
2168 skb_len = skb->len;
2169 rc = ops->ndo_start_xmit(skb, dev);
2170 trace_net_dev_xmit(skb, rc, dev, skb_len);
2171 if (rc == NETDEV_TX_OK)
2172 txq_trans_update(txq);
2173 return rc;
2174 }
2175
2176gso:
2177 do {
2178 struct sk_buff *nskb = skb->next;
2179
2180 skb->next = nskb->next;
2181 nskb->next = NULL;
2182
2183 /*
2184 * If the device doesn't need nskb->dst, release it right now while
2185 * it's still hot in this CPU's cache.
2186 */
2187 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2188 skb_dst_drop(nskb);
2189
2190 skb_len = nskb->len;
2191 rc = ops->ndo_start_xmit(nskb, dev);
2192 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2193 if (unlikely(rc != NETDEV_TX_OK)) {
2194 if (rc & ~NETDEV_TX_MASK)
2195 goto out_kfree_gso_skb;
2196 nskb->next = skb->next;
2197 skb->next = nskb;
2198 return rc;
2199 }
2200 txq_trans_update(txq);
2201 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2202 return NETDEV_TX_BUSY;
2203 } while (skb->next);
2204
2205out_kfree_gso_skb:
2206 if (likely(skb->next == NULL))
2207 skb->destructor = DEV_GSO_CB(skb)->destructor;
2208out_kfree_skb:
2209 kfree_skb(skb);
2210out:
2211 return rc;
2212}
2213
2214static u32 hashrnd __read_mostly;
2215
2216/*
2217 * Returns a Tx hash based on the given packet descriptor and the number
2218 * of Tx queues to be used as a distribution range.
2219 */
2220u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2221 unsigned int num_tx_queues)
2222{
2223 u32 hash;
2224 u16 qoffset = 0;
2225 u16 qcount = num_tx_queues;
2226
2227 if (skb_rx_queue_recorded(skb)) {
2228 hash = skb_get_rx_queue(skb);
2229 while (unlikely(hash >= num_tx_queues))
2230 hash -= num_tx_queues;
2231 return hash;
2232 }
2233
2234 if (dev->num_tc) {
2235 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2236 qoffset = dev->tc_to_txq[tc].offset;
2237 qcount = dev->tc_to_txq[tc].count;
2238 }
2239
2240 if (skb->sk && skb->sk->sk_hash)
2241 hash = skb->sk->sk_hash;
2242 else
2243 hash = (__force u16) skb->protocol ^ skb->rxhash;
2244 hash = jhash_1word(hash, hashrnd);
2245
2246 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2247}
2248EXPORT_SYMBOL(__skb_tx_hash);
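/*
 * Note on the final mapping above: ((u64) hash * qcount) >> 32 scales a
 * 32-bit hash into the range [0, qcount) without a modulo. Conceptually
 * hash / 2^32 is a fraction in [0, 1), and multiplying it by qcount picks
 * the proportional queue. For example (illustrative numbers only), with
 * qcount = 8 and hash = 0x40000000 (one quarter of 2^32), the result is
 * (0x40000000 * 8) >> 32 = 2, i.e. the third queue of the range, before
 * qoffset is added back in.
 */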
2249
2250static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2251{
2252 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2253 if (net_ratelimit()) {
2254 pr_warning("%s selects TX queue %d, but "
2255 "real number of TX queues is %d\n",
2256 dev->name, queue_index, dev->real_num_tx_queues);
2257 }
2258 return 0;
2259 }
2260 return queue_index;
2261}
2262
2263static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2264{
2265#ifdef CONFIG_XPS
2266 struct xps_dev_maps *dev_maps;
2267 struct xps_map *map;
2268 int queue_index = -1;
2269
2270 rcu_read_lock();
2271 dev_maps = rcu_dereference(dev->xps_maps);
2272 if (dev_maps) {
2273 map = rcu_dereference(
2274 dev_maps->cpu_map[raw_smp_processor_id()]);
2275 if (map) {
2276 if (map->len == 1)
2277 queue_index = map->queues[0];
2278 else {
2279 u32 hash;
2280 if (skb->sk && skb->sk->sk_hash)
2281 hash = skb->sk->sk_hash;
2282 else
2283 hash = (__force u16) skb->protocol ^
2284 skb->rxhash;
2285 hash = jhash_1word(hash, hashrnd);
2286 queue_index = map->queues[
2287 ((u64)hash * map->len) >> 32];
2288 }
2289 if (unlikely(queue_index >= dev->real_num_tx_queues))
2290 queue_index = -1;
2291 }
2292 }
2293 rcu_read_unlock();
2294
2295 return queue_index;
2296#else
2297 return -1;
2298#endif
2299}
2300
2301static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2302 struct sk_buff *skb)
2303{
2304 int queue_index;
2305 const struct net_device_ops *ops = dev->netdev_ops;
2306
2307 if (dev->real_num_tx_queues == 1)
2308 queue_index = 0;
2309 else if (ops->ndo_select_queue) {
2310 queue_index = ops->ndo_select_queue(dev, skb);
2311 queue_index = dev_cap_txqueue(dev, queue_index);
2312 } else {
2313 struct sock *sk = skb->sk;
2314 queue_index = sk_tx_queue_get(sk);
2315
2316 if (queue_index < 0 || skb->ooo_okay ||
2317 queue_index >= dev->real_num_tx_queues) {
2318 int old_index = queue_index;
2319
2320 queue_index = get_xps_queue(dev, skb);
2321 if (queue_index < 0)
2322 queue_index = skb_tx_hash(dev, skb);
2323
2324 if (queue_index != old_index && sk) {
2325 struct dst_entry *dst =
2326 rcu_dereference_check(sk->sk_dst_cache, 1);
2327
2328 if (dst && skb_dst(skb) == dst)
2329 sk_tx_queue_set(sk, queue_index);
2330 }
2331 }
2332 }
2333
2334 skb_set_queue_mapping(skb, queue_index);
2335 return netdev_get_tx_queue(dev, queue_index);
2336}
2337
2338static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2339 struct net_device *dev,
2340 struct netdev_queue *txq)
2341{
2342 spinlock_t *root_lock = qdisc_lock(q);
2343 bool contended;
2344 int rc;
2345
2346 qdisc_skb_cb(skb)->pkt_len = skb->len;
2347 qdisc_calculate_pkt_len(skb, q);
2348 /*
2349 * Heuristic to force contended enqueues to serialize on a
2350 * separate lock before trying to get qdisc main lock.
2351 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2352 * and dequeue packets faster.
2353 */
2354 contended = qdisc_is_running(q);
2355 if (unlikely(contended))
2356 spin_lock(&q->busylock);
2357
2358 spin_lock(root_lock);
2359 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2360 kfree_skb(skb);
2361 rc = NET_XMIT_DROP;
2362 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2363 qdisc_run_begin(q)) {
2364 /*
2365 * This is a work-conserving queue; there are no old skbs
2366 * waiting to be sent out; and the qdisc is not running -
2367 * xmit the skb directly.
2368 */
2369 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2370 skb_dst_force(skb);
2371
2372 qdisc_bstats_update(q, skb);
2373
2374 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2375 if (unlikely(contended)) {
2376 spin_unlock(&q->busylock);
2377 contended = false;
2378 }
2379 __qdisc_run(q);
2380 } else
2381 qdisc_run_end(q);
2382
2383 rc = NET_XMIT_SUCCESS;
2384 } else {
2385 skb_dst_force(skb);
2386 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2387 if (qdisc_run_begin(q)) {
2388 if (unlikely(contended)) {
2389 spin_unlock(&q->busylock);
2390 contended = false;
2391 }
2392 __qdisc_run(q);
2393 }
2394 }
2395 spin_unlock(root_lock);
2396 if (unlikely(contended))
2397 spin_unlock(&q->busylock);
2398 return rc;
2399}
2400
2401static DEFINE_PER_CPU(int, xmit_recursion);
2402#define RECURSION_LIMIT 10
2403
2404/**
2405 * dev_queue_xmit - transmit a buffer
2406 * @skb: buffer to transmit
2407 *
2408 * Queue a buffer for transmission to a network device. The caller must
2409 * have set the device and priority and built the buffer before calling
2410 * this function. The function can be called from an interrupt.
2411 *
2412 * A negative errno code is returned on a failure. A success does not
2413 * guarantee the frame will be transmitted as it may be dropped due
2414 * to congestion or traffic shaping.
2415 *
2416 * -----------------------------------------------------------------------------------
2417 * I notice this method can also return errors from the queue disciplines,
2418 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2419 * be positive.
2420 *
2421 * Regardless of the return value, the skb is consumed, so it is currently
2422 * difficult to retry a send to this method. (You can bump the ref count
2423 * before sending to hold a reference for retry if you are careful.)
2424 *
2425 * When calling this method, interrupts MUST be enabled. This is because
2426 * the BH enable code must have IRQs enabled so that it will not deadlock.
2427 * --BLG
2428 */
2429int dev_queue_xmit(struct sk_buff *skb)
2430{
2431 struct net_device *dev = skb->dev;
2432 struct netdev_queue *txq;
2433 struct Qdisc *q;
2434 int rc = -ENOMEM;
2435
2436 /* Disable soft irqs for various locks below. Also
2437 * stops preemption for RCU.
2438 */
2439 rcu_read_lock_bh();
2440
2441 txq = dev_pick_tx(dev, skb);
2442 q = rcu_dereference_bh(txq->qdisc);
2443
2444#ifdef CONFIG_NET_CLS_ACT
2445 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2446#endif
2447 trace_net_dev_queue(skb);
2448 if (q->enqueue) {
2449 rc = __dev_xmit_skb(skb, q, dev, txq);
2450 goto out;
2451 }
2452
2453 /* The device has no queue. This is the common case for software devices:
2454 loopback, all sorts of tunnels...
2455
2456 Really, it is unlikely that netif_tx_lock protection is necessary
2457 here. (f.e. loopback and IP tunnels are clean, ignoring statistics
2458 counters.)
2459 However, it is possible that they rely on the protection
2460 we provide here.
2461
2462 Check for this and take the lock anyway; it is not prone to deadlocks.
2463 Or take the noqueue qdisc path, which is even simpler 8)
2464 */
2465 if (dev->flags & IFF_UP) {
2466 int cpu = smp_processor_id(); /* ok because BHs are off */
2467
2468 if (txq->xmit_lock_owner != cpu) {
2469
2470 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2471 goto recursion_alert;
2472
2473 HARD_TX_LOCK(dev, txq, cpu);
2474
2475 if (!netif_tx_queue_stopped(txq)) {
2476 __this_cpu_inc(xmit_recursion);
2477 rc = dev_hard_start_xmit(skb, dev, txq);
2478 __this_cpu_dec(xmit_recursion);
2479 if (dev_xmit_complete(rc)) {
2480 HARD_TX_UNLOCK(dev, txq);
2481 goto out;
2482 }
2483 }
2484 HARD_TX_UNLOCK(dev, txq);
2485 if (net_ratelimit())
2486 printk(KERN_CRIT "Virtual device %s asks to "
2487 "queue packet!\n", dev->name);
2488 } else {
2489 /* Recursion is detected! It is possible,
2490 * unfortunately
2491 */
2492recursion_alert:
2493 if (net_ratelimit())
2494 printk(KERN_CRIT "Dead loop on virtual device "
2495 "%s, fix it urgently!\n", dev->name);
2496 }
2497 }
2498
2499 rc = -ENETDOWN;
2500 rcu_read_unlock_bh();
2501
2502 kfree_skb(skb);
2503 return rc;
2504out:
2505 rcu_read_unlock_bh();
2506 return rc;
2507}
2508EXPORT_SYMBOL(dev_queue_xmit);
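/*
 * Usage note: a minimal, hypothetical sketch of a caller that builds a raw
 * frame and hands it to dev_queue_xmit(); build_my_frame() and payload_len
 * are illustrative only:
 *
 *	struct sk_buff *skb;
 *
 *	skb = alloc_skb(LL_RESERVED_SPACE(dev) + payload_len, GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *	build_my_frame(skb);
 *	skb->dev = dev;
 *	skb->priority = 0;
 *	err = dev_queue_xmit(skb);
 *
 * build_my_frame() is assumed to put the payload and push the link-layer
 * header into the reserved headroom. Remember that interrupts must be
 * enabled, that the skb is consumed regardless of the outcome, and that a
 * positive NET_XMIT_* value may come back from the qdisc layer.
 */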
2509
2510
2511/*=======================================================================
2512 Receiver routines
2513 =======================================================================*/
2514
2515int netdev_max_backlog __read_mostly = 1000;
2516int netdev_tstamp_prequeue __read_mostly = 1;
2517int netdev_budget __read_mostly = 300;
2518int weight_p __read_mostly = 64; /* old backlog weight */
2519
2520/* Called with irq disabled */
2521static inline void ____napi_schedule(struct softnet_data *sd,
2522 struct napi_struct *napi)
2523{
2524 list_add_tail(&napi->poll_list, &sd->poll_list);
2525 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2526}
2527
2528/*
2529 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2530 * and src/dst port numbers. Returns a non-zero hash number on success
2531 * and 0 on failure.
2532 */
2533__u32 __skb_get_rxhash(struct sk_buff *skb)
2534{
2535 int nhoff, hash = 0, poff;
2536 const struct ipv6hdr *ip6;
2537 const struct iphdr *ip;
2538 u8 ip_proto;
2539 u32 addr1, addr2, ihl;
2540 union {
2541 u32 v32;
2542 u16 v16[2];
2543 } ports;
2544
2545 nhoff = skb_network_offset(skb);
2546
2547 switch (skb->protocol) {
2548 case __constant_htons(ETH_P_IP):
2549 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2550 goto done;
2551
2552 ip = (const struct iphdr *) (skb->data + nhoff);
2553 if (ip_is_fragment(ip))
2554 ip_proto = 0;
2555 else
2556 ip_proto = ip->protocol;
2557 addr1 = (__force u32) ip->saddr;
2558 addr2 = (__force u32) ip->daddr;
2559 ihl = ip->ihl;
2560 break;
2561 case __constant_htons(ETH_P_IPV6):
2562 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2563 goto done;
2564
2565 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2566 ip_proto = ip6->nexthdr;
2567 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2568 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2569 ihl = (40 >> 2);
2570 break;
2571 default:
2572 goto done;
2573 }
2574
2575 ports.v32 = 0;
2576 poff = proto_ports_offset(ip_proto);
2577 if (poff >= 0) {
2578 nhoff += ihl * 4 + poff;
2579 if (pskb_may_pull(skb, nhoff + 4)) {
2580 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2581 if (ports.v16[1] < ports.v16[0])
2582 swap(ports.v16[0], ports.v16[1]);
2583 }
2584 }
2585
2586 /* get a consistent hash (same value on both flow directions) */
2587 if (addr2 < addr1)
2588 swap(addr1, addr2);
2589
2590 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2591 if (!hash)
2592 hash = 1;
2593
2594done:
2595 return hash;
2596}
2597EXPORT_SYMBOL(__skb_get_rxhash);
2598
2599#ifdef CONFIG_RPS
2600
2601/* One global table that all flow-based protocols share. */
2602struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2603EXPORT_SYMBOL(rps_sock_flow_table);
2604
2605static struct rps_dev_flow *
2606set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2607 struct rps_dev_flow *rflow, u16 next_cpu)
2608{
2609 u16 tcpu;
2610
2611 tcpu = rflow->cpu = next_cpu;
2612 if (tcpu != RPS_NO_CPU) {
2613#ifdef CONFIG_RFS_ACCEL
2614 struct netdev_rx_queue *rxqueue;
2615 struct rps_dev_flow_table *flow_table;
2616 struct rps_dev_flow *old_rflow;
2617 u32 flow_id;
2618 u16 rxq_index;
2619 int rc;
2620
2621 /* Should we steer this flow to a different hardware queue? */
2622 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2623 !(dev->features & NETIF_F_NTUPLE))
2624 goto out;
2625 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2626 if (rxq_index == skb_get_rx_queue(skb))
2627 goto out;
2628
2629 rxqueue = dev->_rx + rxq_index;
2630 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2631 if (!flow_table)
2632 goto out;
2633 flow_id = skb->rxhash & flow_table->mask;
2634 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2635 rxq_index, flow_id);
2636 if (rc < 0)
2637 goto out;
2638 old_rflow = rflow;
2639 rflow = &flow_table->flows[flow_id];
2640 rflow->cpu = next_cpu;
2641 rflow->filter = rc;
2642 if (old_rflow->filter == rflow->filter)
2643 old_rflow->filter = RPS_NO_FILTER;
2644 out:
2645#endif
2646 rflow->last_qtail =
2647 per_cpu(softnet_data, tcpu).input_queue_head;
2648 }
2649
2650 return rflow;
2651}
2652
2653/*
2654 * get_rps_cpu is called from netif_receive_skb and returns the target
2655 * CPU from the RPS map of the receiving queue for a given skb.
2656 * rcu_read_lock must be held on entry.
2657 */
2658static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2659 struct rps_dev_flow **rflowp)
2660{
2661 struct netdev_rx_queue *rxqueue;
2662 struct rps_map *map;
2663 struct rps_dev_flow_table *flow_table;
2664 struct rps_sock_flow_table *sock_flow_table;
2665 int cpu = -1;
2666 u16 tcpu;
2667
2668 if (skb_rx_queue_recorded(skb)) {
2669 u16 index = skb_get_rx_queue(skb);
2670 if (unlikely(index >= dev->real_num_rx_queues)) {
2671 WARN_ONCE(dev->real_num_rx_queues > 1,
2672 "%s received packet on queue %u, but number "
2673 "of RX queues is %u\n",
2674 dev->name, index, dev->real_num_rx_queues);
2675 goto done;
2676 }
2677 rxqueue = dev->_rx + index;
2678 } else
2679 rxqueue = dev->_rx;
2680
2681 map = rcu_dereference(rxqueue->rps_map);
2682 if (map) {
2683 if (map->len == 1 &&
2684 !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2685 tcpu = map->cpus[0];
2686 if (cpu_online(tcpu))
2687 cpu = tcpu;
2688 goto done;
2689 }
2690 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2691 goto done;
2692 }
2693
2694 skb_reset_network_header(skb);
2695 if (!skb_get_rxhash(skb))
2696 goto done;
2697
2698 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2699 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2700 if (flow_table && sock_flow_table) {
2701 u16 next_cpu;
2702 struct rps_dev_flow *rflow;
2703
2704 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2705 tcpu = rflow->cpu;
2706
2707 next_cpu = sock_flow_table->ents[skb->rxhash &
2708 sock_flow_table->mask];
2709
2710 /*
2711 * If the desired CPU (where last recvmsg was done) is
2712 * different from current CPU (one in the rx-queue flow
2713 * table entry), switch if one of the following holds:
2714 * - Current CPU is unset (equal to RPS_NO_CPU).
2715 * - Current CPU is offline.
2716 * - The current CPU's queue tail has advanced beyond the
2717 * last packet that was enqueued using this table entry.
2718 * This guarantees that all previous packets for the flow
2719 * have been dequeued, thus preserving in-order delivery.
2720 */
2721 if (unlikely(tcpu != next_cpu) &&
2722 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2723 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2724 rflow->last_qtail)) >= 0))
2725 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2726
2727 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2728 *rflowp = rflow;
2729 cpu = tcpu;
2730 goto done;
2731 }
2732 }
2733
2734 if (map) {
2735 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2736
2737 if (cpu_online(tcpu)) {
2738 cpu = tcpu;
2739 goto done;
2740 }
2741 }
2742
2743done:
2744 return cpu;
2745}
2746
2747#ifdef CONFIG_RFS_ACCEL
2748
2749/**
2750 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2751 * @dev: Device on which the filter was set
2752 * @rxq_index: RX queue index
2753 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2754 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2755 *
2756 * Drivers that implement ndo_rx_flow_steer() should periodically call
2757 * this function for each installed filter and remove the filters for
2758 * which it returns %true.
2759 */
2760bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2761 u32 flow_id, u16 filter_id)
2762{
2763 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2764 struct rps_dev_flow_table *flow_table;
2765 struct rps_dev_flow *rflow;
2766 bool expire = true;
2767 int cpu;
2768
2769 rcu_read_lock();
2770 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2771 if (flow_table && flow_id <= flow_table->mask) {
2772 rflow = &flow_table->flows[flow_id];
2773 cpu = ACCESS_ONCE(rflow->cpu);
2774 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2775 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2776 rflow->last_qtail) <
2777 (int)(10 * flow_table->mask)))
2778 expire = false;
2779 }
2780 rcu_read_unlock();
2781 return expire;
2782}
2783EXPORT_SYMBOL(rps_may_expire_flow);
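/*
 * Usage note: a minimal, hypothetical sketch of the periodic scan a driver
 * implementing ndo_rx_flow_steer() might run; my_filter_table,
 * my_remove_hw_filter() and the bookkeeping fields are illustrative only:
 *
 *	for (i = 0; i < MY_MAX_FILTERS; i++) {
 *		struct my_filter *f = &priv->my_filter_table[i];
 *
 *		if (!f->in_use)
 *			continue;
 *		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i)) {
 *			my_remove_hw_filter(priv, i);
 *			f->in_use = false;
 *		}
 *	}
 */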
2784
2785#endif /* CONFIG_RFS_ACCEL */
2786
2787/* Called from hardirq (IPI) context */
2788static void rps_trigger_softirq(void *data)
2789{
2790 struct softnet_data *sd = data;
2791
2792 ____napi_schedule(sd, &sd->backlog);
2793 sd->received_rps++;
2794}
2795
2796#endif /* CONFIG_RPS */
2797
2798/*
2799 * Check whether this softnet_data structure belongs to another CPU.
2800 * If so, queue it on our IPI list and return 1;
2801 * if not, return 0.
2802 */
2803static int rps_ipi_queued(struct softnet_data *sd)
2804{
2805#ifdef CONFIG_RPS
2806 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2807
2808 if (sd != mysd) {
2809 sd->rps_ipi_next = mysd->rps_ipi_list;
2810 mysd->rps_ipi_list = sd;
2811
2812 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2813 return 1;
2814 }
2815#endif /* CONFIG_RPS */
2816 return 0;
2817}
2818
2819/*
2820 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2821 * queue (may be a remote CPU queue).
2822 */
2823static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2824 unsigned int *qtail)
2825{
2826 struct softnet_data *sd;
2827 unsigned long flags;
2828
2829 sd = &per_cpu(softnet_data, cpu);
2830
2831 local_irq_save(flags);
2832
2833 rps_lock(sd);
2834 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2835 if (skb_queue_len(&sd->input_pkt_queue)) {
2836enqueue:
2837 __skb_queue_tail(&sd->input_pkt_queue, skb);
2838 input_queue_tail_incr_save(sd, qtail);
2839 rps_unlock(sd);
2840 local_irq_restore(flags);
2841 return NET_RX_SUCCESS;
2842 }
2843
2844 /* Schedule NAPI for the backlog device.
2845 * We can use a non-atomic operation since we own the queue lock.
2846 */
2847 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2848 if (!rps_ipi_queued(sd))
2849 ____napi_schedule(sd, &sd->backlog);
2850 }
2851 goto enqueue;
2852 }
2853
2854 sd->dropped++;
2855 rps_unlock(sd);
2856
2857 local_irq_restore(flags);
2858
2859 atomic_long_inc(&skb->dev->rx_dropped);
2860 kfree_skb(skb);
2861 return NET_RX_DROP;
2862}
2863
2864/**
2865 * netif_rx - post buffer to the network code
2866 * @skb: buffer to post
2867 *
2868 * This function receives a packet from a device driver and queues it for
2869 * the upper (protocol) levels to process. It always succeeds. The buffer
2870 * may be dropped during processing for congestion control or by the
2871 * protocol layers.
2872 *
2873 * return values:
2874 * NET_RX_SUCCESS (no congestion)
2875 * NET_RX_DROP (packet was dropped)
2876 *
2877 */
2878
2879int netif_rx(struct sk_buff *skb)
2880{
2881 int ret;
2882
2883 /* if netpoll wants it, pretend we never saw it */
2884 if (netpoll_rx(skb))
2885 return NET_RX_DROP;
2886
2887 if (netdev_tstamp_prequeue)
2888 net_timestamp_check(skb);
2889
2890 trace_netif_rx(skb);
2891#ifdef CONFIG_RPS
2892 {
2893 struct rps_dev_flow voidflow, *rflow = &voidflow;
2894 int cpu;
2895
2896 preempt_disable();
2897 rcu_read_lock();
2898
2899 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2900 if (cpu < 0)
2901 cpu = smp_processor_id();
2902
2903 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2904
2905 rcu_read_unlock();
2906 preempt_enable();
2907 }
2908#else
2909 {
2910 unsigned int qtail;
2911 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2912 put_cpu();
2913 }
2914#endif
2915 return ret;
2916}
2917EXPORT_SYMBOL(netif_rx);
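/*
 * Usage note: a minimal, hypothetical sketch of a non-NAPI driver's receive
 * path; my_copy_rx_frame(), priv and len are illustrative only:
 *
 *	skb = netdev_alloc_skb_ip_align(dev, len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	my_copy_rx_frame(priv, skb_put(skb, len));
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * From process context netif_rx_ni() below should be used instead, so that
 * any raised softirq gets a chance to run before returning.
 */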
2918
2919int netif_rx_ni(struct sk_buff *skb)
2920{
2921 int err;
2922
2923 preempt_disable();
2924 err = netif_rx(skb);
2925 if (local_softirq_pending())
2926 do_softirq();
2927 preempt_enable();
2928
2929 return err;
2930}
2931EXPORT_SYMBOL(netif_rx_ni);
2932
2933static void net_tx_action(struct softirq_action *h)
2934{
2935 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2936
2937 if (sd->completion_queue) {
2938 struct sk_buff *clist;
2939
2940 local_irq_disable();
2941 clist = sd->completion_queue;
2942 sd->completion_queue = NULL;
2943 local_irq_enable();
2944
2945 while (clist) {
2946 struct sk_buff *skb = clist;
2947 clist = clist->next;
2948
2949 WARN_ON(atomic_read(&skb->users));
2950 trace_kfree_skb(skb, net_tx_action);
2951 __kfree_skb(skb);
2952 }
2953 }
2954
2955 if (sd->output_queue) {
2956 struct Qdisc *head;
2957
2958 local_irq_disable();
2959 head = sd->output_queue;
2960 sd->output_queue = NULL;
2961 sd->output_queue_tailp = &sd->output_queue;
2962 local_irq_enable();
2963
2964 while (head) {
2965 struct Qdisc *q = head;
2966 spinlock_t *root_lock;
2967
2968 head = head->next_sched;
2969
2970 root_lock = qdisc_lock(q);
2971 if (spin_trylock(root_lock)) {
2972 smp_mb__before_clear_bit();
2973 clear_bit(__QDISC_STATE_SCHED,
2974 &q->state);
2975 qdisc_run(q);
2976 spin_unlock(root_lock);
2977 } else {
2978 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2979 &q->state)) {
2980 __netif_reschedule(q);
2981 } else {
2982 smp_mb__before_clear_bit();
2983 clear_bit(__QDISC_STATE_SCHED,
2984 &q->state);
2985 }
2986 }
2987 }
2988 }
2989}
2990
2991#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2992 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2993/* This hook is defined here for ATM LANE */
2994int (*br_fdb_test_addr_hook)(struct net_device *dev,
2995 unsigned char *addr) __read_mostly;
2996EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2997#endif
2998
2999#ifdef CONFIG_NET_CLS_ACT
3000/* TODO: Maybe we should just force sch_ingress to be compiled in
3001 * when CONFIG_NET_CLS_ACT is? Otherwise we currently pay for some useless
3002 * instructions (a compare and two extra stores) when ingress is not
3003 * configured but CONFIG_NET_CLS_ACT is.
3004 * NOTE: This doesn't remove any functionality; if you don't have
3005 * the ingress scheduler, you just can't add policies on ingress.
3006 *
3007 */
3008static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3009{
3010 struct net_device *dev = skb->dev;
3011 u32 ttl = G_TC_RTTL(skb->tc_verd);
3012 int result = TC_ACT_OK;
3013 struct Qdisc *q;
3014
3015 if (unlikely(MAX_RED_LOOP < ttl++)) {
3016 if (net_ratelimit())
3017 pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3018 skb->skb_iif, dev->ifindex);
3019 return TC_ACT_SHOT;
3020 }
3021
3022 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3023 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3024
3025 q = rxq->qdisc;
3026 if (q != &noop_qdisc) {
3027 spin_lock(qdisc_lock(q));
3028 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3029 result = qdisc_enqueue_root(skb, q);
3030 spin_unlock(qdisc_lock(q));
3031 }
3032
3033 return result;
3034}
3035
3036static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3037 struct packet_type **pt_prev,
3038 int *ret, struct net_device *orig_dev)
3039{
3040 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3041
3042 if (!rxq || rxq->qdisc == &noop_qdisc)
3043 goto out;
3044
3045 if (*pt_prev) {
3046 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3047 *pt_prev = NULL;
3048 }
3049
3050 switch (ing_filter(skb, rxq)) {
3051 case TC_ACT_SHOT:
3052 case TC_ACT_STOLEN:
3053 kfree_skb(skb);
3054 return NULL;
3055 }
3056
3057out:
3058 skb->tc_verd = 0;
3059 return skb;
3060}
3061#endif
3062
3063/**
3064 * netdev_rx_handler_register - register receive handler
3065 * @dev: device to register a handler for
3066 * @rx_handler: receive handler to register
3067 * @rx_handler_data: data pointer that is used by rx handler
3068 *
3069 * Register a receive handler for a device. This handler will then be
3070 * called from __netif_receive_skb. A negative errno code is returned
3071 * on a failure.
3072 *
3073 * The caller must hold the rtnl_mutex.
3074 *
3075 * For a general description of rx_handler, see enum rx_handler_result.
3076 */
3077int netdev_rx_handler_register(struct net_device *dev,
3078 rx_handler_func_t *rx_handler,
3079 void *rx_handler_data)
3080{
3081 ASSERT_RTNL();
3082
3083 if (dev->rx_handler)
3084 return -EBUSY;
3085
3086 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3087 rcu_assign_pointer(dev->rx_handler, rx_handler);
3088
3089 return 0;
3090}
3091EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
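/*
 * Usage note: a minimal, hypothetical sketch of how a stacked device
 * (bridge- or macvlan-style) might attach itself to a lower device;
 * my_rx_handler and struct my_port are illustrative only:
 *
 *	static rx_handler_result_t my_rx_handler(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		... consume, re-route or simply pass the packet ...
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(lower_dev, my_rx_handler, port);
 *	rtnl_unlock();
 */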
3092
3093/**
3094 * netdev_rx_handler_unregister - unregister receive handler
3095 * @dev: device to unregister a handler from
3096 *
3097 * Unregister a receive handler from a device.
3098 *
3099 * The caller must hold the rtnl_mutex.
3100 */
3101void netdev_rx_handler_unregister(struct net_device *dev)
3102{
3103
3104 ASSERT_RTNL();
3105 rcu_assign_pointer(dev->rx_handler, NULL);
3106 rcu_assign_pointer(dev->rx_handler_data, NULL);
3107}
3108EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3109
3110static int __netif_receive_skb(struct sk_buff *skb)
3111{
3112 struct packet_type *ptype, *pt_prev;
3113 rx_handler_func_t *rx_handler;
3114 struct net_device *orig_dev;
3115 struct net_device *null_or_dev;
3116 bool deliver_exact = false;
3117 int ret = NET_RX_DROP;
3118 __be16 type;
3119
3120 if (!netdev_tstamp_prequeue)
3121 net_timestamp_check(skb);
3122
3123 trace_netif_receive_skb(skb);
3124
3125 /* if we've gotten here through NAPI, check netpoll */
3126 if (netpoll_receive_skb(skb))
3127 return NET_RX_DROP;
3128
3129 if (!skb->skb_iif)
3130 skb->skb_iif = skb->dev->ifindex;
3131 orig_dev = skb->dev;
3132
3133 skb_reset_network_header(skb);
3134 skb_reset_transport_header(skb);
3135 skb_reset_mac_len(skb);
3136
3137 pt_prev = NULL;
3138
3139 rcu_read_lock();
3140
3141another_round:
3142
3143 __this_cpu_inc(softnet_data.processed);
3144
3145 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3146 skb = vlan_untag(skb);
3147 if (unlikely(!skb))
3148 goto out;
3149 }
3150
3151#ifdef CONFIG_NET_CLS_ACT
3152 if (skb->tc_verd & TC_NCLS) {
3153 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3154 goto ncls;
3155 }
3156#endif
3157
3158 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3159 if (!ptype->dev || ptype->dev == skb->dev) {
3160 if (pt_prev)
3161 ret = deliver_skb(skb, pt_prev, orig_dev);
3162 pt_prev = ptype;
3163 }
3164 }
3165
3166#ifdef CONFIG_NET_CLS_ACT
3167 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3168 if (!skb)
3169 goto out;
3170ncls:
3171#endif
3172
3173 rx_handler = rcu_dereference(skb->dev->rx_handler);
3174 if (rx_handler) {
3175 if (pt_prev) {
3176 ret = deliver_skb(skb, pt_prev, orig_dev);
3177 pt_prev = NULL;
3178 }
3179 switch (rx_handler(&skb)) {
3180 case RX_HANDLER_CONSUMED:
3181 goto out;
3182 case RX_HANDLER_ANOTHER:
3183 goto another_round;
3184 case RX_HANDLER_EXACT:
3185 deliver_exact = true;
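		/* fall through */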
3186 case RX_HANDLER_PASS:
3187 break;
3188 default:
3189 BUG();
3190 }
3191 }
3192
3193 if (vlan_tx_tag_present(skb)) {
3194 if (pt_prev) {
3195 ret = deliver_skb(skb, pt_prev, orig_dev);
3196 pt_prev = NULL;
3197 }
3198 if (vlan_do_receive(&skb)) {
3199 ret = __netif_receive_skb(skb);
3200 goto out;
3201 } else if (unlikely(!skb))
3202 goto out;
3203 }
3204
3205 /* deliver only exact match when indicated */
3206 null_or_dev = deliver_exact ? skb->dev : NULL;
3207
3208 type = skb->protocol;
3209 list_for_each_entry_rcu(ptype,
3210 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3211 if (ptype->type == type &&
3212 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3213 ptype->dev == orig_dev)) {
3214 if (pt_prev)
3215 ret = deliver_skb(skb, pt_prev, orig_dev);
3216 pt_prev = ptype;
3217 }
3218 }
3219
3220 if (pt_prev) {
3221 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3222 } else {
3223 atomic_long_inc(&skb->dev->rx_dropped);
3224 kfree_skb(skb);
3225 /* Jamal, now you will not be able to escape explaining
3226 * to me how you were going to use this. :-)
3227 */
3228 ret = NET_RX_DROP;
3229 }
3230
3231out:
3232 rcu_read_unlock();
3233 return ret;
3234}
3235
3236/**
3237 * netif_receive_skb - process receive buffer from network
3238 * @skb: buffer to process
3239 *
3240 * netif_receive_skb() is the main receive data processing function.
3241 * It always succeeds. The buffer may be dropped during processing
3242 * for congestion control or by the protocol layers.
3243 *
3244 * This function may only be called from softirq context and interrupts
3245 * should be enabled.
3246 *
3247 * Return values (usually ignored):
3248 * NET_RX_SUCCESS: no congestion
3249 * NET_RX_DROP: packet was dropped
3250 */
3251int netif_receive_skb(struct sk_buff *skb)
3252{
3253 if (netdev_tstamp_prequeue)
3254 net_timestamp_check(skb);
3255
3256 if (skb_defer_rx_timestamp(skb))
3257 return NET_RX_SUCCESS;
3258
3259#ifdef CONFIG_RPS
3260 {
3261 struct rps_dev_flow voidflow, *rflow = &voidflow;
3262 int cpu, ret;
3263
3264 rcu_read_lock();
3265
3266 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3267
3268 if (cpu >= 0) {
3269 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3270 rcu_read_unlock();
3271 } else {
3272 rcu_read_unlock();
3273 ret = __netif_receive_skb(skb);
3274 }
3275
3276 return ret;
3277 }
3278#else
3279 return __netif_receive_skb(skb);
3280#endif
3281}
3282EXPORT_SYMBOL(netif_receive_skb);
3283
3284/* Network device is going away, flush any packets still pending.
3285 * Called with irqs disabled.
3286 */
3287static void flush_backlog(void *arg)
3288{
3289 struct net_device *dev = arg;
3290 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3291 struct sk_buff *skb, *tmp;
3292
3293 rps_lock(sd);
3294 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3295 if (skb->dev == dev) {
3296 __skb_unlink(skb, &sd->input_pkt_queue);
3297 kfree_skb(skb);
3298 input_queue_head_incr(sd);
3299 }
3300 }
3301 rps_unlock(sd);
3302
3303 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3304 if (skb->dev == dev) {
3305 __skb_unlink(skb, &sd->process_queue);
3306 kfree_skb(skb);
3307 input_queue_head_incr(sd);
3308 }
3309 }
3310}
3311
3312static int napi_gro_complete(struct sk_buff *skb)
3313{
3314 struct packet_type *ptype;
3315 __be16 type = skb->protocol;
3316 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3317 int err = -ENOENT;
3318
3319 if (NAPI_GRO_CB(skb)->count == 1) {
3320 skb_shinfo(skb)->gso_size = 0;
3321 goto out;
3322 }
3323
3324 rcu_read_lock();
3325 list_for_each_entry_rcu(ptype, head, list) {
3326 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3327 continue;
3328
3329 err = ptype->gro_complete(skb);
3330 break;
3331 }
3332 rcu_read_unlock();
3333
3334 if (err) {
3335 WARN_ON(&ptype->list == head);
3336 kfree_skb(skb);
3337 return NET_RX_SUCCESS;
3338 }
3339
3340out:
3341 return netif_receive_skb(skb);
3342}
3343
3344inline void napi_gro_flush(struct napi_struct *napi)
3345{
3346 struct sk_buff *skb, *next;
3347
3348 for (skb = napi->gro_list; skb; skb = next) {
3349 next = skb->next;
3350 skb->next = NULL;
3351 napi_gro_complete(skb);
3352 }
3353
3354 napi->gro_count = 0;
3355 napi->gro_list = NULL;
3356}
3357EXPORT_SYMBOL(napi_gro_flush);
3358
3359enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3360{
3361 struct sk_buff **pp = NULL;
3362 struct packet_type *ptype;
3363 __be16 type = skb->protocol;
3364 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3365 int same_flow;
3366 int mac_len;
3367 enum gro_result ret;
3368
3369 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3370 goto normal;
3371
3372 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3373 goto normal;
3374
3375 rcu_read_lock();
3376 list_for_each_entry_rcu(ptype, head, list) {
3377 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3378 continue;
3379
3380 skb_set_network_header(skb, skb_gro_offset(skb));
3381 mac_len = skb->network_header - skb->mac_header;
3382 skb->mac_len = mac_len;
3383 NAPI_GRO_CB(skb)->same_flow = 0;
3384 NAPI_GRO_CB(skb)->flush = 0;
3385 NAPI_GRO_CB(skb)->free = 0;
3386
3387 pp = ptype->gro_receive(&napi->gro_list, skb);
3388 break;
3389 }
3390 rcu_read_unlock();
3391
3392 if (&ptype->list == head)
3393 goto normal;
3394
3395 same_flow = NAPI_GRO_CB(skb)->same_flow;
3396 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3397
3398 if (pp) {
3399 struct sk_buff *nskb = *pp;
3400
3401 *pp = nskb->next;
3402 nskb->next = NULL;
3403 napi_gro_complete(nskb);
3404 napi->gro_count--;
3405 }
3406
3407 if (same_flow)
3408 goto ok;
3409
3410 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3411 goto normal;
3412
3413 napi->gro_count++;
3414 NAPI_GRO_CB(skb)->count = 1;
3415 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3416 skb->next = napi->gro_list;
3417 napi->gro_list = skb;
3418 ret = GRO_HELD;
3419
3420pull:
3421 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3422 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3423
3424 BUG_ON(skb->end - skb->tail < grow);
3425
3426 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3427
3428 skb->tail += grow;
3429 skb->data_len -= grow;
3430
3431 skb_shinfo(skb)->frags[0].page_offset += grow;
3432 skb_shinfo(skb)->frags[0].size -= grow;
3433
3434 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3435 put_page(skb_shinfo(skb)->frags[0].page);
3436 memmove(skb_shinfo(skb)->frags,
3437 skb_shinfo(skb)->frags + 1,
3438 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3439 }
3440 }
3441
3442ok:
3443 return ret;
3444
3445normal:
3446 ret = GRO_NORMAL;
3447 goto pull;
3448}
3449EXPORT_SYMBOL(dev_gro_receive);
3450
3451static inline gro_result_t
3452__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3453{
3454 struct sk_buff *p;
3455
3456 for (p = napi->gro_list; p; p = p->next) {
3457 unsigned long diffs;
3458
3459 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3460 diffs |= p->vlan_tci ^ skb->vlan_tci;
3461 diffs |= compare_ether_header(skb_mac_header(p),
3462 skb_gro_mac_header(skb));
3463 NAPI_GRO_CB(p)->same_flow = !diffs;
3464 NAPI_GRO_CB(p)->flush = 0;
3465 }
3466
3467 return dev_gro_receive(napi, skb);
3468}
3469
3470gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3471{
3472 switch (ret) {
3473 case GRO_NORMAL:
3474 if (netif_receive_skb(skb))
3475 ret = GRO_DROP;
3476 break;
3477
3478 case GRO_DROP:
3479 case GRO_MERGED_FREE:
3480 kfree_skb(skb);
3481 break;
3482
3483 case GRO_HELD:
3484 case GRO_MERGED:
3485 break;
3486 }
3487
3488 return ret;
3489}
3490EXPORT_SYMBOL(napi_skb_finish);
3491
3492void skb_gro_reset_offset(struct sk_buff *skb)
3493{
3494 NAPI_GRO_CB(skb)->data_offset = 0;
3495 NAPI_GRO_CB(skb)->frag0 = NULL;
3496 NAPI_GRO_CB(skb)->frag0_len = 0;
3497
3498 if (skb->mac_header == skb->tail &&
3499 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3500 NAPI_GRO_CB(skb)->frag0 =
3501 page_address(skb_shinfo(skb)->frags[0].page) +
3502 skb_shinfo(skb)->frags[0].page_offset;
3503 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3504 }
3505}
3506EXPORT_SYMBOL(skb_gro_reset_offset);
3507
3508gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3509{
3510 skb_gro_reset_offset(skb);
3511
3512 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3513}
3514EXPORT_SYMBOL(napi_gro_receive);
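/*
 * Usage note: a minimal, hypothetical sketch of a NAPI poll routine feeding
 * received frames through GRO; struct my_priv, my_rx_ring_next() and
 * my_irq_enable() are illustrative only (registration is sketched after
 * netif_napi_add() below):
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_priv *priv = container_of(napi, struct my_priv, napi);
 *		struct sk_buff *skb;
 *		int work_done = 0;
 *
 *		while (work_done < budget && (skb = my_rx_ring_next(priv))) {
 *			skb->protocol = eth_type_trans(skb, priv->netdev);
 *			napi_gro_receive(napi, skb);
 *			work_done++;
 *		}
 *		if (work_done < budget) {
 *			napi_complete(napi);
 *			my_irq_enable(priv);
 *		}
 *		return work_done;
 *	}
 */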
3515
3516static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3517{
3518 __skb_pull(skb, skb_headlen(skb));
3519 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3520 skb->vlan_tci = 0;
3521 skb->dev = napi->dev;
3522 skb->skb_iif = 0;
3523
3524 napi->skb = skb;
3525}
3526
3527struct sk_buff *napi_get_frags(struct napi_struct *napi)
3528{
3529 struct sk_buff *skb = napi->skb;
3530
3531 if (!skb) {
3532 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3533 if (skb)
3534 napi->skb = skb;
3535 }
3536 return skb;
3537}
3538EXPORT_SYMBOL(napi_get_frags);
3539
3540gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3541 gro_result_t ret)
3542{
3543 switch (ret) {
3544 case GRO_NORMAL:
3545 case GRO_HELD:
3546 skb->protocol = eth_type_trans(skb, skb->dev);
3547
3548 if (ret == GRO_HELD)
3549 skb_gro_pull(skb, -ETH_HLEN);
3550 else if (netif_receive_skb(skb))
3551 ret = GRO_DROP;
3552 break;
3553
3554 case GRO_DROP:
3555 case GRO_MERGED_FREE:
3556 napi_reuse_skb(napi, skb);
3557 break;
3558
3559 case GRO_MERGED:
3560 break;
3561 }
3562
3563 return ret;
3564}
3565EXPORT_SYMBOL(napi_frags_finish);
3566
3567struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3568{
3569 struct sk_buff *skb = napi->skb;
3570 struct ethhdr *eth;
3571 unsigned int hlen;
3572 unsigned int off;
3573
3574 napi->skb = NULL;
3575
3576 skb_reset_mac_header(skb);
3577 skb_gro_reset_offset(skb);
3578
3579 off = skb_gro_offset(skb);
3580 hlen = off + sizeof(*eth);
3581 eth = skb_gro_header_fast(skb, off);
3582 if (skb_gro_header_hard(skb, hlen)) {
3583 eth = skb_gro_header_slow(skb, hlen, off);
3584 if (unlikely(!eth)) {
3585 napi_reuse_skb(napi, skb);
3586 skb = NULL;
3587 goto out;
3588 }
3589 }
3590
3591 skb_gro_pull(skb, sizeof(*eth));
3592
3593 /*
3594 * This works because the only protocols we care about don't require
3595 * special handling. We'll fix it up properly at the end.
3596 */
3597 skb->protocol = eth->h_proto;
3598
3599out:
3600 return skb;
3601}
3602EXPORT_SYMBOL(napi_frags_skb);
3603
3604gro_result_t napi_gro_frags(struct napi_struct *napi)
3605{
3606 struct sk_buff *skb = napi_frags_skb(napi);
3607
3608 if (!skb)
3609 return GRO_DROP;
3610
3611 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3612}
3613EXPORT_SYMBOL(napi_gro_frags);
3614
3615/*
3616 * net_rps_action sends any pending IPIs for RPS.
3617 * Note: called with local irq disabled, but exits with local irq enabled.
3618 */
3619static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3620{
3621#ifdef CONFIG_RPS
3622 struct softnet_data *remsd = sd->rps_ipi_list;
3623
3624 if (remsd) {
3625 sd->rps_ipi_list = NULL;
3626
3627 local_irq_enable();
3628
3629 /* Send pending IPIs to kick RPS processing on remote CPUs. */
3630 while (remsd) {
3631 struct softnet_data *next = remsd->rps_ipi_next;
3632
3633 if (cpu_online(remsd->cpu))
3634 __smp_call_function_single(remsd->cpu,
3635 &remsd->csd, 0);
3636 remsd = next;
3637 }
3638 } else
3639#endif
3640 local_irq_enable();
3641}
3642
3643static int process_backlog(struct napi_struct *napi, int quota)
3644{
3645 int work = 0;
3646 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3647
3648#ifdef CONFIG_RPS
3649 /* Check if we have pending IPIs; it's better to send them now
3650 * rather than waiting for net_rx_action() to end.
3651 */
3652 if (sd->rps_ipi_list) {
3653 local_irq_disable();
3654 net_rps_action_and_irq_enable(sd);
3655 }
3656#endif
3657 napi->weight = weight_p;
3658 local_irq_disable();
3659 while (work < quota) {
3660 struct sk_buff *skb;
3661 unsigned int qlen;
3662
3663 while ((skb = __skb_dequeue(&sd->process_queue))) {
3664 local_irq_enable();
3665 __netif_receive_skb(skb);
3666 local_irq_disable();
3667 input_queue_head_incr(sd);
3668 if (++work >= quota) {
3669 local_irq_enable();
3670 return work;
3671 }
3672 }
3673
3674 rps_lock(sd);
3675 qlen = skb_queue_len(&sd->input_pkt_queue);
3676 if (qlen)
3677 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3678 &sd->process_queue);
3679
3680 if (qlen < quota - work) {
3681 /*
3682 * Inline a custom version of __napi_complete().
3683 * Only the current CPU owns and manipulates this napi,
3684 * and NAPI_STATE_SCHED is the only possible flag set on the backlog,
3685 * so we can use a plain write instead of clear_bit()
3686 * and we don't need an smp_mb() memory barrier.
3687 */
3688 list_del(&napi->poll_list);
3689 napi->state = 0;
3690
3691 quota = work + qlen;
3692 }
3693 rps_unlock(sd);
3694 }
3695 local_irq_enable();
3696
3697 return work;
3698}
3699
3700/**
3701 * __napi_schedule - schedule for receive
3702 * @n: entry to schedule
3703 *
3704 * The entry's receive function will be scheduled to run.
3705 */
3706void __napi_schedule(struct napi_struct *n)
3707{
3708 unsigned long flags;
3709
3710 local_irq_save(flags);
3711 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3712 local_irq_restore(flags);
3713}
3714EXPORT_SYMBOL(__napi_schedule);
3715
3716void __napi_complete(struct napi_struct *n)
3717{
3718 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3719 BUG_ON(n->gro_list);
3720
3721 list_del(&n->poll_list);
3722 smp_mb__before_clear_bit();
3723 clear_bit(NAPI_STATE_SCHED, &n->state);
3724}
3725EXPORT_SYMBOL(__napi_complete);
3726
3727void napi_complete(struct napi_struct *n)
3728{
3729 unsigned long flags;
3730
3731 /*
3732 * don't let napi dequeue from the CPU poll list,
3733 * just in case it's running on a different CPU
3734 */
3735 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3736 return;
3737
3738 napi_gro_flush(n);
3739 local_irq_save(flags);
3740 __napi_complete(n);
3741 local_irq_restore(flags);
3742}
3743EXPORT_SYMBOL(napi_complete);
3744
3745void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3746 int (*poll)(struct napi_struct *, int), int weight)
3747{
3748 INIT_LIST_HEAD(&napi->poll_list);
3749 napi->gro_count = 0;
3750 napi->gro_list = NULL;
3751 napi->skb = NULL;
3752 napi->poll = poll;
3753 napi->weight = weight;
3754 list_add(&napi->dev_list, &dev->napi_list);
3755 napi->dev = dev;
3756#ifdef CONFIG_NETPOLL
3757 spin_lock_init(&napi->poll_lock);
3758 napi->poll_owner = -1;
3759#endif
3760 set_bit(NAPI_STATE_SCHED, &napi->state);
3761}
3762EXPORT_SYMBOL(netif_napi_add);
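/*
 * Usage note: a minimal, hypothetical registration sketch matching the poll
 * routine outlined above napi_gro_receive(); struct my_priv is illustrative
 * only:
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, 64);
 *	...
 *	napi_enable(&priv->napi);	(typically from the driver's ndo_open)
 */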
3763
3764void netif_napi_del(struct napi_struct *napi)
3765{
3766 struct sk_buff *skb, *next;
3767
3768 list_del_init(&napi->dev_list);
3769 napi_free_frags(napi);
3770
3771 for (skb = napi->gro_list; skb; skb = next) {
3772 next = skb->next;
3773 skb->next = NULL;
3774 kfree_skb(skb);
3775 }
3776
3777 napi->gro_list = NULL;
3778 napi->gro_count = 0;
3779}
3780EXPORT_SYMBOL(netif_napi_del);
3781
3782static void net_rx_action(struct softirq_action *h)
3783{
3784 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3785 unsigned long time_limit = jiffies + 2;
3786 int budget = netdev_budget;
3787 void *have;
3788
3789 local_irq_disable();
3790
3791 while (!list_empty(&sd->poll_list)) {
3792 struct napi_struct *n;
3793 int work, weight;
3794
3795 /* If the softirq window is exhausted then punt.
3796 * Allow this to run for 2 jiffies, which allows
3797 * an average latency of 1.5/HZ.
3798 */
3799 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3800 goto softnet_break;
3801
3802 local_irq_enable();
3803
3804 /* Even though interrupts have been re-enabled, this
3805 * access is safe because interrupts can only add new
3806 * entries to the tail of this list, and only ->poll()
3807 * calls can remove this head entry from the list.
3808 */
3809 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3810
3811 have = netpoll_poll_lock(n);
3812
3813 weight = n->weight;
3814
3815 /* This NAPI_STATE_SCHED test is for avoiding a race
3816 * with netpoll's poll_napi(). Only the entity which
3817 * obtains the lock and sees NAPI_STATE_SCHED set will
3818 * actually make the ->poll() call. Therefore we avoid
3819 * accidentally calling ->poll() when NAPI is not scheduled.
3820 */
3821 work = 0;
3822 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3823 work = n->poll(n, weight);
3824 trace_napi_poll(n);
3825 }
3826
3827 WARN_ON_ONCE(work > weight);
3828
3829 budget -= work;
3830
3831 local_irq_disable();
3832
3833 /* Drivers must not modify the NAPI state if they
3834 * consume the entire weight. In such cases this code
3835 * still "owns" the NAPI instance and therefore can
3836 * move the instance around on the list at-will.
3837 */
3838 if (unlikely(work == weight)) {
3839 if (unlikely(napi_disable_pending(n))) {
3840 local_irq_enable();
3841 napi_complete(n);
3842 local_irq_disable();
3843 } else
3844 list_move_tail(&n->poll_list, &sd->poll_list);
3845 }
3846
3847 netpoll_poll_unlock(have);
3848 }
3849out:
3850 net_rps_action_and_irq_enable(sd);
3851
3852#ifdef CONFIG_NET_DMA
3853 /*
3854 * There may not be any more sk_buffs coming right now, so push
3855 * any pending DMA copies to hardware
3856 */
3857 dma_issue_pending_all();
3858#endif
3859
3860 return;
3861
3862softnet_break:
3863 sd->time_squeeze++;
3864 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3865 goto out;
3866}
3867
3868static gifconf_func_t *gifconf_list[NPROTO];
3869
3870/**
3871 * register_gifconf - register a SIOCGIF handler
3872 * @family: Address family
3873 * @gifconf: Function handler
3874 *
3875 * Register protocol dependent address dumping routines. The handler
3876 * that is passed must not be freed or reused until it has been replaced
3877 * by another handler.
3878 */
3879int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3880{
3881 if (family >= NPROTO)
3882 return -EINVAL;
3883 gifconf_list[family] = gifconf;
3884 return 0;
3885}
3886EXPORT_SYMBOL(register_gifconf);
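/*
 * Illustrative sketch (editorial): a protocol registers its dumper once at
 * init time.  The handler name below is hypothetical; per the gifconf_func_t
 * contract used by dev_ifconf() below, it returns the number of bytes
 * written (or that would be needed when the buffer pointer is NULL), or a
 * negative value on fault.
 *
 *	static int my_gifconf(struct net_device *dev, char __user *buf, int len)
 *	{
 *		// when buf is NULL, only report how much space is needed
 *		...
 *	}
 *
 *	register_gifconf(PF_INET, my_gifconf);
 */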
3887
3888
3889/*
3890 * Map an interface index to its name (SIOCGIFNAME)
3891 */
3892
3893/*
3894 * We need this ioctl for efficient implementation of the
3895 * if_indextoname() function required by the IPv6 API. Without
3896 * it, we would have to search all the interfaces to find a
3897 * match. --pb
3898 */
3899
3900static int dev_ifname(struct net *net, struct ifreq __user *arg)
3901{
3902 struct net_device *dev;
3903 struct ifreq ifr;
3904
3905 /*
3906 * Fetch the caller's info block.
3907 */
3908
3909 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3910 return -EFAULT;
3911
3912 rcu_read_lock();
3913 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3914 if (!dev) {
3915 rcu_read_unlock();
3916 return -ENODEV;
3917 }
3918
3919 strcpy(ifr.ifr_name, dev->name);
3920 rcu_read_unlock();
3921
3922 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3923 return -EFAULT;
3924 return 0;
3925}
3926
3927/*
3928 * Perform a SIOCGIFCONF call. This structure will change
3929 * size eventually, and there is nothing I can do about it.
3930 * Thus we will need a 'compatibility mode'.
3931 */
3932
3933static int dev_ifconf(struct net *net, char __user *arg)
3934{
3935 struct ifconf ifc;
3936 struct net_device *dev;
3937 char __user *pos;
3938 int len;
3939 int total;
3940 int i;
3941
3942 /*
3943 * Fetch the caller's info block.
3944 */
3945
3946 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3947 return -EFAULT;
3948
3949 pos = ifc.ifc_buf;
3950 len = ifc.ifc_len;
3951
3952 /*
3953 * Loop over the interfaces, and write an info block for each.
3954 */
3955
3956 total = 0;
3957 for_each_netdev(net, dev) {
3958 for (i = 0; i < NPROTO; i++) {
3959 if (gifconf_list[i]) {
3960 int done;
3961 if (!pos)
3962 done = gifconf_list[i](dev, NULL, 0);
3963 else
3964 done = gifconf_list[i](dev, pos + total,
3965 len - total);
3966 if (done < 0)
3967 return -EFAULT;
3968 total += done;
3969 }
3970 }
3971 }
3972
3973 /*
3974 * All done. Write the updated control block back to the caller.
3975 */
3976 ifc.ifc_len = total;
3977
3978 /*
3979 * Both BSD and Solaris return 0 here, so we do too.
3980 */
3981 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3982}
3983
3984#ifdef CONFIG_PROC_FS
3985/*
3986 * This is invoked by the /proc filesystem handler to display a device
3987 * in detail.
3988 */
3989void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3990 __acquires(RCU)
3991{
3992 struct net *net = seq_file_net(seq);
3993 loff_t off;
3994 struct net_device *dev;
3995
3996 rcu_read_lock();
3997 if (!*pos)
3998 return SEQ_START_TOKEN;
3999
4000 off = 1;
4001 for_each_netdev_rcu(net, dev)
4002 if (off++ == *pos)
4003 return dev;
4004
4005 return NULL;
4006}
4007
4008void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4009{
4010 struct net_device *dev = v;
4011
4012 if (v == SEQ_START_TOKEN)
4013 dev = first_net_device_rcu(seq_file_net(seq));
4014 else
4015 dev = next_net_device_rcu(dev);
4016
4017 ++*pos;
4018 return dev;
4019}
4020
4021void dev_seq_stop(struct seq_file *seq, void *v)
4022 __releases(RCU)
4023{
4024 rcu_read_unlock();
4025}
4026
4027static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4028{
4029 struct rtnl_link_stats64 temp;
4030 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4031
4032 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4033 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4034 dev->name, stats->rx_bytes, stats->rx_packets,
4035 stats->rx_errors,
4036 stats->rx_dropped + stats->rx_missed_errors,
4037 stats->rx_fifo_errors,
4038 stats->rx_length_errors + stats->rx_over_errors +
4039 stats->rx_crc_errors + stats->rx_frame_errors,
4040 stats->rx_compressed, stats->multicast,
4041 stats->tx_bytes, stats->tx_packets,
4042 stats->tx_errors, stats->tx_dropped,
4043 stats->tx_fifo_errors, stats->collisions,
4044 stats->tx_carrier_errors +
4045 stats->tx_aborted_errors +
4046 stats->tx_window_errors +
4047 stats->tx_heartbeat_errors,
4048 stats->tx_compressed);
4049}
4050
4051/*
4052 * Called from the PROCfs module. This now uses the new arbitrary sized
4053 * /proc/net interface to create /proc/net/dev
4054 */
4055static int dev_seq_show(struct seq_file *seq, void *v)
4056{
4057 if (v == SEQ_START_TOKEN)
4058 seq_puts(seq, "Inter-| Receive "
4059 " | Transmit\n"
4060 " face |bytes packets errs drop fifo frame "
4061 "compressed multicast|bytes packets errs "
4062 "drop fifo colls carrier compressed\n");
4063 else
4064 dev_seq_printf_stats(seq, v);
4065 return 0;
4066}
4067
4068static struct softnet_data *softnet_get_online(loff_t *pos)
4069{
4070 struct softnet_data *sd = NULL;
4071
4072 while (*pos < nr_cpu_ids)
4073 if (cpu_online(*pos)) {
4074 sd = &per_cpu(softnet_data, *pos);
4075 break;
4076 } else
4077 ++*pos;
4078 return sd;
4079}
4080
4081static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4082{
4083 return softnet_get_online(pos);
4084}
4085
4086static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4087{
4088 ++*pos;
4089 return softnet_get_online(pos);
4090}
4091
4092static void softnet_seq_stop(struct seq_file *seq, void *v)
4093{
4094}
4095
4096static int softnet_seq_show(struct seq_file *seq, void *v)
4097{
4098 struct softnet_data *sd = v;
4099
4100 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4101 sd->processed, sd->dropped, sd->time_squeeze, 0,
4102 0, 0, 0, 0, /* was fastroute */
4103 sd->cpu_collision, sd->received_rps);
4104 return 0;
4105}
4106
4107static const struct seq_operations dev_seq_ops = {
4108 .start = dev_seq_start,
4109 .next = dev_seq_next,
4110 .stop = dev_seq_stop,
4111 .show = dev_seq_show,
4112};
4113
4114static int dev_seq_open(struct inode *inode, struct file *file)
4115{
4116 return seq_open_net(inode, file, &dev_seq_ops,
4117 sizeof(struct seq_net_private));
4118}
4119
4120static const struct file_operations dev_seq_fops = {
4121 .owner = THIS_MODULE,
4122 .open = dev_seq_open,
4123 .read = seq_read,
4124 .llseek = seq_lseek,
4125 .release = seq_release_net,
4126};
4127
4128static const struct seq_operations softnet_seq_ops = {
4129 .start = softnet_seq_start,
4130 .next = softnet_seq_next,
4131 .stop = softnet_seq_stop,
4132 .show = softnet_seq_show,
4133};
4134
4135static int softnet_seq_open(struct inode *inode, struct file *file)
4136{
4137 return seq_open(file, &softnet_seq_ops);
4138}
4139
4140static const struct file_operations softnet_seq_fops = {
4141 .owner = THIS_MODULE,
4142 .open = softnet_seq_open,
4143 .read = seq_read,
4144 .llseek = seq_lseek,
4145 .release = seq_release,
4146};
4147
4148static void *ptype_get_idx(loff_t pos)
4149{
4150 struct packet_type *pt = NULL;
4151 loff_t i = 0;
4152 int t;
4153
4154 list_for_each_entry_rcu(pt, &ptype_all, list) {
4155 if (i == pos)
4156 return pt;
4157 ++i;
4158 }
4159
4160 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4161 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4162 if (i == pos)
4163 return pt;
4164 ++i;
4165 }
4166 }
4167 return NULL;
4168}
4169
4170static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4171 __acquires(RCU)
4172{
4173 rcu_read_lock();
4174 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4175}
4176
4177static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4178{
4179 struct packet_type *pt;
4180 struct list_head *nxt;
4181 int hash;
4182
4183 ++*pos;
4184 if (v == SEQ_START_TOKEN)
4185 return ptype_get_idx(0);
4186
4187 pt = v;
4188 nxt = pt->list.next;
4189 if (pt->type == htons(ETH_P_ALL)) {
4190 if (nxt != &ptype_all)
4191 goto found;
4192 hash = 0;
4193 nxt = ptype_base[0].next;
4194 } else
4195 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4196
4197 while (nxt == &ptype_base[hash]) {
4198 if (++hash >= PTYPE_HASH_SIZE)
4199 return NULL;
4200 nxt = ptype_base[hash].next;
4201 }
4202found:
4203 return list_entry(nxt, struct packet_type, list);
4204}
4205
4206static void ptype_seq_stop(struct seq_file *seq, void *v)
4207 __releases(RCU)
4208{
4209 rcu_read_unlock();
4210}
4211
4212static int ptype_seq_show(struct seq_file *seq, void *v)
4213{
4214 struct packet_type *pt = v;
4215
4216 if (v == SEQ_START_TOKEN)
4217 seq_puts(seq, "Type Device Function\n");
4218 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4219 if (pt->type == htons(ETH_P_ALL))
4220 seq_puts(seq, "ALL ");
4221 else
4222 seq_printf(seq, "%04x", ntohs(pt->type));
4223
4224 seq_printf(seq, " %-8s %pF\n",
4225 pt->dev ? pt->dev->name : "", pt->func);
4226 }
4227
4228 return 0;
4229}
4230
4231static const struct seq_operations ptype_seq_ops = {
4232 .start = ptype_seq_start,
4233 .next = ptype_seq_next,
4234 .stop = ptype_seq_stop,
4235 .show = ptype_seq_show,
4236};
4237
4238static int ptype_seq_open(struct inode *inode, struct file *file)
4239{
4240 return seq_open_net(inode, file, &ptype_seq_ops,
4241 sizeof(struct seq_net_private));
4242}
4243
4244static const struct file_operations ptype_seq_fops = {
4245 .owner = THIS_MODULE,
4246 .open = ptype_seq_open,
4247 .read = seq_read,
4248 .llseek = seq_lseek,
4249 .release = seq_release_net,
4250};
4251
4252
4253static int __net_init dev_proc_net_init(struct net *net)
4254{
4255 int rc = -ENOMEM;
4256
4257 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4258 goto out;
4259 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4260 goto out_dev;
4261 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4262 goto out_softnet;
4263
4264 if (wext_proc_init(net))
4265 goto out_ptype;
4266 rc = 0;
4267out:
4268 return rc;
4269out_ptype:
4270 proc_net_remove(net, "ptype");
4271out_softnet:
4272 proc_net_remove(net, "softnet_stat");
4273out_dev:
4274 proc_net_remove(net, "dev");
4275 goto out;
4276}
4277
4278static void __net_exit dev_proc_net_exit(struct net *net)
4279{
4280 wext_proc_exit(net);
4281
4282 proc_net_remove(net, "ptype");
4283 proc_net_remove(net, "softnet_stat");
4284 proc_net_remove(net, "dev");
4285}
4286
4287static struct pernet_operations __net_initdata dev_proc_ops = {
4288 .init = dev_proc_net_init,
4289 .exit = dev_proc_net_exit,
4290};
4291
4292static int __init dev_proc_init(void)
4293{
4294 return register_pernet_subsys(&dev_proc_ops);
4295}
4296#else
4297#define dev_proc_init() 0
4298#endif /* CONFIG_PROC_FS */
4299
4300
4301/**
4302 * netdev_set_master - set up master pointer
4303 * @slave: slave device
4304 * @master: new master device
4305 *
4306 * Changes the master device of the slave. Pass %NULL to break the
4307 * bonding. The caller must hold the RTNL semaphore. On a failure
4308 * a negative errno code is returned. On success the reference counts
4309 * are adjusted and the function returns zero.
4310 */
4311int netdev_set_master(struct net_device *slave, struct net_device *master)
4312{
4313 struct net_device *old = slave->master;
4314
4315 ASSERT_RTNL();
4316
4317 if (master) {
4318 if (old)
4319 return -EBUSY;
4320 dev_hold(master);
4321 }
4322
4323 slave->master = master;
4324
4325 if (old)
4326 dev_put(old);
4327 return 0;
4328}
4329EXPORT_SYMBOL(netdev_set_master);
4330
4331/**
4332 * netdev_set_bond_master - set up bonding master/slave pair
4333 * @slave: slave device
4334 * @master: new master device
4335 *
4336 * Changes the master device of the slave. Pass %NULL to break the
4337 * bonding. The caller must hold the RTNL semaphore. On a failure
4338 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4339 * to the routing socket and the function returns zero.
4340 */
4341int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4342{
4343 int err;
4344
4345 ASSERT_RTNL();
4346
4347 err = netdev_set_master(slave, master);
4348 if (err)
4349 return err;
4350 if (master)
4351 slave->flags |= IFF_SLAVE;
4352 else
4353 slave->flags &= ~IFF_SLAVE;
4354
4355 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4356 return 0;
4357}
4358EXPORT_SYMBOL(netdev_set_bond_master);
4359
4360static void dev_change_rx_flags(struct net_device *dev, int flags)
4361{
4362 const struct net_device_ops *ops = dev->netdev_ops;
4363
4364 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4365 ops->ndo_change_rx_flags(dev, flags);
4366}
4367
4368static int __dev_set_promiscuity(struct net_device *dev, int inc)
4369{
4370 unsigned short old_flags = dev->flags;
4371 uid_t uid;
4372 gid_t gid;
4373
4374 ASSERT_RTNL();
4375
4376 dev->flags |= IFF_PROMISC;
4377 dev->promiscuity += inc;
4378 if (dev->promiscuity == 0) {
4379 /*
4380 * Avoid overflow.
4381 * If inc causes overflow, untouch promisc and return error.
4382 */
4383 if (inc < 0)
4384 dev->flags &= ~IFF_PROMISC;
4385 else {
4386 dev->promiscuity -= inc;
			printk(KERN_WARNING "%s: promiscuity counter overflowed, "
				"set promiscuity failed; the promiscuity feature "
				"of the device might be broken.\n", dev->name);
4390 return -EOVERFLOW;
4391 }
4392 }
4393 if (dev->flags != old_flags) {
4394 printk(KERN_INFO "device %s %s promiscuous mode\n",
4395 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4396 "left");
4397 if (audit_enabled) {
4398 current_uid_gid(&uid, &gid);
4399 audit_log(current->audit_context, GFP_ATOMIC,
4400 AUDIT_ANOM_PROMISCUOUS,
4401 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4402 dev->name, (dev->flags & IFF_PROMISC),
4403 (old_flags & IFF_PROMISC),
4404 audit_get_loginuid(current),
4405 uid, gid,
4406 audit_get_sessionid(current));
4407 }
4408
4409 dev_change_rx_flags(dev, IFF_PROMISC);
4410 }
4411 return 0;
4412}
4413
4414/**
4415 * dev_set_promiscuity - update promiscuity count on a device
4416 * @dev: device
4417 * @inc: modifier
4418 *
4419 * Add or remove promiscuity from a device. While the count in the device
4420 * remains above zero the interface remains promiscuous. Once it hits zero
4421 * the device reverts back to normal filtering operation. A negative inc
4422 * value is used to drop promiscuity on the device.
4423 * Return 0 if successful or a negative errno code on error.
4424 */
4425int dev_set_promiscuity(struct net_device *dev, int inc)
4426{
4427 unsigned short old_flags = dev->flags;
4428 int err;
4429
4430 err = __dev_set_promiscuity(dev, inc);
4431 if (err < 0)
4432 return err;
4433 if (dev->flags != old_flags)
4434 dev_set_rx_mode(dev);
4435 return err;
4436}
4437EXPORT_SYMBOL(dev_set_promiscuity);
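/*
 * Illustrative sketch (editorial): packet-capture style users hold the RTNL,
 * bump the counter while they need to see all traffic, and later drop it
 * again with a negative increment.  Error handling is abbreviated.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// start seeing all frames
 *	...
 *	dev_set_promiscuity(dev, -1);		// restore normal filtering
 *	rtnl_unlock();
 */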
4438
4439/**
4440 * dev_set_allmulti - update allmulti count on a device
4441 * @dev: device
4442 * @inc: modifier
4443 *
4444 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all multicast frames. Once it hits zero the device reverts back to normal
4447 * filtering operation. A negative @inc value is used to drop the counter
4448 * when releasing a resource needing all multicasts.
4449 * Return 0 if successful or a negative errno code on error.
4450 */
4451
4452int dev_set_allmulti(struct net_device *dev, int inc)
4453{
4454 unsigned short old_flags = dev->flags;
4455
4456 ASSERT_RTNL();
4457
4458 dev->flags |= IFF_ALLMULTI;
4459 dev->allmulti += inc;
4460 if (dev->allmulti == 0) {
4461 /*
4462 * Avoid overflow.
4463 * If inc causes overflow, untouch allmulti and return error.
4464 */
4465 if (inc < 0)
4466 dev->flags &= ~IFF_ALLMULTI;
4467 else {
4468 dev->allmulti -= inc;
			printk(KERN_WARNING "%s: allmulti counter overflowed, "
				"set allmulti failed; the allmulti feature of "
				"the device might be broken.\n", dev->name);
4472 return -EOVERFLOW;
4473 }
4474 }
4475 if (dev->flags ^ old_flags) {
4476 dev_change_rx_flags(dev, IFF_ALLMULTI);
4477 dev_set_rx_mode(dev);
4478 }
4479 return 0;
4480}
4481EXPORT_SYMBOL(dev_set_allmulti);
4482
4483/*
4484 * Upload unicast and multicast address lists to device and
4485 * configure RX filtering. When the device doesn't support unicast
4486 * filtering it is put in promiscuous mode while unicast addresses
4487 * are present.
4488 */
4489void __dev_set_rx_mode(struct net_device *dev)
4490{
4491 const struct net_device_ops *ops = dev->netdev_ops;
4492
4493 /* dev_open will call this function so the list will stay sane. */
4494 if (!(dev->flags&IFF_UP))
4495 return;
4496
4497 if (!netif_device_present(dev))
4498 return;
4499
4500 if (ops->ndo_set_rx_mode)
4501 ops->ndo_set_rx_mode(dev);
4502 else {
		/* Unicast address changes may only happen under the rtnl,
4504 * therefore calling __dev_set_promiscuity here is safe.
4505 */
4506 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4507 __dev_set_promiscuity(dev, 1);
4508 dev->uc_promisc = true;
4509 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4510 __dev_set_promiscuity(dev, -1);
4511 dev->uc_promisc = false;
4512 }
4513
4514 if (ops->ndo_set_multicast_list)
4515 ops->ndo_set_multicast_list(dev);
4516 }
4517}
4518
4519void dev_set_rx_mode(struct net_device *dev)
4520{
4521 netif_addr_lock_bh(dev);
4522 __dev_set_rx_mode(dev);
4523 netif_addr_unlock_bh(dev);
4524}
4525
4526/**
4527 * dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4528 * @dev: device
4529 * @cmd: memory area for ethtool_ops::get_settings() result
4530 *
 * The cmd arg is initialized by this helper (cleared and its
 * ethtool_cmd::cmd field set to ETHTOOL_GSET), so callers need not do so.
4533 *
4534 * Return device's ethtool_ops::get_settings() result value or
4535 * -EOPNOTSUPP when device doesn't expose
4536 * ethtool_ops::get_settings() operation.
4537 */
4538int dev_ethtool_get_settings(struct net_device *dev,
4539 struct ethtool_cmd *cmd)
4540{
4541 if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4542 return -EOPNOTSUPP;
4543
4544 memset(cmd, 0, sizeof(struct ethtool_cmd));
4545 cmd->cmd = ETHTOOL_GSET;
4546 return dev->ethtool_ops->get_settings(dev, cmd);
4547}
4548EXPORT_SYMBOL(dev_ethtool_get_settings);
4549
4550/**
4551 * dev_get_flags - get flags reported to userspace
4552 * @dev: device
4553 *
4554 * Get the combination of flag bits exported through APIs to userspace.
4555 */
4556unsigned dev_get_flags(const struct net_device *dev)
4557{
4558 unsigned flags;
4559
4560 flags = (dev->flags & ~(IFF_PROMISC |
4561 IFF_ALLMULTI |
4562 IFF_RUNNING |
4563 IFF_LOWER_UP |
4564 IFF_DORMANT)) |
4565 (dev->gflags & (IFF_PROMISC |
4566 IFF_ALLMULTI));
4567
4568 if (netif_running(dev)) {
4569 if (netif_oper_up(dev))
4570 flags |= IFF_RUNNING;
4571 if (netif_carrier_ok(dev))
4572 flags |= IFF_LOWER_UP;
4573 if (netif_dormant(dev))
4574 flags |= IFF_DORMANT;
4575 }
4576
4577 return flags;
4578}
4579EXPORT_SYMBOL(dev_get_flags);
4580
4581int __dev_change_flags(struct net_device *dev, unsigned int flags)
4582{
4583 int old_flags = dev->flags;
4584 int ret;
4585
4586 ASSERT_RTNL();
4587
4588 /*
4589 * Set the flags on our device.
4590 */
4591
4592 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4593 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4594 IFF_AUTOMEDIA)) |
4595 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4596 IFF_ALLMULTI));
4597
4598 /*
4599 * Load in the correct multicast list now the flags have changed.
4600 */
4601
4602 if ((old_flags ^ flags) & IFF_MULTICAST)
4603 dev_change_rx_flags(dev, IFF_MULTICAST);
4604
4605 dev_set_rx_mode(dev);
4606
4607 /*
	 * Have we downed the interface? We handle IFF_UP ourselves
4609 * according to user attempts to set it, rather than blindly
4610 * setting it.
4611 */
4612
4613 ret = 0;
4614 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4615 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4616
4617 if (!ret)
4618 dev_set_rx_mode(dev);
4619 }
4620
4621 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4622 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4623
4624 dev->gflags ^= IFF_PROMISC;
4625 dev_set_promiscuity(dev, inc);
4626 }
4627
	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	 * is important. Some (broken) drivers set IFF_PROMISC when
	 * IFF_ALLMULTI is requested, without asking us and without reporting it.
	 */
4632 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4633 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4634
4635 dev->gflags ^= IFF_ALLMULTI;
4636 dev_set_allmulti(dev, inc);
4637 }
4638
4639 return ret;
4640}
4641
4642void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4643{
4644 unsigned int changes = dev->flags ^ old_flags;
4645
4646 if (changes & IFF_UP) {
4647 if (dev->flags & IFF_UP)
4648 call_netdevice_notifiers(NETDEV_UP, dev);
4649 else
4650 call_netdevice_notifiers(NETDEV_DOWN, dev);
4651 }
4652
4653 if (dev->flags & IFF_UP &&
4654 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4655 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4656}
4657
4658/**
4659 * dev_change_flags - change device settings
4660 * @dev: device
4661 * @flags: device state flags
4662 *
 * Change settings on a device based on the given state flags. The flags
 * are in the format exported to userspace.
4665 */
4666int dev_change_flags(struct net_device *dev, unsigned flags)
4667{
4668 int ret, changes;
4669 int old_flags = dev->flags;
4670
4671 ret = __dev_change_flags(dev, flags);
4672 if (ret < 0)
4673 return ret;
4674
4675 changes = old_flags ^ dev->flags;
4676 if (changes)
4677 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4678
4679 __dev_notify_flags(dev, old_flags);
4680 return ret;
4681}
4682EXPORT_SYMBOL(dev_change_flags);
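/*
 * Illustrative sketch (editorial): dev_get_flags()/dev_change_flags() are the
 * in-kernel counterpart of what "ifconfig eth0 up" does via SIOCSIFFLAGS.
 * Under the RTNL, a hypothetical caller could bring an interface up with the
 * fragment below (err handling abbreviated):
 *
 *	unsigned flags;
 *
 *	rtnl_lock();
 *	flags = dev_get_flags(dev);
 *	if (!(flags & IFF_UP))
 *		err = dev_change_flags(dev, flags | IFF_UP);
 *	rtnl_unlock();
 */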
4683
4684/**
4685 * dev_set_mtu - Change maximum transfer unit
4686 * @dev: device
4687 * @new_mtu: new transfer unit
4688 *
4689 * Change the maximum transfer size of the network device.
4690 */
4691int dev_set_mtu(struct net_device *dev, int new_mtu)
4692{
4693 const struct net_device_ops *ops = dev->netdev_ops;
4694 int err;
4695
4696 if (new_mtu == dev->mtu)
4697 return 0;
4698
	/* MTU must not be negative. */
4700 if (new_mtu < 0)
4701 return -EINVAL;
4702
4703 if (!netif_device_present(dev))
4704 return -ENODEV;
4705
4706 err = 0;
4707 if (ops->ndo_change_mtu)
4708 err = ops->ndo_change_mtu(dev, new_mtu);
4709 else
4710 dev->mtu = new_mtu;
4711
4712 if (!err && dev->flags & IFF_UP)
4713 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4714 return err;
4715}
4716EXPORT_SYMBOL(dev_set_mtu);
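/*
 * Illustrative sketch (editorial): this is normally reached under the RTNL
 * (dev_ifsioc() takes it before calling here).  A hypothetical stacked
 * driver shrinking the MTU of a lower device might do:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(lower_dev, new_mtu);
 *	rtnl_unlock();
 *
 * "lower_dev" and "new_mtu" are placeholders for the example only.
 */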
4717
4718/**
4719 * dev_set_group - Change group this device belongs to
4720 * @dev: device
4721 * @new_group: group this device should belong to
4722 */
4723void dev_set_group(struct net_device *dev, int new_group)
4724{
4725 dev->group = new_group;
4726}
4727EXPORT_SYMBOL(dev_set_group);
4728
4729/**
4730 * dev_set_mac_address - Change Media Access Control Address
4731 * @dev: device
4732 * @sa: new address
4733 *
4734 * Change the hardware (MAC) address of the device
4735 */
4736int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4737{
4738 const struct net_device_ops *ops = dev->netdev_ops;
4739 int err;
4740
4741 if (!ops->ndo_set_mac_address)
4742 return -EOPNOTSUPP;
4743 if (sa->sa_family != dev->type)
4744 return -EINVAL;
4745 if (!netif_device_present(dev))
4746 return -ENODEV;
4747 err = ops->ndo_set_mac_address(dev, sa);
4748 if (!err)
4749 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4750 return err;
4751}
4752EXPORT_SYMBOL(dev_set_mac_address);
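/*
 * Illustrative sketch (editorial): the address is passed as a struct sockaddr
 * whose sa_family must match dev->type (ARPHRD_ETHER for Ethernet).  A
 * hypothetical caller, under the RTNL, might do:
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);	// new_mac: 6 bytes here
 *	err = dev_set_mac_address(dev, &sa);
 *
 * "new_mac" is a placeholder for the example.
 */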
4753
4754/*
4755 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4756 */
4757static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4758{
4759 int err;
4760 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4761
4762 if (!dev)
4763 return -ENODEV;
4764
4765 switch (cmd) {
4766 case SIOCGIFFLAGS: /* Get interface flags */
4767 ifr->ifr_flags = (short) dev_get_flags(dev);
4768 return 0;
4769
4770 case SIOCGIFMETRIC: /* Get the metric on the interface
4771 (currently unused) */
4772 ifr->ifr_metric = 0;
4773 return 0;
4774
4775 case SIOCGIFMTU: /* Get the MTU of a device */
4776 ifr->ifr_mtu = dev->mtu;
4777 return 0;
4778
4779 case SIOCGIFHWADDR:
4780 if (!dev->addr_len)
4781 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4782 else
4783 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4784 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4785 ifr->ifr_hwaddr.sa_family = dev->type;
4786 return 0;
4787
4788 case SIOCGIFSLAVE:
4789 err = -EINVAL;
4790 break;
4791
4792 case SIOCGIFMAP:
4793 ifr->ifr_map.mem_start = dev->mem_start;
4794 ifr->ifr_map.mem_end = dev->mem_end;
4795 ifr->ifr_map.base_addr = dev->base_addr;
4796 ifr->ifr_map.irq = dev->irq;
4797 ifr->ifr_map.dma = dev->dma;
4798 ifr->ifr_map.port = dev->if_port;
4799 return 0;
4800
4801 case SIOCGIFINDEX:
4802 ifr->ifr_ifindex = dev->ifindex;
4803 return 0;
4804
4805 case SIOCGIFTXQLEN:
4806 ifr->ifr_qlen = dev->tx_queue_len;
4807 return 0;
4808
4809 default:
4810 /* dev_ioctl() should ensure this case
4811 * is never reached
4812 */
4813 WARN_ON(1);
4814 err = -ENOTTY;
4815 break;
4816
4817 }
4818 return err;
4819}
4820
4821/*
4822 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4823 */
4824static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4825{
4826 int err;
4827 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4828 const struct net_device_ops *ops;
4829
4830 if (!dev)
4831 return -ENODEV;
4832
4833 ops = dev->netdev_ops;
4834
4835 switch (cmd) {
4836 case SIOCSIFFLAGS: /* Set interface flags */
4837 return dev_change_flags(dev, ifr->ifr_flags);
4838
4839 case SIOCSIFMETRIC: /* Set the metric on the interface
4840 (currently unused) */
4841 return -EOPNOTSUPP;
4842
4843 case SIOCSIFMTU: /* Set the MTU of a device */
4844 return dev_set_mtu(dev, ifr->ifr_mtu);
4845
4846 case SIOCSIFHWADDR:
4847 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4848
4849 case SIOCSIFHWBROADCAST:
4850 if (ifr->ifr_hwaddr.sa_family != dev->type)
4851 return -EINVAL;
4852 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4853 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4854 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4855 return 0;
4856
4857 case SIOCSIFMAP:
4858 if (ops->ndo_set_config) {
4859 if (!netif_device_present(dev))
4860 return -ENODEV;
4861 return ops->ndo_set_config(dev, &ifr->ifr_map);
4862 }
4863 return -EOPNOTSUPP;
4864
4865 case SIOCADDMULTI:
4866 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4867 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4868 return -EINVAL;
4869 if (!netif_device_present(dev))
4870 return -ENODEV;
4871 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4872
4873 case SIOCDELMULTI:
4874 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4875 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4876 return -EINVAL;
4877 if (!netif_device_present(dev))
4878 return -ENODEV;
4879 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4880
4881 case SIOCSIFTXQLEN:
4882 if (ifr->ifr_qlen < 0)
4883 return -EINVAL;
4884 dev->tx_queue_len = ifr->ifr_qlen;
4885 return 0;
4886
4887 case SIOCSIFNAME:
4888 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4889 return dev_change_name(dev, ifr->ifr_newname);
4890
4891 /*
4892 * Unknown or private ioctl
4893 */
4894 default:
4895 if ((cmd >= SIOCDEVPRIVATE &&
4896 cmd <= SIOCDEVPRIVATE + 15) ||
4897 cmd == SIOCBONDENSLAVE ||
4898 cmd == SIOCBONDRELEASE ||
4899 cmd == SIOCBONDSETHWADDR ||
4900 cmd == SIOCBONDSLAVEINFOQUERY ||
4901 cmd == SIOCBONDINFOQUERY ||
4902 cmd == SIOCBONDCHANGEACTIVE ||
4903 cmd == SIOCGMIIPHY ||
4904 cmd == SIOCGMIIREG ||
4905 cmd == SIOCSMIIREG ||
4906 cmd == SIOCBRADDIF ||
4907 cmd == SIOCBRDELIF ||
4908 cmd == SIOCSHWTSTAMP ||
4909 cmd == SIOCWANDEV) {
4910 err = -EOPNOTSUPP;
4911 if (ops->ndo_do_ioctl) {
4912 if (netif_device_present(dev))
4913 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4914 else
4915 err = -ENODEV;
4916 }
4917 } else
4918 err = -EINVAL;
4919
4920 }
4921 return err;
4922}
4923
4924/*
4925 * This function handles all "interface"-type I/O control requests. The actual
4926 * 'doing' part of this is dev_ifsioc above.
4927 */
4928
4929/**
4930 * dev_ioctl - network device ioctl
4931 * @net: the applicable net namespace
4932 * @cmd: command to issue
4933 * @arg: pointer to a struct ifreq in user space
4934 *
4935 * Issue ioctl functions to devices. This is normally called by the
4936 * user space syscall interfaces but can sometimes be useful for
4937 * other purposes. The return value is the return from the syscall if
4938 * positive or a negative errno code on error.
4939 */
4940
4941int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4942{
4943 struct ifreq ifr;
4944 int ret;
4945 char *colon;
4946
4947 /* One special case: SIOCGIFCONF takes ifconf argument
4948 and requires shared lock, because it sleeps writing
4949 to user space.
4950 */
4951
4952 if (cmd == SIOCGIFCONF) {
4953 rtnl_lock();
4954 ret = dev_ifconf(net, (char __user *) arg);
4955 rtnl_unlock();
4956 return ret;
4957 }
4958 if (cmd == SIOCGIFNAME)
4959 return dev_ifname(net, (struct ifreq __user *)arg);
4960
4961 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4962 return -EFAULT;
4963
4964 ifr.ifr_name[IFNAMSIZ-1] = 0;
4965
4966 colon = strchr(ifr.ifr_name, ':');
4967 if (colon)
4968 *colon = 0;
4969
4970 /*
4971 * See which interface the caller is talking about.
4972 */
4973
4974 switch (cmd) {
4975 /*
4976 * These ioctl calls:
4977 * - can be done by all.
4978 * - atomic and do not require locking.
4979 * - return a value
4980 */
4981 case SIOCGIFFLAGS:
4982 case SIOCGIFMETRIC:
4983 case SIOCGIFMTU:
4984 case SIOCGIFHWADDR:
4985 case SIOCGIFSLAVE:
4986 case SIOCGIFMAP:
4987 case SIOCGIFINDEX:
4988 case SIOCGIFTXQLEN:
4989 dev_load(net, ifr.ifr_name);
4990 rcu_read_lock();
4991 ret = dev_ifsioc_locked(net, &ifr, cmd);
4992 rcu_read_unlock();
4993 if (!ret) {
4994 if (colon)
4995 *colon = ':';
4996 if (copy_to_user(arg, &ifr,
4997 sizeof(struct ifreq)))
4998 ret = -EFAULT;
4999 }
5000 return ret;
5001
5002 case SIOCETHTOOL:
5003 dev_load(net, ifr.ifr_name);
5004 rtnl_lock();
5005 ret = dev_ethtool(net, &ifr);
5006 rtnl_unlock();
5007 if (!ret) {
5008 if (colon)
5009 *colon = ':';
5010 if (copy_to_user(arg, &ifr,
5011 sizeof(struct ifreq)))
5012 ret = -EFAULT;
5013 }
5014 return ret;
5015
5016 /*
5017 * These ioctl calls:
5018 * - require superuser power.
5019 * - require strict serialization.
5020 * - return a value
5021 */
5022 case SIOCGMIIPHY:
5023 case SIOCGMIIREG:
5024 case SIOCSIFNAME:
5025 if (!capable(CAP_NET_ADMIN))
5026 return -EPERM;
5027 dev_load(net, ifr.ifr_name);
5028 rtnl_lock();
5029 ret = dev_ifsioc(net, &ifr, cmd);
5030 rtnl_unlock();
5031 if (!ret) {
5032 if (colon)
5033 *colon = ':';
5034 if (copy_to_user(arg, &ifr,
5035 sizeof(struct ifreq)))
5036 ret = -EFAULT;
5037 }
5038 return ret;
5039
5040 /*
5041 * These ioctl calls:
5042 * - require superuser power.
5043 * - require strict serialization.
5044 * - do not return a value
5045 */
5046 case SIOCSIFFLAGS:
5047 case SIOCSIFMETRIC:
5048 case SIOCSIFMTU:
5049 case SIOCSIFMAP:
5050 case SIOCSIFHWADDR:
5051 case SIOCSIFSLAVE:
5052 case SIOCADDMULTI:
5053 case SIOCDELMULTI:
5054 case SIOCSIFHWBROADCAST:
5055 case SIOCSIFTXQLEN:
5056 case SIOCSMIIREG:
5057 case SIOCBONDENSLAVE:
5058 case SIOCBONDRELEASE:
5059 case SIOCBONDSETHWADDR:
5060 case SIOCBONDCHANGEACTIVE:
5061 case SIOCBRADDIF:
5062 case SIOCBRDELIF:
5063 case SIOCSHWTSTAMP:
5064 if (!capable(CAP_NET_ADMIN))
5065 return -EPERM;
5066 /* fall through */
5067 case SIOCBONDSLAVEINFOQUERY:
5068 case SIOCBONDINFOQUERY:
5069 dev_load(net, ifr.ifr_name);
5070 rtnl_lock();
5071 ret = dev_ifsioc(net, &ifr, cmd);
5072 rtnl_unlock();
5073 return ret;
5074
5075 case SIOCGIFMEM:
5076 /* Get the per device memory space. We can add this but
5077 * currently do not support it */
5078 case SIOCSIFMEM:
5079 /* Set the per device memory buffer space.
5080 * Not applicable in our case */
5081 case SIOCSIFLINK:
5082 return -ENOTTY;
5083
5084 /*
5085 * Unknown or private ioctl.
5086 */
5087 default:
5088 if (cmd == SIOCWANDEV ||
5089 (cmd >= SIOCDEVPRIVATE &&
5090 cmd <= SIOCDEVPRIVATE + 15)) {
5091 dev_load(net, ifr.ifr_name);
5092 rtnl_lock();
5093 ret = dev_ifsioc(net, &ifr, cmd);
5094 rtnl_unlock();
5095 if (!ret && copy_to_user(arg, &ifr,
5096 sizeof(struct ifreq)))
5097 ret = -EFAULT;
5098 return ret;
5099 }
5100 /* Take care of Wireless Extensions */
5101 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5102 return wext_handle_ioctl(net, &ifr, cmd, arg);
5103 return -ENOTTY;
5104 }
5105}
5106
5107
5108/**
5109 * dev_new_index - allocate an ifindex
5110 * @net: the applicable net namespace
5111 *
5112 * Returns a suitable unique value for a new device interface
5113 * number. The caller must hold the rtnl semaphore or the
5114 * dev_base_lock to be sure it remains unique.
5115 */
5116static int dev_new_index(struct net *net)
5117{
5118 static int ifindex;
5119 for (;;) {
5120 if (++ifindex <= 0)
5121 ifindex = 1;
5122 if (!__dev_get_by_index(net, ifindex))
5123 return ifindex;
5124 }
5125}
5126
/* Delayed registration/unregistration */
5128static LIST_HEAD(net_todo_list);
5129
5130static void net_set_todo(struct net_device *dev)
5131{
5132 list_add_tail(&dev->todo_list, &net_todo_list);
5133}
5134
5135static void rollback_registered_many(struct list_head *head)
5136{
5137 struct net_device *dev, *tmp;
5138
5139 BUG_ON(dev_boot_phase);
5140 ASSERT_RTNL();
5141
5142 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call this without ever having been registered,
		 * as part of unwinding a failed initialization. Remove those
		 * devices and proceed with the remaining ones.
		 */
5147 if (dev->reg_state == NETREG_UNINITIALIZED) {
5148 pr_debug("unregister_netdevice: device %s/%p never "
5149 "was registered\n", dev->name, dev);
5150
5151 WARN_ON(1);
5152 list_del(&dev->unreg_list);
5153 continue;
5154 }
5155 dev->dismantle = true;
5156 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5157 }
5158
5159 /* If device is running, close it first. */
5160 dev_close_many(head);
5161
5162 list_for_each_entry(dev, head, unreg_list) {
5163 /* And unlink it from device chain. */
5164 unlist_netdevice(dev);
5165
5166 dev->reg_state = NETREG_UNREGISTERING;
5167 }
5168
5169 synchronize_net();
5170
5171 list_for_each_entry(dev, head, unreg_list) {
5172 /* Shutdown queueing discipline. */
5173 dev_shutdown(dev);
5174
5175
		/* Notify protocols that we are about to destroy
		 * this device. They should clean up all of their state.
		 */
5179 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5180
5181 if (!dev->rtnl_link_ops ||
5182 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5183 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5184
5185 /*
5186 * Flush the unicast and multicast chains
5187 */
5188 dev_uc_flush(dev);
5189 dev_mc_flush(dev);
5190
5191 if (dev->netdev_ops->ndo_uninit)
5192 dev->netdev_ops->ndo_uninit(dev);
5193
5194 /* Notifier chain MUST detach us from master device. */
5195 WARN_ON(dev->master);
5196
5197 /* Remove entries from kobject tree */
5198 netdev_unregister_kobject(dev);
5199 }
5200
5201 /* Process any work delayed until the end of the batch */
5202 dev = list_first_entry(head, struct net_device, unreg_list);
5203 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5204
5205 rcu_barrier();
5206
5207 list_for_each_entry(dev, head, unreg_list)
5208 dev_put(dev);
5209}
5210
5211static void rollback_registered(struct net_device *dev)
5212{
5213 LIST_HEAD(single);
5214
5215 list_add(&dev->unreg_list, &single);
5216 rollback_registered_many(&single);
5217 list_del(&single);
5218}
5219
5220static u32 netdev_fix_features(struct net_device *dev, u32 features)
5221{
5222 /* Fix illegal checksum combinations */
5223 if ((features & NETIF_F_HW_CSUM) &&
5224 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5225 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5226 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5227 }
5228
5229 if ((features & NETIF_F_NO_CSUM) &&
5230 (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5231 netdev_warn(dev, "mixed no checksumming and other settings.\n");
5232 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5233 }
5234
5235 /* Fix illegal SG+CSUM combinations. */
5236 if ((features & NETIF_F_SG) &&
5237 !(features & NETIF_F_ALL_CSUM)) {
5238 netdev_dbg(dev,
5239 "Dropping NETIF_F_SG since no checksum feature.\n");
5240 features &= ~NETIF_F_SG;
5241 }
5242
5243 /* TSO requires that SG is present as well. */
5244 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5245 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5246 features &= ~NETIF_F_ALL_TSO;
5247 }
5248
5249 /* TSO ECN requires that TSO is present as well. */
5250 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5251 features &= ~NETIF_F_TSO_ECN;
5252
5253 /* Software GSO depends on SG. */
5254 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5255 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5256 features &= ~NETIF_F_GSO;
5257 }
5258
5259 /* UFO needs SG and checksumming */
5260 if (features & NETIF_F_UFO) {
5261 /* maybe split UFO into V4 and V6? */
5262 if (!((features & NETIF_F_GEN_CSUM) ||
5263 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5264 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5265 netdev_dbg(dev,
5266 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5267 features &= ~NETIF_F_UFO;
5268 }
5269
5270 if (!(features & NETIF_F_SG)) {
5271 netdev_dbg(dev,
5272 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5273 features &= ~NETIF_F_UFO;
5274 }
5275 }
5276
5277 return features;
5278}
5279
5280int __netdev_update_features(struct net_device *dev)
5281{
5282 u32 features;
5283 int err = 0;
5284
5285 ASSERT_RTNL();
5286
5287 features = netdev_get_wanted_features(dev);
5288
5289 if (dev->netdev_ops->ndo_fix_features)
5290 features = dev->netdev_ops->ndo_fix_features(dev, features);
5291
5292 /* driver might be less strict about feature dependencies */
5293 features = netdev_fix_features(dev, features);
5294
5295 if (dev->features == features)
5296 return 0;
5297
5298 netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5299 dev->features, features);
5300
5301 if (dev->netdev_ops->ndo_set_features)
5302 err = dev->netdev_ops->ndo_set_features(dev, features);
5303
5304 if (unlikely(err < 0)) {
5305 netdev_err(dev,
5306 "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5307 err, features, dev->features);
5308 return -1;
5309 }
5310
5311 if (!err)
5312 dev->features = features;
5313
5314 return 1;
5315}
5316
5317/**
5318 * netdev_update_features - recalculate device features
5319 * @dev: the device to check
5320 *
5321 * Recalculate dev->features set and send notifications if it
 * has changed. Should be called after driver- or hardware-dependent
 * conditions that influence the features might have changed.
5324 */
5325void netdev_update_features(struct net_device *dev)
5326{
5327 if (__netdev_update_features(dev))
5328 netdev_features_change(dev);
5329}
5330EXPORT_SYMBOL(netdev_update_features);
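/*
 * Illustrative sketch (editorial): a driver whose hardware loses a capability
 * after some runtime reconfiguration would, with the RTNL held, adjust the
 * feature masks it owns and ask the core to recompute dev->features:
 *
 *	dev->hw_features &= ~NETIF_F_TSO;	// capability went away
 *	netdev_update_features(dev);		// re-run fix_features + notify
 *
 * The core re-derives the effective set from wanted_features/hw_features and
 * only sends NETDEV_FEAT_CHANGE when that set actually changed.
 */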
5331
5332/**
5333 * netdev_change_features - recalculate device features
5334 * @dev: the device to check
5335 *
5336 * Recalculate dev->features set and send notifications even
5337 * if they have not changed. Should be called instead of
 * netdev_update_features() if dev->vlan_features might also have
 * changed, so that the changes are propagated to stacked
 * VLAN devices.
5341 */
5342void netdev_change_features(struct net_device *dev)
5343{
5344 __netdev_update_features(dev);
5345 netdev_features_change(dev);
5346}
5347EXPORT_SYMBOL(netdev_change_features);
5348
5349/**
5350 * netif_stacked_transfer_operstate - transfer operstate
5351 * @rootdev: the root or lower level device to transfer state from
5352 * @dev: the device to transfer operstate to
5353 *
5354 * Transfer operational state from root to device. This is normally
5355 * called when a stacking relationship exists between the root
 * device and the device (a leaf device).
5357 */
5358void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5359 struct net_device *dev)
5360{
5361 if (rootdev->operstate == IF_OPER_DORMANT)
5362 netif_dormant_on(dev);
5363 else
5364 netif_dormant_off(dev);
5365
5366 if (netif_carrier_ok(rootdev)) {
5367 if (!netif_carrier_ok(dev))
5368 netif_carrier_on(dev);
5369 } else {
5370 if (netif_carrier_ok(dev))
5371 netif_carrier_off(dev);
5372 }
5373}
5374EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5375
5376#ifdef CONFIG_RPS
5377static int netif_alloc_rx_queues(struct net_device *dev)
5378{
5379 unsigned int i, count = dev->num_rx_queues;
5380 struct netdev_rx_queue *rx;
5381
5382 BUG_ON(count < 1);
5383
5384 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5385 if (!rx) {
5386 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5387 return -ENOMEM;
5388 }
5389 dev->_rx = rx;
5390
5391 for (i = 0; i < count; i++)
5392 rx[i].dev = dev;
5393 return 0;
5394}
5395#endif
5396
5397static void netdev_init_one_queue(struct net_device *dev,
5398 struct netdev_queue *queue, void *_unused)
5399{
5400 /* Initialize queue lock */
5401 spin_lock_init(&queue->_xmit_lock);
5402 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5403 queue->xmit_lock_owner = -1;
5404 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5405 queue->dev = dev;
5406}
5407
5408static int netif_alloc_netdev_queues(struct net_device *dev)
5409{
5410 unsigned int count = dev->num_tx_queues;
5411 struct netdev_queue *tx;
5412
5413 BUG_ON(count < 1);
5414
5415 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5416 if (!tx) {
5417 pr_err("netdev: Unable to allocate %u tx queues.\n",
5418 count);
5419 return -ENOMEM;
5420 }
5421 dev->_tx = tx;
5422
5423 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5424 spin_lock_init(&dev->tx_global_lock);
5425
5426 return 0;
5427}
5428
5429/**
5430 * register_netdevice - register a network device
5431 * @dev: device to register
5432 *
5433 * Take a completed network device structure and add it to the kernel
5434 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5435 * chain. 0 is returned on success. A negative errno code is returned
5436 * on a failure to set up the device, or if the name is a duplicate.
5437 *
5438 * Callers must hold the rtnl semaphore. You may want
5439 * register_netdev() instead of this.
5440 *
5441 * BUGS:
5442 * The locking appears insufficient to guarantee two parallel registers
5443 * will not get the same name.
5444 */
5445
5446int register_netdevice(struct net_device *dev)
5447{
5448 int ret;
5449 struct net *net = dev_net(dev);
5450
5451 BUG_ON(dev_boot_phase);
5452 ASSERT_RTNL();
5453
5454 might_sleep();
5455
	/* When net_devices are persistent, this will be fatal. */
5457 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5458 BUG_ON(!net);
5459
5460 spin_lock_init(&dev->addr_list_lock);
5461 netdev_set_addr_lockdep_class(dev);
5462
5463 dev->iflink = -1;
5464
5465 ret = dev_get_valid_name(dev, dev->name);
5466 if (ret < 0)
5467 goto out;
5468
5469 /* Init, if this function is available */
5470 if (dev->netdev_ops->ndo_init) {
5471 ret = dev->netdev_ops->ndo_init(dev);
5472 if (ret) {
5473 if (ret > 0)
5474 ret = -EIO;
5475 goto out;
5476 }
5477 }
5478
5479 dev->ifindex = dev_new_index(net);
5480 if (dev->iflink == -1)
5481 dev->iflink = dev->ifindex;
5482
5483 /* Transfer changeable features to wanted_features and enable
5484 * software offloads (GSO and GRO).
5485 */
5486 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5487 dev->features |= NETIF_F_SOFT_FEATURES;
5488 dev->wanted_features = dev->features & dev->hw_features;
5489
5490 /* Turn on no cache copy if HW is doing checksum */
5491 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5492 if ((dev->features & NETIF_F_ALL_CSUM) &&
5493 !(dev->features & NETIF_F_NO_CSUM)) {
5494 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5495 dev->features |= NETIF_F_NOCACHE_COPY;
5496 }
5497
5498 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5499 */
5500 dev->vlan_features |= NETIF_F_HIGHDMA;
5501
5502 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5503 ret = notifier_to_errno(ret);
5504 if (ret)
5505 goto err_uninit;
5506
5507 ret = netdev_register_kobject(dev);
5508 if (ret)
5509 goto err_uninit;
5510 dev->reg_state = NETREG_REGISTERED;
5511
5512 __netdev_update_features(dev);
5513
5514 /*
	 * Default initial state at registration is that the
5516 * device is present.
5517 */
5518
5519 set_bit(__LINK_STATE_PRESENT, &dev->state);
5520
5521 dev_init_scheduler(dev);
5522 dev_hold(dev);
5523 list_netdevice(dev);
5524
5525 /* Notify protocols, that a new device appeared. */
5526 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5527 ret = notifier_to_errno(ret);
5528 if (ret) {
5529 rollback_registered(dev);
5530 dev->reg_state = NETREG_UNREGISTERED;
5531 }
5532 /*
5533 * Prevent userspace races by waiting until the network
5534 * device is fully setup before sending notifications.
5535 */
5536 if (!dev->rtnl_link_ops ||
5537 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5538 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5539
5540out:
5541 return ret;
5542
5543err_uninit:
5544 if (dev->netdev_ops->ndo_uninit)
5545 dev->netdev_ops->ndo_uninit(dev);
5546 goto out;
5547}
5548EXPORT_SYMBOL(register_netdevice);
5549
5550/**
5551 * init_dummy_netdev - init a dummy network device for NAPI
5552 * @dev: device to init
5553 *
 * This takes a network device structure and initializes the minimum
 * number of fields so it can be used to schedule NAPI polls without
 * registering a full-blown interface. This is to be used by drivers
5557 * that need to tie several hardware interfaces to a single NAPI
5558 * poll scheduler due to HW limitations.
5559 */
5560int init_dummy_netdev(struct net_device *dev)
5561{
5562 /* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
5564 * NAPI code and this dummy netdev is supposed to be
5565 * only ever used for NAPI polls
5566 */
5567 memset(dev, 0, sizeof(struct net_device));
5568
5569 /* make sure we BUG if trying to hit standard
5570 * register/unregister code path
5571 */
5572 dev->reg_state = NETREG_DUMMY;
5573
5574 /* NAPI wants this */
5575 INIT_LIST_HEAD(&dev->napi_list);
5576
5577 /* a dummy interface is started by default */
5578 set_bit(__LINK_STATE_PRESENT, &dev->state);
5579 set_bit(__LINK_STATE_START, &dev->state);
5580
	/* Note: We don't allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' don't need to change
5583 * its refcount.
5584 */
5585
5586 return 0;
5587}
5588EXPORT_SYMBOL_GPL(init_dummy_netdev);
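/*
 * Illustrative sketch (editorial): a driver with one bus function but several
 * hardware interfaces might embed a never-registered netdev purely as a NAPI
 * anchor.  Structure and function names are hypothetical:
 *
 *	struct my_hw {
 *		struct net_device napi_dev;	// dummy, never registered
 *		struct napi_struct napi;
 *	};
 *
 *	init_dummy_netdev(&hw->napi_dev);
 *	netif_napi_add(&hw->napi_dev, &hw->napi, my_poll, 64);
 */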
5589
5590
5591/**
5592 * register_netdev - register a network device
5593 * @dev: device to register
5594 *
5595 * Take a completed network device structure and add it to the kernel
5596 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5597 * chain. 0 is returned on success. A negative errno code is returned
5598 * on a failure to set up the device, or if the name is a duplicate.
5599 *
5600 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5601 * and expands the device name if you passed a format string to
5602 * alloc_netdev.
5603 */
5604int register_netdev(struct net_device *dev)
5605{
5606 int err;
5607
5608 rtnl_lock();
5609 err = register_netdevice(dev);
5610 rtnl_unlock();
5611 return err;
5612}
5613EXPORT_SYMBOL(register_netdev);
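/*
 * Illustrative sketch (editorial): the usual Ethernet driver probe sequence,
 * with hypothetical names.  alloc_etherdev() wraps the allocator below and
 * applies ether_setup():
 *
 *	struct net_device *netdev;
 *	int err;
 *
 *	netdev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *	// fill in netdev->netdev_ops, MAC address, features, ...
 *	err = register_netdev(netdev);
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 */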
5614
5615int netdev_refcnt_read(const struct net_device *dev)
5616{
5617 int i, refcnt = 0;
5618
5619 for_each_possible_cpu(i)
5620 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5621 return refcnt;
5622}
5623EXPORT_SYMBOL(netdev_refcnt_read);
5624
5625/*
5626 * netdev_wait_allrefs - wait until all references are gone.
5627 *
5628 * This is called when unregistering network devices.
5629 *
5630 * Any protocol or device that holds a reference should register
 * for netdevice notification, and clean up and put back the
5632 * reference if they receive an UNREGISTER event.
5633 * We can get stuck here if buggy protocols don't correctly
5634 * call dev_put.
5635 */
5636static void netdev_wait_allrefs(struct net_device *dev)
5637{
5638 unsigned long rebroadcast_time, warning_time;
5639 int refcnt;
5640
5641 linkwatch_forget_dev(dev);
5642
5643 rebroadcast_time = warning_time = jiffies;
5644 refcnt = netdev_refcnt_read(dev);
5645
5646 while (refcnt != 0) {
5647 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5648 rtnl_lock();
5649
5650 /* Rebroadcast unregister notification */
5651 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5652 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
			 * should have already handled it the first time */
5654
5655 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5656 &dev->state)) {
5657 /* We must not have linkwatch events
5658 * pending on unregister. If this
5659 * happens, we simply run the queue
5660 * unscheduled, resulting in a noop
5661 * for this device.
5662 */
5663 linkwatch_run_queue();
5664 }
5665
5666 __rtnl_unlock();
5667
5668 rebroadcast_time = jiffies;
5669 }
5670
5671 msleep(250);
5672
5673 refcnt = netdev_refcnt_read(dev);
5674
5675 if (time_after(jiffies, warning_time + 10 * HZ)) {
5676 printk(KERN_EMERG "unregister_netdevice: "
5677 "waiting for %s to become free. Usage "
5678 "count = %d\n",
5679 dev->name, refcnt);
5680 warning_time = jiffies;
5681 }
5682 }
5683}
5684
5685/* The sequence is:
5686 *
5687 * rtnl_lock();
5688 * ...
5689 * register_netdevice(x1);
5690 * register_netdevice(x2);
5691 * ...
5692 * unregister_netdevice(y1);
5693 * unregister_netdevice(y2);
5694 * ...
5695 * rtnl_unlock();
5696 * free_netdev(y1);
5697 * free_netdev(y2);
5698 *
5699 * We are invoked by rtnl_unlock().
5700 * This allows us to deal with problems:
5701 * 1) We can delete sysfs objects which invoke hotplug
5702 * without deadlocking with linkwatch via keventd.
5703 * 2) Since we run with the RTNL semaphore not held, we can sleep
5704 * safely in order to wait for the netdev refcnt to drop to zero.
5705 *
5706 * We must not return until all unregister events added during
5707 * the interval the lock was held have been completed.
5708 */
5709void netdev_run_todo(void)
5710{
5711 struct list_head list;
5712
5713 /* Snapshot list, allow later requests */
5714 list_replace_init(&net_todo_list, &list);
5715
5716 __rtnl_unlock();
5717
5718 while (!list_empty(&list)) {
5719 struct net_device *dev
5720 = list_first_entry(&list, struct net_device, todo_list);
5721 list_del(&dev->todo_list);
5722
5723 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5724 printk(KERN_ERR "network todo '%s' but state %d\n",
5725 dev->name, dev->reg_state);
5726 dump_stack();
5727 continue;
5728 }
5729
5730 dev->reg_state = NETREG_UNREGISTERED;
5731
5732 on_each_cpu(flush_backlog, dev, 1);
5733
5734 netdev_wait_allrefs(dev);
5735
5736 /* paranoia */
5737 BUG_ON(netdev_refcnt_read(dev));
5738 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5739 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5740 WARN_ON(dev->dn_ptr);
5741
5742 if (dev->destructor)
5743 dev->destructor(dev);
5744
5745 /* Free network device */
5746 kobject_put(&dev->dev.kobj);
5747 }
5748}
5749
5750/* Convert net_device_stats to rtnl_link_stats64. They have the same
5751 * fields in the same order, with only the type differing.
5752 */
5753static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5754 const struct net_device_stats *netdev_stats)
5755{
5756#if BITS_PER_LONG == 64
5757 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5758 memcpy(stats64, netdev_stats, sizeof(*stats64));
5759#else
5760 size_t i, n = sizeof(*stats64) / sizeof(u64);
5761 const unsigned long *src = (const unsigned long *)netdev_stats;
5762 u64 *dst = (u64 *)stats64;
5763
5764 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5765 sizeof(*stats64) / sizeof(u64));
5766 for (i = 0; i < n; i++)
5767 dst[i] = src[i];
5768#endif
5769}
5770
5771/**
5772 * dev_get_stats - get network device statistics
5773 * @dev: device to get statistics from
5774 * @storage: place to store stats
5775 *
5776 * Get network statistics from device. Return @storage.
5777 * The device driver may provide its own method by setting
5778 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5779 * otherwise the internal statistics structure is used.
5780 */
5781struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5782 struct rtnl_link_stats64 *storage)
5783{
5784 const struct net_device_ops *ops = dev->netdev_ops;
5785
5786 if (ops->ndo_get_stats64) {
5787 memset(storage, 0, sizeof(*storage));
5788 ops->ndo_get_stats64(dev, storage);
5789 } else if (ops->ndo_get_stats) {
5790 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5791 } else {
5792 netdev_stats_to_stats64(storage, &dev->stats);
5793 }
5794 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5795 return storage;
5796}
5797EXPORT_SYMBOL(dev_get_stats);
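/*
 * Illustrative sketch (editorial): callers pass in their own storage and the
 * returned pointer is simply @storage; dev_seq_printf_stats() above is the
 * in-file user.  A hypothetical reader would look like:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu rx packets\n", dev->name,
 *		(unsigned long long)stats.rx_packets);
 */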
5798
5799struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5800{
5801 struct netdev_queue *queue = dev_ingress_queue(dev);
5802
5803#ifdef CONFIG_NET_CLS_ACT
5804 if (queue)
5805 return queue;
5806 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5807 if (!queue)
5808 return NULL;
5809 netdev_init_one_queue(dev, queue, NULL);
5810 queue->qdisc = &noop_qdisc;
5811 queue->qdisc_sleeping = &noop_qdisc;
5812 rcu_assign_pointer(dev->ingress_queue, queue);
5813#endif
5814 return queue;
5815}
5816
5817/**
5818 * alloc_netdev_mqs - allocate network device
5819 * @sizeof_priv: size of private data to allocate space for
5820 * @name: device name format string
5821 * @setup: callback to initialize device
5822 * @txqs: the number of TX subqueues to allocate
5823 * @rxqs: the number of RX subqueues to allocate
5824 *
5825 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization. Also allocates subqueue structs
5827 * for each queue on the device.
5828 */
5829struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5830 void (*setup)(struct net_device *),
5831 unsigned int txqs, unsigned int rxqs)
5832{
5833 struct net_device *dev;
5834 size_t alloc_size;
5835 struct net_device *p;
5836
5837 BUG_ON(strlen(name) >= sizeof(dev->name));
5838
5839 if (txqs < 1) {
5840 pr_err("alloc_netdev: Unable to allocate device "
5841 "with zero queues.\n");
5842 return NULL;
5843 }
5844
5845#ifdef CONFIG_RPS
5846 if (rxqs < 1) {
5847 pr_err("alloc_netdev: Unable to allocate device "
5848 "with zero RX queues.\n");
5849 return NULL;
5850 }
5851#endif
5852
5853 alloc_size = sizeof(struct net_device);
5854 if (sizeof_priv) {
5855 /* ensure 32-byte alignment of private area */
5856 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5857 alloc_size += sizeof_priv;
5858 }
5859 /* ensure 32-byte alignment of whole construct */
5860 alloc_size += NETDEV_ALIGN - 1;
5861
5862 p = kzalloc(alloc_size, GFP_KERNEL);
5863 if (!p) {
5864 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5865 return NULL;
5866 }
5867
5868 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5869 dev->padded = (char *)dev - (char *)p;
5870
5871 dev->pcpu_refcnt = alloc_percpu(int);
5872 if (!dev->pcpu_refcnt)
5873 goto free_p;
5874
5875 if (dev_addr_init(dev))
5876 goto free_pcpu;
5877
5878 dev_mc_init(dev);
5879 dev_uc_init(dev);
5880
5881 dev_net_set(dev, &init_net);
5882
5883 dev->gso_max_size = GSO_MAX_SIZE;
5884
5885 INIT_LIST_HEAD(&dev->napi_list);
5886 INIT_LIST_HEAD(&dev->unreg_list);
5887 INIT_LIST_HEAD(&dev->link_watch_list);
5888 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5889 setup(dev);
5890
5891 dev->num_tx_queues = txqs;
5892 dev->real_num_tx_queues = txqs;
5893 if (netif_alloc_netdev_queues(dev))
5894 goto free_all;
5895
5896#ifdef CONFIG_RPS
5897 dev->num_rx_queues = rxqs;
5898 dev->real_num_rx_queues = rxqs;
5899 if (netif_alloc_rx_queues(dev))
5900 goto free_all;
5901#endif
5902
5903 strcpy(dev->name, name);
5904 dev->group = INIT_NETDEV_GROUP;
5905 return dev;
5906
5907free_all:
5908 free_netdev(dev);
5909 return NULL;
5910
5911free_pcpu:
5912 free_percpu(dev->pcpu_refcnt);
5913 kfree(dev->_tx);
5914#ifdef CONFIG_RPS
5915 kfree(dev->_rx);
5916#endif
5917
5918free_p:
5919 kfree(p);
5920 return NULL;
5921}
5922EXPORT_SYMBOL(alloc_netdev_mqs);
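/*
 * Illustrative sketch (editorial): a multiqueue Ethernet-like device with
 * 8 TX and 8 RX queues, using the stock ether_setup() initializer.  The
 * "%d" in the name template is expanded at registration time; the private
 * struct name is hypothetical:
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "myeth%d",
 *			       ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */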

/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released.
 * If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	kfree(dev->_tx);
#ifdef CONFIG_RPS
	kfree(dev->_rx);
#endif

	kfree(rcu_dereference_raw(dev->ingress_queue));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
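
/*
 * Editorial note with a hedged example (not part of the original source):
 * the NETREG_UNINITIALIZED branch above lets a driver that allocated a
 * device but never successfully registered it release it with a plain
 * free_netdev() call, e.g. in a probe error path:
 *
 *	ndev = alloc_etherdev(sizeof(struct example_priv));
 *	if (!ndev)
 *		return -ENOMEM;
 *	err = register_netdev(ndev);
 *	if (err) {
 *		free_netdev(ndev);
 *		return err;
 *	}
 *
 * "struct example_priv" is a hypothetical private structure.
 */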

/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
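
/*
 * Editorial sketch (assumption, not part of the original source):
 * synchronize_net() typically makes an unpublish-then-free sequence safe
 * against receive paths running under rcu_read_lock():
 *
 *	list_del_rcu(&handler->list);	(unpublish the object)
 *	synchronize_net();		(wait for in-flight receivers)
 *	kfree(handler);			(no reader can still see it)
 *
 * "handler" is a hypothetical structure used only for illustration.
 */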

/**
 * unregister_netdevice_queue - remove device from the kernel
 * @dev: device
 * @head: list to queue the device on for later unregistration, or NULL
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 * If @head is not NULL, the device is queued to be unregistered later.
 *
 * Callers must hold the rtnl semaphore. You may want
 * unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 * unregister_netdevice_many - unregister many devices
 * @head: list of devices
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
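
/*
 * Editorial sketch (assumption, not part of the original source): batching
 * lets callers tear down several devices while paying for a single rollback
 * pass.  Queue each device and flush the list once, all under the rtnl lock:
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 *
 * "dev1" and "dev2" are hypothetical struct net_device pointers.
 */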

/**
 * unregister_netdev - remove device from the kernel
 * @dev: device
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 *
 * This is just a wrapper for unregister_netdevice that takes
 * the rtnl semaphore.  In general you want to use this and not
 * unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
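
/*
 * Editorial sketch (assumption, not part of the original source): the usual
 * driver teardown pairs unregister_netdev(), which takes and releases the
 * rtnl lock itself, with free_netdev() once the device is unregistered:
 *
 *	static void example_remove(struct example_adapter *adap)
 *	{
 *		unregister_netdev(adap->ndev);
 *		free_netdev(adap->ndev);
 *	}
 *
 * "struct example_adapter" and example_remove() are hypothetical names.
 */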

/**
 * dev_change_net_namespace - move device to a different network namespace
 * @dev: device
 * @net: network namespace
 * @pat: if not NULL, name pattern to try if the current device name
 *	 is already taken in the destination network namespace.
 *
 * This function shuts down a device interface and moves it
 * to a new network namespace. On success 0 is returned, on
 * a failure a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice() and
	 * unregister_netdevice().
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols that we are about to destroy this device.
	 * They should clean up all of their state.
	 *
	 * Note that dev->reg_state stays at NETREG_REGISTERED.
	 * This is wanted because this way 8021q and macvlan know
	 * the device is just moving and can keep their slaves up.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);

	/*
	 * Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
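
/*
 * Editorial sketch (assumption, not part of the original source): an
 * in-kernel caller moving a device must hold the rtnl lock, and typically
 * also holds a reference on the destination namespace:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, dest_net, "eth%d");
 *	rtnl_unlock();
 *
 * "dest_net" is a hypothetical struct net pointer; passing a pattern such
 * as "eth%d" lets the move succeed even if the current name is already
 * taken in the destination namespace.
 */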

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}

/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes a new feature set after adding a device with feature set
 * @one to the master device with current feature set @all.  Will not
 * enable anything that is off in @mask. Returns the new feature set.
 */
u32 netdev_increment_features(u32 all, u32 one, u32 mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If device needs checksumming, downgrade to it. */
	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
		all &= ~NETIF_F_NO_CSUM;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
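
/*
 * Editorial sketch (assumption, not part of the original source): an
 * aggregating driver (bonding/bridge style) would typically fold each
 * slave's features into the master by iterating over its slaves:
 *
 *	u32 features = NETIF_F_ALL_FOR_ALL;
 *
 *	list_for_each_entry(slave, &master_priv->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *	master->features = features;
 *
 * "slave", "master_priv", "master" and "mask" are hypothetical; the
 * starting value and the mask are policy choices of the aggregating driver.
 */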

static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 *
 * Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

static int __netdev_printk(const char *level, const struct net_device *dev,
			   struct va_format *vaf)
{
	int r;

	if (dev && dev->dev.parent)
		r = dev_printk(level, dev->dev.parent, "%s: %pV",
			       netdev_name(dev), vaf);
	else if (dev)
		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
	else
		r = printk("%s(NULL net_device): %pV", level, vaf);

	return r;
}

int netdev_printk(const char *level, const struct net_device *dev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;
	int r;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	r = __netdev_printk(level, dev, &vaf);
	va_end(args);

	return r;
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
int func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	int r;								\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	r = __netdev_printk(level, dev, &vaf);				\
	va_end(args);							\
									\
	return r;							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
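
/*
 * Editorial sketch (assumption, not part of the original source): drivers
 * use the level helpers generated above just like printk(), and the device
 * name (plus the parent device, if any) is prefixed automatically:
 *
 *	netdev_err(dev, "TX timeout on queue %d\n", txq);
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 *
 * "txq" and "speed" are hypothetical local variables.
 */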

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (e.g. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
			       __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	list_del(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module.  At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single-threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too.  Since we now dynamically allocate and free
	 * the loopback device, ensure this invariant is maintained by
	 * keeping the loopback device the first device on the list of
	 * network devices, so that it is the first device that appears
	 * and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);

static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);
6530