Loading...
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
100#include <linux/proc_fs.h>
101#include <linux/seq_file.h>
102#include <linux/stat.h>
103#include <net/dst.h>
104#include <net/pkt_sched.h>
105#include <net/checksum.h>
106#include <net/xfrm.h>
107#include <linux/highmem.h>
108#include <linux/init.h>
109#include <linux/kmod.h>
110#include <linux/module.h>
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
114#include <net/wext.h>
115#include <net/iw_handler.h>
116#include <asm/current.h>
117#include <linux/audit.h>
118#include <linux/dmaengine.h>
119#include <linux/err.h>
120#include <linux/ctype.h>
121#include <linux/if_arp.h>
122#include <linux/if_vlan.h>
123#include <linux/ip.h>
124#include <net/ip.h>
125#include <linux/ipv6.h>
126#include <linux/in.h>
127#include <linux/jhash.h>
128#include <linux/random.h>
129#include <trace/events/napi.h>
130#include <trace/events/net.h>
131#include <trace/events/skb.h>
132#include <linux/pci.h>
133#include <linux/inetdevice.h>
134#include <linux/cpu_rmap.h>
135#include <linux/net_tstamp.h>
136#include <linux/static_key.h>
137#include <net/flow_keys.h>
138
139#include "net-sysfs.h"
140
141/* Instead of increasing this, you should create a hash table. */
142#define MAX_GRO_SKBS 8
143
144/* This should be increased if a protocol with a bigger head is added. */
145#define GRO_MAX_HEAD (MAX_HEADER + 128)
146
147/*
148 * The list of packet types we will receive (as opposed to discard)
149 * and the routines to invoke.
150 *
151 * Why 16. Because with 16 the only overlap we get on a hash of the
152 * low nibble of the protocol value is RARP/SNAP/X.25.
153 *
154 * NOTE: That is no longer true with the addition of VLAN tags. Not
155 * sure which should go first, but I bet it won't make much
156 * difference if we are running VLANs. The good news is that
157 * this protocol won't be in the list unless compiled in, so
158 * the average user (w/out VLANs) will not be adversely affected.
159 * --BLG
160 *
161 * 0800 IP
162 * 8100 802.1Q VLAN
163 * 0001 802.3
164 * 0002 AX.25
165 * 0004 802.2
166 * 8035 RARP
167 * 0005 SNAP
168 * 0805 X.25
169 * 0806 ARP
170 * 8137 IPX
171 * 0009 Localtalk
172 * 86DD IPv6
173 */
174
175#define PTYPE_HASH_SIZE (16)
176#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177
178static DEFINE_SPINLOCK(ptype_lock);
179static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180static struct list_head ptype_all __read_mostly; /* Taps */
181
182/*
183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184 * semaphore.
185 *
186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 *
188 * Writers must hold the rtnl semaphore while they loop through the
189 * dev_base_head list, and hold dev_base_lock for writing when they do the
190 * actual updates. This allows pure readers to access the list even
191 * while a writer is preparing to update it.
192 *
193 * To put it another way, dev_base_lock is held for writing only to
194 * protect against pure readers; the rtnl semaphore provides the
195 * protection against other writers.
196 *
197 * See, for example usages, register_netdevice() and
198 * unregister_netdevice(), which must be called with the rtnl
199 * semaphore held.
200 */
201DEFINE_RWLOCK(dev_base_lock);
202EXPORT_SYMBOL(dev_base_lock);
203
204static inline void dev_base_seq_inc(struct net *net)
205{
206 while (++net->dev_base_seq == 0);
207}
208
209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210{
211 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212
213 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214}
215
216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217{
218 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219}
220
/*
 * Take the spinlock of @sd's input packet queue.  Compiles to nothing
 * unless CONFIG_RPS is set.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_lock(queue_lock);
#endif
}
227
/*
 * Release the spinlock of @sd's input packet queue (counterpart of
 * rps_lock()).  Compiles to nothing unless CONFIG_RPS is set.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_unlock(queue_lock);
#endif
}
234
235/* Device list insertion */
236static int list_netdevice(struct net_device *dev)
237{
238 struct net *net = dev_net(dev);
239
240 ASSERT_RTNL();
241
242 write_lock_bh(&dev_base_lock);
243 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
244 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
245 hlist_add_head_rcu(&dev->index_hlist,
246 dev_index_hash(net, dev->ifindex));
247 write_unlock_bh(&dev_base_lock);
248
249 dev_base_seq_inc(net);
250
251 return 0;
252}
253
254/* Device list removal
255 * caller must respect a RCU grace period before freeing/reusing dev
256 */
257static void unlist_netdevice(struct net_device *dev)
258{
259 ASSERT_RTNL();
260
261 /* Unlink dev from the device chain */
262 write_lock_bh(&dev_base_lock);
263 list_del_rcu(&dev->dev_list);
264 hlist_del_rcu(&dev->name_hlist);
265 hlist_del_rcu(&dev->index_hlist);
266 write_unlock_bh(&dev_base_lock);
267
268 dev_base_seq_inc(dev_net(dev));
269}
270
271/*
272 * Our notifier list
273 */
274
275static RAW_NOTIFIER_HEAD(netdev_chain);
276
277/*
278 * Device drivers call our routines to queue packets here. We empty the
279 * queue in the local softnet handler.
280 */
281
282DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
283EXPORT_PER_CPU_SYMBOL(softnet_data);
284
285#ifdef CONFIG_LOCKDEP
286/*
287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
288 * according to dev->type
289 */
/*
 * Device types that get their own lockdep class.  netdev_lock_pos() scans
 * this table; a type that is not listed falls back to the last entry.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/*
 * Lock class names, indexed in parallel with netdev_lock_type[]: entry i
 * names the class used for netdev_lock_type[i].  Keep both tables in the
 * same order and of the same length.
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, for the xmit locks and the address-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
326
327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328{
329 int i;
330
331 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332 if (netdev_lock_type[i] == dev_type)
333 return i;
334 /* the last key is used by default */
335 return ARRAY_SIZE(netdev_lock_type) - 1;
336}
337
338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 unsigned short dev_type)
340{
341 int i;
342
343 i = netdev_lock_pos(dev_type);
344 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345 netdev_lock_name[i]);
346}
347
348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349{
350 int i;
351
352 i = netdev_lock_pos(dev->type);
353 lockdep_set_class_and_name(&dev->addr_list_lock,
354 &netdev_addr_lock_key[i],
355 netdev_lock_name[i]);
356}
357#else
/* Variant built when CONFIG_LOCKDEP is not set: intentionally does nothing. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
/* Variant built when CONFIG_LOCKDEP is not set: intentionally does nothing. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
365#endif
366
367/*******************************************************************************
368
369 Protocol management and registration routines
370
371*******************************************************************************/
372
373/*
374 * Add a protocol ID to the list. Now that the input handler is
375 * smarter we can dispense with all the messy stuff that used to be
376 * here.
377 *
378 * BEWARE!!! Protocol handlers, mangling input packets,
379 * MUST BE last in hash buckets and checking protocol handlers
380 * MUST start from promiscuous ptype_all chain in net_bh.
381 * It is true now, do not change it.
382 * Explanation follows: if protocol handler, mangling packet, will
383 * be the first on list, it is not able to sense, that packet
384 * is cloned and should be copied-on-write, so that it will
385 * change it and subsequent readers will get broken packet.
386 * --ANK (980803)
387 */
388
389static inline struct list_head *ptype_head(const struct packet_type *pt)
390{
391 if (pt->type == htons(ETH_P_ALL))
392 return &ptype_all;
393 else
394 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395}
396
397/**
398 * dev_add_pack - add packet handler
399 * @pt: packet type declaration
400 *
401 * Add a protocol handler to the networking stack. The passed &packet_type
402 * is linked into kernel lists and may not be freed until it has been
403 * removed from the kernel lists.
404 *
405 * This call does not sleep therefore it can not
406 * guarantee all CPU's that are in middle of receiving packets
407 * will see the new packet type (until the next received packet).
408 */
409
410void dev_add_pack(struct packet_type *pt)
411{
412 struct list_head *head = ptype_head(pt);
413
414 spin_lock(&ptype_lock);
415 list_add_rcu(&pt->list, head);
416 spin_unlock(&ptype_lock);
417}
418EXPORT_SYMBOL(dev_add_pack);
419
420/**
421 * __dev_remove_pack - remove packet handler
422 * @pt: packet type declaration
423 *
424 * Remove a protocol handler that was previously added to the kernel
425 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
426 * from the kernel lists and can be freed or reused once this function
427 * returns.
428 *
429 * The packet type might still be in use by receivers
430 * and must not be freed until after all the CPU's have gone
431 * through a quiescent state.
432 */
433void __dev_remove_pack(struct packet_type *pt)
434{
435 struct list_head *head = ptype_head(pt);
436 struct packet_type *pt1;
437
438 spin_lock(&ptype_lock);
439
440 list_for_each_entry(pt1, head, list) {
441 if (pt == pt1) {
442 list_del_rcu(&pt->list);
443 goto out;
444 }
445 }
446
447 pr_warn("dev_remove_pack: %p not found\n", pt);
448out:
449 spin_unlock(&ptype_lock);
450}
451EXPORT_SYMBOL(__dev_remove_pack);
452
/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously added with
 *	dev_add_pack(), then wait in synchronize_net().  Once this function
 *	returns, @pt can be freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait until no receiver can still be using @pt. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
472
473/******************************************************************************
474
475 Device Boot-time Settings Routines
476
477*******************************************************************************/
478
479/* Boot time configuration table */
480static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481
482/**
483 * netdev_boot_setup_add - add new setup entry
484 * @name: name of the device
485 * @map: configured settings for the device
486 *
487 * Adds new setup entry to the dev_boot_setup list. The function
488 * returns 0 on error and 1 on success. This is a generic routine to
489 * all netdevices.
490 */
491static int netdev_boot_setup_add(char *name, struct ifmap *map)
492{
493 struct netdev_boot_setup *s;
494 int i;
495
496 s = dev_boot_setup;
497 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499 memset(s[i].name, 0, sizeof(s[i].name));
500 strlcpy(s[i].name, name, IFNAMSIZ);
501 memcpy(&s[i].map, map, sizeof(s[i].map));
502 break;
503 }
504 }
505
506 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507}
508
509/**
510 * netdev_boot_setup_check - check boot time settings
511 * @dev: the netdevice
512 *
513 * Check boot time settings for the device.
514 * The found settings are set for the device to be used
515 * later in the device probing.
516 * Returns 0 if no settings found, 1 if they are.
517 */
518int netdev_boot_setup_check(struct net_device *dev)
519{
520 struct netdev_boot_setup *s = dev_boot_setup;
521 int i;
522
523 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
525 !strcmp(dev->name, s[i].name)) {
526 dev->irq = s[i].map.irq;
527 dev->base_addr = s[i].map.base_addr;
528 dev->mem_start = s[i].map.mem_start;
529 dev->mem_end = s[i].map.mem_end;
530 return 1;
531 }
532 }
533 return 0;
534}
535EXPORT_SYMBOL(netdev_boot_setup_check);
536
537
538/**
539 * netdev_boot_base - get address from boot time settings
540 * @prefix: prefix for network device
541 * @unit: id for network device
542 *
543 * Check boot time settings for the base address of device.
544 * The found settings are set for the device to be used
545 * later in the device probing.
546 * Returns 0 if no settings found.
547 */
548unsigned long netdev_boot_base(const char *prefix, int unit)
549{
550 const struct netdev_boot_setup *s = dev_boot_setup;
551 char name[IFNAMSIZ];
552 int i;
553
554 sprintf(name, "%s%d", prefix, unit);
555
556 /*
557 * If device already registered then return base of 1
558 * to indicate not to probe for this interface
559 */
560 if (__dev_get_by_name(&init_net, name))
561 return 1;
562
563 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564 if (!strcmp(name, s[i].name))
565 return s[i].map.base_addr;
566 return 0;
567}
568
569/*
570 * Saves at boot time configured settings for any netdevice.
571 */
572int __init netdev_boot_setup(char *str)
573{
574 int ints[5];
575 struct ifmap map;
576
577 str = get_options(str, ARRAY_SIZE(ints), ints);
578 if (!str || !*str)
579 return 0;
580
581 /* Save settings */
582 memset(&map, 0, sizeof(map));
583 if (ints[0] > 0)
584 map.irq = ints[1];
585 if (ints[0] > 1)
586 map.base_addr = ints[2];
587 if (ints[0] > 2)
588 map.mem_start = ints[3];
589 if (ints[0] > 3)
590 map.mem_end = ints[4];
591
592 /* Add new entry to the list */
593 return netdev_boot_setup_add(str, &map);
594}
595
596__setup("netdev=", netdev_boot_setup);
597
598/*******************************************************************************
599
600 Device Interface Subroutines
601
602*******************************************************************************/
603
604/**
605 * __dev_get_by_name - find a device by its name
606 * @net: the applicable net namespace
607 * @name: name to find
608 *
609 * Find an interface by name. Must be called under RTNL semaphore
610 * or @dev_base_lock. If the name is found a pointer to the device
611 * is returned. If the name is not found then %NULL is returned. The
612 * reference counters are not incremented so the caller must be
613 * careful with locks.
614 */
615
616struct net_device *__dev_get_by_name(struct net *net, const char *name)
617{
618 struct hlist_node *p;
619 struct net_device *dev;
620 struct hlist_head *head = dev_name_hash(net, name);
621
622 hlist_for_each_entry(dev, p, head, name_hlist)
623 if (!strncmp(dev->name, name, IFNAMSIZ))
624 return dev;
625
626 return NULL;
627}
628EXPORT_SYMBOL(__dev_get_by_name);
629
630/**
631 * dev_get_by_name_rcu - find a device by its name
632 * @net: the applicable net namespace
633 * @name: name to find
634 *
635 * Find an interface by name.
636 * If the name is found a pointer to the device is returned.
637 * If the name is not found then %NULL is returned.
638 * The reference counters are not incremented so the caller must be
639 * careful with locks. The caller must hold RCU lock.
640 */
641
642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643{
644 struct hlist_node *p;
645 struct net_device *dev;
646 struct hlist_head *head = dev_name_hash(net, name);
647
648 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649 if (!strncmp(dev->name, name, IFNAMSIZ))
650 return dev;
651
652 return NULL;
653}
654EXPORT_SYMBOL(dev_get_by_name_rcu);
655
656/**
657 * dev_get_by_name - find a device by its name
658 * @net: the applicable net namespace
659 * @name: name to find
660 *
661 * Find an interface by name. This can be called from any
662 * context and does its own locking. The returned handle has
663 * the usage count incremented and the caller must use dev_put() to
664 * release it when it is no longer needed. %NULL is returned if no
665 * matching device is found.
666 */
667
668struct net_device *dev_get_by_name(struct net *net, const char *name)
669{
670 struct net_device *dev;
671
672 rcu_read_lock();
673 dev = dev_get_by_name_rcu(net, name);
674 if (dev)
675 dev_hold(dev);
676 rcu_read_unlock();
677 return dev;
678}
679EXPORT_SYMBOL(dev_get_by_name);
680
681/**
682 * __dev_get_by_index - find a device by its ifindex
683 * @net: the applicable net namespace
684 * @ifindex: index of device
685 *
686 * Search for an interface by index. Returns %NULL if the device
687 * is not found or a pointer to the device. The device has not
688 * had its reference counter increased so the caller must be careful
689 * about locking. The caller must hold either the RTNL semaphore
690 * or @dev_base_lock.
691 */
692
693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
694{
695 struct hlist_node *p;
696 struct net_device *dev;
697 struct hlist_head *head = dev_index_hash(net, ifindex);
698
699 hlist_for_each_entry(dev, p, head, index_hlist)
700 if (dev->ifindex == ifindex)
701 return dev;
702
703 return NULL;
704}
705EXPORT_SYMBOL(__dev_get_by_index);
706
707/**
708 * dev_get_by_index_rcu - find a device by its ifindex
709 * @net: the applicable net namespace
710 * @ifindex: index of device
711 *
712 * Search for an interface by index. Returns %NULL if the device
713 * is not found or a pointer to the device. The device has not
714 * had its reference counter increased so the caller must be careful
715 * about locking. The caller must hold RCU lock.
716 */
717
718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719{
720 struct hlist_node *p;
721 struct net_device *dev;
722 struct hlist_head *head = dev_index_hash(net, ifindex);
723
724 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725 if (dev->ifindex == ifindex)
726 return dev;
727
728 return NULL;
729}
730EXPORT_SYMBOL(dev_get_by_index_rcu);
731
732
733/**
734 * dev_get_by_index - find a device by its ifindex
735 * @net: the applicable net namespace
736 * @ifindex: index of device
737 *
738 * Search for an interface by index. Returns NULL if the device
739 * is not found or a pointer to the device. The device returned has
740 * had a reference added and the pointer is safe until the user calls
741 * dev_put to indicate they have finished with it.
742 */
743
744struct net_device *dev_get_by_index(struct net *net, int ifindex)
745{
746 struct net_device *dev;
747
748 rcu_read_lock();
749 dev = dev_get_by_index_rcu(net, ifindex);
750 if (dev)
751 dev_hold(dev);
752 rcu_read_unlock();
753 return dev;
754}
755EXPORT_SYMBOL(dev_get_by_index);
756
757/**
758 * dev_getbyhwaddr_rcu - find a device by its hardware address
759 * @net: the applicable net namespace
760 * @type: media type of device
761 * @ha: hardware address
762 *
763 * Search for an interface by MAC address. Returns NULL if the device
764 * is not found or a pointer to the device.
765 * The caller must hold RCU or RTNL.
766 * The returned device has not had its ref count increased
767 * and the caller must therefore be careful about locking
768 *
769 */
770
771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772 const char *ha)
773{
774 struct net_device *dev;
775
776 for_each_netdev_rcu(net, dev)
777 if (dev->type == type &&
778 !memcmp(dev->dev_addr, ha, dev->addr_len))
779 return dev;
780
781 return NULL;
782}
783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
784
785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
786{
787 struct net_device *dev;
788
789 ASSERT_RTNL();
790 for_each_netdev(net, dev)
791 if (dev->type == type)
792 return dev;
793
794 return NULL;
795}
796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
797
798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
799{
800 struct net_device *dev, *ret = NULL;
801
802 rcu_read_lock();
803 for_each_netdev_rcu(net, dev)
804 if (dev->type == type) {
805 dev_hold(dev);
806 ret = dev;
807 break;
808 }
809 rcu_read_unlock();
810 return ret;
811}
812EXPORT_SYMBOL(dev_getfirstbyhwtype);
813
814/**
815 * dev_get_by_flags_rcu - find any device with given flags
816 * @net: the applicable net namespace
817 * @if_flags: IFF_* values
818 * @mask: bitmask of bits in if_flags to check
819 *
820 * Search for any interface with the given flags. Returns NULL if a device
821 * is not found or a pointer to the device. Must be called inside
822 * rcu_read_lock(), and result refcount is unchanged.
823 */
824
825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
826 unsigned short mask)
827{
828 struct net_device *dev, *ret;
829
830 ret = NULL;
831 for_each_netdev_rcu(net, dev) {
832 if (((dev->flags ^ if_flags) & mask) == 0) {
833 ret = dev;
834 break;
835 }
836 }
837 return ret;
838}
839EXPORT_SYMBOL(dev_get_by_flags_rcu);
840
841/**
842 * dev_valid_name - check if name is okay for network device
843 * @name: name string
844 *
845 * Network device names need to be valid file names to
846 * to allow sysfs to work. We also disallow any kind of
847 * whitespace.
848 */
849bool dev_valid_name(const char *name)
850{
851 if (*name == '\0')
852 return false;
853 if (strlen(name) >= IFNAMSIZ)
854 return false;
855 if (!strcmp(name, ".") || !strcmp(name, ".."))
856 return false;
857
858 while (*name) {
859 if (*name == '/' || isspace(*name))
860 return false;
861 name++;
862 }
863 return true;
864}
865EXPORT_SYMBOL(dev_valid_name);
866
867/**
868 * __dev_alloc_name - allocate a name for a device
869 * @net: network namespace to allocate the device name in
870 * @name: name format string
871 * @buf: scratch buffer and result name string
872 *
873 * Passed a format string - eg "lt%d" it will try and find a suitable
874 * id. It scans list of devices to build up a free map, then chooses
875 * the first empty slot. The caller must hold the dev_base or rtnl lock
876 * while allocating the name and adding the device in order to avoid
877 * duplicates.
878 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
879 * Returns the number of the unit assigned or a negative errno code.
880 */
881
882static int __dev_alloc_name(struct net *net, const char *name, char *buf)
883{
884 int i = 0;
885 const char *p;
886 const int max_netdevices = 8*PAGE_SIZE;
887 unsigned long *inuse;
888 struct net_device *d;
889
890 p = strnchr(name, IFNAMSIZ-1, '%');
891 if (p) {
892 /*
893 * Verify the string as this thing may have come from
894 * the user. There must be either one "%d" and no other "%"
895 * characters.
896 */
897 if (p[1] != 'd' || strchr(p + 2, '%'))
898 return -EINVAL;
899
900 /* Use one page as a bit array of possible slots */
901 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
902 if (!inuse)
903 return -ENOMEM;
904
905 for_each_netdev(net, d) {
906 if (!sscanf(d->name, name, &i))
907 continue;
908 if (i < 0 || i >= max_netdevices)
909 continue;
910
911 /* avoid cases where sscanf is not exact inverse of printf */
912 snprintf(buf, IFNAMSIZ, name, i);
913 if (!strncmp(buf, d->name, IFNAMSIZ))
914 set_bit(i, inuse);
915 }
916
917 i = find_first_zero_bit(inuse, max_netdevices);
918 free_page((unsigned long) inuse);
919 }
920
921 if (buf != name)
922 snprintf(buf, IFNAMSIZ, name, i);
923 if (!__dev_get_by_name(net, buf))
924 return i;
925
926 /* It is possible to run out of possible slots
927 * when the name is long and there isn't enough space left
928 * for the digits, or if all bits are used.
929 */
930 return -ENFILE;
931}
932
933/**
934 * dev_alloc_name - allocate a name for a device
935 * @dev: device
936 * @name: name format string
937 *
938 * Passed a format string - eg "lt%d" it will try and find a suitable
939 * id. It scans list of devices to build up a free map, then chooses
940 * the first empty slot. The caller must hold the dev_base or rtnl lock
941 * while allocating the name and adding the device in order to avoid
942 * duplicates.
943 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
944 * Returns the number of the unit assigned or a negative errno code.
945 */
946
947int dev_alloc_name(struct net_device *dev, const char *name)
948{
949 char buf[IFNAMSIZ];
950 struct net *net;
951 int ret;
952
953 BUG_ON(!dev_net(dev));
954 net = dev_net(dev);
955 ret = __dev_alloc_name(net, name, buf);
956 if (ret >= 0)
957 strlcpy(dev->name, buf, IFNAMSIZ);
958 return ret;
959}
960EXPORT_SYMBOL(dev_alloc_name);
961
962static int dev_get_valid_name(struct net_device *dev, const char *name)
963{
964 struct net *net;
965
966 BUG_ON(!dev_net(dev));
967 net = dev_net(dev);
968
969 if (!dev_valid_name(name))
970 return -EINVAL;
971
972 if (strchr(name, '%'))
973 return dev_alloc_name(dev, name);
974 else if (__dev_get_by_name(net, name))
975 return -EEXIST;
976 else if (dev->name != name)
977 strlcpy(dev->name, name, IFNAMSIZ);
978
979 return 0;
980}
981
982/**
983 * dev_change_name - change name of a device
984 * @dev: device
985 * @newname: name (or format string) must be at least IFNAMSIZ
986 *
987 * Change name of a device, can pass format strings "eth%d".
988 * for wildcarding.
989 */
990int dev_change_name(struct net_device *dev, const char *newname)
991{
992 char oldname[IFNAMSIZ];
993 int err = 0;
994 int ret;
995 struct net *net;
996
997 ASSERT_RTNL();
998 BUG_ON(!dev_net(dev));
999
1000 net = dev_net(dev);
1001 if (dev->flags & IFF_UP)
1002 return -EBUSY;
1003
1004 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005 return 0;
1006
1007 memcpy(oldname, dev->name, IFNAMSIZ);
1008
1009 err = dev_get_valid_name(dev, newname);
1010 if (err < 0)
1011 return err;
1012
1013rollback:
1014 ret = device_rename(&dev->dev, dev->name);
1015 if (ret) {
1016 memcpy(dev->name, oldname, IFNAMSIZ);
1017 return ret;
1018 }
1019
1020 write_lock_bh(&dev_base_lock);
1021 hlist_del_rcu(&dev->name_hlist);
1022 write_unlock_bh(&dev_base_lock);
1023
1024 synchronize_rcu();
1025
1026 write_lock_bh(&dev_base_lock);
1027 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028 write_unlock_bh(&dev_base_lock);
1029
1030 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031 ret = notifier_to_errno(ret);
1032
1033 if (ret) {
1034 /* err >= 0 after dev_alloc_name() or stores the first errno */
1035 if (err >= 0) {
1036 err = ret;
1037 memcpy(dev->name, oldname, IFNAMSIZ);
1038 goto rollback;
1039 } else {
1040 pr_err("%s: name change rollback failed: %d\n",
1041 dev->name, ret);
1042 }
1043 }
1044
1045 return err;
1046}
1047
1048/**
1049 * dev_set_alias - change ifalias of a device
1050 * @dev: device
1051 * @alias: name up to IFALIASZ
1052 * @len: limit of bytes to copy from info
1053 *
1054 * Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058 char *new_ifalias;
1059
1060 ASSERT_RTNL();
1061
1062 if (len >= IFALIASZ)
1063 return -EINVAL;
1064
1065 if (!len) {
1066 if (dev->ifalias) {
1067 kfree(dev->ifalias);
1068 dev->ifalias = NULL;
1069 }
1070 return 0;
1071 }
1072
1073 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074 if (!new_ifalias)
1075 return -ENOMEM;
1076 dev->ifalias = new_ifalias;
1077
1078 strlcpy(dev->ifalias, alias, len+1);
1079 return len;
1080}
1081
1082
1083/**
1084 * netdev_features_change - device changes features
1085 * @dev: device to cause notification
1086 *
1087 * Called to indicate a device has changed features.
1088 */
1089void netdev_features_change(struct net_device *dev)
1090{
1091 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092}
1093EXPORT_SYMBOL(netdev_features_change);
1094
1095/**
1096 * netdev_state_change - device changes state
1097 * @dev: device to cause notification
1098 *
1099 * Called to indicate a device has changed state. This function calls
1100 * the notifier chains for netdev_chain and sends a NEWLINK message
1101 * to the routing socket.
1102 */
1103void netdev_state_change(struct net_device *dev)
1104{
1105 if (dev->flags & IFF_UP) {
1106 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1108 }
1109}
1110EXPORT_SYMBOL(netdev_state_change);
1111
/*
 * netdev_bonding_change - forward @event for @dev to the netdevice
 * notifier chain and hand back the chain's return value unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret = call_netdevice_notifiers(event, dev);

	return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1117
1118/**
1119 * dev_load - load a network module
1120 * @net: the applicable net namespace
1121 * @name: name of interface
1122 *
1123 * If a network interface is not present and the process has suitable
1124 * privileges this function loads the module. If module loading is not
1125 * available in this kernel then it becomes a nop.
1126 */
1127
1128void dev_load(struct net *net, const char *name)
1129{
1130 struct net_device *dev;
1131 int no_module;
1132
1133 rcu_read_lock();
1134 dev = dev_get_by_name_rcu(net, name);
1135 rcu_read_unlock();
1136
1137 no_module = !dev;
1138 if (no_module && capable(CAP_NET_ADMIN))
1139 no_module = request_module("netdev-%s", name);
1140 if (no_module && capable(CAP_SYS_MODULE)) {
1141 if (!request_module("%s", name))
1142 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1143 name);
1144 }
1145}
1146EXPORT_SYMBOL(dev_load);
1147
1148static int __dev_open(struct net_device *dev)
1149{
1150 const struct net_device_ops *ops = dev->netdev_ops;
1151 int ret;
1152
1153 ASSERT_RTNL();
1154
1155 if (!netif_device_present(dev))
1156 return -ENODEV;
1157
1158 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1159 ret = notifier_to_errno(ret);
1160 if (ret)
1161 return ret;
1162
1163 set_bit(__LINK_STATE_START, &dev->state);
1164
1165 if (ops->ndo_validate_addr)
1166 ret = ops->ndo_validate_addr(dev);
1167
1168 if (!ret && ops->ndo_open)
1169 ret = ops->ndo_open(dev);
1170
1171 if (ret)
1172 clear_bit(__LINK_STATE_START, &dev->state);
1173 else {
1174 dev->flags |= IFF_UP;
1175 net_dmaengine_get();
1176 dev_set_rx_mode(dev);
1177 dev_activate(dev);
1178 add_device_randomness(dev->dev_addr, dev->addr_len);
1179 }
1180
1181 return ret;
1182}
1183
1184/**
1185 * dev_open - prepare an interface for use.
1186 * @dev: device to open
1187 *
1188 * Takes a device from down to up state. The device's private open
1189 * function is invoked and then the multicast lists are loaded. Finally
1190 * the device is moved into the up state and a %NETDEV_UP message is
1191 * sent to the netdev notifier chain.
1192 *
1193 * Calling this function on an active interface is a nop. On a failure
1194 * a negative errno code is returned.
1195 */
1196int dev_open(struct net_device *dev)
1197{
1198 int ret;
1199
1200 if (dev->flags & IFF_UP)
1201 return 0;
1202
1203 ret = __dev_open(dev);
1204 if (ret < 0)
1205 return ret;
1206
1207 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208 call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210 return ret;
1211}
1212EXPORT_SYMBOL(dev_open);
1213
1214static int __dev_close_many(struct list_head *head)
1215{
1216 struct net_device *dev;
1217
1218 ASSERT_RTNL();
1219 might_sleep();
1220
1221 list_for_each_entry(dev, head, unreg_list) {
1222 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1223
1224 clear_bit(__LINK_STATE_START, &dev->state);
1225
1226 /* Synchronize to scheduled poll. We cannot touch poll list, it
1227 * can be even on different cpu. So just clear netif_running().
1228 *
1229 * dev->stop() will invoke napi_disable() on all of it's
1230 * napi_struct instances on this device.
1231 */
1232 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1233 }
1234
1235 dev_deactivate_many(head);
1236
1237 list_for_each_entry(dev, head, unreg_list) {
1238 const struct net_device_ops *ops = dev->netdev_ops;
1239
1240 /*
1241 * Call the device specific close. This cannot fail.
1242 * Only if device is UP
1243 *
1244 * We allow it to be called even after a DETACH hot-plug
1245 * event.
1246 */
1247 if (ops->ndo_stop)
1248 ops->ndo_stop(dev);
1249
1250 dev->flags &= ~IFF_UP;
1251 net_dmaengine_put();
1252 }
1253
1254 return 0;
1255}
1256
1257static int __dev_close(struct net_device *dev)
1258{
1259 int retval;
1260 LIST_HEAD(single);
1261
1262 list_add(&dev->unreg_list, &single);
1263 retval = __dev_close_many(&single);
1264 list_del(&single);
1265 return retval;
1266}
1267
1268static int dev_close_many(struct list_head *head)
1269{
1270 struct net_device *dev, *tmp;
1271 LIST_HEAD(tmp_list);
1272
1273 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1274 if (!(dev->flags & IFF_UP))
1275 list_move(&dev->unreg_list, &tmp_list);
1276
1277 __dev_close_many(head);
1278
1279 list_for_each_entry(dev, head, unreg_list) {
1280 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281 call_netdevice_notifiers(NETDEV_DOWN, dev);
1282 }
1283
1284 /* rollback_registered_many needs the complete original list */
1285 list_splice(&tmp_list, head);
1286 return 0;
1287}
1288
1289/**
1290 * dev_close - shutdown an interface.
1291 * @dev: device to shutdown
1292 *
1293 * This function moves an active device into down state. A
1294 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296 * chain.
1297 */
1298int dev_close(struct net_device *dev)
1299{
1300 if (dev->flags & IFF_UP) {
1301 LIST_HEAD(single);
1302
1303 list_add(&dev->unreg_list, &single);
1304 dev_close_many(&single);
1305 list_del(&single);
1306 }
1307 return 0;
1308}
1309EXPORT_SYMBOL(dev_close);
1310
1311
1312/**
1313 * dev_disable_lro - disable Large Receive Offload on a device
1314 * @dev: device
1315 *
1316 * Disable Large Receive Offload (LRO) on a net device. Must be
1317 * called under RTNL. This is needed if received packets may be
1318 * forwarded to another interface.
1319 */
1320void dev_disable_lro(struct net_device *dev)
1321{
1322 /*
1323 * If we're trying to disable lro on a vlan device
1324 * use the underlying physical device instead
1325 */
1326 if (is_vlan_dev(dev))
1327 dev = vlan_dev_real_dev(dev);
1328
1329 dev->wanted_features &= ~NETIF_F_LRO;
1330 netdev_update_features(dev);
1331
1332 if (unlikely(dev->features & NETIF_F_LRO))
1333 netdev_WARN(dev, "failed to disable LRO!\n");
1334}
1335EXPORT_SYMBOL(dev_disable_lro);
1336
1337
1338static int dev_boot_phase = 1;
1339
1340/**
1341 * register_netdevice_notifier - register a network notifier block
1342 * @nb: notifier
1343 *
1344 * Register a notifier to be called when network device events occur.
1345 * The notifier passed is linked into the kernel structures and must
1346 * not be reused until it has been unregistered. A negative errno code
1347 * is returned on a failure.
1348 *
1349 * When registered all registration and up events are replayed
1350 * to the new notifier to allow device to have a race free
1351 * view of the network device list.
1352 */
1353
1354int register_netdevice_notifier(struct notifier_block *nb)
1355{
1356 struct net_device *dev;
1357 struct net_device *last;
1358 struct net *net;
1359 int err;
1360
1361 rtnl_lock();
1362 err = raw_notifier_chain_register(&netdev_chain, nb);
1363 if (err)
1364 goto unlock;
1365 if (dev_boot_phase)
1366 goto unlock;
1367 for_each_net(net) {
1368 for_each_netdev(net, dev) {
1369 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370 err = notifier_to_errno(err);
1371 if (err)
1372 goto rollback;
1373
1374 if (!(dev->flags & IFF_UP))
1375 continue;
1376
1377 nb->notifier_call(nb, NETDEV_UP, dev);
1378 }
1379 }
1380
1381unlock:
1382 rtnl_unlock();
1383 return err;
1384
1385rollback:
1386 last = dev;
1387 for_each_net(net) {
1388 for_each_netdev(net, dev) {
1389 if (dev == last)
1390 goto outroll;
1391
1392 if (dev->flags & IFF_UP) {
1393 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394 nb->notifier_call(nb, NETDEV_DOWN, dev);
1395 }
1396 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1397 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1398 }
1399 }
1400
1401outroll:
1402 raw_notifier_chain_unregister(&netdev_chain, nb);
1403 goto unlock;
1404}
1405EXPORT_SYMBOL(register_netdevice_notifier);
1406
1407/**
1408 * unregister_netdevice_notifier - unregister a network notifier block
1409 * @nb: notifier
1410 *
1411 * Unregister a notifier previously registered by
1412 * register_netdevice_notifier(). The notifier is unlinked into the
1413 * kernel structures and may then be reused. A negative errno code
1414 * is returned on a failure.
1415 *
1416 * After unregistering unregister and down device events are synthesized
1417 * for all devices on the device list to the removed notifier to remove
1418 * the need for special case cleanup code.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423 struct net_device *dev;
1424 struct net *net;
1425 int err;
1426
1427 rtnl_lock();
1428 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1429 if (err)
1430 goto unlock;
1431
1432 for_each_net(net) {
1433 for_each_netdev(net, dev) {
1434 if (dev->flags & IFF_UP) {
1435 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1436 nb->notifier_call(nb, NETDEV_DOWN, dev);
1437 }
1438 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1439 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1440 }
1441 }
1442unlock:
1443 rtnl_unlock();
1444 return err;
1445}
1446EXPORT_SYMBOL(unregister_netdevice_notifier);
1447
1448/**
1449 * call_netdevice_notifiers - call all network notifier blocks
1450 * @val: value passed unmodified to notifier function
1451 * @dev: net_device pointer passed unmodified to notifier function
1452 *
1453 * Call all network notifier blocks. Parameters and return value
1454 * are as for raw_notifier_call_chain().
1455 */
1456
1457int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1458{
1459 ASSERT_RTNL();
1460 return raw_notifier_call_chain(&netdev_chain, val, dev);
1461}
1462EXPORT_SYMBOL(call_netdevice_notifiers);
1463
1464static struct static_key netstamp_needed __read_mostly;
1465#ifdef HAVE_JUMP_LABEL
1466/* We are not allowed to call static_key_slow_dec() from irq context
1467 * If net_disable_timestamp() is called from irq context, defer the
1468 * static_key_slow_dec() calls.
1469 */
1470static atomic_t netstamp_needed_deferred;
1471#endif
1472
1473void net_enable_timestamp(void)
1474{
1475#ifdef HAVE_JUMP_LABEL
1476 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1477
1478 if (deferred) {
1479 while (--deferred)
1480 static_key_slow_dec(&netstamp_needed);
1481 return;
1482 }
1483#endif
1484 WARN_ON(in_interrupt());
1485 static_key_slow_inc(&netstamp_needed);
1486}
1487EXPORT_SYMBOL(net_enable_timestamp);
1488
1489void net_disable_timestamp(void)
1490{
1491#ifdef HAVE_JUMP_LABEL
1492 if (in_interrupt()) {
1493 atomic_inc(&netstamp_needed_deferred);
1494 return;
1495 }
1496#endif
1497 static_key_slow_dec(&netstamp_needed);
1498}
1499EXPORT_SYMBOL(net_disable_timestamp);
1500
1501static inline void net_timestamp_set(struct sk_buff *skb)
1502{
1503 skb->tstamp.tv64 = 0;
1504 if (static_key_false(&netstamp_needed))
1505 __net_timestamp(skb);
1506}
1507
/*
 * net_timestamp_check - stamp @SKB if timestamping is on, @COND holds and
 * the skb carries no timestamp yet (tstamp.tv64 == 0).
 *
 * Wrapped in do { } while (0) so the expansion is a single statement:
 * it can be followed by a semicolon and used as the body of an unbraced
 * if/else without capturing the caller's "else".  Note that @SKB is
 * evaluated more than once.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1513
1514static int net_hwtstamp_validate(struct ifreq *ifr)
1515{
1516 struct hwtstamp_config cfg;
1517 enum hwtstamp_tx_types tx_type;
1518 enum hwtstamp_rx_filters rx_filter;
1519 int tx_type_valid = 0;
1520 int rx_filter_valid = 0;
1521
1522 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1523 return -EFAULT;
1524
1525 if (cfg.flags) /* reserved for future extensions */
1526 return -EINVAL;
1527
1528 tx_type = cfg.tx_type;
1529 rx_filter = cfg.rx_filter;
1530
1531 switch (tx_type) {
1532 case HWTSTAMP_TX_OFF:
1533 case HWTSTAMP_TX_ON:
1534 case HWTSTAMP_TX_ONESTEP_SYNC:
1535 tx_type_valid = 1;
1536 break;
1537 }
1538
1539 switch (rx_filter) {
1540 case HWTSTAMP_FILTER_NONE:
1541 case HWTSTAMP_FILTER_ALL:
1542 case HWTSTAMP_FILTER_SOME:
1543 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1544 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1545 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1546 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1547 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1548 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1549 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1550 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1551 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1552 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1553 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1554 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1555 rx_filter_valid = 1;
1556 break;
1557 }
1558
1559 if (!tx_type_valid || !rx_filter_valid)
1560 return -ERANGE;
1561
1562 return 0;
1563}
1564
1565static inline bool is_skb_forwardable(struct net_device *dev,
1566 struct sk_buff *skb)
1567{
1568 unsigned int len;
1569
1570 if (!(dev->flags & IFF_UP))
1571 return false;
1572
1573 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1574 if (skb->len <= len)
1575 return true;
1576
1577 /* if TSO is enabled, we don't care about the length as the packet
1578 * could be forwarded without being segmented before
1579 */
1580 if (skb_is_gso(skb))
1581 return true;
1582
1583 return false;
1584}
1585
1586/**
1587 * dev_forward_skb - loopback an skb to another netif
1588 *
1589 * @dev: destination network device
1590 * @skb: buffer to forward
1591 *
1592 * return values:
1593 * NET_RX_SUCCESS (no congestion)
1594 * NET_RX_DROP (packet was dropped, but freed)
1595 *
1596 * dev_forward_skb can be used for injecting an skb from the
1597 * start_xmit function of one device into the receive queue
1598 * of another device.
1599 *
1600 * The receiving device may be in another namespace, so
1601 * we have to clear all information in the skb that could
1602 * impact namespace isolation.
1603 */
1604int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1605{
1606 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1607 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1608 atomic_long_inc(&dev->rx_dropped);
1609 kfree_skb(skb);
1610 return NET_RX_DROP;
1611 }
1612 }
1613
1614 skb_orphan(skb);
1615 nf_reset(skb);
1616
1617 if (unlikely(!is_skb_forwardable(dev, skb))) {
1618 atomic_long_inc(&dev->rx_dropped);
1619 kfree_skb(skb);
1620 return NET_RX_DROP;
1621 }
1622 skb->skb_iif = 0;
1623 skb->dev = dev;
1624 skb_dst_drop(skb);
1625 skb->tstamp.tv64 = 0;
1626 skb->pkt_type = PACKET_HOST;
1627 skb->protocol = eth_type_trans(skb, dev);
1628 skb->mark = 0;
1629 secpath_reset(skb);
1630 nf_reset(skb);
1631 return netif_rx(skb);
1632}
1633EXPORT_SYMBOL_GPL(dev_forward_skb);
1634
1635static inline int deliver_skb(struct sk_buff *skb,
1636 struct packet_type *pt_prev,
1637 struct net_device *orig_dev)
1638{
1639 atomic_inc(&skb->users);
1640 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1641}
1642
1643static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1644{
1645 if (ptype->af_packet_priv == NULL)
1646 return false;
1647
1648 if (ptype->id_match)
1649 return ptype->id_match(ptype, skb->sk);
1650 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1651 return true;
1652
1653 return false;
1654}
1655
1656/*
1657 * Support routine. Sends outgoing frames to any network
1658 * taps currently in use.
1659 */
1660
1661static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1662{
1663 struct packet_type *ptype;
1664 struct sk_buff *skb2 = NULL;
1665 struct packet_type *pt_prev = NULL;
1666
1667 rcu_read_lock();
1668 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1669 /* Never send packets back to the socket
1670 * they originated from - MvS (miquels@drinkel.ow.org)
1671 */
1672 if ((ptype->dev == dev || !ptype->dev) &&
1673 (!skb_loop_sk(ptype, skb))) {
1674 if (pt_prev) {
1675 deliver_skb(skb2, pt_prev, skb->dev);
1676 pt_prev = ptype;
1677 continue;
1678 }
1679
1680 skb2 = skb_clone(skb, GFP_ATOMIC);
1681 if (!skb2)
1682 break;
1683
1684 net_timestamp_set(skb2);
1685
1686 /* skb->nh should be correctly
1687 set by sender, so that the second statement is
1688 just protection against buggy protocols.
1689 */
1690 skb_reset_mac_header(skb2);
1691
1692 if (skb_network_header(skb2) < skb2->data ||
1693 skb2->network_header > skb2->tail) {
1694 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1695 ntohs(skb2->protocol),
1696 dev->name);
1697 skb_reset_network_header(skb2);
1698 }
1699
1700 skb2->transport_header = skb2->network_header;
1701 skb2->pkt_type = PACKET_OUTGOING;
1702 pt_prev = ptype;
1703 }
1704 }
1705 if (pt_prev)
1706 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1707 rcu_read_unlock();
1708}
1709
1710/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1711 * @dev: Network device
1712 * @txq: number of queues available
1713 *
1714 * If real_num_tx_queues is changed the tc mappings may no longer be
1715 * valid. To resolve this verify the tc mapping remains valid and if
1716 * not NULL the mapping. With no priorities mapping to this
1717 * offset/count pair it will no longer be used. In the worst case TC0
1718 * is invalid nothing can be done so disable priority mappings. If is
1719 * expected that drivers will fix this mapping if they can before
1720 * calling netif_set_real_num_tx_queues.
1721 */
1722static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1723{
1724 int i;
1725 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1726
1727 /* If TC0 is invalidated disable TC mapping */
1728 if (tc->offset + tc->count > txq) {
1729 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1730 dev->num_tc = 0;
1731 return;
1732 }
1733
1734 /* Invalidated prio to tc mappings set to TC0 */
1735 for (i = 1; i < TC_BITMASK + 1; i++) {
1736 int q = netdev_get_prio_tc_map(dev, i);
1737
1738 tc = &dev->tc_to_txq[q];
1739 if (tc->offset + tc->count > txq) {
1740 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1741 i, q);
1742 netdev_set_prio_tc_map(dev, i, 0);
1743 }
1744 }
1745}
1746
1747/*
1748 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1749 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1750 */
1751int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1752{
1753 int rc;
1754
1755 if (txq < 1 || txq > dev->num_tx_queues)
1756 return -EINVAL;
1757
1758 if (dev->reg_state == NETREG_REGISTERED ||
1759 dev->reg_state == NETREG_UNREGISTERING) {
1760 ASSERT_RTNL();
1761
1762 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1763 txq);
1764 if (rc)
1765 return rc;
1766
1767 if (dev->num_tc)
1768 netif_setup_tc(dev, txq);
1769
1770 if (txq < dev->real_num_tx_queues)
1771 qdisc_reset_all_tx_gt(dev, txq);
1772 }
1773
1774 dev->real_num_tx_queues = txq;
1775 return 0;
1776}
1777EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1778
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Call this either with the rtnl_lock held or before the net device
 *	is registered.  @rxq must lie between 1 and dev->num_rx_queues,
 *	otherwise -EINVAL is returned.  For a registered device the rx
 *	queue kobjects are updated first and a failure there is returned.
 *	Returns 0 on success.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1811
1812static inline void __netif_reschedule(struct Qdisc *q)
1813{
1814 struct softnet_data *sd;
1815 unsigned long flags;
1816
1817 local_irq_save(flags);
1818 sd = &__get_cpu_var(softnet_data);
1819 q->next_sched = NULL;
1820 *sd->output_queue_tailp = q;
1821 sd->output_queue_tailp = &q->next_sched;
1822 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1823 local_irq_restore(flags);
1824}
1825
1826void __netif_schedule(struct Qdisc *q)
1827{
1828 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1829 __netif_reschedule(q);
1830}
1831EXPORT_SYMBOL(__netif_schedule);
1832
1833void dev_kfree_skb_irq(struct sk_buff *skb)
1834{
1835 if (atomic_dec_and_test(&skb->users)) {
1836 struct softnet_data *sd;
1837 unsigned long flags;
1838
1839 local_irq_save(flags);
1840 sd = &__get_cpu_var(softnet_data);
1841 skb->next = sd->completion_queue;
1842 sd->completion_queue = skb;
1843 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1844 local_irq_restore(flags);
1845 }
1846}
1847EXPORT_SYMBOL(dev_kfree_skb_irq);
1848
/*
 * dev_kfree_skb_any - release @skb from any context: the deferred
 * dev_kfree_skb_irq() is used in hard-irq context or with interrupts
 * disabled, plain dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1857
1858
1859/**
1860 * netif_device_detach - mark device as removed
1861 * @dev: network device
1862 *
1863 * Mark device as removed from system and therefore no longer available.
1864 */
1865void netif_device_detach(struct net_device *dev)
1866{
1867 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1868 netif_running(dev)) {
1869 netif_tx_stop_all_queues(dev);
1870 }
1871}
1872EXPORT_SYMBOL(netif_device_detach);
1873
1874/**
1875 * netif_device_attach - mark device as attached
1876 * @dev: network device
1877 *
1878 * Mark device as attached from system and restart if needed.
1879 */
1880void netif_device_attach(struct net_device *dev)
1881{
1882 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1883 netif_running(dev)) {
1884 netif_tx_wake_all_queues(dev);
1885 __netdev_watchdog_up(dev);
1886 }
1887}
1888EXPORT_SYMBOL(netif_device_attach);
1889
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892 static const netdev_features_t null_features = 0;
1893 struct net_device *dev = skb->dev;
1894 const char *driver = "";
1895
1896 if (dev && dev->dev.parent)
1897 driver = dev_driver_string(dev->dev.parent);
1898
1899 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900 "gso_type=%d ip_summed=%d\n",
1901 driver, dev ? &dev->features : &null_features,
1902 skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904 skb_shinfo(skb)->gso_type, skb->ip_summed);
1905}
1906
1907/*
1908 * Invalidate hardware checksum when packet is to be mangled, and
1909 * complete checksum manually on outgoing path.
1910 */
1911int skb_checksum_help(struct sk_buff *skb)
1912{
1913 __wsum csum;
1914 int ret = 0, offset;
1915
1916 if (skb->ip_summed == CHECKSUM_COMPLETE)
1917 goto out_set_summed;
1918
1919 if (unlikely(skb_shinfo(skb)->gso_size)) {
1920 skb_warn_bad_offload(skb);
1921 return -EINVAL;
1922 }
1923
1924 offset = skb_checksum_start_offset(skb);
1925 BUG_ON(offset >= skb_headlen(skb));
1926 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1927
1928 offset += skb->csum_offset;
1929 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1930
1931 if (skb_cloned(skb) &&
1932 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1933 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1934 if (ret)
1935 goto out;
1936 }
1937
1938 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1939out_set_summed:
1940 skb->ip_summed = CHECKSUM_NONE;
1941out:
1942 return ret;
1943}
1944EXPORT_SYMBOL(skb_checksum_help);
1945
1946/**
1947 * skb_gso_segment - Perform segmentation on skb.
1948 * @skb: buffer to segment
1949 * @features: features for the output path (see dev->features)
1950 *
1951 * This function segments the given skb and returns a list of segments.
1952 *
1953 * It may return NULL if the skb requires no segmentation. This is
1954 * only possible when GSO is used for verifying header integrity.
1955 */
1956struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1957 netdev_features_t features)
1958{
1959 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1960 struct packet_type *ptype;
1961 __be16 type = skb->protocol;
1962 int vlan_depth = ETH_HLEN;
1963 int err;
1964
1965 while (type == htons(ETH_P_8021Q)) {
1966 struct vlan_hdr *vh;
1967
1968 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1969 return ERR_PTR(-EINVAL);
1970
1971 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1972 type = vh->h_vlan_encapsulated_proto;
1973 vlan_depth += VLAN_HLEN;
1974 }
1975
1976 skb_reset_mac_header(skb);
1977 skb->mac_len = skb->network_header - skb->mac_header;
1978 __skb_pull(skb, skb->mac_len);
1979
1980 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1981 skb_warn_bad_offload(skb);
1982
1983 if (skb_header_cloned(skb) &&
1984 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1985 return ERR_PTR(err);
1986 }
1987
1988 rcu_read_lock();
1989 list_for_each_entry_rcu(ptype,
1990 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1991 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1992 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1993 err = ptype->gso_send_check(skb);
1994 segs = ERR_PTR(err);
1995 if (err || skb_gso_ok(skb, features))
1996 break;
1997 __skb_push(skb, (skb->data -
1998 skb_network_header(skb)));
1999 }
2000 segs = ptype->gso_segment(skb, features);
2001 break;
2002 }
2003 }
2004 rcu_read_unlock();
2005
2006 __skb_push(skb, skb->data - skb_mac_header(skb));
2007
2008 return segs;
2009}
2010EXPORT_SYMBOL(skb_gso_segment);
2011
/*
 * Take action when hardware reception checksum errors are detected:
 * subject to rate limiting, log the device name and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2023
/*
 * illegal_highdma - returns 1 when @skb has a page fragment that @dev
 * must not be asked to DMA from, 0 otherwise.
 *
 * Only checked with CONFIG_HIGHMEM.  A fragment is a problem when it
 * sits in high memory and the device lacks NETIF_F_HIGHDMA, or, on
 * buses where DMA addresses are physical, when the parent device has no
 * DMA mask or the fragment's page lies beyond it.
 *
 * This check could go away once it is known that an IOMMU is present and
 * can map all memory, or that the machine has no high memory at all.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int nfrags = skb_shinfo(skb)->nr_frags;
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < nfrags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (idx = 0; idx < nfrags; idx++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2056
/*
 * Per-skb state kept in skb->cb while software GSO segments hang off the
 * skb: the destructor the skb had before dev_gso_segment() replaced it.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2062
2063static void dev_gso_skb_destructor(struct sk_buff *skb)
2064{
2065 struct dev_gso_cb *cb;
2066
2067 do {
2068 struct sk_buff *nskb = skb->next;
2069
2070 skb->next = nskb->next;
2071 nskb->next = NULL;
2072 kfree_skb(nskb);
2073 } while (skb->next);
2074
2075 cb = DEV_GSO_CB(skb);
2076 if (cb->destructor)
2077 cb->destructor(skb);
2078}
2079
2080/**
2081 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2082 * @skb: buffer to segment
2083 * @features: device features as applicable to this skb
2084 *
2085 * This function segments the given skb and stores the list of segments
2086 * in skb->next.
2087 */
2088static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2089{
2090 struct sk_buff *segs;
2091
2092 segs = skb_gso_segment(skb, features);
2093
2094 /* Verifying header integrity only. */
2095 if (!segs)
2096 return 0;
2097
2098 if (IS_ERR(segs))
2099 return PTR_ERR(segs);
2100
2101 skb->next = segs;
2102 DEV_GSO_CB(skb)->destructor = skb->destructor;
2103 skb->destructor = dev_gso_skb_destructor;
2104
2105 return 0;
2106}
2107
2108static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2109{
2110 return ((features & NETIF_F_GEN_CSUM) ||
2111 ((features & NETIF_F_V4_CSUM) &&
2112 protocol == htons(ETH_P_IP)) ||
2113 ((features & NETIF_F_V6_CSUM) &&
2114 protocol == htons(ETH_P_IPV6)) ||
2115 ((features & NETIF_F_FCOE_CRC) &&
2116 protocol == htons(ETH_P_FCOE)));
2117}
2118
2119static netdev_features_t harmonize_features(struct sk_buff *skb,
2120 __be16 protocol, netdev_features_t features)
2121{
2122 if (!can_checksum_protocol(features, protocol)) {
2123 features &= ~NETIF_F_ALL_CSUM;
2124 features &= ~NETIF_F_SG;
2125 } else if (illegal_highdma(skb->dev, skb)) {
2126 features &= ~NETIF_F_SG;
2127 }
2128
2129 return features;
2130}
2131
2132netdev_features_t netif_skb_features(struct sk_buff *skb)
2133{
2134 __be16 protocol = skb->protocol;
2135 netdev_features_t features = skb->dev->features;
2136
2137 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2138 features &= ~NETIF_F_GSO_MASK;
2139
2140 if (protocol == htons(ETH_P_8021Q)) {
2141 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142 protocol = veh->h_vlan_encapsulated_proto;
2143 } else if (!vlan_tx_tag_present(skb)) {
2144 return harmonize_features(skb, protocol, features);
2145 }
2146
2147 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2148
2149 if (protocol != htons(ETH_P_8021Q)) {
2150 return harmonize_features(skb, protocol, features);
2151 } else {
2152 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154 return harmonize_features(skb, protocol, features);
2155 }
2156}
2157EXPORT_SYMBOL(netif_skb_features);
2158
2159/*
2160 * Returns true if either:
2161 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2162 * 2. skb is fragmented and the device does not support SG, or if
2163 * at least one of fragments is in highmem and device does not
2164 * support DMA from it.
2165 */
2166static inline int skb_needs_linearize(struct sk_buff *skb,
2167 int features)
2168{
2169 return skb_is_nonlinear(skb) &&
2170 ((skb_has_frag_list(skb) &&
2171 !(features & NETIF_F_FRAGLIST)) ||
2172 (skb_shinfo(skb)->nr_frags &&
2173 !(features & NETIF_F_SG)));
2174}
2175
/*
 * dev_hard_start_xmit - hand an skb (or a list of GSO segments) to the driver
 * @skb: buffer to send; when skb->next is set, the buffers chained behind it
 *	 are sent one by one and @skb itself is only freed at the end
 * @dev: device whose ndo_start_xmit() is called
 * @txq: transmit queue, used for txq_trans_update() and the stopped check
 *
 * Single-skb path: optionally drops the dst, feeds the ptype_all taps,
 * inserts the VLAN tag in software when the device cannot, segments the skb
 * if GSO is needed, otherwise linearizes and completes the checksum as
 * required, then calls the driver.
 *
 * Returns the driver's return code.  On the local error paths the initial
 * NETDEV_TX_OK is returned and the skb is freed here (for the
 * __vlan_put_tag() failure nothing is freed here; presumably the helper
 * already released it -- confirm against its definition).
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	if (likely(!skb->next)) {
		netdev_features_t features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		/* Only pay for the tap walk when a tap is registered. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		features = netif_skb_features(skb);

		/* Device cannot insert the tag itself: put it in the frame. */
		if (vlan_tx_tag_present(skb) &&
		    !(features & NETIF_F_HW_VLAN_TX)) {
			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segmentation produced a list: send it below. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				skb_set_transport_header(skb,
					skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Read the length first; the skb is the driver's afterwards. */
		skb_len = skb->len;
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		/* Unlink the first pending segment from the list head. */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		skb_len = nskb->len;
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			/* Codes outside NETDEV_TX_MASK: give up on the list. */
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Otherwise relink the segment so it can be retried. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		/* Queue stopped with segments left: report busy. */
		if (unlikely(netif_xmit_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/* All segments gone: restore the destructor saved in the GSO cb. */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
2275
2276static u32 hashrnd __read_mostly;
2277
2278/*
2279 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2280 * to be used as a distribution range.
2281 */
2282u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2283 unsigned int num_tx_queues)
2284{
2285 u32 hash;
2286 u16 qoffset = 0;
2287 u16 qcount = num_tx_queues;
2288
2289 if (skb_rx_queue_recorded(skb)) {
2290 hash = skb_get_rx_queue(skb);
2291 while (unlikely(hash >= num_tx_queues))
2292 hash -= num_tx_queues;
2293 return hash;
2294 }
2295
2296 if (dev->num_tc) {
2297 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2298 qoffset = dev->tc_to_txq[tc].offset;
2299 qcount = dev->tc_to_txq[tc].count;
2300 }
2301
2302 if (skb->sk && skb->sk->sk_hash)
2303 hash = skb->sk->sk_hash;
2304 else
2305 hash = (__force u16) skb->protocol;
2306 hash = jhash_1word(hash, hashrnd);
2307
2308 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2309}
2310EXPORT_SYMBOL(__skb_tx_hash);
2311
2312static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2313{
2314 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2315 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2316 dev->name, queue_index,
2317 dev->real_num_tx_queues);
2318 return 0;
2319 }
2320 return queue_index;
2321}
2322
/*
 * Look up a TX queue for @skb in the device's XPS map of the current CPU.
 * Returns the queue index, or -1 when XPS is compiled out, no map exists
 * for this CPU, or the mapped queue is not below real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int txq = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		txq = cpu_map->queues[0];
	} else {
		u32 hash;

		/* Several candidates: spread flows across them by hash. */
		if (!skb->sk || !skb->sk->sk_hash)
			hash = (__force u16) skb->protocol ^
			       skb->rxhash;
		else
			hash = skb->sk->sk_hash;
		hash = jhash_1word(hash, hashrnd);
		txq = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
	}

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();

	return txq;
#else
	return -1;
#endif
}
2360
2361static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2362 struct sk_buff *skb)
2363{
2364 int queue_index;
2365 const struct net_device_ops *ops = dev->netdev_ops;
2366
2367 if (dev->real_num_tx_queues == 1)
2368 queue_index = 0;
2369 else if (ops->ndo_select_queue) {
2370 queue_index = ops->ndo_select_queue(dev, skb);
2371 queue_index = dev_cap_txqueue(dev, queue_index);
2372 } else {
2373 struct sock *sk = skb->sk;
2374 queue_index = sk_tx_queue_get(sk);
2375
2376 if (queue_index < 0 || skb->ooo_okay ||
2377 queue_index >= dev->real_num_tx_queues) {
2378 int old_index = queue_index;
2379
2380 queue_index = get_xps_queue(dev, skb);
2381 if (queue_index < 0)
2382 queue_index = skb_tx_hash(dev, skb);
2383
2384 if (queue_index != old_index && sk) {
2385 struct dst_entry *dst =
2386 rcu_dereference_check(sk->sk_dst_cache, 1);
2387
2388 if (dst && skb_dst(skb) == dst)
2389 sk_tx_queue_set(sk, queue_index);
2390 }
2391 }
2392 }
2393
2394 skb_set_queue_mapping(skb, queue_index);
2395 return netdev_get_tx_queue(dev, queue_index);
2396}
2397
/*
 * __dev_xmit_skb - pass an skb to a qdisc that has an enqueue method
 * @skb: buffer to send
 * @q:   qdisc attached to @txq
 * @dev: output device
 * @txq: transmit queue picked for @skb
 *
 * Under the qdisc root lock the skb is either dropped (qdisc deactivated),
 * transmitted directly (bypass-capable, empty, idle qdisc) or enqueued.
 *
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the enqueue result masked
 * with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Release busylock before running the qdisc. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* Still holding busylock if no branch above released it. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2460
2461#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2462static void skb_update_prio(struct sk_buff *skb)
2463{
2464 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2465
2466 if (!skb->priority && skb->sk && map) {
2467 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2468
2469 if (prioidx < map->priomap_len)
2470 skb->priority = map->priomap[prioidx];
2471 }
2472}
2473#else
2474#define skb_update_prio(skb)
2475#endif
2476
/*
 * Per-cpu nesting depth of dev_hard_start_xmit() calls issued from the
 * queueless path of dev_queue_xmit(); transmission is refused once the
 * depth exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10

/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 *	Note that this function can also return errors from the queue
 *	disciplines, including NET_XMIT_DROP, which is a positive value.
 *	So, errors can also be positive.
 *
 *	Regardless of the return value, the skb is consumed, so it is
 *	currently difficult to retry a send to this function. (You can bump
 *	the ref count before sending to hold a reference for retry if you
 *	are careful.)
 *
 *	When calling this function, interrupts MUST be enabled. This is
 *	because the BH enable code must have IRQs enabled so that it will
 *	not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method takes over from here. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible that they rely on the protection
	   provided by us here.

	   So check for that and take the lock. This is not prone to
	   deadlocks. Taking the noqueue qdisc path instead would be even
	   simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owner == this cpu means we re-entered on the same queue. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Every path that reaches this point drops the skb. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2585
2586
2587/*=======================================================================
2588 Receiver routines
2589 =======================================================================*/
2590
/* Upper bound on a per-cpu input_pkt_queue, checked in enqueue_to_backlog(). */
int netdev_max_backlog __read_mostly = 1000;
/* Passed to net_timestamp_check() before queueing (negated after dequeue). */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not used in this part of the file; presumably a poll budget. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/*
 * Append @napi to @sd's poll list and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2603
2604/*
2605 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2606 * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
2607 * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
2608 * if hash is a canonical 4-tuple hash over transport ports.
2609 */
2610void __skb_get_rxhash(struct sk_buff *skb)
2611{
2612 struct flow_keys keys;
2613 u32 hash;
2614
2615 if (!skb_flow_dissect(skb, &keys))
2616 return;
2617
2618 if (keys.ports) {
2619 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2620 swap(keys.port16[0], keys.port16[1]);
2621 skb->l4_rxhash = 1;
2622 }
2623
2624 /* get a consistent hash (same value on both flow directions) */
2625 if ((__force u32)keys.dst < (__force u32)keys.src)
2626 swap(keys.dst, keys.src);
2627
2628 hash = jhash_3words((__force u32)keys.dst,
2629 (__force u32)keys.src,
2630 (__force u32)keys.ports, hashrnd);
2631 if (!hash)
2632 hash = 1;
2633
2634 skb->rxhash = hash;
2635}
2636EXPORT_SYMBOL(__skb_get_rxhash);
2637
2638#ifdef CONFIG_RPS
2639
2640/* One global table that all flow-based protocols share. */
2641struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2642EXPORT_SYMBOL(rps_sock_flow_table);
2643
2644struct static_key rps_needed __read_mostly;
2645
2646static struct rps_dev_flow *
2647set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2648 struct rps_dev_flow *rflow, u16 next_cpu)
2649{
2650 if (next_cpu != RPS_NO_CPU) {
2651#ifdef CONFIG_RFS_ACCEL
2652 struct netdev_rx_queue *rxqueue;
2653 struct rps_dev_flow_table *flow_table;
2654 struct rps_dev_flow *old_rflow;
2655 u32 flow_id;
2656 u16 rxq_index;
2657 int rc;
2658
2659 /* Should we steer this flow to a different hardware queue? */
2660 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2661 !(dev->features & NETIF_F_NTUPLE))
2662 goto out;
2663 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2664 if (rxq_index == skb_get_rx_queue(skb))
2665 goto out;
2666
2667 rxqueue = dev->_rx + rxq_index;
2668 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669 if (!flow_table)
2670 goto out;
2671 flow_id = skb->rxhash & flow_table->mask;
2672 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2673 rxq_index, flow_id);
2674 if (rc < 0)
2675 goto out;
2676 old_rflow = rflow;
2677 rflow = &flow_table->flows[flow_id];
2678 rflow->filter = rc;
2679 if (old_rflow->filter == rflow->filter)
2680 old_rflow->filter = RPS_NO_FILTER;
2681 out:
2682#endif
2683 rflow->last_qtail =
2684 per_cpu(softnet_data, next_cpu).input_queue_head;
2685 }
2686
2687 rflow->cpu = next_cpu;
2688 return rflow;
2689}
2690
2691/*
2692 * get_rps_cpu is called from netif_receive_skb and returns the target
2693 * CPU from the RPS map of the receiving queue for a given skb.
2694 * rcu_read_lock must be held on entry.
2695 */
2696static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2697 struct rps_dev_flow **rflowp)
2698{
2699 struct netdev_rx_queue *rxqueue;
2700 struct rps_map *map;
2701 struct rps_dev_flow_table *flow_table;
2702 struct rps_sock_flow_table *sock_flow_table;
2703 int cpu = -1;
2704 u16 tcpu;
2705
2706 if (skb_rx_queue_recorded(skb)) {
2707 u16 index = skb_get_rx_queue(skb);
2708 if (unlikely(index >= dev->real_num_rx_queues)) {
2709 WARN_ONCE(dev->real_num_rx_queues > 1,
2710 "%s received packet on queue %u, but number "
2711 "of RX queues is %u\n",
2712 dev->name, index, dev->real_num_rx_queues);
2713 goto done;
2714 }
2715 rxqueue = dev->_rx + index;
2716 } else
2717 rxqueue = dev->_rx;
2718
2719 map = rcu_dereference(rxqueue->rps_map);
2720 if (map) {
2721 if (map->len == 1 &&
2722 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2723 tcpu = map->cpus[0];
2724 if (cpu_online(tcpu))
2725 cpu = tcpu;
2726 goto done;
2727 }
2728 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2729 goto done;
2730 }
2731
2732 skb_reset_network_header(skb);
2733 if (!skb_get_rxhash(skb))
2734 goto done;
2735
2736 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2738 if (flow_table && sock_flow_table) {
2739 u16 next_cpu;
2740 struct rps_dev_flow *rflow;
2741
2742 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2743 tcpu = rflow->cpu;
2744
2745 next_cpu = sock_flow_table->ents[skb->rxhash &
2746 sock_flow_table->mask];
2747
2748 /*
2749 * If the desired CPU (where last recvmsg was done) is
2750 * different from current CPU (one in the rx-queue flow
2751 * table entry), switch if one of the following holds:
2752 * - Current CPU is unset (equal to RPS_NO_CPU).
2753 * - Current CPU is offline.
2754 * - The current CPU's queue tail has advanced beyond the
2755 * last packet that was enqueued using this table entry.
2756 * This guarantees that all previous packets for the flow
2757 * have been dequeued, thus preserving in order delivery.
2758 */
2759 if (unlikely(tcpu != next_cpu) &&
2760 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2761 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2762 rflow->last_qtail)) >= 0))
2763 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2764
2765 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2766 *rflowp = rflow;
2767 cpu = tcpu;
2768 goto done;
2769 }
2770 }
2771
2772 if (map) {
2773 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2774
2775 if (cpu_online(tcpu)) {
2776 cpu = tcpu;
2777 goto done;
2778 }
2779 }
2780
2781done:
2782 return cpu;
2783}
2784
2785#ifdef CONFIG_RFS_ACCEL
2786
2787/**
2788 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2789 * @dev: Device on which the filter was set
2790 * @rxq_index: RX queue index
2791 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2792 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2793 *
2794 * Drivers that implement ndo_rx_flow_steer() should periodically call
2795 * this function for each installed filter and remove the filters for
2796 * which it returns %true.
2797 */
2798bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2799 u32 flow_id, u16 filter_id)
2800{
2801 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2802 struct rps_dev_flow_table *flow_table;
2803 struct rps_dev_flow *rflow;
2804 bool expire = true;
2805 int cpu;
2806
2807 rcu_read_lock();
2808 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2809 if (flow_table && flow_id <= flow_table->mask) {
2810 rflow = &flow_table->flows[flow_id];
2811 cpu = ACCESS_ONCE(rflow->cpu);
2812 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2813 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2814 rflow->last_qtail) <
2815 (int)(10 * flow_table->mask)))
2816 expire = false;
2817 }
2818 rcu_read_unlock();
2819 return expire;
2820}
2821EXPORT_SYMBOL(rps_may_expire_flow);
2822
2823#endif /* CONFIG_RFS_ACCEL */
2824
/*
 * Called from hardirq (IPI) context.
 * @data is the softnet_data whose backlog NAPI gets scheduled; the
 * received_rps counter of that structure is bumped as well.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
2833
2834#endif /* CONFIG_RPS */
2835
/*
 * Check whether @sd belongs to another cpu.
 * If it does, push it onto this cpu's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = &__get_cpu_var(softnet_data);

	if (sd == mysd)
		return 0;

	sd->rps_ipi_next = mysd->rps_ipi_list;
	mysd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2856
2857/*
2858 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2859 * queue (may be a remote CPU queue).
2860 */
2861static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2862 unsigned int *qtail)
2863{
2864 struct softnet_data *sd;
2865 unsigned long flags;
2866
2867 sd = &per_cpu(softnet_data, cpu);
2868
2869 local_irq_save(flags);
2870
2871 rps_lock(sd);
2872 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2873 if (skb_queue_len(&sd->input_pkt_queue)) {
2874enqueue:
2875 __skb_queue_tail(&sd->input_pkt_queue, skb);
2876 input_queue_tail_incr_save(sd, qtail);
2877 rps_unlock(sd);
2878 local_irq_restore(flags);
2879 return NET_RX_SUCCESS;
2880 }
2881
2882 /* Schedule NAPI for backlog device
2883 * We can use non atomic operation since we own the queue lock
2884 */
2885 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2886 if (!rps_ipi_queued(sd))
2887 ____napi_schedule(sd, &sd->backlog);
2888 }
2889 goto enqueue;
2890 }
2891
2892 sd->dropped++;
2893 rps_unlock(sd);
2894
2895 local_irq_restore(flags);
2896
2897 atomic_long_inc(&skb->dev->rx_dropped);
2898 kfree_skb(skb);
2899 return NET_RX_DROP;
2900}
2901
2902/**
2903 * netif_rx - post buffer to the network code
2904 * @skb: buffer to post
2905 *
2906 * This function receives a packet from a device driver and queues it for
2907 * the upper (protocol) levels to process. It always succeeds. The buffer
2908 * may be dropped during processing for congestion control or by the
2909 * protocol layers.
2910 *
2911 * return values:
2912 * NET_RX_SUCCESS (no congestion)
2913 * NET_RX_DROP (packet was dropped)
2914 *
2915 */
2916
2917int netif_rx(struct sk_buff *skb)
2918{
2919 int ret;
2920
2921 /* if netpoll wants it, pretend we never saw it */
2922 if (netpoll_rx(skb))
2923 return NET_RX_DROP;
2924
2925 net_timestamp_check(netdev_tstamp_prequeue, skb);
2926
2927 trace_netif_rx(skb);
2928#ifdef CONFIG_RPS
2929 if (static_key_false(&rps_needed)) {
2930 struct rps_dev_flow voidflow, *rflow = &voidflow;
2931 int cpu;
2932
2933 preempt_disable();
2934 rcu_read_lock();
2935
2936 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2937 if (cpu < 0)
2938 cpu = smp_processor_id();
2939
2940 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2941
2942 rcu_read_unlock();
2943 preempt_enable();
2944 } else
2945#endif
2946 {
2947 unsigned int qtail;
2948 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2949 put_cpu();
2950 }
2951 return ret;
2952}
2953EXPORT_SYMBOL(netif_rx);
2954
/*
 * netif_rx() wrapper that, with preemption disabled, also runs any softirq
 * left pending once the packet has been queued.  Returns netif_rx()'s
 * result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2968
2969static void net_tx_action(struct softirq_action *h)
2970{
2971 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2972
2973 if (sd->completion_queue) {
2974 struct sk_buff *clist;
2975
2976 local_irq_disable();
2977 clist = sd->completion_queue;
2978 sd->completion_queue = NULL;
2979 local_irq_enable();
2980
2981 while (clist) {
2982 struct sk_buff *skb = clist;
2983 clist = clist->next;
2984
2985 WARN_ON(atomic_read(&skb->users));
2986 trace_kfree_skb(skb, net_tx_action);
2987 __kfree_skb(skb);
2988 }
2989 }
2990
2991 if (sd->output_queue) {
2992 struct Qdisc *head;
2993
2994 local_irq_disable();
2995 head = sd->output_queue;
2996 sd->output_queue = NULL;
2997 sd->output_queue_tailp = &sd->output_queue;
2998 local_irq_enable();
2999
3000 while (head) {
3001 struct Qdisc *q = head;
3002 spinlock_t *root_lock;
3003
3004 head = head->next_sched;
3005
3006 root_lock = qdisc_lock(q);
3007 if (spin_trylock(root_lock)) {
3008 smp_mb__before_clear_bit();
3009 clear_bit(__QDISC_STATE_SCHED,
3010 &q->state);
3011 qdisc_run(q);
3012 spin_unlock(root_lock);
3013 } else {
3014 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3015 &q->state)) {
3016 __netif_reschedule(q);
3017 } else {
3018 smp_mb__before_clear_bit();
3019 clear_bit(__QDISC_STATE_SCHED,
3020 &q->state);
3021 }
3022 }
3023 }
3024 }
3025}
3026
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 * It only exists when both bridge and ATM LANE support are configured
 * (built in or as modules).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3034
3035#ifdef CONFIG_NET_CLS_ACT
3036/* TODO: Maybe we should just force sch_ingress to be compiled in
3037 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3038 * a compare and 2 stores extra right now if we dont have it on
3039 * but have CONFIG_NET_CLS_ACT
3040 * NOTE: This doesn't stop any functionality; if you dont have
3041 * the ingress scheduler, you just can't add policies on ingress.
3042 *
3043 */
3044static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3045{
3046 struct net_device *dev = skb->dev;
3047 u32 ttl = G_TC_RTTL(skb->tc_verd);
3048 int result = TC_ACT_OK;
3049 struct Qdisc *q;
3050
3051 if (unlikely(MAX_RED_LOOP < ttl++)) {
3052 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3053 skb->skb_iif, dev->ifindex);
3054 return TC_ACT_SHOT;
3055 }
3056
3057 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3058 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3059
3060 q = rxq->qdisc;
3061 if (q != &noop_qdisc) {
3062 spin_lock(qdisc_lock(q));
3063 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3064 result = qdisc_enqueue_root(skb, q);
3065 spin_unlock(qdisc_lock(q));
3066 }
3067
3068 return result;
3069}
3070
3071static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3072 struct packet_type **pt_prev,
3073 int *ret, struct net_device *orig_dev)
3074{
3075 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3076
3077 if (!rxq || rxq->qdisc == &noop_qdisc)
3078 goto out;
3079
3080 if (*pt_prev) {
3081 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3082 *pt_prev = NULL;
3083 }
3084
3085 switch (ing_filter(skb, rxq)) {
3086 case TC_ACT_SHOT:
3087 case TC_ACT_STOLEN:
3088 kfree_skb(skb);
3089 return NULL;
3090 }
3091
3092out:
3093 skb->tc_verd = 0;
3094 return skb;
3095}
3096#endif
3097
3098/**
3099 * netdev_rx_handler_register - register receive handler
3100 * @dev: device to register a handler for
3101 * @rx_handler: receive handler to register
3102 * @rx_handler_data: data pointer that is used by rx handler
3103 *
3104 * Register a receive hander for a device. This handler will then be
3105 * called from __netif_receive_skb. A negative errno code is returned
3106 * on a failure.
3107 *
3108 * The caller must hold the rtnl_mutex.
3109 *
3110 * For a general description of rx_handler, see enum rx_handler_result.
3111 */
3112int netdev_rx_handler_register(struct net_device *dev,
3113 rx_handler_func_t *rx_handler,
3114 void *rx_handler_data)
3115{
3116 ASSERT_RTNL();
3117
3118 if (dev->rx_handler)
3119 return -EBUSY;
3120
3121 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3122 rcu_assign_pointer(dev->rx_handler, rx_handler);
3123
3124 return 0;
3125}
3126EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3127
3128/**
3129 * netdev_rx_handler_unregister - unregister receive handler
3130 * @dev: device to unregister a handler from
3131 *
3132 * Unregister a receive hander from a device.
3133 *
3134 * The caller must hold the rtnl_mutex.
3135 */
3136void netdev_rx_handler_unregister(struct net_device *dev)
3137{
3138
3139 ASSERT_RTNL();
3140 RCU_INIT_POINTER(dev->rx_handler, NULL);
3141 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3142}
3143EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3144
/*
 * __netif_receive_skb - core receive path for one skb
 * @skb: buffer to process
 *
 * Under rcu_read_lock(): untags 802.1Q frames, delivers to the ptype_all
 * taps, runs ingress filtering (CONFIG_NET_CLS_ACT), VLAN receive and the
 * device's rx_handler, then delivers to the matching ptype_base handlers.
 * Delivery is deferred by one step through pt_prev so that the last
 * handler is called directly through ->func().
 *
 * Returns the result of the last delivery, or NET_RX_DROP when netpoll
 * took the skb or no handler matched (the skb is freed in the latter case).
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* Re-entered when VLAN receive or an rx_handler asks for another pass. */
another_round:

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and ingress filtering. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (vlan_tx_tag_present(skb)) {
		/* Flush the pending handler before the skb may change. */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb, !rx_handler))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
3268
3269/**
3270 * netif_receive_skb - process receive buffer from network
3271 * @skb: buffer to process
3272 *
3273 * netif_receive_skb() is the main receive data processing function.
3274 * It always succeeds. The buffer may be dropped during processing
3275 * for congestion control or by the protocol layers.
3276 *
3277 * This function may only be called from softirq context and interrupts
3278 * should be enabled.
3279 *
3280 * Return values (usually ignored):
3281 * NET_RX_SUCCESS: no congestion
3282 * NET_RX_DROP: packet was dropped
3283 */
3284int netif_receive_skb(struct sk_buff *skb)
3285{
3286 net_timestamp_check(netdev_tstamp_prequeue, skb);
3287
3288 if (skb_defer_rx_timestamp(skb))
3289 return NET_RX_SUCCESS;
3290
3291#ifdef CONFIG_RPS
3292 if (static_key_false(&rps_needed)) {
3293 struct rps_dev_flow voidflow, *rflow = &voidflow;
3294 int cpu, ret;
3295
3296 rcu_read_lock();
3297
3298 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3299
3300 if (cpu >= 0) {
3301 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3302 rcu_read_unlock();
3303 return ret;
3304 }
3305 rcu_read_unlock();
3306 }
3307#endif
3308 return __netif_receive_skb(skb);
3309}
3310EXPORT_SYMBOL(netif_receive_skb);
3311
3312/* Network device is going away, flush any packets still pending
3313 * Called with irqs disabled.
3314 */
3315static void flush_backlog(void *arg)
3316{
3317 struct net_device *dev = arg;
3318 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3319 struct sk_buff *skb, *tmp;
3320
3321 rps_lock(sd);
3322 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3323 if (skb->dev == dev) {
3324 __skb_unlink(skb, &sd->input_pkt_queue);
3325 kfree_skb(skb);
3326 input_queue_head_incr(sd);
3327 }
3328 }
3329 rps_unlock(sd);
3330
3331 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3332 if (skb->dev == dev) {
3333 __skb_unlink(skb, &sd->process_queue);
3334 kfree_skb(skb);
3335 input_queue_head_incr(sd);
3336 }
3337 }
3338}
3339
3340static int napi_gro_complete(struct sk_buff *skb)
3341{
3342 struct packet_type *ptype;
3343 __be16 type = skb->protocol;
3344 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3345 int err = -ENOENT;
3346
3347 if (NAPI_GRO_CB(skb)->count == 1) {
3348 skb_shinfo(skb)->gso_size = 0;
3349 goto out;
3350 }
3351
3352 rcu_read_lock();
3353 list_for_each_entry_rcu(ptype, head, list) {
3354 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3355 continue;
3356
3357 err = ptype->gro_complete(skb);
3358 break;
3359 }
3360 rcu_read_unlock();
3361
3362 if (err) {
3363 WARN_ON(&ptype->list == head);
3364 kfree_skb(skb);
3365 return NET_RX_SUCCESS;
3366 }
3367
3368out:
3369 return netif_receive_skb(skb);
3370}
3371
3372inline void napi_gro_flush(struct napi_struct *napi)
3373{
3374 struct sk_buff *skb, *next;
3375
3376 for (skb = napi->gro_list; skb; skb = next) {
3377 next = skb->next;
3378 skb->next = NULL;
3379 napi_gro_complete(skb);
3380 }
3381
3382 napi->gro_count = 0;
3383 napi->gro_list = NULL;
3384}
3385EXPORT_SYMBOL(napi_gro_flush);
3386
3387enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3388{
3389 struct sk_buff **pp = NULL;
3390 struct packet_type *ptype;
3391 __be16 type = skb->protocol;
3392 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3393 int same_flow;
3394 int mac_len;
3395 enum gro_result ret;
3396
3397 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3398 goto normal;
3399
3400 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3401 goto normal;
3402
3403 rcu_read_lock();
3404 list_for_each_entry_rcu(ptype, head, list) {
3405 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3406 continue;
3407
3408 skb_set_network_header(skb, skb_gro_offset(skb));
3409 mac_len = skb->network_header - skb->mac_header;
3410 skb->mac_len = mac_len;
3411 NAPI_GRO_CB(skb)->same_flow = 0;
3412 NAPI_GRO_CB(skb)->flush = 0;
3413 NAPI_GRO_CB(skb)->free = 0;
3414
3415 pp = ptype->gro_receive(&napi->gro_list, skb);
3416 break;
3417 }
3418 rcu_read_unlock();
3419
3420 if (&ptype->list == head)
3421 goto normal;
3422
3423 same_flow = NAPI_GRO_CB(skb)->same_flow;
3424 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3425
3426 if (pp) {
3427 struct sk_buff *nskb = *pp;
3428
3429 *pp = nskb->next;
3430 nskb->next = NULL;
3431 napi_gro_complete(nskb);
3432 napi->gro_count--;
3433 }
3434
3435 if (same_flow)
3436 goto ok;
3437
3438 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3439 goto normal;
3440
3441 napi->gro_count++;
3442 NAPI_GRO_CB(skb)->count = 1;
3443 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3444 skb->next = napi->gro_list;
3445 napi->gro_list = skb;
3446 ret = GRO_HELD;
3447
3448pull:
3449 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3450 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3451
3452 BUG_ON(skb->end - skb->tail < grow);
3453
3454 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3455
3456 skb->tail += grow;
3457 skb->data_len -= grow;
3458
3459 skb_shinfo(skb)->frags[0].page_offset += grow;
3460 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3461
3462 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3463 skb_frag_unref(skb, 0);
3464 memmove(skb_shinfo(skb)->frags,
3465 skb_shinfo(skb)->frags + 1,
3466 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3467 }
3468 }
3469
3470ok:
3471 return ret;
3472
3473normal:
3474 ret = GRO_NORMAL;
3475 goto pull;
3476}
3477EXPORT_SYMBOL(dev_gro_receive);
3478
3479static inline gro_result_t
3480__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3481{
3482 struct sk_buff *p;
3483 unsigned int maclen = skb->dev->hard_header_len;
3484
3485 for (p = napi->gro_list; p; p = p->next) {
3486 unsigned long diffs;
3487
3488 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3489 diffs |= p->vlan_tci ^ skb->vlan_tci;
3490 if (maclen == ETH_HLEN)
3491 diffs |= compare_ether_header(skb_mac_header(p),
3492 skb_gro_mac_header(skb));
3493 else if (!diffs)
3494 diffs = memcmp(skb_mac_header(p),
3495 skb_gro_mac_header(skb),
3496 maclen);
3497 NAPI_GRO_CB(p)->same_flow = !diffs;
3498 NAPI_GRO_CB(p)->flush = 0;
3499 }
3500
3501 return dev_gro_receive(napi, skb);
3502}
3503
3504gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3505{
3506 switch (ret) {
3507 case GRO_NORMAL:
3508 if (netif_receive_skb(skb))
3509 ret = GRO_DROP;
3510 break;
3511
3512 case GRO_DROP:
3513 kfree_skb(skb);
3514 break;
3515
3516 case GRO_MERGED_FREE:
3517 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3518 kmem_cache_free(skbuff_head_cache, skb);
3519 else
3520 __kfree_skb(skb);
3521 break;
3522
3523 case GRO_HELD:
3524 case GRO_MERGED:
3525 break;
3526 }
3527
3528 return ret;
3529}
3530EXPORT_SYMBOL(napi_skb_finish);
3531
3532void skb_gro_reset_offset(struct sk_buff *skb)
3533{
3534 NAPI_GRO_CB(skb)->data_offset = 0;
3535 NAPI_GRO_CB(skb)->frag0 = NULL;
3536 NAPI_GRO_CB(skb)->frag0_len = 0;
3537
3538 if (skb->mac_header == skb->tail &&
3539 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3540 NAPI_GRO_CB(skb)->frag0 =
3541 skb_frag_address(&skb_shinfo(skb)->frags[0]);
3542 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3543 }
3544}
3545EXPORT_SYMBOL(skb_gro_reset_offset);
3546
3547gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3548{
3549 skb_gro_reset_offset(skb);
3550
3551 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3552}
3553EXPORT_SYMBOL(napi_gro_receive);
3554
3555static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3556{
3557 __skb_pull(skb, skb_headlen(skb));
3558 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3559 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3560 skb->vlan_tci = 0;
3561 skb->dev = napi->dev;
3562 skb->skb_iif = 0;
3563
3564 napi->skb = skb;
3565}
3566
3567struct sk_buff *napi_get_frags(struct napi_struct *napi)
3568{
3569 struct sk_buff *skb = napi->skb;
3570
3571 if (!skb) {
3572 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3573 if (skb)
3574 napi->skb = skb;
3575 }
3576 return skb;
3577}
3578EXPORT_SYMBOL(napi_get_frags);
3579
3580gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3581 gro_result_t ret)
3582{
3583 switch (ret) {
3584 case GRO_NORMAL:
3585 case GRO_HELD:
3586 skb->protocol = eth_type_trans(skb, skb->dev);
3587
3588 if (ret == GRO_HELD)
3589 skb_gro_pull(skb, -ETH_HLEN);
3590 else if (netif_receive_skb(skb))
3591 ret = GRO_DROP;
3592 break;
3593
3594 case GRO_DROP:
3595 case GRO_MERGED_FREE:
3596 napi_reuse_skb(napi, skb);
3597 break;
3598
3599 case GRO_MERGED:
3600 break;
3601 }
3602
3603 return ret;
3604}
3605EXPORT_SYMBOL(napi_frags_finish);
3606
3607static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3608{
3609 struct sk_buff *skb = napi->skb;
3610 struct ethhdr *eth;
3611 unsigned int hlen;
3612 unsigned int off;
3613
3614 napi->skb = NULL;
3615
3616 skb_reset_mac_header(skb);
3617 skb_gro_reset_offset(skb);
3618
3619 off = skb_gro_offset(skb);
3620 hlen = off + sizeof(*eth);
3621 eth = skb_gro_header_fast(skb, off);
3622 if (skb_gro_header_hard(skb, hlen)) {
3623 eth = skb_gro_header_slow(skb, hlen, off);
3624 if (unlikely(!eth)) {
3625 napi_reuse_skb(napi, skb);
3626 skb = NULL;
3627 goto out;
3628 }
3629 }
3630
3631 skb_gro_pull(skb, sizeof(*eth));
3632
3633 /*
3634 * This works because the only protocols we care about don't require
3635 * special handling. We'll fix it up properly at the end.
3636 */
3637 skb->protocol = eth->h_proto;
3638
3639out:
3640 return skb;
3641}
3642
3643gro_result_t napi_gro_frags(struct napi_struct *napi)
3644{
3645 struct sk_buff *skb = napi_frags_skb(napi);
3646
3647 if (!skb)
3648 return GRO_DROP;
3649
3650 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3651}
3652EXPORT_SYMBOL(napi_gro_frags);
3653
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs
 * enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	/* Take ownership of the whole list before irqs are re-enabled. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPIs to kick RPS processing on remote cpus. */
	while (remsd) {
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			__smp_call_function_single(remsd->cpu,
						   &remsd->csd, 0);
		remsd = next;
	}
#else
	local_irq_enable();
#endif
}
3681
3682static int process_backlog(struct napi_struct *napi, int quota)
3683{
3684 int work = 0;
3685 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3686
3687#ifdef CONFIG_RPS
3688 /* Check if we have pending ipi, its better to send them now,
3689 * not waiting net_rx_action() end.
3690 */
3691 if (sd->rps_ipi_list) {
3692 local_irq_disable();
3693 net_rps_action_and_irq_enable(sd);
3694 }
3695#endif
3696 napi->weight = weight_p;
3697 local_irq_disable();
3698 while (work < quota) {
3699 struct sk_buff *skb;
3700 unsigned int qlen;
3701
3702 while ((skb = __skb_dequeue(&sd->process_queue))) {
3703 local_irq_enable();
3704 __netif_receive_skb(skb);
3705 local_irq_disable();
3706 input_queue_head_incr(sd);
3707 if (++work >= quota) {
3708 local_irq_enable();
3709 return work;
3710 }
3711 }
3712
3713 rps_lock(sd);
3714 qlen = skb_queue_len(&sd->input_pkt_queue);
3715 if (qlen)
3716 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3717 &sd->process_queue);
3718
3719 if (qlen < quota - work) {
3720 /*
3721 * Inline a custom version of __napi_complete().
3722 * only current cpu owns and manipulates this napi,
3723 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3724 * we can use a plain write instead of clear_bit(),
3725 * and we dont need an smp_mb() memory barrier.
3726 */
3727 list_del(&napi->poll_list);
3728 napi->state = 0;
3729
3730 quota = work + qlen;
3731 }
3732 rps_unlock(sd);
3733 }
3734 local_irq_enable();
3735
3736 return work;
3737}
3738
3739/**
3740 * __napi_schedule - schedule for receive
3741 * @n: entry to schedule
3742 *
3743 * The entry's receive function will be scheduled to run
3744 */
3745void __napi_schedule(struct napi_struct *n)
3746{
3747 unsigned long flags;
3748
3749 local_irq_save(flags);
3750 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3751 local_irq_restore(flags);
3752}
3753EXPORT_SYMBOL(__napi_schedule);
3754
3755void __napi_complete(struct napi_struct *n)
3756{
3757 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3758 BUG_ON(n->gro_list);
3759
3760 list_del(&n->poll_list);
3761 smp_mb__before_clear_bit();
3762 clear_bit(NAPI_STATE_SCHED, &n->state);
3763}
3764EXPORT_SYMBOL(__napi_complete);
3765
3766void napi_complete(struct napi_struct *n)
3767{
3768 unsigned long flags;
3769
3770 /*
3771 * don't let napi dequeue from the cpu poll list
3772 * just in case its running on a different cpu
3773 */
3774 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3775 return;
3776
3777 napi_gro_flush(n);
3778 local_irq_save(flags);
3779 __napi_complete(n);
3780 local_irq_restore(flags);
3781}
3782EXPORT_SYMBOL(napi_complete);
3783
3784void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3785 int (*poll)(struct napi_struct *, int), int weight)
3786{
3787 INIT_LIST_HEAD(&napi->poll_list);
3788 napi->gro_count = 0;
3789 napi->gro_list = NULL;
3790 napi->skb = NULL;
3791 napi->poll = poll;
3792 napi->weight = weight;
3793 list_add(&napi->dev_list, &dev->napi_list);
3794 napi->dev = dev;
3795#ifdef CONFIG_NETPOLL
3796 spin_lock_init(&napi->poll_lock);
3797 napi->poll_owner = -1;
3798#endif
3799 set_bit(NAPI_STATE_SCHED, &napi->state);
3800}
3801EXPORT_SYMBOL(netif_napi_add);
3802
3803void netif_napi_del(struct napi_struct *napi)
3804{
3805 struct sk_buff *skb, *next;
3806
3807 list_del_init(&napi->dev_list);
3808 napi_free_frags(napi);
3809
3810 for (skb = napi->gro_list; skb; skb = next) {
3811 next = skb->next;
3812 skb->next = NULL;
3813 kfree_skb(skb);
3814 }
3815
3816 napi->gro_list = NULL;
3817 napi->gro_count = 0;
3818}
3819EXPORT_SYMBOL(netif_napi_del);
3820
3821static void net_rx_action(struct softirq_action *h)
3822{
3823 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3824 unsigned long time_limit = jiffies + 2;
3825 int budget = netdev_budget;
3826 void *have;
3827
3828 local_irq_disable();
3829
3830 while (!list_empty(&sd->poll_list)) {
3831 struct napi_struct *n;
3832 int work, weight;
3833
3834 /* If softirq window is exhuasted then punt.
3835 * Allow this to run for 2 jiffies since which will allow
3836 * an average latency of 1.5/HZ.
3837 */
3838 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3839 goto softnet_break;
3840
3841 local_irq_enable();
3842
3843 /* Even though interrupts have been re-enabled, this
3844 * access is safe because interrupts can only add new
3845 * entries to the tail of this list, and only ->poll()
3846 * calls can remove this head entry from the list.
3847 */
3848 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3849
3850 have = netpoll_poll_lock(n);
3851
3852 weight = n->weight;
3853
3854 /* This NAPI_STATE_SCHED test is for avoiding a race
3855 * with netpoll's poll_napi(). Only the entity which
3856 * obtains the lock and sees NAPI_STATE_SCHED set will
3857 * actually make the ->poll() call. Therefore we avoid
3858 * accidentally calling ->poll() when NAPI is not scheduled.
3859 */
3860 work = 0;
3861 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3862 work = n->poll(n, weight);
3863 trace_napi_poll(n);
3864 }
3865
3866 WARN_ON_ONCE(work > weight);
3867
3868 budget -= work;
3869
3870 local_irq_disable();
3871
3872 /* Drivers must not modify the NAPI state if they
3873 * consume the entire weight. In such cases this code
3874 * still "owns" the NAPI instance and therefore can
3875 * move the instance around on the list at-will.
3876 */
3877 if (unlikely(work == weight)) {
3878 if (unlikely(napi_disable_pending(n))) {
3879 local_irq_enable();
3880 napi_complete(n);
3881 local_irq_disable();
3882 } else
3883 list_move_tail(&n->poll_list, &sd->poll_list);
3884 }
3885
3886 netpoll_poll_unlock(have);
3887 }
3888out:
3889 net_rps_action_and_irq_enable(sd);
3890
3891#ifdef CONFIG_NET_DMA
3892 /*
3893 * There may not be any more sk_buffs coming right now, so push
3894 * any pending DMA copies to hardware
3895 */
3896 dma_issue_pending_all();
3897#endif
3898
3899 return;
3900
3901softnet_break:
3902 sd->time_squeeze++;
3903 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3904 goto out;
3905}
3906
3907static gifconf_func_t *gifconf_list[NPROTO];
3908
3909/**
3910 * register_gifconf - register a SIOCGIF handler
3911 * @family: Address family
3912 * @gifconf: Function handler
3913 *
3914 * Register protocol dependent address dumping routines. The handler
3915 * that is passed must not be freed or reused until it has been replaced
3916 * by another handler.
3917 */
3918int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3919{
3920 if (family >= NPROTO)
3921 return -EINVAL;
3922 gifconf_list[family] = gifconf;
3923 return 0;
3924}
3925EXPORT_SYMBOL(register_gifconf);
3926
3927
3928/*
3929 * Map an interface index to its name (SIOCGIFNAME)
3930 */
3931
3932/*
3933 * We need this ioctl for efficient implementation of the
3934 * if_indextoname() function required by the IPv6 API. Without
3935 * it, we would have to search all the interfaces to find a
3936 * match. --pb
3937 */
3938
3939static int dev_ifname(struct net *net, struct ifreq __user *arg)
3940{
3941 struct net_device *dev;
3942 struct ifreq ifr;
3943
3944 /*
3945 * Fetch the caller's info block.
3946 */
3947
3948 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3949 return -EFAULT;
3950
3951 rcu_read_lock();
3952 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3953 if (!dev) {
3954 rcu_read_unlock();
3955 return -ENODEV;
3956 }
3957
3958 strcpy(ifr.ifr_name, dev->name);
3959 rcu_read_unlock();
3960
3961 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3962 return -EFAULT;
3963 return 0;
3964}
3965
3966/*
3967 * Perform a SIOCGIFCONF call. This structure will change
3968 * size eventually, and there is nothing I can do about it.
3969 * Thus we will need a 'compatibility mode'.
3970 */
3971
3972static int dev_ifconf(struct net *net, char __user *arg)
3973{
3974 struct ifconf ifc;
3975 struct net_device *dev;
3976 char __user *pos;
3977 int len;
3978 int total;
3979 int i;
3980
3981 /*
3982 * Fetch the caller's info block.
3983 */
3984
3985 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3986 return -EFAULT;
3987
3988 pos = ifc.ifc_buf;
3989 len = ifc.ifc_len;
3990
3991 /*
3992 * Loop over the interfaces, and write an info block for each.
3993 */
3994
3995 total = 0;
3996 for_each_netdev(net, dev) {
3997 for (i = 0; i < NPROTO; i++) {
3998 if (gifconf_list[i]) {
3999 int done;
4000 if (!pos)
4001 done = gifconf_list[i](dev, NULL, 0);
4002 else
4003 done = gifconf_list[i](dev, pos + total,
4004 len - total);
4005 if (done < 0)
4006 return -EFAULT;
4007 total += done;
4008 }
4009 }
4010 }
4011
4012 /*
4013 * All done. Write the updated control block back to the caller.
4014 */
4015 ifc.ifc_len = total;
4016
4017 /*
4018 * Both BSD and Solaris return 0 here, so we do too.
4019 */
4020 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4021}
4022
4023#ifdef CONFIG_PROC_FS
4024
/*
 * A seq_file position for the device listing packs two fields: the hash
 * bucket index in the bits above BUCKET_SPACE and an offset within that
 * bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE		(32 - NETDEV_HASHBITS - 1)

#define get_bucket(x)		((x) >> BUCKET_SPACE)
#define get_offset(x)		((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o)	(((b) << BUCKET_SPACE) | (o))
4030
4031static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4032{
4033 struct net *net = seq_file_net(seq);
4034 struct net_device *dev;
4035 struct hlist_node *p;
4036 struct hlist_head *h;
4037 unsigned int count = 0, offset = get_offset(*pos);
4038
4039 h = &net->dev_name_head[get_bucket(*pos)];
4040 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4041 if (++count == offset)
4042 return dev;
4043 }
4044
4045 return NULL;
4046}
4047
4048static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4049{
4050 struct net_device *dev;
4051 unsigned int bucket;
4052
4053 do {
4054 dev = dev_from_same_bucket(seq, pos);
4055 if (dev)
4056 return dev;
4057
4058 bucket = get_bucket(*pos) + 1;
4059 *pos = set_bucket_offset(bucket, 1);
4060 } while (bucket < NETDEV_HASHENTRIES);
4061
4062 return NULL;
4063}
4064
4065/*
4066 * This is invoked by the /proc filesystem handler to display a device
4067 * in detail.
4068 */
4069void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4070 __acquires(RCU)
4071{
4072 rcu_read_lock();
4073 if (!*pos)
4074 return SEQ_START_TOKEN;
4075
4076 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4077 return NULL;
4078
4079 return dev_from_bucket(seq, pos);
4080}
4081
4082void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4083{
4084 ++*pos;
4085 return dev_from_bucket(seq, pos);
4086}
4087
4088void dev_seq_stop(struct seq_file *seq, void *v)
4089 __releases(RCU)
4090{
4091 rcu_read_unlock();
4092}
4093
4094static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4095{
4096 struct rtnl_link_stats64 temp;
4097 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4098
4099 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4100 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4101 dev->name, stats->rx_bytes, stats->rx_packets,
4102 stats->rx_errors,
4103 stats->rx_dropped + stats->rx_missed_errors,
4104 stats->rx_fifo_errors,
4105 stats->rx_length_errors + stats->rx_over_errors +
4106 stats->rx_crc_errors + stats->rx_frame_errors,
4107 stats->rx_compressed, stats->multicast,
4108 stats->tx_bytes, stats->tx_packets,
4109 stats->tx_errors, stats->tx_dropped,
4110 stats->tx_fifo_errors, stats->collisions,
4111 stats->tx_carrier_errors +
4112 stats->tx_aborted_errors +
4113 stats->tx_window_errors +
4114 stats->tx_heartbeat_errors,
4115 stats->tx_compressed);
4116}
4117
4118/*
4119 * Called from the PROCfs module. This now uses the new arbitrary sized
4120 * /proc/net interface to create /proc/net/dev
4121 */
4122static int dev_seq_show(struct seq_file *seq, void *v)
4123{
4124 if (v == SEQ_START_TOKEN)
4125 seq_puts(seq, "Inter-| Receive "
4126 " | Transmit\n"
4127 " face |bytes packets errs drop fifo frame "
4128 "compressed multicast|bytes packets errs "
4129 "drop fifo colls carrier compressed\n");
4130 else
4131 dev_seq_printf_stats(seq, v);
4132 return 0;
4133}
4134
4135static struct softnet_data *softnet_get_online(loff_t *pos)
4136{
4137 struct softnet_data *sd = NULL;
4138
4139 while (*pos < nr_cpu_ids)
4140 if (cpu_online(*pos)) {
4141 sd = &per_cpu(softnet_data, *pos);
4142 break;
4143 } else
4144 ++*pos;
4145 return sd;
4146}
4147
4148static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4149{
4150 return softnet_get_online(pos);
4151}
4152
4153static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4154{
4155 ++*pos;
4156 return softnet_get_online(pos);
4157}
4158
4159static void softnet_seq_stop(struct seq_file *seq, void *v)
4160{
4161}
4162
4163static int softnet_seq_show(struct seq_file *seq, void *v)
4164{
4165 struct softnet_data *sd = v;
4166
4167 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4168 sd->processed, sd->dropped, sd->time_squeeze, 0,
4169 0, 0, 0, 0, /* was fastroute */
4170 sd->cpu_collision, sd->received_rps);
4171 return 0;
4172}
4173
4174static const struct seq_operations dev_seq_ops = {
4175 .start = dev_seq_start,
4176 .next = dev_seq_next,
4177 .stop = dev_seq_stop,
4178 .show = dev_seq_show,
4179};
4180
4181static int dev_seq_open(struct inode *inode, struct file *file)
4182{
4183 return seq_open_net(inode, file, &dev_seq_ops,
4184 sizeof(struct seq_net_private));
4185}
4186
4187static const struct file_operations dev_seq_fops = {
4188 .owner = THIS_MODULE,
4189 .open = dev_seq_open,
4190 .read = seq_read,
4191 .llseek = seq_lseek,
4192 .release = seq_release_net,
4193};
4194
4195static const struct seq_operations softnet_seq_ops = {
4196 .start = softnet_seq_start,
4197 .next = softnet_seq_next,
4198 .stop = softnet_seq_stop,
4199 .show = softnet_seq_show,
4200};
4201
4202static int softnet_seq_open(struct inode *inode, struct file *file)
4203{
4204 return seq_open(file, &softnet_seq_ops);
4205}
4206
4207static const struct file_operations softnet_seq_fops = {
4208 .owner = THIS_MODULE,
4209 .open = softnet_seq_open,
4210 .read = seq_read,
4211 .llseek = seq_lseek,
4212 .release = seq_release,
4213};
4214
4215static void *ptype_get_idx(loff_t pos)
4216{
4217 struct packet_type *pt = NULL;
4218 loff_t i = 0;
4219 int t;
4220
4221 list_for_each_entry_rcu(pt, &ptype_all, list) {
4222 if (i == pos)
4223 return pt;
4224 ++i;
4225 }
4226
4227 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4228 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4229 if (i == pos)
4230 return pt;
4231 ++i;
4232 }
4233 }
4234 return NULL;
4235}
4236
4237static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4238 __acquires(RCU)
4239{
4240 rcu_read_lock();
4241 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4242}
4243
4244static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4245{
4246 struct packet_type *pt;
4247 struct list_head *nxt;
4248 int hash;
4249
4250 ++*pos;
4251 if (v == SEQ_START_TOKEN)
4252 return ptype_get_idx(0);
4253
4254 pt = v;
4255 nxt = pt->list.next;
4256 if (pt->type == htons(ETH_P_ALL)) {
4257 if (nxt != &ptype_all)
4258 goto found;
4259 hash = 0;
4260 nxt = ptype_base[0].next;
4261 } else
4262 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4263
4264 while (nxt == &ptype_base[hash]) {
4265 if (++hash >= PTYPE_HASH_SIZE)
4266 return NULL;
4267 nxt = ptype_base[hash].next;
4268 }
4269found:
4270 return list_entry(nxt, struct packet_type, list);
4271}
4272
4273static void ptype_seq_stop(struct seq_file *seq, void *v)
4274 __releases(RCU)
4275{
4276 rcu_read_unlock();
4277}
4278
4279static int ptype_seq_show(struct seq_file *seq, void *v)
4280{
4281 struct packet_type *pt = v;
4282
4283 if (v == SEQ_START_TOKEN)
4284 seq_puts(seq, "Type Device Function\n");
4285 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4286 if (pt->type == htons(ETH_P_ALL))
4287 seq_puts(seq, "ALL ");
4288 else
4289 seq_printf(seq, "%04x", ntohs(pt->type));
4290
4291 seq_printf(seq, " %-8s %pF\n",
4292 pt->dev ? pt->dev->name : "", pt->func);
4293 }
4294
4295 return 0;
4296}
4297
4298static const struct seq_operations ptype_seq_ops = {
4299 .start = ptype_seq_start,
4300 .next = ptype_seq_next,
4301 .stop = ptype_seq_stop,
4302 .show = ptype_seq_show,
4303};
4304
4305static int ptype_seq_open(struct inode *inode, struct file *file)
4306{
4307 return seq_open_net(inode, file, &ptype_seq_ops,
4308 sizeof(struct seq_net_private));
4309}
4310
4311static const struct file_operations ptype_seq_fops = {
4312 .owner = THIS_MODULE,
4313 .open = ptype_seq_open,
4314 .read = seq_read,
4315 .llseek = seq_lseek,
4316 .release = seq_release_net,
4317};
4318
4319
4320static int __net_init dev_proc_net_init(struct net *net)
4321{
4322 int rc = -ENOMEM;
4323
4324 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4325 goto out;
4326 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4327 goto out_dev;
4328 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4329 goto out_softnet;
4330
4331 if (wext_proc_init(net))
4332 goto out_ptype;
4333 rc = 0;
4334out:
4335 return rc;
4336out_ptype:
4337 proc_net_remove(net, "ptype");
4338out_softnet:
4339 proc_net_remove(net, "softnet_stat");
4340out_dev:
4341 proc_net_remove(net, "dev");
4342 goto out;
4343}
4344
4345static void __net_exit dev_proc_net_exit(struct net *net)
4346{
4347 wext_proc_exit(net);
4348
4349 proc_net_remove(net, "ptype");
4350 proc_net_remove(net, "softnet_stat");
4351 proc_net_remove(net, "dev");
4352}
4353
4354static struct pernet_operations __net_initdata dev_proc_ops = {
4355 .init = dev_proc_net_init,
4356 .exit = dev_proc_net_exit,
4357};
4358
4359static int __init dev_proc_init(void)
4360{
4361 return register_pernet_subsys(&dev_proc_ops);
4362}
4363#else
4364#define dev_proc_init() 0
4365#endif /* CONFIG_PROC_FS */
4366
4367
4368/**
4369 * netdev_set_master - set up master pointer
4370 * @slave: slave device
4371 * @master: new master device
4372 *
4373 * Changes the master device of the slave. Pass %NULL to break the
4374 * bonding. The caller must hold the RTNL semaphore. On a failure
4375 * a negative errno code is returned. On success the reference counts
4376 * are adjusted and the function returns zero.
4377 */
4378int netdev_set_master(struct net_device *slave, struct net_device *master)
4379{
4380 struct net_device *old = slave->master;
4381
4382 ASSERT_RTNL();
4383
4384 if (master) {
4385 if (old)
4386 return -EBUSY;
4387 dev_hold(master);
4388 }
4389
4390 slave->master = master;
4391
4392 if (old)
4393 dev_put(old);
4394 return 0;
4395}
4396EXPORT_SYMBOL(netdev_set_master);
4397
4398/**
4399 * netdev_set_bond_master - set up bonding master/slave pair
4400 * @slave: slave device
4401 * @master: new master device
4402 *
4403 * Changes the master device of the slave. Pass %NULL to break the
4404 * bonding. The caller must hold the RTNL semaphore. On a failure
4405 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4406 * to the routing socket and the function returns zero.
4407 */
4408int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4409{
4410 int err;
4411
4412 ASSERT_RTNL();
4413
4414 err = netdev_set_master(slave, master);
4415 if (err)
4416 return err;
4417 if (master)
4418 slave->flags |= IFF_SLAVE;
4419 else
4420 slave->flags &= ~IFF_SLAVE;
4421
4422 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4423 return 0;
4424}
4425EXPORT_SYMBOL(netdev_set_bond_master);
4426
4427static void dev_change_rx_flags(struct net_device *dev, int flags)
4428{
4429 const struct net_device_ops *ops = dev->netdev_ops;
4430
4431 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4432 ops->ndo_change_rx_flags(dev, flags);
4433}
4434
4435static int __dev_set_promiscuity(struct net_device *dev, int inc)
4436{
4437 unsigned int old_flags = dev->flags;
4438 uid_t uid;
4439 gid_t gid;
4440
4441 ASSERT_RTNL();
4442
4443 dev->flags |= IFF_PROMISC;
4444 dev->promiscuity += inc;
4445 if (dev->promiscuity == 0) {
4446 /*
4447 * Avoid overflow.
4448 * If inc causes overflow, untouch promisc and return error.
4449 */
4450 if (inc < 0)
4451 dev->flags &= ~IFF_PROMISC;
4452 else {
4453 dev->promiscuity -= inc;
4454 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4455 dev->name);
4456 return -EOVERFLOW;
4457 }
4458 }
4459 if (dev->flags != old_flags) {
4460 pr_info("device %s %s promiscuous mode\n",
4461 dev->name,
4462 dev->flags & IFF_PROMISC ? "entered" : "left");
4463 if (audit_enabled) {
4464 current_uid_gid(&uid, &gid);
4465 audit_log(current->audit_context, GFP_ATOMIC,
4466 AUDIT_ANOM_PROMISCUOUS,
4467 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4468 dev->name, (dev->flags & IFF_PROMISC),
4469 (old_flags & IFF_PROMISC),
4470 audit_get_loginuid(current),
4471 uid, gid,
4472 audit_get_sessionid(current));
4473 }
4474
4475 dev_change_rx_flags(dev, IFF_PROMISC);
4476 }
4477 return 0;
4478}
4479
4480/**
4481 * dev_set_promiscuity - update promiscuity count on a device
4482 * @dev: device
4483 * @inc: modifier
4484 *
4485 * Add or remove promiscuity from a device. While the count in the device
4486 * remains above zero the interface remains promiscuous. Once it hits zero
4487 * the device reverts back to normal filtering operation. A negative inc
4488 * value is used to drop promiscuity on the device.
4489 * Return 0 if successful or a negative errno code on error.
4490 */
4491int dev_set_promiscuity(struct net_device *dev, int inc)
4492{
4493 unsigned int old_flags = dev->flags;
4494 int err;
4495
4496 err = __dev_set_promiscuity(dev, inc);
4497 if (err < 0)
4498 return err;
4499 if (dev->flags != old_flags)
4500 dev_set_rx_mode(dev);
4501 return err;
4502}
4503EXPORT_SYMBOL(dev_set_promiscuity);
4504
4505/**
4506 * dev_set_allmulti - update allmulti count on a device
4507 * @dev: device
4508 * @inc: modifier
4509 *
4510 * Add or remove reception of all multicast frames to a device. While the
4511 * count in the device remains above zero the interface remains listening
4512 * to all interfaces. Once it hits zero the device reverts back to normal
4513 * filtering operation. A negative @inc value is used to drop the counter
4514 * when releasing a resource needing all multicasts.
4515 * Return 0 if successful or a negative errno code on error.
4516 */
4517
4518int dev_set_allmulti(struct net_device *dev, int inc)
4519{
4520 unsigned int old_flags = dev->flags;
4521
4522 ASSERT_RTNL();
4523
4524 dev->flags |= IFF_ALLMULTI;
4525 dev->allmulti += inc;
4526 if (dev->allmulti == 0) {
4527 /*
4528 * Avoid overflow.
4529 * If inc causes overflow, untouch allmulti and return error.
4530 */
4531 if (inc < 0)
4532 dev->flags &= ~IFF_ALLMULTI;
4533 else {
4534 dev->allmulti -= inc;
4535 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4536 dev->name);
4537 return -EOVERFLOW;
4538 }
4539 }
4540 if (dev->flags ^ old_flags) {
4541 dev_change_rx_flags(dev, IFF_ALLMULTI);
4542 dev_set_rx_mode(dev);
4543 }
4544 return 0;
4545}
4546EXPORT_SYMBOL(dev_set_allmulti);
4547
4548/*
4549 * Upload unicast and multicast address lists to device and
4550 * configure RX filtering. When the device doesn't support unicast
4551 * filtering it is put in promiscuous mode while unicast addresses
4552 * are present.
4553 */
4554void __dev_set_rx_mode(struct net_device *dev)
4555{
4556 const struct net_device_ops *ops = dev->netdev_ops;
4557
4558 /* dev_open will call this function so the list will stay sane. */
4559 if (!(dev->flags&IFF_UP))
4560 return;
4561
4562 if (!netif_device_present(dev))
4563 return;
4564
4565 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4566 /* Unicast addresses changes may only happen under the rtnl,
4567 * therefore calling __dev_set_promiscuity here is safe.
4568 */
4569 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4570 __dev_set_promiscuity(dev, 1);
4571 dev->uc_promisc = true;
4572 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4573 __dev_set_promiscuity(dev, -1);
4574 dev->uc_promisc = false;
4575 }
4576 }
4577
4578 if (ops->ndo_set_rx_mode)
4579 ops->ndo_set_rx_mode(dev);
4580}
4581
/*
 *	Run __dev_set_rx_mode() between netif_addr_lock_bh() and
 *	netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4588
4589/**
4590 * dev_get_flags - get flags reported to userspace
4591 * @dev: device
4592 *
4593 * Get the combination of flag bits exported through APIs to userspace.
4594 */
4595unsigned int dev_get_flags(const struct net_device *dev)
4596{
4597 unsigned int flags;
4598
4599 flags = (dev->flags & ~(IFF_PROMISC |
4600 IFF_ALLMULTI |
4601 IFF_RUNNING |
4602 IFF_LOWER_UP |
4603 IFF_DORMANT)) |
4604 (dev->gflags & (IFF_PROMISC |
4605 IFF_ALLMULTI));
4606
4607 if (netif_running(dev)) {
4608 if (netif_oper_up(dev))
4609 flags |= IFF_RUNNING;
4610 if (netif_carrier_ok(dev))
4611 flags |= IFF_LOWER_UP;
4612 if (netif_dormant(dev))
4613 flags |= IFF_DORMANT;
4614 }
4615
4616 return flags;
4617}
4618EXPORT_SYMBOL(dev_get_flags);
4619
4620int __dev_change_flags(struct net_device *dev, unsigned int flags)
4621{
4622 unsigned int old_flags = dev->flags;
4623 int ret;
4624
4625 ASSERT_RTNL();
4626
4627 /*
4628 * Set the flags on our device.
4629 */
4630
4631 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4632 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4633 IFF_AUTOMEDIA)) |
4634 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4635 IFF_ALLMULTI));
4636
4637 /*
4638 * Load in the correct multicast list now the flags have changed.
4639 */
4640
4641 if ((old_flags ^ flags) & IFF_MULTICAST)
4642 dev_change_rx_flags(dev, IFF_MULTICAST);
4643
4644 dev_set_rx_mode(dev);
4645
4646 /*
4647 * Have we downed the interface. We handle IFF_UP ourselves
4648 * according to user attempts to set it, rather than blindly
4649 * setting it.
4650 */
4651
4652 ret = 0;
4653 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4654 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4655
4656 if (!ret)
4657 dev_set_rx_mode(dev);
4658 }
4659
4660 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4661 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4662
4663 dev->gflags ^= IFF_PROMISC;
4664 dev_set_promiscuity(dev, inc);
4665 }
4666
4667 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4668 is important. Some (broken) drivers set IFF_PROMISC, when
4669 IFF_ALLMULTI is requested not asking us and not reporting.
4670 */
4671 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4672 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4673
4674 dev->gflags ^= IFF_ALLMULTI;
4675 dev_set_allmulti(dev, inc);
4676 }
4677
4678 return ret;
4679}
4680
4681void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4682{
4683 unsigned int changes = dev->flags ^ old_flags;
4684
4685 if (changes & IFF_UP) {
4686 if (dev->flags & IFF_UP)
4687 call_netdevice_notifiers(NETDEV_UP, dev);
4688 else
4689 call_netdevice_notifiers(NETDEV_DOWN, dev);
4690 }
4691
4692 if (dev->flags & IFF_UP &&
4693 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4694 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4695}
4696
4697/**
4698 * dev_change_flags - change device settings
4699 * @dev: device
4700 * @flags: device state flags
4701 *
4702 * Change settings on device based state flags. The flags are
4703 * in the userspace exported format.
4704 */
4705int dev_change_flags(struct net_device *dev, unsigned int flags)
4706{
4707 int ret;
4708 unsigned int changes, old_flags = dev->flags;
4709
4710 ret = __dev_change_flags(dev, flags);
4711 if (ret < 0)
4712 return ret;
4713
4714 changes = old_flags ^ dev->flags;
4715 if (changes)
4716 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4717
4718 __dev_notify_flags(dev, old_flags);
4719 return ret;
4720}
4721EXPORT_SYMBOL(dev_change_flags);
4722
4723/**
4724 * dev_set_mtu - Change maximum transfer unit
4725 * @dev: device
4726 * @new_mtu: new transfer unit
4727 *
4728 * Change the maximum transfer size of the network device.
4729 */
4730int dev_set_mtu(struct net_device *dev, int new_mtu)
4731{
4732 const struct net_device_ops *ops = dev->netdev_ops;
4733 int err;
4734
4735 if (new_mtu == dev->mtu)
4736 return 0;
4737
4738 /* MTU must be positive. */
4739 if (new_mtu < 0)
4740 return -EINVAL;
4741
4742 if (!netif_device_present(dev))
4743 return -ENODEV;
4744
4745 err = 0;
4746 if (ops->ndo_change_mtu)
4747 err = ops->ndo_change_mtu(dev, new_mtu);
4748 else
4749 dev->mtu = new_mtu;
4750
4751 if (!err && dev->flags & IFF_UP)
4752 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4753 return err;
4754}
4755EXPORT_SYMBOL(dev_set_mtu);
4756
4757/**
4758 * dev_set_group - Change group this device belongs to
4759 * @dev: device
4760 * @new_group: group this device should belong to
4761 */
4762void dev_set_group(struct net_device *dev, int new_group)
4763{
4764 dev->group = new_group;
4765}
4766EXPORT_SYMBOL(dev_set_group);
4767
4768/**
4769 * dev_set_mac_address - Change Media Access Control Address
4770 * @dev: device
4771 * @sa: new address
4772 *
4773 * Change the hardware (MAC) address of the device
4774 */
4775int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4776{
4777 const struct net_device_ops *ops = dev->netdev_ops;
4778 int err;
4779
4780 if (!ops->ndo_set_mac_address)
4781 return -EOPNOTSUPP;
4782 if (sa->sa_family != dev->type)
4783 return -EINVAL;
4784 if (!netif_device_present(dev))
4785 return -ENODEV;
4786 err = ops->ndo_set_mac_address(dev, sa);
4787 if (!err)
4788 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4789 add_device_randomness(dev->dev_addr, dev->addr_len);
4790 return err;
4791}
4792EXPORT_SYMBOL(dev_set_mac_address);
4793
4794/*
4795 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4796 */
4797static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4798{
4799 int err;
4800 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4801
4802 if (!dev)
4803 return -ENODEV;
4804
4805 switch (cmd) {
4806 case SIOCGIFFLAGS: /* Get interface flags */
4807 ifr->ifr_flags = (short) dev_get_flags(dev);
4808 return 0;
4809
4810 case SIOCGIFMETRIC: /* Get the metric on the interface
4811 (currently unused) */
4812 ifr->ifr_metric = 0;
4813 return 0;
4814
4815 case SIOCGIFMTU: /* Get the MTU of a device */
4816 ifr->ifr_mtu = dev->mtu;
4817 return 0;
4818
4819 case SIOCGIFHWADDR:
4820 if (!dev->addr_len)
4821 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4822 else
4823 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4824 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4825 ifr->ifr_hwaddr.sa_family = dev->type;
4826 return 0;
4827
4828 case SIOCGIFSLAVE:
4829 err = -EINVAL;
4830 break;
4831
4832 case SIOCGIFMAP:
4833 ifr->ifr_map.mem_start = dev->mem_start;
4834 ifr->ifr_map.mem_end = dev->mem_end;
4835 ifr->ifr_map.base_addr = dev->base_addr;
4836 ifr->ifr_map.irq = dev->irq;
4837 ifr->ifr_map.dma = dev->dma;
4838 ifr->ifr_map.port = dev->if_port;
4839 return 0;
4840
4841 case SIOCGIFINDEX:
4842 ifr->ifr_ifindex = dev->ifindex;
4843 return 0;
4844
4845 case SIOCGIFTXQLEN:
4846 ifr->ifr_qlen = dev->tx_queue_len;
4847 return 0;
4848
4849 default:
4850 /* dev_ioctl() should ensure this case
4851 * is never reached
4852 */
4853 WARN_ON(1);
4854 err = -ENOTTY;
4855 break;
4856
4857 }
4858 return err;
4859}
4860
4861/*
4862 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4863 */
4864static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4865{
4866 int err;
4867 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4868 const struct net_device_ops *ops;
4869
4870 if (!dev)
4871 return -ENODEV;
4872
4873 ops = dev->netdev_ops;
4874
4875 switch (cmd) {
4876 case SIOCSIFFLAGS: /* Set interface flags */
4877 return dev_change_flags(dev, ifr->ifr_flags);
4878
4879 case SIOCSIFMETRIC: /* Set the metric on the interface
4880 (currently unused) */
4881 return -EOPNOTSUPP;
4882
4883 case SIOCSIFMTU: /* Set the MTU of a device */
4884 return dev_set_mtu(dev, ifr->ifr_mtu);
4885
4886 case SIOCSIFHWADDR:
4887 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4888
4889 case SIOCSIFHWBROADCAST:
4890 if (ifr->ifr_hwaddr.sa_family != dev->type)
4891 return -EINVAL;
4892 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4893 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4894 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4895 return 0;
4896
4897 case SIOCSIFMAP:
4898 if (ops->ndo_set_config) {
4899 if (!netif_device_present(dev))
4900 return -ENODEV;
4901 return ops->ndo_set_config(dev, &ifr->ifr_map);
4902 }
4903 return -EOPNOTSUPP;
4904
4905 case SIOCADDMULTI:
4906 if (!ops->ndo_set_rx_mode ||
4907 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4908 return -EINVAL;
4909 if (!netif_device_present(dev))
4910 return -ENODEV;
4911 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4912
4913 case SIOCDELMULTI:
4914 if (!ops->ndo_set_rx_mode ||
4915 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4916 return -EINVAL;
4917 if (!netif_device_present(dev))
4918 return -ENODEV;
4919 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4920
4921 case SIOCSIFTXQLEN:
4922 if (ifr->ifr_qlen < 0)
4923 return -EINVAL;
4924 dev->tx_queue_len = ifr->ifr_qlen;
4925 return 0;
4926
4927 case SIOCSIFNAME:
4928 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4929 return dev_change_name(dev, ifr->ifr_newname);
4930
4931 case SIOCSHWTSTAMP:
4932 err = net_hwtstamp_validate(ifr);
4933 if (err)
4934 return err;
4935 /* fall through */
4936
4937 /*
4938 * Unknown or private ioctl
4939 */
4940 default:
4941 if ((cmd >= SIOCDEVPRIVATE &&
4942 cmd <= SIOCDEVPRIVATE + 15) ||
4943 cmd == SIOCBONDENSLAVE ||
4944 cmd == SIOCBONDRELEASE ||
4945 cmd == SIOCBONDSETHWADDR ||
4946 cmd == SIOCBONDSLAVEINFOQUERY ||
4947 cmd == SIOCBONDINFOQUERY ||
4948 cmd == SIOCBONDCHANGEACTIVE ||
4949 cmd == SIOCGMIIPHY ||
4950 cmd == SIOCGMIIREG ||
4951 cmd == SIOCSMIIREG ||
4952 cmd == SIOCBRADDIF ||
4953 cmd == SIOCBRDELIF ||
4954 cmd == SIOCSHWTSTAMP ||
4955 cmd == SIOCWANDEV) {
4956 err = -EOPNOTSUPP;
4957 if (ops->ndo_do_ioctl) {
4958 if (netif_device_present(dev))
4959 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4960 else
4961 err = -ENODEV;
4962 }
4963 } else
4964 err = -EINVAL;
4965
4966 }
4967 return err;
4968}
4969
4970/*
4971 * This function handles all "interface"-type I/O control requests. The actual
4972 * 'doing' part of this is dev_ifsioc above.
4973 */
4974
4975/**
4976 * dev_ioctl - network device ioctl
4977 * @net: the applicable net namespace
4978 * @cmd: command to issue
4979 * @arg: pointer to a struct ifreq in user space
4980 *
4981 * Issue ioctl functions to devices. This is normally called by the
4982 * user space syscall interfaces but can sometimes be useful for
4983 * other purposes. The return value is the return from the syscall if
4984 * positive or a negative errno code on error.
4985 */
4986
4987int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4988{
4989 struct ifreq ifr;
4990 int ret;
4991 char *colon;
4992
4993 /* One special case: SIOCGIFCONF takes ifconf argument
4994 and requires shared lock, because it sleeps writing
4995 to user space.
4996 */
4997
4998 if (cmd == SIOCGIFCONF) {
4999 rtnl_lock();
5000 ret = dev_ifconf(net, (char __user *) arg);
5001 rtnl_unlock();
5002 return ret;
5003 }
5004 if (cmd == SIOCGIFNAME)
5005 return dev_ifname(net, (struct ifreq __user *)arg);
5006
5007 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5008 return -EFAULT;
5009
5010 ifr.ifr_name[IFNAMSIZ-1] = 0;
5011
5012 colon = strchr(ifr.ifr_name, ':');
5013 if (colon)
5014 *colon = 0;
5015
5016 /*
5017 * See which interface the caller is talking about.
5018 */
5019
5020 switch (cmd) {
5021 /*
5022 * These ioctl calls:
5023 * - can be done by all.
5024 * - atomic and do not require locking.
5025 * - return a value
5026 */
5027 case SIOCGIFFLAGS:
5028 case SIOCGIFMETRIC:
5029 case SIOCGIFMTU:
5030 case SIOCGIFHWADDR:
5031 case SIOCGIFSLAVE:
5032 case SIOCGIFMAP:
5033 case SIOCGIFINDEX:
5034 case SIOCGIFTXQLEN:
5035 dev_load(net, ifr.ifr_name);
5036 rcu_read_lock();
5037 ret = dev_ifsioc_locked(net, &ifr, cmd);
5038 rcu_read_unlock();
5039 if (!ret) {
5040 if (colon)
5041 *colon = ':';
5042 if (copy_to_user(arg, &ifr,
5043 sizeof(struct ifreq)))
5044 ret = -EFAULT;
5045 }
5046 return ret;
5047
5048 case SIOCETHTOOL:
5049 dev_load(net, ifr.ifr_name);
5050 rtnl_lock();
5051 ret = dev_ethtool(net, &ifr);
5052 rtnl_unlock();
5053 if (!ret) {
5054 if (colon)
5055 *colon = ':';
5056 if (copy_to_user(arg, &ifr,
5057 sizeof(struct ifreq)))
5058 ret = -EFAULT;
5059 }
5060 return ret;
5061
5062 /*
5063 * These ioctl calls:
5064 * - require superuser power.
5065 * - require strict serialization.
5066 * - return a value
5067 */
5068 case SIOCGMIIPHY:
5069 case SIOCGMIIREG:
5070 case SIOCSIFNAME:
5071 if (!capable(CAP_NET_ADMIN))
5072 return -EPERM;
5073 dev_load(net, ifr.ifr_name);
5074 rtnl_lock();
5075 ret = dev_ifsioc(net, &ifr, cmd);
5076 rtnl_unlock();
5077 if (!ret) {
5078 if (colon)
5079 *colon = ':';
5080 if (copy_to_user(arg, &ifr,
5081 sizeof(struct ifreq)))
5082 ret = -EFAULT;
5083 }
5084 return ret;
5085
5086 /*
5087 * These ioctl calls:
5088 * - require superuser power.
5089 * - require strict serialization.
5090 * - do not return a value
5091 */
5092 case SIOCSIFFLAGS:
5093 case SIOCSIFMETRIC:
5094 case SIOCSIFMTU:
5095 case SIOCSIFMAP:
5096 case SIOCSIFHWADDR:
5097 case SIOCSIFSLAVE:
5098 case SIOCADDMULTI:
5099 case SIOCDELMULTI:
5100 case SIOCSIFHWBROADCAST:
5101 case SIOCSIFTXQLEN:
5102 case SIOCSMIIREG:
5103 case SIOCBONDENSLAVE:
5104 case SIOCBONDRELEASE:
5105 case SIOCBONDSETHWADDR:
5106 case SIOCBONDCHANGEACTIVE:
5107 case SIOCBRADDIF:
5108 case SIOCBRDELIF:
5109 case SIOCSHWTSTAMP:
5110 if (!capable(CAP_NET_ADMIN))
5111 return -EPERM;
5112 /* fall through */
5113 case SIOCBONDSLAVEINFOQUERY:
5114 case SIOCBONDINFOQUERY:
5115 dev_load(net, ifr.ifr_name);
5116 rtnl_lock();
5117 ret = dev_ifsioc(net, &ifr, cmd);
5118 rtnl_unlock();
5119 return ret;
5120
5121 case SIOCGIFMEM:
5122 /* Get the per device memory space. We can add this but
5123 * currently do not support it */
5124 case SIOCSIFMEM:
5125 /* Set the per device memory buffer space.
5126 * Not applicable in our case */
5127 case SIOCSIFLINK:
5128 return -ENOTTY;
5129
5130 /*
5131 * Unknown or private ioctl.
5132 */
5133 default:
5134 if (cmd == SIOCWANDEV ||
5135 (cmd >= SIOCDEVPRIVATE &&
5136 cmd <= SIOCDEVPRIVATE + 15)) {
5137 dev_load(net, ifr.ifr_name);
5138 rtnl_lock();
5139 ret = dev_ifsioc(net, &ifr, cmd);
5140 rtnl_unlock();
5141 if (!ret && copy_to_user(arg, &ifr,
5142 sizeof(struct ifreq)))
5143 ret = -EFAULT;
5144 return ret;
5145 }
5146 /* Take care of Wireless Extensions */
5147 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5148 return wext_handle_ioctl(net, &ifr, cmd, arg);
5149 return -ENOTTY;
5150 }
5151}
5152
5153
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	The candidate is kept in a function-local static counter that is
 *	reset to 1 whenever incrementing it yields a non-positive value.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5172
5173/* Delayed registration/unregisteration */
5174static LIST_HEAD(net_todo_list);
5175
5176static void net_set_todo(struct net_device *dev)
5177{
5178 list_add_tail(&dev->todo_list, &net_todo_list);
5179}
5180
5181static void rollback_registered_many(struct list_head *head)
5182{
5183 struct net_device *dev, *tmp;
5184
5185 BUG_ON(dev_boot_phase);
5186 ASSERT_RTNL();
5187
5188 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5189 /* Some devices call without registering
5190 * for initialization unwind. Remove those
5191 * devices and proceed with the remaining.
5192 */
5193 if (dev->reg_state == NETREG_UNINITIALIZED) {
5194 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5195 dev->name, dev);
5196
5197 WARN_ON(1);
5198 list_del(&dev->unreg_list);
5199 continue;
5200 }
5201 dev->dismantle = true;
5202 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5203 }
5204
5205 /* If device is running, close it first. */
5206 dev_close_many(head);
5207
5208 list_for_each_entry(dev, head, unreg_list) {
5209 /* And unlink it from device chain. */
5210 unlist_netdevice(dev);
5211
5212 dev->reg_state = NETREG_UNREGISTERING;
5213 }
5214
5215 synchronize_net();
5216
5217 list_for_each_entry(dev, head, unreg_list) {
5218 /* Shutdown queueing discipline. */
5219 dev_shutdown(dev);
5220
5221
5222 /* Notify protocols, that we are about to destroy
5223 this device. They should clean all the things.
5224 */
5225 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5226
5227 if (!dev->rtnl_link_ops ||
5228 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5229 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5230
5231 /*
5232 * Flush the unicast and multicast chains
5233 */
5234 dev_uc_flush(dev);
5235 dev_mc_flush(dev);
5236
5237 if (dev->netdev_ops->ndo_uninit)
5238 dev->netdev_ops->ndo_uninit(dev);
5239
5240 /* Notifier chain MUST detach us from master device. */
5241 WARN_ON(dev->master);
5242
5243 /* Remove entries from kobject tree */
5244 netdev_unregister_kobject(dev);
5245 }
5246
5247 /* Process any work delayed until the end of the batch */
5248 dev = list_first_entry(head, struct net_device, unreg_list);
5249 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5250
5251 synchronize_net();
5252
5253 list_for_each_entry(dev, head, unreg_list)
5254 dev_put(dev);
5255}
5256
5257static void rollback_registered(struct net_device *dev)
5258{
5259 LIST_HEAD(single);
5260
5261 list_add(&dev->unreg_list, &single);
5262 rollback_registered_many(&single);
5263 list_del(&single);
5264}
5265
5266static netdev_features_t netdev_fix_features(struct net_device *dev,
5267 netdev_features_t features)
5268{
5269 /* Fix illegal checksum combinations */
5270 if ((features & NETIF_F_HW_CSUM) &&
5271 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5272 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5273 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5274 }
5275
5276 /* Fix illegal SG+CSUM combinations. */
5277 if ((features & NETIF_F_SG) &&
5278 !(features & NETIF_F_ALL_CSUM)) {
5279 netdev_dbg(dev,
5280 "Dropping NETIF_F_SG since no checksum feature.\n");
5281 features &= ~NETIF_F_SG;
5282 }
5283
5284 /* TSO requires that SG is present as well. */
5285 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5286 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5287 features &= ~NETIF_F_ALL_TSO;
5288 }
5289
5290 /* TSO ECN requires that TSO is present as well. */
5291 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5292 features &= ~NETIF_F_TSO_ECN;
5293
5294 /* Software GSO depends on SG. */
5295 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5296 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5297 features &= ~NETIF_F_GSO;
5298 }
5299
5300 /* UFO needs SG and checksumming */
5301 if (features & NETIF_F_UFO) {
5302 /* maybe split UFO into V4 and V6? */
5303 if (!((features & NETIF_F_GEN_CSUM) ||
5304 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5305 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5306 netdev_dbg(dev,
5307 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5308 features &= ~NETIF_F_UFO;
5309 }
5310
5311 if (!(features & NETIF_F_SG)) {
5312 netdev_dbg(dev,
5313 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5314 features &= ~NETIF_F_UFO;
5315 }
5316 }
5317
5318 return features;
5319}
5320
5321int __netdev_update_features(struct net_device *dev)
5322{
5323 netdev_features_t features;
5324 int err = 0;
5325
5326 ASSERT_RTNL();
5327
5328 features = netdev_get_wanted_features(dev);
5329
5330 if (dev->netdev_ops->ndo_fix_features)
5331 features = dev->netdev_ops->ndo_fix_features(dev, features);
5332
5333 /* driver might be less strict about feature dependencies */
5334 features = netdev_fix_features(dev, features);
5335
5336 if (dev->features == features)
5337 return 0;
5338
5339 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5340 &dev->features, &features);
5341
5342 if (dev->netdev_ops->ndo_set_features)
5343 err = dev->netdev_ops->ndo_set_features(dev, features);
5344
5345 if (unlikely(err < 0)) {
5346 netdev_err(dev,
5347 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5348 err, &features, &dev->features);
5349 return -1;
5350 }
5351
5352 if (!err)
5353 dev->features = features;
5354
5355 return 1;
5356}
5357
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	The notification is sent for any non-zero result of
 *	__netdev_update_features().
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5372
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately unused: the notification is unconditional. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5388EXPORT_SYMBOL(netdev_change_features);
5389
5390/**
5391 * netif_stacked_transfer_operstate - transfer operstate
5392 * @rootdev: the root or lower level device to transfer state from
5393 * @dev: the device to transfer operstate to
5394 *
5395 * Transfer operational state from root to device. This is normally
5396 * called when a stacking relationship exists between the root
5397 * device and the device(a leaf device).
5398 */
5399void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5400 struct net_device *dev)
5401{
5402 if (rootdev->operstate == IF_OPER_DORMANT)
5403 netif_dormant_on(dev);
5404 else
5405 netif_dormant_off(dev);
5406
5407 if (netif_carrier_ok(rootdev)) {
5408 if (!netif_carrier_ok(dev))
5409 netif_carrier_on(dev);
5410 } else {
5411 if (netif_carrier_ok(dev))
5412 netif_carrier_off(dev);
5413 }
5414}
5415EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5416
#ifdef CONFIG_RPS
/*
 *	Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 *	store it in dev->_rx and point every entry back at @dev.
 *	Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int count = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(count < 1);

	queues = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues\n", count);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (idx = 0; idx < count; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
5437
5438static void netdev_init_one_queue(struct net_device *dev,
5439 struct netdev_queue *queue, void *_unused)
5440{
5441 /* Initialize queue lock */
5442 spin_lock_init(&queue->_xmit_lock);
5443 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5444 queue->xmit_lock_owner = -1;
5445 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5446 queue->dev = dev;
5447#ifdef CONFIG_BQL
5448 dql_init(&queue->dql, HZ);
5449#endif
5450}
5451
5452static int netif_alloc_netdev_queues(struct net_device *dev)
5453{
5454 unsigned int count = dev->num_tx_queues;
5455 struct netdev_queue *tx;
5456
5457 BUG_ON(count < 1);
5458
5459 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5460 if (!tx) {
5461 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5462 return -ENOMEM;
5463 }
5464 dev->_tx = tx;
5465
5466 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5467 spin_lock_init(&dev->tx_global_lock);
5468
5469 return 0;
5470}
5471
5472/**
5473 * register_netdevice - register a network device
5474 * @dev: device to register
5475 *
5476 * Take a completed network device structure and add it to the kernel
5477 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5478 * chain. 0 is returned on success. A negative errno code is returned
5479 * on a failure to set up the device, or if the name is a duplicate.
5480 *
5481 * Callers must hold the rtnl semaphore. You may want
5482 * register_netdev() instead of this.
5483 *
5484 * BUGS:
5485 * The locking appears insufficient to guarantee two parallel registers
5486 * will not get the same name.
5487 */
5488
5489int register_netdevice(struct net_device *dev)
5490{
5491 int ret;
5492 struct net *net = dev_net(dev);
5493
5494 BUG_ON(dev_boot_phase);
5495 ASSERT_RTNL();
5496
5497 might_sleep();
5498
5499 /* When net_device's are persistent, this will be fatal. */
5500 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5501 BUG_ON(!net);
5502
5503 spin_lock_init(&dev->addr_list_lock);
5504 netdev_set_addr_lockdep_class(dev);
5505
5506 dev->iflink = -1;
5507
5508 ret = dev_get_valid_name(dev, dev->name);
5509 if (ret < 0)
5510 goto out;
5511
5512 /* Init, if this function is available */
5513 if (dev->netdev_ops->ndo_init) {
5514 ret = dev->netdev_ops->ndo_init(dev);
5515 if (ret) {
5516 if (ret > 0)
5517 ret = -EIO;
5518 goto out;
5519 }
5520 }
5521
5522 dev->ifindex = dev_new_index(net);
5523 if (dev->iflink == -1)
5524 dev->iflink = dev->ifindex;
5525
5526 /* Transfer changeable features to wanted_features and enable
5527 * software offloads (GSO and GRO).
5528 */
5529 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5530 dev->features |= NETIF_F_SOFT_FEATURES;
5531 dev->wanted_features = dev->features & dev->hw_features;
5532
5533 /* Turn on no cache copy if HW is doing checksum */
5534 if (!(dev->flags & IFF_LOOPBACK)) {
5535 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5536 if (dev->features & NETIF_F_ALL_CSUM) {
5537 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5538 dev->features |= NETIF_F_NOCACHE_COPY;
5539 }
5540 }
5541
5542 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5543 */
5544 dev->vlan_features |= NETIF_F_HIGHDMA;
5545
5546 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5547 ret = notifier_to_errno(ret);
5548 if (ret)
5549 goto err_uninit;
5550
5551 ret = netdev_register_kobject(dev);
5552 if (ret)
5553 goto err_uninit;
5554 dev->reg_state = NETREG_REGISTERED;
5555
5556 __netdev_update_features(dev);
5557
5558 /*
5559 * Default initial state at registry is that the
5560 * device is present.
5561 */
5562
5563 set_bit(__LINK_STATE_PRESENT, &dev->state);
5564
5565 dev_init_scheduler(dev);
5566 dev_hold(dev);
5567 list_netdevice(dev);
5568 add_device_randomness(dev->dev_addr, dev->addr_len);
5569
5570 /* Notify protocols, that a new device appeared. */
5571 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5572 ret = notifier_to_errno(ret);
5573 if (ret) {
5574 rollback_registered(dev);
5575 dev->reg_state = NETREG_UNREGISTERED;
5576 }
5577 /*
5578 * Prevent userspace races by waiting until the network
5579 * device is fully setup before sending notifications.
5580 */
5581 if (!dev->rtnl_link_ops ||
5582 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5583 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5584
5585out:
5586 return ret;
5587
5588err_uninit:
5589 if (dev->netdev_ops->ndo_uninit)
5590 dev->netdev_ops->ndo_uninit(dev);
5591 goto out;
5592}
5593EXPORT_SYMBOL(register_netdevice);
5594
5595/**
5596 * init_dummy_netdev - init a dummy network device for NAPI
5597 * @dev: device to init
5598 *
5599 * This takes a network device structure and initialize the minimum
5600 * amount of fields so it can be used to schedule NAPI polls without
5601 * registering a full blown interface. This is to be used by drivers
5602 * that need to tie several hardware interfaces to a single NAPI
5603 * poll scheduler due to HW limitations.
5604 */
5605int init_dummy_netdev(struct net_device *dev)
5606{
5607 /* Clear everything. Note we don't initialize spinlocks
5608 * are they aren't supposed to be taken by any of the
5609 * NAPI code and this dummy netdev is supposed to be
5610 * only ever used for NAPI polls
5611 */
5612 memset(dev, 0, sizeof(struct net_device));
5613
5614 /* make sure we BUG if trying to hit standard
5615 * register/unregister code path
5616 */
5617 dev->reg_state = NETREG_DUMMY;
5618
5619 /* NAPI wants this */
5620 INIT_LIST_HEAD(&dev->napi_list);
5621
5622 /* a dummy interface is started by default */
5623 set_bit(__LINK_STATE_PRESENT, &dev->state);
5624 set_bit(__LINK_STATE_START, &dev->state);
5625
5626 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5627 * because users of this 'device' dont need to change
5628 * its refcount.
5629 */
5630
5631 return 0;
5632}
5633EXPORT_SYMBOL_GPL(init_dummy_netdev);
5634
5635
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
5659
5660int netdev_refcnt_read(const struct net_device *dev)
5661{
5662 int i, refcnt = 0;
5663
5664 for_each_possible_cpu(i)
5665 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5666 return refcnt;
5667}
5668EXPORT_SYMBOL(netdev_refcnt_read);
5669
5670/*
5671 * netdev_wait_allrefs - wait until all references are gone.
5672 *
5673 * This is called when unregistering network devices.
5674 *
5675 * Any protocol or device that holds a reference should register
5676 * for netdevice notification, and cleanup and put back the
5677 * reference if they receive an UNREGISTER event.
5678 * We can get stuck here if buggy protocols don't correctly
5679 * call dev_put.
5680 */
5681static void netdev_wait_allrefs(struct net_device *dev)
5682{
5683 unsigned long rebroadcast_time, warning_time;
5684 int refcnt;
5685
5686 linkwatch_forget_dev(dev);
5687
5688 rebroadcast_time = warning_time = jiffies;
5689 refcnt = netdev_refcnt_read(dev);
5690
5691 while (refcnt != 0) {
5692 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5693 rtnl_lock();
5694
5695 /* Rebroadcast unregister notification */
5696 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5697 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5698 * should have already handle it the first time */
5699
5700 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5701 &dev->state)) {
5702 /* We must not have linkwatch events
5703 * pending on unregister. If this
5704 * happens, we simply run the queue
5705 * unscheduled, resulting in a noop
5706 * for this device.
5707 */
5708 linkwatch_run_queue();
5709 }
5710
5711 __rtnl_unlock();
5712
5713 rebroadcast_time = jiffies;
5714 }
5715
5716 msleep(250);
5717
5718 refcnt = netdev_refcnt_read(dev);
5719
5720 if (time_after(jiffies, warning_time + 10 * HZ)) {
5721 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5722 dev->name, refcnt);
5723 warning_time = jiffies;
5724 }
5725 }
5726}
5727
5728/* The sequence is:
5729 *
5730 * rtnl_lock();
5731 * ...
5732 * register_netdevice(x1);
5733 * register_netdevice(x2);
5734 * ...
5735 * unregister_netdevice(y1);
5736 * unregister_netdevice(y2);
5737 * ...
5738 * rtnl_unlock();
5739 * free_netdev(y1);
5740 * free_netdev(y2);
5741 *
5742 * We are invoked by rtnl_unlock().
5743 * This allows us to deal with problems:
5744 * 1) We can delete sysfs objects which invoke hotplug
5745 * without deadlocking with linkwatch via keventd.
5746 * 2) Since we run with the RTNL semaphore not held, we can sleep
5747 * safely in order to wait for the netdev refcnt to drop to zero.
5748 *
5749 * We must not return until all unregister events added during
5750 * the interval the lock was held have been completed.
5751 */
5752void netdev_run_todo(void)
5753{
5754 struct list_head list;
5755
5756 /* Snapshot list, allow later requests */
5757 list_replace_init(&net_todo_list, &list);
5758
5759 __rtnl_unlock();
5760
5761 /* Wait for rcu callbacks to finish before attempting to drain
5762 * the device list. This usually avoids a 250ms wait.
5763 */
5764 if (!list_empty(&list))
5765 rcu_barrier();
5766
5767 while (!list_empty(&list)) {
5768 struct net_device *dev
5769 = list_first_entry(&list, struct net_device, todo_list);
5770 list_del(&dev->todo_list);
5771
5772 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5773 pr_err("network todo '%s' but state %d\n",
5774 dev->name, dev->reg_state);
5775 dump_stack();
5776 continue;
5777 }
5778
5779 dev->reg_state = NETREG_UNREGISTERED;
5780
5781 on_each_cpu(flush_backlog, dev, 1);
5782
5783 netdev_wait_allrefs(dev);
5784
5785 /* paranoia */
5786 BUG_ON(netdev_refcnt_read(dev));
5787 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5788 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5789 WARN_ON(dev->dn_ptr);
5790
5791 if (dev->destructor)
5792 dev->destructor(dev);
5793
5794 /* Free network device */
5795 kobject_put(&dev->dev.kobj);
5796 }
5797}
5798
5799/* Convert net_device_stats to rtnl_link_stats64. They have the same
5800 * fields in the same order, with only the type differing.
5801 */
5802void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5803 const struct net_device_stats *netdev_stats)
5804{
5805#if BITS_PER_LONG == 64
5806 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5807 memcpy(stats64, netdev_stats, sizeof(*stats64));
5808#else
5809 size_t i, n = sizeof(*stats64) / sizeof(u64);
5810 const unsigned long *src = (const unsigned long *)netdev_stats;
5811 u64 *dst = (u64 *)stats64;
5812
5813 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5814 sizeof(*stats64) / sizeof(u64));
5815 for (i = 0; i < n; i++)
5816 dst[i] = src[i];
5817#endif
5818}
5819EXPORT_SYMBOL(netdev_stats_to_stats64);
5820
5821/**
5822 * dev_get_stats - get network device statistics
5823 * @dev: device to get statistics from
5824 * @storage: place to store stats
5825 *
5826 * Get network statistics from device. Return @storage.
5827 * The device driver may provide its own method by setting
5828 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5829 * otherwise the internal statistics structure is used.
5830 */
5831struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5832 struct rtnl_link_stats64 *storage)
5833{
5834 const struct net_device_ops *ops = dev->netdev_ops;
5835
5836 if (ops->ndo_get_stats64) {
5837 memset(storage, 0, sizeof(*storage));
5838 ops->ndo_get_stats64(dev, storage);
5839 } else if (ops->ndo_get_stats) {
5840 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5841 } else {
5842 netdev_stats_to_stats64(storage, &dev->stats);
5843 }
5844 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5845 return storage;
5846}
5847EXPORT_SYMBOL(dev_get_stats);
5848
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, attached to noop_qdisc and published through
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value of dev_ingress_queue()
 * is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5866
5867/**
5868 * alloc_netdev_mqs - allocate network device
5869 * @sizeof_priv: size of private data to allocate space for
5870 * @name: device name format string
5871 * @setup: callback to initialize device
5872 * @txqs: the number of TX subqueues to allocate
5873 * @rxqs: the number of RX subqueues to allocate
5874 *
5875 * Allocates a struct net_device with private data area for driver use
5876 * and performs basic initialization. Also allocates subquue structs
5877 * for each queue on the device.
5878 */
5879struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5880 void (*setup)(struct net_device *),
5881 unsigned int txqs, unsigned int rxqs)
5882{
5883 struct net_device *dev;
5884 size_t alloc_size;
5885 struct net_device *p;
5886
5887 BUG_ON(strlen(name) >= sizeof(dev->name));
5888
5889 if (txqs < 1) {
5890 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5891 return NULL;
5892 }
5893
5894#ifdef CONFIG_RPS
5895 if (rxqs < 1) {
5896 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5897 return NULL;
5898 }
5899#endif
5900
5901 alloc_size = sizeof(struct net_device);
5902 if (sizeof_priv) {
5903 /* ensure 32-byte alignment of private area */
5904 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5905 alloc_size += sizeof_priv;
5906 }
5907 /* ensure 32-byte alignment of whole construct */
5908 alloc_size += NETDEV_ALIGN - 1;
5909
5910 p = kzalloc(alloc_size, GFP_KERNEL);
5911 if (!p) {
5912 pr_err("alloc_netdev: Unable to allocate device\n");
5913 return NULL;
5914 }
5915
5916 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5917 dev->padded = (char *)dev - (char *)p;
5918
5919 dev->pcpu_refcnt = alloc_percpu(int);
5920 if (!dev->pcpu_refcnt)
5921 goto free_p;
5922
5923 if (dev_addr_init(dev))
5924 goto free_pcpu;
5925
5926 dev_mc_init(dev);
5927 dev_uc_init(dev);
5928
5929 dev_net_set(dev, &init_net);
5930
5931 dev->gso_max_size = GSO_MAX_SIZE;
5932 dev->gso_max_segs = GSO_MAX_SEGS;
5933
5934 INIT_LIST_HEAD(&dev->napi_list);
5935 INIT_LIST_HEAD(&dev->unreg_list);
5936 INIT_LIST_HEAD(&dev->link_watch_list);
5937 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5938 setup(dev);
5939
5940 dev->num_tx_queues = txqs;
5941 dev->real_num_tx_queues = txqs;
5942 if (netif_alloc_netdev_queues(dev))
5943 goto free_all;
5944
5945#ifdef CONFIG_RPS
5946 dev->num_rx_queues = rxqs;
5947 dev->real_num_rx_queues = rxqs;
5948 if (netif_alloc_rx_queues(dev))
5949 goto free_all;
5950#endif
5951
5952 strcpy(dev->name, name);
5953 dev->group = INIT_NETDEV_GROUP;
5954 return dev;
5955
5956free_all:
5957 free_netdev(dev);
5958 return NULL;
5959
5960free_pcpu:
5961 free_percpu(dev->pcpu_refcnt);
5962 kfree(dev->_tx);
5963#ifdef CONFIG_RPS
5964 kfree(dev->_rx);
5965#endif
5966
5967free_p:
5968 kfree(p);
5969 return NULL;
5970}
5971EXPORT_SYMBOL(alloc_netdev_mqs);
5972
5973/**
5974 * free_netdev - free network device
5975 * @dev: device
5976 *
5977 * This function does the last stage of destroying an allocated device
5978 * interface. The reference to the device object is released.
5979 * If this is the last reference then it will be freed.
5980 */
5981void free_netdev(struct net_device *dev)
5982{
5983 struct napi_struct *p, *n;
5984
5985 release_net(dev_net(dev));
5986
5987 kfree(dev->_tx);
5988#ifdef CONFIG_RPS
5989 kfree(dev->_rx);
5990#endif
5991
5992 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5993
5994 /* Flush device addresses */
5995 dev_addr_flush(dev);
5996
5997 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5998 netif_napi_del(p);
5999
6000 free_percpu(dev->pcpu_refcnt);
6001 dev->pcpu_refcnt = NULL;
6002
6003 /* Compatibility with error handling in drivers */
6004 if (dev->reg_state == NETREG_UNINITIALIZED) {
6005 kfree((char *)dev - dev->padded);
6006 return;
6007 }
6008
6009 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6010 dev->reg_state = NETREG_RELEASED;
6011
6012 /* will free via device release */
6013 put_device(&dev->dev);
6014}
6015EXPORT_SYMBOL(free_netdev);
6016
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep.  The expedited RCU grace period is used when the rtnl
 *	lock is held, the regular one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6032
6033/**
6034 * unregister_netdevice_queue - remove device from the kernel
6035 * @dev: device
6036 * @head: list
6037 *
6038 * This function shuts down a device interface and removes it
6039 * from the kernel tables.
6040 * If head not NULL, device is queued to be unregistered later.
6041 *
6042 * Callers must hold the rtnl semaphore. You may want
6043 * unregister_netdev() instead of this.
6044 */
6045
6046void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6047{
6048 ASSERT_RTNL();
6049
6050 if (head) {
6051 list_move_tail(&dev->unreg_list, head);
6052 } else {
6053 rollback_registered(dev);
6054 /* Finish processing unregister after unlock */
6055 net_set_todo(dev);
6056 }
6057}
6058EXPORT_SYMBOL(unregister_netdevice_queue);
6059
6060/**
6061 * unregister_netdevice_many - unregister many devices
6062 * @head: list of devices
6063 */
6064void unregister_netdevice_many(struct list_head *head)
6065{
6066 struct net_device *dev;
6067
6068 if (!list_empty(head)) {
6069 rollback_registered_many(head);
6070 list_for_each_entry(dev, head, unreg_list)
6071 net_set_todo(dev);
6072 }
6073}
6074EXPORT_SYMBOL(unregister_netdevice_many);
6075
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This merely wraps unregister_netdevice() in the rtnl semaphore.
 *	In general this is the function to use rather than
 *	unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	/* unregister_netdevice() runs with the rtnl semaphore held. */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6094
6095/**
6096 * dev_change_net_namespace - move device to different nethost namespace
6097 * @dev: device
6098 * @net: network namespace
6099 * @pat: If not NULL name pattern to try if the current device name
6100 * is already taken in the destination network namespace.
6101 *
6102 * This function shuts down a device interface and moves it
6103 * to a new network namespace. On success 0 is returned, on
6104 * a failure a netagive errno code is returned.
6105 *
6106 * Callers must hold the rtnl semaphore.
6107 */
6108
6109int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6110{
6111 int err;
6112
6113 ASSERT_RTNL();
6114
6115 /* Don't allow namespace local devices to be moved. */
6116 err = -EINVAL;
6117 if (dev->features & NETIF_F_NETNS_LOCAL)
6118 goto out;
6119
6120 /* Ensure the device has been registrered */
6121 err = -EINVAL;
6122 if (dev->reg_state != NETREG_REGISTERED)
6123 goto out;
6124
6125 /* Get out if there is nothing todo */
6126 err = 0;
6127 if (net_eq(dev_net(dev), net))
6128 goto out;
6129
6130 /* Pick the destination device name, and ensure
6131 * we can use it in the destination network namespace.
6132 */
6133 err = -EEXIST;
6134 if (__dev_get_by_name(net, dev->name)) {
6135 /* We get here if we can't use the current device name */
6136 if (!pat)
6137 goto out;
6138 if (dev_get_valid_name(dev, pat) < 0)
6139 goto out;
6140 }
6141
6142 /*
6143 * And now a mini version of register_netdevice unregister_netdevice.
6144 */
6145
6146 /* If device is running close it first. */
6147 dev_close(dev);
6148
6149 /* And unlink it from device chain */
6150 err = -ENODEV;
6151 unlist_netdevice(dev);
6152
6153 synchronize_net();
6154
6155 /* Shutdown queueing discipline. */
6156 dev_shutdown(dev);
6157
6158 /* Notify protocols, that we are about to destroy
6159 this device. They should clean all the things.
6160
6161 Note that dev->reg_state stays at NETREG_REGISTERED.
6162 This is wanted because this way 8021q and macvlan know
6163 the device is just moving and can keep their slaves up.
6164 */
6165 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6166 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6167 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6168
6169 /*
6170 * Flush the unicast and multicast chains
6171 */
6172 dev_uc_flush(dev);
6173 dev_mc_flush(dev);
6174
6175 /* Actually switch the network namespace */
6176 dev_net_set(dev, net);
6177
6178 /* If there is an ifindex conflict assign a new one */
6179 if (__dev_get_by_index(net, dev->ifindex)) {
6180 int iflink = (dev->iflink == dev->ifindex);
6181 dev->ifindex = dev_new_index(net);
6182 if (iflink)
6183 dev->iflink = dev->ifindex;
6184 }
6185
6186 /* Fixup kobjects */
6187 err = device_rename(&dev->dev, dev->name);
6188 WARN_ON(err);
6189
6190 /* Add the device back in the hashes */
6191 list_netdevice(dev);
6192
6193 /* Notify protocols, that a new device appeared. */
6194 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6195
6196 /*
6197 * Prevent userspace races by waiting until the network
6198 * device is fully setup before sending notifications.
6199 */
6200 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6201
6202 synchronize_net();
6203 err = 0;
6204out:
6205 return err;
6206}
6207EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6208
6209static int dev_cpu_callback(struct notifier_block *nfb,
6210 unsigned long action,
6211 void *ocpu)
6212{
6213 struct sk_buff **list_skb;
6214 struct sk_buff *skb;
6215 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6216 struct softnet_data *sd, *oldsd;
6217
6218 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6219 return NOTIFY_OK;
6220
6221 local_irq_disable();
6222 cpu = smp_processor_id();
6223 sd = &per_cpu(softnet_data, cpu);
6224 oldsd = &per_cpu(softnet_data, oldcpu);
6225
6226 /* Find end of our completion_queue. */
6227 list_skb = &sd->completion_queue;
6228 while (*list_skb)
6229 list_skb = &(*list_skb)->next;
6230 /* Append completion queue from offline CPU. */
6231 *list_skb = oldsd->completion_queue;
6232 oldsd->completion_queue = NULL;
6233
6234 /* Append output queue from offline CPU. */
6235 if (oldsd->output_queue) {
6236 *sd->output_queue_tailp = oldsd->output_queue;
6237 sd->output_queue_tailp = oldsd->output_queue_tailp;
6238 oldsd->output_queue = NULL;
6239 oldsd->output_queue_tailp = &oldsd->output_queue;
6240 }
6241 /* Append NAPI poll list from offline CPU. */
6242 if (!list_empty(&oldsd->poll_list)) {
6243 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6244 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6245 }
6246
6247 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6248 local_irq_enable();
6249
6250 /* Process offline CPU's input_pkt_queue */
6251 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6252 netif_rx(skb);
6253 input_queue_head_incr(oldsd);
6254 }
6255 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6256 netif_rx(skb);
6257 input_queue_head_incr(oldsd);
6258 }
6259
6260 return NOTIFY_OK;
6261}
6262
6263
6264/**
6265 * netdev_increment_features - increment feature set by one
6266 * @all: current feature set
6267 * @one: new feature set
6268 * @mask: mask feature set
6269 *
6270 * Computes a new feature set after adding a device with feature set
6271 * @one to the master device with current feature set @all. Will not
6272 * enable anything that is off in @mask. Returns the new feature set.
6273 */
6274netdev_features_t netdev_increment_features(netdev_features_t all,
6275 netdev_features_t one, netdev_features_t mask)
6276{
6277 if (mask & NETIF_F_GEN_CSUM)
6278 mask |= NETIF_F_ALL_CSUM;
6279 mask |= NETIF_F_VLAN_CHALLENGED;
6280
6281 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6282 all &= one | ~NETIF_F_ALL_FOR_ALL;
6283
6284 /* If one device supports hw checksumming, set for all. */
6285 if (all & NETIF_F_GEN_CSUM)
6286 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6287
6288 return all;
6289}
6290EXPORT_SYMBOL(netdev_increment_features);
6291
6292static struct hlist_head *netdev_create_hash(void)
6293{
6294 int i;
6295 struct hlist_head *hash;
6296
6297 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6298 if (hash != NULL)
6299 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6300 INIT_HLIST_HEAD(&hash[i]);
6301
6302 return hash;
6303}
6304
6305/* Initialize per network namespace state */
6306static int __net_init netdev_init(struct net *net)
6307{
6308 if (net != &init_net)
6309 INIT_LIST_HEAD(&net->dev_base_head);
6310
6311 net->dev_name_head = netdev_create_hash();
6312 if (net->dev_name_head == NULL)
6313 goto err_name;
6314
6315 net->dev_index_head = netdev_create_hash();
6316 if (net->dev_index_head == NULL)
6317 goto err_idx;
6318
6319 return 0;
6320
6321err_idx:
6322 kfree(net->dev_name_head);
6323err_name:
6324 return -ENOMEM;
6325}
6326
6327/**
6328 * netdev_drivername - network driver for the device
6329 * @dev: network device
6330 *
6331 * Determine network driver for device.
6332 */
6333const char *netdev_drivername(const struct net_device *dev)
6334{
6335 const struct device_driver *driver;
6336 const struct device *parent;
6337 const char *empty = "";
6338
6339 parent = dev->dev.parent;
6340 if (!parent)
6341 return empty;
6342
6343 driver = parent->driver;
6344 if (driver && driver->name)
6345 return driver->name;
6346 return empty;
6347}
6348
6349int __netdev_printk(const char *level, const struct net_device *dev,
6350 struct va_format *vaf)
6351{
6352 int r;
6353
6354 if (dev && dev->dev.parent)
6355 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6356 netdev_name(dev), vaf);
6357 else if (dev)
6358 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6359 else
6360 r = printk("%s(NULL net_device): %pV", level, vaf);
6361
6362 return r;
6363}
6364EXPORT_SYMBOL(__netdev_printk);
6365
6366int netdev_printk(const char *level, const struct net_device *dev,
6367 const char *format, ...)
6368{
6369 struct va_format vaf;
6370 va_list args;
6371 int r;
6372
6373 va_start(args, format);
6374
6375 vaf.fmt = format;
6376 vaf.va = &args;
6377
6378 r = __netdev_printk(level, dev, &vaf);
6379 va_end(args);
6380
6381 return r;
6382}
6383EXPORT_SYMBOL(netdev_printk);
6384
6385#define define_netdev_printk_level(func, level) \
6386int func(const struct net_device *dev, const char *fmt, ...) \
6387{ \
6388 int r; \
6389 struct va_format vaf; \
6390 va_list args; \
6391 \
6392 va_start(args, fmt); \
6393 \
6394 vaf.fmt = fmt; \
6395 vaf.va = &args; \
6396 \
6397 r = __netdev_printk(level, dev, &vaf); \
6398 va_end(args); \
6399 \
6400 return r; \
6401} \
6402EXPORT_SYMBOL(func);
6403
6404define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6405define_netdev_printk_level(netdev_alert, KERN_ALERT);
6406define_netdev_printk_level(netdev_crit, KERN_CRIT);
6407define_netdev_printk_level(netdev_err, KERN_ERR);
6408define_netdev_printk_level(netdev_warn, KERN_WARNING);
6409define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6410define_netdev_printk_level(netdev_info, KERN_INFO);
6411
6412static void __net_exit netdev_exit(struct net *net)
6413{
6414 kfree(net->dev_name_head);
6415 kfree(net->dev_index_head);
6416}
6417
6418static struct pernet_operations __net_initdata netdev_net_ops = {
6419 .init = netdev_init,
6420 .exit = netdev_exit,
6421};
6422
6423static void __net_exit default_device_exit(struct net *net)
6424{
6425 struct net_device *dev, *aux;
6426 /*
6427 * Push all migratable network devices back to the
6428 * initial network namespace
6429 */
6430 rtnl_lock();
6431 for_each_netdev_safe(net, dev, aux) {
6432 int err;
6433 char fb_name[IFNAMSIZ];
6434
6435 /* Ignore unmoveable devices (i.e. loopback) */
6436 if (dev->features & NETIF_F_NETNS_LOCAL)
6437 continue;
6438
6439 /* Leave virtual devices for the generic cleanup */
6440 if (dev->rtnl_link_ops)
6441 continue;
6442
6443 /* Push remaining network devices to init_net */
6444 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6445 err = dev_change_net_namespace(dev, &init_net, fb_name);
6446 if (err) {
6447 pr_emerg("%s: failed to move %s to init_net: %d\n",
6448 __func__, dev->name, err);
6449 BUG();
6450 }
6451 }
6452 rtnl_unlock();
6453}
6454
6455static void __net_exit default_device_exit_batch(struct list_head *net_list)
6456{
6457 /* At exit all network devices most be removed from a network
6458 * namespace. Do this in the reverse order of registration.
6459 * Do this across as many network namespaces as possible to
6460 * improve batching efficiency.
6461 */
6462 struct net_device *dev;
6463 struct net *net;
6464 LIST_HEAD(dev_kill_list);
6465
6466 rtnl_lock();
6467 list_for_each_entry(net, net_list, exit_list) {
6468 for_each_netdev_reverse(net, dev) {
6469 if (dev->rtnl_link_ops)
6470 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6471 else
6472 unregister_netdevice_queue(dev, &dev_kill_list);
6473 }
6474 }
6475 unregister_netdevice_many(&dev_kill_list);
6476 list_del(&dev_kill_list);
6477 rtnl_unlock();
6478}
6479
6480static struct pernet_operations __net_initdata default_device_ops = {
6481 .exit = default_device_exit,
6482 .exit_batch = default_device_exit_batch,
6483};
6484
6485/*
6486 * Initialize the DEV module. At boot time this walks the device list and
6487 * unhooks any devices that fail to initialise (normally hardware not
6488 * present) and leaves us with a valid list of present and active devices.
6489 *
6490 */
6491
6492/*
6493 * This is called single threaded during boot, so no need
6494 * to take the rtnl semaphore.
6495 */
6496static int __init net_dev_init(void)
6497{
6498 int i, rc = -ENOMEM;
6499
6500 BUG_ON(!dev_boot_phase);
6501
6502 if (dev_proc_init())
6503 goto out;
6504
6505 if (netdev_kobject_init())
6506 goto out;
6507
6508 INIT_LIST_HEAD(&ptype_all);
6509 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6510 INIT_LIST_HEAD(&ptype_base[i]);
6511
6512 if (register_pernet_subsys(&netdev_net_ops))
6513 goto out;
6514
6515 /*
6516 * Initialise the packet receive queues.
6517 */
6518
6519 for_each_possible_cpu(i) {
6520 struct softnet_data *sd = &per_cpu(softnet_data, i);
6521
6522 memset(sd, 0, sizeof(*sd));
6523 skb_queue_head_init(&sd->input_pkt_queue);
6524 skb_queue_head_init(&sd->process_queue);
6525 sd->completion_queue = NULL;
6526 INIT_LIST_HEAD(&sd->poll_list);
6527 sd->output_queue = NULL;
6528 sd->output_queue_tailp = &sd->output_queue;
6529#ifdef CONFIG_RPS
6530 sd->csd.func = rps_trigger_softirq;
6531 sd->csd.info = sd;
6532 sd->csd.flags = 0;
6533 sd->cpu = i;
6534#endif
6535
6536 sd->backlog.poll = process_backlog;
6537 sd->backlog.weight = weight_p;
6538 sd->backlog.gro_list = NULL;
6539 sd->backlog.gro_count = 0;
6540 }
6541
6542 dev_boot_phase = 0;
6543
6544 /* The loopback device is special if any other network devices
6545 * is present in a network namespace the loopback device must
6546 * be present. Since we now dynamically allocate and free the
6547 * loopback device ensure this invariant is maintained by
6548 * keeping the loopback device as the first device on the
6549 * list of network devices. Ensuring the loopback devices
6550 * is the first device that appears and the last network device
6551 * that disappears.
6552 */
6553 if (register_pernet_device(&loopback_net_ops))
6554 goto out;
6555
6556 if (register_pernet_device(&default_device_ops))
6557 goto out;
6558
6559 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6560 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6561
6562 hotcpu_notifier(dev_cpu_callback, 0);
6563 dst_init();
6564 dev_mcast_init();
6565 rc = 0;
6566out:
6567 return rc;
6568}
6569
6570subsys_initcall(net_dev_init);
6571
6572static int __init initialize_hashrnd(void)
6573{
6574 get_random_bytes(&hashrnd, sizeof(hashrnd));
6575 return 0;
6576}
6577
6578late_initcall_sync(initialize_hashrnd);
6579
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * NET3 Protocol independent device support routines.
4 *
5 * Derived from the non IP parts of dev.c 1.0.19
6 * Authors: Ross Biro
7 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8 * Mark Evans, <evansmp@uhura.aston.ac.uk>
9 *
10 * Additional Authors:
11 * Florian la Roche <rzsfl@rz.uni-sb.de>
12 * Alan Cox <gw4pts@gw4pts.ampr.org>
13 * David Hinds <dahinds@users.sourceforge.net>
14 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15 * Adam Sulmicki <adam@cfar.umd.edu>
16 * Pekka Riikonen <priikone@poesidon.pspt.fi>
17 *
18 * Changes:
19 * D.J. Barrow : Fixed bug where dev->refcnt gets set
20 * to 2 if register_netdev gets called
21 * before net_dev_init & also removed a
22 * few lines of code in the process.
23 * Alan Cox : device private ioctl copies fields back.
24 * Alan Cox : Transmit queue code does relevant
25 * stunts to keep the queue safe.
26 * Alan Cox : Fixed double lock.
27 * Alan Cox : Fixed promisc NULL pointer trap
28 * ???????? : Support the full private ioctl range
29 * Alan Cox : Moved ioctl permission check into
30 * drivers
31 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
32 * Alan Cox : 100 backlog just doesn't cut it when
33 * you start doing multicast video 8)
34 * Alan Cox : Rewrote net_bh and list manager.
35 * Alan Cox : Fix ETH_P_ALL echoback lengths.
36 * Alan Cox : Took out transmit every packet pass
37 * Saved a few bytes in the ioctl handler
38 * Alan Cox : Network driver sets packet type before
39 * calling netif_rx. Saves a function
40 * call a packet.
41 * Alan Cox : Hashed net_bh()
42 * Richard Kooijman: Timestamp fixes.
43 * Alan Cox : Wrong field in SIOCGIFDSTADDR
44 * Alan Cox : Device lock protection.
45 * Alan Cox : Fixed nasty side effect of device close
46 * changes.
47 * Rudi Cilibrasi : Pass the right thing to
48 * set_mac_address()
49 * Dave Miller : 32bit quantity for the device lock to
50 * make it work out on a Sparc.
51 * Bjorn Ekwall : Added KERNELD hack.
52 * Alan Cox : Cleaned up the backlog initialise.
53 * Craig Metz : SIOCGIFCONF fix if space for under
54 * 1 device.
55 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
56 * is no device open function.
57 * Andi Kleen : Fix error reporting for SIOCGIFCONF
58 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
59 * Cyrus Durgin : Cleaned for KMOD
60 * Adam Sulmicki : Bug Fix : Network Device Unload
61 * A network device unload needs to purge
62 * the backlog queue.
63 * Paul Rusty Russell : SIOCSIFNAME
64 * Pekka Riikonen : Netdev boot-time settings code
65 * Andrew Morton : Make unregister_netdevice wait
66 * indefinitely on dev->refcnt
67 * J Hadi Salim : - Backlog queue sampling
68 * - netif_rx() feedback
69 */
70
71#include <linux/uaccess.h>
72#include <linux/bitops.h>
73#include <linux/capability.h>
74#include <linux/cpu.h>
75#include <linux/types.h>
76#include <linux/kernel.h>
77#include <linux/hash.h>
78#include <linux/slab.h>
79#include <linux/sched.h>
80#include <linux/sched/mm.h>
81#include <linux/mutex.h>
82#include <linux/rwsem.h>
83#include <linux/string.h>
84#include <linux/mm.h>
85#include <linux/socket.h>
86#include <linux/sockios.h>
87#include <linux/errno.h>
88#include <linux/interrupt.h>
89#include <linux/if_ether.h>
90#include <linux/netdevice.h>
91#include <linux/etherdevice.h>
92#include <linux/ethtool.h>
93#include <linux/skbuff.h>
94#include <linux/kthread.h>
95#include <linux/bpf.h>
96#include <linux/bpf_trace.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <net/busy_poll.h>
100#include <linux/rtnetlink.h>
101#include <linux/stat.h>
102#include <net/dsa.h>
103#include <net/dst.h>
104#include <net/dst_metadata.h>
105#include <net/gro.h>
106#include <net/pkt_sched.h>
107#include <net/pkt_cls.h>
108#include <net/checksum.h>
109#include <net/xfrm.h>
110#include <linux/highmem.h>
111#include <linux/init.h>
112#include <linux/module.h>
113#include <linux/netpoll.h>
114#include <linux/rcupdate.h>
115#include <linux/delay.h>
116#include <net/iw_handler.h>
117#include <asm/current.h>
118#include <linux/audit.h>
119#include <linux/dmaengine.h>
120#include <linux/err.h>
121#include <linux/ctype.h>
122#include <linux/if_arp.h>
123#include <linux/if_vlan.h>
124#include <linux/ip.h>
125#include <net/ip.h>
126#include <net/mpls.h>
127#include <linux/ipv6.h>
128#include <linux/in.h>
129#include <linux/jhash.h>
130#include <linux/random.h>
131#include <trace/events/napi.h>
132#include <trace/events/net.h>
133#include <trace/events/skb.h>
134#include <trace/events/qdisc.h>
135#include <linux/inetdevice.h>
136#include <linux/cpu_rmap.h>
137#include <linux/static_key.h>
138#include <linux/hashtable.h>
139#include <linux/vmalloc.h>
140#include <linux/if_macvlan.h>
141#include <linux/errqueue.h>
142#include <linux/hrtimer.h>
143#include <linux/netfilter_netdev.h>
144#include <linux/crash_dump.h>
145#include <linux/sctp.h>
146#include <net/udp_tunnel.h>
147#include <linux/net_namespace.h>
148#include <linux/indirect_call_wrapper.h>
149#include <net/devlink.h>
150#include <linux/pm_runtime.h>
151#include <linux/prandom.h>
152#include <linux/once_lite.h>
153
154#include "dev.h"
155#include "net-sysfs.h"
156
157
158static DEFINE_SPINLOCK(ptype_lock);
159struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
160struct list_head ptype_all __read_mostly; /* Taps */
161
162static int netif_rx_internal(struct sk_buff *skb);
163static int call_netdevice_notifiers_info(unsigned long val,
164 struct netdev_notifier_info *info);
165static int call_netdevice_notifiers_extack(unsigned long val,
166 struct net_device *dev,
167 struct netlink_ext_ack *extack);
168static struct napi_struct *napi_by_id(unsigned int napi_id);
169
170/*
171 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
172 * semaphore.
173 *
174 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
175 *
176 * Writers must hold the rtnl semaphore while they loop through the
177 * dev_base_head list, and hold dev_base_lock for writing when they do the
178 * actual updates. This allows pure readers to access the list even
179 * while a writer is preparing to update it.
180 *
181 * To put it another way, dev_base_lock is held for writing only to
182 * protect against pure readers; the rtnl semaphore provides the
183 * protection against other writers.
184 *
185 * See, for example usages, register_netdevice() and
186 * unregister_netdevice(), which must be called with the rtnl
187 * semaphore held.
188 */
189DEFINE_RWLOCK(dev_base_lock);
190EXPORT_SYMBOL(dev_base_lock);
191
192static DEFINE_MUTEX(ifalias_mutex);
193
194/* protects napi_hash addition/deletion and napi_gen_id */
195static DEFINE_SPINLOCK(napi_hash_lock);
196
197static unsigned int napi_gen_id = NR_CPUS;
198static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
199
200static DECLARE_RWSEM(devnet_rename_sem);
201
202static inline void dev_base_seq_inc(struct net *net)
203{
204 while (++net->dev_base_seq == 0)
205 ;
206}
207
208static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
209{
210 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
211
212 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
213}
214
215static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
216{
217 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
218}
219
220static inline void rps_lock_irqsave(struct softnet_data *sd,
221 unsigned long *flags)
222{
223 if (IS_ENABLED(CONFIG_RPS))
224 spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
225 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
226 local_irq_save(*flags);
227}
228
229static inline void rps_lock_irq_disable(struct softnet_data *sd)
230{
231 if (IS_ENABLED(CONFIG_RPS))
232 spin_lock_irq(&sd->input_pkt_queue.lock);
233 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
234 local_irq_disable();
235}
236
237static inline void rps_unlock_irq_restore(struct softnet_data *sd,
238 unsigned long *flags)
239{
240 if (IS_ENABLED(CONFIG_RPS))
241 spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
242 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
243 local_irq_restore(*flags);
244}
245
246static inline void rps_unlock_irq_enable(struct softnet_data *sd)
247{
248 if (IS_ENABLED(CONFIG_RPS))
249 spin_unlock_irq(&sd->input_pkt_queue.lock);
250 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
251 local_irq_enable();
252}
253
254static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
255 const char *name)
256{
257 struct netdev_name_node *name_node;
258
259 name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
260 if (!name_node)
261 return NULL;
262 INIT_HLIST_NODE(&name_node->hlist);
263 name_node->dev = dev;
264 name_node->name = name;
265 return name_node;
266}
267
268static struct netdev_name_node *
269netdev_name_node_head_alloc(struct net_device *dev)
270{
271 struct netdev_name_node *name_node;
272
273 name_node = netdev_name_node_alloc(dev, dev->name);
274 if (!name_node)
275 return NULL;
276 INIT_LIST_HEAD(&name_node->list);
277 return name_node;
278}
279
/* Release the node itself; the string behind ->name is not touched here. */
static void netdev_name_node_free(struct netdev_name_node *name_node)
{
	kfree(name_node);
}
284
285static void netdev_name_node_add(struct net *net,
286 struct netdev_name_node *name_node)
287{
288 hlist_add_head_rcu(&name_node->hlist,
289 dev_name_hash(net, name_node->name));
290}
291
292static void netdev_name_node_del(struct netdev_name_node *name_node)
293{
294 hlist_del_rcu(&name_node->hlist);
295}
296
297static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
298 const char *name)
299{
300 struct hlist_head *head = dev_name_hash(net, name);
301 struct netdev_name_node *name_node;
302
303 hlist_for_each_entry(name_node, head, hlist)
304 if (!strcmp(name_node->name, name))
305 return name_node;
306 return NULL;
307}
308
309static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
310 const char *name)
311{
312 struct hlist_head *head = dev_name_hash(net, name);
313 struct netdev_name_node *name_node;
314
315 hlist_for_each_entry_rcu(name_node, head, hlist)
316 if (!strcmp(name_node->name, name))
317 return name_node;
318 return NULL;
319}
320
321bool netdev_name_in_use(struct net *net, const char *name)
322{
323 return netdev_name_node_lookup(net, name);
324}
325EXPORT_SYMBOL(netdev_name_in_use);
326
327int netdev_name_node_alt_create(struct net_device *dev, const char *name)
328{
329 struct netdev_name_node *name_node;
330 struct net *net = dev_net(dev);
331
332 name_node = netdev_name_node_lookup(net, name);
333 if (name_node)
334 return -EEXIST;
335 name_node = netdev_name_node_alloc(dev, name);
336 if (!name_node)
337 return -ENOMEM;
338 netdev_name_node_add(net, name_node);
339 /* The node that holds dev->name acts as a head of per-device list. */
340 list_add_tail(&name_node->list, &dev->name_node->list);
341
342 return 0;
343}
344
345static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
346{
347 list_del(&name_node->list);
348 netdev_name_node_del(name_node);
349 kfree(name_node->name);
350 netdev_name_node_free(name_node);
351}
352
353int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
354{
355 struct netdev_name_node *name_node;
356 struct net *net = dev_net(dev);
357
358 name_node = netdev_name_node_lookup(net, name);
359 if (!name_node)
360 return -ENOENT;
361 /* lookup might have found our primary name or a name belonging
362 * to another device.
363 */
364 if (name_node == dev->name_node || name_node->dev != dev)
365 return -EINVAL;
366
367 __netdev_name_node_alt_destroy(name_node);
368
369 return 0;
370}
371
372static void netdev_name_node_alt_flush(struct net_device *dev)
373{
374 struct netdev_name_node *name_node, *tmp;
375
376 list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
377 __netdev_name_node_alt_destroy(name_node);
378}
379
380/* Device list insertion */
381static void list_netdevice(struct net_device *dev)
382{
383 struct net *net = dev_net(dev);
384
385 ASSERT_RTNL();
386
387 write_lock(&dev_base_lock);
388 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
389 netdev_name_node_add(net, dev->name_node);
390 hlist_add_head_rcu(&dev->index_hlist,
391 dev_index_hash(net, dev->ifindex));
392 write_unlock(&dev_base_lock);
393
394 dev_base_seq_inc(net);
395}
396
397/* Device list removal
398 * caller must respect a RCU grace period before freeing/reusing dev
399 */
400static void unlist_netdevice(struct net_device *dev, bool lock)
401{
402 ASSERT_RTNL();
403
404 /* Unlink dev from the device chain */
405 if (lock)
406 write_lock(&dev_base_lock);
407 list_del_rcu(&dev->dev_list);
408 netdev_name_node_del(dev->name_node);
409 hlist_del_rcu(&dev->index_hlist);
410 if (lock)
411 write_unlock(&dev_base_lock);
412
413 dev_base_seq_inc(dev_net(dev));
414}
415
416/*
417 * Our notifier list
418 */
419
420static RAW_NOTIFIER_HEAD(netdev_chain);
421
422/*
423 * Device drivers call our routines to queue packets here. We empty the
424 * queue in the local softnet handler.
425 */
426
427DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
428EXPORT_PER_CPU_SYMBOL(softnet_data);
429
430#ifdef CONFIG_LOCKDEP
431/*
432 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
433 * according to dev->type
434 */
435static const unsigned short netdev_lock_type[] = {
436 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
437 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
438 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
439 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
440 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
441 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
442 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
443 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
444 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
445 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
446 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
447 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
448 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
449 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
450 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
451
452static const char *const netdev_lock_name[] = {
453 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
454 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
455 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
456 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
457 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
458 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
459 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
460 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
461 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
462 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
463 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
464 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
465 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
466 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
467 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
468
469static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
470static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
471
472static inline unsigned short netdev_lock_pos(unsigned short dev_type)
473{
474 int i;
475
476 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
477 if (netdev_lock_type[i] == dev_type)
478 return i;
479 /* the last key is used by default */
480 return ARRAY_SIZE(netdev_lock_type) - 1;
481}
482
483static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
484 unsigned short dev_type)
485{
486 int i;
487
488 i = netdev_lock_pos(dev_type);
489 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
490 netdev_lock_name[i]);
491}
492
493static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
494{
495 int i;
496
497 i = netdev_lock_pos(dev->type);
498 lockdep_set_class_and_name(&dev->addr_list_lock,
499 &netdev_addr_lock_key[i],
500 netdev_lock_name[i]);
501}
502#else
503static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
504 unsigned short dev_type)
505{
506}
507
508static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
509{
510}
511#endif
512
513/*******************************************************************************
514 *
515 * Protocol management and registration routines
516 *
517 *******************************************************************************/
518
519
520/*
521 * Add a protocol ID to the list. Now that the input handler is
522 * smarter we can dispense with all the messy stuff that used to be
523 * here.
524 *
525 * BEWARE!!! Protocol handlers, mangling input packets,
526 * MUST BE last in hash buckets and checking protocol handlers
527 * MUST start from promiscuous ptype_all chain in net_bh.
528 * It is true now, do not change it.
529 * Explanation follows: if protocol handler, mangling packet, will
530 * be the first on list, it is not able to sense, that packet
531 * is cloned and should be copied-on-write, so that it will
532 * change it and subsequent readers will get broken packet.
533 * --ANK (980803)
534 */
535
536static inline struct list_head *ptype_head(const struct packet_type *pt)
537{
538 if (pt->type == htons(ETH_P_ALL))
539 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
540 else
541 return pt->dev ? &pt->dev->ptype_specific :
542 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
543}
544
545/**
546 * dev_add_pack - add packet handler
547 * @pt: packet type declaration
548 *
549 * Add a protocol handler to the networking stack. The passed &packet_type
550 * is linked into kernel lists and may not be freed until it has been
551 * removed from the kernel lists.
552 *
553 * This call does not sleep therefore it can not
554 * guarantee all CPU's that are in middle of receiving packets
555 * will see the new packet type (until the next received packet).
556 */
557
558void dev_add_pack(struct packet_type *pt)
559{
560 struct list_head *head = ptype_head(pt);
561
562 spin_lock(&ptype_lock);
563 list_add_rcu(&pt->list, head);
564 spin_unlock(&ptype_lock);
565}
566EXPORT_SYMBOL(dev_add_pack);
567
568/**
569 * __dev_remove_pack - remove packet handler
570 * @pt: packet type declaration
571 *
572 * Remove a protocol handler that was previously added to the kernel
573 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
574 * from the kernel lists and can be freed or reused once this function
575 * returns.
576 *
577 * The packet type might still be in use by receivers
578 * and must not be freed until after all the CPU's have gone
579 * through a quiescent state.
580 */
581void __dev_remove_pack(struct packet_type *pt)
582{
583 struct list_head *head = ptype_head(pt);
584 struct packet_type *pt1;
585
586 spin_lock(&ptype_lock);
587
588 list_for_each_entry(pt1, head, list) {
589 if (pt == pt1) {
590 list_del_rcu(&pt->list);
591 goto out;
592 }
593 }
594
595 pr_warn("dev_remove_pack: %p not found\n", pt);
596out:
597 spin_unlock(&ptype_lock);
598}
599EXPORT_SYMBOL(__dev_remove_pack);
600
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously registered with
 *	dev_add_pack() and wait via synchronize_net(). Once this function
 *	returns, @pt has been removed from the kernel lists and can be
 *	freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	/* wait until no receiver can still be looking at pt */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
620
621
622/*******************************************************************************
623 *
624 * Device Interface Subroutines
625 *
626 *******************************************************************************/
627
628/**
629 * dev_get_iflink - get 'iflink' value of a interface
630 * @dev: targeted interface
631 *
632 * Indicates the ifindex the interface is linked to.
633 * Physical interfaces have the same 'ifindex' and 'iflink' values.
634 */
635
636int dev_get_iflink(const struct net_device *dev)
637{
638 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
639 return dev->netdev_ops->ndo_get_iflink(dev);
640
641 return dev->ifindex;
642}
643EXPORT_SYMBOL(dev_get_iflink);
644
645/**
646 * dev_fill_metadata_dst - Retrieve tunnel egress information.
647 * @dev: targeted interface
648 * @skb: The packet.
649 *
650 * For better visibility of tunnel traffic OVS needs to retrieve
651 * egress tunnel information for a packet. Following API allows
652 * user to get this info.
653 */
654int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
655{
656 struct ip_tunnel_info *info;
657
658 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
659 return -EINVAL;
660
661 info = skb_tunnel_info_unclone(skb);
662 if (!info)
663 return -ENOMEM;
664 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
665 return -EINVAL;
666
667 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
668}
669EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
670
671static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
672{
673 int k = stack->num_paths++;
674
675 if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
676 return NULL;
677
678 return &stack->path[k];
679}
680
681int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
682 struct net_device_path_stack *stack)
683{
684 const struct net_device *last_dev;
685 struct net_device_path_ctx ctx = {
686 .dev = dev,
687 };
688 struct net_device_path *path;
689 int ret = 0;
690
691 memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
692 stack->num_paths = 0;
693 while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
694 last_dev = ctx.dev;
695 path = dev_fwd_path(stack);
696 if (!path)
697 return -1;
698
699 memset(path, 0, sizeof(struct net_device_path));
700 ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
701 if (ret < 0)
702 return -1;
703
704 if (WARN_ON_ONCE(last_dev == ctx.dev))
705 return -1;
706 }
707
708 if (!ctx.dev)
709 return ret;
710
711 path = dev_fwd_path(stack);
712 if (!path)
713 return -1;
714 path->type = DEV_PATH_ETHERNET;
715 path->dev = ctx.dev;
716
717 return ret;
718}
719EXPORT_SYMBOL_GPL(dev_fill_forward_path);
720
721/**
722 * __dev_get_by_name - find a device by its name
723 * @net: the applicable net namespace
724 * @name: name to find
725 *
726 * Find an interface by name. Must be called under RTNL semaphore
727 * or @dev_base_lock. If the name is found a pointer to the device
728 * is returned. If the name is not found then %NULL is returned. The
729 * reference counters are not incremented so the caller must be
730 * careful with locks.
731 */
732
733struct net_device *__dev_get_by_name(struct net *net, const char *name)
734{
735 struct netdev_name_node *node_name;
736
737 node_name = netdev_name_node_lookup(net, name);
738 return node_name ? node_name->dev : NULL;
739}
740EXPORT_SYMBOL(__dev_get_by_name);
741
742/**
743 * dev_get_by_name_rcu - find a device by its name
744 * @net: the applicable net namespace
745 * @name: name to find
746 *
747 * Find an interface by name.
748 * If the name is found a pointer to the device is returned.
749 * If the name is not found then %NULL is returned.
750 * The reference counters are not incremented so the caller must be
751 * careful with locks. The caller must hold RCU lock.
752 */
753
754struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
755{
756 struct netdev_name_node *node_name;
757
758 node_name = netdev_name_node_lookup_rcu(net, name);
759 return node_name ? node_name->dev : NULL;
760}
761EXPORT_SYMBOL(dev_get_by_name_rcu);
762
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Look up an interface by name. This can be called from any
 *	context and does its own locking (an RCU read-side section).
 *	The returned handle has the usage count incremented and the caller
 *	must use dev_put() to release it when it is no longer needed.
 *	%NULL is returned if no matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *found;

	rcu_read_lock();
	found = dev_get_by_name_rcu(net, name);
	/* NOTE(review): called even when found is NULL - presumably
	 * dev_hold() tolerates that; confirm against its definition.
	 */
	dev_hold(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL(dev_get_by_name);
786
787/**
788 * __dev_get_by_index - find a device by its ifindex
789 * @net: the applicable net namespace
790 * @ifindex: index of device
791 *
792 * Search for an interface by index. Returns %NULL if the device
793 * is not found or a pointer to the device. The device has not
794 * had its reference counter increased so the caller must be careful
795 * about locking. The caller must hold either the RTNL semaphore
796 * or @dev_base_lock.
797 */
798
799struct net_device *__dev_get_by_index(struct net *net, int ifindex)
800{
801 struct net_device *dev;
802 struct hlist_head *head = dev_index_hash(net, ifindex);
803
804 hlist_for_each_entry(dev, head, index_hlist)
805 if (dev->ifindex == ifindex)
806 return dev;
807
808 return NULL;
809}
810EXPORT_SYMBOL(__dev_get_by_index);
811
812/**
813 * dev_get_by_index_rcu - find a device by its ifindex
814 * @net: the applicable net namespace
815 * @ifindex: index of device
816 *
817 * Search for an interface by index. Returns %NULL if the device
818 * is not found or a pointer to the device. The device has not
819 * had its reference counter increased so the caller must be careful
820 * about locking. The caller must hold RCU lock.
821 */
822
823struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
824{
825 struct net_device *dev;
826 struct hlist_head *head = dev_index_hash(net, ifindex);
827
828 hlist_for_each_entry_rcu(dev, head, index_hlist)
829 if (dev->ifindex == ifindex)
830 return dev;
831
832 return NULL;
833}
834EXPORT_SYMBOL(dev_get_by_index_rcu);
835
836
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index inside an RCU read-side section.
 *	Returns a pointer to the device, or NULL if it is not found. The
 *	device returned has had a reference added and the pointer is safe
 *	until the user calls dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *found;

	rcu_read_lock();
	found = dev_get_by_index_rcu(net, ifindex);
	/* NOTE(review): called even when found is NULL - presumably
	 * dev_hold() tolerates that; confirm against its definition.
	 */
	dev_hold(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL(dev_get_by_index);
859
860/**
861 * dev_get_by_napi_id - find a device by napi_id
862 * @napi_id: ID of the NAPI struct
863 *
864 * Search for an interface by NAPI ID. Returns %NULL if the device
865 * is not found or a pointer to the device. The device has not had
866 * its reference counter increased so the caller must be careful
867 * about locking. The caller must hold RCU lock.
868 */
869
870struct net_device *dev_get_by_napi_id(unsigned int napi_id)
871{
872 struct napi_struct *napi;
873
874 WARN_ON_ONCE(!rcu_read_lock_held());
875
876 if (napi_id < MIN_NAPI_ID)
877 return NULL;
878
879 napi = napi_by_id(napi_id);
880
881 return napi ? napi->dev : NULL;
882}
883EXPORT_SYMBOL(dev_get_by_napi_id);
884
885/**
886 * netdev_get_name - get a netdevice name, knowing its ifindex.
887 * @net: network namespace
888 * @name: a pointer to the buffer where the name will be stored.
889 * @ifindex: the ifindex of the interface to get the name from.
890 */
891int netdev_get_name(struct net *net, char *name, int ifindex)
892{
893 struct net_device *dev;
894 int ret;
895
896 down_read(&devnet_rename_sem);
897 rcu_read_lock();
898
899 dev = dev_get_by_index_rcu(net, ifindex);
900 if (!dev) {
901 ret = -ENODEV;
902 goto out;
903 }
904
905 strcpy(name, dev->name);
906
907 ret = 0;
908out:
909 rcu_read_unlock();
910 up_read(&devnet_rename_sem);
911 return ret;
912}
913
914/**
915 * dev_getbyhwaddr_rcu - find a device by its hardware address
916 * @net: the applicable net namespace
917 * @type: media type of device
918 * @ha: hardware address
919 *
920 * Search for an interface by MAC address. Returns NULL if the device
921 * is not found or a pointer to the device.
922 * The caller must hold RCU or RTNL.
923 * The returned device has not had its ref count increased
924 * and the caller must therefore be careful about locking
925 *
926 */
927
928struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
929 const char *ha)
930{
931 struct net_device *dev;
932
933 for_each_netdev_rcu(net, dev)
934 if (dev->type == type &&
935 !memcmp(dev->dev_addr, ha, dev->addr_len))
936 return dev;
937
938 return NULL;
939}
940EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
941
942struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
943{
944 struct net_device *dev, *ret = NULL;
945
946 rcu_read_lock();
947 for_each_netdev_rcu(net, dev)
948 if (dev->type == type) {
949 dev_hold(dev);
950 ret = dev;
951 break;
952 }
953 rcu_read_unlock();
954 return ret;
955}
956EXPORT_SYMBOL(dev_getfirstbyhwtype);
957
958/**
959 * __dev_get_by_flags - find any device with given flags
960 * @net: the applicable net namespace
961 * @if_flags: IFF_* values
962 * @mask: bitmask of bits in if_flags to check
963 *
964 * Search for any interface with the given flags. Returns NULL if a device
965 * is not found or a pointer to the device. Must be called inside
966 * rtnl_lock(), and result refcount is unchanged.
967 */
968
969struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
970 unsigned short mask)
971{
972 struct net_device *dev, *ret;
973
974 ASSERT_RTNL();
975
976 ret = NULL;
977 for_each_netdev(net, dev) {
978 if (((dev->flags ^ if_flags) & mask) == 0) {
979 ret = dev;
980 break;
981 }
982 }
983 return ret;
984}
985EXPORT_SYMBOL(__dev_get_by_flags);
986
987/**
988 * dev_valid_name - check if name is okay for network device
989 * @name: name string
990 *
991 * Network device names need to be valid file names to
992 * allow sysfs to work. We also disallow any kind of
993 * whitespace.
994 */
995bool dev_valid_name(const char *name)
996{
997 if (*name == '\0')
998 return false;
999 if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1000 return false;
1001 if (!strcmp(name, ".") || !strcmp(name, ".."))
1002 return false;
1003
1004 while (*name) {
1005 if (*name == '/' || *name == ':' || isspace(*name))
1006 return false;
1007 name++;
1008 }
1009 return true;
1010}
1011EXPORT_SYMBOL(dev_valid_name);
1012
1013/**
1014 * __dev_alloc_name - allocate a name for a device
1015 * @net: network namespace to allocate the device name in
1016 * @name: name format string
1017 * @buf: scratch buffer and result name string
1018 *
1019 * Passed a format string - eg "lt%d" it will try and find a suitable
1020 * id. It scans list of devices to build up a free map, then chooses
1021 * the first empty slot. The caller must hold the dev_base or rtnl lock
1022 * while allocating the name and adding the device in order to avoid
1023 * duplicates.
1024 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1025 * Returns the number of the unit assigned or a negative errno code.
1026 */
1027
1028static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1029{
1030 int i = 0;
1031 const char *p;
1032 const int max_netdevices = 8*PAGE_SIZE;
1033 unsigned long *inuse;
1034 struct net_device *d;
1035
1036 if (!dev_valid_name(name))
1037 return -EINVAL;
1038
1039 p = strchr(name, '%');
1040 if (p) {
1041 /*
1042 * Verify the string as this thing may have come from
1043 * the user. There must be either one "%d" and no other "%"
1044 * characters.
1045 */
1046 if (p[1] != 'd' || strchr(p + 2, '%'))
1047 return -EINVAL;
1048
1049 /* Use one page as a bit array of possible slots */
1050 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1051 if (!inuse)
1052 return -ENOMEM;
1053
1054 for_each_netdev(net, d) {
1055 struct netdev_name_node *name_node;
1056 list_for_each_entry(name_node, &d->name_node->list, list) {
1057 if (!sscanf(name_node->name, name, &i))
1058 continue;
1059 if (i < 0 || i >= max_netdevices)
1060 continue;
1061
1062 /* avoid cases where sscanf is not exact inverse of printf */
1063 snprintf(buf, IFNAMSIZ, name, i);
1064 if (!strncmp(buf, name_node->name, IFNAMSIZ))
1065 __set_bit(i, inuse);
1066 }
1067 if (!sscanf(d->name, name, &i))
1068 continue;
1069 if (i < 0 || i >= max_netdevices)
1070 continue;
1071
1072 /* avoid cases where sscanf is not exact inverse of printf */
1073 snprintf(buf, IFNAMSIZ, name, i);
1074 if (!strncmp(buf, d->name, IFNAMSIZ))
1075 __set_bit(i, inuse);
1076 }
1077
1078 i = find_first_zero_bit(inuse, max_netdevices);
1079 free_page((unsigned long) inuse);
1080 }
1081
1082 snprintf(buf, IFNAMSIZ, name, i);
1083 if (!netdev_name_in_use(net, buf))
1084 return i;
1085
1086 /* It is possible to run out of possible slots
1087 * when the name is long and there isn't enough space left
1088 * for the digits, or if all bits are used.
1089 */
1090 return -ENFILE;
1091}
1092
1093static int dev_alloc_name_ns(struct net *net,
1094 struct net_device *dev,
1095 const char *name)
1096{
1097 char buf[IFNAMSIZ];
1098 int ret;
1099
1100 BUG_ON(!net);
1101 ret = __dev_alloc_name(net, name, buf);
1102 if (ret >= 0)
1103 strscpy(dev->name, buf, IFNAMSIZ);
1104 return ret;
1105}
1106
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id in the device's own namespace and store the resulting name in
 *	@dev. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl
 *	lock while allocating the name and adding the device in order to
 *	avoid duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	struct net *net = dev_net(dev);

	return dev_alloc_name_ns(net, dev, name);
}
EXPORT_SYMBOL(dev_alloc_name);
1126
1127static int dev_get_valid_name(struct net *net, struct net_device *dev,
1128 const char *name)
1129{
1130 BUG_ON(!net);
1131
1132 if (!dev_valid_name(name))
1133 return -EINVAL;
1134
1135 if (strchr(name, '%'))
1136 return dev_alloc_name_ns(net, dev, name);
1137 else if (netdev_name_in_use(net, name))
1138 return -EEXIST;
1139 else if (dev->name != name)
1140 strscpy(dev->name, name, IFNAMSIZ);
1141
1142 return 0;
1143}
1144
1145/**
1146 * dev_change_name - change name of a device
1147 * @dev: device
1148 * @newname: name (or format string) must be at least IFNAMSIZ
1149 *
1150 * Change name of a device, can pass format strings "eth%d".
1151 * for wildcarding.
1152 */
1153int dev_change_name(struct net_device *dev, const char *newname)
1154{
1155 unsigned char old_assign_type;
1156 char oldname[IFNAMSIZ];
1157 int err = 0;
1158 int ret;
1159 struct net *net;
1160
1161 ASSERT_RTNL();
1162 BUG_ON(!dev_net(dev));
1163
1164 net = dev_net(dev);
1165
1166 down_write(&devnet_rename_sem);
1167
1168 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1169 up_write(&devnet_rename_sem);
1170 return 0;
1171 }
1172
1173 memcpy(oldname, dev->name, IFNAMSIZ);
1174
1175 err = dev_get_valid_name(net, dev, newname);
1176 if (err < 0) {
1177 up_write(&devnet_rename_sem);
1178 return err;
1179 }
1180
1181 if (oldname[0] && !strchr(oldname, '%'))
1182 netdev_info(dev, "renamed from %s%s\n", oldname,
1183 dev->flags & IFF_UP ? " (while UP)" : "");
1184
1185 old_assign_type = dev->name_assign_type;
1186 dev->name_assign_type = NET_NAME_RENAMED;
1187
1188rollback:
1189 ret = device_rename(&dev->dev, dev->name);
1190 if (ret) {
1191 memcpy(dev->name, oldname, IFNAMSIZ);
1192 dev->name_assign_type = old_assign_type;
1193 up_write(&devnet_rename_sem);
1194 return ret;
1195 }
1196
1197 up_write(&devnet_rename_sem);
1198
1199 netdev_adjacent_rename_links(dev, oldname);
1200
1201 write_lock(&dev_base_lock);
1202 netdev_name_node_del(dev->name_node);
1203 write_unlock(&dev_base_lock);
1204
1205 synchronize_rcu();
1206
1207 write_lock(&dev_base_lock);
1208 netdev_name_node_add(net, dev->name_node);
1209 write_unlock(&dev_base_lock);
1210
1211 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1212 ret = notifier_to_errno(ret);
1213
1214 if (ret) {
1215 /* err >= 0 after dev_alloc_name() or stores the first errno */
1216 if (err >= 0) {
1217 err = ret;
1218 down_write(&devnet_rename_sem);
1219 memcpy(dev->name, oldname, IFNAMSIZ);
1220 memcpy(oldname, newname, IFNAMSIZ);
1221 dev->name_assign_type = old_assign_type;
1222 old_assign_type = NET_NAME_RENAMED;
1223 goto rollback;
1224 } else {
1225 netdev_err(dev, "name change rollback failed: %d\n",
1226 ret);
1227 }
1228 }
1229
1230 return err;
1231}
1232
1233/**
1234 * dev_set_alias - change ifalias of a device
1235 * @dev: device
1236 * @alias: name up to IFALIASZ
1237 * @len: limit of bytes to copy from info
1238 *
1239 * Set ifalias for a device,
1240 */
1241int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1242{
1243 struct dev_ifalias *new_alias = NULL;
1244
1245 if (len >= IFALIASZ)
1246 return -EINVAL;
1247
1248 if (len) {
1249 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1250 if (!new_alias)
1251 return -ENOMEM;
1252
1253 memcpy(new_alias->ifalias, alias, len);
1254 new_alias->ifalias[len] = 0;
1255 }
1256
1257 mutex_lock(&ifalias_mutex);
1258 new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1259 mutex_is_locked(&ifalias_mutex));
1260 mutex_unlock(&ifalias_mutex);
1261
1262 if (new_alias)
1263 kfree_rcu(new_alias, rcuhead);
1264
1265 return len;
1266}
1267EXPORT_SYMBOL(dev_set_alias);
1268
1269/**
1270 * dev_get_alias - get ifalias of a device
1271 * @dev: device
1272 * @name: buffer to store name of ifalias
1273 * @len: size of buffer
1274 *
1275 * get ifalias for a device. Caller must make sure dev cannot go
1276 * away, e.g. rcu read lock or own a reference count to device.
1277 */
1278int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1279{
1280 const struct dev_ifalias *alias;
1281 int ret = 0;
1282
1283 rcu_read_lock();
1284 alias = rcu_dereference(dev->ifalias);
1285 if (alias)
1286 ret = snprintf(name, len, "%s", alias->ifalias);
1287 rcu_read_unlock();
1288
1289 return ret;
1290}
1291
1292/**
1293 * netdev_features_change - device changes features
1294 * @dev: device to cause notification
1295 *
1296 * Called to indicate a device has changed features.
1297 */
1298void netdev_features_change(struct net_device *dev)
1299{
1300 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1301}
1302EXPORT_SYMBOL(netdev_features_change);
1303
1304/**
1305 * netdev_state_change - device changes state
1306 * @dev: device to cause notification
1307 *
1308 * Called to indicate a device has changed state. This function calls
1309 * the notifier chains for netdev_chain and sends a NEWLINK message
1310 * to the routing socket.
1311 */
1312void netdev_state_change(struct net_device *dev)
1313{
1314 if (dev->flags & IFF_UP) {
1315 struct netdev_notifier_change_info change_info = {
1316 .info.dev = dev,
1317 };
1318
1319 call_netdevice_notifiers_info(NETDEV_CHANGE,
1320 &change_info.info);
1321 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1322 }
1323}
1324EXPORT_SYMBOL(netdev_state_change);
1325
1326/**
1327 * __netdev_notify_peers - notify network peers about existence of @dev,
1328 * to be called when rtnl lock is already held.
1329 * @dev: network device
1330 *
1331 * Generate traffic such that interested network peers are aware of
1332 * @dev, such as by generating a gratuitous ARP. This may be used when
1333 * a device wants to inform the rest of the network about some sort of
1334 * reconfiguration such as a failover event or virtual machine
1335 * migration.
1336 */
1337void __netdev_notify_peers(struct net_device *dev)
1338{
1339 ASSERT_RTNL();
1340 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1341 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1342}
1343EXPORT_SYMBOL(__netdev_notify_peers);
1344
/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Takes the rtnl lock and calls __netdev_notify_peers(), which generates
 * traffic such that interested network peers are aware of @dev, such as
 * a gratuitous ARP. This may be used when a device wants to inform the
 * rest of the network about a reconfiguration such as a failover event
 * or virtual machine migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	__netdev_notify_peers(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
1362
1363static int napi_threaded_poll(void *data);
1364
1365static int napi_kthread_create(struct napi_struct *n)
1366{
1367 int err = 0;
1368
1369 /* Create and wake up the kthread once to put it in
1370 * TASK_INTERRUPTIBLE mode to avoid the blocked task
1371 * warning and work with loadavg.
1372 */
1373 n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1374 n->dev->name, n->napi_id);
1375 if (IS_ERR(n->thread)) {
1376 err = PTR_ERR(n->thread);
1377 pr_err("kthread_run failed with err %d\n", err);
1378 n->thread = NULL;
1379 }
1380
1381 return err;
1382}
1383
1384static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1385{
1386 const struct net_device_ops *ops = dev->netdev_ops;
1387 int ret;
1388
1389 ASSERT_RTNL();
1390 dev_addr_check(dev);
1391
1392 if (!netif_device_present(dev)) {
1393 /* may be detached because parent is runtime-suspended */
1394 if (dev->dev.parent)
1395 pm_runtime_resume(dev->dev.parent);
1396 if (!netif_device_present(dev))
1397 return -ENODEV;
1398 }
1399
1400 /* Block netpoll from trying to do any rx path servicing.
1401 * If we don't do this there is a chance ndo_poll_controller
1402 * or ndo_poll may be running while we open the device
1403 */
1404 netpoll_poll_disable(dev);
1405
1406 ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1407 ret = notifier_to_errno(ret);
1408 if (ret)
1409 return ret;
1410
1411 set_bit(__LINK_STATE_START, &dev->state);
1412
1413 if (ops->ndo_validate_addr)
1414 ret = ops->ndo_validate_addr(dev);
1415
1416 if (!ret && ops->ndo_open)
1417 ret = ops->ndo_open(dev);
1418
1419 netpoll_poll_enable(dev);
1420
1421 if (ret)
1422 clear_bit(__LINK_STATE_START, &dev->state);
1423 else {
1424 dev->flags |= IFF_UP;
1425 dev_set_rx_mode(dev);
1426 dev_activate(dev);
1427 add_device_randomness(dev->dev_addr, dev->addr_len);
1428 }
1429
1430 return ret;
1431}
1432
1433/**
1434 * dev_open - prepare an interface for use.
1435 * @dev: device to open
1436 * @extack: netlink extended ack
1437 *
1438 * Takes a device from down to up state. The device's private open
1439 * function is invoked and then the multicast lists are loaded. Finally
1440 * the device is moved into the up state and a %NETDEV_UP message is
1441 * sent to the netdev notifier chain.
1442 *
1443 * Calling this function on an active interface is a nop. On a failure
1444 * a negative errno code is returned.
1445 */
1446int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1447{
1448 int ret;
1449
1450 if (dev->flags & IFF_UP)
1451 return 0;
1452
1453 ret = __dev_open(dev, extack);
1454 if (ret < 0)
1455 return ret;
1456
1457 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1458 call_netdevice_notifiers(NETDEV_UP, dev);
1459
1460 return ret;
1461}
1462EXPORT_SYMBOL(dev_open);
1463
1464static void __dev_close_many(struct list_head *head)
1465{
1466 struct net_device *dev;
1467
1468 ASSERT_RTNL();
1469 might_sleep();
1470
1471 list_for_each_entry(dev, head, close_list) {
1472 /* Temporarily disable netpoll until the interface is down */
1473 netpoll_poll_disable(dev);
1474
1475 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1476
1477 clear_bit(__LINK_STATE_START, &dev->state);
1478
1479 /* Synchronize to scheduled poll. We cannot touch poll list, it
1480 * can be even on different cpu. So just clear netif_running().
1481 *
1482 * dev->stop() will invoke napi_disable() on all of it's
1483 * napi_struct instances on this device.
1484 */
1485 smp_mb__after_atomic(); /* Commit netif_running(). */
1486 }
1487
1488 dev_deactivate_many(head);
1489
1490 list_for_each_entry(dev, head, close_list) {
1491 const struct net_device_ops *ops = dev->netdev_ops;
1492
1493 /*
1494 * Call the device specific close. This cannot fail.
1495 * Only if device is UP
1496 *
1497 * We allow it to be called even after a DETACH hot-plug
1498 * event.
1499 */
1500 if (ops->ndo_stop)
1501 ops->ndo_stop(dev);
1502
1503 dev->flags &= ~IFF_UP;
1504 netpoll_poll_enable(dev);
1505 }
1506}
1507
1508static void __dev_close(struct net_device *dev)
1509{
1510 LIST_HEAD(single);
1511
1512 list_add(&dev->close_list, &single);
1513 __dev_close_many(&single);
1514 list_del(&single);
1515}
1516
1517void dev_close_many(struct list_head *head, bool unlink)
1518{
1519 struct net_device *dev, *tmp;
1520
1521 /* Remove the devices that don't need to be closed */
1522 list_for_each_entry_safe(dev, tmp, head, close_list)
1523 if (!(dev->flags & IFF_UP))
1524 list_del_init(&dev->close_list);
1525
1526 __dev_close_many(head);
1527
1528 list_for_each_entry_safe(dev, tmp, head, close_list) {
1529 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1530 call_netdevice_notifiers(NETDEV_DOWN, dev);
1531 if (unlink)
1532 list_del_init(&dev->close_list);
1533 }
1534}
1535EXPORT_SYMBOL(dev_close_many);
1536
1537/**
1538 * dev_close - shutdown an interface.
1539 * @dev: device to shutdown
1540 *
1541 * This function moves an active device into down state. A
1542 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1543 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1544 * chain.
1545 */
1546void dev_close(struct net_device *dev)
1547{
1548 if (dev->flags & IFF_UP) {
1549 LIST_HEAD(single);
1550
1551 list_add(&dev->close_list, &single);
1552 dev_close_many(&single, true);
1553 list_del(&single);
1554 }
1555}
1556EXPORT_SYMBOL(dev_close);
1557
1558
1559/**
1560 * dev_disable_lro - disable Large Receive Offload on a device
1561 * @dev: device
1562 *
1563 * Disable Large Receive Offload (LRO) on a net device. Must be
1564 * called under RTNL. This is needed if received packets may be
1565 * forwarded to another interface.
1566 */
1567void dev_disable_lro(struct net_device *dev)
1568{
1569 struct net_device *lower_dev;
1570 struct list_head *iter;
1571
1572 dev->wanted_features &= ~NETIF_F_LRO;
1573 netdev_update_features(dev);
1574
1575 if (unlikely(dev->features & NETIF_F_LRO))
1576 netdev_WARN(dev, "failed to disable LRO!\n");
1577
1578 netdev_for_each_lower_dev(dev, lower_dev, iter)
1579 dev_disable_lro(lower_dev);
1580}
1581EXPORT_SYMBOL(dev_disable_lro);
1582
1583/**
1584 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1585 * @dev: device
1586 *
1587 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1588 * called under RTNL. This is needed if Generic XDP is installed on
1589 * the device.
1590 */
1591static void dev_disable_gro_hw(struct net_device *dev)
1592{
1593 dev->wanted_features &= ~NETIF_F_GRO_HW;
1594 netdev_update_features(dev);
1595
1596 if (unlikely(dev->features & NETIF_F_GRO_HW))
1597 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1598}
1599
1600const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1601{
1602#define N(val) \
1603 case NETDEV_##val: \
1604 return "NETDEV_" __stringify(val);
1605 switch (cmd) {
1606 N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1607 N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1608 N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1609 N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1610 N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1611 N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1612 N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1613 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1614 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1615 N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1616 N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1617 }
1618#undef N
1619 return "UNKNOWN_NETDEV_EVENT";
1620}
1621EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1622
1623static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1624 struct net_device *dev)
1625{
1626 struct netdev_notifier_info info = {
1627 .dev = dev,
1628 };
1629
1630 return nb->notifier_call(nb, val, &info);
1631}
1632
1633static int call_netdevice_register_notifiers(struct notifier_block *nb,
1634 struct net_device *dev)
1635{
1636 int err;
1637
1638 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1639 err = notifier_to_errno(err);
1640 if (err)
1641 return err;
1642
1643 if (!(dev->flags & IFF_UP))
1644 return 0;
1645
1646 call_netdevice_notifier(nb, NETDEV_UP, dev);
1647 return 0;
1648}
1649
1650static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1651 struct net_device *dev)
1652{
1653 if (dev->flags & IFF_UP) {
1654 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1655 dev);
1656 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1657 }
1658 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1659}
1660
1661static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1662 struct net *net)
1663{
1664 struct net_device *dev;
1665 int err;
1666
1667 for_each_netdev(net, dev) {
1668 err = call_netdevice_register_notifiers(nb, dev);
1669 if (err)
1670 goto rollback;
1671 }
1672 return 0;
1673
1674rollback:
1675 for_each_netdev_continue_reverse(net, dev)
1676 call_netdevice_unregister_notifiers(nb, dev);
1677 return err;
1678}
1679
1680static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1681 struct net *net)
1682{
1683 struct net_device *dev;
1684
1685 for_each_netdev(net, dev)
1686 call_netdevice_unregister_notifiers(nb, dev);
1687}
1688
1689static int dev_boot_phase = 1;
1690
1691/**
1692 * register_netdevice_notifier - register a network notifier block
1693 * @nb: notifier
1694 *
1695 * Register a notifier to be called when network device events occur.
1696 * The notifier passed is linked into the kernel structures and must
1697 * not be reused until it has been unregistered. A negative errno code
1698 * is returned on a failure.
1699 *
1700 * When registered all registration and up events are replayed
1701 * to the new notifier to allow device to have a race free
1702 * view of the network device list.
1703 */
1704
1705int register_netdevice_notifier(struct notifier_block *nb)
1706{
1707 struct net *net;
1708 int err;
1709
1710 /* Close race with setup_net() and cleanup_net() */
1711 down_write(&pernet_ops_rwsem);
1712 rtnl_lock();
1713 err = raw_notifier_chain_register(&netdev_chain, nb);
1714 if (err)
1715 goto unlock;
1716 if (dev_boot_phase)
1717 goto unlock;
1718 for_each_net(net) {
1719 err = call_netdevice_register_net_notifiers(nb, net);
1720 if (err)
1721 goto rollback;
1722 }
1723
1724unlock:
1725 rtnl_unlock();
1726 up_write(&pernet_ops_rwsem);
1727 return err;
1728
1729rollback:
1730 for_each_net_continue_reverse(net)
1731 call_netdevice_unregister_net_notifiers(nb, net);
1732
1733 raw_notifier_chain_unregister(&netdev_chain, nb);
1734 goto unlock;
1735}
1736EXPORT_SYMBOL(register_netdevice_notifier);
1737
1738/**
1739 * unregister_netdevice_notifier - unregister a network notifier block
1740 * @nb: notifier
1741 *
1742 * Unregister a notifier previously registered by
1743 * register_netdevice_notifier(). The notifier is unlinked into the
1744 * kernel structures and may then be reused. A negative errno code
1745 * is returned on a failure.
1746 *
1747 * After unregistering unregister and down device events are synthesized
1748 * for all devices on the device list to the removed notifier to remove
1749 * the need for special case cleanup code.
1750 */
1751
1752int unregister_netdevice_notifier(struct notifier_block *nb)
1753{
1754 struct net *net;
1755 int err;
1756
1757 /* Close race with setup_net() and cleanup_net() */
1758 down_write(&pernet_ops_rwsem);
1759 rtnl_lock();
1760 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1761 if (err)
1762 goto unlock;
1763
1764 for_each_net(net)
1765 call_netdevice_unregister_net_notifiers(nb, net);
1766
1767unlock:
1768 rtnl_unlock();
1769 up_write(&pernet_ops_rwsem);
1770 return err;
1771}
1772EXPORT_SYMBOL(unregister_netdevice_notifier);
1773
1774static int __register_netdevice_notifier_net(struct net *net,
1775 struct notifier_block *nb,
1776 bool ignore_call_fail)
1777{
1778 int err;
1779
1780 err = raw_notifier_chain_register(&net->netdev_chain, nb);
1781 if (err)
1782 return err;
1783 if (dev_boot_phase)
1784 return 0;
1785
1786 err = call_netdevice_register_net_notifiers(nb, net);
1787 if (err && !ignore_call_fail)
1788 goto chain_unregister;
1789
1790 return 0;
1791
1792chain_unregister:
1793 raw_notifier_chain_unregister(&net->netdev_chain, nb);
1794 return err;
1795}
1796
1797static int __unregister_netdevice_notifier_net(struct net *net,
1798 struct notifier_block *nb)
1799{
1800 int err;
1801
1802 err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1803 if (err)
1804 return err;
1805
1806 call_netdevice_unregister_net_notifiers(nb, net);
1807 return 0;
1808}
1809
1810/**
1811 * register_netdevice_notifier_net - register a per-netns network notifier block
1812 * @net: network namespace
1813 * @nb: notifier
1814 *
1815 * Register a notifier to be called when network device events occur.
1816 * The notifier passed is linked into the kernel structures and must
1817 * not be reused until it has been unregistered. A negative errno code
1818 * is returned on a failure.
1819 *
1820 * When registered all registration and up events are replayed
1821 * to the new notifier to allow device to have a race free
1822 * view of the network device list.
1823 */
1824
1825int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1826{
1827 int err;
1828
1829 rtnl_lock();
1830 err = __register_netdevice_notifier_net(net, nb, false);
1831 rtnl_unlock();
1832 return err;
1833}
1834EXPORT_SYMBOL(register_netdevice_notifier_net);
1835
/**
 * unregister_netdevice_notifier_net - unregister a per-netns
 *                                     network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier_net(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering, unregister and down device events are synthesized
 * for all devices of @net to the removed notifier to remove the need for
 * special case cleanup code. The rtnl lock is taken for the duration of
 * the call.
 */

int unregister_netdevice_notifier_net(struct net *net,
				      struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = __unregister_netdevice_notifier_net(net, nb);
	rtnl_unlock();

	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1863
1864static void __move_netdevice_notifier_net(struct net *src_net,
1865 struct net *dst_net,
1866 struct notifier_block *nb)
1867{
1868 __unregister_netdevice_notifier_net(src_net, nb);
1869 __register_netdevice_notifier_net(dst_net, nb, true);
1870}
1871
1872int register_netdevice_notifier_dev_net(struct net_device *dev,
1873 struct notifier_block *nb,
1874 struct netdev_net_notifier *nn)
1875{
1876 int err;
1877
1878 rtnl_lock();
1879 err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1880 if (!err) {
1881 nn->nb = nb;
1882 list_add(&nn->list, &dev->net_notifier_list);
1883 }
1884 rtnl_unlock();
1885 return err;
1886}
1887EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1888
1889int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1890 struct notifier_block *nb,
1891 struct netdev_net_notifier *nn)
1892{
1893 int err;
1894
1895 rtnl_lock();
1896 list_del(&nn->list);
1897 err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1898 rtnl_unlock();
1899 return err;
1900}
1901EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1902
1903static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1904 struct net *net)
1905{
1906 struct netdev_net_notifier *nn;
1907
1908 list_for_each_entry(nn, &dev->net_notifier_list, list)
1909 __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
1910}
1911
1912/**
1913 * call_netdevice_notifiers_info - call all network notifier blocks
1914 * @val: value passed unmodified to notifier function
1915 * @info: notifier information data
1916 *
1917 * Call all network notifier blocks. Parameters and return value
1918 * are as for raw_notifier_call_chain().
1919 */
1920
1921static int call_netdevice_notifiers_info(unsigned long val,
1922 struct netdev_notifier_info *info)
1923{
1924 struct net *net = dev_net(info->dev);
1925 int ret;
1926
1927 ASSERT_RTNL();
1928
1929 /* Run per-netns notifier block chain first, then run the global one.
1930 * Hopefully, one day, the global one is going to be removed after
1931 * all notifier block registrators get converted to be per-netns.
1932 */
1933 ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1934 if (ret & NOTIFY_STOP_MASK)
1935 return ret;
1936 return raw_notifier_call_chain(&netdev_chain, val, info);
1937}
1938
1939/**
1940 * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
1941 * for and rollback on error
1942 * @val_up: value passed unmodified to notifier function
1943 * @val_down: value passed unmodified to the notifier function when
1944 * recovering from an error on @val_up
1945 * @info: notifier information data
1946 *
1947 * Call all per-netns network notifier blocks, but not notifier blocks on
1948 * the global notifier chain. Parameters and return value are as for
1949 * raw_notifier_call_chain_robust().
1950 */
1951
1952static int
1953call_netdevice_notifiers_info_robust(unsigned long val_up,
1954 unsigned long val_down,
1955 struct netdev_notifier_info *info)
1956{
1957 struct net *net = dev_net(info->dev);
1958
1959 ASSERT_RTNL();
1960
1961 return raw_notifier_call_chain_robust(&net->netdev_chain,
1962 val_up, val_down, info);
1963}
1964
1965static int call_netdevice_notifiers_extack(unsigned long val,
1966 struct net_device *dev,
1967 struct netlink_ext_ack *extack)
1968{
1969 struct netdev_notifier_info info = {
1970 .dev = dev,
1971 .extack = extack,
1972 };
1973
1974 return call_netdevice_notifiers_info(val, &info);
1975}
1976
/**
 * call_netdevice_notifiers - call all network notifier blocks
 * @val: value passed unmodified to notifier function
 * @dev: net_device pointer passed unmodified to notifier function
 *
 * Same as call_netdevice_notifiers_extack() without an extended ack.
 * Parameters and return value are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
1991
1992/**
1993 * call_netdevice_notifiers_mtu - call all network notifier blocks
1994 * @val: value passed unmodified to notifier function
1995 * @dev: net_device pointer passed unmodified to notifier function
1996 * @arg: additional u32 argument passed to the notifier function
1997 *
1998 * Call all network notifier blocks. Parameters and return value
1999 * are as for raw_notifier_call_chain().
2000 */
2001static int call_netdevice_notifiers_mtu(unsigned long val,
2002 struct net_device *dev, u32 arg)
2003{
2004 struct netdev_notifier_info_ext info = {
2005 .info.dev = dev,
2006 .ext.mtu = arg,
2007 };
2008
2009 BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2010
2011 return call_netdevice_notifiers_info(val, &info.info);
2012}
2013
#ifdef CONFIG_NET_INGRESS
/* Static key counting ingress users; off by default. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Take one reference on ingress_needed_key. */
void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on ingress_needed_key. */
void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
2029
#ifdef CONFIG_NET_EGRESS
/* Static key counting egress users; off by default. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Take one reference on egress_needed_key. */
void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on egress_needed_key. */
void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
2045
/* Static key that gates packet timestamping; off by default. */
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
/* Enable/disable requests not yet folded into netstamp_wanted. */
static atomic_t netstamp_needed_deferred;
/* Count of timestamp users as seen by the static key. */
static atomic_t netstamp_wanted;

/* Work handler: fold the deferred requests into netstamp_wanted and set
 * the static key to match - enabled while the count is positive,
 * disabled otherwise.
 */
static void netstamp_clear(struct work_struct *work)
{
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted = atomic_add_return(deferred, &netstamp_wanted);

	if (wanted > 0)
		static_branch_enable(&netstamp_needed_key);
	else
		static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
2064
2065void net_enable_timestamp(void)
2066{
2067#ifdef CONFIG_JUMP_LABEL
2068 int wanted = atomic_read(&netstamp_wanted);
2069
2070 while (wanted > 0) {
2071 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2072 return;
2073 }
2074 atomic_inc(&netstamp_needed_deferred);
2075 schedule_work(&netstamp_work);
2076#else
2077 static_branch_inc(&netstamp_needed_key);
2078#endif
2079}
2080EXPORT_SYMBOL(net_enable_timestamp);
2081
2082void net_disable_timestamp(void)
2083{
2084#ifdef CONFIG_JUMP_LABEL
2085 int wanted = atomic_read(&netstamp_wanted);
2086
2087 while (wanted > 1) {
2088 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2089 return;
2090 }
2091 atomic_dec(&netstamp_needed_deferred);
2092 schedule_work(&netstamp_work);
2093#else
2094 static_branch_dec(&netstamp_needed_key);
2095#endif
2096}
2097EXPORT_SYMBOL(net_disable_timestamp);
2098
2099static inline void net_timestamp_set(struct sk_buff *skb)
2100{
2101 skb->tstamp = 0;
2102 skb->mono_delivery_time = 0;
2103 if (static_branch_unlikely(&netstamp_needed_key))
2104 skb->tstamp = ktime_get_real();
2105}
2106
/* Stamp SKB with the current real time when timestamping is enabled via
 * netstamp_needed_key, COND is true and SKB has no timestamp yet.
 *
 * Wrapped in do { } while (0) so that the macro expands to a single
 * statement and is safe in unbraced if/else bodies. Use it as a
 * statement with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)					\
	do {								\
		if (static_branch_unlikely(&netstamp_needed_key)) {	\
			if ((COND) && !(SKB)->tstamp)			\
				(SKB)->tstamp = ktime_get_real();	\
		}							\
	} while (0)

/* Tell whether @skb may be forwarded to @dev; thin wrapper around
 * __is_skb_forwardable() with its third argument set to true.
 */
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
2118
2119static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2120 bool check_mtu)
2121{
2122 int ret = ____dev_forward_skb(dev, skb, check_mtu);
2123
2124 if (likely(!ret)) {
2125 skb->protocol = eth_type_trans(skb, dev);
2126 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2127 }
2128
2129 return ret;
2130}
2131
2132int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2133{
2134 return __dev_forward_skb2(dev, skb, true);
2135}
2136EXPORT_SYMBOL_GPL(__dev_forward_skb);
2137
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
2161
2162int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2163{
2164 return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2165}
2166
2167static inline int deliver_skb(struct sk_buff *skb,
2168 struct packet_type *pt_prev,
2169 struct net_device *orig_dev)
2170{
2171 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2172 return -ENOMEM;
2173 refcount_inc(&skb->users);
2174 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2175}
2176
2177static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2178 struct packet_type **pt,
2179 struct net_device *orig_dev,
2180 __be16 type,
2181 struct list_head *ptype_list)
2182{
2183 struct packet_type *ptype, *pt_prev = *pt;
2184
2185 list_for_each_entry_rcu(ptype, ptype_list, list) {
2186 if (ptype->type != type)
2187 continue;
2188 if (pt_prev)
2189 deliver_skb(skb, pt_prev, orig_dev);
2190 pt_prev = ptype;
2191 }
2192 *pt = pt_prev;
2193}
2194
2195static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2196{
2197 if (!ptype->af_packet_priv || !skb->sk)
2198 return false;
2199
2200 if (ptype->id_match)
2201 return ptype->id_match(ptype, skb->sk);
2202 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2203 return true;
2204
2205 return false;
2206}
2207
2208/**
2209 * dev_nit_active - return true if any network interface taps are in use
2210 *
2211 * @dev: network device to check for the presence of taps
2212 */
2213bool dev_nit_active(struct net_device *dev)
2214{
2215 return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2216}
2217EXPORT_SYMBOL_GPL(dev_nit_active);
2218
2219/*
2220 * Support routine. Sends outgoing frames to any network
2221 * taps currently in use.
2222 */
2223
2224void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2225{
2226 struct packet_type *ptype;
2227 struct sk_buff *skb2 = NULL;
2228 struct packet_type *pt_prev = NULL;
2229 struct list_head *ptype_list = &ptype_all;
2230
2231 rcu_read_lock();
2232again:
2233 list_for_each_entry_rcu(ptype, ptype_list, list) {
2234 if (ptype->ignore_outgoing)
2235 continue;
2236
2237 /* Never send packets back to the socket
2238 * they originated from - MvS (miquels@drinkel.ow.org)
2239 */
2240 if (skb_loop_sk(ptype, skb))
2241 continue;
2242
2243 if (pt_prev) {
2244 deliver_skb(skb2, pt_prev, skb->dev);
2245 pt_prev = ptype;
2246 continue;
2247 }
2248
2249 /* need to clone skb, done only once */
2250 skb2 = skb_clone(skb, GFP_ATOMIC);
2251 if (!skb2)
2252 goto out_unlock;
2253
2254 net_timestamp_set(skb2);
2255
2256 /* skb->nh should be correctly
2257 * set by sender, so that the second statement is
2258 * just protection against buggy protocols.
2259 */
2260 skb_reset_mac_header(skb2);
2261
2262 if (skb_network_header(skb2) < skb2->data ||
2263 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2264 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2265 ntohs(skb2->protocol),
2266 dev->name);
2267 skb_reset_network_header(skb2);
2268 }
2269
2270 skb2->transport_header = skb2->network_header;
2271 skb2->pkt_type = PACKET_OUTGOING;
2272 pt_prev = ptype;
2273 }
2274
2275 if (ptype_list == &ptype_all) {
2276 ptype_list = &dev->ptype_all;
2277 goto again;
2278 }
2279out_unlock:
2280 if (pt_prev) {
2281 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2282 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2283 else
2284 kfree_skb(skb2);
2285 }
2286 rcu_read_unlock();
2287}
2288EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2289
2290/**
2291 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2292 * @dev: Network device
2293 * @txq: number of queues available
2294 *
2295 * If real_num_tx_queues is changed the tc mappings may no longer be
2296 * valid. To resolve this verify the tc mapping remains valid and if
2297 * not NULL the mapping. With no priorities mapping to this
2298 * offset/count pair it will no longer be used. In the worst case TC0
2299 * is invalid nothing can be done so disable priority mappings. If is
2300 * expected that drivers will fix this mapping if they can before
2301 * calling netif_set_real_num_tx_queues.
2302 */
2303static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2304{
2305 int i;
2306 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2307
2308 /* If TC0 is invalidated disable TC mapping */
2309 if (tc->offset + tc->count > txq) {
2310 netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2311 dev->num_tc = 0;
2312 return;
2313 }
2314
2315 /* Invalidated prio to tc mappings set to TC0 */
2316 for (i = 1; i < TC_BITMASK + 1; i++) {
2317 int q = netdev_get_prio_tc_map(dev, i);
2318
2319 tc = &dev->tc_to_txq[q];
2320 if (tc->offset + tc->count > txq) {
2321 netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2322 i, q);
2323 netdev_set_prio_tc_map(dev, i, 0);
2324 }
2325 }
2326}
2327
2328int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2329{
2330 if (dev->num_tc) {
2331 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2332 int i;
2333
2334 /* walk through the TCs and see if it falls into any of them */
2335 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2336 if ((txq - tc->offset) < tc->count)
2337 return i;
2338 }
2339
2340 /* didn't find it, just return -1 to indicate no match */
2341 return -1;
2342 }
2343
2344 return 0;
2345}
2346EXPORT_SYMBOL(netdev_txq_to_tc);
2347
2348#ifdef CONFIG_XPS
2349static struct static_key xps_needed __read_mostly;
2350static struct static_key xps_rxqs_needed __read_mostly;
2351static DEFINE_MUTEX(xps_map_mutex);
2352#define xmap_dereference(P) \
2353 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2354
2355static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2356 struct xps_dev_maps *old_maps, int tci, u16 index)
2357{
2358 struct xps_map *map = NULL;
2359 int pos;
2360
2361 if (dev_maps)
2362 map = xmap_dereference(dev_maps->attr_map[tci]);
2363 if (!map)
2364 return false;
2365
2366 for (pos = map->len; pos--;) {
2367 if (map->queues[pos] != index)
2368 continue;
2369
2370 if (map->len > 1) {
2371 map->queues[pos] = map->queues[--map->len];
2372 break;
2373 }
2374
2375 if (old_maps)
2376 RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2377 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2378 kfree_rcu(map, rcu);
2379 return false;
2380 }
2381
2382 return true;
2383}
2384
2385static bool remove_xps_queue_cpu(struct net_device *dev,
2386 struct xps_dev_maps *dev_maps,
2387 int cpu, u16 offset, u16 count)
2388{
2389 int num_tc = dev_maps->num_tc;
2390 bool active = false;
2391 int tci;
2392
2393 for (tci = cpu * num_tc; num_tc--; tci++) {
2394 int i, j;
2395
2396 for (i = count, j = offset; i--; j++) {
2397 if (!remove_xps_queue(dev_maps, NULL, tci, j))
2398 break;
2399 }
2400
2401 active |= i < 0;
2402 }
2403
2404 return active;
2405}
2406
2407static void reset_xps_maps(struct net_device *dev,
2408 struct xps_dev_maps *dev_maps,
2409 enum xps_map_type type)
2410{
2411 static_key_slow_dec_cpuslocked(&xps_needed);
2412 if (type == XPS_RXQS)
2413 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2414
2415 RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2416
2417 kfree_rcu(dev_maps, rcu);
2418}
2419
2420static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2421 u16 offset, u16 count)
2422{
2423 struct xps_dev_maps *dev_maps;
2424 bool active = false;
2425 int i, j;
2426
2427 dev_maps = xmap_dereference(dev->xps_maps[type]);
2428 if (!dev_maps)
2429 return;
2430
2431 for (j = 0; j < dev_maps->nr_ids; j++)
2432 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2433 if (!active)
2434 reset_xps_maps(dev, dev_maps, type);
2435
2436 if (type == XPS_CPUS) {
2437 for (i = offset + (count - 1); count--; i--)
2438 netdev_queue_numa_node_write(
2439 netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2440 }
2441}
2442
2443static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2444 u16 count)
2445{
2446 if (!static_key_false(&xps_needed))
2447 return;
2448
2449 cpus_read_lock();
2450 mutex_lock(&xps_map_mutex);
2451
2452 if (static_key_false(&xps_rxqs_needed))
2453 clean_xps_maps(dev, XPS_RXQS, offset, count);
2454
2455 clean_xps_maps(dev, XPS_CPUS, offset, count);
2456
2457 mutex_unlock(&xps_map_mutex);
2458 cpus_read_unlock();
2459}
2460
2461static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2462{
2463 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2464}
2465
2466static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2467 u16 index, bool is_rxqs_map)
2468{
2469 struct xps_map *new_map;
2470 int alloc_len = XPS_MIN_MAP_ALLOC;
2471 int i, pos;
2472
2473 for (pos = 0; map && pos < map->len; pos++) {
2474 if (map->queues[pos] != index)
2475 continue;
2476 return map;
2477 }
2478
2479 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2480 if (map) {
2481 if (pos < map->alloc_len)
2482 return map;
2483
2484 alloc_len = map->alloc_len * 2;
2485 }
2486
2487 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2488 * map
2489 */
2490 if (is_rxqs_map)
2491 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2492 else
2493 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2494 cpu_to_node(attr_index));
2495 if (!new_map)
2496 return NULL;
2497
2498 for (i = 0; i < pos; i++)
2499 new_map->queues[i] = map->queues[i];
2500 new_map->alloc_len = alloc_len;
2501 new_map->len = pos;
2502
2503 return new_map;
2504}
2505
2506/* Copy xps maps at a given index */
2507static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2508 struct xps_dev_maps *new_dev_maps, int index,
2509 int tc, bool skip_tc)
2510{
2511 int i, tci = index * dev_maps->num_tc;
2512 struct xps_map *map;
2513
2514 /* copy maps belonging to foreign traffic classes */
2515 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2516 if (i == tc && skip_tc)
2517 continue;
2518
2519 /* fill in the new device map from the old device map */
2520 map = xmap_dereference(dev_maps->attr_map[tci]);
2521 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2522 }
2523}
2524
2525/* Must be called under cpus_read_lock */
2526int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2527 u16 index, enum xps_map_type type)
2528{
2529 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2530 const unsigned long *online_mask = NULL;
2531 bool active = false, copy = false;
2532 int i, j, tci, numa_node_id = -2;
2533 int maps_sz, num_tc = 1, tc = 0;
2534 struct xps_map *map, *new_map;
2535 unsigned int nr_ids;
2536
2537 if (dev->num_tc) {
2538 /* Do not allow XPS on subordinate device directly */
2539 num_tc = dev->num_tc;
2540 if (num_tc < 0)
2541 return -EINVAL;
2542
2543 /* If queue belongs to subordinate dev use its map */
2544 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2545
2546 tc = netdev_txq_to_tc(dev, index);
2547 if (tc < 0)
2548 return -EINVAL;
2549 }
2550
2551 mutex_lock(&xps_map_mutex);
2552
2553 dev_maps = xmap_dereference(dev->xps_maps[type]);
2554 if (type == XPS_RXQS) {
2555 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2556 nr_ids = dev->num_rx_queues;
2557 } else {
2558 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2559 if (num_possible_cpus() > 1)
2560 online_mask = cpumask_bits(cpu_online_mask);
2561 nr_ids = nr_cpu_ids;
2562 }
2563
2564 if (maps_sz < L1_CACHE_BYTES)
2565 maps_sz = L1_CACHE_BYTES;
2566
2567 /* The old dev_maps could be larger or smaller than the one we're
2568 * setting up now, as dev->num_tc or nr_ids could have been updated in
2569 * between. We could try to be smart, but let's be safe instead and only
2570 * copy foreign traffic classes if the two map sizes match.
2571 */
2572 if (dev_maps &&
2573 dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2574 copy = true;
2575
2576 /* allocate memory for queue storage */
2577 for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2578 j < nr_ids;) {
2579 if (!new_dev_maps) {
2580 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2581 if (!new_dev_maps) {
2582 mutex_unlock(&xps_map_mutex);
2583 return -ENOMEM;
2584 }
2585
2586 new_dev_maps->nr_ids = nr_ids;
2587 new_dev_maps->num_tc = num_tc;
2588 }
2589
2590 tci = j * num_tc + tc;
2591 map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2592
2593 map = expand_xps_map(map, j, index, type == XPS_RXQS);
2594 if (!map)
2595 goto error;
2596
2597 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2598 }
2599
2600 if (!new_dev_maps)
2601 goto out_no_new_maps;
2602
2603 if (!dev_maps) {
2604 /* Increment static keys at most once per type */
2605 static_key_slow_inc_cpuslocked(&xps_needed);
2606 if (type == XPS_RXQS)
2607 static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2608 }
2609
2610 for (j = 0; j < nr_ids; j++) {
2611 bool skip_tc = false;
2612
2613 tci = j * num_tc + tc;
2614 if (netif_attr_test_mask(j, mask, nr_ids) &&
2615 netif_attr_test_online(j, online_mask, nr_ids)) {
2616 /* add tx-queue to CPU/rx-queue maps */
2617 int pos = 0;
2618
2619 skip_tc = true;
2620
2621 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2622 while ((pos < map->len) && (map->queues[pos] != index))
2623 pos++;
2624
2625 if (pos == map->len)
2626 map->queues[map->len++] = index;
2627#ifdef CONFIG_NUMA
2628 if (type == XPS_CPUS) {
2629 if (numa_node_id == -2)
2630 numa_node_id = cpu_to_node(j);
2631 else if (numa_node_id != cpu_to_node(j))
2632 numa_node_id = -1;
2633 }
2634#endif
2635 }
2636
2637 if (copy)
2638 xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2639 skip_tc);
2640 }
2641
2642 rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2643
2644 /* Cleanup old maps */
2645 if (!dev_maps)
2646 goto out_no_old_maps;
2647
2648 for (j = 0; j < dev_maps->nr_ids; j++) {
2649 for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2650 map = xmap_dereference(dev_maps->attr_map[tci]);
2651 if (!map)
2652 continue;
2653
2654 if (copy) {
2655 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2656 if (map == new_map)
2657 continue;
2658 }
2659
2660 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2661 kfree_rcu(map, rcu);
2662 }
2663 }
2664
2665 old_dev_maps = dev_maps;
2666
2667out_no_old_maps:
2668 dev_maps = new_dev_maps;
2669 active = true;
2670
2671out_no_new_maps:
2672 if (type == XPS_CPUS)
2673 /* update Tx queue numa node */
2674 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2675 (numa_node_id >= 0) ?
2676 numa_node_id : NUMA_NO_NODE);
2677
2678 if (!dev_maps)
2679 goto out_no_maps;
2680
2681 /* removes tx-queue from unused CPUs/rx-queues */
2682 for (j = 0; j < dev_maps->nr_ids; j++) {
2683 tci = j * dev_maps->num_tc;
2684
2685 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2686 if (i == tc &&
2687 netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2688 netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2689 continue;
2690
2691 active |= remove_xps_queue(dev_maps,
2692 copy ? old_dev_maps : NULL,
2693 tci, index);
2694 }
2695 }
2696
2697 if (old_dev_maps)
2698 kfree_rcu(old_dev_maps, rcu);
2699
2700 /* free map if not active */
2701 if (!active)
2702 reset_xps_maps(dev, dev_maps, type);
2703
2704out_no_maps:
2705 mutex_unlock(&xps_map_mutex);
2706
2707 return 0;
2708error:
2709 /* remove any maps that we added */
2710 for (j = 0; j < nr_ids; j++) {
2711 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2712 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2713 map = copy ?
2714 xmap_dereference(dev_maps->attr_map[tci]) :
2715 NULL;
2716 if (new_map && new_map != map)
2717 kfree(new_map);
2718 }
2719 }
2720
2721 mutex_unlock(&xps_map_mutex);
2722
2723 kfree(new_dev_maps);
2724 return -ENOMEM;
2725}
2726EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2727
2728int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2729 u16 index)
2730{
2731 int ret;
2732
2733 cpus_read_lock();
2734 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2735 cpus_read_unlock();
2736
2737 return ret;
2738}
2739EXPORT_SYMBOL(netif_set_xps_queue);
2740
2741#endif
2742static void netdev_unbind_all_sb_channels(struct net_device *dev)
2743{
2744 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2745
2746 /* Unbind any subordinate channels */
2747 while (txq-- != &dev->_tx[0]) {
2748 if (txq->sb_dev)
2749 netdev_unbind_sb_channel(dev, txq->sb_dev);
2750 }
2751}
2752
2753void netdev_reset_tc(struct net_device *dev)
2754{
2755#ifdef CONFIG_XPS
2756 netif_reset_xps_queues_gt(dev, 0);
2757#endif
2758 netdev_unbind_all_sb_channels(dev);
2759
2760 /* Reset TC configuration of device */
2761 dev->num_tc = 0;
2762 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2763 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2764}
2765EXPORT_SYMBOL(netdev_reset_tc);
2766
2767int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2768{
2769 if (tc >= dev->num_tc)
2770 return -EINVAL;
2771
2772#ifdef CONFIG_XPS
2773 netif_reset_xps_queues(dev, offset, count);
2774#endif
2775 dev->tc_to_txq[tc].count = count;
2776 dev->tc_to_txq[tc].offset = offset;
2777 return 0;
2778}
2779EXPORT_SYMBOL(netdev_set_tc_queue);
2780
2781int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2782{
2783 if (num_tc > TC_MAX_QUEUE)
2784 return -EINVAL;
2785
2786#ifdef CONFIG_XPS
2787 netif_reset_xps_queues_gt(dev, 0);
2788#endif
2789 netdev_unbind_all_sb_channels(dev);
2790
2791 dev->num_tc = num_tc;
2792 return 0;
2793}
2794EXPORT_SYMBOL(netdev_set_num_tc);
2795
2796void netdev_unbind_sb_channel(struct net_device *dev,
2797 struct net_device *sb_dev)
2798{
2799 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2800
2801#ifdef CONFIG_XPS
2802 netif_reset_xps_queues_gt(sb_dev, 0);
2803#endif
2804 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2805 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2806
2807 while (txq-- != &dev->_tx[0]) {
2808 if (txq->sb_dev == sb_dev)
2809 txq->sb_dev = NULL;
2810 }
2811}
2812EXPORT_SYMBOL(netdev_unbind_sb_channel);
2813
2814int netdev_bind_sb_channel_queue(struct net_device *dev,
2815 struct net_device *sb_dev,
2816 u8 tc, u16 count, u16 offset)
2817{
2818 /* Make certain the sb_dev and dev are already configured */
2819 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2820 return -EINVAL;
2821
2822 /* We cannot hand out queues we don't have */
2823 if ((offset + count) > dev->real_num_tx_queues)
2824 return -EINVAL;
2825
2826 /* Record the mapping */
2827 sb_dev->tc_to_txq[tc].count = count;
2828 sb_dev->tc_to_txq[tc].offset = offset;
2829
2830 /* Provide a way for Tx queue to find the tc_to_txq map or
2831 * XPS map for itself.
2832 */
2833 while (count--)
2834 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2835
2836 return 0;
2837}
2838EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2839
2840int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2841{
2842 /* Do not use a multiqueue device to represent a subordinate channel */
2843 if (netif_is_multiqueue(dev))
2844 return -ENODEV;
2845
2846 /* We allow channels 1 - 32767 to be used for subordinate channels.
2847 * Channel 0 is meant to be "native" mode and used only to represent
2848 * the main root device. We allow writing 0 to reset the device back
2849 * to normal mode after being used as a subordinate channel.
2850 */
2851 if (channel > S16_MAX)
2852 return -EINVAL;
2853
2854 dev->num_tc = -channel;
2855
2856 return 0;
2857}
2858EXPORT_SYMBOL(netdev_set_sb_channel);
2859
2860/*
2861 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2862 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2863 */
2864int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2865{
2866 bool disabling;
2867 int rc;
2868
2869 disabling = txq < dev->real_num_tx_queues;
2870
2871 if (txq < 1 || txq > dev->num_tx_queues)
2872 return -EINVAL;
2873
2874 if (dev->reg_state == NETREG_REGISTERED ||
2875 dev->reg_state == NETREG_UNREGISTERING) {
2876 ASSERT_RTNL();
2877
2878 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2879 txq);
2880 if (rc)
2881 return rc;
2882
2883 if (dev->num_tc)
2884 netif_setup_tc(dev, txq);
2885
2886 dev_qdisc_change_real_num_tx(dev, txq);
2887
2888 dev->real_num_tx_queues = txq;
2889
2890 if (disabling) {
2891 synchronize_net();
2892 qdisc_reset_all_tx_gt(dev, txq);
2893#ifdef CONFIG_XPS
2894 netif_reset_xps_queues_gt(dev, txq);
2895#endif
2896 }
2897 } else {
2898 dev->real_num_tx_queues = txq;
2899 }
2900
2901 return 0;
2902}
2903EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2904
2905#ifdef CONFIG_SYSFS
2906/**
2907 * netif_set_real_num_rx_queues - set actual number of RX queues used
2908 * @dev: Network device
2909 * @rxq: Actual number of RX queues
2910 *
2911 * This must be called either with the rtnl_lock held or before
2912 * registration of the net device. Returns 0 on success, or a
2913 * negative error code. If called before registration, it always
2914 * succeeds.
2915 */
2916int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2917{
2918 int rc;
2919
2920 if (rxq < 1 || rxq > dev->num_rx_queues)
2921 return -EINVAL;
2922
2923 if (dev->reg_state == NETREG_REGISTERED) {
2924 ASSERT_RTNL();
2925
2926 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2927 rxq);
2928 if (rc)
2929 return rc;
2930 }
2931
2932 dev->real_num_rx_queues = rxq;
2933 return 0;
2934}
2935EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2936#endif
2937
2938/**
2939 * netif_set_real_num_queues - set actual number of RX and TX queues used
2940 * @dev: Network device
2941 * @txq: Actual number of TX queues
2942 * @rxq: Actual number of RX queues
2943 *
2944 * Set the real number of both TX and RX queues.
2945 * Does nothing if the number of queues is already correct.
2946 */
2947int netif_set_real_num_queues(struct net_device *dev,
2948 unsigned int txq, unsigned int rxq)
2949{
2950 unsigned int old_rxq = dev->real_num_rx_queues;
2951 int err;
2952
2953 if (txq < 1 || txq > dev->num_tx_queues ||
2954 rxq < 1 || rxq > dev->num_rx_queues)
2955 return -EINVAL;
2956
2957 /* Start from increases, so the error path only does decreases -
2958 * decreases can't fail.
2959 */
2960 if (rxq > dev->real_num_rx_queues) {
2961 err = netif_set_real_num_rx_queues(dev, rxq);
2962 if (err)
2963 return err;
2964 }
2965 if (txq > dev->real_num_tx_queues) {
2966 err = netif_set_real_num_tx_queues(dev, txq);
2967 if (err)
2968 goto undo_rx;
2969 }
2970 if (rxq < dev->real_num_rx_queues)
2971 WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
2972 if (txq < dev->real_num_tx_queues)
2973 WARN_ON(netif_set_real_num_tx_queues(dev, txq));
2974
2975 return 0;
2976undo_rx:
2977 WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
2978 return err;
2979}
2980EXPORT_SYMBOL(netif_set_real_num_queues);
2981
2982/**
2983 * netif_set_tso_max_size() - set the max size of TSO frames supported
2984 * @dev: netdev to update
2985 * @size: max skb->len of a TSO frame
2986 *
2987 * Set the limit on the size of TSO super-frames the device can handle.
2988 * Unless explicitly set the stack will assume the value of
2989 * %GSO_LEGACY_MAX_SIZE.
2990 */
2991void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
2992{
2993 dev->tso_max_size = min(GSO_MAX_SIZE, size);
2994 if (size < READ_ONCE(dev->gso_max_size))
2995 netif_set_gso_max_size(dev, size);
2996}
2997EXPORT_SYMBOL(netif_set_tso_max_size);
2998
2999/**
3000 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3001 * @dev: netdev to update
3002 * @segs: max number of TCP segments
3003 *
3004 * Set the limit on the number of TCP segments the device can generate from
3005 * a single TSO super-frame.
3006 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3007 */
3008void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3009{
3010 dev->tso_max_segs = segs;
3011 if (segs < READ_ONCE(dev->gso_max_segs))
3012 netif_set_gso_max_segs(dev, segs);
3013}
3014EXPORT_SYMBOL(netif_set_tso_max_segs);
3015
3016/**
3017 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3018 * @to: netdev to update
3019 * @from: netdev from which to copy the limits
3020 */
3021void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3022{
3023 netif_set_tso_max_size(to, from->tso_max_size);
3024 netif_set_tso_max_segs(to, from->tso_max_segs);
3025}
3026EXPORT_SYMBOL(netif_inherit_tso_max);
3027
3028/**
3029 * netif_get_num_default_rss_queues - default number of RSS queues
3030 *
3031 * Default value is the number of physical cores if there are only 1 or 2, or
3032 * divided by 2 if there are more.
3033 */
3034int netif_get_num_default_rss_queues(void)
3035{
3036 cpumask_var_t cpus;
3037 int cpu, count = 0;
3038
3039 if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3040 return 1;
3041
3042 cpumask_copy(cpus, cpu_online_mask);
3043 for_each_cpu(cpu, cpus) {
3044 ++count;
3045 cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3046 }
3047 free_cpumask_var(cpus);
3048
3049 return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3050}
3051EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3052
3053static void __netif_reschedule(struct Qdisc *q)
3054{
3055 struct softnet_data *sd;
3056 unsigned long flags;
3057
3058 local_irq_save(flags);
3059 sd = this_cpu_ptr(&softnet_data);
3060 q->next_sched = NULL;
3061 *sd->output_queue_tailp = q;
3062 sd->output_queue_tailp = &q->next_sched;
3063 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3064 local_irq_restore(flags);
3065}
3066
3067void __netif_schedule(struct Qdisc *q)
3068{
3069 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3070 __netif_reschedule(q);
3071}
3072EXPORT_SYMBOL(__netif_schedule);
3073
/* Layout of skb->cb while an skb waits on the per-CPU completion queue;
 * accessed through get_kfree_skb_cb().
 */
struct dev_kfree_skb_cb {
	/* reason passed to __dev_kfree_skb_irq() for this skb */
	enum skb_free_reason reason;
};
3077
3078static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3079{
3080 return (struct dev_kfree_skb_cb *)skb->cb;
3081}
3082
3083void netif_schedule_queue(struct netdev_queue *txq)
3084{
3085 rcu_read_lock();
3086 if (!netif_xmit_stopped(txq)) {
3087 struct Qdisc *q = rcu_dereference(txq->qdisc);
3088
3089 __netif_schedule(q);
3090 }
3091 rcu_read_unlock();
3092}
3093EXPORT_SYMBOL(netif_schedule_queue);
3094
3095void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3096{
3097 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3098 struct Qdisc *q;
3099
3100 rcu_read_lock();
3101 q = rcu_dereference(dev_queue->qdisc);
3102 __netif_schedule(q);
3103 rcu_read_unlock();
3104 }
3105}
3106EXPORT_SYMBOL(netif_tx_wake_queue);
3107
3108void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
3109{
3110 unsigned long flags;
3111
3112 if (unlikely(!skb))
3113 return;
3114
3115 if (likely(refcount_read(&skb->users) == 1)) {
3116 smp_rmb();
3117 refcount_set(&skb->users, 0);
3118 } else if (likely(!refcount_dec_and_test(&skb->users))) {
3119 return;
3120 }
3121 get_kfree_skb_cb(skb)->reason = reason;
3122 local_irq_save(flags);
3123 skb->next = __this_cpu_read(softnet_data.completion_queue);
3124 __this_cpu_write(softnet_data.completion_queue, skb);
3125 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3126 local_irq_restore(flags);
3127}
3128EXPORT_SYMBOL(__dev_kfree_skb_irq);
3129
3130void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3131{
3132 if (in_hardirq() || irqs_disabled())
3133 __dev_kfree_skb_irq(skb, reason);
3134 else
3135 dev_kfree_skb(skb);
3136}
3137EXPORT_SYMBOL(__dev_kfree_skb_any);
3138
3139
3140/**
3141 * netif_device_detach - mark device as removed
3142 * @dev: network device
3143 *
3144 * Mark device as removed from system and therefore no longer available.
3145 */
3146void netif_device_detach(struct net_device *dev)
3147{
3148 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3149 netif_running(dev)) {
3150 netif_tx_stop_all_queues(dev);
3151 }
3152}
3153EXPORT_SYMBOL(netif_device_detach);
3154
3155/**
3156 * netif_device_attach - mark device as attached
3157 * @dev: network device
3158 *
3159 * Mark device as attached from system and restart if needed.
3160 */
3161void netif_device_attach(struct net_device *dev)
3162{
3163 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3164 netif_running(dev)) {
3165 netif_tx_wake_all_queues(dev);
3166 __netdev_watchdog_up(dev);
3167 }
3168}
3169EXPORT_SYMBOL(netif_device_attach);
3170
3171/*
3172 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
3173 * to be used as a distribution range.
3174 */
3175static u16 skb_tx_hash(const struct net_device *dev,
3176 const struct net_device *sb_dev,
3177 struct sk_buff *skb)
3178{
3179 u32 hash;
3180 u16 qoffset = 0;
3181 u16 qcount = dev->real_num_tx_queues;
3182
3183 if (dev->num_tc) {
3184 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3185
3186 qoffset = sb_dev->tc_to_txq[tc].offset;
3187 qcount = sb_dev->tc_to_txq[tc].count;
3188 if (unlikely(!qcount)) {
3189 net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3190 sb_dev->name, qoffset, tc);
3191 qoffset = 0;
3192 qcount = dev->real_num_tx_queues;
3193 }
3194 }
3195
3196 if (skb_rx_queue_recorded(skb)) {
3197 hash = skb_get_rx_queue(skb);
3198 if (hash >= qoffset)
3199 hash -= qoffset;
3200 while (unlikely(hash >= qcount))
3201 hash -= qcount;
3202 return hash + qoffset;
3203 }
3204
3205 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3206}
3207
3208static void skb_warn_bad_offload(const struct sk_buff *skb)
3209{
3210 static const netdev_features_t null_features;
3211 struct net_device *dev = skb->dev;
3212 const char *name = "";
3213
3214 if (!net_ratelimit())
3215 return;
3216
3217 if (dev) {
3218 if (dev->dev.parent)
3219 name = dev_driver_string(dev->dev.parent);
3220 else
3221 name = netdev_name(dev);
3222 }
3223 skb_dump(KERN_WARNING, skb, false);
3224 WARN(1, "%s: caps=(%pNF, %pNF)\n",
3225 name, dev ? &dev->features : &null_features,
3226 skb->sk ? &skb->sk->sk_route_caps : &null_features);
3227}
3228
3229/*
3230 * Invalidate hardware checksum when packet is to be mangled, and
3231 * complete checksum manually on outgoing path.
3232 */
3233int skb_checksum_help(struct sk_buff *skb)
3234{
3235 __wsum csum;
3236 int ret = 0, offset;
3237
3238 if (skb->ip_summed == CHECKSUM_COMPLETE)
3239 goto out_set_summed;
3240
3241 if (unlikely(skb_is_gso(skb))) {
3242 skb_warn_bad_offload(skb);
3243 return -EINVAL;
3244 }
3245
3246 /* Before computing a checksum, we should make sure no frag could
3247 * be modified by an external entity : checksum could be wrong.
3248 */
3249 if (skb_has_shared_frag(skb)) {
3250 ret = __skb_linearize(skb);
3251 if (ret)
3252 goto out;
3253 }
3254
3255 offset = skb_checksum_start_offset(skb);
3256 ret = -EINVAL;
3257 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3258 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3259 goto out;
3260 }
3261 csum = skb_checksum(skb, offset, skb->len - offset, 0);
3262
3263 offset += skb->csum_offset;
3264 if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) {
3265 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3266 goto out;
3267 }
3268 ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3269 if (ret)
3270 goto out;
3271
3272 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3273out_set_summed:
3274 skb->ip_summed = CHECKSUM_NONE;
3275out:
3276 return ret;
3277}
3278EXPORT_SYMBOL(skb_checksum_help);
3279
3280int skb_crc32c_csum_help(struct sk_buff *skb)
3281{
3282 __le32 crc32c_csum;
3283 int ret = 0, offset, start;
3284
3285 if (skb->ip_summed != CHECKSUM_PARTIAL)
3286 goto out;
3287
3288 if (unlikely(skb_is_gso(skb)))
3289 goto out;
3290
3291 /* Before computing a checksum, we should make sure no frag could
3292 * be modified by an external entity : checksum could be wrong.
3293 */
3294 if (unlikely(skb_has_shared_frag(skb))) {
3295 ret = __skb_linearize(skb);
3296 if (ret)
3297 goto out;
3298 }
3299 start = skb_checksum_start_offset(skb);
3300 offset = start + offsetof(struct sctphdr, checksum);
3301 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3302 ret = -EINVAL;
3303 goto out;
3304 }
3305
3306 ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3307 if (ret)
3308 goto out;
3309
3310 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3311 skb->len - start, ~(__u32)0,
3312 crc32c_csum_stub));
3313 *(__le32 *)(skb->data + offset) = crc32c_csum;
3314 skb->ip_summed = CHECKSUM_NONE;
3315 skb->csum_not_inet = 0;
3316out:
3317 return ret;
3318}
3319
3320__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3321{
3322 __be16 type = skb->protocol;
3323
3324 /* Tunnel gso handlers can set protocol to ethernet. */
3325 if (type == htons(ETH_P_TEB)) {
3326 struct ethhdr *eth;
3327
3328 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3329 return 0;
3330
3331 eth = (struct ethhdr *)skb->data;
3332 type = eth->h_proto;
3333 }
3334
3335 return __vlan_get_protocol(skb, type, depth);
3336}
3337
3338/* openvswitch calls this on rx path, so we need a different check.
3339 */
3340static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3341{
3342 if (tx_path)
3343 return skb->ip_summed != CHECKSUM_PARTIAL &&
3344 skb->ip_summed != CHECKSUM_UNNECESSARY;
3345
3346 return skb->ip_summed == CHECKSUM_NONE;
3347}
3348
3349/**
3350 * __skb_gso_segment - Perform segmentation on skb.
3351 * @skb: buffer to segment
3352 * @features: features for the output path (see dev->features)
3353 * @tx_path: whether it is called in TX path
3354 *
3355 * This function segments the given skb and returns a list of segments.
3356 *
3357 * It may return NULL if the skb requires no segmentation. This is
3358 * only possible when GSO is used for verifying header integrity.
3359 *
3360 * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3361 */
3362struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3363 netdev_features_t features, bool tx_path)
3364{
3365 struct sk_buff *segs;
3366
3367 if (unlikely(skb_needs_check(skb, tx_path))) {
3368 int err;
3369
3370 /* We're going to init ->check field in TCP or UDP header */
3371 err = skb_cow_head(skb, 0);
3372 if (err < 0)
3373 return ERR_PTR(err);
3374 }
3375
3376 /* Only report GSO partial support if it will enable us to
3377 * support segmentation on this frame without needing additional
3378 * work.
3379 */
3380 if (features & NETIF_F_GSO_PARTIAL) {
3381 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3382 struct net_device *dev = skb->dev;
3383
3384 partial_features |= dev->features & dev->gso_partial_features;
3385 if (!skb_gso_ok(skb, features | partial_features))
3386 features &= ~NETIF_F_GSO_PARTIAL;
3387 }
3388
3389 BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3390 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3391
3392 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3393 SKB_GSO_CB(skb)->encap_level = 0;
3394
3395 skb_reset_mac_header(skb);
3396 skb_reset_mac_len(skb);
3397
3398 segs = skb_mac_gso_segment(skb, features);
3399
3400 if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3401 skb_warn_bad_offload(skb);
3402
3403 return segs;
3404}
3405EXPORT_SYMBOL(__skb_gso_segment);
3406
3407/* Take action when hardware reception checksum errors are detected. */
3408#ifdef CONFIG_BUG
3409static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3410{
3411 netdev_err(dev, "hw csum failure\n");
3412 skb_dump(KERN_ERR, skb, true);
3413 dump_stack();
3414}
3415
3416void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3417{
3418 DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3419}
3420EXPORT_SYMBOL(netdev_rx_csum_fault);
3421#endif
3422
/* Returns 1 when @dev lacks NETIF_F_HIGHDMA and @skb carries a fragment in
 * a highmem page, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 *
 * XXX: check that highmem exists at all on the given machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		if (PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[i])))
			return 1;
	}
#endif
	return 0;
}
3440
3441/* If MPLS offload request, verify we are testing hardware MPLS features
3442 * instead of standard features for the netdev.
3443 */
3444#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3445static netdev_features_t net_mpls_features(struct sk_buff *skb,
3446 netdev_features_t features,
3447 __be16 type)
3448{
3449 if (eth_p_mpls(type))
3450 features &= skb->dev->mpls_features;
3451
3452 return features;
3453}
3454#else
3455static netdev_features_t net_mpls_features(struct sk_buff *skb,
3456 netdev_features_t features,
3457 __be16 type)
3458{
3459 return features;
3460}
3461#endif
3462
3463static netdev_features_t harmonize_features(struct sk_buff *skb,
3464 netdev_features_t features)
3465{
3466 __be16 type;
3467
3468 type = skb_network_protocol(skb, NULL);
3469 features = net_mpls_features(skb, features, type);
3470
3471 if (skb->ip_summed != CHECKSUM_NONE &&
3472 !can_checksum_protocol(features, type)) {
3473 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3474 }
3475 if (illegal_highdma(skb->dev, skb))
3476 features &= ~NETIF_F_SG;
3477
3478 return features;
3479}
3480
3481netdev_features_t passthru_features_check(struct sk_buff *skb,
3482 struct net_device *dev,
3483 netdev_features_t features)
3484{
3485 return features;
3486}
3487EXPORT_SYMBOL(passthru_features_check);
3488
3489static netdev_features_t dflt_features_check(struct sk_buff *skb,
3490 struct net_device *dev,
3491 netdev_features_t features)
3492{
3493 return vlan_features_check(skb, features);
3494}
3495
3496static netdev_features_t gso_features_check(const struct sk_buff *skb,
3497 struct net_device *dev,
3498 netdev_features_t features)
3499{
3500 u16 gso_segs = skb_shinfo(skb)->gso_segs;
3501
3502 if (gso_segs > READ_ONCE(dev->gso_max_segs))
3503 return features & ~NETIF_F_GSO_MASK;
3504
3505 if (!skb_shinfo(skb)->gso_type) {
3506 skb_warn_bad_offload(skb);
3507 return features & ~NETIF_F_GSO_MASK;
3508 }
3509
3510 /* Support for GSO partial features requires software
3511 * intervention before we can actually process the packets
3512 * so we need to strip support for any partial features now
3513 * and we can pull them back in after we have partially
3514 * segmented the frame.
3515 */
3516 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3517 features &= ~dev->gso_partial_features;
3518
3519 /* Make sure to clear the IPv4 ID mangling feature if the
3520 * IPv4 header has the potential to be fragmented.
3521 */
3522 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3523 struct iphdr *iph = skb->encapsulation ?
3524 inner_ip_hdr(skb) : ip_hdr(skb);
3525
3526 if (!(iph->frag_off & htons(IP_DF)))
3527 features &= ~NETIF_F_TSO_MANGLEID;
3528 }
3529
3530 return features;
3531}
3532
3533netdev_features_t netif_skb_features(struct sk_buff *skb)
3534{
3535 struct net_device *dev = skb->dev;
3536 netdev_features_t features = dev->features;
3537
3538 if (skb_is_gso(skb))
3539 features = gso_features_check(skb, dev, features);
3540
3541 /* If encapsulation offload request, verify we are testing
3542 * hardware encapsulation features instead of standard
3543 * features for the netdev
3544 */
3545 if (skb->encapsulation)
3546 features &= dev->hw_enc_features;
3547
3548 if (skb_vlan_tagged(skb))
3549 features = netdev_intersect_features(features,
3550 dev->vlan_features |
3551 NETIF_F_HW_VLAN_CTAG_TX |
3552 NETIF_F_HW_VLAN_STAG_TX);
3553
3554 if (dev->netdev_ops->ndo_features_check)
3555 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3556 features);
3557 else
3558 features &= dflt_features_check(skb, dev, features);
3559
3560 return harmonize_features(skb, features);
3561}
3562EXPORT_SYMBOL(netif_skb_features);
3563
3564static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3565 struct netdev_queue *txq, bool more)
3566{
3567 unsigned int len;
3568 int rc;
3569
3570 if (dev_nit_active(dev))
3571 dev_queue_xmit_nit(skb, dev);
3572
3573 len = skb->len;
3574 trace_net_dev_start_xmit(skb, dev);
3575 rc = netdev_start_xmit(skb, dev, txq, more);
3576 trace_net_dev_xmit(skb, rc, dev, len);
3577
3578 return rc;
3579}
3580
3581struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3582 struct netdev_queue *txq, int *ret)
3583{
3584 struct sk_buff *skb = first;
3585 int rc = NETDEV_TX_OK;
3586
3587 while (skb) {
3588 struct sk_buff *next = skb->next;
3589
3590 skb_mark_not_on_list(skb);
3591 rc = xmit_one(skb, dev, txq, next != NULL);
3592 if (unlikely(!dev_xmit_complete(rc))) {
3593 skb->next = next;
3594 goto out;
3595 }
3596
3597 skb = next;
3598 if (netif_tx_queue_stopped(txq) && skb) {
3599 rc = NETDEV_TX_BUSY;
3600 break;
3601 }
3602 }
3603
3604out:
3605 *ret = rc;
3606 return skb;
3607}
3608
3609static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3610 netdev_features_t features)
3611{
3612 if (skb_vlan_tag_present(skb) &&
3613 !vlan_hw_offload_capable(features, skb->vlan_proto))
3614 skb = __vlan_hwaccel_push_inside(skb);
3615 return skb;
3616}
3617
3618int skb_csum_hwoffload_help(struct sk_buff *skb,
3619 const netdev_features_t features)
3620{
3621 if (unlikely(skb_csum_is_sctp(skb)))
3622 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3623 skb_crc32c_csum_help(skb);
3624
3625 if (features & NETIF_F_HW_CSUM)
3626 return 0;
3627
3628 if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3629 switch (skb->csum_offset) {
3630 case offsetof(struct tcphdr, check):
3631 case offsetof(struct udphdr, check):
3632 return 0;
3633 }
3634 }
3635
3636 return skb_checksum_help(skb);
3637}
3638EXPORT_SYMBOL(skb_csum_hwoffload_help);
3639
3640static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3641{
3642 netdev_features_t features;
3643
3644 features = netif_skb_features(skb);
3645 skb = validate_xmit_vlan(skb, features);
3646 if (unlikely(!skb))
3647 goto out_null;
3648
3649 skb = sk_validate_xmit_skb(skb, dev);
3650 if (unlikely(!skb))
3651 goto out_null;
3652
3653 if (netif_needs_gso(skb, features)) {
3654 struct sk_buff *segs;
3655
3656 segs = skb_gso_segment(skb, features);
3657 if (IS_ERR(segs)) {
3658 goto out_kfree_skb;
3659 } else if (segs) {
3660 consume_skb(skb);
3661 skb = segs;
3662 }
3663 } else {
3664 if (skb_needs_linearize(skb, features) &&
3665 __skb_linearize(skb))
3666 goto out_kfree_skb;
3667
3668 /* If packet is not checksummed and device does not
3669 * support checksumming for this protocol, complete
3670 * checksumming here.
3671 */
3672 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3673 if (skb->encapsulation)
3674 skb_set_inner_transport_header(skb,
3675 skb_checksum_start_offset(skb));
3676 else
3677 skb_set_transport_header(skb,
3678 skb_checksum_start_offset(skb));
3679 if (skb_csum_hwoffload_help(skb, features))
3680 goto out_kfree_skb;
3681 }
3682 }
3683
3684 skb = validate_xmit_xfrm(skb, features, again);
3685
3686 return skb;
3687
3688out_kfree_skb:
3689 kfree_skb(skb);
3690out_null:
3691 dev_core_stats_tx_dropped_inc(dev);
3692 return NULL;
3693}
3694
3695struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3696{
3697 struct sk_buff *next, *head = NULL, *tail;
3698
3699 for (; skb != NULL; skb = next) {
3700 next = skb->next;
3701 skb_mark_not_on_list(skb);
3702
3703 /* in case skb wont be segmented, point to itself */
3704 skb->prev = skb;
3705
3706 skb = validate_xmit_skb(skb, dev, again);
3707 if (!skb)
3708 continue;
3709
3710 if (!head)
3711 head = skb;
3712 else
3713 tail->next = skb;
3714 /* If skb was segmented, skb->prev points to
3715 * the last segment. If not, it still contains skb.
3716 */
3717 tail = skb->prev;
3718 }
3719 return head;
3720}
3721EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3722
3723static void qdisc_pkt_len_init(struct sk_buff *skb)
3724{
3725 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3726
3727 qdisc_skb_cb(skb)->pkt_len = skb->len;
3728
3729 /* To get more precise estimation of bytes sent on wire,
3730 * we add to pkt_len the headers size of all segments
3731 */
3732 if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3733 unsigned int hdr_len;
3734 u16 gso_segs = shinfo->gso_segs;
3735
3736 /* mac layer + network layer */
3737 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3738
3739 /* + transport layer */
3740 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3741 const struct tcphdr *th;
3742 struct tcphdr _tcphdr;
3743
3744 th = skb_header_pointer(skb, skb_transport_offset(skb),
3745 sizeof(_tcphdr), &_tcphdr);
3746 if (likely(th))
3747 hdr_len += __tcp_hdrlen(th);
3748 } else {
3749 struct udphdr _udphdr;
3750
3751 if (skb_header_pointer(skb, skb_transport_offset(skb),
3752 sizeof(_udphdr), &_udphdr))
3753 hdr_len += sizeof(struct udphdr);
3754 }
3755
3756 if (shinfo->gso_type & SKB_GSO_DODGY)
3757 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3758 shinfo->gso_size);
3759
3760 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3761 }
3762}
3763
3764static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
3765 struct sk_buff **to_free,
3766 struct netdev_queue *txq)
3767{
3768 int rc;
3769
3770 rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
3771 if (rc == NET_XMIT_SUCCESS)
3772 trace_qdisc_enqueue(q, txq, skb);
3773 return rc;
3774}
3775
3776static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3777 struct net_device *dev,
3778 struct netdev_queue *txq)
3779{
3780 spinlock_t *root_lock = qdisc_lock(q);
3781 struct sk_buff *to_free = NULL;
3782 bool contended;
3783 int rc;
3784
3785 qdisc_calculate_pkt_len(skb, q);
3786
3787 if (q->flags & TCQ_F_NOLOCK) {
3788 if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
3789 qdisc_run_begin(q)) {
3790 /* Retest nolock_qdisc_is_empty() within the protection
3791 * of q->seqlock to protect from racing with requeuing.
3792 */
3793 if (unlikely(!nolock_qdisc_is_empty(q))) {
3794 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3795 __qdisc_run(q);
3796 qdisc_run_end(q);
3797
3798 goto no_lock_out;
3799 }
3800
3801 qdisc_bstats_cpu_update(q, skb);
3802 if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
3803 !nolock_qdisc_is_empty(q))
3804 __qdisc_run(q);
3805
3806 qdisc_run_end(q);
3807 return NET_XMIT_SUCCESS;
3808 }
3809
3810 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3811 qdisc_run(q);
3812
3813no_lock_out:
3814 if (unlikely(to_free))
3815 kfree_skb_list_reason(to_free,
3816 SKB_DROP_REASON_QDISC_DROP);
3817 return rc;
3818 }
3819
3820 /*
3821 * Heuristic to force contended enqueues to serialize on a
3822 * separate lock before trying to get qdisc main lock.
3823 * This permits qdisc->running owner to get the lock more
3824 * often and dequeue packets faster.
3825 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
3826 * and then other tasks will only enqueue packets. The packets will be
3827 * sent after the qdisc owner is scheduled again. To prevent this
3828 * scenario the task always serialize on the lock.
3829 */
3830 contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
3831 if (unlikely(contended))
3832 spin_lock(&q->busylock);
3833
3834 spin_lock(root_lock);
3835 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3836 __qdisc_drop(skb, &to_free);
3837 rc = NET_XMIT_DROP;
3838 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3839 qdisc_run_begin(q)) {
3840 /*
3841 * This is a work-conserving queue; there are no old skbs
3842 * waiting to be sent out; and the qdisc is not running -
3843 * xmit the skb directly.
3844 */
3845
3846 qdisc_bstats_update(q, skb);
3847
3848 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3849 if (unlikely(contended)) {
3850 spin_unlock(&q->busylock);
3851 contended = false;
3852 }
3853 __qdisc_run(q);
3854 }
3855
3856 qdisc_run_end(q);
3857 rc = NET_XMIT_SUCCESS;
3858 } else {
3859 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3860 if (qdisc_run_begin(q)) {
3861 if (unlikely(contended)) {
3862 spin_unlock(&q->busylock);
3863 contended = false;
3864 }
3865 __qdisc_run(q);
3866 qdisc_run_end(q);
3867 }
3868 }
3869 spin_unlock(root_lock);
3870 if (unlikely(to_free))
3871 kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
3872 if (unlikely(contended))
3873 spin_unlock(&q->busylock);
3874 return rc;
3875}
3876
3877#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3878static void skb_update_prio(struct sk_buff *skb)
3879{
3880 const struct netprio_map *map;
3881 const struct sock *sk;
3882 unsigned int prioidx;
3883
3884 if (skb->priority)
3885 return;
3886 map = rcu_dereference_bh(skb->dev->priomap);
3887 if (!map)
3888 return;
3889 sk = skb_to_full_sk(skb);
3890 if (!sk)
3891 return;
3892
3893 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3894
3895 if (prioidx < map->priomap_len)
3896 skb->priority = map->priomap[prioidx];
3897}
3898#else
3899#define skb_update_prio(skb)
3900#endif
3901
3902/**
3903 * dev_loopback_xmit - loop back @skb
3904 * @net: network namespace this loopback is happening in
3905 * @sk: sk needed to be a netfilter okfn
3906 * @skb: buffer to transmit
3907 */
3908int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3909{
3910 skb_reset_mac_header(skb);
3911 __skb_pull(skb, skb_network_offset(skb));
3912 skb->pkt_type = PACKET_LOOPBACK;
3913 if (skb->ip_summed == CHECKSUM_NONE)
3914 skb->ip_summed = CHECKSUM_UNNECESSARY;
3915 DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
3916 skb_dst_force(skb);
3917 netif_rx(skb);
3918 return 0;
3919}
3920EXPORT_SYMBOL(dev_loopback_xmit);
3921
3922#ifdef CONFIG_NET_EGRESS
/*
 * Run the egress classifier of @dev, if one is attached, on @skb.
 *
 * Returns @skb when transmission should continue.  Returns NULL when the
 * classifier dropped, consumed or redirected the packet; *@ret then holds
 * the NET_XMIT_* code to report.  Without CONFIG_NET_CLS_ACT this is a
 * no-op returning @skb.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
	struct tcf_result cl_res;
	int verdict;

	if (!miniq)
		return skb;

	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
	tc_skb_cb(skb)->mru = 0;
	tc_skb_cb(skb)->post_ct = false;
	mini_qdisc_bstats_cpu_update(miniq, skb);

	verdict = tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false);
	switch (verdict) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		mini_qdisc_qstats_cpu_drop(miniq);
		*ret = NET_XMIT_DROP;
		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		/* Any other verdict lets the packet through unchanged. */
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */

	return skb;
}
3966
/*
 * Return the TX queue of @dev selected by the queue mapping already
 * stored in @skb, after passing the index through netdev_cap_txqueue().
 */
static struct netdev_queue *
netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
{
	int mapped = skb_get_queue_mapping(skb);
	int capped = netdev_cap_txqueue(dev, mapped);

	return netdev_get_tx_queue(dev, capped);
}
3974
3975static bool netdev_xmit_txqueue_skipped(void)
3976{
3977 return __this_cpu_read(softnet_data.xmit.skip_txqueue);
3978}
3979
3980void netdev_xmit_skip_txqueue(bool skip)
3981{
3982 __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
3983}
3984EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
3985#endif /* CONFIG_NET_EGRESS */
3986
3987#ifdef CONFIG_XPS
3988static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3989 struct xps_dev_maps *dev_maps, unsigned int tci)
3990{
3991 int tc = netdev_get_prio_tc_map(dev, skb->priority);
3992 struct xps_map *map;
3993 int queue_index = -1;
3994
3995 if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
3996 return queue_index;
3997
3998 tci *= dev_maps->num_tc;
3999 tci += tc;
4000
4001 map = rcu_dereference(dev_maps->attr_map[tci]);
4002 if (map) {
4003 if (map->len == 1)
4004 queue_index = map->queues[0];
4005 else
4006 queue_index = map->queues[reciprocal_scale(
4007 skb_get_hash(skb), map->len)];
4008 if (unlikely(queue_index >= dev->real_num_tx_queues))
4009 queue_index = -1;
4010 }
4011 return queue_index;
4012}
4013#endif
4014
/*
 * Pick a TX queue for @skb from the XPS maps of @sb_dev.
 *
 * The RX-queue based map is consulted first (when enabled and the socket
 * has a recorded RX queue); the CPU based map, indexed by
 * skb->sender_cpu - 1, is the fallback.
 *
 * Returns the queue index, or -1 when XPS is not in use or yields no
 * queue.  Always -1 without CONFIG_XPS.
 */
static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
			 struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct sock *sk = skb->sk;
	int queue_index = -1;

	if (!static_key_false(&xps_needed))
		return -1;

	rcu_read_lock();

	if (static_key_false(&xps_rxqs_needed)) {
		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
		if (dev_maps) {
			int rxq = sk_rx_queue_get(sk);

			if (rxq >= 0)
				queue_index = __get_xps_queue_idx(dev, skb,
								  dev_maps,
								  rxq);
		}
	}

	if (queue_index < 0) {
		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
		if (dev_maps) {
			unsigned int cpu_idx = skb->sender_cpu - 1;

			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
							  cpu_idx);
		}
	}

	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
4056
4057u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4058 struct net_device *sb_dev)
4059{
4060 return 0;
4061}
4062EXPORT_SYMBOL(dev_pick_tx_zero);
4063
4064u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4065 struct net_device *sb_dev)
4066{
4067 return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4068}
4069EXPORT_SYMBOL(dev_pick_tx_cpu_id);
4070
4071u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4072 struct net_device *sb_dev)
4073{
4074 struct sock *sk = skb->sk;
4075 int queue_index = sk_tx_queue_get(sk);
4076
4077 sb_dev = sb_dev ? : dev;
4078
4079 if (queue_index < 0 || skb->ooo_okay ||
4080 queue_index >= dev->real_num_tx_queues) {
4081 int new_index = get_xps_queue(dev, sb_dev, skb);
4082
4083 if (new_index < 0)
4084 new_index = skb_tx_hash(dev, sb_dev, skb);
4085
4086 if (queue_index != new_index && sk &&
4087 sk_fullsock(sk) &&
4088 rcu_access_pointer(sk->sk_dst_cache))
4089 sk_tx_queue_set(sk, new_index);
4090
4091 queue_index = new_index;
4092 }
4093
4094 return queue_index;
4095}
4096EXPORT_SYMBOL(netdev_pick_tx);
4097
4098struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4099 struct sk_buff *skb,
4100 struct net_device *sb_dev)
4101{
4102 int queue_index = 0;
4103
4104#ifdef CONFIG_XPS
4105 u32 sender_cpu = skb->sender_cpu - 1;
4106
4107 if (sender_cpu >= (u32)NR_CPUS)
4108 skb->sender_cpu = raw_smp_processor_id() + 1;
4109#endif
4110
4111 if (dev->real_num_tx_queues != 1) {
4112 const struct net_device_ops *ops = dev->netdev_ops;
4113
4114 if (ops->ndo_select_queue)
4115 queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4116 else
4117 queue_index = netdev_pick_tx(dev, skb, sb_dev);
4118
4119 queue_index = netdev_cap_txqueue(dev, queue_index);
4120 }
4121
4122 skb_set_queue_mapping(skb, queue_index);
4123 return netdev_get_tx_queue(dev, queue_index);
4124}
4125
4126/**
4127 * __dev_queue_xmit() - transmit a buffer
4128 * @skb: buffer to transmit
4129 * @sb_dev: suboordinate device used for L2 forwarding offload
4130 *
4131 * Queue a buffer for transmission to a network device. The caller must
4132 * have set the device and priority and built the buffer before calling
4133 * this function. The function can be called from an interrupt.
4134 *
4135 * When calling this method, interrupts MUST be enabled. This is because
4136 * the BH enable code must have IRQs enabled so that it will not deadlock.
4137 *
4138 * Regardless of the return value, the skb is consumed, so it is currently
4139 * difficult to retry a send to this method. (You can bump the ref count
4140 * before sending to hold a reference for retry if you are careful.)
4141 *
4142 * Return:
4143 * * 0 - buffer successfully transmitted
4144 * * positive qdisc return code - NET_XMIT_DROP etc.
4145 * * negative errno - other errors
4146 */
4147int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4148{
4149 struct net_device *dev = skb->dev;
4150 struct netdev_queue *txq = NULL;
4151 struct Qdisc *q;
4152 int rc = -ENOMEM;
4153 bool again = false;
4154
4155 skb_reset_mac_header(skb);
4156 skb_assert_len(skb);
4157
4158 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4159 __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4160
4161 /* Disable soft irqs for various locks below. Also
4162 * stops preemption for RCU.
4163 */
4164 rcu_read_lock_bh();
4165
4166 skb_update_prio(skb);
4167
4168 qdisc_pkt_len_init(skb);
4169#ifdef CONFIG_NET_CLS_ACT
4170 skb->tc_at_ingress = 0;
4171#endif
4172#ifdef CONFIG_NET_EGRESS
4173 if (static_branch_unlikely(&egress_needed_key)) {
4174 if (nf_hook_egress_active()) {
4175 skb = nf_hook_egress(skb, &rc, dev);
4176 if (!skb)
4177 goto out;
4178 }
4179
4180 netdev_xmit_skip_txqueue(false);
4181
4182 nf_skip_egress(skb, true);
4183 skb = sch_handle_egress(skb, &rc, dev);
4184 if (!skb)
4185 goto out;
4186 nf_skip_egress(skb, false);
4187
4188 if (netdev_xmit_txqueue_skipped())
4189 txq = netdev_tx_queue_mapping(dev, skb);
4190 }
4191#endif
4192 /* If device/qdisc don't need skb->dst, release it right now while
4193 * its hot in this cpu cache.
4194 */
4195 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4196 skb_dst_drop(skb);
4197 else
4198 skb_dst_force(skb);
4199
4200 if (!txq)
4201 txq = netdev_core_pick_tx(dev, skb, sb_dev);
4202
4203 q = rcu_dereference_bh(txq->qdisc);
4204
4205 trace_net_dev_queue(skb);
4206 if (q->enqueue) {
4207 rc = __dev_xmit_skb(skb, q, dev, txq);
4208 goto out;
4209 }
4210
4211 /* The device has no queue. Common case for software devices:
4212 * loopback, all the sorts of tunnels...
4213
4214 * Really, it is unlikely that netif_tx_lock protection is necessary
4215 * here. (f.e. loopback and IP tunnels are clean ignoring statistics
4216 * counters.)
4217 * However, it is possible, that they rely on protection
4218 * made by us here.
4219
4220 * Check this and shot the lock. It is not prone from deadlocks.
4221 *Either shot noqueue qdisc, it is even simpler 8)
4222 */
4223 if (dev->flags & IFF_UP) {
4224 int cpu = smp_processor_id(); /* ok because BHs are off */
4225
4226 /* Other cpus might concurrently change txq->xmit_lock_owner
4227 * to -1 or to their cpu id, but not to our id.
4228 */
4229 if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4230 if (dev_xmit_recursion())
4231 goto recursion_alert;
4232
4233 skb = validate_xmit_skb(skb, dev, &again);
4234 if (!skb)
4235 goto out;
4236
4237 HARD_TX_LOCK(dev, txq, cpu);
4238
4239 if (!netif_xmit_stopped(txq)) {
4240 dev_xmit_recursion_inc();
4241 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4242 dev_xmit_recursion_dec();
4243 if (dev_xmit_complete(rc)) {
4244 HARD_TX_UNLOCK(dev, txq);
4245 goto out;
4246 }
4247 }
4248 HARD_TX_UNLOCK(dev, txq);
4249 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4250 dev->name);
4251 } else {
4252 /* Recursion is detected! It is possible,
4253 * unfortunately
4254 */
4255recursion_alert:
4256 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4257 dev->name);
4258 }
4259 }
4260
4261 rc = -ENETDOWN;
4262 rcu_read_unlock_bh();
4263
4264 dev_core_stats_tx_dropped_inc(dev);
4265 kfree_skb_list(skb);
4266 return rc;
4267out:
4268 rcu_read_unlock_bh();
4269 return rc;
4270}
4271EXPORT_SYMBOL(__dev_queue_xmit);
4272
4273int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4274{
4275 struct net_device *dev = skb->dev;
4276 struct sk_buff *orig_skb = skb;
4277 struct netdev_queue *txq;
4278 int ret = NETDEV_TX_BUSY;
4279 bool again = false;
4280
4281 if (unlikely(!netif_running(dev) ||
4282 !netif_carrier_ok(dev)))
4283 goto drop;
4284
4285 skb = validate_xmit_skb_list(skb, dev, &again);
4286 if (skb != orig_skb)
4287 goto drop;
4288
4289 skb_set_queue_mapping(skb, queue_id);
4290 txq = skb_get_tx_queue(dev, skb);
4291
4292 local_bh_disable();
4293
4294 dev_xmit_recursion_inc();
4295 HARD_TX_LOCK(dev, txq, smp_processor_id());
4296 if (!netif_xmit_frozen_or_drv_stopped(txq))
4297 ret = netdev_start_xmit(skb, dev, txq, false);
4298 HARD_TX_UNLOCK(dev, txq);
4299 dev_xmit_recursion_dec();
4300
4301 local_bh_enable();
4302 return ret;
4303drop:
4304 dev_core_stats_tx_dropped_inc(dev);
4305 kfree_skb_list(skb);
4306 return NET_XMIT_DROP;
4307}
4308EXPORT_SYMBOL(__dev_direct_xmit);
4309
4310/*************************************************************************
4311 * Receiver routines
4312 *************************************************************************/
4313
4314int netdev_max_backlog __read_mostly = 1000;
4315EXPORT_SYMBOL(netdev_max_backlog);
4316
4317int netdev_tstamp_prequeue __read_mostly = 1;
4318unsigned int sysctl_skb_defer_max __read_mostly = 64;
4319int netdev_budget __read_mostly = 300;
4320/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
4321unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4322int weight_p __read_mostly = 64; /* old backlog weight */
4323int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
4324int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
4325int dev_rx_weight __read_mostly = 64;
4326int dev_tx_weight __read_mostly = 64;
4327
4328/* Called with irq disabled */
4329static inline void ____napi_schedule(struct softnet_data *sd,
4330 struct napi_struct *napi)
4331{
4332 struct task_struct *thread;
4333
4334 lockdep_assert_irqs_disabled();
4335
4336 if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4337 /* Paired with smp_mb__before_atomic() in
4338 * napi_enable()/dev_set_threaded().
4339 * Use READ_ONCE() to guarantee a complete
4340 * read on napi->thread. Only call
4341 * wake_up_process() when it's not NULL.
4342 */
4343 thread = READ_ONCE(napi->thread);
4344 if (thread) {
4345 /* Avoid doing set_bit() if the thread is in
4346 * INTERRUPTIBLE state, cause napi_thread_wait()
4347 * makes sure to proceed with napi polling
4348 * if the thread is explicitly woken from here.
4349 */
4350 if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
4351 set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4352 wake_up_process(thread);
4353 return;
4354 }
4355 }
4356
4357 list_add_tail(&napi->poll_list, &sd->poll_list);
4358 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4359}
4360
4361#ifdef CONFIG_RPS
4362
4363/* One global table that all flow-based protocols share. */
4364struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4365EXPORT_SYMBOL(rps_sock_flow_table);
4366u32 rps_cpu_mask __read_mostly;
4367EXPORT_SYMBOL(rps_cpu_mask);
4368
4369struct static_key_false rps_needed __read_mostly;
4370EXPORT_SYMBOL(rps_needed);
4371struct static_key_false rfs_needed __read_mostly;
4372EXPORT_SYMBOL(rfs_needed);
4373
4374static struct rps_dev_flow *
4375set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4376 struct rps_dev_flow *rflow, u16 next_cpu)
4377{
4378 if (next_cpu < nr_cpu_ids) {
4379#ifdef CONFIG_RFS_ACCEL
4380 struct netdev_rx_queue *rxqueue;
4381 struct rps_dev_flow_table *flow_table;
4382 struct rps_dev_flow *old_rflow;
4383 u32 flow_id;
4384 u16 rxq_index;
4385 int rc;
4386
4387 /* Should we steer this flow to a different hardware queue? */
4388 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4389 !(dev->features & NETIF_F_NTUPLE))
4390 goto out;
4391 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4392 if (rxq_index == skb_get_rx_queue(skb))
4393 goto out;
4394
4395 rxqueue = dev->_rx + rxq_index;
4396 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4397 if (!flow_table)
4398 goto out;
4399 flow_id = skb_get_hash(skb) & flow_table->mask;
4400 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4401 rxq_index, flow_id);
4402 if (rc < 0)
4403 goto out;
4404 old_rflow = rflow;
4405 rflow = &flow_table->flows[flow_id];
4406 rflow->filter = rc;
4407 if (old_rflow->filter == rflow->filter)
4408 old_rflow->filter = RPS_NO_FILTER;
4409 out:
4410#endif
4411 rflow->last_qtail =
4412 per_cpu(softnet_data, next_cpu).input_queue_head;
4413 }
4414
4415 rflow->cpu = next_cpu;
4416 return rflow;
4417}
4418
4419/*
4420 * get_rps_cpu is called from netif_receive_skb and returns the target
4421 * CPU from the RPS map of the receiving queue for a given skb.
4422 * rcu_read_lock must be held on entry.
4423 */
4424static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4425 struct rps_dev_flow **rflowp)
4426{
4427 const struct rps_sock_flow_table *sock_flow_table;
4428 struct netdev_rx_queue *rxqueue = dev->_rx;
4429 struct rps_dev_flow_table *flow_table;
4430 struct rps_map *map;
4431 int cpu = -1;
4432 u32 tcpu;
4433 u32 hash;
4434
4435 if (skb_rx_queue_recorded(skb)) {
4436 u16 index = skb_get_rx_queue(skb);
4437
4438 if (unlikely(index >= dev->real_num_rx_queues)) {
4439 WARN_ONCE(dev->real_num_rx_queues > 1,
4440 "%s received packet on queue %u, but number "
4441 "of RX queues is %u\n",
4442 dev->name, index, dev->real_num_rx_queues);
4443 goto done;
4444 }
4445 rxqueue += index;
4446 }
4447
4448 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4449
4450 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4451 map = rcu_dereference(rxqueue->rps_map);
4452 if (!flow_table && !map)
4453 goto done;
4454
4455 skb_reset_network_header(skb);
4456 hash = skb_get_hash(skb);
4457 if (!hash)
4458 goto done;
4459
4460 sock_flow_table = rcu_dereference(rps_sock_flow_table);
4461 if (flow_table && sock_flow_table) {
4462 struct rps_dev_flow *rflow;
4463 u32 next_cpu;
4464 u32 ident;
4465
4466 /* First check into global flow table if there is a match */
4467 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4468 if ((ident ^ hash) & ~rps_cpu_mask)
4469 goto try_rps;
4470
4471 next_cpu = ident & rps_cpu_mask;
4472
4473 /* OK, now we know there is a match,
4474 * we can look at the local (per receive queue) flow table
4475 */
4476 rflow = &flow_table->flows[hash & flow_table->mask];
4477 tcpu = rflow->cpu;
4478
4479 /*
4480 * If the desired CPU (where last recvmsg was done) is
4481 * different from current CPU (one in the rx-queue flow
4482 * table entry), switch if one of the following holds:
4483 * - Current CPU is unset (>= nr_cpu_ids).
4484 * - Current CPU is offline.
4485 * - The current CPU's queue tail has advanced beyond the
4486 * last packet that was enqueued using this table entry.
4487 * This guarantees that all previous packets for the flow
4488 * have been dequeued, thus preserving in order delivery.
4489 */
4490 if (unlikely(tcpu != next_cpu) &&
4491 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4492 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4493 rflow->last_qtail)) >= 0)) {
4494 tcpu = next_cpu;
4495 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4496 }
4497
4498 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4499 *rflowp = rflow;
4500 cpu = tcpu;
4501 goto done;
4502 }
4503 }
4504
4505try_rps:
4506
4507 if (map) {
4508 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4509 if (cpu_online(tcpu)) {
4510 cpu = tcpu;
4511 goto done;
4512 }
4513 }
4514
4515done:
4516 return cpu;
4517}
4518
4519#ifdef CONFIG_RFS_ACCEL
4520
4521/**
4522 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4523 * @dev: Device on which the filter was set
4524 * @rxq_index: RX queue index
4525 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4526 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4527 *
4528 * Drivers that implement ndo_rx_flow_steer() should periodically call
4529 * this function for each installed filter and remove the filters for
4530 * which it returns %true.
4531 */
4532bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4533 u32 flow_id, u16 filter_id)
4534{
4535 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4536 struct rps_dev_flow_table *flow_table;
4537 struct rps_dev_flow *rflow;
4538 bool expire = true;
4539 unsigned int cpu;
4540
4541 rcu_read_lock();
4542 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4543 if (flow_table && flow_id <= flow_table->mask) {
4544 rflow = &flow_table->flows[flow_id];
4545 cpu = READ_ONCE(rflow->cpu);
4546 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4547 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4548 rflow->last_qtail) <
4549 (int)(10 * flow_table->mask)))
4550 expire = false;
4551 }
4552 rcu_read_unlock();
4553 return expire;
4554}
4555EXPORT_SYMBOL(rps_may_expire_flow);
4556
4557#endif /* CONFIG_RFS_ACCEL */
4558
4559/* Called from hardirq (IPI) context */
4560static void rps_trigger_softirq(void *data)
4561{
4562 struct softnet_data *sd = data;
4563
4564 ____napi_schedule(sd, &sd->backlog);
4565 sd->received_rps++;
4566}
4567
4568#endif /* CONFIG_RPS */
4569
4570/* Called from hardirq (IPI) context */
4571static void trigger_rx_softirq(void *data)
4572{
4573 struct softnet_data *sd = data;
4574
4575 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4576 smp_store_release(&sd->defer_ipi_scheduled, 0);
4577}
4578
4579/*
4580 * Check if this softnet_data structure is another cpu one
4581 * If yes, queue it to our IPI list and return 1
4582 * If no, return 0
4583 */
4584static int napi_schedule_rps(struct softnet_data *sd)
4585{
4586 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4587
4588#ifdef CONFIG_RPS
4589 if (sd != mysd) {
4590 sd->rps_ipi_next = mysd->rps_ipi_list;
4591 mysd->rps_ipi_list = sd;
4592
4593 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4594 return 1;
4595 }
4596#endif /* CONFIG_RPS */
4597 __napi_schedule_irqoff(&mysd->backlog);
4598 return 0;
4599}
4600
#ifdef CONFIG_NET_FLOW_LIMIT
/* Flow limit table length; defaults to 4096 (1 << 12). */
int netdev_flow_limit_table_len __read_mostly = 1 << 12;
#endif
4604
4605static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4606{
4607#ifdef CONFIG_NET_FLOW_LIMIT
4608 struct sd_flow_limit *fl;
4609 struct softnet_data *sd;
4610 unsigned int old_flow, new_flow;
4611
4612 if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
4613 return false;
4614
4615 sd = this_cpu_ptr(&softnet_data);
4616
4617 rcu_read_lock();
4618 fl = rcu_dereference(sd->flow_limit);
4619 if (fl) {
4620 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4621 old_flow = fl->history[fl->history_head];
4622 fl->history[fl->history_head] = new_flow;
4623
4624 fl->history_head++;
4625 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4626
4627 if (likely(fl->buckets[old_flow]))
4628 fl->buckets[old_flow]--;
4629
4630 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4631 fl->count++;
4632 rcu_read_unlock();
4633 return true;
4634 }
4635 }
4636 rcu_read_unlock();
4637#endif
4638 return false;
4639}
4640
4641/*
4642 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4643 * queue (may be a remote CPU queue).
4644 */
4645static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4646 unsigned int *qtail)
4647{
4648 enum skb_drop_reason reason;
4649 struct softnet_data *sd;
4650 unsigned long flags;
4651 unsigned int qlen;
4652
4653 reason = SKB_DROP_REASON_NOT_SPECIFIED;
4654 sd = &per_cpu(softnet_data, cpu);
4655
4656 rps_lock_irqsave(sd, &flags);
4657 if (!netif_running(skb->dev))
4658 goto drop;
4659 qlen = skb_queue_len(&sd->input_pkt_queue);
4660 if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
4661 if (qlen) {
4662enqueue:
4663 __skb_queue_tail(&sd->input_pkt_queue, skb);
4664 input_queue_tail_incr_save(sd, qtail);
4665 rps_unlock_irq_restore(sd, &flags);
4666 return NET_RX_SUCCESS;
4667 }
4668
4669 /* Schedule NAPI for backlog device
4670 * We can use non atomic operation since we own the queue lock
4671 */
4672 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
4673 napi_schedule_rps(sd);
4674 goto enqueue;
4675 }
4676 reason = SKB_DROP_REASON_CPU_BACKLOG;
4677
4678drop:
4679 sd->dropped++;
4680 rps_unlock_irq_restore(sd, &flags);
4681
4682 dev_core_stats_rx_dropped_inc(skb->dev);
4683 kfree_skb_reason(skb, reason);
4684 return NET_RX_DROP;
4685}
4686
4687static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4688{
4689 struct net_device *dev = skb->dev;
4690 struct netdev_rx_queue *rxqueue;
4691
4692 rxqueue = dev->_rx;
4693
4694 if (skb_rx_queue_recorded(skb)) {
4695 u16 index = skb_get_rx_queue(skb);
4696
4697 if (unlikely(index >= dev->real_num_rx_queues)) {
4698 WARN_ONCE(dev->real_num_rx_queues > 1,
4699 "%s received packet on queue %u, but number "
4700 "of RX queues is %u\n",
4701 dev->name, index, dev->real_num_rx_queues);
4702
4703 return rxqueue; /* Return first rxqueue */
4704 }
4705 rxqueue += index;
4706 }
4707 return rxqueue;
4708}
4709
4710u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
4711 struct bpf_prog *xdp_prog)
4712{
4713 void *orig_data, *orig_data_end, *hard_start;
4714 struct netdev_rx_queue *rxqueue;
4715 bool orig_bcast, orig_host;
4716 u32 mac_len, frame_sz;
4717 __be16 orig_eth_type;
4718 struct ethhdr *eth;
4719 u32 metalen, act;
4720 int off;
4721
4722 /* The XDP program wants to see the packet starting at the MAC
4723 * header.
4724 */
4725 mac_len = skb->data - skb_mac_header(skb);
4726 hard_start = skb->data - skb_headroom(skb);
4727
4728 /* SKB "head" area always have tailroom for skb_shared_info */
4729 frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4730 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4731
4732 rxqueue = netif_get_rxqueue(skb);
4733 xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4734 xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4735 skb_headlen(skb) + mac_len, true);
4736
4737 orig_data_end = xdp->data_end;
4738 orig_data = xdp->data;
4739 eth = (struct ethhdr *)xdp->data;
4740 orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
4741 orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4742 orig_eth_type = eth->h_proto;
4743
4744 act = bpf_prog_run_xdp(xdp_prog, xdp);
4745
4746 /* check if bpf_xdp_adjust_head was used */
4747 off = xdp->data - orig_data;
4748 if (off) {
4749 if (off > 0)
4750 __skb_pull(skb, off);
4751 else if (off < 0)
4752 __skb_push(skb, -off);
4753
4754 skb->mac_header += off;
4755 skb_reset_network_header(skb);
4756 }
4757
4758 /* check if bpf_xdp_adjust_tail was used */
4759 off = xdp->data_end - orig_data_end;
4760 if (off != 0) {
4761 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4762 skb->len += off; /* positive on grow, negative on shrink */
4763 }
4764
4765 /* check if XDP changed eth hdr such SKB needs update */
4766 eth = (struct ethhdr *)xdp->data;
4767 if ((orig_eth_type != eth->h_proto) ||
4768 (orig_host != ether_addr_equal_64bits(eth->h_dest,
4769 skb->dev->dev_addr)) ||
4770 (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4771 __skb_push(skb, ETH_HLEN);
4772 skb->pkt_type = PACKET_HOST;
4773 skb->protocol = eth_type_trans(skb, skb->dev);
4774 }
4775
4776 /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
4777 * before calling us again on redirect path. We do not call do_redirect
4778 * as we leave that up to the caller.
4779 *
4780 * Caller is responsible for managing lifetime of skb (i.e. calling
4781 * kfree_skb in response to actions it cannot handle/XDP_DROP).
4782 */
4783 switch (act) {
4784 case XDP_REDIRECT:
4785 case XDP_TX:
4786 __skb_push(skb, mac_len);
4787 break;
4788 case XDP_PASS:
4789 metalen = xdp->data - xdp->data_meta;
4790 if (metalen)
4791 skb_metadata_set(skb, metalen);
4792 break;
4793 }
4794
4795 return act;
4796}
4797
4798static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4799 struct xdp_buff *xdp,
4800 struct bpf_prog *xdp_prog)
4801{
4802 u32 act = XDP_DROP;
4803
4804 /* Reinjected packets coming from act_mirred or similar should
4805 * not get XDP generic processing.
4806 */
4807 if (skb_is_redirected(skb))
4808 return XDP_PASS;
4809
4810 /* XDP packets must be linear and must have sufficient headroom
4811 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4812 * native XDP provides, thus we need to do it here as well.
4813 */
4814 if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4815 skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4816 int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4817 int troom = skb->tail + skb->data_len - skb->end;
4818
4819 /* In case we have to go down the path and also linearize,
4820 * then lets do the pskb_expand_head() work just once here.
4821 */
4822 if (pskb_expand_head(skb,
4823 hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4824 troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4825 goto do_drop;
4826 if (skb_linearize(skb))
4827 goto do_drop;
4828 }
4829
4830 act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
4831 switch (act) {
4832 case XDP_REDIRECT:
4833 case XDP_TX:
4834 case XDP_PASS:
4835 break;
4836 default:
4837 bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
4838 fallthrough;
4839 case XDP_ABORTED:
4840 trace_xdp_exception(skb->dev, xdp_prog, act);
4841 fallthrough;
4842 case XDP_DROP:
4843 do_drop:
4844 kfree_skb(skb);
4845 break;
4846 }
4847
4848 return act;
4849}
4850
4851/* When doing generic XDP we have to bypass the qdisc layer and the
4852 * network taps in order to match in-driver-XDP behavior. This also means
4853 * that XDP packets are able to starve other packets going through a qdisc,
4854 * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
4855 * queues, so they do not have this starvation issue.
4856 */
4857void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4858{
4859 struct net_device *dev = skb->dev;
4860 struct netdev_queue *txq;
4861 bool free_skb = true;
4862 int cpu, rc;
4863
4864 txq = netdev_core_pick_tx(dev, skb, NULL);
4865 cpu = smp_processor_id();
4866 HARD_TX_LOCK(dev, txq, cpu);
4867 if (!netif_xmit_frozen_or_drv_stopped(txq)) {
4868 rc = netdev_start_xmit(skb, dev, txq, 0);
4869 if (dev_xmit_complete(rc))
4870 free_skb = false;
4871 }
4872 HARD_TX_UNLOCK(dev, txq);
4873 if (free_skb) {
4874 trace_xdp_exception(dev, xdp_prog, XDP_TX);
4875 dev_core_stats_tx_dropped_inc(dev);
4876 kfree_skb(skb);
4877 }
4878}
4879
/* Static key, initially false. generic_xdp_install() increments it when a
 * program is attached to a device that had none and decrements it on
 * removal; __netif_receive_skb_core() tests it before calling
 * do_xdp_generic().
 */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4881
4882int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4883{
4884 if (xdp_prog) {
4885 struct xdp_buff xdp;
4886 u32 act;
4887 int err;
4888
4889 act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4890 if (act != XDP_PASS) {
4891 switch (act) {
4892 case XDP_REDIRECT:
4893 err = xdp_do_generic_redirect(skb->dev, skb,
4894 &xdp, xdp_prog);
4895 if (err)
4896 goto out_redir;
4897 break;
4898 case XDP_TX:
4899 generic_xdp_tx(skb, xdp_prog);
4900 break;
4901 }
4902 return XDP_DROP;
4903 }
4904 }
4905 return XDP_PASS;
4906out_redir:
4907 kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
4908 return XDP_DROP;
4909}
4910EXPORT_SYMBOL_GPL(do_xdp_generic);
4911
4912static int netif_rx_internal(struct sk_buff *skb)
4913{
4914 int ret;
4915
4916 net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
4917
4918 trace_netif_rx(skb);
4919
4920#ifdef CONFIG_RPS
4921 if (static_branch_unlikely(&rps_needed)) {
4922 struct rps_dev_flow voidflow, *rflow = &voidflow;
4923 int cpu;
4924
4925 rcu_read_lock();
4926
4927 cpu = get_rps_cpu(skb->dev, skb, &rflow);
4928 if (cpu < 0)
4929 cpu = smp_processor_id();
4930
4931 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4932
4933 rcu_read_unlock();
4934 } else
4935#endif
4936 {
4937 unsigned int qtail;
4938
4939 ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
4940 }
4941 return ret;
4942}
4943
/**
 *	__netif_rx	-	Slightly optimized version of netif_rx
 *	@skb: buffer to post
 *
 *	Behaves like netif_rx() but does not disable bottom halves. It may
 *	therefore only be invoked from interrupt context (hard or soft
 *	interrupt); this is asserted via lockdep.
 *
 *	Returns the value of netif_rx_internal().
 */
int __netif_rx(struct sk_buff *skb)
{
	int rc;

	lockdep_assert_once(hardirq_count() | softirq_count());

	trace_netif_rx_entry(skb);
	rc = netif_rx_internal(skb);
	trace_netif_rx_exit(rc);

	return rc;
}
EXPORT_SYMBOL(__netif_rx);
4964
4965/**
4966 * netif_rx - post buffer to the network code
4967 * @skb: buffer to post
4968 *
4969 * This function receives a packet from a device driver and queues it for
4970 * the upper (protocol) levels to process via the backlog NAPI device. It
4971 * always succeeds. The buffer may be dropped during processing for
4972 * congestion control or by the protocol layers.
4973 * The network buffer is passed via the backlog NAPI device. Modern NIC
4974 * driver should use NAPI and GRO.
4975 * This function can used from interrupt and from process context. The
4976 * caller from process context must not disable interrupts before invoking
4977 * this function.
4978 *
4979 * return values:
4980 * NET_RX_SUCCESS (no congestion)
4981 * NET_RX_DROP (packet was dropped)
4982 *
4983 */
4984int netif_rx(struct sk_buff *skb)
4985{
4986 bool need_bh_off = !(hardirq_count() | softirq_count());
4987 int ret;
4988
4989 if (need_bh_off)
4990 local_bh_disable();
4991 trace_netif_rx_entry(skb);
4992 ret = netif_rx_internal(skb);
4993 trace_netif_rx_exit(ret);
4994 if (need_bh_off)
4995 local_bh_enable();
4996 return ret;
4997}
4998EXPORT_SYMBOL(netif_rx);
4999
5000static __latent_entropy void net_tx_action(struct softirq_action *h)
5001{
5002 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5003
5004 if (sd->completion_queue) {
5005 struct sk_buff *clist;
5006
5007 local_irq_disable();
5008 clist = sd->completion_queue;
5009 sd->completion_queue = NULL;
5010 local_irq_enable();
5011
5012 while (clist) {
5013 struct sk_buff *skb = clist;
5014
5015 clist = clist->next;
5016
5017 WARN_ON(refcount_read(&skb->users));
5018 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
5019 trace_consume_skb(skb);
5020 else
5021 trace_kfree_skb(skb, net_tx_action,
5022 SKB_DROP_REASON_NOT_SPECIFIED);
5023
5024 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5025 __kfree_skb(skb);
5026 else
5027 __kfree_skb_defer(skb);
5028 }
5029 }
5030
5031 if (sd->output_queue) {
5032 struct Qdisc *head;
5033
5034 local_irq_disable();
5035 head = sd->output_queue;
5036 sd->output_queue = NULL;
5037 sd->output_queue_tailp = &sd->output_queue;
5038 local_irq_enable();
5039
5040 rcu_read_lock();
5041
5042 while (head) {
5043 struct Qdisc *q = head;
5044 spinlock_t *root_lock = NULL;
5045
5046 head = head->next_sched;
5047
5048 /* We need to make sure head->next_sched is read
5049 * before clearing __QDISC_STATE_SCHED
5050 */
5051 smp_mb__before_atomic();
5052
5053 if (!(q->flags & TCQ_F_NOLOCK)) {
5054 root_lock = qdisc_lock(q);
5055 spin_lock(root_lock);
5056 } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5057 &q->state))) {
5058 /* There is a synchronize_net() between
5059 * STATE_DEACTIVATED flag being set and
5060 * qdisc_reset()/some_qdisc_is_busy() in
5061 * dev_deactivate(), so we can safely bail out
5062 * early here to avoid data race between
5063 * qdisc_deactivate() and some_qdisc_is_busy()
5064 * for lockless qdisc.
5065 */
5066 clear_bit(__QDISC_STATE_SCHED, &q->state);
5067 continue;
5068 }
5069
5070 clear_bit(__QDISC_STATE_SCHED, &q->state);
5071 qdisc_run(q);
5072 if (root_lock)
5073 spin_unlock(root_lock);
5074 }
5075
5076 rcu_read_unlock();
5077 }
5078
5079 xfrm_dev_backlog(sd);
5080}
5081
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It is a function pointer taking a net_device and an address and
 * returning int; it is only built when both CONFIG_BRIDGE and
 * CONFIG_ATM_LANE are enabled, and is exported GPL-only.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
5088
5089static inline struct sk_buff *
5090sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
5091 struct net_device *orig_dev, bool *another)
5092{
5093#ifdef CONFIG_NET_CLS_ACT
5094 struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
5095 struct tcf_result cl_res;
5096
5097 /* If there's at least one ingress present somewhere (so
5098 * we get here via enabled static key), remaining devices
5099 * that are not configured with an ingress qdisc will bail
5100 * out here.
5101 */
5102 if (!miniq)
5103 return skb;
5104
5105 if (*pt_prev) {
5106 *ret = deliver_skb(skb, *pt_prev, orig_dev);
5107 *pt_prev = NULL;
5108 }
5109
5110 qdisc_skb_cb(skb)->pkt_len = skb->len;
5111 tc_skb_cb(skb)->mru = 0;
5112 tc_skb_cb(skb)->post_ct = false;
5113 skb->tc_at_ingress = 1;
5114 mini_qdisc_bstats_cpu_update(miniq, skb);
5115
5116 switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
5117 case TC_ACT_OK:
5118 case TC_ACT_RECLASSIFY:
5119 skb->tc_index = TC_H_MIN(cl_res.classid);
5120 break;
5121 case TC_ACT_SHOT:
5122 mini_qdisc_qstats_cpu_drop(miniq);
5123 kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
5124 *ret = NET_RX_DROP;
5125 return NULL;
5126 case TC_ACT_STOLEN:
5127 case TC_ACT_QUEUED:
5128 case TC_ACT_TRAP:
5129 consume_skb(skb);
5130 *ret = NET_RX_SUCCESS;
5131 return NULL;
5132 case TC_ACT_REDIRECT:
5133 /* skb_mac_header check was done by cls/act_bpf, so
5134 * we can safely push the L2 header back before
5135 * redirecting to another netdev
5136 */
5137 __skb_push(skb, skb->mac_len);
5138 if (skb_do_redirect(skb) == -EAGAIN) {
5139 __skb_pull(skb, skb->mac_len);
5140 *another = true;
5141 break;
5142 }
5143 *ret = NET_RX_SUCCESS;
5144 return NULL;
5145 case TC_ACT_CONSUMED:
5146 *ret = NET_RX_SUCCESS;
5147 return NULL;
5148 default:
5149 break;
5150 }
5151#endif /* CONFIG_NET_CLS_ACT */
5152 return skb;
5153}
5154
5155/**
5156 * netdev_is_rx_handler_busy - check if receive handler is registered
5157 * @dev: device to check
5158 *
5159 * Check if a receive handler is already registered for a given device.
5160 * Return true if there one.
5161 *
5162 * The caller must hold the rtnl_mutex.
5163 */
5164bool netdev_is_rx_handler_busy(struct net_device *dev)
5165{
5166 ASSERT_RTNL();
5167 return dev && rtnl_dereference(dev->rx_handler);
5168}
5169EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5170
5171/**
5172 * netdev_rx_handler_register - register receive handler
5173 * @dev: device to register a handler for
5174 * @rx_handler: receive handler to register
5175 * @rx_handler_data: data pointer that is used by rx handler
5176 *
5177 * Register a receive handler for a device. This handler will then be
5178 * called from __netif_receive_skb. A negative errno code is returned
5179 * on a failure.
5180 *
5181 * The caller must hold the rtnl_mutex.
5182 *
5183 * For a general description of rx_handler, see enum rx_handler_result.
5184 */
5185int netdev_rx_handler_register(struct net_device *dev,
5186 rx_handler_func_t *rx_handler,
5187 void *rx_handler_data)
5188{
5189 if (netdev_is_rx_handler_busy(dev))
5190 return -EBUSY;
5191
5192 if (dev->priv_flags & IFF_NO_RX_HANDLER)
5193 return -EINVAL;
5194
5195 /* Note: rx_handler_data must be set before rx_handler */
5196 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5197 rcu_assign_pointer(dev->rx_handler, rx_handler);
5198
5199 return 0;
5200}
5201EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
5202
5203/**
5204 * netdev_rx_handler_unregister - unregister receive handler
5205 * @dev: device to unregister a handler from
5206 *
5207 * Unregister a receive handler from a device.
5208 *
5209 * The caller must hold the rtnl_mutex.
5210 */
5211void netdev_rx_handler_unregister(struct net_device *dev)
5212{
5213
5214 ASSERT_RTNL();
5215 RCU_INIT_POINTER(dev->rx_handler, NULL);
5216 /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
5217 * section has a guarantee to see a non NULL rx_handler_data
5218 * as well.
5219 */
5220 synchronize_net();
5221 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5222}
5223EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5224
5225/*
5226 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5227 * the special handling of PFMEMALLOC skbs.
5228 */
5229static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5230{
5231 switch (skb->protocol) {
5232 case htons(ETH_P_ARP):
5233 case htons(ETH_P_IP):
5234 case htons(ETH_P_IPV6):
5235 case htons(ETH_P_8021Q):
5236 case htons(ETH_P_8021AD):
5237 return true;
5238 default:
5239 return false;
5240 }
5241}
5242
5243static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5244 int *ret, struct net_device *orig_dev)
5245{
5246 if (nf_hook_ingress_active(skb)) {
5247 int ingress_retval;
5248
5249 if (*pt_prev) {
5250 *ret = deliver_skb(skb, *pt_prev, orig_dev);
5251 *pt_prev = NULL;
5252 }
5253
5254 rcu_read_lock();
5255 ingress_retval = nf_hook_ingress(skb);
5256 rcu_read_unlock();
5257 return ingress_retval;
5258 }
5259 return 0;
5260}
5261
5262static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5263 struct packet_type **ppt_prev)
5264{
5265 struct packet_type *ptype, *pt_prev;
5266 rx_handler_func_t *rx_handler;
5267 struct sk_buff *skb = *pskb;
5268 struct net_device *orig_dev;
5269 bool deliver_exact = false;
5270 int ret = NET_RX_DROP;
5271 __be16 type;
5272
5273 net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
5274
5275 trace_netif_receive_skb(skb);
5276
5277 orig_dev = skb->dev;
5278
5279 skb_reset_network_header(skb);
5280 if (!skb_transport_header_was_set(skb))
5281 skb_reset_transport_header(skb);
5282 skb_reset_mac_len(skb);
5283
5284 pt_prev = NULL;
5285
5286another_round:
5287 skb->skb_iif = skb->dev->ifindex;
5288
5289 __this_cpu_inc(softnet_data.processed);
5290
5291 if (static_branch_unlikely(&generic_xdp_needed_key)) {
5292 int ret2;
5293
5294 migrate_disable();
5295 ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5296 migrate_enable();
5297
5298 if (ret2 != XDP_PASS) {
5299 ret = NET_RX_DROP;
5300 goto out;
5301 }
5302 }
5303
5304 if (eth_type_vlan(skb->protocol)) {
5305 skb = skb_vlan_untag(skb);
5306 if (unlikely(!skb))
5307 goto out;
5308 }
5309
5310 if (skb_skip_tc_classify(skb))
5311 goto skip_classify;
5312
5313 if (pfmemalloc)
5314 goto skip_taps;
5315
5316 list_for_each_entry_rcu(ptype, &ptype_all, list) {
5317 if (pt_prev)
5318 ret = deliver_skb(skb, pt_prev, orig_dev);
5319 pt_prev = ptype;
5320 }
5321
5322 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5323 if (pt_prev)
5324 ret = deliver_skb(skb, pt_prev, orig_dev);
5325 pt_prev = ptype;
5326 }
5327
5328skip_taps:
5329#ifdef CONFIG_NET_INGRESS
5330 if (static_branch_unlikely(&ingress_needed_key)) {
5331 bool another = false;
5332
5333 nf_skip_egress(skb, true);
5334 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5335 &another);
5336 if (another)
5337 goto another_round;
5338 if (!skb)
5339 goto out;
5340
5341 nf_skip_egress(skb, false);
5342 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5343 goto out;
5344 }
5345#endif
5346 skb_reset_redirect(skb);
5347skip_classify:
5348 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5349 goto drop;
5350
5351 if (skb_vlan_tag_present(skb)) {
5352 if (pt_prev) {
5353 ret = deliver_skb(skb, pt_prev, orig_dev);
5354 pt_prev = NULL;
5355 }
5356 if (vlan_do_receive(&skb))
5357 goto another_round;
5358 else if (unlikely(!skb))
5359 goto out;
5360 }
5361
5362 rx_handler = rcu_dereference(skb->dev->rx_handler);
5363 if (rx_handler) {
5364 if (pt_prev) {
5365 ret = deliver_skb(skb, pt_prev, orig_dev);
5366 pt_prev = NULL;
5367 }
5368 switch (rx_handler(&skb)) {
5369 case RX_HANDLER_CONSUMED:
5370 ret = NET_RX_SUCCESS;
5371 goto out;
5372 case RX_HANDLER_ANOTHER:
5373 goto another_round;
5374 case RX_HANDLER_EXACT:
5375 deliver_exact = true;
5376 break;
5377 case RX_HANDLER_PASS:
5378 break;
5379 default:
5380 BUG();
5381 }
5382 }
5383
5384 if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5385check_vlan_id:
5386 if (skb_vlan_tag_get_id(skb)) {
5387 /* Vlan id is non 0 and vlan_do_receive() above couldn't
5388 * find vlan device.
5389 */
5390 skb->pkt_type = PACKET_OTHERHOST;
5391 } else if (eth_type_vlan(skb->protocol)) {
5392 /* Outer header is 802.1P with vlan 0, inner header is
5393 * 802.1Q or 802.1AD and vlan_do_receive() above could
5394 * not find vlan dev for vlan id 0.
5395 */
5396 __vlan_hwaccel_clear_tag(skb);
5397 skb = skb_vlan_untag(skb);
5398 if (unlikely(!skb))
5399 goto out;
5400 if (vlan_do_receive(&skb))
5401 /* After stripping off 802.1P header with vlan 0
5402 * vlan dev is found for inner header.
5403 */
5404 goto another_round;
5405 else if (unlikely(!skb))
5406 goto out;
5407 else
5408 /* We have stripped outer 802.1P vlan 0 header.
5409 * But could not find vlan dev.
5410 * check again for vlan id to set OTHERHOST.
5411 */
5412 goto check_vlan_id;
5413 }
5414 /* Note: we might in the future use prio bits
5415 * and set skb->priority like in vlan_do_receive()
5416 * For the time being, just ignore Priority Code Point
5417 */
5418 __vlan_hwaccel_clear_tag(skb);
5419 }
5420
5421 type = skb->protocol;
5422
5423 /* deliver only exact match when indicated */
5424 if (likely(!deliver_exact)) {
5425 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5426 &ptype_base[ntohs(type) &
5427 PTYPE_HASH_MASK]);
5428 }
5429
5430 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5431 &orig_dev->ptype_specific);
5432
5433 if (unlikely(skb->dev != orig_dev)) {
5434 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5435 &skb->dev->ptype_specific);
5436 }
5437
5438 if (pt_prev) {
5439 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5440 goto drop;
5441 *ppt_prev = pt_prev;
5442 } else {
5443drop:
5444 if (!deliver_exact)
5445 dev_core_stats_rx_dropped_inc(skb->dev);
5446 else
5447 dev_core_stats_rx_nohandler_inc(skb->dev);
5448 kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
5449 /* Jamal, now you will not able to escape explaining
5450 * me how you were going to use this. :-)
5451 */
5452 ret = NET_RX_DROP;
5453 }
5454
5455out:
5456 /* The invariant here is that if *ppt_prev is not NULL
5457 * then skb should also be non-NULL.
5458 *
5459 * Apparently *ppt_prev assignment above holds this invariant due to
5460 * skb dereferencing near it.
5461 */
5462 *pskb = skb;
5463 return ret;
5464}
5465
5466static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5467{
5468 struct net_device *orig_dev = skb->dev;
5469 struct packet_type *pt_prev = NULL;
5470 int ret;
5471
5472 ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5473 if (pt_prev)
5474 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5475 skb->dev, pt_prev, orig_dev);
5476 return ret;
5477}
5478
5479/**
5480 * netif_receive_skb_core - special purpose version of netif_receive_skb
5481 * @skb: buffer to process
5482 *
5483 * More direct receive version of netif_receive_skb(). It should
5484 * only be used by callers that have a need to skip RPS and Generic XDP.
5485 * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5486 *
5487 * This function may only be called from softirq context and interrupts
5488 * should be enabled.
5489 *
5490 * Return values (usually ignored):
5491 * NET_RX_SUCCESS: no congestion
5492 * NET_RX_DROP: packet was dropped
5493 */
5494int netif_receive_skb_core(struct sk_buff *skb)
5495{
5496 int ret;
5497
5498 rcu_read_lock();
5499 ret = __netif_receive_skb_one_core(skb, false);
5500 rcu_read_unlock();
5501
5502 return ret;
5503}
5504EXPORT_SYMBOL(netif_receive_skb_core);
5505
5506static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5507 struct packet_type *pt_prev,
5508 struct net_device *orig_dev)
5509{
5510 struct sk_buff *skb, *next;
5511
5512 if (!pt_prev)
5513 return;
5514 if (list_empty(head))
5515 return;
5516 if (pt_prev->list_func != NULL)
5517 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5518 ip_list_rcv, head, pt_prev, orig_dev);
5519 else
5520 list_for_each_entry_safe(skb, next, head, list) {
5521 skb_list_del_init(skb);
5522 pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5523 }
5524}
5525
5526static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5527{
5528 /* Fast-path assumptions:
5529 * - There is no RX handler.
5530 * - Only one packet_type matches.
5531 * If either of these fails, we will end up doing some per-packet
5532 * processing in-line, then handling the 'last ptype' for the whole
5533 * sublist. This can't cause out-of-order delivery to any single ptype,
5534 * because the 'last ptype' must be constant across the sublist, and all
5535 * other ptypes are handled per-packet.
5536 */
5537 /* Current (common) ptype of sublist */
5538 struct packet_type *pt_curr = NULL;
5539 /* Current (common) orig_dev of sublist */
5540 struct net_device *od_curr = NULL;
5541 struct list_head sublist;
5542 struct sk_buff *skb, *next;
5543
5544 INIT_LIST_HEAD(&sublist);
5545 list_for_each_entry_safe(skb, next, head, list) {
5546 struct net_device *orig_dev = skb->dev;
5547 struct packet_type *pt_prev = NULL;
5548
5549 skb_list_del_init(skb);
5550 __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5551 if (!pt_prev)
5552 continue;
5553 if (pt_curr != pt_prev || od_curr != orig_dev) {
5554 /* dispatch old sublist */
5555 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5556 /* start new sublist */
5557 INIT_LIST_HEAD(&sublist);
5558 pt_curr = pt_prev;
5559 od_curr = orig_dev;
5560 }
5561 list_add_tail(&skb->list, &sublist);
5562 }
5563
5564 /* dispatch final sublist */
5565 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5566}
5567
5568static int __netif_receive_skb(struct sk_buff *skb)
5569{
5570 int ret;
5571
5572 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5573 unsigned int noreclaim_flag;
5574
5575 /*
5576 * PFMEMALLOC skbs are special, they should
5577 * - be delivered to SOCK_MEMALLOC sockets only
5578 * - stay away from userspace
5579 * - have bounded memory usage
5580 *
5581 * Use PF_MEMALLOC as this saves us from propagating the allocation
5582 * context down to all allocation sites.
5583 */
5584 noreclaim_flag = memalloc_noreclaim_save();
5585 ret = __netif_receive_skb_one_core(skb, true);
5586 memalloc_noreclaim_restore(noreclaim_flag);
5587 } else
5588 ret = __netif_receive_skb_one_core(skb, false);
5589
5590 return ret;
5591}
5592
5593static void __netif_receive_skb_list(struct list_head *head)
5594{
5595 unsigned long noreclaim_flag = 0;
5596 struct sk_buff *skb, *next;
5597 bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5598
5599 list_for_each_entry_safe(skb, next, head, list) {
5600 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5601 struct list_head sublist;
5602
5603 /* Handle the previous sublist */
5604 list_cut_before(&sublist, head, &skb->list);
5605 if (!list_empty(&sublist))
5606 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5607 pfmemalloc = !pfmemalloc;
5608 /* See comments in __netif_receive_skb */
5609 if (pfmemalloc)
5610 noreclaim_flag = memalloc_noreclaim_save();
5611 else
5612 memalloc_noreclaim_restore(noreclaim_flag);
5613 }
5614 }
5615 /* Handle the remaining sublist */
5616 if (!list_empty(head))
5617 __netif_receive_skb_list_core(head, pfmemalloc);
5618 /* Restore pflags */
5619 if (pfmemalloc)
5620 memalloc_noreclaim_restore(noreclaim_flag);
5621}
5622
5623static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5624{
5625 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5626 struct bpf_prog *new = xdp->prog;
5627 int ret = 0;
5628
5629 switch (xdp->command) {
5630 case XDP_SETUP_PROG:
5631 rcu_assign_pointer(dev->xdp_prog, new);
5632 if (old)
5633 bpf_prog_put(old);
5634
5635 if (old && !new) {
5636 static_branch_dec(&generic_xdp_needed_key);
5637 } else if (new && !old) {
5638 static_branch_inc(&generic_xdp_needed_key);
5639 dev_disable_lro(dev);
5640 dev_disable_gro_hw(dev);
5641 }
5642 break;
5643
5644 default:
5645 ret = -EINVAL;
5646 break;
5647 }
5648
5649 return ret;
5650}
5651
5652static int netif_receive_skb_internal(struct sk_buff *skb)
5653{
5654 int ret;
5655
5656 net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5657
5658 if (skb_defer_rx_timestamp(skb))
5659 return NET_RX_SUCCESS;
5660
5661 rcu_read_lock();
5662#ifdef CONFIG_RPS
5663 if (static_branch_unlikely(&rps_needed)) {
5664 struct rps_dev_flow voidflow, *rflow = &voidflow;
5665 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5666
5667 if (cpu >= 0) {
5668 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5669 rcu_read_unlock();
5670 return ret;
5671 }
5672 }
5673#endif
5674 ret = __netif_receive_skb(skb);
5675 rcu_read_unlock();
5676 return ret;
5677}
5678
5679void netif_receive_skb_list_internal(struct list_head *head)
5680{
5681 struct sk_buff *skb, *next;
5682 struct list_head sublist;
5683
5684 INIT_LIST_HEAD(&sublist);
5685 list_for_each_entry_safe(skb, next, head, list) {
5686 net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5687 skb_list_del_init(skb);
5688 if (!skb_defer_rx_timestamp(skb))
5689 list_add_tail(&skb->list, &sublist);
5690 }
5691 list_splice_init(&sublist, head);
5692
5693 rcu_read_lock();
5694#ifdef CONFIG_RPS
5695 if (static_branch_unlikely(&rps_needed)) {
5696 list_for_each_entry_safe(skb, next, head, list) {
5697 struct rps_dev_flow voidflow, *rflow = &voidflow;
5698 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5699
5700 if (cpu >= 0) {
5701 /* Will be handled, remove from list */
5702 skb_list_del_init(skb);
5703 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5704 }
5705 }
5706 }
5707#endif
5708 __netif_receive_skb_list(head);
5709 rcu_read_unlock();
5710}
5711
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);
	trace_netif_receive_skb_exit(rc);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
5739
5740/**
5741 * netif_receive_skb_list - process many receive buffers from network
5742 * @head: list of skbs to process.
5743 *
5744 * Since return value of netif_receive_skb() is normally ignored, and
5745 * wouldn't be meaningful for a list, this function returns void.
5746 *
5747 * This function may only be called from softirq context and interrupts
5748 * should be enabled.
5749 */
5750void netif_receive_skb_list(struct list_head *head)
5751{
5752 struct sk_buff *skb;
5753
5754 if (list_empty(head))
5755 return;
5756 if (trace_netif_receive_skb_list_entry_enabled()) {
5757 list_for_each_entry(skb, head, list)
5758 trace_netif_receive_skb_list_entry(skb);
5759 }
5760 netif_receive_skb_list_internal(head);
5761 trace_netif_receive_skb_list_exit(0);
5762}
5763EXPORT_SYMBOL(netif_receive_skb_list);
5764
/* Per-CPU work items.  flush_all_backlogs() queues the item of every CPU
 * whose backlog needs flushing and then waits for each of them.
 * NOTE(review): presumably initialised elsewhere with flush_backlog() as
 * the handler - that is not visible in this part of the file.
 */
static DEFINE_PER_CPU(struct work_struct, flush_works);
5766
5767/* Network device is going away, flush any packets still pending */
5768static void flush_backlog(struct work_struct *work)
5769{
5770 struct sk_buff *skb, *tmp;
5771 struct softnet_data *sd;
5772
5773 local_bh_disable();
5774 sd = this_cpu_ptr(&softnet_data);
5775
5776 rps_lock_irq_disable(sd);
5777 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5778 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5779 __skb_unlink(skb, &sd->input_pkt_queue);
5780 dev_kfree_skb_irq(skb);
5781 input_queue_head_incr(sd);
5782 }
5783 }
5784 rps_unlock_irq_enable(sd);
5785
5786 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5787 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5788 __skb_unlink(skb, &sd->process_queue);
5789 kfree_skb(skb);
5790 input_queue_head_incr(sd);
5791 }
5792 }
5793 local_bh_enable();
5794}
5795
5796static bool flush_required(int cpu)
5797{
5798#if IS_ENABLED(CONFIG_RPS)
5799 struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5800 bool do_flush;
5801
5802 rps_lock_irq_disable(sd);
5803
5804 /* as insertion into process_queue happens with the rps lock held,
5805 * process_queue access may race only with dequeue
5806 */
5807 do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5808 !skb_queue_empty_lockless(&sd->process_queue);
5809 rps_unlock_irq_enable(sd);
5810
5811 return do_flush;
5812#endif
5813 /* without RPS we can't safely check input_pkt_queue: during a
5814 * concurrent remote skb_queue_splice() we can detect as empty both
5815 * input_pkt_queue and process_queue even if the latter could end-up
5816 * containing a lot of packets.
5817 */
5818 return true;
5819}
5820
5821static void flush_all_backlogs(void)
5822{
5823 static cpumask_t flush_cpus;
5824 unsigned int cpu;
5825
5826 /* since we are under rtnl lock protection we can use static data
5827 * for the cpumask and avoid allocating on stack the possibly
5828 * large mask
5829 */
5830 ASSERT_RTNL();
5831
5832 cpus_read_lock();
5833
5834 cpumask_clear(&flush_cpus);
5835 for_each_online_cpu(cpu) {
5836 if (flush_required(cpu)) {
5837 queue_work_on(cpu, system_highpri_wq,
5838 per_cpu_ptr(&flush_works, cpu));
5839 cpumask_set_cpu(cpu, &flush_cpus);
5840 }
5841 }
5842
5843 /* we can have in flight packet[s] on the cpus we are not flushing,
5844 * synchronize_net() in unregister_netdevice_many() will take care of
5845 * them
5846 */
5847 for_each_cpu(cpu, &flush_cpus)
5848 flush_work(per_cpu_ptr(&flush_works, cpu));
5849
5850 cpus_read_unlock();
5851}
5852
/* Walk the chain of softnet_data linked through ->rps_ipi_next, starting
 * at @remsd, and send an async IPI to every entry whose CPU is online.
 * Compiles to an empty function without CONFIG_RPS.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
	struct softnet_data *following;

	for (; remsd; remsd = following) {
		/* Fetch the link before the IPI is sent. */
		following = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu, &remsd->csd);
	}
#endif
}
5865
/*
 * net_rps_action_and_irq_enable detaches the pending rps IPI list of @sd
 * (if any) and sends those IPIs.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	/* Take ownership of the list while irqs are still off. */
	if (pending)
		sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	if (pending)
		net_rps_send_ipi(pending);
#else
	local_irq_enable();
#endif
}
5886
5887static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5888{
5889#ifdef CONFIG_RPS
5890 return sd->rps_ipi_list != NULL;
5891#else
5892 return false;
5893#endif
5894}
5895
/* Poll routine of the backlog napi embedded in struct softnet_data.
 *
 * @napi:  the ->backlog member of a softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Packets are taken from sd->process_queue and fed to
 * __netif_receive_skb().  Whenever process_queue runs dry,
 * sd->input_pkt_queue is spliced onto it under the rps lock; if that
 * queue is empty as well, the napi state is cleared and polling stops.
 *
 * Returns the number of packets processed; returns as soon as that
 * number reaches @quota.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
	bool again = true;
	int work = 0;

	/* Check if we have pending IPIs; it is better to send them now
	 * than to wait for the end of net_rx_action().
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	/* Refresh the weight from the current dev_rx_weight setting. */
	napi->weight = READ_ONCE(dev_rx_weight);
	while (again) {
		struct sk_buff *skb;

		/* Drain the process queue; it is accessed here without the
		 * rps lock.
		 */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			input_queue_head_incr(sd);
			/* Quota reached: leave; napi->state is not touched. */
			if (++work >= quota)
				return work;

		}

		rps_lock_irq_disable(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			again = false;
		} else {
			/* Move everything queued so far to the process queue
			 * and go round again.
			 */
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
		}
		rps_unlock_irq_enable(sd);
	}

	return work;
}
5945
5946/**
5947 * __napi_schedule - schedule for receive
5948 * @n: entry to schedule
5949 *
5950 * The entry's receive function will be scheduled to run.
5951 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
5952 */
5953void __napi_schedule(struct napi_struct *n)
5954{
5955 unsigned long flags;
5956
5957 local_irq_save(flags);
5958 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5959 local_irq_restore(flags);
5960}
5961EXPORT_SYMBOL(__napi_schedule);
5962
/**
 *	napi_schedule_prep - check if napi can be scheduled
 *	@n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable to
 * ensure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 *
 * Return: true if this call is the one that set NAPIF_STATE_SCHED;
 * false if NAPIF_STATE_DISABLE was set, or if NAPIF_STATE_SCHED was
 * already set (in which case NAPIF_STATE_MISSED gets set as well).
 */
bool napi_schedule_prep(struct napi_struct *n)
{
	unsigned long new, val = READ_ONCE(n->state);

	do {
		/* A disable is pending: refuse without modifying the state. */
		if (unlikely(val & NAPIF_STATE_DISABLE))
			return false;
		new = val | NAPIF_STATE_SCHED;

		/* Sets STATE_MISSED bit if STATE_SCHED was already set
		 * This was suggested by Alexander Duyck, as compiler
		 * emits better code than :
		 * if (val & NAPIF_STATE_SCHED)
		 *     new |= NAPIF_STATE_MISSED;
		 */
		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
						   NAPIF_STATE_MISSED;
	} while (!try_cmpxchg(&n->state, &val, new));

	/* val holds the state that was replaced by the successful cmpxchg. */
	return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);
5994
5995/**
5996 * __napi_schedule_irqoff - schedule for receive
5997 * @n: entry to schedule
5998 *
5999 * Variant of __napi_schedule() assuming hard irqs are masked.
6000 *
6001 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6002 * because the interrupt disabled assumption might not be true
6003 * due to force-threaded interrupts and spinlock substitution.
6004 */
6005void __napi_schedule_irqoff(struct napi_struct *n)
6006{
6007 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6008 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6009 else
6010 __napi_schedule(n);
6011}
6012EXPORT_SYMBOL(__napi_schedule_irqoff);
6013
/**
 * napi_complete_done - finish a NAPI poll round
 * @n: napi context
 * @work_done: amount of work the poll round reported
 *
 * Flushes GRO state, removes @n from the poll list if it is still on
 * one, and clears the scheduling bits in @n->state.  If
 * NAPIF_STATE_MISSED was set in the meantime the instance is
 * rescheduled instead.  When a timeout was picked up from
 * gro_flush_timeout, @n->timer is armed with it.
 *
 * Return: false if @n is in NPSVC or busy-poll state (nothing is done),
 * if it had to be rescheduled because of NAPIF_STATE_MISSED, or if a
 * hard-irq deferral with a non-zero timeout is in effect; true
 * otherwise.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags, val, new, timeout = 0;
	bool ret = true;

	/*
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case its running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
	 */
	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
				 NAPIF_STATE_IN_BUSY_POLL)))
		return false;

	if (work_done) {
		/* Work was done: pick up the GRO flush timeout (only when
		 * GRO holds packets) and reload the deferral counter.
		 */
		if (n->gro_bitmask)
			timeout = READ_ONCE(n->dev->gro_flush_timeout);
		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
	}
	if (n->defer_hard_irqs_count > 0) {
		/* Consume one deferral; with a non-zero timeout the function
		 * will return false.
		 */
		n->defer_hard_irqs_count--;
		timeout = READ_ONCE(n->dev->gro_flush_timeout);
		if (timeout)
			ret = false;
	}
	if (n->gro_bitmask) {
		/* When the NAPI instance uses a timeout and keeps postponing
		 * it, we need to bound somehow the time packets are kept in
		 * the GRO layer
		 */
		napi_gro_flush(n, !!timeout);
	}

	gro_normal_list(n);

	if (unlikely(!list_empty(&n->poll_list))) {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		list_del_init(&n->poll_list);
		local_irq_restore(flags);
	}

	val = READ_ONCE(n->state);
	do {
		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
			      NAPIF_STATE_SCHED_THREADED |
			      NAPIF_STATE_PREFER_BUSY_POLL);

		/* If STATE_MISSED was set, leave STATE_SCHED set,
		 * because we will call napi->poll() one more time.
		 * This C code was suggested by Alexander Duyck to help gcc.
		 */
		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
						    NAPIF_STATE_SCHED;
	} while (!try_cmpxchg(&n->state, &val, new));

	/* val is the state that the successful cmpxchg replaced. */
	if (unlikely(val & NAPIF_STATE_MISSED)) {
		__napi_schedule(n);
		return false;
	}

	if (timeout)
		hrtimer_start(&n->timer, ns_to_ktime(timeout),
			      HRTIMER_MODE_REL_PINNED);
	return ret;
}
EXPORT_SYMBOL(napi_complete_done);
6084
6085/* must be called under rcu_read_lock(), as we dont take a reference */
6086static struct napi_struct *napi_by_id(unsigned int napi_id)
6087{
6088 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6089 struct napi_struct *napi;
6090
6091 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6092 if (napi->napi_id == napi_id)
6093 return napi;
6094
6095 return NULL;
6096}
6097
6098#if defined(CONFIG_NET_RX_BUSY_POLL)
6099
6100static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6101{
6102 if (!skip_schedule) {
6103 gro_normal_list(napi);
6104 __napi_schedule(napi);
6105 return;
6106 }
6107
6108 if (napi->gro_bitmask) {
6109 /* flush too old packets
6110 * If HZ < 1000, flush all packets.
6111 */
6112 napi_gro_flush(napi, HZ >= 1000);
6113 }
6114
6115 gro_normal_list(napi);
6116 clear_bit(NAPI_STATE_SCHED, &napi->state);
6117}
6118
/* Leave busy-poll mode on @napi.
 *
 * @napi:             instance that was being busy polled
 * @have_poll_lock:   cookie handed to netpoll_poll_unlock()
 * @prefer_busy_poll: when true, try to arm napi->timer instead of
 *                    rescheduling the napi
 * @budget:           budget passed to the final napi->poll() call
 *
 * Clears the MISSED and IN_BUSY_POLL bits and calls napi->poll() one
 * more time with bottom halves disabled.
 */
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
			   u16 budget)
{
	bool skip_schedule = false;
	unsigned long timeout;
	int rc;

	/* Busy polling means there is a high chance device driver hard irq
	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
	 * set in napi_schedule_prep().
	 * Since we are about to call napi->poll() once more, we can safely
	 * clear NAPI_STATE_MISSED.
	 *
	 * Note: x86 could use a single "lock and ..." instruction
	 * to perform these two clear_bit()
	 */
	clear_bit(NAPI_STATE_MISSED, &napi->state);
	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

	local_bh_disable();

	if (prefer_busy_poll) {
		/* Reload the deferral counter; if both it and the GRO flush
		 * timeout are non-zero, arm the timer and skip the
		 * reschedule in __busy_poll_stop().
		 */
		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
		if (napi->defer_hard_irqs_count && timeout) {
			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
			skip_schedule = true;
		}
	}

	/* All we really want here is to re-enable device interrupts.
	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
	 */
	rc = napi->poll(napi, budget);
	/* We can't gro_normal_list() here, because napi->poll() might have
	 * rearmed the napi (napi_complete_done()) in which case it could
	 * already be running on another CPU.
	 */
	trace_napi_poll(napi, rc, budget);
	netpoll_poll_unlock(have_poll_lock);
	/* Only when the poll used its whole budget is the napi finished
	 * off here.
	 */
	if (rc == budget)
		__busy_poll_stop(napi, skip_schedule);
	local_bh_enable();
}
6163
/* Busy poll the napi instance identified by @napi_id.
 *
 * @napi_id:          id looked up through napi_by_id(); nothing is done
 *                    when no instance matches
 * @loop_end:         callback deciding when to stop; it receives
 *                    @loop_end_arg and the loop start time.  When NULL,
 *                    the loop body runs once.
 * @loop_end_arg:     opaque argument for @loop_end
 * @prefer_busy_poll: when the napi cannot be claimed, set
 *                    NAPI_STATE_PREFER_BUSY_POLL on it; also forwarded
 *                    to busy_poll_stop()
 * @budget:           budget passed to the napi poll function
 *
 * The lookup and the polling run under rcu_read_lock(); the loop itself
 * runs with preemption disabled and each iteration with bottom halves
 * disabled.
 */
void napi_busy_loop(unsigned int napi_id,
		    bool (*loop_end)(void *, unsigned long),
		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
	int (*napi_poll)(struct napi_struct *napi, int budget);
	void *have_poll_lock = NULL;
	struct napi_struct *napi;

restart:
	/* NULL napi_poll means "this thread has not claimed the napi". */
	napi_poll = NULL;

	rcu_read_lock();

	napi = napi_by_id(napi_id);
	if (!napi)
		goto out;

	preempt_disable();
	for (;;) {
		int work = 0;

		local_bh_disable();
		if (!napi_poll) {
			unsigned long val = READ_ONCE(napi->state);

			/* If multiple threads are competing for this napi,
			 * we avoid dirtying napi->state as much as we can.
			 */
			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
				   NAPIF_STATE_IN_BUSY_POLL)) {
				if (prefer_busy_poll)
					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
				goto count;
			}
			/* Try to claim the napi by setting IN_BUSY_POLL and
			 * SCHED in one step; on failure behave as above.
			 */
			if (cmpxchg(&napi->state, val,
				    val | NAPIF_STATE_IN_BUSY_POLL |
					  NAPIF_STATE_SCHED) != val) {
				if (prefer_busy_poll)
					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
				goto count;
			}
			have_poll_lock = netpoll_poll_lock(napi);
			napi_poll = napi->poll;
		}
		work = napi_poll(napi, budget);
		trace_napi_poll(napi, work, budget);
		gro_normal_list(napi);
count:
		/* work stays 0 when the napi could not be claimed. */
		if (work > 0)
			__NET_ADD_STATS(dev_net(napi->dev),
					LINUX_MIB_BUSYPOLLRXPACKETS, work);
		local_bh_enable();

		if (!loop_end || loop_end(loop_end_arg, start_time))
			break;

		if (unlikely(need_resched())) {
			/* Release the napi (if claimed) and drop preemption
			 * and RCU protection before rescheduling; the napi
			 * is looked up again after the restart.
			 */
			if (napi_poll)
				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
			preempt_enable();
			rcu_read_unlock();
			cond_resched();
			/* loop_end is non-NULL here, otherwise the loop
			 * would have been left above.
			 */
			if (loop_end(loop_end_arg, start_time))
				return;
			goto restart;
		}
		cpu_relax();
	}
	if (napi_poll)
		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
	preempt_enable();
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);
6240
6241#endif /* CONFIG_NET_RX_BUSY_POLL */
6242
6243static void napi_hash_add(struct napi_struct *napi)
6244{
6245 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6246 return;
6247
6248 spin_lock(&napi_hash_lock);
6249
6250 /* 0..NR_CPUS range is reserved for sender_cpu use */
6251 do {
6252 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6253 napi_gen_id = MIN_NAPI_ID;
6254 } while (napi_by_id(napi_gen_id));
6255 napi->napi_id = napi_gen_id;
6256
6257 hlist_add_head_rcu(&napi->napi_hash_node,
6258 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6259
6260 spin_unlock(&napi_hash_lock);
6261}
6262
6263/* Warning : caller is responsible to make sure rcu grace period
6264 * is respected before freeing memory containing @napi
6265 */
6266static void napi_hash_del(struct napi_struct *napi)
6267{
6268 spin_lock(&napi_hash_lock);
6269
6270 hlist_del_init_rcu(&napi->napi_hash_node);
6271
6272 spin_unlock(&napi_hash_lock);
6273}
6274
6275static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6276{
6277 struct napi_struct *napi;
6278
6279 napi = container_of(timer, struct napi_struct, timer);
6280
6281 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6282 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6283 */
6284 if (!napi_disable_pending(napi) &&
6285 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6286 clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6287 __napi_schedule_irqoff(napi);
6288 }
6289
6290 return HRTIMER_NORESTART;
6291}
6292
6293static void init_gro_hash(struct napi_struct *napi)
6294{
6295 int i;
6296
6297 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6298 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6299 napi->gro_hash[i].count = 0;
6300 }
6301 napi->gro_bitmask = 0;
6302}
6303
6304int dev_set_threaded(struct net_device *dev, bool threaded)
6305{
6306 struct napi_struct *napi;
6307 int err = 0;
6308
6309 if (dev->threaded == threaded)
6310 return 0;
6311
6312 if (threaded) {
6313 list_for_each_entry(napi, &dev->napi_list, dev_list) {
6314 if (!napi->thread) {
6315 err = napi_kthread_create(napi);
6316 if (err) {
6317 threaded = false;
6318 break;
6319 }
6320 }
6321 }
6322 }
6323
6324 dev->threaded = threaded;
6325
6326 /* Make sure kthread is created before THREADED bit
6327 * is set.
6328 */
6329 smp_mb__before_atomic();
6330
6331 /* Setting/unsetting threaded mode on a napi might not immediately
6332 * take effect, if the current napi instance is actively being
6333 * polled. In this case, the switch between threaded mode and
6334 * softirq mode will happen in the next round of napi_schedule().
6335 * This should not cause hiccups/stalls to the live traffic.
6336 */
6337 list_for_each_entry(napi, &dev->napi_list, dev_list) {
6338 if (threaded)
6339 set_bit(NAPI_STATE_THREADED, &napi->state);
6340 else
6341 clear_bit(NAPI_STATE_THREADED, &napi->state);
6342 }
6343
6344 return err;
6345}
6346EXPORT_SYMBOL(dev_set_threaded);
6347
/* Initialise @napi and attach it to @dev.
 *
 * @dev:    device the napi belongs to
 * @napi:   napi context to initialise
 * @poll:   poll callback stored in napi->poll
 * @weight: value stored in napi->weight; a message is logged once when
 *          it exceeds NAPI_POLL_WEIGHT, but the value is used as given
 *
 * Warns and does nothing if @napi already has NAPI_STATE_LISTED set.
 * The napi is left with the SCHED and NPSVC bits set, which is the state
 * napi_enable() clears.
 */
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
			   int (*poll)(struct napi_struct *, int), int weight)
{
	/* LISTED doubles as the "already added" marker. */
	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
		return;

	INIT_LIST_HEAD(&napi->poll_list);
	INIT_HLIST_NODE(&napi->napi_hash_node);
	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	napi->timer.function = napi_watchdog;
	init_gro_hash(napi);
	napi->skb = NULL;
	INIT_LIST_HEAD(&napi->rx_list);
	napi->rx_count = 0;
	napi->poll = poll;
	if (weight > NAPI_POLL_WEIGHT)
		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
				weight);
	napi->weight = weight;
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	napi->poll_owner = -1;
#endif
	set_bit(NAPI_STATE_SCHED, &napi->state);
	set_bit(NAPI_STATE_NPSVC, &napi->state);
	/* Publish the instance on the device list and in the id hash only
	 * after the fields above are set up.
	 */
	list_add_rcu(&napi->dev_list, &dev->napi_list);
	napi_hash_add(napi);
	napi_get_frags_check(napi);
	/* Create kthread for this napi if dev->threaded is set.
	 * Clear dev->threaded if kthread creation failed so that
	 * threaded mode will not be enabled in napi_enable().
	 */
	if (dev->threaded && napi_kthread_create(napi))
		dev->threaded = 0;
}
EXPORT_SYMBOL(netif_napi_add_weight);
6384
/**
 * napi_disable - prevent NAPI from scheduling
 * @n: NAPI context
 *
 * Waits (sleeping) until neither NAPIF_STATE_SCHED nor NAPIF_STATE_NPSVC
 * is set on @n, then takes both bits itself and clears the THREADED and
 * PREFER_BUSY_POLL bits.  NAPI_STATE_DISABLE is set for the duration of
 * the call, and @n->timer is cancelled before returning.
 * May sleep.
 */
void napi_disable(struct napi_struct *n)
{
	unsigned long val, new;

	might_sleep();
	/* Announce the pending disable to the scheduling paths. */
	set_bit(NAPI_STATE_DISABLE, &n->state);

	val = READ_ONCE(n->state);
	do {
		/* Sleep-poll until the current owner lets go of the napi. */
		while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
			usleep_range(20, 200);
			val = READ_ONCE(n->state);
		}

		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
	} while (!try_cmpxchg(&n->state, &val, new));

	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
6408
/**
 *	napi_enable - enable NAPI scheduling
 *	@n: NAPI context
 *
 * Resume NAPI from being scheduled on this context.
 * Must be paired with napi_disable.
 *
 * Clears the SCHED and NPSVC bits of @n->state.  NAPIF_STATE_THREADED
 * is set when the device is in threaded mode and @n has a kthread.
 * It is a BUG if NAPI_STATE_SCHED is not set on entry.
 */
void napi_enable(struct napi_struct *n)
{
	unsigned long new, val = READ_ONCE(n->state);

	do {
		/* The bit is tested on the local snapshot, not on n->state. */
		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));

		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
		if (n->dev->threaded && n->thread)
			new |= NAPIF_STATE_THREADED;
	} while (!try_cmpxchg(&n->state, &val, new));
}
EXPORT_SYMBOL(napi_enable);
6429
6430static void flush_gro_hash(struct napi_struct *napi)
6431{
6432 int i;
6433
6434 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6435 struct sk_buff *skb, *n;
6436
6437 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6438 kfree_skb(skb);
6439 napi->gro_hash[i].count = 0;
6440 }
6441}
6442
6443/* Must be called in process context */
6444void __netif_napi_del(struct napi_struct *napi)
6445{
6446 if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6447 return;
6448
6449 napi_hash_del(napi);
6450 list_del_rcu(&napi->dev_list);
6451 napi_free_frags(napi);
6452
6453 flush_gro_hash(napi);
6454 napi->gro_bitmask = 0;
6455
6456 if (napi->thread) {
6457 kthread_stop(napi->thread);
6458 napi->thread = NULL;
6459 }
6460}
6461EXPORT_SYMBOL(__netif_napi_del);
6462
/* Run one poll round on @n.
 *
 * @n:      napi instance to poll
 * @repoll: set to true when @n used its whole weight and should be
 *          polled again by the caller; left untouched otherwise
 *
 * Returns the amount of work reported by n->poll(), or 0 when
 * NAPI_STATE_SCHED was not set and ->poll() was therefore not called.
 */
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
	int work, weight;

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}

	if (unlikely(work > weight))
		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
				n->poll, work, weight);

	/* Less than the full weight was used: nothing more to do here. */
	if (likely(work < weight))
		return work;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		return work;
	}

	/* The NAPI context has more processing work, but busy-polling
	 * is preferred. Exit early.
	 */
	if (napi_prefer_busy_poll(n)) {
		if (napi_complete_done(n, work)) {
			/* If timeout is not set, we need to make sure
			 * that the NAPI is re-scheduled.
			 */
			napi_schedule(n);
		}
		return work;
	}

	if (n->gro_bitmask) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	gro_normal_list(n);

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		return work;
	}

	*repoll = true;

	return work;
}
6533
6534static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6535{
6536 bool do_repoll = false;
6537 void *have;
6538 int work;
6539
6540 list_del_init(&n->poll_list);
6541
6542 have = netpoll_poll_lock(n);
6543
6544 work = __napi_poll(n, &do_repoll);
6545
6546 if (do_repoll)
6547 list_add_tail(&n->poll_list, repoll);
6548
6549 netpoll_poll_unlock(have);
6550
6551 return work;
6552}
6553
/* Put the calling kthread to sleep until it has work on @napi.
 *
 * Returns 0 when the thread should poll @napi (SCHED_THREADED is set, or
 * the thread has gone through schedule() at least once in this call),
 * and -1 when kthread_should_stop() reports that the thread must exit.
 * The task state is TASK_RUNNING on return in both cases.
 */
static int napi_thread_wait(struct napi_struct *napi)
{
	bool woken = false;

	/* Set the sleeping state before testing the conditions below. */
	set_current_state(TASK_INTERRUPTIBLE);

	while (!kthread_should_stop()) {
		/* Testing SCHED_THREADED bit here to make sure the current
		 * kthread owns this napi and could poll on this napi.
		 * Testing SCHED bit is not enough because SCHED bit might be
		 * set by some other busy poll thread or by napi_disable().
		 */
		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
			WARN_ON(!list_empty(&napi->poll_list));
			__set_current_state(TASK_RUNNING);
			return 0;
		}

		schedule();
		/* woken being true indicates this thread owns this napi. */
		woken = true;
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	return -1;
}
6581
6582static int napi_threaded_poll(void *data)
6583{
6584 struct napi_struct *napi = data;
6585 void *have;
6586
6587 while (!napi_thread_wait(napi)) {
6588 for (;;) {
6589 bool repoll = false;
6590
6591 local_bh_disable();
6592
6593 have = netpoll_poll_lock(napi);
6594 __napi_poll(napi, &repoll);
6595 netpoll_poll_unlock(have);
6596
6597 local_bh_enable();
6598
6599 if (!repoll)
6600 break;
6601
6602 cond_resched();
6603 }
6604 }
6605 return 0;
6606}
6607
6608static void skb_defer_free_flush(struct softnet_data *sd)
6609{
6610 struct sk_buff *skb, *next;
6611 unsigned long flags;
6612
6613 /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
6614 if (!READ_ONCE(sd->defer_list))
6615 return;
6616
6617 spin_lock_irqsave(&sd->defer_lock, flags);
6618 skb = sd->defer_list;
6619 sd->defer_list = NULL;
6620 sd->defer_count = 0;
6621 spin_unlock_irqrestore(&sd->defer_lock, flags);
6622
6623 while (skb != NULL) {
6624 next = skb->next;
6625 napi_consume_skb(skb, 1);
6626 skb = next;
6627 }
6628}
6629
/* Receive softirq handler: polls the napi instances queued on this CPU's
 * softnet_data poll list.
 *
 * The poll list is moved to a private list and its entries are polled
 * one by one until the list is empty, the packet budget (netdev_budget)
 * is used up, or the time limit derived from netdev_budget_usecs has
 * passed.  Entries that were not polled or that asked for a repoll are
 * put back on sd->poll_list, and the softirq is raised again if that
 * list is not empty.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
	int budget = READ_ONCE(netdev_budget);
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	/* Grab the pending poll list with irqs off. */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		skb_defer_free_flush(sd);

		if (list_empty(&list)) {
			/* Nothing to requeue and no IPI to send: skip the
			 * irq-off epilogue entirely.
			 */
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto end;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If the softirq budget or time window is exhausted then
		 * punt, and account the event in time_squeeze.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	/* Rebuild sd->poll_list as: unpolled entries, then entries that
	 * were scheduled while we were polling, then the repoll entries.
	 */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	/* Sends pending rps IPIs, if any, and re-enables irqs. */
	net_rps_action_and_irq_enable(sd);
end:;
}
6679
/* One entry of a device's adjacency list (dev->adj_list.upper or
 * dev->adj_list.lower), describing a directly linked device.
 */
struct netdev_adjacent {
	/* the adjacent device this entry refers to */
	struct net_device *dev;
	/* NOTE(review): presumably tracks the reference held on @dev -
	 * not visible in this part of the file
	 */
	netdevice_tracker dev_tracker;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* lookup ignore flag */
	bool ignore;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning adjacency list */
	struct list_head list;
	/* NOTE(review): presumably used for RCU-deferred freeing - confirm */
	struct rcu_head rcu;
};
6699
6700static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6701 struct list_head *adj_list)
6702{
6703 struct netdev_adjacent *adj;
6704
6705 list_for_each_entry(adj, adj_list, list) {
6706 if (adj->dev == adj_dev)
6707 return adj;
6708 }
6709 return NULL;
6710}
6711
6712static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6713 struct netdev_nested_priv *priv)
6714{
6715 struct net_device *dev = (struct net_device *)priv->data;
6716
6717 return upper_dev == dev;
6718}
6719
6720/**
6721 * netdev_has_upper_dev - Check if device is linked to an upper device
6722 * @dev: device
6723 * @upper_dev: upper device to check
6724 *
6725 * Find out if a device is linked to specified upper device and return true
6726 * in case it is. Note that this checks only immediate upper device,
6727 * not through a complete stack of devices. The caller must hold the RTNL lock.
6728 */
6729bool netdev_has_upper_dev(struct net_device *dev,
6730 struct net_device *upper_dev)
6731{
6732 struct netdev_nested_priv priv = {
6733 .data = (void *)upper_dev,
6734 };
6735
6736 ASSERT_RTNL();
6737
6738 return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6739 &priv);
6740}
6741EXPORT_SYMBOL(netdev_has_upper_dev);
6742
6743/**
6744 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6745 * @dev: device
6746 * @upper_dev: upper device to check
6747 *
6748 * Find out if a device is linked to specified upper device and return true
6749 * in case it is. Note that this checks the entire upper device chain.
6750 * The caller must hold rcu lock.
6751 */
6752
6753bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6754 struct net_device *upper_dev)
6755{
6756 struct netdev_nested_priv priv = {
6757 .data = (void *)upper_dev,
6758 };
6759
6760 return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6761 &priv);
6762}
6763EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6764
6765/**
6766 * netdev_has_any_upper_dev - Check if device is linked to some device
6767 * @dev: device
6768 *
6769 * Find out if a device is linked to an upper device and return true in case
6770 * it is. The caller must hold the RTNL lock.
6771 */
6772bool netdev_has_any_upper_dev(struct net_device *dev)
6773{
6774 ASSERT_RTNL();
6775
6776 return !list_empty(&dev->adj_list.upper);
6777}
6778EXPORT_SYMBOL(netdev_has_any_upper_dev);
6779
6780/**
6781 * netdev_master_upper_dev_get - Get master upper device
6782 * @dev: device
6783 *
6784 * Find a master upper device and return pointer to it or NULL in case
6785 * it's not there. The caller must hold the RTNL lock.
6786 */
6787struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6788{
6789 struct netdev_adjacent *upper;
6790
6791 ASSERT_RTNL();
6792
6793 if (list_empty(&dev->adj_list.upper))
6794 return NULL;
6795
6796 upper = list_first_entry(&dev->adj_list.upper,
6797 struct netdev_adjacent, list);
6798 if (likely(upper->master))
6799 return upper->dev;
6800 return NULL;
6801}
6802EXPORT_SYMBOL(netdev_master_upper_dev_get);
6803
6804static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6805{
6806 struct netdev_adjacent *upper;
6807
6808 ASSERT_RTNL();
6809
6810 if (list_empty(&dev->adj_list.upper))
6811 return NULL;
6812
6813 upper = list_first_entry(&dev->adj_list.upper,
6814 struct netdev_adjacent, list);
6815 if (likely(upper->master) && !upper->ignore)
6816 return upper->dev;
6817 return NULL;
6818}
6819
6820/**
6821 * netdev_has_any_lower_dev - Check if device is linked to some device
6822 * @dev: device
6823 *
6824 * Find out if a device is linked to a lower device and return true in case
6825 * it is. The caller must hold the RTNL lock.
6826 */
6827static bool netdev_has_any_lower_dev(struct net_device *dev)
6828{
6829 ASSERT_RTNL();
6830
6831 return !list_empty(&dev->adj_list.lower);
6832}
6833
6834void *netdev_adjacent_get_private(struct list_head *adj_list)
6835{
6836 struct netdev_adjacent *adj;
6837
6838 adj = list_entry(adj_list, struct netdev_adjacent, list);
6839
6840 return adj->private;
6841}
6842EXPORT_SYMBOL(netdev_adjacent_get_private);
6843
6844/**
6845 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6846 * @dev: device
6847 * @iter: list_head ** of the current position
6848 *
6849 * Gets the next device from the dev's upper list, starting from iter
6850 * position. The caller must hold RCU read lock.
6851 */
6852struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6853 struct list_head **iter)
6854{
6855 struct netdev_adjacent *upper;
6856
6857 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6858
6859 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6860
6861 if (&upper->list == &dev->adj_list.upper)
6862 return NULL;
6863
6864 *iter = &upper->list;
6865
6866 return upper->dev;
6867}
6868EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6869
6870static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
6871 struct list_head **iter,
6872 bool *ignore)
6873{
6874 struct netdev_adjacent *upper;
6875
6876 upper = list_entry((*iter)->next, struct netdev_adjacent, list);
6877
6878 if (&upper->list == &dev->adj_list.upper)
6879 return NULL;
6880
6881 *iter = &upper->list;
6882 *ignore = upper->ignore;
6883
6884 return upper->dev;
6885}
6886
6887static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6888 struct list_head **iter)
6889{
6890 struct netdev_adjacent *upper;
6891
6892 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6893
6894 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6895
6896 if (&upper->list == &dev->adj_list.upper)
6897 return NULL;
6898
6899 *iter = &upper->list;
6900
6901 return upper->dev;
6902}
6903
6904static int __netdev_walk_all_upper_dev(struct net_device *dev,
6905 int (*fn)(struct net_device *dev,
6906 struct netdev_nested_priv *priv),
6907 struct netdev_nested_priv *priv)
6908{
6909 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6910 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6911 int ret, cur = 0;
6912 bool ignore;
6913
6914 now = dev;
6915 iter = &dev->adj_list.upper;
6916
6917 while (1) {
6918 if (now != dev) {
6919 ret = fn(now, priv);
6920 if (ret)
6921 return ret;
6922 }
6923
6924 next = NULL;
6925 while (1) {
6926 udev = __netdev_next_upper_dev(now, &iter, &ignore);
6927 if (!udev)
6928 break;
6929 if (ignore)
6930 continue;
6931
6932 next = udev;
6933 niter = &udev->adj_list.upper;
6934 dev_stack[cur] = now;
6935 iter_stack[cur++] = iter;
6936 break;
6937 }
6938
6939 if (!next) {
6940 if (!cur)
6941 return 0;
6942 next = dev_stack[--cur];
6943 niter = iter_stack[cur];
6944 }
6945
6946 now = next;
6947 iter = niter;
6948 }
6949
6950 return 0;
6951}
6952
6953int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6954 int (*fn)(struct net_device *dev,
6955 struct netdev_nested_priv *priv),
6956 struct netdev_nested_priv *priv)
6957{
6958 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6959 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6960 int ret, cur = 0;
6961
6962 now = dev;
6963 iter = &dev->adj_list.upper;
6964
6965 while (1) {
6966 if (now != dev) {
6967 ret = fn(now, priv);
6968 if (ret)
6969 return ret;
6970 }
6971
6972 next = NULL;
6973 while (1) {
6974 udev = netdev_next_upper_dev_rcu(now, &iter);
6975 if (!udev)
6976 break;
6977
6978 next = udev;
6979 niter = &udev->adj_list.upper;
6980 dev_stack[cur] = now;
6981 iter_stack[cur++] = iter;
6982 break;
6983 }
6984
6985 if (!next) {
6986 if (!cur)
6987 return 0;
6988 next = dev_stack[--cur];
6989 niter = iter_stack[cur];
6990 }
6991
6992 now = next;
6993 iter = niter;
6994 }
6995
6996 return 0;
6997}
6998EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
6999
7000static bool __netdev_has_upper_dev(struct net_device *dev,
7001 struct net_device *upper_dev)
7002{
7003 struct netdev_nested_priv priv = {
7004 .flags = 0,
7005 .data = (void *)upper_dev,
7006 };
7007
7008 ASSERT_RTNL();
7009
7010 return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7011 &priv);
7012}
7013
7014/**
7015 * netdev_lower_get_next_private - Get the next ->private from the
7016 * lower neighbour list
7017 * @dev: device
7018 * @iter: list_head ** of the current position
7019 *
7020 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7021 * list, starting from iter position. The caller must hold either hold the
7022 * RTNL lock or its own locking that guarantees that the neighbour lower
7023 * list will remain unchanged.
7024 */
7025void *netdev_lower_get_next_private(struct net_device *dev,
7026 struct list_head **iter)
7027{
7028 struct netdev_adjacent *lower;
7029
7030 lower = list_entry(*iter, struct netdev_adjacent, list);
7031
7032 if (&lower->list == &dev->adj_list.lower)
7033 return NULL;
7034
7035 *iter = lower->list.next;
7036
7037 return lower->private;
7038}
7039EXPORT_SYMBOL(netdev_lower_get_next_private);
7040
7041/**
7042 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7043 * lower neighbour list, RCU
7044 * variant
7045 * @dev: device
7046 * @iter: list_head ** of the current position
7047 *
7048 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7049 * list, starting from iter position. The caller must hold RCU read lock.
7050 */
7051void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7052 struct list_head **iter)
7053{
7054 struct netdev_adjacent *lower;
7055
7056 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
7057
7058 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7059
7060 if (&lower->list == &dev->adj_list.lower)
7061 return NULL;
7062
7063 *iter = &lower->list;
7064
7065 return lower->private;
7066}
7067EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7068
7069/**
7070 * netdev_lower_get_next - Get the next device from the lower neighbour
7071 * list
7072 * @dev: device
7073 * @iter: list_head ** of the current position
7074 *
7075 * Gets the next netdev_adjacent from the dev's lower neighbour
7076 * list, starting from iter position. The caller must hold RTNL lock or
7077 * its own locking that guarantees that the neighbour lower
7078 * list will remain unchanged.
7079 */
7080void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7081{
7082 struct netdev_adjacent *lower;
7083
7084 lower = list_entry(*iter, struct netdev_adjacent, list);
7085
7086 if (&lower->list == &dev->adj_list.lower)
7087 return NULL;
7088
7089 *iter = lower->list.next;
7090
7091 return lower->dev;
7092}
7093EXPORT_SYMBOL(netdev_lower_get_next);
7094
7095static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7096 struct list_head **iter)
7097{
7098 struct netdev_adjacent *lower;
7099
7100 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7101
7102 if (&lower->list == &dev->adj_list.lower)
7103 return NULL;
7104
7105 *iter = &lower->list;
7106
7107 return lower->dev;
7108}
7109
7110static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7111 struct list_head **iter,
7112 bool *ignore)
7113{
7114 struct netdev_adjacent *lower;
7115
7116 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7117
7118 if (&lower->list == &dev->adj_list.lower)
7119 return NULL;
7120
7121 *iter = &lower->list;
7122 *ignore = lower->ignore;
7123
7124 return lower->dev;
7125}
7126
7127int netdev_walk_all_lower_dev(struct net_device *dev,
7128 int (*fn)(struct net_device *dev,
7129 struct netdev_nested_priv *priv),
7130 struct netdev_nested_priv *priv)
7131{
7132 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7133 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7134 int ret, cur = 0;
7135
7136 now = dev;
7137 iter = &dev->adj_list.lower;
7138
7139 while (1) {
7140 if (now != dev) {
7141 ret = fn(now, priv);
7142 if (ret)
7143 return ret;
7144 }
7145
7146 next = NULL;
7147 while (1) {
7148 ldev = netdev_next_lower_dev(now, &iter);
7149 if (!ldev)
7150 break;
7151
7152 next = ldev;
7153 niter = &ldev->adj_list.lower;
7154 dev_stack[cur] = now;
7155 iter_stack[cur++] = iter;
7156 break;
7157 }
7158
7159 if (!next) {
7160 if (!cur)
7161 return 0;
7162 next = dev_stack[--cur];
7163 niter = iter_stack[cur];
7164 }
7165
7166 now = next;
7167 iter = niter;
7168 }
7169
7170 return 0;
7171}
7172EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7173
7174static int __netdev_walk_all_lower_dev(struct net_device *dev,
7175 int (*fn)(struct net_device *dev,
7176 struct netdev_nested_priv *priv),
7177 struct netdev_nested_priv *priv)
7178{
7179 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7180 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7181 int ret, cur = 0;
7182 bool ignore;
7183
7184 now = dev;
7185 iter = &dev->adj_list.lower;
7186
7187 while (1) {
7188 if (now != dev) {
7189 ret = fn(now, priv);
7190 if (ret)
7191 return ret;
7192 }
7193
7194 next = NULL;
7195 while (1) {
7196 ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7197 if (!ldev)
7198 break;
7199 if (ignore)
7200 continue;
7201
7202 next = ldev;
7203 niter = &ldev->adj_list.lower;
7204 dev_stack[cur] = now;
7205 iter_stack[cur++] = iter;
7206 break;
7207 }
7208
7209 if (!next) {
7210 if (!cur)
7211 return 0;
7212 next = dev_stack[--cur];
7213 niter = iter_stack[cur];
7214 }
7215
7216 now = next;
7217 iter = niter;
7218 }
7219
7220 return 0;
7221}
7222
7223struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7224 struct list_head **iter)
7225{
7226 struct netdev_adjacent *lower;
7227
7228 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7229 if (&lower->list == &dev->adj_list.lower)
7230 return NULL;
7231
7232 *iter = &lower->list;
7233
7234 return lower->dev;
7235}
7236EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7237
7238static u8 __netdev_upper_depth(struct net_device *dev)
7239{
7240 struct net_device *udev;
7241 struct list_head *iter;
7242 u8 max_depth = 0;
7243 bool ignore;
7244
7245 for (iter = &dev->adj_list.upper,
7246 udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7247 udev;
7248 udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7249 if (ignore)
7250 continue;
7251 if (max_depth < udev->upper_level)
7252 max_depth = udev->upper_level;
7253 }
7254
7255 return max_depth;
7256}
7257
7258static u8 __netdev_lower_depth(struct net_device *dev)
7259{
7260 struct net_device *ldev;
7261 struct list_head *iter;
7262 u8 max_depth = 0;
7263 bool ignore;
7264
7265 for (iter = &dev->adj_list.lower,
7266 ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7267 ldev;
7268 ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7269 if (ignore)
7270 continue;
7271 if (max_depth < ldev->lower_level)
7272 max_depth = ldev->lower_level;
7273 }
7274
7275 return max_depth;
7276}
7277
7278static int __netdev_update_upper_level(struct net_device *dev,
7279 struct netdev_nested_priv *__unused)
7280{
7281 dev->upper_level = __netdev_upper_depth(dev) + 1;
7282 return 0;
7283}
7284
#ifdef CONFIG_LOCKDEP
/* Devices queued by net_unlink_todo(), linked through ->unlink_list. */
static LIST_HEAD(net_unlink_list);

/* Queue @dev on net_unlink_list unless its ->unlink_list is already linked. */
static void net_unlink_todo(struct net_device *dev)
{
	if (!list_empty(&dev->unlink_list))
		return;

	list_add_tail(&dev->unlink_list, &net_unlink_list);
}
#endif
7294
7295static int __netdev_update_lower_level(struct net_device *dev,
7296 struct netdev_nested_priv *priv)
7297{
7298 dev->lower_level = __netdev_lower_depth(dev) + 1;
7299
7300#ifdef CONFIG_LOCKDEP
7301 if (!priv)
7302 return 0;
7303
7304 if (priv->flags & NESTED_SYNC_IMM)
7305 dev->nested_level = dev->lower_level - 1;
7306 if (priv->flags & NESTED_SYNC_TODO)
7307 net_unlink_todo(dev);
7308#endif
7309 return 0;
7310}
7311
7312int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7313 int (*fn)(struct net_device *dev,
7314 struct netdev_nested_priv *priv),
7315 struct netdev_nested_priv *priv)
7316{
7317 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7318 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7319 int ret, cur = 0;
7320
7321 now = dev;
7322 iter = &dev->adj_list.lower;
7323
7324 while (1) {
7325 if (now != dev) {
7326 ret = fn(now, priv);
7327 if (ret)
7328 return ret;
7329 }
7330
7331 next = NULL;
7332 while (1) {
7333 ldev = netdev_next_lower_dev_rcu(now, &iter);
7334 if (!ldev)
7335 break;
7336
7337 next = ldev;
7338 niter = &ldev->adj_list.lower;
7339 dev_stack[cur] = now;
7340 iter_stack[cur++] = iter;
7341 break;
7342 }
7343
7344 if (!next) {
7345 if (!cur)
7346 return 0;
7347 next = dev_stack[--cur];
7348 niter = iter_stack[cur];
7349 }
7350
7351 now = next;
7352 iter = niter;
7353 }
7354
7355 return 0;
7356}
7357EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7358
7359/**
7360 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7361 * lower neighbour list, RCU
7362 * variant
7363 * @dev: device
7364 *
7365 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7366 * list. The caller must hold RCU read lock.
7367 */
7368void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7369{
7370 struct netdev_adjacent *lower;
7371
7372 lower = list_first_or_null_rcu(&dev->adj_list.lower,
7373 struct netdev_adjacent, list);
7374 if (lower)
7375 return lower->private;
7376 return NULL;
7377}
7378EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7379
7380/**
7381 * netdev_master_upper_dev_get_rcu - Get master upper device
7382 * @dev: device
7383 *
7384 * Find a master upper device and return pointer to it or NULL in case
7385 * it's not there. The caller must hold the RCU read lock.
7386 */
7387struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7388{
7389 struct netdev_adjacent *upper;
7390
7391 upper = list_first_or_null_rcu(&dev->adj_list.upper,
7392 struct netdev_adjacent, list);
7393 if (upper && likely(upper->master))
7394 return upper->dev;
7395 return NULL;
7396}
7397EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7398
7399static int netdev_adjacent_sysfs_add(struct net_device *dev,
7400 struct net_device *adj_dev,
7401 struct list_head *dev_list)
7402{
7403 char linkname[IFNAMSIZ+7];
7404
7405 sprintf(linkname, dev_list == &dev->adj_list.upper ?
7406 "upper_%s" : "lower_%s", adj_dev->name);
7407 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7408 linkname);
7409}
7410static void netdev_adjacent_sysfs_del(struct net_device *dev,
7411 char *name,
7412 struct list_head *dev_list)
7413{
7414 char linkname[IFNAMSIZ+7];
7415
7416 sprintf(linkname, dev_list == &dev->adj_list.upper ?
7417 "upper_%s" : "lower_%s", name);
7418 sysfs_remove_link(&(dev->dev.kobj), linkname);
7419}
7420
7421static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7422 struct net_device *adj_dev,
7423 struct list_head *dev_list)
7424{
7425 return (dev_list == &dev->adj_list.upper ||
7426 dev_list == &dev->adj_list.lower) &&
7427 net_eq(dev_net(dev), dev_net(adj_dev));
7428}
7429
7430static int __netdev_adjacent_dev_insert(struct net_device *dev,
7431 struct net_device *adj_dev,
7432 struct list_head *dev_list,
7433 void *private, bool master)
7434{
7435 struct netdev_adjacent *adj;
7436 int ret;
7437
7438 adj = __netdev_find_adj(adj_dev, dev_list);
7439
7440 if (adj) {
7441 adj->ref_nr += 1;
7442 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7443 dev->name, adj_dev->name, adj->ref_nr);
7444
7445 return 0;
7446 }
7447
7448 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7449 if (!adj)
7450 return -ENOMEM;
7451
7452 adj->dev = adj_dev;
7453 adj->master = master;
7454 adj->ref_nr = 1;
7455 adj->private = private;
7456 adj->ignore = false;
7457 netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
7458
7459 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7460 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7461
7462 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7463 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7464 if (ret)
7465 goto free_adj;
7466 }
7467
7468 /* Ensure that master link is always the first item in list. */
7469 if (master) {
7470 ret = sysfs_create_link(&(dev->dev.kobj),
7471 &(adj_dev->dev.kobj), "master");
7472 if (ret)
7473 goto remove_symlinks;
7474
7475 list_add_rcu(&adj->list, dev_list);
7476 } else {
7477 list_add_tail_rcu(&adj->list, dev_list);
7478 }
7479
7480 return 0;
7481
7482remove_symlinks:
7483 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7484 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7485free_adj:
7486 netdev_put(adj_dev, &adj->dev_tracker);
7487 kfree(adj);
7488
7489 return ret;
7490}
7491
7492static void __netdev_adjacent_dev_remove(struct net_device *dev,
7493 struct net_device *adj_dev,
7494 u16 ref_nr,
7495 struct list_head *dev_list)
7496{
7497 struct netdev_adjacent *adj;
7498
7499 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7500 dev->name, adj_dev->name, ref_nr);
7501
7502 adj = __netdev_find_adj(adj_dev, dev_list);
7503
7504 if (!adj) {
7505 pr_err("Adjacency does not exist for device %s from %s\n",
7506 dev->name, adj_dev->name);
7507 WARN_ON(1);
7508 return;
7509 }
7510
7511 if (adj->ref_nr > ref_nr) {
7512 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7513 dev->name, adj_dev->name, ref_nr,
7514 adj->ref_nr - ref_nr);
7515 adj->ref_nr -= ref_nr;
7516 return;
7517 }
7518
7519 if (adj->master)
7520 sysfs_remove_link(&(dev->dev.kobj), "master");
7521
7522 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7523 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7524
7525 list_del_rcu(&adj->list);
7526 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7527 adj_dev->name, dev->name, adj_dev->name);
7528 netdev_put(adj_dev, &adj->dev_tracker);
7529 kfree_rcu(adj, rcu);
7530}
7531
7532static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7533 struct net_device *upper_dev,
7534 struct list_head *up_list,
7535 struct list_head *down_list,
7536 void *private, bool master)
7537{
7538 int ret;
7539
7540 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7541 private, master);
7542 if (ret)
7543 return ret;
7544
7545 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7546 private, false);
7547 if (ret) {
7548 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7549 return ret;
7550 }
7551
7552 return 0;
7553}
7554
7555static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7556 struct net_device *upper_dev,
7557 u16 ref_nr,
7558 struct list_head *up_list,
7559 struct list_head *down_list)
7560{
7561 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7562 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7563}
7564
7565static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7566 struct net_device *upper_dev,
7567 void *private, bool master)
7568{
7569 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7570 &dev->adj_list.upper,
7571 &upper_dev->adj_list.lower,
7572 private, master);
7573}
7574
7575static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7576 struct net_device *upper_dev)
7577{
7578 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7579 &dev->adj_list.upper,
7580 &upper_dev->adj_list.lower);
7581}
7582
7583static int __netdev_upper_dev_link(struct net_device *dev,
7584 struct net_device *upper_dev, bool master,
7585 void *upper_priv, void *upper_info,
7586 struct netdev_nested_priv *priv,
7587 struct netlink_ext_ack *extack)
7588{
7589 struct netdev_notifier_changeupper_info changeupper_info = {
7590 .info = {
7591 .dev = dev,
7592 .extack = extack,
7593 },
7594 .upper_dev = upper_dev,
7595 .master = master,
7596 .linking = true,
7597 .upper_info = upper_info,
7598 };
7599 struct net_device *master_dev;
7600 int ret = 0;
7601
7602 ASSERT_RTNL();
7603
7604 if (dev == upper_dev)
7605 return -EBUSY;
7606
7607 /* To prevent loops, check if dev is not upper device to upper_dev. */
7608 if (__netdev_has_upper_dev(upper_dev, dev))
7609 return -EBUSY;
7610
7611 if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7612 return -EMLINK;
7613
7614 if (!master) {
7615 if (__netdev_has_upper_dev(dev, upper_dev))
7616 return -EEXIST;
7617 } else {
7618 master_dev = __netdev_master_upper_dev_get(dev);
7619 if (master_dev)
7620 return master_dev == upper_dev ? -EEXIST : -EBUSY;
7621 }
7622
7623 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7624 &changeupper_info.info);
7625 ret = notifier_to_errno(ret);
7626 if (ret)
7627 return ret;
7628
7629 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7630 master);
7631 if (ret)
7632 return ret;
7633
7634 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7635 &changeupper_info.info);
7636 ret = notifier_to_errno(ret);
7637 if (ret)
7638 goto rollback;
7639
7640 __netdev_update_upper_level(dev, NULL);
7641 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7642
7643 __netdev_update_lower_level(upper_dev, priv);
7644 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7645 priv);
7646
7647 return 0;
7648
7649rollback:
7650 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7651
7652 return ret;
7653}
7654
7655/**
7656 * netdev_upper_dev_link - Add a link to the upper device
7657 * @dev: device
7658 * @upper_dev: new upper device
7659 * @extack: netlink extended ack
7660 *
7661 * Adds a link to device which is upper to this one. The caller must hold
7662 * the RTNL lock. On a failure a negative errno code is returned.
7663 * On success the reference counts are adjusted and the function
7664 * returns zero.
7665 */
7666int netdev_upper_dev_link(struct net_device *dev,
7667 struct net_device *upper_dev,
7668 struct netlink_ext_ack *extack)
7669{
7670 struct netdev_nested_priv priv = {
7671 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7672 .data = NULL,
7673 };
7674
7675 return __netdev_upper_dev_link(dev, upper_dev, false,
7676 NULL, NULL, &priv, extack);
7677}
7678EXPORT_SYMBOL(netdev_upper_dev_link);
7679
7680/**
7681 * netdev_master_upper_dev_link - Add a master link to the upper device
7682 * @dev: device
7683 * @upper_dev: new upper device
7684 * @upper_priv: upper device private
7685 * @upper_info: upper info to be passed down via notifier
7686 * @extack: netlink extended ack
7687 *
7688 * Adds a link to device which is upper to this one. In this case, only
7689 * one master upper device can be linked, although other non-master devices
7690 * might be linked as well. The caller must hold the RTNL lock.
7691 * On a failure a negative errno code is returned. On success the reference
7692 * counts are adjusted and the function returns zero.
7693 */
7694int netdev_master_upper_dev_link(struct net_device *dev,
7695 struct net_device *upper_dev,
7696 void *upper_priv, void *upper_info,
7697 struct netlink_ext_ack *extack)
7698{
7699 struct netdev_nested_priv priv = {
7700 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7701 .data = NULL,
7702 };
7703
7704 return __netdev_upper_dev_link(dev, upper_dev, true,
7705 upper_priv, upper_info, &priv, extack);
7706}
7707EXPORT_SYMBOL(netdev_master_upper_dev_link);
7708
7709static void __netdev_upper_dev_unlink(struct net_device *dev,
7710 struct net_device *upper_dev,
7711 struct netdev_nested_priv *priv)
7712{
7713 struct netdev_notifier_changeupper_info changeupper_info = {
7714 .info = {
7715 .dev = dev,
7716 },
7717 .upper_dev = upper_dev,
7718 .linking = false,
7719 };
7720
7721 ASSERT_RTNL();
7722
7723 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7724
7725 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7726 &changeupper_info.info);
7727
7728 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7729
7730 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7731 &changeupper_info.info);
7732
7733 __netdev_update_upper_level(dev, NULL);
7734 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7735
7736 __netdev_update_lower_level(upper_dev, priv);
7737 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7738 priv);
7739}
7740
7741/**
7742 * netdev_upper_dev_unlink - Removes a link to upper device
7743 * @dev: device
7744 * @upper_dev: new upper device
7745 *
7746 * Removes a link to device which is upper to this one. The caller must hold
7747 * the RTNL lock.
7748 */
7749void netdev_upper_dev_unlink(struct net_device *dev,
7750 struct net_device *upper_dev)
7751{
7752 struct netdev_nested_priv priv = {
7753 .flags = NESTED_SYNC_TODO,
7754 .data = NULL,
7755 };
7756
7757 __netdev_upper_dev_unlink(dev, upper_dev, &priv);
7758}
7759EXPORT_SYMBOL(netdev_upper_dev_unlink);
7760
7761static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7762 struct net_device *lower_dev,
7763 bool val)
7764{
7765 struct netdev_adjacent *adj;
7766
7767 adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7768 if (adj)
7769 adj->ignore = val;
7770
7771 adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7772 if (adj)
7773 adj->ignore = val;
7774}
7775
7776static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7777 struct net_device *lower_dev)
7778{
7779 __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7780}
7781
7782static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7783 struct net_device *lower_dev)
7784{
7785 __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7786}
7787
7788int netdev_adjacent_change_prepare(struct net_device *old_dev,
7789 struct net_device *new_dev,
7790 struct net_device *dev,
7791 struct netlink_ext_ack *extack)
7792{
7793 struct netdev_nested_priv priv = {
7794 .flags = 0,
7795 .data = NULL,
7796 };
7797 int err;
7798
7799 if (!new_dev)
7800 return 0;
7801
7802 if (old_dev && new_dev != old_dev)
7803 netdev_adjacent_dev_disable(dev, old_dev);
7804 err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
7805 extack);
7806 if (err) {
7807 if (old_dev && new_dev != old_dev)
7808 netdev_adjacent_dev_enable(dev, old_dev);
7809 return err;
7810 }
7811
7812 return 0;
7813}
7814EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7815
7816void netdev_adjacent_change_commit(struct net_device *old_dev,
7817 struct net_device *new_dev,
7818 struct net_device *dev)
7819{
7820 struct netdev_nested_priv priv = {
7821 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7822 .data = NULL,
7823 };
7824
7825 if (!new_dev || !old_dev)
7826 return;
7827
7828 if (new_dev == old_dev)
7829 return;
7830
7831 netdev_adjacent_dev_enable(dev, old_dev);
7832 __netdev_upper_dev_unlink(old_dev, dev, &priv);
7833}
7834EXPORT_SYMBOL(netdev_adjacent_change_commit);
7835
7836void netdev_adjacent_change_abort(struct net_device *old_dev,
7837 struct net_device *new_dev,
7838 struct net_device *dev)
7839{
7840 struct netdev_nested_priv priv = {
7841 .flags = 0,
7842 .data = NULL,
7843 };
7844
7845 if (!new_dev)
7846 return;
7847
7848 if (old_dev && new_dev != old_dev)
7849 netdev_adjacent_dev_enable(dev, old_dev);
7850
7851 __netdev_upper_dev_unlink(new_dev, dev, &priv);
7852}
7853EXPORT_SYMBOL(netdev_adjacent_change_abort);
7854
7855/**
7856 * netdev_bonding_info_change - Dispatch event about slave change
7857 * @dev: device
7858 * @bonding_info: info to dispatch
7859 *
7860 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7861 * The caller must hold the RTNL lock.
7862 */
7863void netdev_bonding_info_change(struct net_device *dev,
7864 struct netdev_bonding_info *bonding_info)
7865{
7866 struct netdev_notifier_bonding_info info = {
7867 .info.dev = dev,
7868 };
7869
7870 memcpy(&info.bonding_info, bonding_info,
7871 sizeof(struct netdev_bonding_info));
7872 call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7873 &info.info);
7874}
7875EXPORT_SYMBOL(netdev_bonding_info_change);
7876
7877static int netdev_offload_xstats_enable_l3(struct net_device *dev,
7878 struct netlink_ext_ack *extack)
7879{
7880 struct netdev_notifier_offload_xstats_info info = {
7881 .info.dev = dev,
7882 .info.extack = extack,
7883 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
7884 };
7885 int err;
7886 int rc;
7887
7888 dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
7889 GFP_KERNEL);
7890 if (!dev->offload_xstats_l3)
7891 return -ENOMEM;
7892
7893 rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
7894 NETDEV_OFFLOAD_XSTATS_DISABLE,
7895 &info.info);
7896 err = notifier_to_errno(rc);
7897 if (err)
7898 goto free_stats;
7899
7900 return 0;
7901
7902free_stats:
7903 kfree(dev->offload_xstats_l3);
7904 dev->offload_xstats_l3 = NULL;
7905 return err;
7906}
7907
7908int netdev_offload_xstats_enable(struct net_device *dev,
7909 enum netdev_offload_xstats_type type,
7910 struct netlink_ext_ack *extack)
7911{
7912 ASSERT_RTNL();
7913
7914 if (netdev_offload_xstats_enabled(dev, type))
7915 return -EALREADY;
7916
7917 switch (type) {
7918 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
7919 return netdev_offload_xstats_enable_l3(dev, extack);
7920 }
7921
7922 WARN_ON(1);
7923 return -EINVAL;
7924}
7925EXPORT_SYMBOL(netdev_offload_xstats_enable);
7926
7927static void netdev_offload_xstats_disable_l3(struct net_device *dev)
7928{
7929 struct netdev_notifier_offload_xstats_info info = {
7930 .info.dev = dev,
7931 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
7932 };
7933
7934 call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
7935 &info.info);
7936 kfree(dev->offload_xstats_l3);
7937 dev->offload_xstats_l3 = NULL;
7938}
7939
7940int netdev_offload_xstats_disable(struct net_device *dev,
7941 enum netdev_offload_xstats_type type)
7942{
7943 ASSERT_RTNL();
7944
7945 if (!netdev_offload_xstats_enabled(dev, type))
7946 return -EALREADY;
7947
7948 switch (type) {
7949 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
7950 netdev_offload_xstats_disable_l3(dev);
7951 return 0;
7952 }
7953
7954 WARN_ON(1);
7955 return -EINVAL;
7956}
7957EXPORT_SYMBOL(netdev_offload_xstats_disable);
7958
7959static void netdev_offload_xstats_disable_all(struct net_device *dev)
7960{
7961 netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
7962}
7963
7964static struct rtnl_hw_stats64 *
7965netdev_offload_xstats_get_ptr(const struct net_device *dev,
7966 enum netdev_offload_xstats_type type)
7967{
7968 switch (type) {
7969 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
7970 return dev->offload_xstats_l3;
7971 }
7972
7973 WARN_ON(1);
7974 return NULL;
7975}
7976
7977bool netdev_offload_xstats_enabled(const struct net_device *dev,
7978 enum netdev_offload_xstats_type type)
7979{
7980 ASSERT_RTNL();
7981
7982 return netdev_offload_xstats_get_ptr(dev, type);
7983}
7984EXPORT_SYMBOL(netdev_offload_xstats_enabled);
7985
/*
 * "Report used" payload: netdev_offload_xstats_get_used() passes a
 * zero-initialised instance to NETDEV_OFFLOAD_XSTATS_REPORT_USED notifiers
 * through the info structure's ->report_used pointer.
 */
struct netdev_notifier_offload_xstats_ru {
	/* Set to true by netdev_offload_xstats_report_used(). */
	bool used;
};
7989
/*
 * "Report delta" payload: netdev_offload_xstats_get_stats() passes a
 * zero-initialised instance to NETDEV_OFFLOAD_XSTATS_REPORT_DELTA
 * notifiers through the info structure's ->report_delta pointer.
 */
struct netdev_notifier_offload_xstats_rd {
	/* Sum of the stats added via netdev_offload_xstats_report_delta(). */
	struct rtnl_hw_stats64 stats;
	/* Set to true by netdev_offload_xstats_report_delta(). */
	bool used;
};
7994
7995static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
7996 const struct rtnl_hw_stats64 *src)
7997{
7998 dest->rx_packets += src->rx_packets;
7999 dest->tx_packets += src->tx_packets;
8000 dest->rx_bytes += src->rx_bytes;
8001 dest->tx_bytes += src->tx_bytes;
8002 dest->rx_errors += src->rx_errors;
8003 dest->tx_errors += src->tx_errors;
8004 dest->rx_dropped += src->rx_dropped;
8005 dest->tx_dropped += src->tx_dropped;
8006 dest->multicast += src->multicast;
8007}
8008
8009static int netdev_offload_xstats_get_used(struct net_device *dev,
8010 enum netdev_offload_xstats_type type,
8011 bool *p_used,
8012 struct netlink_ext_ack *extack)
8013{
8014 struct netdev_notifier_offload_xstats_ru report_used = {};
8015 struct netdev_notifier_offload_xstats_info info = {
8016 .info.dev = dev,
8017 .info.extack = extack,
8018 .type = type,
8019 .report_used = &report_used,
8020 };
8021 int rc;
8022
8023 WARN_ON(!netdev_offload_xstats_enabled(dev, type));
8024 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
8025 &info.info);
8026 *p_used = report_used.used;
8027 return notifier_to_errno(rc);
8028}
8029
8030static int netdev_offload_xstats_get_stats(struct net_device *dev,
8031 enum netdev_offload_xstats_type type,
8032 struct rtnl_hw_stats64 *p_stats,
8033 bool *p_used,
8034 struct netlink_ext_ack *extack)
8035{
8036 struct netdev_notifier_offload_xstats_rd report_delta = {};
8037 struct netdev_notifier_offload_xstats_info info = {
8038 .info.dev = dev,
8039 .info.extack = extack,
8040 .type = type,
8041 .report_delta = &report_delta,
8042 };
8043 struct rtnl_hw_stats64 *stats;
8044 int rc;
8045
8046 stats = netdev_offload_xstats_get_ptr(dev, type);
8047 if (WARN_ON(!stats))
8048 return -EINVAL;
8049
8050 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
8051 &info.info);
8052
8053 /* Cache whatever we got, even if there was an error, otherwise the
8054 * successful stats retrievals would get lost.
8055 */
8056 netdev_hw_stats64_add(stats, &report_delta.stats);
8057
8058 if (p_stats)
8059 *p_stats = *stats;
8060 *p_used = report_delta.used;
8061
8062 return notifier_to_errno(rc);
8063}
8064
8065int netdev_offload_xstats_get(struct net_device *dev,
8066 enum netdev_offload_xstats_type type,
8067 struct rtnl_hw_stats64 *p_stats, bool *p_used,
8068 struct netlink_ext_ack *extack)
8069{
8070 ASSERT_RTNL();
8071
8072 if (p_stats)
8073 return netdev_offload_xstats_get_stats(dev, type, p_stats,
8074 p_used, extack);
8075 else
8076 return netdev_offload_xstats_get_used(dev, type, p_used,
8077 extack);
8078}
8079EXPORT_SYMBOL(netdev_offload_xstats_get);
8080
8081void
8082netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
8083 const struct rtnl_hw_stats64 *stats)
8084{
8085 report_delta->used = true;
8086 netdev_hw_stats64_add(&report_delta->stats, stats);
8087}
8088EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
8089
8090void
8091netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
8092{
8093 report_used->used = true;
8094}
8095EXPORT_SYMBOL(netdev_offload_xstats_report_used);
8096
8097void netdev_offload_xstats_push_delta(struct net_device *dev,
8098 enum netdev_offload_xstats_type type,
8099 const struct rtnl_hw_stats64 *p_stats)
8100{
8101 struct rtnl_hw_stats64 *stats;
8102
8103 ASSERT_RTNL();
8104
8105 stats = netdev_offload_xstats_get_ptr(dev, type);
8106 if (WARN_ON(!stats))
8107 return;
8108
8109 netdev_hw_stats64_add(stats, p_stats);
8110}
8111EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
8112
8113/**
8114 * netdev_get_xmit_slave - Get the xmit slave of master device
8115 * @dev: device
8116 * @skb: The packet
8117 * @all_slaves: assume all the slaves are active
8118 *
8119 * The reference counters are not incremented so the caller must be
8120 * careful with locks. The caller must hold RCU lock.
8121 * %NULL is returned if no slave is found.
8122 */
8123
8124struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8125 struct sk_buff *skb,
8126 bool all_slaves)
8127{
8128 const struct net_device_ops *ops = dev->netdev_ops;
8129
8130 if (!ops->ndo_get_xmit_slave)
8131 return NULL;
8132 return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8133}
8134EXPORT_SYMBOL(netdev_get_xmit_slave);
8135
8136static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8137 struct sock *sk)
8138{
8139 const struct net_device_ops *ops = dev->netdev_ops;
8140
8141 if (!ops->ndo_sk_get_lower_dev)
8142 return NULL;
8143 return ops->ndo_sk_get_lower_dev(dev, sk);
8144}
8145
8146/**
8147 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8148 * @dev: device
8149 * @sk: the socket
8150 *
8151 * %NULL is returned if no lower device is found.
8152 */
8153
8154struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8155 struct sock *sk)
8156{
8157 struct net_device *lower;
8158
8159 lower = netdev_sk_get_lower_dev(dev, sk);
8160 while (lower) {
8161 dev = lower;
8162 lower = netdev_sk_get_lower_dev(dev, sk);
8163 }
8164
8165 return dev;
8166}
8167EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8168
8169static void netdev_adjacent_add_links(struct net_device *dev)
8170{
8171 struct netdev_adjacent *iter;
8172
8173 struct net *net = dev_net(dev);
8174
8175 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8176 if (!net_eq(net, dev_net(iter->dev)))
8177 continue;
8178 netdev_adjacent_sysfs_add(iter->dev, dev,
8179 &iter->dev->adj_list.lower);
8180 netdev_adjacent_sysfs_add(dev, iter->dev,
8181 &dev->adj_list.upper);
8182 }
8183
8184 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8185 if (!net_eq(net, dev_net(iter->dev)))
8186 continue;
8187 netdev_adjacent_sysfs_add(iter->dev, dev,
8188 &iter->dev->adj_list.upper);
8189 netdev_adjacent_sysfs_add(dev, iter->dev,
8190 &dev->adj_list.lower);
8191 }
8192}
8193
8194static void netdev_adjacent_del_links(struct net_device *dev)
8195{
8196 struct netdev_adjacent *iter;
8197
8198 struct net *net = dev_net(dev);
8199
8200 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8201 if (!net_eq(net, dev_net(iter->dev)))
8202 continue;
8203 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8204 &iter->dev->adj_list.lower);
8205 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8206 &dev->adj_list.upper);
8207 }
8208
8209 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8210 if (!net_eq(net, dev_net(iter->dev)))
8211 continue;
8212 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8213 &iter->dev->adj_list.upper);
8214 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8215 &dev->adj_list.lower);
8216 }
8217}
8218
8219void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8220{
8221 struct netdev_adjacent *iter;
8222
8223 struct net *net = dev_net(dev);
8224
8225 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8226 if (!net_eq(net, dev_net(iter->dev)))
8227 continue;
8228 netdev_adjacent_sysfs_del(iter->dev, oldname,
8229 &iter->dev->adj_list.lower);
8230 netdev_adjacent_sysfs_add(iter->dev, dev,
8231 &iter->dev->adj_list.lower);
8232 }
8233
8234 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8235 if (!net_eq(net, dev_net(iter->dev)))
8236 continue;
8237 netdev_adjacent_sysfs_del(iter->dev, oldname,
8238 &iter->dev->adj_list.upper);
8239 netdev_adjacent_sysfs_add(iter->dev, dev,
8240 &iter->dev->adj_list.upper);
8241 }
8242}
8243
8244void *netdev_lower_dev_get_private(struct net_device *dev,
8245 struct net_device *lower_dev)
8246{
8247 struct netdev_adjacent *lower;
8248
8249 if (!lower_dev)
8250 return NULL;
8251 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8252 if (!lower)
8253 return NULL;
8254
8255 return lower->private;
8256}
8257EXPORT_SYMBOL(netdev_lower_dev_get_private);
8258
8259
8260/**
8261 * netdev_lower_state_changed - Dispatch event about lower device state change
8262 * @lower_dev: device
8263 * @lower_state_info: state to dispatch
8264 *
8265 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8266 * The caller must hold the RTNL lock.
8267 */
8268void netdev_lower_state_changed(struct net_device *lower_dev,
8269 void *lower_state_info)
8270{
8271 struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8272 .info.dev = lower_dev,
8273 };
8274
8275 ASSERT_RTNL();
8276 changelowerstate_info.lower_state_info = lower_state_info;
8277 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8278 &changelowerstate_info.info);
8279}
8280EXPORT_SYMBOL(netdev_lower_state_changed);
8281
8282static void dev_change_rx_flags(struct net_device *dev, int flags)
8283{
8284 const struct net_device_ops *ops = dev->netdev_ops;
8285
8286 if (ops->ndo_change_rx_flags)
8287 ops->ndo_change_rx_flags(dev, flags);
8288}
8289
8290static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8291{
8292 unsigned int old_flags = dev->flags;
8293 kuid_t uid;
8294 kgid_t gid;
8295
8296 ASSERT_RTNL();
8297
8298 dev->flags |= IFF_PROMISC;
8299 dev->promiscuity += inc;
8300 if (dev->promiscuity == 0) {
8301 /*
8302 * Avoid overflow.
8303 * If inc causes overflow, untouch promisc and return error.
8304 */
8305 if (inc < 0)
8306 dev->flags &= ~IFF_PROMISC;
8307 else {
8308 dev->promiscuity -= inc;
8309 netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
8310 return -EOVERFLOW;
8311 }
8312 }
8313 if (dev->flags != old_flags) {
8314 pr_info("device %s %s promiscuous mode\n",
8315 dev->name,
8316 dev->flags & IFF_PROMISC ? "entered" : "left");
8317 if (audit_enabled) {
8318 current_uid_gid(&uid, &gid);
8319 audit_log(audit_context(), GFP_ATOMIC,
8320 AUDIT_ANOM_PROMISCUOUS,
8321 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8322 dev->name, (dev->flags & IFF_PROMISC),
8323 (old_flags & IFF_PROMISC),
8324 from_kuid(&init_user_ns, audit_get_loginuid(current)),
8325 from_kuid(&init_user_ns, uid),
8326 from_kgid(&init_user_ns, gid),
8327 audit_get_sessionid(current));
8328 }
8329
8330 dev_change_rx_flags(dev, IFF_PROMISC);
8331 }
8332 if (notify)
8333 __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
8334 return 0;
8335}
8336
8337/**
8338 * dev_set_promiscuity - update promiscuity count on a device
8339 * @dev: device
8340 * @inc: modifier
8341 *
8342 * Add or remove promiscuity from a device. While the count in the device
8343 * remains above zero the interface remains promiscuous. Once it hits zero
8344 * the device reverts back to normal filtering operation. A negative inc
8345 * value is used to drop promiscuity on the device.
8346 * Return 0 if successful or a negative errno code on error.
8347 */
8348int dev_set_promiscuity(struct net_device *dev, int inc)
8349{
8350 unsigned int old_flags = dev->flags;
8351 int err;
8352
8353 err = __dev_set_promiscuity(dev, inc, true);
8354 if (err < 0)
8355 return err;
8356 if (dev->flags != old_flags)
8357 dev_set_rx_mode(dev);
8358 return err;
8359}
8360EXPORT_SYMBOL(dev_set_promiscuity);
8361
8362static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8363{
8364 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8365
8366 ASSERT_RTNL();
8367
8368 dev->flags |= IFF_ALLMULTI;
8369 dev->allmulti += inc;
8370 if (dev->allmulti == 0) {
8371 /*
8372 * Avoid overflow.
8373 * If inc causes overflow, untouch allmulti and return error.
8374 */
8375 if (inc < 0)
8376 dev->flags &= ~IFF_ALLMULTI;
8377 else {
8378 dev->allmulti -= inc;
8379 netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
8380 return -EOVERFLOW;
8381 }
8382 }
8383 if (dev->flags ^ old_flags) {
8384 dev_change_rx_flags(dev, IFF_ALLMULTI);
8385 dev_set_rx_mode(dev);
8386 if (notify)
8387 __dev_notify_flags(dev, old_flags,
8388 dev->gflags ^ old_gflags, 0, NULL);
8389 }
8390 return 0;
8391}
8392
8393/**
8394 * dev_set_allmulti - update allmulti count on a device
8395 * @dev: device
8396 * @inc: modifier
8397 *
8398 * Add or remove reception of all multicast frames to a device. While the
8399 * count in the device remains above zero the interface remains listening
8400 * to all interfaces. Once it hits zero the device reverts back to normal
8401 * filtering operation. A negative @inc value is used to drop the counter
8402 * when releasing a resource needing all multicasts.
8403 * Return 0 if successful or a negative errno code on error.
8404 */
8405
8406int dev_set_allmulti(struct net_device *dev, int inc)
8407{
8408 return __dev_set_allmulti(dev, inc, true);
8409}
8410EXPORT_SYMBOL(dev_set_allmulti);
8411
8412/*
8413 * Upload unicast and multicast address lists to device and
8414 * configure RX filtering. When the device doesn't support unicast
8415 * filtering it is put in promiscuous mode while unicast addresses
8416 * are present.
8417 */
8418void __dev_set_rx_mode(struct net_device *dev)
8419{
8420 const struct net_device_ops *ops = dev->netdev_ops;
8421
8422 /* dev_open will call this function so the list will stay sane. */
8423 if (!(dev->flags&IFF_UP))
8424 return;
8425
8426 if (!netif_device_present(dev))
8427 return;
8428
8429 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8430 /* Unicast addresses changes may only happen under the rtnl,
8431 * therefore calling __dev_set_promiscuity here is safe.
8432 */
8433 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8434 __dev_set_promiscuity(dev, 1, false);
8435 dev->uc_promisc = true;
8436 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8437 __dev_set_promiscuity(dev, -1, false);
8438 dev->uc_promisc = false;
8439 }
8440 }
8441
8442 if (ops->ndo_set_rx_mode)
8443 ops->ndo_set_rx_mode(dev);
8444}
8445
/*
 * Run __dev_set_rx_mode() for @dev between netif_addr_lock_bh() and
 * netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
8452
8453/**
8454 * dev_get_flags - get flags reported to userspace
8455 * @dev: device
8456 *
8457 * Get the combination of flag bits exported through APIs to userspace.
8458 */
8459unsigned int dev_get_flags(const struct net_device *dev)
8460{
8461 unsigned int flags;
8462
8463 flags = (dev->flags & ~(IFF_PROMISC |
8464 IFF_ALLMULTI |
8465 IFF_RUNNING |
8466 IFF_LOWER_UP |
8467 IFF_DORMANT)) |
8468 (dev->gflags & (IFF_PROMISC |
8469 IFF_ALLMULTI));
8470
8471 if (netif_running(dev)) {
8472 if (netif_oper_up(dev))
8473 flags |= IFF_RUNNING;
8474 if (netif_carrier_ok(dev))
8475 flags |= IFF_LOWER_UP;
8476 if (netif_dormant(dev))
8477 flags |= IFF_DORMANT;
8478 }
8479
8480 return flags;
8481}
8482EXPORT_SYMBOL(dev_get_flags);
8483
8484int __dev_change_flags(struct net_device *dev, unsigned int flags,
8485 struct netlink_ext_ack *extack)
8486{
8487 unsigned int old_flags = dev->flags;
8488 int ret;
8489
8490 ASSERT_RTNL();
8491
8492 /*
8493 * Set the flags on our device.
8494 */
8495
8496 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8497 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8498 IFF_AUTOMEDIA)) |
8499 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8500 IFF_ALLMULTI));
8501
8502 /*
8503 * Load in the correct multicast list now the flags have changed.
8504 */
8505
8506 if ((old_flags ^ flags) & IFF_MULTICAST)
8507 dev_change_rx_flags(dev, IFF_MULTICAST);
8508
8509 dev_set_rx_mode(dev);
8510
8511 /*
8512 * Have we downed the interface. We handle IFF_UP ourselves
8513 * according to user attempts to set it, rather than blindly
8514 * setting it.
8515 */
8516
8517 ret = 0;
8518 if ((old_flags ^ flags) & IFF_UP) {
8519 if (old_flags & IFF_UP)
8520 __dev_close(dev);
8521 else
8522 ret = __dev_open(dev, extack);
8523 }
8524
8525 if ((flags ^ dev->gflags) & IFF_PROMISC) {
8526 int inc = (flags & IFF_PROMISC) ? 1 : -1;
8527 unsigned int old_flags = dev->flags;
8528
8529 dev->gflags ^= IFF_PROMISC;
8530
8531 if (__dev_set_promiscuity(dev, inc, false) >= 0)
8532 if (dev->flags != old_flags)
8533 dev_set_rx_mode(dev);
8534 }
8535
8536 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8537 * is important. Some (broken) drivers set IFF_PROMISC, when
8538 * IFF_ALLMULTI is requested not asking us and not reporting.
8539 */
8540 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8541 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8542
8543 dev->gflags ^= IFF_ALLMULTI;
8544 __dev_set_allmulti(dev, inc, false);
8545 }
8546
8547 return ret;
8548}
8549
8550void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8551 unsigned int gchanges, u32 portid,
8552 const struct nlmsghdr *nlh)
8553{
8554 unsigned int changes = dev->flags ^ old_flags;
8555
8556 if (gchanges)
8557 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
8558
8559 if (changes & IFF_UP) {
8560 if (dev->flags & IFF_UP)
8561 call_netdevice_notifiers(NETDEV_UP, dev);
8562 else
8563 call_netdevice_notifiers(NETDEV_DOWN, dev);
8564 }
8565
8566 if (dev->flags & IFF_UP &&
8567 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8568 struct netdev_notifier_change_info change_info = {
8569 .info = {
8570 .dev = dev,
8571 },
8572 .flags_changed = changes,
8573 };
8574
8575 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8576 }
8577}
8578
8579/**
8580 * dev_change_flags - change device settings
8581 * @dev: device
8582 * @flags: device state flags
8583 * @extack: netlink extended ack
8584 *
8585 * Change settings on device based state flags. The flags are
8586 * in the userspace exported format.
8587 */
8588int dev_change_flags(struct net_device *dev, unsigned int flags,
8589 struct netlink_ext_ack *extack)
8590{
8591 int ret;
8592 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8593
8594 ret = __dev_change_flags(dev, flags, extack);
8595 if (ret < 0)
8596 return ret;
8597
8598 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8599 __dev_notify_flags(dev, old_flags, changes, 0, NULL);
8600 return ret;
8601}
8602EXPORT_SYMBOL(dev_change_flags);
8603
8604int __dev_set_mtu(struct net_device *dev, int new_mtu)
8605{
8606 const struct net_device_ops *ops = dev->netdev_ops;
8607
8608 if (ops->ndo_change_mtu)
8609 return ops->ndo_change_mtu(dev, new_mtu);
8610
8611 /* Pairs with all the lockless reads of dev->mtu in the stack */
8612 WRITE_ONCE(dev->mtu, new_mtu);
8613 return 0;
8614}
8615EXPORT_SYMBOL(__dev_set_mtu);
8616
8617int dev_validate_mtu(struct net_device *dev, int new_mtu,
8618 struct netlink_ext_ack *extack)
8619{
8620 /* MTU must be positive, and in range */
8621 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8622 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8623 return -EINVAL;
8624 }
8625
8626 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8627 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8628 return -EINVAL;
8629 }
8630 return 0;
8631}
8632
8633/**
8634 * dev_set_mtu_ext - Change maximum transfer unit
8635 * @dev: device
8636 * @new_mtu: new transfer unit
8637 * @extack: netlink extended ack
8638 *
8639 * Change the maximum transfer size of the network device.
8640 */
8641int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8642 struct netlink_ext_ack *extack)
8643{
8644 int err, orig_mtu;
8645
8646 if (new_mtu == dev->mtu)
8647 return 0;
8648
8649 err = dev_validate_mtu(dev, new_mtu, extack);
8650 if (err)
8651 return err;
8652
8653 if (!netif_device_present(dev))
8654 return -ENODEV;
8655
8656 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8657 err = notifier_to_errno(err);
8658 if (err)
8659 return err;
8660
8661 orig_mtu = dev->mtu;
8662 err = __dev_set_mtu(dev, new_mtu);
8663
8664 if (!err) {
8665 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8666 orig_mtu);
8667 err = notifier_to_errno(err);
8668 if (err) {
8669 /* setting mtu back and notifying everyone again,
8670 * so that they have a chance to revert changes.
8671 */
8672 __dev_set_mtu(dev, orig_mtu);
8673 call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8674 new_mtu);
8675 }
8676 }
8677 return err;
8678}
8679
8680int dev_set_mtu(struct net_device *dev, int new_mtu)
8681{
8682 struct netlink_ext_ack extack;
8683 int err;
8684
8685 memset(&extack, 0, sizeof(extack));
8686 err = dev_set_mtu_ext(dev, new_mtu, &extack);
8687 if (err && extack._msg)
8688 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8689 return err;
8690}
8691EXPORT_SYMBOL(dev_set_mtu);
8692
8693/**
8694 * dev_change_tx_queue_len - Change TX queue length of a netdevice
8695 * @dev: device
8696 * @new_len: new tx queue length
8697 */
8698int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8699{
8700 unsigned int orig_len = dev->tx_queue_len;
8701 int res;
8702
8703 if (new_len != (unsigned int)new_len)
8704 return -ERANGE;
8705
8706 if (new_len != orig_len) {
8707 dev->tx_queue_len = new_len;
8708 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8709 res = notifier_to_errno(res);
8710 if (res)
8711 goto err_rollback;
8712 res = dev_qdisc_change_tx_queue_len(dev);
8713 if (res)
8714 goto err_rollback;
8715 }
8716
8717 return 0;
8718
8719err_rollback:
8720 netdev_err(dev, "refused to change device tx_queue_len\n");
8721 dev->tx_queue_len = orig_len;
8722 return res;
8723}
8724
8725/**
8726 * dev_set_group - Change group this device belongs to
8727 * @dev: device
8728 * @new_group: group this device should belong to
8729 */
8730void dev_set_group(struct net_device *dev, int new_group)
8731{
8732 dev->group = new_group;
8733}
8734
8735/**
8736 * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8737 * @dev: device
8738 * @addr: new address
8739 * @extack: netlink extended ack
8740 */
8741int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8742 struct netlink_ext_ack *extack)
8743{
8744 struct netdev_notifier_pre_changeaddr_info info = {
8745 .info.dev = dev,
8746 .info.extack = extack,
8747 .dev_addr = addr,
8748 };
8749 int rc;
8750
8751 rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8752 return notifier_to_errno(rc);
8753}
8754EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8755
8756/**
8757 * dev_set_mac_address - Change Media Access Control Address
8758 * @dev: device
8759 * @sa: new address
8760 * @extack: netlink extended ack
8761 *
8762 * Change the hardware (MAC) address of the device
8763 */
8764int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8765 struct netlink_ext_ack *extack)
8766{
8767 const struct net_device_ops *ops = dev->netdev_ops;
8768 int err;
8769
8770 if (!ops->ndo_set_mac_address)
8771 return -EOPNOTSUPP;
8772 if (sa->sa_family != dev->type)
8773 return -EINVAL;
8774 if (!netif_device_present(dev))
8775 return -ENODEV;
8776 err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8777 if (err)
8778 return err;
8779 err = ops->ndo_set_mac_address(dev, sa);
8780 if (err)
8781 return err;
8782 dev->addr_assign_type = NET_ADDR_SET;
8783 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8784 add_device_randomness(dev->dev_addr, dev->addr_len);
8785 return 0;
8786}
8787EXPORT_SYMBOL(dev_set_mac_address);
8788
8789static DECLARE_RWSEM(dev_addr_sem);
8790
8791int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8792 struct netlink_ext_ack *extack)
8793{
8794 int ret;
8795
8796 down_write(&dev_addr_sem);
8797 ret = dev_set_mac_address(dev, sa, extack);
8798 up_write(&dev_addr_sem);
8799 return ret;
8800}
8801EXPORT_SYMBOL(dev_set_mac_address_user);
8802
8803int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8804{
8805 size_t size = sizeof(sa->sa_data_min);
8806 struct net_device *dev;
8807 int ret = 0;
8808
8809 down_read(&dev_addr_sem);
8810 rcu_read_lock();
8811
8812 dev = dev_get_by_name_rcu(net, dev_name);
8813 if (!dev) {
8814 ret = -ENODEV;
8815 goto unlock;
8816 }
8817 if (!dev->addr_len)
8818 memset(sa->sa_data, 0, size);
8819 else
8820 memcpy(sa->sa_data, dev->dev_addr,
8821 min_t(size_t, size, dev->addr_len));
8822 sa->sa_family = dev->type;
8823
8824unlock:
8825 rcu_read_unlock();
8826 up_read(&dev_addr_sem);
8827 return ret;
8828}
8829EXPORT_SYMBOL(dev_get_mac_address);
8830
8831/**
8832 * dev_change_carrier - Change device carrier
8833 * @dev: device
8834 * @new_carrier: new value
8835 *
8836 * Change device carrier
8837 */
8838int dev_change_carrier(struct net_device *dev, bool new_carrier)
8839{
8840 const struct net_device_ops *ops = dev->netdev_ops;
8841
8842 if (!ops->ndo_change_carrier)
8843 return -EOPNOTSUPP;
8844 if (!netif_device_present(dev))
8845 return -ENODEV;
8846 return ops->ndo_change_carrier(dev, new_carrier);
8847}
8848
8849/**
8850 * dev_get_phys_port_id - Get device physical port ID
8851 * @dev: device
8852 * @ppid: port ID
8853 *
8854 * Get device physical port ID
8855 */
8856int dev_get_phys_port_id(struct net_device *dev,
8857 struct netdev_phys_item_id *ppid)
8858{
8859 const struct net_device_ops *ops = dev->netdev_ops;
8860
8861 if (!ops->ndo_get_phys_port_id)
8862 return -EOPNOTSUPP;
8863 return ops->ndo_get_phys_port_id(dev, ppid);
8864}
8865
8866/**
8867 * dev_get_phys_port_name - Get device physical port name
8868 * @dev: device
8869 * @name: port name
8870 * @len: limit of bytes to copy to name
8871 *
8872 * Get device physical port name
8873 */
8874int dev_get_phys_port_name(struct net_device *dev,
8875 char *name, size_t len)
8876{
8877 const struct net_device_ops *ops = dev->netdev_ops;
8878 int err;
8879
8880 if (ops->ndo_get_phys_port_name) {
8881 err = ops->ndo_get_phys_port_name(dev, name, len);
8882 if (err != -EOPNOTSUPP)
8883 return err;
8884 }
8885 return devlink_compat_phys_port_name_get(dev, name, len);
8886}
8887
8888/**
8889 * dev_get_port_parent_id - Get the device's port parent identifier
8890 * @dev: network device
8891 * @ppid: pointer to a storage for the port's parent identifier
8892 * @recurse: allow/disallow recursion to lower devices
8893 *
8894 * Get the devices's port parent identifier
8895 */
8896int dev_get_port_parent_id(struct net_device *dev,
8897 struct netdev_phys_item_id *ppid,
8898 bool recurse)
8899{
8900 const struct net_device_ops *ops = dev->netdev_ops;
8901 struct netdev_phys_item_id first = { };
8902 struct net_device *lower_dev;
8903 struct list_head *iter;
8904 int err;
8905
8906 if (ops->ndo_get_port_parent_id) {
8907 err = ops->ndo_get_port_parent_id(dev, ppid);
8908 if (err != -EOPNOTSUPP)
8909 return err;
8910 }
8911
8912 err = devlink_compat_switch_id_get(dev, ppid);
8913 if (!recurse || err != -EOPNOTSUPP)
8914 return err;
8915
8916 netdev_for_each_lower_dev(dev, lower_dev, iter) {
8917 err = dev_get_port_parent_id(lower_dev, ppid, true);
8918 if (err)
8919 break;
8920 if (!first.id_len)
8921 first = *ppid;
8922 else if (memcmp(&first, ppid, sizeof(*ppid)))
8923 return -EOPNOTSUPP;
8924 }
8925
8926 return err;
8927}
8928EXPORT_SYMBOL(dev_get_port_parent_id);
8929
8930/**
8931 * netdev_port_same_parent_id - Indicate if two network devices have
8932 * the same port parent identifier
8933 * @a: first network device
8934 * @b: second network device
8935 */
8936bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8937{
8938 struct netdev_phys_item_id a_id = { };
8939 struct netdev_phys_item_id b_id = { };
8940
8941 if (dev_get_port_parent_id(a, &a_id, true) ||
8942 dev_get_port_parent_id(b, &b_id, true))
8943 return false;
8944
8945 return netdev_phys_item_id_same(&a_id, &b_id);
8946}
8947EXPORT_SYMBOL(netdev_port_same_parent_id);
8948
8949/**
8950 * dev_change_proto_down - set carrier according to proto_down.
8951 *
8952 * @dev: device
8953 * @proto_down: new value
8954 */
8955int dev_change_proto_down(struct net_device *dev, bool proto_down)
8956{
8957 if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
8958 return -EOPNOTSUPP;
8959 if (!netif_device_present(dev))
8960 return -ENODEV;
8961 if (proto_down)
8962 netif_carrier_off(dev);
8963 else
8964 netif_carrier_on(dev);
8965 dev->proto_down = proto_down;
8966 return 0;
8967}
8968
8969/**
8970 * dev_change_proto_down_reason - proto down reason
8971 *
8972 * @dev: device
8973 * @mask: proto down mask
8974 * @value: proto down value
8975 */
8976void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
8977 u32 value)
8978{
8979 int b;
8980
8981 if (!mask) {
8982 dev->proto_down_reason = value;
8983 } else {
8984 for_each_set_bit(b, &mask, 32) {
8985 if (value & (1 << b))
8986 dev->proto_down_reason |= BIT(b);
8987 else
8988 dev->proto_down_reason &= ~BIT(b);
8989 }
8990 }
8991}
8992
8993struct bpf_xdp_link {
8994 struct bpf_link link;
8995 struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
8996 int flags;
8997};
8998
8999static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9000{
9001 if (flags & XDP_FLAGS_HW_MODE)
9002 return XDP_MODE_HW;
9003 if (flags & XDP_FLAGS_DRV_MODE)
9004 return XDP_MODE_DRV;
9005 if (flags & XDP_FLAGS_SKB_MODE)
9006 return XDP_MODE_SKB;
9007 return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9008}
9009
9010static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9011{
9012 switch (mode) {
9013 case XDP_MODE_SKB:
9014 return generic_xdp_install;
9015 case XDP_MODE_DRV:
9016 case XDP_MODE_HW:
9017 return dev->netdev_ops->ndo_bpf;
9018 default:
9019 return NULL;
9020 }
9021}
9022
9023static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9024 enum bpf_xdp_mode mode)
9025{
9026 return dev->xdp_state[mode].link;
9027}
9028
9029static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9030 enum bpf_xdp_mode mode)
9031{
9032 struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9033
9034 if (link)
9035 return link->link.prog;
9036 return dev->xdp_state[mode].prog;
9037}
9038
9039u8 dev_xdp_prog_count(struct net_device *dev)
9040{
9041 u8 count = 0;
9042 int i;
9043
9044 for (i = 0; i < __MAX_XDP_MODE; i++)
9045 if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9046 count++;
9047 return count;
9048}
9049EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
9050
9051u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9052{
9053 struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9054
9055 return prog ? prog->aux->id : 0;
9056}
9057
9058static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9059 struct bpf_xdp_link *link)
9060{
9061 dev->xdp_state[mode].link = link;
9062 dev->xdp_state[mode].prog = NULL;
9063}
9064
9065static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9066 struct bpf_prog *prog)
9067{
9068 dev->xdp_state[mode].link = NULL;
9069 dev->xdp_state[mode].prog = prog;
9070}
9071
9072static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9073 bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9074 u32 flags, struct bpf_prog *prog)
9075{
9076 struct netdev_bpf xdp;
9077 int err;
9078
9079 memset(&xdp, 0, sizeof(xdp));
9080 xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9081 xdp.extack = extack;
9082 xdp.flags = flags;
9083 xdp.prog = prog;
9084
9085 /* Drivers assume refcnt is already incremented (i.e, prog pointer is
9086 * "moved" into driver), so they don't increment it on their own, but
9087 * they do decrement refcnt when program is detached or replaced.
9088 * Given net_device also owns link/prog, we need to bump refcnt here
9089 * to prevent drivers from underflowing it.
9090 */
9091 if (prog)
9092 bpf_prog_inc(prog);
9093 err = bpf_op(dev, &xdp);
9094 if (err) {
9095 if (prog)
9096 bpf_prog_put(prog);
9097 return err;
9098 }
9099
9100 if (mode != XDP_MODE_HW)
9101 bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9102
9103 return 0;
9104}
9105
9106static void dev_xdp_uninstall(struct net_device *dev)
9107{
9108 struct bpf_xdp_link *link;
9109 struct bpf_prog *prog;
9110 enum bpf_xdp_mode mode;
9111 bpf_op_t bpf_op;
9112
9113 ASSERT_RTNL();
9114
9115 for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9116 prog = dev_xdp_prog(dev, mode);
9117 if (!prog)
9118 continue;
9119
9120 bpf_op = dev_xdp_bpf_op(dev, mode);
9121 if (!bpf_op)
9122 continue;
9123
9124 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9125
9126 /* auto-detach link from net device */
9127 link = dev_xdp_link(dev, mode);
9128 if (link)
9129 link->dev = NULL;
9130 else
9131 bpf_prog_put(prog);
9132
9133 dev_xdp_set_link(dev, mode, NULL);
9134 }
9135}
9136
9137static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9138 struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9139 struct bpf_prog *old_prog, u32 flags)
9140{
9141 unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9142 struct bpf_prog *cur_prog;
9143 struct net_device *upper;
9144 struct list_head *iter;
9145 enum bpf_xdp_mode mode;
9146 bpf_op_t bpf_op;
9147 int err;
9148
9149 ASSERT_RTNL();
9150
9151 /* either link or prog attachment, never both */
9152 if (link && (new_prog || old_prog))
9153 return -EINVAL;
9154 /* link supports only XDP mode flags */
9155 if (link && (flags & ~XDP_FLAGS_MODES)) {
9156 NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9157 return -EINVAL;
9158 }
9159 /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9160 if (num_modes > 1) {
9161 NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9162 return -EINVAL;
9163 }
9164 /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9165 if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9166 NL_SET_ERR_MSG(extack,
9167 "More than one program loaded, unset mode is ambiguous");
9168 return -EINVAL;
9169 }
9170 /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9171 if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9172 NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9173 return -EINVAL;
9174 }
9175
9176 mode = dev_xdp_mode(dev, flags);
9177 /* can't replace attached link */
9178 if (dev_xdp_link(dev, mode)) {
9179 NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9180 return -EBUSY;
9181 }
9182
9183 /* don't allow if an upper device already has a program */
9184 netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9185 if (dev_xdp_prog_count(upper) > 0) {
9186 NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9187 return -EEXIST;
9188 }
9189 }
9190
9191 cur_prog = dev_xdp_prog(dev, mode);
9192 /* can't replace attached prog with link */
9193 if (link && cur_prog) {
9194 NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9195 return -EBUSY;
9196 }
9197 if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9198 NL_SET_ERR_MSG(extack, "Active program does not match expected");
9199 return -EEXIST;
9200 }
9201
9202 /* put effective new program into new_prog */
9203 if (link)
9204 new_prog = link->link.prog;
9205
9206 if (new_prog) {
9207 bool offload = mode == XDP_MODE_HW;
9208 enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9209 ? XDP_MODE_DRV : XDP_MODE_SKB;
9210
9211 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9212 NL_SET_ERR_MSG(extack, "XDP program already attached");
9213 return -EBUSY;
9214 }
9215 if (!offload && dev_xdp_prog(dev, other_mode)) {
9216 NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9217 return -EEXIST;
9218 }
9219 if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
9220 NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
9221 return -EINVAL;
9222 }
9223 if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9224 NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9225 return -EINVAL;
9226 }
9227 if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9228 NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9229 return -EINVAL;
9230 }
9231 }
9232
9233 /* don't call drivers if the effective program didn't change */
9234 if (new_prog != cur_prog) {
9235 bpf_op = dev_xdp_bpf_op(dev, mode);
9236 if (!bpf_op) {
9237 NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9238 return -EOPNOTSUPP;
9239 }
9240
9241 err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9242 if (err)
9243 return err;
9244 }
9245
9246 if (link)
9247 dev_xdp_set_link(dev, mode, link);
9248 else
9249 dev_xdp_set_prog(dev, mode, new_prog);
9250 if (cur_prog)
9251 bpf_prog_put(cur_prog);
9252
9253 return 0;
9254}
9255
9256static int dev_xdp_attach_link(struct net_device *dev,
9257 struct netlink_ext_ack *extack,
9258 struct bpf_xdp_link *link)
9259{
9260 return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9261}
9262
9263static int dev_xdp_detach_link(struct net_device *dev,
9264 struct netlink_ext_ack *extack,
9265 struct bpf_xdp_link *link)
9266{
9267 enum bpf_xdp_mode mode;
9268 bpf_op_t bpf_op;
9269
9270 ASSERT_RTNL();
9271
9272 mode = dev_xdp_mode(dev, link->flags);
9273 if (dev_xdp_link(dev, mode) != link)
9274 return -EINVAL;
9275
9276 bpf_op = dev_xdp_bpf_op(dev, mode);
9277 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9278 dev_xdp_set_link(dev, mode, NULL);
9279 return 0;
9280}
9281
9282static void bpf_xdp_link_release(struct bpf_link *link)
9283{
9284 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9285
9286 rtnl_lock();
9287
9288 /* if racing with net_device's tear down, xdp_link->dev might be
9289 * already NULL, in which case link was already auto-detached
9290 */
9291 if (xdp_link->dev) {
9292 WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9293 xdp_link->dev = NULL;
9294 }
9295
9296 rtnl_unlock();
9297}
9298
/* bpf_link_ops.detach callback: same work as the release callback; always
 * reports success.
 */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
	const int success = 0;

	bpf_xdp_link_release(link);

	return success;
}
9304
9305static void bpf_xdp_link_dealloc(struct bpf_link *link)
9306{
9307 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9308
9309 kfree(xdp_link);
9310}
9311
9312static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9313 struct seq_file *seq)
9314{
9315 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9316 u32 ifindex = 0;
9317
9318 rtnl_lock();
9319 if (xdp_link->dev)
9320 ifindex = xdp_link->dev->ifindex;
9321 rtnl_unlock();
9322
9323 seq_printf(seq, "ifindex:\t%u\n", ifindex);
9324}
9325
9326static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9327 struct bpf_link_info *info)
9328{
9329 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9330 u32 ifindex = 0;
9331
9332 rtnl_lock();
9333 if (xdp_link->dev)
9334 ifindex = xdp_link->dev->ifindex;
9335 rtnl_unlock();
9336
9337 info->xdp.ifindex = ifindex;
9338 return 0;
9339}
9340
9341static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9342 struct bpf_prog *old_prog)
9343{
9344 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9345 enum bpf_xdp_mode mode;
9346 bpf_op_t bpf_op;
9347 int err = 0;
9348
9349 rtnl_lock();
9350
9351 /* link might have been auto-released already, so fail */
9352 if (!xdp_link->dev) {
9353 err = -ENOLINK;
9354 goto out_unlock;
9355 }
9356
9357 if (old_prog && link->prog != old_prog) {
9358 err = -EPERM;
9359 goto out_unlock;
9360 }
9361 old_prog = link->prog;
9362 if (old_prog->type != new_prog->type ||
9363 old_prog->expected_attach_type != new_prog->expected_attach_type) {
9364 err = -EINVAL;
9365 goto out_unlock;
9366 }
9367
9368 if (old_prog == new_prog) {
9369 /* no-op, don't disturb drivers */
9370 bpf_prog_put(new_prog);
9371 goto out_unlock;
9372 }
9373
9374 mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9375 bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9376 err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9377 xdp_link->flags, new_prog);
9378 if (err)
9379 goto out_unlock;
9380
9381 old_prog = xchg(&link->prog, new_prog);
9382 bpf_prog_put(old_prog);
9383
9384out_unlock:
9385 rtnl_unlock();
9386 return err;
9387}
9388
9389static const struct bpf_link_ops bpf_xdp_link_lops = {
9390 .release = bpf_xdp_link_release,
9391 .dealloc = bpf_xdp_link_dealloc,
9392 .detach = bpf_xdp_link_detach,
9393 .show_fdinfo = bpf_xdp_link_show_fdinfo,
9394 .fill_link_info = bpf_xdp_link_fill_link_info,
9395 .update_prog = bpf_xdp_link_update,
9396};
9397
9398int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9399{
9400 struct net *net = current->nsproxy->net_ns;
9401 struct bpf_link_primer link_primer;
9402 struct bpf_xdp_link *link;
9403 struct net_device *dev;
9404 int err, fd;
9405
9406 rtnl_lock();
9407 dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9408 if (!dev) {
9409 rtnl_unlock();
9410 return -EINVAL;
9411 }
9412
9413 link = kzalloc(sizeof(*link), GFP_USER);
9414 if (!link) {
9415 err = -ENOMEM;
9416 goto unlock;
9417 }
9418
9419 bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9420 link->dev = dev;
9421 link->flags = attr->link_create.flags;
9422
9423 err = bpf_link_prime(&link->link, &link_primer);
9424 if (err) {
9425 kfree(link);
9426 goto unlock;
9427 }
9428
9429 err = dev_xdp_attach_link(dev, NULL, link);
9430 rtnl_unlock();
9431
9432 if (err) {
9433 link->dev = NULL;
9434 bpf_link_cleanup(&link_primer);
9435 goto out_put_dev;
9436 }
9437
9438 fd = bpf_link_settle(&link_primer);
9439 /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9440 dev_put(dev);
9441 return fd;
9442
9443unlock:
9444 rtnl_unlock();
9445
9446out_put_dev:
9447 dev_put(dev);
9448 return err;
9449}
9450
9451/**
9452 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
9453 * @dev: device
9454 * @extack: netlink extended ack
9455 * @fd: new program fd or negative value to clear
9456 * @expected_fd: old program fd that userspace expects to replace or clear
9457 * @flags: xdp-related flags
9458 *
9459 * Set or clear a bpf program for a device
9460 */
9461int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9462 int fd, int expected_fd, u32 flags)
9463{
9464 enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9465 struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9466 int err;
9467
9468 ASSERT_RTNL();
9469
9470 if (fd >= 0) {
9471 new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9472 mode != XDP_MODE_SKB);
9473 if (IS_ERR(new_prog))
9474 return PTR_ERR(new_prog);
9475 }
9476
9477 if (expected_fd >= 0) {
9478 old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9479 mode != XDP_MODE_SKB);
9480 if (IS_ERR(old_prog)) {
9481 err = PTR_ERR(old_prog);
9482 old_prog = NULL;
9483 goto err_out;
9484 }
9485 }
9486
9487 err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9488
9489err_out:
9490 if (err && new_prog)
9491 bpf_prog_put(new_prog);
9492 if (old_prog)
9493 bpf_prog_put(old_prog);
9494 return err;
9495}
9496
9497/**
9498 * dev_new_index - allocate an ifindex
9499 * @net: the applicable net namespace
9500 *
9501 * Returns a suitable unique value for a new device interface
9502 * number. The caller must hold the rtnl semaphore or the
9503 * dev_base_lock to be sure it remains unique.
9504 */
9505static int dev_new_index(struct net *net)
9506{
9507 int ifindex = net->ifindex;
9508
9509 for (;;) {
9510 if (++ifindex <= 0)
9511 ifindex = 1;
9512 if (!__dev_get_by_index(net, ifindex))
9513 return net->ifindex = ifindex;
9514 }
9515}
9516
9517/* Delayed registration/unregisteration */
9518LIST_HEAD(net_todo_list);
9519DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9520
9521static void net_set_todo(struct net_device *dev)
9522{
9523 list_add_tail(&dev->todo_list, &net_todo_list);
9524 atomic_inc(&dev_net(dev)->dev_unreg_count);
9525}
9526
9527static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9528 struct net_device *upper, netdev_features_t features)
9529{
9530 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9531 netdev_features_t feature;
9532 int feature_bit;
9533
9534 for_each_netdev_feature(upper_disables, feature_bit) {
9535 feature = __NETIF_F_BIT(feature_bit);
9536 if (!(upper->wanted_features & feature)
9537 && (features & feature)) {
9538 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9539 &feature, upper->name);
9540 features &= ~feature;
9541 }
9542 }
9543
9544 return features;
9545}
9546
9547static void netdev_sync_lower_features(struct net_device *upper,
9548 struct net_device *lower, netdev_features_t features)
9549{
9550 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9551 netdev_features_t feature;
9552 int feature_bit;
9553
9554 for_each_netdev_feature(upper_disables, feature_bit) {
9555 feature = __NETIF_F_BIT(feature_bit);
9556 if (!(features & feature) && (lower->features & feature)) {
9557 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9558 &feature, lower->name);
9559 lower->wanted_features &= ~feature;
9560 __netdev_update_features(lower);
9561
9562 if (unlikely(lower->features & feature))
9563 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9564 &feature, lower->name);
9565 else
9566 netdev_features_change(lower);
9567 }
9568 }
9569}
9570
9571static netdev_features_t netdev_fix_features(struct net_device *dev,
9572 netdev_features_t features)
9573{
9574 /* Fix illegal checksum combinations */
9575 if ((features & NETIF_F_HW_CSUM) &&
9576 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9577 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9578 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9579 }
9580
9581 /* TSO requires that SG is present as well. */
9582 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9583 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9584 features &= ~NETIF_F_ALL_TSO;
9585 }
9586
9587 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9588 !(features & NETIF_F_IP_CSUM)) {
9589 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9590 features &= ~NETIF_F_TSO;
9591 features &= ~NETIF_F_TSO_ECN;
9592 }
9593
9594 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9595 !(features & NETIF_F_IPV6_CSUM)) {
9596 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9597 features &= ~NETIF_F_TSO6;
9598 }
9599
9600 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9601 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9602 features &= ~NETIF_F_TSO_MANGLEID;
9603
9604 /* TSO ECN requires that TSO is present as well. */
9605 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9606 features &= ~NETIF_F_TSO_ECN;
9607
9608 /* Software GSO depends on SG. */
9609 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9610 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9611 features &= ~NETIF_F_GSO;
9612 }
9613
9614 /* GSO partial features require GSO partial be set */
9615 if ((features & dev->gso_partial_features) &&
9616 !(features & NETIF_F_GSO_PARTIAL)) {
9617 netdev_dbg(dev,
9618 "Dropping partially supported GSO features since no GSO partial.\n");
9619 features &= ~dev->gso_partial_features;
9620 }
9621
9622 if (!(features & NETIF_F_RXCSUM)) {
9623 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9624 * successfully merged by hardware must also have the
9625 * checksum verified by hardware. If the user does not
9626 * want to enable RXCSUM, logically, we should disable GRO_HW.
9627 */
9628 if (features & NETIF_F_GRO_HW) {
9629 netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9630 features &= ~NETIF_F_GRO_HW;
9631 }
9632 }
9633
9634 /* LRO/HW-GRO features cannot be combined with RX-FCS */
9635 if (features & NETIF_F_RXFCS) {
9636 if (features & NETIF_F_LRO) {
9637 netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9638 features &= ~NETIF_F_LRO;
9639 }
9640
9641 if (features & NETIF_F_GRO_HW) {
9642 netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9643 features &= ~NETIF_F_GRO_HW;
9644 }
9645 }
9646
9647 if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
9648 netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
9649 features &= ~NETIF_F_LRO;
9650 }
9651
9652 if (features & NETIF_F_HW_TLS_TX) {
9653 bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
9654 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
9655 bool hw_csum = features & NETIF_F_HW_CSUM;
9656
9657 if (!ip_csum && !hw_csum) {
9658 netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
9659 features &= ~NETIF_F_HW_TLS_TX;
9660 }
9661 }
9662
9663 if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9664 netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9665 features &= ~NETIF_F_HW_TLS_RX;
9666 }
9667
9668 return features;
9669}
9670
9671int __netdev_update_features(struct net_device *dev)
9672{
9673 struct net_device *upper, *lower;
9674 netdev_features_t features;
9675 struct list_head *iter;
9676 int err = -1;
9677
9678 ASSERT_RTNL();
9679
9680 features = netdev_get_wanted_features(dev);
9681
9682 if (dev->netdev_ops->ndo_fix_features)
9683 features = dev->netdev_ops->ndo_fix_features(dev, features);
9684
9685 /* driver might be less strict about feature dependencies */
9686 features = netdev_fix_features(dev, features);
9687
9688 /* some features can't be enabled if they're off on an upper device */
9689 netdev_for_each_upper_dev_rcu(dev, upper, iter)
9690 features = netdev_sync_upper_features(dev, upper, features);
9691
9692 if (dev->features == features)
9693 goto sync_lower;
9694
9695 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9696 &dev->features, &features);
9697
9698 if (dev->netdev_ops->ndo_set_features)
9699 err = dev->netdev_ops->ndo_set_features(dev, features);
9700 else
9701 err = 0;
9702
9703 if (unlikely(err < 0)) {
9704 netdev_err(dev,
9705 "set_features() failed (%d); wanted %pNF, left %pNF\n",
9706 err, &features, &dev->features);
9707 /* return non-0 since some features might have changed and
9708 * it's better to fire a spurious notification than miss it
9709 */
9710 return -1;
9711 }
9712
9713sync_lower:
9714 /* some features must be disabled on lower devices when disabled
9715 * on an upper device (think: bonding master or bridge)
9716 */
9717 netdev_for_each_lower_dev(dev, lower, iter)
9718 netdev_sync_lower_features(dev, lower, features);
9719
9720 if (!err) {
9721 netdev_features_t diff = features ^ dev->features;
9722
9723 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9724 /* udp_tunnel_{get,drop}_rx_info both need
9725 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9726 * device, or they won't do anything.
9727 * Thus we need to update dev->features
9728 * *before* calling udp_tunnel_get_rx_info,
9729 * but *after* calling udp_tunnel_drop_rx_info.
9730 */
9731 if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9732 dev->features = features;
9733 udp_tunnel_get_rx_info(dev);
9734 } else {
9735 udp_tunnel_drop_rx_info(dev);
9736 }
9737 }
9738
9739 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9740 if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9741 dev->features = features;
9742 err |= vlan_get_rx_ctag_filter_info(dev);
9743 } else {
9744 vlan_drop_rx_ctag_filter_info(dev);
9745 }
9746 }
9747
9748 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9749 if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9750 dev->features = features;
9751 err |= vlan_get_rx_stag_filter_info(dev);
9752 } else {
9753 vlan_drop_rx_stag_filter_info(dev);
9754 }
9755 }
9756
9757 dev->features = features;
9758 }
9759
9760 return err < 0 ? 0 : 1;
9761}
9762
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	int notify = __netdev_update_features(dev);

	/* zero means no notification is needed */
	if (!notify)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
9777
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* the result is deliberately ignored: the notification below is
	 * sent unconditionally
	 */
	(void)__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
9794
9795/**
9796 * netif_stacked_transfer_operstate - transfer operstate
9797 * @rootdev: the root or lower level device to transfer state from
9798 * @dev: the device to transfer operstate to
9799 *
9800 * Transfer operational state from root to device. This is normally
9801 * called when a stacking relationship exists between the root
9802 * device and the device(a leaf device).
9803 */
9804void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9805 struct net_device *dev)
9806{
9807 if (rootdev->operstate == IF_OPER_DORMANT)
9808 netif_dormant_on(dev);
9809 else
9810 netif_dormant_off(dev);
9811
9812 if (rootdev->operstate == IF_OPER_TESTING)
9813 netif_testing_on(dev);
9814 else
9815 netif_testing_off(dev);
9816
9817 if (netif_carrier_ok(rootdev))
9818 netif_carrier_on(dev);
9819 else
9820 netif_carrier_off(dev);
9821}
9822EXPORT_SYMBOL(netif_stacked_transfer_operstate);
9823
9824static int netif_alloc_rx_queues(struct net_device *dev)
9825{
9826 unsigned int i, count = dev->num_rx_queues;
9827 struct netdev_rx_queue *rx;
9828 size_t sz = count * sizeof(*rx);
9829 int err = 0;
9830
9831 BUG_ON(count < 1);
9832
9833 rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
9834 if (!rx)
9835 return -ENOMEM;
9836
9837 dev->_rx = rx;
9838
9839 for (i = 0; i < count; i++) {
9840 rx[i].dev = dev;
9841
9842 /* XDP RX-queue setup */
9843 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
9844 if (err < 0)
9845 goto err_rxq_info;
9846 }
9847 return 0;
9848
9849err_rxq_info:
9850 /* Rollback successful reg's and free other resources */
9851 while (i--)
9852 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9853 kvfree(dev->_rx);
9854 dev->_rx = NULL;
9855 return err;
9856}
9857
9858static void netif_free_rx_queues(struct net_device *dev)
9859{
9860 unsigned int i, count = dev->num_rx_queues;
9861
9862 /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
9863 if (!dev->_rx)
9864 return;
9865
9866 for (i = 0; i < count; i++)
9867 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
9868
9869 kvfree(dev->_rx);
9870}
9871
9872static void netdev_init_one_queue(struct net_device *dev,
9873 struct netdev_queue *queue, void *_unused)
9874{
9875 /* Initialize queue lock */
9876 spin_lock_init(&queue->_xmit_lock);
9877 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
9878 queue->xmit_lock_owner = -1;
9879 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
9880 queue->dev = dev;
9881#ifdef CONFIG_BQL
9882 dql_init(&queue->dql, HZ);
9883#endif
9884}
9885
9886static void netif_free_tx_queues(struct net_device *dev)
9887{
9888 kvfree(dev->_tx);
9889}
9890
9891static int netif_alloc_netdev_queues(struct net_device *dev)
9892{
9893 unsigned int count = dev->num_tx_queues;
9894 struct netdev_queue *tx;
9895 size_t sz = count * sizeof(*tx);
9896
9897 if (count < 1 || count > 0xffff)
9898 return -EINVAL;
9899
9900 tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
9901 if (!tx)
9902 return -ENOMEM;
9903
9904 dev->_tx = tx;
9905
9906 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
9907 spin_lock_init(&dev->tx_global_lock);
9908
9909 return 0;
9910}
9911
9912void netif_tx_stop_all_queues(struct net_device *dev)
9913{
9914 unsigned int i;
9915
9916 for (i = 0; i < dev->num_tx_queues; i++) {
9917 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
9918
9919 netif_tx_stop_queue(txq);
9920 }
9921}
9922EXPORT_SYMBOL(netif_tx_stop_all_queues);
9923
9924/**
9925 * register_netdevice() - register a network device
9926 * @dev: device to register
9927 *
9928 * Take a prepared network device structure and make it externally accessible.
9929 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
9930 * Callers must hold the rtnl lock - you may want register_netdev()
9931 * instead of this.
9932 */
9933int register_netdevice(struct net_device *dev)
9934{
9935 int ret;
9936 struct net *net = dev_net(dev);
9937
9938 BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
9939 NETDEV_FEATURE_COUNT);
9940 BUG_ON(dev_boot_phase);
9941 ASSERT_RTNL();
9942
9943 might_sleep();
9944
9945 /* When net_device's are persistent, this will be fatal. */
9946 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9947 BUG_ON(!net);
9948
9949 ret = ethtool_check_ops(dev->ethtool_ops);
9950 if (ret)
9951 return ret;
9952
9953 spin_lock_init(&dev->addr_list_lock);
9954 netdev_set_addr_lockdep_class(dev);
9955
9956 ret = dev_get_valid_name(net, dev, dev->name);
9957 if (ret < 0)
9958 goto out;
9959
9960 ret = -ENOMEM;
9961 dev->name_node = netdev_name_node_head_alloc(dev);
9962 if (!dev->name_node)
9963 goto out;
9964
9965 /* Init, if this function is available */
9966 if (dev->netdev_ops->ndo_init) {
9967 ret = dev->netdev_ops->ndo_init(dev);
9968 if (ret) {
9969 if (ret > 0)
9970 ret = -EIO;
9971 goto err_free_name;
9972 }
9973 }
9974
9975 if (((dev->hw_features | dev->features) &
9976 NETIF_F_HW_VLAN_CTAG_FILTER) &&
9977 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
9978 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
9979 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
9980 ret = -EINVAL;
9981 goto err_uninit;
9982 }
9983
9984 ret = -EBUSY;
9985 if (!dev->ifindex)
9986 dev->ifindex = dev_new_index(net);
9987 else if (__dev_get_by_index(net, dev->ifindex))
9988 goto err_uninit;
9989
9990 /* Transfer changeable features to wanted_features and enable
9991 * software offloads (GSO and GRO).
9992 */
9993 dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
9994 dev->features |= NETIF_F_SOFT_FEATURES;
9995
9996 if (dev->udp_tunnel_nic_info) {
9997 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9998 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9999 }
10000
10001 dev->wanted_features = dev->features & dev->hw_features;
10002
10003 if (!(dev->flags & IFF_LOOPBACK))
10004 dev->hw_features |= NETIF_F_NOCACHE_COPY;
10005
10006 /* If IPv4 TCP segmentation offload is supported we should also
10007 * allow the device to enable segmenting the frame with the option
10008 * of ignoring a static IP ID value. This doesn't enable the
10009 * feature itself but allows the user to enable it later.
10010 */
10011 if (dev->hw_features & NETIF_F_TSO)
10012 dev->hw_features |= NETIF_F_TSO_MANGLEID;
10013 if (dev->vlan_features & NETIF_F_TSO)
10014 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10015 if (dev->mpls_features & NETIF_F_TSO)
10016 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10017 if (dev->hw_enc_features & NETIF_F_TSO)
10018 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10019
10020 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10021 */
10022 dev->vlan_features |= NETIF_F_HIGHDMA;
10023
10024 /* Make NETIF_F_SG inheritable to tunnel devices.
10025 */
10026 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10027
10028 /* Make NETIF_F_SG inheritable to MPLS.
10029 */
10030 dev->mpls_features |= NETIF_F_SG;
10031
10032 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10033 ret = notifier_to_errno(ret);
10034 if (ret)
10035 goto err_uninit;
10036
10037 ret = netdev_register_kobject(dev);
10038 write_lock(&dev_base_lock);
10039 dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
10040 write_unlock(&dev_base_lock);
10041 if (ret)
10042 goto err_uninit_notify;
10043
10044 __netdev_update_features(dev);
10045
10046 /*
10047 * Default initial state at registry is that the
10048 * device is present.
10049 */
10050
10051 set_bit(__LINK_STATE_PRESENT, &dev->state);
10052
10053 linkwatch_init_dev(dev);
10054
10055 dev_init_scheduler(dev);
10056
10057 netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10058 list_netdevice(dev);
10059
10060 add_device_randomness(dev->dev_addr, dev->addr_len);
10061
10062 /* If the device has permanent device address, driver should
10063 * set dev_addr and also addr_assign_type should be set to
10064 * NET_ADDR_PERM (default value).
10065 */
10066 if (dev->addr_assign_type == NET_ADDR_PERM)
10067 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10068
10069 /* Notify protocols, that a new device appeared. */
10070 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10071 ret = notifier_to_errno(ret);
10072 if (ret) {
10073 /* Expect explicit free_netdev() on failure */
10074 dev->needs_free_netdev = false;
10075 unregister_netdevice_queue(dev, NULL);
10076 goto out;
10077 }
10078 /*
10079 * Prevent userspace races by waiting until the network
10080 * device is fully setup before sending notifications.
10081 */
10082 if (!dev->rtnl_link_ops ||
10083 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10084 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10085
10086out:
10087 return ret;
10088
10089err_uninit_notify:
10090 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10091err_uninit:
10092 if (dev->netdev_ops->ndo_uninit)
10093 dev->netdev_ops->ndo_uninit(dev);
10094 if (dev->priv_destructor)
10095 dev->priv_destructor(dev);
10096err_free_name:
10097 netdev_name_node_free(dev->name_node);
10098 goto out;
10099}
10100EXPORT_SYMBOL(register_netdevice);
10101
10102/**
10103 * init_dummy_netdev - init a dummy network device for NAPI
10104 * @dev: device to init
10105 *
10106 * This takes a network device structure and initialize the minimum
10107 * amount of fields so it can be used to schedule NAPI polls without
10108 * registering a full blown interface. This is to be used by drivers
10109 * that need to tie several hardware interfaces to a single NAPI
10110 * poll scheduler due to HW limitations.
10111 */
10112int init_dummy_netdev(struct net_device *dev)
10113{
10114 /* Clear everything. Note we don't initialize spinlocks
10115 * are they aren't supposed to be taken by any of the
10116 * NAPI code and this dummy netdev is supposed to be
10117 * only ever used for NAPI polls
10118 */
10119 memset(dev, 0, sizeof(struct net_device));
10120
10121 /* make sure we BUG if trying to hit standard
10122 * register/unregister code path
10123 */
10124 dev->reg_state = NETREG_DUMMY;
10125
10126 /* NAPI wants this */
10127 INIT_LIST_HEAD(&dev->napi_list);
10128
10129 /* a dummy interface is started by default */
10130 set_bit(__LINK_STATE_PRESENT, &dev->state);
10131 set_bit(__LINK_STATE_START, &dev->state);
10132
10133 /* napi_busy_loop stats accounting wants this */
10134 dev_net_set(dev, &init_net);
10135
10136 /* Note : We dont allocate pcpu_refcnt for dummy devices,
10137 * because users of this 'device' dont need to change
10138 * its refcount.
10139 */
10140
10141 return 0;
10142}
10143EXPORT_SYMBOL_GPL(init_dummy_netdev);
10144
10145
10146/**
10147 * register_netdev - register a network device
10148 * @dev: device to register
10149 *
10150 * Take a completed network device structure and add it to the kernel
10151 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10152 * chain. 0 is returned on success. A negative errno code is returned
10153 * on a failure to set up the device, or if the name is a duplicate.
10154 *
10155 * This is a wrapper around register_netdevice that takes the rtnl semaphore
10156 * and expands the device name if you passed a format string to
10157 * alloc_netdev.
10158 */
10159int register_netdev(struct net_device *dev)
10160{
10161 int err;
10162
10163 if (rtnl_lock_killable())
10164 return -EINTR;
10165 err = register_netdevice(dev);
10166 rtnl_unlock();
10167 return err;
10168}
10169EXPORT_SYMBOL(register_netdev);
10170
10171int netdev_refcnt_read(const struct net_device *dev)
10172{
10173#ifdef CONFIG_PCPU_DEV_REFCNT
10174 int i, refcnt = 0;
10175
10176 for_each_possible_cpu(i)
10177 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10178 return refcnt;
10179#else
10180 return refcount_read(&dev->dev_refcnt);
10181#endif
10182}
10183EXPORT_SYMBOL(netdev_refcnt_read);
10184
10185int netdev_unregister_timeout_secs __read_mostly = 10;
10186
10187#define WAIT_REFS_MIN_MSECS 1
10188#define WAIT_REFS_MAX_MSECS 250
10189/**
10190 * netdev_wait_allrefs_any - wait until all references are gone.
10191 * @list: list of net_devices to wait on
10192 *
10193 * This is called when unregistering network devices.
10194 *
10195 * Any protocol or device that holds a reference should register
10196 * for netdevice notification, and cleanup and put back the
10197 * reference if they receive an UNREGISTER event.
10198 * We can get stuck here if buggy protocols don't correctly
10199 * call dev_put.
10200 */
10201static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10202{
10203 unsigned long rebroadcast_time, warning_time;
10204 struct net_device *dev;
10205 int wait = 0;
10206
10207 rebroadcast_time = warning_time = jiffies;
10208
10209 list_for_each_entry(dev, list, todo_list)
10210 if (netdev_refcnt_read(dev) == 1)
10211 return dev;
10212
10213 while (true) {
10214 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10215 rtnl_lock();
10216
10217 /* Rebroadcast unregister notification */
10218 list_for_each_entry(dev, list, todo_list)
10219 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10220
10221 __rtnl_unlock();
10222 rcu_barrier();
10223 rtnl_lock();
10224
10225 list_for_each_entry(dev, list, todo_list)
10226 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10227 &dev->state)) {
10228 /* We must not have linkwatch events
10229 * pending on unregister. If this
10230 * happens, we simply run the queue
10231 * unscheduled, resulting in a noop
10232 * for this device.
10233 */
10234 linkwatch_run_queue();
10235 break;
10236 }
10237
10238 __rtnl_unlock();
10239
10240 rebroadcast_time = jiffies;
10241 }
10242
10243 if (!wait) {
10244 rcu_barrier();
10245 wait = WAIT_REFS_MIN_MSECS;
10246 } else {
10247 msleep(wait);
10248 wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10249 }
10250
10251 list_for_each_entry(dev, list, todo_list)
10252 if (netdev_refcnt_read(dev) == 1)
10253 return dev;
10254
10255 if (time_after(jiffies, warning_time +
10256 READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10257 list_for_each_entry(dev, list, todo_list) {
10258 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10259 dev->name, netdev_refcnt_read(dev));
10260 ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10261 }
10262
10263 warning_time = jiffies;
10264 }
10265 }
10266}
10267
10268/* The sequence is:
10269 *
10270 * rtnl_lock();
10271 * ...
10272 * register_netdevice(x1);
10273 * register_netdevice(x2);
10274 * ...
10275 * unregister_netdevice(y1);
10276 * unregister_netdevice(y2);
10277 * ...
10278 * rtnl_unlock();
10279 * free_netdev(y1);
10280 * free_netdev(y2);
10281 *
10282 * We are invoked by rtnl_unlock().
10283 * This allows us to deal with problems:
10284 * 1) We can delete sysfs objects which invoke hotplug
10285 * without deadlocking with linkwatch via keventd.
10286 * 2) Since we run with the RTNL semaphore not held, we can sleep
10287 * safely in order to wait for the netdev refcnt to drop to zero.
10288 *
10289 * We must not return until all unregister events added during
10290 * the interval the lock was held have been completed.
10291 */
10292void netdev_run_todo(void)
10293{
10294 struct net_device *dev, *tmp;
10295 struct list_head list;
10296#ifdef CONFIG_LOCKDEP
10297 struct list_head unlink_list;
10298
10299 list_replace_init(&net_unlink_list, &unlink_list);
10300
10301 while (!list_empty(&unlink_list)) {
10302 struct net_device *dev = list_first_entry(&unlink_list,
10303 struct net_device,
10304 unlink_list);
10305 list_del_init(&dev->unlink_list);
10306 dev->nested_level = dev->lower_level - 1;
10307 }
10308#endif
10309
10310 /* Snapshot list, allow later requests */
10311 list_replace_init(&net_todo_list, &list);
10312
10313 __rtnl_unlock();
10314
10315 /* Wait for rcu callbacks to finish before next phase */
10316 if (!list_empty(&list))
10317 rcu_barrier();
10318
10319 list_for_each_entry_safe(dev, tmp, &list, todo_list) {
10320 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10321 netdev_WARN(dev, "run_todo but not unregistering\n");
10322 list_del(&dev->todo_list);
10323 continue;
10324 }
10325
10326 write_lock(&dev_base_lock);
10327 dev->reg_state = NETREG_UNREGISTERED;
10328 write_unlock(&dev_base_lock);
10329 linkwatch_forget_dev(dev);
10330 }
10331
10332 while (!list_empty(&list)) {
10333 dev = netdev_wait_allrefs_any(&list);
10334 list_del(&dev->todo_list);
10335
10336 /* paranoia */
10337 BUG_ON(netdev_refcnt_read(dev) != 1);
10338 BUG_ON(!list_empty(&dev->ptype_all));
10339 BUG_ON(!list_empty(&dev->ptype_specific));
10340 WARN_ON(rcu_access_pointer(dev->ip_ptr));
10341 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10342
10343 if (dev->priv_destructor)
10344 dev->priv_destructor(dev);
10345 if (dev->needs_free_netdev)
10346 free_netdev(dev);
10347
10348 if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
10349 wake_up(&netdev_unregistering_wq);
10350
10351 /* Free network device */
10352 kobject_put(&dev->dev.kobj);
10353 }
10354}
10355
10356/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10357 * all the same fields in the same order as net_device_stats, with only
10358 * the type differing, but rtnl_link_stats64 may have additional fields
10359 * at the end for newer counters.
10360 */
10361void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10362 const struct net_device_stats *netdev_stats)
10363{
10364 size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10365 const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10366 u64 *dst = (u64 *)stats64;
10367
10368 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10369 for (i = 0; i < n; i++)
10370 dst[i] = (unsigned long)atomic_long_read(&src[i]);
10371 /* zero out counters that only exist in rtnl_link_stats64 */
10372 memset((char *)stats64 + n * sizeof(u64), 0,
10373 sizeof(*stats64) - n * sizeof(u64));
10374}
10375EXPORT_SYMBOL(netdev_stats_to_stats64);
10376
10377struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
10378{
10379 struct net_device_core_stats __percpu *p;
10380
10381 p = alloc_percpu_gfp(struct net_device_core_stats,
10382 GFP_ATOMIC | __GFP_NOWARN);
10383
10384 if (p && cmpxchg(&dev->core_stats, NULL, p))
10385 free_percpu(p);
10386
10387 /* This READ_ONCE() pairs with the cmpxchg() above */
10388 return READ_ONCE(dev->core_stats);
10389}
10390EXPORT_SYMBOL(netdev_core_stats_alloc);
10391
10392/**
10393 * dev_get_stats - get network device statistics
10394 * @dev: device to get statistics from
10395 * @storage: place to store stats
10396 *
10397 * Get network statistics from device. Return @storage.
10398 * The device driver may provide its own method by setting
10399 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10400 * otherwise the internal statistics structure is used.
10401 */
10402struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10403 struct rtnl_link_stats64 *storage)
10404{
10405 const struct net_device_ops *ops = dev->netdev_ops;
10406 const struct net_device_core_stats __percpu *p;
10407
10408 if (ops->ndo_get_stats64) {
10409 memset(storage, 0, sizeof(*storage));
10410 ops->ndo_get_stats64(dev, storage);
10411 } else if (ops->ndo_get_stats) {
10412 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10413 } else {
10414 netdev_stats_to_stats64(storage, &dev->stats);
10415 }
10416
10417 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10418 p = READ_ONCE(dev->core_stats);
10419 if (p) {
10420 const struct net_device_core_stats *core_stats;
10421 int i;
10422
10423 for_each_possible_cpu(i) {
10424 core_stats = per_cpu_ptr(p, i);
10425 storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10426 storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10427 storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10428 storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10429 }
10430 }
10431 return storage;
10432}
10433EXPORT_SYMBOL(dev_get_stats);
10434
10435/**
10436 * dev_fetch_sw_netstats - get per-cpu network device statistics
10437 * @s: place to store stats
10438 * @netstats: per-cpu network stats to read from
10439 *
10440 * Read per-cpu network statistics and populate the related fields in @s.
10441 */
10442void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10443 const struct pcpu_sw_netstats __percpu *netstats)
10444{
10445 int cpu;
10446
10447 for_each_possible_cpu(cpu) {
10448 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10449 const struct pcpu_sw_netstats *stats;
10450 unsigned int start;
10451
10452 stats = per_cpu_ptr(netstats, cpu);
10453 do {
10454 start = u64_stats_fetch_begin(&stats->syncp);
10455 rx_packets = u64_stats_read(&stats->rx_packets);
10456 rx_bytes = u64_stats_read(&stats->rx_bytes);
10457 tx_packets = u64_stats_read(&stats->tx_packets);
10458 tx_bytes = u64_stats_read(&stats->tx_bytes);
10459 } while (u64_stats_fetch_retry(&stats->syncp, start));
10460
10461 s->rx_packets += rx_packets;
10462 s->rx_bytes += rx_bytes;
10463 s->tx_packets += tx_packets;
10464 s->tx_bytes += tx_bytes;
10465 }
10466}
10467EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10468
10469/**
10470 * dev_get_tstats64 - ndo_get_stats64 implementation
10471 * @dev: device to get statistics from
10472 * @s: place to store stats
10473 *
10474 * Populate @s from dev->stats and dev->tstats. Can be used as
10475 * ndo_get_stats64() callback.
10476 */
10477void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10478{
10479 netdev_stats_to_stats64(s, &dev->stats);
10480 dev_fetch_sw_netstats(s, dev->tstats);
10481}
10482EXPORT_SYMBOL_GPL(dev_get_tstats64);
10483
/* Return the ingress queue of @dev, creating it on first use.
 *
 * Creation only happens when CONFIG_NET_CLS_ACT is enabled; otherwise the
 * current value of dev_ingress_queue() is returned as is. Returns NULL when
 * no queue exists and none could be allocated.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *q = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!q) {
		q = kzalloc(sizeof(*q), GFP_KERNEL);
		if (q) {
			netdev_init_one_queue(dev, q, NULL);
			/* A fresh ingress queue starts on the noop qdisc. */
			RCU_INIT_POINTER(q->qdisc, &noop_qdisc);
			q->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, q);
		}
	}
#endif
	return q;
}
10501
10502static const struct ethtool_ops default_ethtool_ops;
10503
10504void netdev_set_default_ethtool_ops(struct net_device *dev,
10505 const struct ethtool_ops *ops)
10506{
10507 if (dev->ethtool_ops == &default_ethtool_ops)
10508 dev->ethtool_ops = ops;
10509}
10510EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10511
10512/**
10513 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10514 * @dev: netdev to enable the IRQ coalescing on
10515 *
10516 * Sets a conservative default for SW IRQ coalescing. Users can use
10517 * sysfs attributes to override the default values.
10518 */
10519void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10520{
10521 WARN_ON(dev->reg_state == NETREG_REGISTERED);
10522
10523 dev->gro_flush_timeout = 20000;
10524 dev->napi_defer_hard_irqs = 1;
10525}
10526EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
10527
10528void netdev_freemem(struct net_device *dev)
10529{
10530 char *addr = (char *)dev - dev->padded;
10531
10532 kvfree(addr);
10533}
10534
10535/**
10536 * alloc_netdev_mqs - allocate network device
10537 * @sizeof_priv: size of private data to allocate space for
10538 * @name: device name format string
10539 * @name_assign_type: origin of device name
10540 * @setup: callback to initialize device
10541 * @txqs: the number of TX subqueues to allocate
10542 * @rxqs: the number of RX subqueues to allocate
10543 *
10544 * Allocates a struct net_device with private data area for driver use
10545 * and performs basic initialization. Also allocates subqueue structs
10546 * for each queue on the device.
10547 */
10548struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10549 unsigned char name_assign_type,
10550 void (*setup)(struct net_device *),
10551 unsigned int txqs, unsigned int rxqs)
10552{
10553 struct net_device *dev;
10554 unsigned int alloc_size;
10555 struct net_device *p;
10556
10557 BUG_ON(strlen(name) >= sizeof(dev->name));
10558
10559 if (txqs < 1) {
10560 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10561 return NULL;
10562 }
10563
10564 if (rxqs < 1) {
10565 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10566 return NULL;
10567 }
10568
10569 alloc_size = sizeof(struct net_device);
10570 if (sizeof_priv) {
10571 /* ensure 32-byte alignment of private area */
10572 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10573 alloc_size += sizeof_priv;
10574 }
10575 /* ensure 32-byte alignment of whole construct */
10576 alloc_size += NETDEV_ALIGN - 1;
10577
10578 p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10579 if (!p)
10580 return NULL;
10581
10582 dev = PTR_ALIGN(p, NETDEV_ALIGN);
10583 dev->padded = (char *)dev - (char *)p;
10584
10585 ref_tracker_dir_init(&dev->refcnt_tracker, 128);
10586#ifdef CONFIG_PCPU_DEV_REFCNT
10587 dev->pcpu_refcnt = alloc_percpu(int);
10588 if (!dev->pcpu_refcnt)
10589 goto free_dev;
10590 __dev_hold(dev);
10591#else
10592 refcount_set(&dev->dev_refcnt, 1);
10593#endif
10594
10595 if (dev_addr_init(dev))
10596 goto free_pcpu;
10597
10598 dev_mc_init(dev);
10599 dev_uc_init(dev);
10600
10601 dev_net_set(dev, &init_net);
10602
10603 dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
10604 dev->gso_max_segs = GSO_MAX_SEGS;
10605 dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
10606 dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
10607 dev->tso_max_segs = TSO_MAX_SEGS;
10608 dev->upper_level = 1;
10609 dev->lower_level = 1;
10610#ifdef CONFIG_LOCKDEP
10611 dev->nested_level = 0;
10612 INIT_LIST_HEAD(&dev->unlink_list);
10613#endif
10614
10615 INIT_LIST_HEAD(&dev->napi_list);
10616 INIT_LIST_HEAD(&dev->unreg_list);
10617 INIT_LIST_HEAD(&dev->close_list);
10618 INIT_LIST_HEAD(&dev->link_watch_list);
10619 INIT_LIST_HEAD(&dev->adj_list.upper);
10620 INIT_LIST_HEAD(&dev->adj_list.lower);
10621 INIT_LIST_HEAD(&dev->ptype_all);
10622 INIT_LIST_HEAD(&dev->ptype_specific);
10623 INIT_LIST_HEAD(&dev->net_notifier_list);
10624#ifdef CONFIG_NET_SCHED
10625 hash_init(dev->qdisc_hash);
10626#endif
10627 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10628 setup(dev);
10629
10630 if (!dev->tx_queue_len) {
10631 dev->priv_flags |= IFF_NO_QUEUE;
10632 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10633 }
10634
10635 dev->num_tx_queues = txqs;
10636 dev->real_num_tx_queues = txqs;
10637 if (netif_alloc_netdev_queues(dev))
10638 goto free_all;
10639
10640 dev->num_rx_queues = rxqs;
10641 dev->real_num_rx_queues = rxqs;
10642 if (netif_alloc_rx_queues(dev))
10643 goto free_all;
10644
10645 strcpy(dev->name, name);
10646 dev->name_assign_type = name_assign_type;
10647 dev->group = INIT_NETDEV_GROUP;
10648 if (!dev->ethtool_ops)
10649 dev->ethtool_ops = &default_ethtool_ops;
10650
10651 nf_hook_netdev_init(dev);
10652
10653 return dev;
10654
10655free_all:
10656 free_netdev(dev);
10657 return NULL;
10658
10659free_pcpu:
10660#ifdef CONFIG_PCPU_DEV_REFCNT
10661 free_percpu(dev->pcpu_refcnt);
10662free_dev:
10663#endif
10664 netdev_freemem(dev);
10665 return NULL;
10666}
10667EXPORT_SYMBOL(alloc_netdev_mqs);
10668
10669/**
10670 * free_netdev - free network device
10671 * @dev: device
10672 *
10673 * This function does the last stage of destroying an allocated device
10674 * interface. The reference to the device object is released. If this
10675 * is the last reference then it will be freed.Must be called in process
10676 * context.
10677 */
10678void free_netdev(struct net_device *dev)
10679{
10680 struct napi_struct *p, *n;
10681
10682 might_sleep();
10683
10684 /* When called immediately after register_netdevice() failed the unwind
10685 * handling may still be dismantling the device. Handle that case by
10686 * deferring the free.
10687 */
10688 if (dev->reg_state == NETREG_UNREGISTERING) {
10689 ASSERT_RTNL();
10690 dev->needs_free_netdev = true;
10691 return;
10692 }
10693
10694 netif_free_tx_queues(dev);
10695 netif_free_rx_queues(dev);
10696
10697 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10698
10699 /* Flush device addresses */
10700 dev_addr_flush(dev);
10701
10702 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10703 netif_napi_del(p);
10704
10705 ref_tracker_dir_exit(&dev->refcnt_tracker);
10706#ifdef CONFIG_PCPU_DEV_REFCNT
10707 free_percpu(dev->pcpu_refcnt);
10708 dev->pcpu_refcnt = NULL;
10709#endif
10710 free_percpu(dev->core_stats);
10711 dev->core_stats = NULL;
10712 free_percpu(dev->xdp_bulkq);
10713 dev->xdp_bulkq = NULL;
10714
10715 /* Compatibility with error handling in drivers */
10716 if (dev->reg_state == NETREG_UNINITIALIZED) {
10717 netdev_freemem(dev);
10718 return;
10719 }
10720
10721 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10722 dev->reg_state = NETREG_RELEASED;
10723
10724 /* will free via device release */
10725 put_device(&dev->dev);
10726}
10727EXPORT_SYMBOL(free_netdev);
10728
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU variant is used when rtnl_is_locked() reports
 *	true; otherwise a normal synchronize_rcu() is issued.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10744
10745/**
10746 * unregister_netdevice_queue - remove device from the kernel
10747 * @dev: device
10748 * @head: list
10749 *
10750 * This function shuts down a device interface and removes it
10751 * from the kernel tables.
10752 * If head not NULL, device is queued to be unregistered later.
10753 *
10754 * Callers must hold the rtnl semaphore. You may want
10755 * unregister_netdev() instead of this.
10756 */
10757
10758void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10759{
10760 ASSERT_RTNL();
10761
10762 if (head) {
10763 list_move_tail(&dev->unreg_list, head);
10764 } else {
10765 LIST_HEAD(single);
10766
10767 list_add(&dev->unreg_list, &single);
10768 unregister_netdevice_many(&single);
10769 }
10770}
10771EXPORT_SYMBOL(unregister_netdevice_queue);
10772
10773void unregister_netdevice_many_notify(struct list_head *head,
10774 u32 portid, const struct nlmsghdr *nlh)
10775{
10776 struct net_device *dev, *tmp;
10777 LIST_HEAD(close_head);
10778
10779 BUG_ON(dev_boot_phase);
10780 ASSERT_RTNL();
10781
10782 if (list_empty(head))
10783 return;
10784
10785 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
10786 /* Some devices call without registering
10787 * for initialization unwind. Remove those
10788 * devices and proceed with the remaining.
10789 */
10790 if (dev->reg_state == NETREG_UNINITIALIZED) {
10791 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
10792 dev->name, dev);
10793
10794 WARN_ON(1);
10795 list_del(&dev->unreg_list);
10796 continue;
10797 }
10798 dev->dismantle = true;
10799 BUG_ON(dev->reg_state != NETREG_REGISTERED);
10800 }
10801
10802 /* If device is running, close it first. */
10803 list_for_each_entry(dev, head, unreg_list)
10804 list_add_tail(&dev->close_list, &close_head);
10805 dev_close_many(&close_head, true);
10806
10807 list_for_each_entry(dev, head, unreg_list) {
10808 /* And unlink it from device chain. */
10809 write_lock(&dev_base_lock);
10810 unlist_netdevice(dev, false);
10811 dev->reg_state = NETREG_UNREGISTERING;
10812 write_unlock(&dev_base_lock);
10813 }
10814 flush_all_backlogs();
10815
10816 synchronize_net();
10817
10818 list_for_each_entry(dev, head, unreg_list) {
10819 struct sk_buff *skb = NULL;
10820
10821 /* Shutdown queueing discipline. */
10822 dev_shutdown(dev);
10823
10824 dev_xdp_uninstall(dev);
10825
10826 netdev_offload_xstats_disable_all(dev);
10827
10828 /* Notify protocols, that we are about to destroy
10829 * this device. They should clean all the things.
10830 */
10831 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10832
10833 if (!dev->rtnl_link_ops ||
10834 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10835 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
10836 GFP_KERNEL, NULL, 0,
10837 portid, nlmsg_seq(nlh));
10838
10839 /*
10840 * Flush the unicast and multicast chains
10841 */
10842 dev_uc_flush(dev);
10843 dev_mc_flush(dev);
10844
10845 netdev_name_node_alt_flush(dev);
10846 netdev_name_node_free(dev->name_node);
10847
10848 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10849
10850 if (dev->netdev_ops->ndo_uninit)
10851 dev->netdev_ops->ndo_uninit(dev);
10852
10853 if (skb)
10854 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
10855
10856 /* Notifier chain MUST detach us all upper devices. */
10857 WARN_ON(netdev_has_any_upper_dev(dev));
10858 WARN_ON(netdev_has_any_lower_dev(dev));
10859
10860 /* Remove entries from kobject tree */
10861 netdev_unregister_kobject(dev);
10862#ifdef CONFIG_XPS
10863 /* Remove XPS queueing entries */
10864 netif_reset_xps_queues_gt(dev, 0);
10865#endif
10866 }
10867
10868 synchronize_net();
10869
10870 list_for_each_entry(dev, head, unreg_list) {
10871 netdev_put(dev, &dev->dev_registered_tracker);
10872 net_set_todo(dev);
10873 }
10874
10875 list_del(head);
10876}
10877
10878/**
10879 * unregister_netdevice_many - unregister many devices
10880 * @head: list of devices
10881 *
10882 * Note: As most callers use a stack allocated list_head,
10883 * we force a list_del() to make sure stack wont be corrupted later.
10884 */
10885void unregister_netdevice_many(struct list_head *head)
10886{
10887 unregister_netdevice_many_notify(head, 0, NULL);
10888}
10889EXPORT_SYMBOL(unregister_netdevice_many);
10890
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It simply wraps unregister_netdevice() in rtnl_lock() /
 *	rtnl_unlock(). In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	/* The actual work happens with the RTNL held. */
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
10909
10910/**
10911 * __dev_change_net_namespace - move device to different nethost namespace
10912 * @dev: device
10913 * @net: network namespace
10914 * @pat: If not NULL name pattern to try if the current device name
10915 * is already taken in the destination network namespace.
10916 * @new_ifindex: If not zero, specifies device index in the target
10917 * namespace.
10918 *
10919 * This function shuts down a device interface and moves it
10920 * to a new network namespace. On success 0 is returned, on
10921 * a failure a netagive errno code is returned.
10922 *
10923 * Callers must hold the rtnl semaphore.
10924 */
10925
10926int __dev_change_net_namespace(struct net_device *dev, struct net *net,
10927 const char *pat, int new_ifindex)
10928{
10929 struct net *net_old = dev_net(dev);
10930 int err, new_nsid;
10931
10932 ASSERT_RTNL();
10933
10934 /* Don't allow namespace local devices to be moved. */
10935 err = -EINVAL;
10936 if (dev->features & NETIF_F_NETNS_LOCAL)
10937 goto out;
10938
10939 /* Ensure the device has been registrered */
10940 if (dev->reg_state != NETREG_REGISTERED)
10941 goto out;
10942
10943 /* Get out if there is nothing todo */
10944 err = 0;
10945 if (net_eq(net_old, net))
10946 goto out;
10947
10948 /* Pick the destination device name, and ensure
10949 * we can use it in the destination network namespace.
10950 */
10951 err = -EEXIST;
10952 if (netdev_name_in_use(net, dev->name)) {
10953 /* We get here if we can't use the current device name */
10954 if (!pat)
10955 goto out;
10956 err = dev_get_valid_name(net, dev, pat);
10957 if (err < 0)
10958 goto out;
10959 }
10960
10961 /* Check that new_ifindex isn't used yet. */
10962 err = -EBUSY;
10963 if (new_ifindex && __dev_get_by_index(net, new_ifindex))
10964 goto out;
10965
10966 /*
10967 * And now a mini version of register_netdevice unregister_netdevice.
10968 */
10969
10970 /* If device is running close it first. */
10971 dev_close(dev);
10972
10973 /* And unlink it from device chain */
10974 unlist_netdevice(dev, true);
10975
10976 synchronize_net();
10977
10978 /* Shutdown queueing discipline. */
10979 dev_shutdown(dev);
10980
10981 /* Notify protocols, that we are about to destroy
10982 * this device. They should clean all the things.
10983 *
10984 * Note that dev->reg_state stays at NETREG_REGISTERED.
10985 * This is wanted because this way 8021q and macvlan know
10986 * the device is just moving and can keep their slaves up.
10987 */
10988 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10989 rcu_barrier();
10990
10991 new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10992 /* If there is an ifindex conflict assign a new one */
10993 if (!new_ifindex) {
10994 if (__dev_get_by_index(net, dev->ifindex))
10995 new_ifindex = dev_new_index(net);
10996 else
10997 new_ifindex = dev->ifindex;
10998 }
10999
11000 rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11001 new_ifindex);
11002
11003 /*
11004 * Flush the unicast and multicast chains
11005 */
11006 dev_uc_flush(dev);
11007 dev_mc_flush(dev);
11008
11009 /* Send a netdev-removed uevent to the old namespace */
11010 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11011 netdev_adjacent_del_links(dev);
11012
11013 /* Move per-net netdevice notifiers that are following the netdevice */
11014 move_netdevice_notifiers_dev_net(dev, net);
11015
11016 /* Actually switch the network namespace */
11017 dev_net_set(dev, net);
11018 dev->ifindex = new_ifindex;
11019
11020 /* Send a netdev-add uevent to the new namespace */
11021 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11022 netdev_adjacent_add_links(dev);
11023
11024 /* Fixup kobjects */
11025 err = device_rename(&dev->dev, dev->name);
11026 WARN_ON(err);
11027
11028 /* Adapt owner in case owning user namespace of target network
11029 * namespace is different from the original one.
11030 */
11031 err = netdev_change_owner(dev, net_old, net);
11032 WARN_ON(err);
11033
11034 /* Add the device back in the hashes */
11035 list_netdevice(dev);
11036
11037 /* Notify protocols, that a new device appeared. */
11038 call_netdevice_notifiers(NETDEV_REGISTER, dev);
11039
11040 /*
11041 * Prevent userspace races by waiting until the network
11042 * device is fully setup before sending notifications.
11043 */
11044 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11045
11046 synchronize_net();
11047 err = 0;
11048out:
11049 return err;
11050}
11051EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11052
11053static int dev_cpu_dead(unsigned int oldcpu)
11054{
11055 struct sk_buff **list_skb;
11056 struct sk_buff *skb;
11057 unsigned int cpu;
11058 struct softnet_data *sd, *oldsd, *remsd = NULL;
11059
11060 local_irq_disable();
11061 cpu = smp_processor_id();
11062 sd = &per_cpu(softnet_data, cpu);
11063 oldsd = &per_cpu(softnet_data, oldcpu);
11064
11065 /* Find end of our completion_queue. */
11066 list_skb = &sd->completion_queue;
11067 while (*list_skb)
11068 list_skb = &(*list_skb)->next;
11069 /* Append completion queue from offline CPU. */
11070 *list_skb = oldsd->completion_queue;
11071 oldsd->completion_queue = NULL;
11072
11073 /* Append output queue from offline CPU. */
11074 if (oldsd->output_queue) {
11075 *sd->output_queue_tailp = oldsd->output_queue;
11076 sd->output_queue_tailp = oldsd->output_queue_tailp;
11077 oldsd->output_queue = NULL;
11078 oldsd->output_queue_tailp = &oldsd->output_queue;
11079 }
11080 /* Append NAPI poll list from offline CPU, with one exception :
11081 * process_backlog() must be called by cpu owning percpu backlog.
11082 * We properly handle process_queue & input_pkt_queue later.
11083 */
11084 while (!list_empty(&oldsd->poll_list)) {
11085 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11086 struct napi_struct,
11087 poll_list);
11088
11089 list_del_init(&napi->poll_list);
11090 if (napi->poll == process_backlog)
11091 napi->state = 0;
11092 else
11093 ____napi_schedule(sd, napi);
11094 }
11095
11096 raise_softirq_irqoff(NET_TX_SOFTIRQ);
11097 local_irq_enable();
11098
11099#ifdef CONFIG_RPS
11100 remsd = oldsd->rps_ipi_list;
11101 oldsd->rps_ipi_list = NULL;
11102#endif
11103 /* send out pending IPI's on offline CPU */
11104 net_rps_send_ipi(remsd);
11105
11106 /* Process offline CPU's input_pkt_queue */
11107 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11108 netif_rx(skb);
11109 input_queue_head_incr(oldsd);
11110 }
11111 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11112 netif_rx(skb);
11113 input_queue_head_incr(oldsd);
11114 }
11115
11116 return 0;
11117}
11118
11119/**
11120 * netdev_increment_features - increment feature set by one
11121 * @all: current feature set
11122 * @one: new feature set
11123 * @mask: mask feature set
11124 *
11125 * Computes a new feature set after adding a device with feature set
11126 * @one to the master device with current feature set @all. Will not
11127 * enable anything that is off in @mask. Returns the new feature set.
11128 */
11129netdev_features_t netdev_increment_features(netdev_features_t all,
11130 netdev_features_t one, netdev_features_t mask)
11131{
11132 if (mask & NETIF_F_HW_CSUM)
11133 mask |= NETIF_F_CSUM_MASK;
11134 mask |= NETIF_F_VLAN_CHALLENGED;
11135
11136 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11137 all &= one | ~NETIF_F_ALL_FOR_ALL;
11138
11139 /* If one device supports hw checksumming, set for all. */
11140 if (all & NETIF_F_HW_CSUM)
11141 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11142
11143 return all;
11144}
11145EXPORT_SYMBOL(netdev_increment_features);
11146
11147static struct hlist_head * __net_init netdev_create_hash(void)
11148{
11149 int i;
11150 struct hlist_head *hash;
11151
11152 hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11153 if (hash != NULL)
11154 for (i = 0; i < NETDEV_HASHENTRIES; i++)
11155 INIT_HLIST_HEAD(&hash[i]);
11156
11157 return hash;
11158}
11159
11160/* Initialize per network namespace state */
11161static int __net_init netdev_init(struct net *net)
11162{
11163 BUILD_BUG_ON(GRO_HASH_BUCKETS >
11164 8 * sizeof_field(struct napi_struct, gro_bitmask));
11165
11166 INIT_LIST_HEAD(&net->dev_base_head);
11167
11168 net->dev_name_head = netdev_create_hash();
11169 if (net->dev_name_head == NULL)
11170 goto err_name;
11171
11172 net->dev_index_head = netdev_create_hash();
11173 if (net->dev_index_head == NULL)
11174 goto err_idx;
11175
11176 RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11177
11178 return 0;
11179
11180err_idx:
11181 kfree(net->dev_name_head);
11182err_name:
11183 return -ENOMEM;
11184}
11185
11186/**
11187 * netdev_drivername - network driver for the device
11188 * @dev: network device
11189 *
11190 * Determine network driver for device.
11191 */
11192const char *netdev_drivername(const struct net_device *dev)
11193{
11194 const struct device_driver *driver;
11195 const struct device *parent;
11196 const char *empty = "";
11197
11198 parent = dev->dev.parent;
11199 if (!parent)
11200 return empty;
11201
11202 driver = parent->driver;
11203 if (driver && driver->name)
11204 return driver->name;
11205 return empty;
11206}
11207
11208static void __netdev_printk(const char *level, const struct net_device *dev,
11209 struct va_format *vaf)
11210{
11211 if (dev && dev->dev.parent) {
11212 dev_printk_emit(level[1] - '0',
11213 dev->dev.parent,
11214 "%s %s %s%s: %pV",
11215 dev_driver_string(dev->dev.parent),
11216 dev_name(dev->dev.parent),
11217 netdev_name(dev), netdev_reg_state(dev),
11218 vaf);
11219 } else if (dev) {
11220 printk("%s%s%s: %pV",
11221 level, netdev_name(dev), netdev_reg_state(dev), vaf);
11222 } else {
11223 printk("%s(NULL net_device): %pV", level, vaf);
11224 }
11225}
11226
11227void netdev_printk(const char *level, const struct net_device *dev,
11228 const char *format, ...)
11229{
11230 struct va_format vaf;
11231 va_list args;
11232
11233 va_start(args, format);
11234
11235 vaf.fmt = format;
11236 vaf.va = &args;
11237
11238 __netdev_printk(level, dev, &vaf);
11239
11240 va_end(args);
11241}
11242EXPORT_SYMBOL(netdev_printk);
11243
11244#define define_netdev_printk_level(func, level) \
11245void func(const struct net_device *dev, const char *fmt, ...) \
11246{ \
11247 struct va_format vaf; \
11248 va_list args; \
11249 \
11250 va_start(args, fmt); \
11251 \
11252 vaf.fmt = fmt; \
11253 vaf.va = &args; \
11254 \
11255 __netdev_printk(level, dev, &vaf); \
11256 \
11257 va_end(args); \
11258} \
11259EXPORT_SYMBOL(func);
11260
11261define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11262define_netdev_printk_level(netdev_alert, KERN_ALERT);
11263define_netdev_printk_level(netdev_crit, KERN_CRIT);
11264define_netdev_printk_level(netdev_err, KERN_ERR);
11265define_netdev_printk_level(netdev_warn, KERN_WARNING);
11266define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11267define_netdev_printk_level(netdev_info, KERN_INFO);
11268
11269static void __net_exit netdev_exit(struct net *net)
11270{
11271 kfree(net->dev_name_head);
11272 kfree(net->dev_index_head);
11273 if (net != &init_net)
11274 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11275}
11276
11277static struct pernet_operations __net_initdata netdev_net_ops = {
11278 .init = netdev_init,
11279 .exit = netdev_exit,
11280};
11281
11282static void __net_exit default_device_exit_net(struct net *net)
11283{
11284 struct net_device *dev, *aux;
11285 /*
11286 * Push all migratable network devices back to the
11287 * initial network namespace
11288 */
11289 ASSERT_RTNL();
11290 for_each_netdev_safe(net, dev, aux) {
11291 int err;
11292 char fb_name[IFNAMSIZ];
11293
11294 /* Ignore unmoveable devices (i.e. loopback) */
11295 if (dev->features & NETIF_F_NETNS_LOCAL)
11296 continue;
11297
11298 /* Leave virtual devices for the generic cleanup */
11299 if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11300 continue;
11301
11302 /* Push remaining network devices to init_net */
11303 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11304 if (netdev_name_in_use(&init_net, fb_name))
11305 snprintf(fb_name, IFNAMSIZ, "dev%%d");
11306 err = dev_change_net_namespace(dev, &init_net, fb_name);
11307 if (err) {
11308 pr_emerg("%s: failed to move %s to init_net: %d\n",
11309 __func__, dev->name, err);
11310 BUG();
11311 }
11312 }
11313}
11314
11315static void __net_exit default_device_exit_batch(struct list_head *net_list)
11316{
11317 /* At exit all network devices most be removed from a network
11318 * namespace. Do this in the reverse order of registration.
11319 * Do this across as many network namespaces as possible to
11320 * improve batching efficiency.
11321 */
11322 struct net_device *dev;
11323 struct net *net;
11324 LIST_HEAD(dev_kill_list);
11325
11326 rtnl_lock();
11327 list_for_each_entry(net, net_list, exit_list) {
11328 default_device_exit_net(net);
11329 cond_resched();
11330 }
11331
11332 list_for_each_entry(net, net_list, exit_list) {
11333 for_each_netdev_reverse(net, dev) {
11334 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11335 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11336 else
11337 unregister_netdevice_queue(dev, &dev_kill_list);
11338 }
11339 }
11340 unregister_netdevice_many(&dev_kill_list);
11341 rtnl_unlock();
11342}
11343
11344static struct pernet_operations __net_initdata default_device_ops = {
11345 .exit_batch = default_device_exit_batch,
11346};
11347
11348/*
11349 * Initialize the DEV module. At boot time this walks the device list and
11350 * unhooks any devices that fail to initialise (normally hardware not
11351 * present) and leaves us with a valid list of present and active devices.
11352 *
11353 */
11354
11355/*
11356 * This is called single threaded during boot, so no need
11357 * to take the rtnl semaphore.
11358 */
11359static int __init net_dev_init(void)
11360{
11361 int i, rc = -ENOMEM;
11362
11363 BUG_ON(!dev_boot_phase);
11364
11365 if (dev_proc_init())
11366 goto out;
11367
11368 if (netdev_kobject_init())
11369 goto out;
11370
11371 INIT_LIST_HEAD(&ptype_all);
11372 for (i = 0; i < PTYPE_HASH_SIZE; i++)
11373 INIT_LIST_HEAD(&ptype_base[i]);
11374
11375 if (register_pernet_subsys(&netdev_net_ops))
11376 goto out;
11377
11378 /*
11379 * Initialise the packet receive queues.
11380 */
11381
11382 for_each_possible_cpu(i) {
11383 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11384 struct softnet_data *sd = &per_cpu(softnet_data, i);
11385
11386 INIT_WORK(flush, flush_backlog);
11387
11388 skb_queue_head_init(&sd->input_pkt_queue);
11389 skb_queue_head_init(&sd->process_queue);
11390#ifdef CONFIG_XFRM_OFFLOAD
11391 skb_queue_head_init(&sd->xfrm_backlog);
11392#endif
11393 INIT_LIST_HEAD(&sd->poll_list);
11394 sd->output_queue_tailp = &sd->output_queue;
11395#ifdef CONFIG_RPS
11396 INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11397 sd->cpu = i;
11398#endif
11399 INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
11400 spin_lock_init(&sd->defer_lock);
11401
11402 init_gro_hash(&sd->backlog);
11403 sd->backlog.poll = process_backlog;
11404 sd->backlog.weight = weight_p;
11405 }
11406
11407 dev_boot_phase = 0;
11408
11409 /* The loopback device is special if any other network devices
11410 * is present in a network namespace the loopback device must
11411 * be present. Since we now dynamically allocate and free the
11412 * loopback device ensure this invariant is maintained by
11413 * keeping the loopback device as the first device on the
11414 * list of network devices. Ensuring the loopback devices
11415 * is the first device that appears and the last network device
11416 * that disappears.
11417 */
11418 if (register_pernet_device(&loopback_net_ops))
11419 goto out;
11420
11421 if (register_pernet_device(&default_device_ops))
11422 goto out;
11423
11424 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11425 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11426
11427 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11428 NULL, dev_cpu_dead);
11429 WARN_ON(rc < 0);
11430 rc = 0;
11431out:
11432 return rc;
11433}
11434
11435subsys_initcall(net_dev_init);