Loading...
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
100#include <linux/stat.h>
101#include <net/dst.h>
102#include <net/pkt_sched.h>
103#include <net/checksum.h>
104#include <net/xfrm.h>
105#include <linux/highmem.h>
106#include <linux/init.h>
107#include <linux/module.h>
108#include <linux/netpoll.h>
109#include <linux/rcupdate.h>
110#include <linux/delay.h>
111#include <net/iw_handler.h>
112#include <asm/current.h>
113#include <linux/audit.h>
114#include <linux/dmaengine.h>
115#include <linux/err.h>
116#include <linux/ctype.h>
117#include <linux/if_arp.h>
118#include <linux/if_vlan.h>
119#include <linux/ip.h>
120#include <net/ip.h>
121#include <linux/ipv6.h>
122#include <linux/in.h>
123#include <linux/jhash.h>
124#include <linux/random.h>
125#include <trace/events/napi.h>
126#include <trace/events/net.h>
127#include <trace/events/skb.h>
128#include <linux/pci.h>
129#include <linux/inetdevice.h>
130#include <linux/cpu_rmap.h>
131#include <linux/static_key.h>
132#include <linux/hashtable.h>
133#include <linux/vmalloc.h>
134#include <linux/if_macvlan.h>
135
136#include "net-sysfs.h"
137
138/* Instead of increasing this, you should create a hash table. */
139#define MAX_GRO_SKBS 8
140
141/* This should be increased if a protocol with a bigger head is added. */
142#define GRO_MAX_HEAD (MAX_HEADER + 128)
143
144static DEFINE_SPINLOCK(ptype_lock);
145static DEFINE_SPINLOCK(offload_lock);
146struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
147struct list_head ptype_all __read_mostly; /* Taps */
148static struct list_head offload_base __read_mostly;
149
150static int netif_rx_internal(struct sk_buff *skb);
151
152/*
153 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
154 * semaphore.
155 *
156 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
157 *
158 * Writers must hold the rtnl semaphore while they loop through the
159 * dev_base_head list, and hold dev_base_lock for writing when they do the
160 * actual updates. This allows pure readers to access the list even
161 * while a writer is preparing to update it.
162 *
163 * To put it another way, dev_base_lock is held for writing only to
164 * protect against pure readers; the rtnl semaphore provides the
165 * protection against other writers.
166 *
167 * See, for example usages, register_netdevice() and
168 * unregister_netdevice(), which must be called with the rtnl
169 * semaphore held.
170 */
171DEFINE_RWLOCK(dev_base_lock);
172EXPORT_SYMBOL(dev_base_lock);
173
174/* protects napi_hash addition/deletion and napi_gen_id */
175static DEFINE_SPINLOCK(napi_hash_lock);
176
177static unsigned int napi_gen_id;
178static DEFINE_HASHTABLE(napi_hash, 8);
179
180static seqcount_t devnet_rename_seq;
181
182static inline void dev_base_seq_inc(struct net *net)
183{
184 while (++net->dev_base_seq == 0);
185}
186
187static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
188{
189 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
190
191 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
192}
193
194static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
195{
196 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
197}
198
/*
 * Take the spinlock of @sd's input_pkt_queue when CONFIG_RPS is set.
 * Compiles to nothing otherwise.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_lock(queue_lock);
#endif
}
205
/*
 * Release the spinlock of @sd's input_pkt_queue when CONFIG_RPS is set.
 * Compiles to nothing otherwise.  Counterpart of rps_lock().
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_unlock(queue_lock);
#endif
}
212
213/* Device list insertion */
214static void list_netdevice(struct net_device *dev)
215{
216 struct net *net = dev_net(dev);
217
218 ASSERT_RTNL();
219
220 write_lock_bh(&dev_base_lock);
221 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
222 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
223 hlist_add_head_rcu(&dev->index_hlist,
224 dev_index_hash(net, dev->ifindex));
225 write_unlock_bh(&dev_base_lock);
226
227 dev_base_seq_inc(net);
228}
229
230/* Device list removal
231 * caller must respect a RCU grace period before freeing/reusing dev
232 */
233static void unlist_netdevice(struct net_device *dev)
234{
235 ASSERT_RTNL();
236
237 /* Unlink dev from the device chain */
238 write_lock_bh(&dev_base_lock);
239 list_del_rcu(&dev->dev_list);
240 hlist_del_rcu(&dev->name_hlist);
241 hlist_del_rcu(&dev->index_hlist);
242 write_unlock_bh(&dev_base_lock);
243
244 dev_base_seq_inc(dev_net(dev));
245}
246
247/*
248 * Our notifier list
249 */
250
251static RAW_NOTIFIER_HEAD(netdev_chain);
252
253/*
254 * Device drivers call our routines to queue packets here. We empty the
255 * queue in the local softnet handler.
256 */
257
258DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
259EXPORT_PER_CPU_SYMBOL(softnet_data);
260
261#ifdef CONFIG_LOCKDEP
262/*
263 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
264 * according to dev->type
265 */
266static const unsigned short netdev_lock_type[] =
267 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
268 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
269 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
270 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
271 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
272 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
273 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
274 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
275 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
276 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
277 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
278 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
279 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
280 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
281 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
282
283static const char *const netdev_lock_name[] =
284 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
285 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
286 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
287 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
288 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
289 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
290 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
291 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
292 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
293 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
294 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
295 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
296 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
297 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
298 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
299
300static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
301static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
302
303static inline unsigned short netdev_lock_pos(unsigned short dev_type)
304{
305 int i;
306
307 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
308 if (netdev_lock_type[i] == dev_type)
309 return i;
310 /* the last key is used by default */
311 return ARRAY_SIZE(netdev_lock_type) - 1;
312}
313
314static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
315 unsigned short dev_type)
316{
317 int i;
318
319 i = netdev_lock_pos(dev_type);
320 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
321 netdev_lock_name[i]);
322}
323
324static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
325{
326 int i;
327
328 i = netdev_lock_pos(dev->type);
329 lockdep_set_class_and_name(&dev->addr_list_lock,
330 &netdev_addr_lock_key[i],
331 netdev_lock_name[i]);
332}
333#else
334static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
335 unsigned short dev_type)
336{
337}
/* Without CONFIG_LOCKDEP there is no lock class to assign. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	/* intentionally empty */
}
341#endif
342
343/*******************************************************************************
344
345 Protocol management and registration routines
346
347*******************************************************************************/
348
349/*
350 * Add a protocol ID to the list. Now that the input handler is
351 * smarter we can dispense with all the messy stuff that used to be
352 * here.
353 *
354 * BEWARE!!! Protocol handlers, mangling input packets,
355 * MUST BE last in hash buckets and checking protocol handlers
356 * MUST start from promiscuous ptype_all chain in net_bh.
357 * It is true now, do not change it.
358 * Explanation follows: if protocol handler, mangling packet, will
359 * be the first on list, it is not able to sense, that packet
360 * is cloned and should be copied-on-write, so that it will
361 * change it and subsequent readers will get broken packet.
362 * --ANK (980803)
363 */
364
365static inline struct list_head *ptype_head(const struct packet_type *pt)
366{
367 if (pt->type == htons(ETH_P_ALL))
368 return &ptype_all;
369 else
370 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
371}
372
373/**
374 * dev_add_pack - add packet handler
375 * @pt: packet type declaration
376 *
377 * Add a protocol handler to the networking stack. The passed &packet_type
378 * is linked into kernel lists and may not be freed until it has been
379 * removed from the kernel lists.
380 *
381 * This call does not sleep therefore it can not
382 * guarantee all CPU's that are in middle of receiving packets
383 * will see the new packet type (until the next received packet).
384 */
385
386void dev_add_pack(struct packet_type *pt)
387{
388 struct list_head *head = ptype_head(pt);
389
390 spin_lock(&ptype_lock);
391 list_add_rcu(&pt->list, head);
392 spin_unlock(&ptype_lock);
393}
394EXPORT_SYMBOL(dev_add_pack);
395
396/**
397 * __dev_remove_pack - remove packet handler
398 * @pt: packet type declaration
399 *
400 * Remove a protocol handler that was previously added to the kernel
401 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
402 * from the kernel lists and can be freed or reused once this function
403 * returns.
404 *
405 * The packet type might still be in use by receivers
406 * and must not be freed until after all the CPU's have gone
407 * through a quiescent state.
408 */
409void __dev_remove_pack(struct packet_type *pt)
410{
411 struct list_head *head = ptype_head(pt);
412 struct packet_type *pt1;
413
414 spin_lock(&ptype_lock);
415
416 list_for_each_entry(pt1, head, list) {
417 if (pt == pt1) {
418 list_del_rcu(&pt->list);
419 goto out;
420 }
421 }
422
423 pr_warn("dev_remove_pack: %p not found\n", pt);
424out:
425 spin_unlock(&ptype_lock);
426}
427EXPORT_SYMBOL(__dev_remove_pack);
428
/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously added with
 *	dev_add_pack(), via __dev_remove_pack(), and then wait in
 *	synchronize_net().
 *
 *	This call sleeps to guarantee that no CPU is looking at the
 *	packet type after return, so @pt can be freed or reused once
 *	this function returns.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	/* wait until no receiver can still be using @pt */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
448
449
450/**
451 * dev_add_offload - register offload handlers
452 * @po: protocol offload declaration
453 *
454 * Add protocol offload handlers to the networking stack. The passed
455 * &proto_offload is linked into kernel lists and may not be freed until
456 * it has been removed from the kernel lists.
457 *
458 * This call does not sleep therefore it can not
459 * guarantee all CPU's that are in middle of receiving packets
460 * will see the new offload handlers (until the next received packet).
461 */
462void dev_add_offload(struct packet_offload *po)
463{
464 struct list_head *head = &offload_base;
465
466 spin_lock(&offload_lock);
467 list_add_rcu(&po->list, head);
468 spin_unlock(&offload_lock);
469}
470EXPORT_SYMBOL(dev_add_offload);
471
472/**
473 * __dev_remove_offload - remove offload handler
474 * @po: packet offload declaration
475 *
476 * Remove a protocol offload handler that was previously added to the
477 * kernel offload handlers by dev_add_offload(). The passed &offload_type
478 * is removed from the kernel lists and can be freed or reused once this
479 * function returns.
480 *
481 * The packet type might still be in use by receivers
482 * and must not be freed until after all the CPU's have gone
483 * through a quiescent state.
484 */
485static void __dev_remove_offload(struct packet_offload *po)
486{
487 struct list_head *head = &offload_base;
488 struct packet_offload *po1;
489
490 spin_lock(&offload_lock);
491
492 list_for_each_entry(po1, head, list) {
493 if (po == po1) {
494 list_del_rcu(&po->list);
495 goto out;
496 }
497 }
498
499 pr_warn("dev_remove_offload: %p not found\n", po);
500out:
501 spin_unlock(&offload_lock);
502}
503
/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Unlink a packet offload handler that was previously added with
 *	dev_add_offload(), via __dev_remove_offload(), and then wait in
 *	synchronize_net().
 *
 *	This call sleeps to guarantee that no CPU is looking at the
 *	offload handler after return, so @po can be freed or reused once
 *	this function returns.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	/* wait until no receiver can still be using @po */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
523
524/******************************************************************************
525
526 Device Boot-time Settings Routines
527
528*******************************************************************************/
529
530/* Boot time configuration table */
531static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
532
533/**
534 * netdev_boot_setup_add - add new setup entry
535 * @name: name of the device
536 * @map: configured settings for the device
537 *
538 * Adds new setup entry to the dev_boot_setup list. The function
539 * returns 0 on error and 1 on success. This is a generic routine to
540 * all netdevices.
541 */
542static int netdev_boot_setup_add(char *name, struct ifmap *map)
543{
544 struct netdev_boot_setup *s;
545 int i;
546
547 s = dev_boot_setup;
548 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
549 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
550 memset(s[i].name, 0, sizeof(s[i].name));
551 strlcpy(s[i].name, name, IFNAMSIZ);
552 memcpy(&s[i].map, map, sizeof(s[i].map));
553 break;
554 }
555 }
556
557 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
558}
559
560/**
561 * netdev_boot_setup_check - check boot time settings
562 * @dev: the netdevice
563 *
564 * Check boot time settings for the device.
565 * The found settings are set for the device to be used
566 * later in the device probing.
567 * Returns 0 if no settings found, 1 if they are.
568 */
569int netdev_boot_setup_check(struct net_device *dev)
570{
571 struct netdev_boot_setup *s = dev_boot_setup;
572 int i;
573
574 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
575 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
576 !strcmp(dev->name, s[i].name)) {
577 dev->irq = s[i].map.irq;
578 dev->base_addr = s[i].map.base_addr;
579 dev->mem_start = s[i].map.mem_start;
580 dev->mem_end = s[i].map.mem_end;
581 return 1;
582 }
583 }
584 return 0;
585}
586EXPORT_SYMBOL(netdev_boot_setup_check);
587
588
589/**
590 * netdev_boot_base - get address from boot time settings
591 * @prefix: prefix for network device
592 * @unit: id for network device
593 *
594 * Check boot time settings for the base address of device.
595 * The found settings are set for the device to be used
596 * later in the device probing.
597 * Returns 0 if no settings found.
598 */
599unsigned long netdev_boot_base(const char *prefix, int unit)
600{
601 const struct netdev_boot_setup *s = dev_boot_setup;
602 char name[IFNAMSIZ];
603 int i;
604
605 sprintf(name, "%s%d", prefix, unit);
606
607 /*
608 * If device already registered then return base of 1
609 * to indicate not to probe for this interface
610 */
611 if (__dev_get_by_name(&init_net, name))
612 return 1;
613
614 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
615 if (!strcmp(name, s[i].name))
616 return s[i].map.base_addr;
617 return 0;
618}
619
620/*
621 * Saves at boot time configured settings for any netdevice.
622 */
623int __init netdev_boot_setup(char *str)
624{
625 int ints[5];
626 struct ifmap map;
627
628 str = get_options(str, ARRAY_SIZE(ints), ints);
629 if (!str || !*str)
630 return 0;
631
632 /* Save settings */
633 memset(&map, 0, sizeof(map));
634 if (ints[0] > 0)
635 map.irq = ints[1];
636 if (ints[0] > 1)
637 map.base_addr = ints[2];
638 if (ints[0] > 2)
639 map.mem_start = ints[3];
640 if (ints[0] > 3)
641 map.mem_end = ints[4];
642
643 /* Add new entry to the list */
644 return netdev_boot_setup_add(str, &map);
645}
646
647__setup("netdev=", netdev_boot_setup);
648
649/*******************************************************************************
650
651 Device Interface Subroutines
652
653*******************************************************************************/
654
655/**
656 * __dev_get_by_name - find a device by its name
657 * @net: the applicable net namespace
658 * @name: name to find
659 *
660 * Find an interface by name. Must be called under RTNL semaphore
661 * or @dev_base_lock. If the name is found a pointer to the device
662 * is returned. If the name is not found then %NULL is returned. The
663 * reference counters are not incremented so the caller must be
664 * careful with locks.
665 */
666
667struct net_device *__dev_get_by_name(struct net *net, const char *name)
668{
669 struct net_device *dev;
670 struct hlist_head *head = dev_name_hash(net, name);
671
672 hlist_for_each_entry(dev, head, name_hlist)
673 if (!strncmp(dev->name, name, IFNAMSIZ))
674 return dev;
675
676 return NULL;
677}
678EXPORT_SYMBOL(__dev_get_by_name);
679
680/**
681 * dev_get_by_name_rcu - find a device by its name
682 * @net: the applicable net namespace
683 * @name: name to find
684 *
685 * Find an interface by name.
686 * If the name is found a pointer to the device is returned.
687 * If the name is not found then %NULL is returned.
688 * The reference counters are not incremented so the caller must be
689 * careful with locks. The caller must hold RCU lock.
690 */
691
692struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
693{
694 struct net_device *dev;
695 struct hlist_head *head = dev_name_hash(net, name);
696
697 hlist_for_each_entry_rcu(dev, head, name_hlist)
698 if (!strncmp(dev->name, name, IFNAMSIZ))
699 return dev;
700
701 return NULL;
702}
703EXPORT_SYMBOL(dev_get_by_name_rcu);
704
705/**
706 * dev_get_by_name - find a device by its name
707 * @net: the applicable net namespace
708 * @name: name to find
709 *
710 * Find an interface by name. This can be called from any
711 * context and does its own locking. The returned handle has
712 * the usage count incremented and the caller must use dev_put() to
713 * release it when it is no longer needed. %NULL is returned if no
714 * matching device is found.
715 */
716
717struct net_device *dev_get_by_name(struct net *net, const char *name)
718{
719 struct net_device *dev;
720
721 rcu_read_lock();
722 dev = dev_get_by_name_rcu(net, name);
723 if (dev)
724 dev_hold(dev);
725 rcu_read_unlock();
726 return dev;
727}
728EXPORT_SYMBOL(dev_get_by_name);
729
730/**
731 * __dev_get_by_index - find a device by its ifindex
732 * @net: the applicable net namespace
733 * @ifindex: index of device
734 *
735 * Search for an interface by index. Returns %NULL if the device
736 * is not found or a pointer to the device. The device has not
737 * had its reference counter increased so the caller must be careful
738 * about locking. The caller must hold either the RTNL semaphore
739 * or @dev_base_lock.
740 */
741
742struct net_device *__dev_get_by_index(struct net *net, int ifindex)
743{
744 struct net_device *dev;
745 struct hlist_head *head = dev_index_hash(net, ifindex);
746
747 hlist_for_each_entry(dev, head, index_hlist)
748 if (dev->ifindex == ifindex)
749 return dev;
750
751 return NULL;
752}
753EXPORT_SYMBOL(__dev_get_by_index);
754
755/**
756 * dev_get_by_index_rcu - find a device by its ifindex
757 * @net: the applicable net namespace
758 * @ifindex: index of device
759 *
760 * Search for an interface by index. Returns %NULL if the device
761 * is not found or a pointer to the device. The device has not
762 * had its reference counter increased so the caller must be careful
763 * about locking. The caller must hold RCU lock.
764 */
765
766struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
767{
768 struct net_device *dev;
769 struct hlist_head *head = dev_index_hash(net, ifindex);
770
771 hlist_for_each_entry_rcu(dev, head, index_hlist)
772 if (dev->ifindex == ifindex)
773 return dev;
774
775 return NULL;
776}
777EXPORT_SYMBOL(dev_get_by_index_rcu);
778
779
780/**
781 * dev_get_by_index - find a device by its ifindex
782 * @net: the applicable net namespace
783 * @ifindex: index of device
784 *
785 * Search for an interface by index. Returns NULL if the device
786 * is not found or a pointer to the device. The device returned has
787 * had a reference added and the pointer is safe until the user calls
788 * dev_put to indicate they have finished with it.
789 */
790
791struct net_device *dev_get_by_index(struct net *net, int ifindex)
792{
793 struct net_device *dev;
794
795 rcu_read_lock();
796 dev = dev_get_by_index_rcu(net, ifindex);
797 if (dev)
798 dev_hold(dev);
799 rcu_read_unlock();
800 return dev;
801}
802EXPORT_SYMBOL(dev_get_by_index);
803
804/**
805 * netdev_get_name - get a netdevice name, knowing its ifindex.
806 * @net: network namespace
807 * @name: a pointer to the buffer where the name will be stored.
808 * @ifindex: the ifindex of the interface to get the name from.
809 *
810 * The use of raw_seqcount_begin() and cond_resched() before
811 * retrying is required as we want to give the writers a chance
812 * to complete when CONFIG_PREEMPT is not set.
813 */
814int netdev_get_name(struct net *net, char *name, int ifindex)
815{
816 struct net_device *dev;
817 unsigned int seq;
818
819retry:
820 seq = raw_seqcount_begin(&devnet_rename_seq);
821 rcu_read_lock();
822 dev = dev_get_by_index_rcu(net, ifindex);
823 if (!dev) {
824 rcu_read_unlock();
825 return -ENODEV;
826 }
827
828 strcpy(name, dev->name);
829 rcu_read_unlock();
830 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
831 cond_resched();
832 goto retry;
833 }
834
835 return 0;
836}
837
838/**
839 * dev_getbyhwaddr_rcu - find a device by its hardware address
840 * @net: the applicable net namespace
841 * @type: media type of device
842 * @ha: hardware address
843 *
844 * Search for an interface by MAC address. Returns NULL if the device
845 * is not found or a pointer to the device.
846 * The caller must hold RCU or RTNL.
847 * The returned device has not had its ref count increased
848 * and the caller must therefore be careful about locking
849 *
850 */
851
852struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
853 const char *ha)
854{
855 struct net_device *dev;
856
857 for_each_netdev_rcu(net, dev)
858 if (dev->type == type &&
859 !memcmp(dev->dev_addr, ha, dev->addr_len))
860 return dev;
861
862 return NULL;
863}
864EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
865
866struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
867{
868 struct net_device *dev;
869
870 ASSERT_RTNL();
871 for_each_netdev(net, dev)
872 if (dev->type == type)
873 return dev;
874
875 return NULL;
876}
877EXPORT_SYMBOL(__dev_getfirstbyhwtype);
878
879struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
880{
881 struct net_device *dev, *ret = NULL;
882
883 rcu_read_lock();
884 for_each_netdev_rcu(net, dev)
885 if (dev->type == type) {
886 dev_hold(dev);
887 ret = dev;
888 break;
889 }
890 rcu_read_unlock();
891 return ret;
892}
893EXPORT_SYMBOL(dev_getfirstbyhwtype);
894
895/**
896 * dev_get_by_flags_rcu - find any device with given flags
897 * @net: the applicable net namespace
898 * @if_flags: IFF_* values
899 * @mask: bitmask of bits in if_flags to check
900 *
901 * Search for any interface with the given flags. Returns NULL if a device
902 * is not found or a pointer to the device. Must be called inside
903 * rcu_read_lock(), and result refcount is unchanged.
904 */
905
906struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
907 unsigned short mask)
908{
909 struct net_device *dev, *ret;
910
911 ret = NULL;
912 for_each_netdev_rcu(net, dev) {
913 if (((dev->flags ^ if_flags) & mask) == 0) {
914 ret = dev;
915 break;
916 }
917 }
918 return ret;
919}
920EXPORT_SYMBOL(dev_get_by_flags_rcu);
921
922/**
923 * dev_valid_name - check if name is okay for network device
924 * @name: name string
925 *
926 * Network device names need to be valid file names to
927 * to allow sysfs to work. We also disallow any kind of
928 * whitespace.
929 */
930bool dev_valid_name(const char *name)
931{
932 if (*name == '\0')
933 return false;
934 if (strlen(name) >= IFNAMSIZ)
935 return false;
936 if (!strcmp(name, ".") || !strcmp(name, ".."))
937 return false;
938
939 while (*name) {
940 if (*name == '/' || isspace(*name))
941 return false;
942 name++;
943 }
944 return true;
945}
946EXPORT_SYMBOL(dev_valid_name);
947
948/**
949 * __dev_alloc_name - allocate a name for a device
950 * @net: network namespace to allocate the device name in
951 * @name: name format string
952 * @buf: scratch buffer and result name string
953 *
954 * Passed a format string - eg "lt%d" it will try and find a suitable
955 * id. It scans list of devices to build up a free map, then chooses
956 * the first empty slot. The caller must hold the dev_base or rtnl lock
957 * while allocating the name and adding the device in order to avoid
958 * duplicates.
959 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
960 * Returns the number of the unit assigned or a negative errno code.
961 */
962
963static int __dev_alloc_name(struct net *net, const char *name, char *buf)
964{
965 int i = 0;
966 const char *p;
967 const int max_netdevices = 8*PAGE_SIZE;
968 unsigned long *inuse;
969 struct net_device *d;
970
971 p = strnchr(name, IFNAMSIZ-1, '%');
972 if (p) {
973 /*
974 * Verify the string as this thing may have come from
975 * the user. There must be either one "%d" and no other "%"
976 * characters.
977 */
978 if (p[1] != 'd' || strchr(p + 2, '%'))
979 return -EINVAL;
980
981 /* Use one page as a bit array of possible slots */
982 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
983 if (!inuse)
984 return -ENOMEM;
985
986 for_each_netdev(net, d) {
987 if (!sscanf(d->name, name, &i))
988 continue;
989 if (i < 0 || i >= max_netdevices)
990 continue;
991
992 /* avoid cases where sscanf is not exact inverse of printf */
993 snprintf(buf, IFNAMSIZ, name, i);
994 if (!strncmp(buf, d->name, IFNAMSIZ))
995 set_bit(i, inuse);
996 }
997
998 i = find_first_zero_bit(inuse, max_netdevices);
999 free_page((unsigned long) inuse);
1000 }
1001
1002 if (buf != name)
1003 snprintf(buf, IFNAMSIZ, name, i);
1004 if (!__dev_get_by_name(net, buf))
1005 return i;
1006
1007 /* It is possible to run out of possible slots
1008 * when the name is long and there isn't enough space left
1009 * for the digits, or if all bits are used.
1010 */
1011 return -ENFILE;
1012}
1013
1014/**
1015 * dev_alloc_name - allocate a name for a device
1016 * @dev: device
1017 * @name: name format string
1018 *
1019 * Passed a format string - eg "lt%d" it will try and find a suitable
1020 * id. It scans list of devices to build up a free map, then chooses
1021 * the first empty slot. The caller must hold the dev_base or rtnl lock
1022 * while allocating the name and adding the device in order to avoid
1023 * duplicates.
1024 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1025 * Returns the number of the unit assigned or a negative errno code.
1026 */
1027
1028int dev_alloc_name(struct net_device *dev, const char *name)
1029{
1030 char buf[IFNAMSIZ];
1031 struct net *net;
1032 int ret;
1033
1034 BUG_ON(!dev_net(dev));
1035 net = dev_net(dev);
1036 ret = __dev_alloc_name(net, name, buf);
1037 if (ret >= 0)
1038 strlcpy(dev->name, buf, IFNAMSIZ);
1039 return ret;
1040}
1041EXPORT_SYMBOL(dev_alloc_name);
1042
1043static int dev_alloc_name_ns(struct net *net,
1044 struct net_device *dev,
1045 const char *name)
1046{
1047 char buf[IFNAMSIZ];
1048 int ret;
1049
1050 ret = __dev_alloc_name(net, name, buf);
1051 if (ret >= 0)
1052 strlcpy(dev->name, buf, IFNAMSIZ);
1053 return ret;
1054}
1055
1056static int dev_get_valid_name(struct net *net,
1057 struct net_device *dev,
1058 const char *name)
1059{
1060 BUG_ON(!net);
1061
1062 if (!dev_valid_name(name))
1063 return -EINVAL;
1064
1065 if (strchr(name, '%'))
1066 return dev_alloc_name_ns(net, dev, name);
1067 else if (__dev_get_by_name(net, name))
1068 return -EEXIST;
1069 else if (dev->name != name)
1070 strlcpy(dev->name, name, IFNAMSIZ);
1071
1072 return 0;
1073}
1074
1075/**
1076 * dev_change_name - change name of a device
1077 * @dev: device
 * @newname: new name (or format string); the buffer must be at least
 *	IFNAMSIZ bytes long
 *
 * Change the name of a device. A format string such as "eth%d" may be
 * passed for wildcarding.
1082 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	/* Renaming is refused while the interface is up. */
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/*
	 * Open the devnet_rename_seq write section; every exit path up to
	 * the successful device_rename() below closes it again.
	 */
	write_seqcount_begin(&devnet_rename_seq);

	/* Same name as before: nothing to do. */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	/* Remember the current name so it can be restored on failure. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* Validates newname and, on success, stores the result in dev->name. */
	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	/*
	 * Entered once for the forward rename and a second time (via goto)
	 * to undo it when a NETDEV_CHANGENAME notifier rejects the change.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	/*
	 * Rehash the device under its new name: unhash, wait for RCU
	 * readers, then insert into the bucket for dev->name.
	 */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/*
			 * First veto: record the error, swap the names and
			 * run the rename sequence again in reverse.
			 * NOTE(review): oldname is refilled from newname
			 * verbatim, so for a format string it holds the
			 * pattern rather than the expanded name - confirm
			 * this is acceptable for the rollback pass.
			 */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			goto rollback;
		} else {
			/* The rollback itself was vetoed: only report it. */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
1153
1154/**
1155 * dev_set_alias - change ifalias of a device
1156 * @dev: device
1157 * @alias: name up to IFALIASZ
1158 * @len: limit of bytes to copy from info
1159 *
 * Set ifalias for a device.
1161 */
1162int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1163{
1164 char *new_ifalias;
1165
1166 ASSERT_RTNL();
1167
1168 if (len >= IFALIASZ)
1169 return -EINVAL;
1170
1171 if (!len) {
1172 kfree(dev->ifalias);
1173 dev->ifalias = NULL;
1174 return 0;
1175 }
1176
1177 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1178 if (!new_ifalias)
1179 return -ENOMEM;
1180 dev->ifalias = new_ifalias;
1181
1182 strlcpy(dev->ifalias, alias, len+1);
1183 return len;
1184}
1185
1186
1187/**
1188 * netdev_features_change - device changes features
1189 * @dev: device to cause notification
1190 *
1191 * Called to indicate a device has changed features.
1192 */
void netdev_features_change(struct net_device *dev)
{
	/* Broadcast NETDEV_FEAT_CHANGE to the netdev notifier chain. */
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1198
1199/**
1200 * netdev_state_change - device changes state
1201 * @dev: device to cause notification
1202 *
1203 * Called to indicate a device has changed state. This function calls
1204 * the notifier chains for netdev_chain and sends a NEWLINK message
1205 * to the routing socket.
1206 */
1207void netdev_state_change(struct net_device *dev)
1208{
1209 if (dev->flags & IFF_UP) {
1210 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1211 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1212 }
1213}
1214EXPORT_SYMBOL(netdev_state_change);
1215
1216/**
1217 * netdev_notify_peers - notify network peers about existence of @dev
1218 * @dev: network device
1219 *
1220 * Generate traffic such that interested network peers are aware of
1221 * @dev, such as by generating a gratuitous ARP. This may be used when
1222 * a device wants to inform the rest of the network about some sort of
1223 * reconfiguration such as a failover event or virtual machine
1224 * migration.
1225 */
void netdev_notify_peers(struct net_device *dev)
{
	/* This function takes RTNL itself around the notifier call. */
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
1233
1234static int __dev_open(struct net_device *dev)
1235{
1236 const struct net_device_ops *ops = dev->netdev_ops;
1237 int ret;
1238
1239 ASSERT_RTNL();
1240
1241 if (!netif_device_present(dev))
1242 return -ENODEV;
1243
1244 /* Block netpoll from trying to do any rx path servicing.
1245 * If we don't do this there is a chance ndo_poll_controller
1246 * or ndo_poll may be running while we open the device
1247 */
1248 netpoll_poll_disable(dev);
1249
1250 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1251 ret = notifier_to_errno(ret);
1252 if (ret)
1253 return ret;
1254
1255 set_bit(__LINK_STATE_START, &dev->state);
1256
1257 if (ops->ndo_validate_addr)
1258 ret = ops->ndo_validate_addr(dev);
1259
1260 if (!ret && ops->ndo_open)
1261 ret = ops->ndo_open(dev);
1262
1263 netpoll_poll_enable(dev);
1264
1265 if (ret)
1266 clear_bit(__LINK_STATE_START, &dev->state);
1267 else {
1268 dev->flags |= IFF_UP;
1269 net_dmaengine_get();
1270 dev_set_rx_mode(dev);
1271 dev_activate(dev);
1272 add_device_randomness(dev->dev_addr, dev->addr_len);
1273 }
1274
1275 return ret;
1276}
1277
1278/**
1279 * dev_open - prepare an interface for use.
1280 * @dev: device to open
1281 *
1282 * Takes a device from down to up state. The device's private open
1283 * function is invoked and then the multicast lists are loaded. Finally
1284 * the device is moved into the up state and a %NETDEV_UP message is
1285 * sent to the netdev notifier chain.
1286 *
1287 * Calling this function on an active interface is a nop. On a failure
1288 * a negative errno code is returned.
1289 */
1290int dev_open(struct net_device *dev)
1291{
1292 int ret;
1293
1294 if (dev->flags & IFF_UP)
1295 return 0;
1296
1297 ret = __dev_open(dev);
1298 if (ret < 0)
1299 return ret;
1300
1301 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1302 call_netdevice_notifiers(NETDEV_UP, dev);
1303
1304 return ret;
1305}
1306EXPORT_SYMBOL(dev_open);
1307
/*
 * Take every device on @head (linked through close_list) down: send
 * NETDEV_GOING_DOWN and clear __LINK_STATE_START for each one, deactivate
 * them as a batch, then call each driver's ndo_stop and clear IFF_UP.
 * NETDEV_DOWN is not sent from here. Must be called under RTNL; always
 * returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_clear_bit(); /* Commit netif_running(). */
	}

	/* Deactivate all listed devices in a single batch. */
	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		net_dmaengine_put();
		/* Pairs with the netpoll_poll_disable() in the first loop. */
		netpoll_poll_enable(dev);
	}

	return 0;
}
1354
1355static int __dev_close(struct net_device *dev)
1356{
1357 int retval;
1358 LIST_HEAD(single);
1359
1360 list_add(&dev->close_list, &single);
1361 retval = __dev_close_many(&single);
1362 list_del(&single);
1363
1364 return retval;
1365}
1366
1367static int dev_close_many(struct list_head *head)
1368{
1369 struct net_device *dev, *tmp;
1370
1371 /* Remove the devices that don't need to be closed */
1372 list_for_each_entry_safe(dev, tmp, head, close_list)
1373 if (!(dev->flags & IFF_UP))
1374 list_del_init(&dev->close_list);
1375
1376 __dev_close_many(head);
1377
1378 list_for_each_entry_safe(dev, tmp, head, close_list) {
1379 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1380 call_netdevice_notifiers(NETDEV_DOWN, dev);
1381 list_del_init(&dev->close_list);
1382 }
1383
1384 return 0;
1385}
1386
1387/**
1388 * dev_close - shutdown an interface.
1389 * @dev: device to shutdown
1390 *
1391 * This function moves an active device into down state. A
1392 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1393 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1394 * chain.
1395 */
1396int dev_close(struct net_device *dev)
1397{
1398 if (dev->flags & IFF_UP) {
1399 LIST_HEAD(single);
1400
1401 list_add(&dev->close_list, &single);
1402 dev_close_many(&single);
1403 list_del(&single);
1404 }
1405 return 0;
1406}
1407EXPORT_SYMBOL(dev_close);
1408
1409
1410/**
1411 * dev_disable_lro - disable Large Receive Offload on a device
1412 * @dev: device
1413 *
1414 * Disable Large Receive Offload (LRO) on a net device. Must be
1415 * called under RTNL. This is needed if received packets may be
1416 * forwarded to another interface.
1417 */
1418void dev_disable_lro(struct net_device *dev)
1419{
1420 /*
1421 * If we're trying to disable lro on a vlan device
1422 * use the underlying physical device instead
1423 */
1424 if (is_vlan_dev(dev))
1425 dev = vlan_dev_real_dev(dev);
1426
1427 /* the same for macvlan devices */
1428 if (netif_is_macvlan(dev))
1429 dev = macvlan_dev_real_dev(dev);
1430
1431 dev->wanted_features &= ~NETIF_F_LRO;
1432 netdev_update_features(dev);
1433
1434 if (unlikely(dev->features & NETIF_F_LRO))
1435 netdev_WARN(dev, "failed to disable LRO!\n");
1436}
1437EXPORT_SYMBOL(dev_disable_lro);
1438
/*
 * Deliver event @val about @dev to the single notifier block @nb,
 * bypassing the rest of the chain. Returns the callback's return value.
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	/* Build the on-stack info structure handed to the callback. */
	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}
1447
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * does not replay REGISTER/UP events to a newly registered notifier.
 */
static int dev_boot_phase = 1;
1449
1450/**
1451 * register_netdevice_notifier - register a network notifier block
1452 * @nb: notifier
1453 *
1454 * Register a notifier to be called when network device events occur.
1455 * The notifier passed is linked into the kernel structures and must
1456 * not be reused until it has been unregistered. A negative errno code
1457 * is returned on a failure.
1458 *
1459 * When registered all registration and up events are replayed
1460 * to the new notifier to allow device to have a race free
1461 * view of the network device list.
1462 */
1463
int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* No replay while dev_boot_phase is still set. */
	if (dev_boot_phase)
		goto unlock;
	/* Replay REGISTER (and UP for running devices) for every existing device. */
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/*
	 * dev still points at the device whose REGISTER callback failed.
	 * Walk the lists again in the same order and send synthetic
	 * teardown events to every device visited before it; the failing
	 * device itself receives none.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	/* Unlink the notifier again; err still holds the replay failure. */
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1516
1517/**
1518 * unregister_netdevice_notifier - unregister a network notifier block
1519 * @nb: notifier
1520 *
1521 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
1523 * kernel structures and may then be reused. A negative errno code
1524 * is returned on a failure.
1525 *
1526 * After unregistering unregister and down device events are synthesized
1527 * for all devices on the device list to the removed notifier to remove
1528 * the need for special case cleanup code.
1529 */
1530
1531int unregister_netdevice_notifier(struct notifier_block *nb)
1532{
1533 struct net_device *dev;
1534 struct net *net;
1535 int err;
1536
1537 rtnl_lock();
1538 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1539 if (err)
1540 goto unlock;
1541
1542 for_each_net(net) {
1543 for_each_netdev(net, dev) {
1544 if (dev->flags & IFF_UP) {
1545 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1546 dev);
1547 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1548 }
1549 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1550 }
1551 }
1552unlock:
1553 rtnl_unlock();
1554 return err;
1555}
1556EXPORT_SYMBOL(unregister_netdevice_notifier);
1557
1558/**
1559 * call_netdevice_notifiers_info - call all network notifier blocks
1560 * @val: value passed unmodified to notifier function
1561 * @dev: net_device pointer passed unmodified to notifier function
1562 * @info: notifier information data
1563 *
1564 * Call all network notifier blocks. Parameters and return value
1565 * are as for raw_notifier_call_chain().
1566 */
1567
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	/* @info is caller-provided storage; it is (re)initialised for @dev here. */
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}
1576
1577/**
1578 * call_netdevice_notifiers - call all network notifier blocks
1579 * @val: value passed unmodified to notifier function
1580 * @dev: net_device pointer passed unmodified to notifier function
1581 *
1582 * Call all network notifier blocks. Parameters and return value
1583 * are as for raw_notifier_call_chain().
1584 */
1585
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	/* Plain on-stack info block for events that carry no extra data. */
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
1593
/*
 * Static key raised by net_enable_timestamp() and lowered by
 * net_disable_timestamp(); tested before packets are timestamped.
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
/* Number of deferred decrements; drained by net_enable_timestamp(). */
static atomic_t netstamp_needed_deferred;
#endif
1602
1603void net_enable_timestamp(void)
1604{
1605#ifdef HAVE_JUMP_LABEL
1606 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1607
1608 if (deferred) {
1609 while (--deferred)
1610 static_key_slow_dec(&netstamp_needed);
1611 return;
1612 }
1613#endif
1614 static_key_slow_inc(&netstamp_needed);
1615}
1616EXPORT_SYMBOL(net_enable_timestamp);
1617
/*
 * Drop one timestamping request. With jump labels the decrement is only
 * counted when called in interrupt context and applied later by
 * net_enable_timestamp().
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		/* Defer: record the decrement instead of performing it here. */
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1629
1630static inline void net_timestamp_set(struct sk_buff *skb)
1631{
1632 skb->tstamp.tv64 = 0;
1633 if (static_key_false(&netstamp_needed))
1634 __net_timestamp(skb);
1635}
1636
/*
 * Stamp SKB via __net_timestamp() when timestamping is requested, COND
 * holds and SKB carries no timestamp yet (tstamp.tv64 == 0).
 * NOTE(review): expands to a bare if statement rather than a
 * do { } while (0) block, so avoid using it directly before an else.
 */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\
1642
1643bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1644{
1645 unsigned int len;
1646
1647 if (!(dev->flags & IFF_UP))
1648 return false;
1649
1650 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1651 if (skb->len <= len)
1652 return true;
1653
1654 /* if TSO is enabled, we don't care about the length as the packet
1655 * could be forwarded without being segmented before
1656 */
1657 if (skb_is_gso(skb))
1658 return true;
1659
1660 return false;
1661}
1662EXPORT_SYMBOL_GPL(is_skb_forwardable);
1663
1664/**
1665 * dev_forward_skb - loopback an skb to another netif
1666 *
1667 * @dev: destination network device
1668 * @skb: buffer to forward
1669 *
1670 * return values:
1671 * NET_RX_SUCCESS (no congestion)
1672 * NET_RX_DROP (packet was dropped, but freed)
1673 *
1674 * dev_forward_skb can be used for injecting an skb from the
1675 * start_xmit function of one device into the receive queue
1676 * of another device.
1677 *
1678 * The receiving device may be in another namespace, so
1679 * we have to clear all information in the skb that could
1680 * impact namespace isolation.
1681 */
1682int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1683{
1684 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1685 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1686 atomic_long_inc(&dev->rx_dropped);
1687 kfree_skb(skb);
1688 return NET_RX_DROP;
1689 }
1690 }
1691
1692 if (unlikely(!is_skb_forwardable(dev, skb))) {
1693 atomic_long_inc(&dev->rx_dropped);
1694 kfree_skb(skb);
1695 return NET_RX_DROP;
1696 }
1697
1698 skb_scrub_packet(skb, true);
1699 skb->protocol = eth_type_trans(skb, dev);
1700
1701 return netif_rx_internal(skb);
1702}
1703EXPORT_SYMBOL_GPL(dev_forward_skb);
1704
/*
 * Hand @skb to the packet handler @pt_prev while the caller keeps its own
 * reference: the user count is bumped before the handler runs.
 * Returns -ENOMEM if skb_orphan_frags() fails, otherwise the handler's
 * return value.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	/* Extra reference for the handler. */
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1714
1715static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1716{
1717 if (!ptype->af_packet_priv || !skb->sk)
1718 return false;
1719
1720 if (ptype->id_match)
1721 return ptype->id_match(ptype, skb->sk);
1722 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1723 return true;
1724
1725 return false;
1726}
1727
1728/*
1729 * Support routine. Sends outgoing frames to any network
1730 * taps currently in use.
1731 */
1732
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	/* Single clone of @skb, shared by all matching taps. */
	struct sk_buff *skb2 = NULL;
	/* Previous matching tap; delivery lags one step behind the walk. */
	struct packet_type *pt_prev = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (!skb_loop_sk(ptype, skb))) {
			if (pt_prev) {
				/*
				 * Not the first match: deliver the clone to
				 * the previous tap (deliver_skb() takes an
				 * extra reference) and remember this one.
				 */
				deliver_skb(skb2, pt_prev, skb->dev);
				pt_prev = ptype;
				continue;
			}

			/* First match: create the clone lazily. */
			skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			net_timestamp_set(skb2);

			/* The network header should have been set correctly
			 * by the sender, so the range check below is only
			 * protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
						     ntohs(skb2->protocol),
						     dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			pt_prev = ptype;
		}
	}
	/*
	 * The last matching tap is called directly, without an extra
	 * reference, so it receives the clone's own reference.
	 */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
1781
1782/**
1783 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1784 * @dev: Network device
1785 * @txq: number of queues available
1786 *
 * If real_num_tx_queues is changed, the tc mappings may no longer be
 * valid. To resolve this, verify that each tc mapping remains valid and,
 * if it does not, reset the priority that uses it to TC0. With no
 * priorities mapping to that offset/count pair it will no longer be used.
 * In the worst case TC0 itself is invalid; nothing can be done then, so
 * priority mappings are disabled. It is expected that drivers will fix
 * this mapping, if they can, before calling netif_set_real_num_tx_queues.
1794 */
1795static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1796{
1797 int i;
1798 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1799
1800 /* If TC0 is invalidated disable TC mapping */
1801 if (tc->offset + tc->count > txq) {
1802 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1803 dev->num_tc = 0;
1804 return;
1805 }
1806
1807 /* Invalidated prio to tc mappings set to TC0 */
1808 for (i = 1; i < TC_BITMASK + 1; i++) {
1809 int q = netdev_get_prio_tc_map(dev, i);
1810
1811 tc = &dev->tc_to_txq[q];
1812 if (tc->offset + tc->count > txq) {
1813 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1814 i, q);
1815 netdev_set_prio_tc_map(dev, i, 0);
1816 }
1817 }
1818}
1819
1820#ifdef CONFIG_XPS
/* Mutex held by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an RCU-protected XPS pointer while holding xps_map_mutex. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1824
1825static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1826 int cpu, u16 index)
1827{
1828 struct xps_map *map = NULL;
1829 int pos;
1830
1831 if (dev_maps)
1832 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1833
1834 for (pos = 0; map && pos < map->len; pos++) {
1835 if (map->queues[pos] == index) {
1836 if (map->len > 1) {
1837 map->queues[pos] = map->queues[--map->len];
1838 } else {
1839 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1840 kfree_rcu(map, rcu);
1841 map = NULL;
1842 }
1843 break;
1844 }
1845 }
1846
1847 return map;
1848}
1849
1850static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1851{
1852 struct xps_dev_maps *dev_maps;
1853 int cpu, i;
1854 bool active = false;
1855
1856 mutex_lock(&xps_map_mutex);
1857 dev_maps = xmap_dereference(dev->xps_maps);
1858
1859 if (!dev_maps)
1860 goto out_no_maps;
1861
1862 for_each_possible_cpu(cpu) {
1863 for (i = index; i < dev->num_tx_queues; i++) {
1864 if (!remove_xps_queue(dev_maps, cpu, i))
1865 break;
1866 }
1867 if (i == dev->num_tx_queues)
1868 active = true;
1869 }
1870
1871 if (!active) {
1872 RCU_INIT_POINTER(dev->xps_maps, NULL);
1873 kfree_rcu(dev_maps, rcu);
1874 }
1875
1876 for (i = index; i < dev->num_tx_queues; i++)
1877 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1878 NUMA_NO_NODE);
1879
1880out_no_maps:
1881 mutex_unlock(&xps_map_mutex);
1882}
1883
1884static struct xps_map *expand_xps_map(struct xps_map *map,
1885 int cpu, u16 index)
1886{
1887 struct xps_map *new_map;
1888 int alloc_len = XPS_MIN_MAP_ALLOC;
1889 int i, pos;
1890
1891 for (pos = 0; map && pos < map->len; pos++) {
1892 if (map->queues[pos] != index)
1893 continue;
1894 return map;
1895 }
1896
1897 /* Need to add queue to this CPU's existing map */
1898 if (map) {
1899 if (pos < map->alloc_len)
1900 return map;
1901
1902 alloc_len = map->alloc_len * 2;
1903 }
1904
1905 /* Need to allocate new map to store queue on this CPU's map */
1906 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1907 cpu_to_node(cpu));
1908 if (!new_map)
1909 return NULL;
1910
1911 for (i = 0; i < pos; i++)
1912 new_map->queues[i] = map->queues[i];
1913 new_map->alloc_len = alloc_len;
1914 new_map->len = pos;
1915
1916 return new_map;
1917}
1918
/*
 * Assign tx queue @index to the online CPUs in @mask and remove it from
 * all other CPUs. A new per-device map table is built, published with
 * rcu_assign_pointer(), and the maps it replaced are freed via RCU.
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	/* -2: no CPU seen yet, -1: CPUs on different nodes, >= 0: common node */
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		/* The new table is allocated lazily, on the first CPU in the mask. */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		/* May return the old map unchanged or a larger copy of it. */
		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	/* No online CPU in the mask: only removals remain to be done. */
	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}

	}

	/* Publish the new table. */
	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			/* Free only the maps that were replaced, not the ones carried over. */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		/* Maps shared with the old table must survive. */
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2043
2044#endif
2045/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2048 */
2049int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2050{
2051 int rc;
2052
2053 if (txq < 1 || txq > dev->num_tx_queues)
2054 return -EINVAL;
2055
2056 if (dev->reg_state == NETREG_REGISTERED ||
2057 dev->reg_state == NETREG_UNREGISTERING) {
2058 ASSERT_RTNL();
2059
2060 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2061 txq);
2062 if (rc)
2063 return rc;
2064
2065 if (dev->num_tc)
2066 netif_setup_tc(dev, txq);
2067
2068 if (txq < dev->real_num_tx_queues) {
2069 qdisc_reset_all_tx_gt(dev, txq);
2070#ifdef CONFIG_XPS
2071 netif_reset_xps_queues_gt(dev, txq);
2072#endif
2073 }
2074 }
2075
2076 dev->real_num_tx_queues = txq;
2077 return 0;
2078}
2079EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2080
2081#ifdef CONFIG_SYSFS
2082/**
2083 * netif_set_real_num_rx_queues - set actual number of RX queues used
2084 * @dev: Network device
2085 * @rxq: Actual number of RX queues
2086 *
2087 * This must be called either with the rtnl_lock held or before
2088 * registration of the net device. Returns 0 on success, or a
2089 * negative error code. If called before registration, it always
2090 * succeeds.
2091 */
2092int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2093{
2094 int rc;
2095
2096 if (rxq < 1 || rxq > dev->num_rx_queues)
2097 return -EINVAL;
2098
2099 if (dev->reg_state == NETREG_REGISTERED) {
2100 ASSERT_RTNL();
2101
2102 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2103 rxq);
2104 if (rc)
2105 return rc;
2106 }
2107
2108 dev->real_num_rx_queues = rxq;
2109 return 0;
2110}
2111EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2112#endif
2113
2114/**
2115 * netif_get_num_default_rss_queues - default number of RSS queues
2116 *
2117 * This routine should set an upper limit on the number of RSS queues
2118 * used by default by multiqueue devices.
2119 */
int netif_get_num_default_rss_queues(void)
{
	/* Never more than DEFAULT_MAX_NUM_RSS_QUEUES, never more than online CPUs. */
	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2125
/*
 * Append @q to the tail of this CPU's softnet output queue and raise
 * NET_TX_SOFTIRQ. Local interrupts are disabled around the list update.
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	/* Link q after the current tail, then move the tail pointer to q. */
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
2139
2140void __netif_schedule(struct Qdisc *q)
2141{
2142 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2143 __netif_reschedule(q);
2144}
2145EXPORT_SYMBOL(__netif_schedule);
2146
/* Overlay for skb->cb used to carry the free reason of a queued skb. */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};
2150
/* View the control buffer of @skb as a struct dev_kfree_skb_cb. */
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}
2155
/*
 * Drop one reference to @skb. When it was the last one, record @reason
 * in the skb's control buffer, push the skb onto this CPU's completion
 * queue and raise NET_TX_SOFTIRQ instead of freeing it here.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		/* Sole owner: the count is set to zero without an atomic decrement. */
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		/* Other references remain; nothing to free yet. */
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	/* Push onto the head of the per-CPU completion queue. */
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
2174
2175void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2176{
2177 if (in_irq() || irqs_disabled())
2178 __dev_kfree_skb_irq(skb, reason);
2179 else
2180 dev_kfree_skb(skb);
2181}
2182EXPORT_SYMBOL(__dev_kfree_skb_any);
2183
2184
2185/**
2186 * netif_device_detach - mark device as removed
2187 * @dev: network device
2188 *
2189 * Mark device as removed from system and therefore no longer available.
2190 */
2191void netif_device_detach(struct net_device *dev)
2192{
2193 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2194 netif_running(dev)) {
2195 netif_tx_stop_all_queues(dev);
2196 }
2197}
2198EXPORT_SYMBOL(netif_device_detach);
2199
2200/**
2201 * netif_device_attach - mark device as attached
2202 * @dev: network device
2203 *
2204 * Mark device as attached from system and restart if needed.
2205 */
2206void netif_device_attach(struct net_device *dev)
2207{
2208 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2209 netif_running(dev)) {
2210 netif_tx_wake_all_queues(dev);
2211 __netdev_watchdog_up(dev);
2212 }
2213}
2214EXPORT_SYMBOL(netif_device_attach);
2215
2216static void skb_warn_bad_offload(const struct sk_buff *skb)
2217{
2218 static const netdev_features_t null_features = 0;
2219 struct net_device *dev = skb->dev;
2220 const char *driver = "";
2221
2222 if (!net_ratelimit())
2223 return;
2224
2225 if (dev && dev->dev.parent)
2226 driver = dev_driver_string(dev->dev.parent);
2227
2228 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2229 "gso_type=%d ip_summed=%d\n",
2230 driver, dev ? &dev->features : &null_features,
2231 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2232 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2233 skb_shinfo(skb)->gso_type, skb->ip_summed);
2234}
2235
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, -EINVAL if the skb has a non-zero gso_size, or the
 * error returned by __skb_linearize() / pskb_expand_head().  On success
 * skb->ip_summed is left as CHECKSUM_NONE.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* Nothing to compute for CHECKSUM_COMPLETE: only reset ip_summed. */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	/* A GSO skb must not reach this point; warn and refuse. */
	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	/* Checksum everything from the checksum start to the end of the skb. */
	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* offset now addresses the 16-bit checksum field itself. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Get a private header copy if the clone may not be written that far. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
2283
/*
 * skb_network_protocol - find the protocol behind ETH_P_TEB / VLAN headers
 * @skb: buffer to inspect
 * @depth: receives the resulting header depth (vlan_depth) on success;
 *	it is not written when 0 is returned
 *
 * Returns the protocol type found, or 0 when a needed header cannot be
 * pulled or when mac_len is non-zero but smaller than VLAN_HLEN.
 */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	unsigned int vlan_depth = skb->mac_len;
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb_mac_header(skb);
		type = eth->h_proto;
	}

	/* if skb->protocol is 802.1Q/AD then the header should already be
	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
	 * ETH_HLEN otherwise
	 */
	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
		if (vlan_depth) {
			if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
				return 0;
			vlan_depth -= VLAN_HLEN;
		} else {
			vlan_depth = ETH_HLEN;
		}
		/* Step over every stacked VLAN header until a non-VLAN type. */
		do {
			struct vlan_hdr *vh;

			if (unlikely(!pskb_may_pull(skb,
						    vlan_depth + VLAN_HLEN)))
				return 0;

			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
			type = vh->h_vlan_encapsulated_proto;
			vlan_depth += VLAN_HLEN;
		} while (type == htons(ETH_P_8021Q) ||
			 type == htons(ETH_P_8021AD));
	}

	*depth = vlan_depth;

	return type;
}
2330
/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	Resolves the network protocol with skb_network_protocol(), pulls the
 *	skb by the reported depth and hands it to the first registered
 *	packet_offload of that type that has a gso_segment callback.  Before
 *	returning, the skb data pointer is pushed back to the mac header.
 *
 *	Returns the callback's result, ERR_PTR(-EINVAL) if no protocol could
 *	be determined, or ERR_PTR(-EPROTONOSUPPORT) if no handler matched.
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				int err;

				/*
				 * Let the protocol check the headers first.  Stop
				 * here on error, or when @features already cover
				 * this skb (segs is then ERR_PTR(0)).
				 */
				err = ptype->callbacks.gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the mac header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
2373
2374
2375/* openvswitch calls this on rx path, so we need a different check.
2376 */
2377static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2378{
2379 if (tx_path)
2380 return skb->ip_summed != CHECKSUM_PARTIAL;
2381 else
2382 return skb->ip_summed == CHECKSUM_NONE;
2383}
2384
2385/**
2386 * __skb_gso_segment - Perform segmentation on skb.
2387 * @skb: buffer to segment
2388 * @features: features for the output path (see dev->features)
2389 * @tx_path: whether it is called in TX path
2390 *
2391 * This function segments the given skb and returns a list of segments.
2392 *
2393 * It may return NULL if the skb requires no segmentation. This is
2394 * only possible when GSO is used for verifying header integrity.
2395 */
2396struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2397 netdev_features_t features, bool tx_path)
2398{
2399 if (unlikely(skb_needs_check(skb, tx_path))) {
2400 int err;
2401
2402 skb_warn_bad_offload(skb);
2403
2404 if (skb_header_cloned(skb) &&
2405 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2406 return ERR_PTR(err);
2407 }
2408
2409 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2410 SKB_GSO_CB(skb)->encap_level = 0;
2411
2412 skb_reset_mac_header(skb);
2413 skb_reset_mac_len(skb);
2414
2415 return skb_mac_gso_segment(skb, features);
2416}
2417EXPORT_SYMBOL(__skb_gso_segment);
2418
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>") and dump the stack, rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	pr_err("%s: hw csum failure\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2430
/*
 * Returns 1 if some page fragment of @skb must not be handed to @dev for
 * DMA, 0 otherwise.  Without CONFIG_HIGHMEM the answer is always 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int idx;

	/* Without NETIF_F_HIGHDMA, no fragment may sit in high memory. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	/* Every fragment page must end at or below the parent's DMA mask. */
	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2463
/*
 * Layout placed over skb->cb of a software-segmented skb: remembers the
 * destructor that dev_gso_segment() replaced with dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* Access the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2469
2470static void dev_gso_skb_destructor(struct sk_buff *skb)
2471{
2472 struct dev_gso_cb *cb;
2473
2474 kfree_skb_list(skb->next);
2475 skb->next = NULL;
2476
2477 cb = DEV_GSO_CB(skb);
2478 if (cb->destructor)
2479 cb->destructor(skb);
2480}
2481
2482/**
2483 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2484 * @skb: buffer to segment
2485 * @features: device features as applicable to this skb
2486 *
2487 * This function segments the given skb and stores the list of segments
2488 * in skb->next.
2489 */
2490static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2491{
2492 struct sk_buff *segs;
2493
2494 segs = skb_gso_segment(skb, features);
2495
2496 /* Verifying header integrity only. */
2497 if (!segs)
2498 return 0;
2499
2500 if (IS_ERR(segs))
2501 return PTR_ERR(segs);
2502
2503 skb->next = segs;
2504 DEV_GSO_CB(skb)->destructor = skb->destructor;
2505 skb->destructor = dev_gso_skb_destructor;
2506
2507 return 0;
2508}
2509
2510static netdev_features_t harmonize_features(struct sk_buff *skb,
2511 netdev_features_t features)
2512{
2513 int tmp;
2514
2515 if (skb->ip_summed != CHECKSUM_NONE &&
2516 !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) {
2517 features &= ~NETIF_F_ALL_CSUM;
2518 } else if (illegal_highdma(skb->dev, skb)) {
2519 features &= ~NETIF_F_SG;
2520 }
2521
2522 return features;
2523}
2524
2525netdev_features_t netif_skb_features(struct sk_buff *skb)
2526{
2527 __be16 protocol = skb->protocol;
2528 netdev_features_t features = skb->dev->features;
2529
2530 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2531 features &= ~NETIF_F_GSO_MASK;
2532
2533 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2534 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2535 protocol = veh->h_vlan_encapsulated_proto;
2536 } else if (!vlan_tx_tag_present(skb)) {
2537 return harmonize_features(skb, features);
2538 }
2539
2540 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2541 NETIF_F_HW_VLAN_STAG_TX);
2542
2543 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2544 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2545 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2546 NETIF_F_HW_VLAN_STAG_TX;
2547
2548 return harmonize_features(skb, features);
2549}
2550EXPORT_SYMBOL(netif_skb_features);
2551
/*
 * dev_hard_start_xmit - hand an skb (or a list of GSO segments) to the driver
 * @skb: buffer to transmit; a non-NULL skb->next means it already carries
 *	a list of software-generated segments
 * @dev: device to transmit on
 * @txq: transmit queue, updated with txq_trans_update() on success
 *
 * Returns the value of ndo_start_xmit() for the (last) buffer handed to the
 * driver, NETDEV_TX_BUSY if the queue was stopped with segments still
 * pending, or NETDEV_TX_OK when the skb was freed here because preparing it
 * (VLAN tag insertion, segmentation, linearization, checksum) failed.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	if (likely(!skb->next)) {
		netdev_features_t features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		features = netif_skb_features(skb);

		/* Insert the VLAN tag in software if the device cannot do it. */
		if (vlan_tx_tag_present(skb) &&
		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
			skb = __vlan_put_tag(skb, skb->vlan_proto,
					     vlan_tx_tag_get(skb));
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		/* If encapsulation offload request, verify we are testing
		 * hardware encapsulation features instead of standard
		 * features for the netdev
		 */
		if (skb->encapsulation)
			features &= dev->hw_enc_features;

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were produced: send them one by one below. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				if (skb->encapsulation)
					skb_set_inner_transport_header(skb,
						skb_checksum_start_offset(skb));
				else
					skb_set_transport_header(skb,
						skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Deliver a copy to taps when ptype_all has entries. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/* Save the length first: it is still needed for tracing after the driver call. */
		skb_len = skb->len;
		trace_net_dev_start_xmit(skb, dev);
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;

		/* Unlink the first pending segment from the list. */
		skb->next = nskb->next;
		nskb->next = NULL;

		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(nskb, dev);

		skb_len = nskb->len;
		trace_net_dev_start_xmit(nskb, dev);
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Put the segment back at the head of the list. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_xmit_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/*
	 * All segments handed over: restore the original destructor and
	 * release the parent skb.  Otherwise fall through to kfree_skb(),
	 * where dev_gso_skb_destructor() also frees the remaining segments.
	 */
	if (likely(skb->next == NULL)) {
		skb->destructor = DEV_GSO_CB(skb)->destructor;
		consume_skb(skb);
		return rc;
	}
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2665
2666static void qdisc_pkt_len_init(struct sk_buff *skb)
2667{
2668 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2669
2670 qdisc_skb_cb(skb)->pkt_len = skb->len;
2671
2672 /* To get more precise estimation of bytes sent on wire,
2673 * we add to pkt_len the headers size of all segments
2674 */
2675 if (shinfo->gso_size) {
2676 unsigned int hdr_len;
2677 u16 gso_segs = shinfo->gso_segs;
2678
2679 /* mac layer + network layer */
2680 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2681
2682 /* + transport layer */
2683 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2684 hdr_len += tcp_hdrlen(skb);
2685 else
2686 hdr_len += sizeof(struct udphdr);
2687
2688 if (shinfo->gso_type & SKB_GSO_DODGY)
2689 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2690 shinfo->gso_size);
2691
2692 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2693 }
2694}
2695
/*
 * Send @skb through qdisc @q of @txq: drop it if the qdisc is deactivated,
 * transmit it directly if the qdisc can be bypassed, or enqueue it and run
 * the qdisc.  Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the masked
 * result of q->enqueue().
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Release busylock before running the qdisc. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* busylock is still held if it was not released above. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2758
2759#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2760static void skb_update_prio(struct sk_buff *skb)
2761{
2762 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2763
2764 if (!skb->priority && skb->sk && map) {
2765 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2766
2767 if (prioidx < map->priomap_len)
2768 skb->priority = map->priomap[prioidx];
2769 }
2770}
2771#else
2772#define skb_update_prio(skb)
2773#endif
2774
/*
 * Per-CPU counter incremented around dev_hard_start_xmit() on the
 * queue-less path of __dev_queue_xmit(); transmission is refused once it
 * exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2777
2778/**
2779 * dev_loopback_xmit - loop back @skb
2780 * @skb: buffer to transmit
2781 */
2782int dev_loopback_xmit(struct sk_buff *skb)
2783{
2784 skb_reset_mac_header(skb);
2785 __skb_pull(skb, skb_network_offset(skb));
2786 skb->pkt_type = PACKET_LOOPBACK;
2787 skb->ip_summed = CHECKSUM_UNNECESSARY;
2788 WARN_ON(!skb_dst(skb));
2789 skb_dst_force(skb);
2790 netif_rx_ni(skb);
2791 return 0;
2792}
2793EXPORT_SYMBOL(dev_loopback_xmit);
2794
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      This function can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can
 *      also be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is
 *      currently difficult to retry a send to this function.  (You can bump
 *      the ref count before sending to hold a reference for retry if you
 *      are careful.)
 *
 *      When calling this function, interrupts MUST be enabled.  This is
 *      because the BH enable code must have IRQs enabled so that it will
 *      not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue operation takes the packet from here. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible that they rely on the protection
	   we provide here.

	   Check this and take the lock.  It is not prone to deadlocks.
	   Or drop the noqueue qdisc, which is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owning the xmit lock already means we re-entered on this CPU. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Device down, queue stopped, xmit incomplete or recursion: drop. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
2903
/*
 * dev_queue_xmit - queue @skb for transmission.
 * Calls __dev_queue_xmit() with a NULL accel_priv and returns its result.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
2909
/*
 * dev_queue_xmit_accel - queue @skb for transmission with offload data.
 * Calls __dev_queue_xmit(), forwarding @accel_priv, and returns its result.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
2915
2916
2917/*=======================================================================
2918 Receiver routines
2919 =======================================================================*/
2920
/*
 * Per-CPU input queue length limit checked in enqueue_to_backlog();
 * skb_flow_limit() starts acting at half of this value.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Passed to net_timestamp_check() in netif_rx_internal(), before the skb is queued. */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): netdev_budget is not used in this part of the file; presumably the NET_RX softirq budget, confirm. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2927
/*
 * Called with irq disabled.
 * Appends @napi to the tail of @sd's poll list and raises NET_RX_SOFTIRQ.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2935
2936#ifdef CONFIG_RPS
2937
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Static key tested in netif_rx_internal() to decide whether to take the RPS path. */
struct static_key rps_needed __read_mostly;
2943
/*
 * Record @next_cpu as the CPU for the flow entry @rflow.  With
 * CONFIG_RFS_ACCEL this may first ask the driver (ndo_rx_flow_steer) to
 * steer the flow to the RX queue mapped to @next_cpu, in which case the
 * flow entry of that queue's table is used instead.
 *
 * Returns the flow entry that was updated.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the wanted queue. */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* Switch to the new queue's entry; rc is stored as its filter id. */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		/* Remember the target CPU's queue head at the time of the switch. */
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
2988
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no target CPU was selected.  *rflowp is only written
 * when the CPU was taken from the flow tables.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	int cpu = -1;
	u16 tcpu;
	u32 hash;

	/* Pick the RX queue the skb was recorded on, or queue 0. */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		/* Single-CPU map and no flow table: no hash is needed. */
		if (map->len == 1 &&
		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
			tcpu = map->cpus[0];
			if (cpu_online(tcpu))
				cpu = tcpu;
			goto done;
		}
	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		/* Neither a map nor a flow table is configured for this queue. */
		goto done;
	}

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

	/* Fall back to the RPS map: scale the 32-bit hash onto [0, map->len). */
	if (map) {
		tcpu = map->cpus[((u64) hash * map->len) >> 32];

		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
3085
3086#ifdef CONFIG_RFS_ACCEL
3087
3088/**
3089 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3090 * @dev: Device on which the filter was set
3091 * @rxq_index: RX queue index
3092 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3093 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3094 *
3095 * Drivers that implement ndo_rx_flow_steer() should periodically call
3096 * this function for each installed filter and remove the filters for
3097 * which it returns %true.
3098 */
3099bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3100 u32 flow_id, u16 filter_id)
3101{
3102 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3103 struct rps_dev_flow_table *flow_table;
3104 struct rps_dev_flow *rflow;
3105 bool expire = true;
3106 int cpu;
3107
3108 rcu_read_lock();
3109 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3110 if (flow_table && flow_id <= flow_table->mask) {
3111 rflow = &flow_table->flows[flow_id];
3112 cpu = ACCESS_ONCE(rflow->cpu);
3113 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3114 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3115 rflow->last_qtail) <
3116 (int)(10 * flow_table->mask)))
3117 expire = false;
3118 }
3119 rcu_read_unlock();
3120 return expire;
3121}
3122EXPORT_SYMBOL(rps_may_expire_flow);
3123
3124#endif /* CONFIG_RFS_ACCEL */
3125
/*
 * Called from hardirq (IPI) context.
 * @data is the softnet_data whose backlog NAPI gets scheduled; the
 * received_rps counter of that structure is incremented.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
3134
3135#endif /* CONFIG_RPS */
3136
/*
 * Check whether @sd belongs to another CPU.
 * If so, link it onto this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push @sd onto the head of our rps_ipi_list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3157
#ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): presumably the default number of buckets for a flow-limit table; not used in this part of the file, confirm. */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3161
3162static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3163{
3164#ifdef CONFIG_NET_FLOW_LIMIT
3165 struct sd_flow_limit *fl;
3166 struct softnet_data *sd;
3167 unsigned int old_flow, new_flow;
3168
3169 if (qlen < (netdev_max_backlog >> 1))
3170 return false;
3171
3172 sd = &__get_cpu_var(softnet_data);
3173
3174 rcu_read_lock();
3175 fl = rcu_dereference(sd->flow_limit);
3176 if (fl) {
3177 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3178 old_flow = fl->history[fl->history_head];
3179 fl->history[fl->history_head] = new_flow;
3180
3181 fl->history_head++;
3182 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3183
3184 if (likely(fl->buckets[old_flow]))
3185 fl->buckets[old_flow]--;
3186
3187 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3188 fl->count++;
3189 rcu_read_unlock();
3190 return true;
3191 }
3192 }
3193 rcu_read_unlock();
3194#endif
3195 return false;
3196}
3197
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * Returns NET_RX_SUCCESS when the skb was queued.  When the queue is longer
 * than netdev_max_backlog or skb_flow_limit() rejects the skb, the skb is
 * freed, the drop counters are incremented and NET_RX_DROP is returned.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		/* Non-empty queue: append without scheduling the backlog NAPI. */
		if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			/* Schedule locally unless @sd was queued for an IPI. */
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
3244
/*
 * Common body of netif_rx() and netif_rx_ni(): queue @skb on a per-CPU
 * backlog, either on the CPU chosen by get_rps_cpu() (when RPS is needed)
 * or on the current CPU.  Returns the result of enqueue_to_backlog().
 */
static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		/* voidflow is used for last_qtail when get_rps_cpu() leaves rflow untouched. */
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		/* No RPS target: stay on the current CPU. */
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;	/* output of enqueue_to_backlog(), not used here */
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
3277
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  The buffer may be dropped
 *	during processing for congestion control or by the protocol layers;
 *	in every case the caller no longer owns it.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
3300
/*
 * netif_rx_ni - post buffer to the network code, then run softirqs
 * @skb: buffer to post
 *
 * Same queueing as netif_rx(), but with preemption disabled around the
 * call, and any softirq pending afterwards is run via do_softirq().
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3316
/*
 * NET_TX softirq handler.  Frees the skbs queued on this CPU's
 * completion_queue and runs (or reschedules) the qdiscs queued on its
 * output_queue.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with IRQs off, then free it with IRQs on. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			/* Pick the trace point from the reason stored in skb->cb. */
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the list and reset the tail pointer to the empty list. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/*
				 * Lock is busy: queue the qdisc again unless
				 * it has been deactivated, in which case only
				 * the SCHED bit is cleared.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
3377
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.
 * It is only compiled when both the bridge and the ATM LANE options are
 * enabled (built-in or modular).  As a file-scope pointer without an
 * initializer it starts out NULL; it is exported GPL-only.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3385
3386#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is?  Otherwise we execute some useless
 * instructions (a compare and 2 stores extra right now) if we don't
 * have it on but do have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
3395static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3396{
3397 struct net_device *dev = skb->dev;
3398 u32 ttl = G_TC_RTTL(skb->tc_verd);
3399 int result = TC_ACT_OK;
3400 struct Qdisc *q;
3401
3402 if (unlikely(MAX_RED_LOOP < ttl++)) {
3403 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3404 skb->skb_iif, dev->ifindex);
3405 return TC_ACT_SHOT;
3406 }
3407
3408 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3409 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3410
3411 q = rxq->qdisc;
3412 if (q != &noop_qdisc) {
3413 spin_lock(qdisc_lock(q));
3414 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3415 result = qdisc_enqueue_root(skb, q);
3416 spin_unlock(qdisc_lock(q));
3417 }
3418
3419 return result;
3420}
3421
3422static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3423 struct packet_type **pt_prev,
3424 int *ret, struct net_device *orig_dev)
3425{
3426 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3427
3428 if (!rxq || rxq->qdisc == &noop_qdisc)
3429 goto out;
3430
3431 if (*pt_prev) {
3432 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3433 *pt_prev = NULL;
3434 }
3435
3436 switch (ing_filter(skb, rxq)) {
3437 case TC_ACT_SHOT:
3438 case TC_ACT_STOLEN:
3439 kfree_skb(skb);
3440 return NULL;
3441 }
3442
3443out:
3444 skb->tc_verd = 0;
3445 return skb;
3446}
3447#endif
3448
3449/**
3450 * netdev_rx_handler_register - register receive handler
3451 * @dev: device to register a handler for
3452 * @rx_handler: receive handler to register
3453 * @rx_handler_data: data pointer that is used by rx handler
3454 *
3455 * Register a receive handler for a device. This handler will then be
3456 * called from __netif_receive_skb. A negative errno code is returned
3457 * on a failure.
3458 *
3459 * The caller must hold the rtnl_mutex.
3460 *
3461 * For a general description of rx_handler, see enum rx_handler_result.
3462 */
3463int netdev_rx_handler_register(struct net_device *dev,
3464 rx_handler_func_t *rx_handler,
3465 void *rx_handler_data)
3466{
3467 ASSERT_RTNL();
3468
3469 if (dev->rx_handler)
3470 return -EBUSY;
3471
3472 /* Note: rx_handler_data must be set before rx_handler */
3473 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3474 rcu_assign_pointer(dev->rx_handler, rx_handler);
3475
3476 return 0;
3477}
3478EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3479
3480/**
3481 * netdev_rx_handler_unregister - unregister receive handler
3482 * @dev: device to unregister a handler from
3483 *
3484 * Unregister a receive handler from a device.
3485 *
3486 * The caller must hold the rtnl_mutex.
3487 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	/* Step 1: hide the handler so that new readers no longer see it. */
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	/* Step 2: only after synchronize_net() is the data pointer cleared. */
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3501
3502/*
3503 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3504 * the special handling of PFMEMALLOC skbs.
3505 */
3506static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3507{
3508 switch (skb->protocol) {
3509 case htons(ETH_P_ARP):
3510 case htons(ETH_P_IP):
3511 case htons(ETH_P_IPV6):
3512 case htons(ETH_P_8021Q):
3513 case htons(ETH_P_8021AD):
3514 return true;
3515 default:
3516 return false;
3517 }
3518}
3519
/*
 * Core of the receive path: hand @skb to the matching packet_type handlers.
 *
 * @skb:        buffer to deliver
 * @pfmemalloc: when true, the ptype_all taps are skipped and protocols not
 *              accepted by skb_pfmemalloc_protocol() are dropped
 *
 * Delivery is "one behind": pt_prev remembers the last matching handler and
 * is flushed through deliver_skb() whenever another match is found or a
 * stage that may take over the skb is entered; the final match is invoked
 * directly through ->func().  Runs entirely inside rcu_read_lock().
 *
 * Returns NET_RX_DROP when no handler takes the skb, NET_RX_SUCCESS when an
 * rx_handler consumes it, otherwise the value of the last handler called.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* Remember the device the skb arrived on; skb->dev may change below. */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* Re-entered whenever VLAN handling or an rx_handler asks for it. */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and bypass both the taps and handle_ing(). */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* Taps registered on ptype_all: any device or exactly skb->dev. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (vlan_tx_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		/* vlan_do_receive() may replace or NULL the skb pointer. */
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(vlan_tx_tag_present(skb))) {
		if (vlan_tx_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	/* Protocol handlers hashed by ethertype in ptype_base[]. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* Last handler is called directly rather than via deliver_skb(). */
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		/* Also the target of the "goto drop" statements above. */
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
	return ret;
}
3662
3663static int __netif_receive_skb(struct sk_buff *skb)
3664{
3665 int ret;
3666
3667 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3668 unsigned long pflags = current->flags;
3669
3670 /*
3671 * PFMEMALLOC skbs are special, they should
3672 * - be delivered to SOCK_MEMALLOC sockets only
3673 * - stay away from userspace
3674 * - have bounded memory usage
3675 *
3676 * Use PF_MEMALLOC as this saves us from propagating the allocation
3677 * context down to all allocation sites.
3678 */
3679 current->flags |= PF_MEMALLOC;
3680 ret = __netif_receive_skb_core(skb, true);
3681 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3682 } else
3683 ret = __netif_receive_skb_core(skb, false);
3684
3685 return ret;
3686}
3687
3688static int netif_receive_skb_internal(struct sk_buff *skb)
3689{
3690 net_timestamp_check(netdev_tstamp_prequeue, skb);
3691
3692 if (skb_defer_rx_timestamp(skb))
3693 return NET_RX_SUCCESS;
3694
3695#ifdef CONFIG_RPS
3696 if (static_key_false(&rps_needed)) {
3697 struct rps_dev_flow voidflow, *rflow = &voidflow;
3698 int cpu, ret;
3699
3700 rcu_read_lock();
3701
3702 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3703
3704 if (cpu >= 0) {
3705 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3706 rcu_read_unlock();
3707 return ret;
3708 }
3709 rcu_read_unlock();
3710 }
3711#endif
3712 return __netif_receive_skb(skb);
3713}
3714
3715/**
3716 * netif_receive_skb - process receive buffer from network
3717 * @skb: buffer to process
3718 *
3719 * netif_receive_skb() is the main receive data processing function.
3720 * It always succeeds. The buffer may be dropped during processing
3721 * for congestion control or by the protocol layers.
3722 *
3723 * This function may only be called from softirq context and interrupts
3724 * should be enabled.
3725 *
3726 * Return values (usually ignored):
3727 * NET_RX_SUCCESS: no congestion
3728 * NET_RX_DROP: packet was dropped
3729 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	/* Entry tracepoint first, then the shared receive path. */
	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
3737
3738/* Network device is going away, flush any packets still pending
3739 * Called with irqs disabled.
3740 */
3741static void flush_backlog(void *arg)
3742{
3743 struct net_device *dev = arg;
3744 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3745 struct sk_buff *skb, *tmp;
3746
3747 rps_lock(sd);
3748 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3749 if (skb->dev == dev) {
3750 __skb_unlink(skb, &sd->input_pkt_queue);
3751 kfree_skb(skb);
3752 input_queue_head_incr(sd);
3753 }
3754 }
3755 rps_unlock(sd);
3756
3757 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3758 if (skb->dev == dev) {
3759 __skb_unlink(skb, &sd->process_queue);
3760 kfree_skb(skb);
3761 input_queue_head_incr(sd);
3762 }
3763 }
3764}
3765
3766static int napi_gro_complete(struct sk_buff *skb)
3767{
3768 struct packet_offload *ptype;
3769 __be16 type = skb->protocol;
3770 struct list_head *head = &offload_base;
3771 int err = -ENOENT;
3772
3773 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3774
3775 if (NAPI_GRO_CB(skb)->count == 1) {
3776 skb_shinfo(skb)->gso_size = 0;
3777 goto out;
3778 }
3779
3780 rcu_read_lock();
3781 list_for_each_entry_rcu(ptype, head, list) {
3782 if (ptype->type != type || !ptype->callbacks.gro_complete)
3783 continue;
3784
3785 err = ptype->callbacks.gro_complete(skb, 0);
3786 break;
3787 }
3788 rcu_read_unlock();
3789
3790 if (err) {
3791 WARN_ON(&ptype->list == head);
3792 kfree_skb(skb);
3793 return NET_RX_SUCCESS;
3794 }
3795
3796out:
3797 return netif_receive_skb_internal(skb);
3798}
3799
3800/* napi->gro_list contains packets ordered by age.
3801 * youngest packets at the head of it.
3802 * Complete skbs in reverse order to reduce latencies.
3803 */
3804void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3805{
3806 struct sk_buff *skb, *prev = NULL;
3807
3808 /* scan list and build reverse chain */
3809 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3810 skb->prev = prev;
3811 prev = skb;
3812 }
3813
3814 for (skb = prev; skb; skb = prev) {
3815 skb->next = NULL;
3816
3817 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3818 return;
3819
3820 prev = skb->prev;
3821 napi_gro_complete(skb);
3822 napi->gro_count--;
3823 }
3824
3825 napi->gro_list = NULL;
3826}
3827EXPORT_SYMBOL(napi_gro_flush);
3828
3829static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3830{
3831 struct sk_buff *p;
3832 unsigned int maclen = skb->dev->hard_header_len;
3833 u32 hash = skb_get_hash_raw(skb);
3834
3835 for (p = napi->gro_list; p; p = p->next) {
3836 unsigned long diffs;
3837
3838 NAPI_GRO_CB(p)->flush = 0;
3839
3840 if (hash != skb_get_hash_raw(p)) {
3841 NAPI_GRO_CB(p)->same_flow = 0;
3842 continue;
3843 }
3844
3845 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3846 diffs |= p->vlan_tci ^ skb->vlan_tci;
3847 if (maclen == ETH_HLEN)
3848 diffs |= compare_ether_header(skb_mac_header(p),
3849 skb_mac_header(skb));
3850 else if (!diffs)
3851 diffs = memcmp(skb_mac_header(p),
3852 skb_mac_header(skb),
3853 maclen);
3854 NAPI_GRO_CB(p)->same_flow = !diffs;
3855 }
3856}
3857
3858static void skb_gro_reset_offset(struct sk_buff *skb)
3859{
3860 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3861 const skb_frag_t *frag0 = &pinfo->frags[0];
3862
3863 NAPI_GRO_CB(skb)->data_offset = 0;
3864 NAPI_GRO_CB(skb)->frag0 = NULL;
3865 NAPI_GRO_CB(skb)->frag0_len = 0;
3866
3867 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3868 pinfo->nr_frags &&
3869 !PageHighMem(skb_frag_page(frag0))) {
3870 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3871 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3872 }
3873}
3874
3875static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3876{
3877 struct skb_shared_info *pinfo = skb_shinfo(skb);
3878
3879 BUG_ON(skb->end - skb->tail < grow);
3880
3881 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3882
3883 skb->data_len -= grow;
3884 skb->tail += grow;
3885
3886 pinfo->frags[0].page_offset += grow;
3887 skb_frag_size_sub(&pinfo->frags[0], grow);
3888
3889 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3890 skb_frag_unref(skb, 0);
3891 memmove(pinfo->frags, pinfo->frags + 1,
3892 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3893 }
3894}
3895
/*
 * Offer @skb to the GRO layer of @napi.
 *
 * Returns GRO_NORMAL when the skb is not handled by GRO here (feature off,
 * already GSO or has a frag list, no matching offload, or the callback
 * asked for a flush), GRO_MERGED / GRO_MERGED_FREE when the callback marked
 * it as belonging to a held flow, and GRO_HELD when the skb itself was put
 * on napi->gro_list as a new flow.
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

	gro_list_prepare(napi, skb);
	NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */

	/* Call gro_receive() of the first offload matching the protocol. */
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->udp_mark = 0;

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* The walk ran off the end of the list: no offload matched. */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* A non-NULL pp designates a held skb to unlink and complete now. */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	/* List is full: complete the last entry to make room, count unchanged. */
	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	/* Start a new flow with this skb at the head of gro_list. */
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* Bring any header bytes consumed beyond the linear area into it. */
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
3986
3987struct packet_offload *gro_find_receive_by_type(__be16 type)
3988{
3989 struct list_head *offload_head = &offload_base;
3990 struct packet_offload *ptype;
3991
3992 list_for_each_entry_rcu(ptype, offload_head, list) {
3993 if (ptype->type != type || !ptype->callbacks.gro_receive)
3994 continue;
3995 return ptype;
3996 }
3997 return NULL;
3998}
3999EXPORT_SYMBOL(gro_find_receive_by_type);
4000
4001struct packet_offload *gro_find_complete_by_type(__be16 type)
4002{
4003 struct list_head *offload_head = &offload_base;
4004 struct packet_offload *ptype;
4005
4006 list_for_each_entry_rcu(ptype, offload_head, list) {
4007 if (ptype->type != type || !ptype->callbacks.gro_complete)
4008 continue;
4009 return ptype;
4010 }
4011 return NULL;
4012}
4013EXPORT_SYMBOL(gro_find_complete_by_type);
4014
4015static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4016{
4017 switch (ret) {
4018 case GRO_NORMAL:
4019 if (netif_receive_skb_internal(skb))
4020 ret = GRO_DROP;
4021 break;
4022
4023 case GRO_DROP:
4024 kfree_skb(skb);
4025 break;
4026
4027 case GRO_MERGED_FREE:
4028 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4029 kmem_cache_free(skbuff_head_cache, skb);
4030 else
4031 __kfree_skb(skb);
4032 break;
4033
4034 case GRO_HELD:
4035 case GRO_MERGED:
4036 break;
4037 }
4038
4039 return ret;
4040}
4041
4042gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4043{
4044 trace_napi_gro_receive_entry(skb);
4045
4046 skb_gro_reset_offset(skb);
4047
4048 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4049}
4050EXPORT_SYMBOL(napi_gro_receive);
4051
4052static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4053{
4054 __skb_pull(skb, skb_headlen(skb));
4055 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4056 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4057 skb->vlan_tci = 0;
4058 skb->dev = napi->dev;
4059 skb->skb_iif = 0;
4060 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4061
4062 napi->skb = skb;
4063}
4064
4065struct sk_buff *napi_get_frags(struct napi_struct *napi)
4066{
4067 struct sk_buff *skb = napi->skb;
4068
4069 if (!skb) {
4070 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4071 napi->skb = skb;
4072 }
4073 return skb;
4074}
4075EXPORT_SYMBOL(napi_get_frags);
4076
4077static gro_result_t napi_frags_finish(struct napi_struct *napi,
4078 struct sk_buff *skb,
4079 gro_result_t ret)
4080{
4081 switch (ret) {
4082 case GRO_NORMAL:
4083 case GRO_HELD:
4084 __skb_push(skb, ETH_HLEN);
4085 skb->protocol = eth_type_trans(skb, skb->dev);
4086 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4087 ret = GRO_DROP;
4088 break;
4089
4090 case GRO_DROP:
4091 case GRO_MERGED_FREE:
4092 napi_reuse_skb(napi, skb);
4093 break;
4094
4095 case GRO_MERGED:
4096 break;
4097 }
4098
4099 return ret;
4100}
4101
4102/* Upper GRO stack assumes network header starts at gro_offset=0
4103 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4104 * We copy ethernet header into skb->data to have a common layout.
4105 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	/* Take ownership of the cached skb; napi->skb is emptied. */
	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	eth = skb_gro_header_fast(skb, 0);
	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		/* Slow path; on failure the skb is recycled onto the napi. */
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		/* Copy the Ethernet header out of frag0 and step frag0 past it. */
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}
4140
4141gro_result_t napi_gro_frags(struct napi_struct *napi)
4142{
4143 struct sk_buff *skb = napi_frags_skb(napi);
4144
4145 if (!skb)
4146 return GRO_DROP;
4147
4148 trace_napi_gro_frags_entry(skb);
4149
4150 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4151}
4152EXPORT_SYMBOL(napi_gro_frags);
4153
4154/*
4155 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4156 * Note: called with local irq disabled, but exits with local irq enabled.
4157 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (!pending) {
		local_irq_enable();
		return;
	}

	/* Detach the list while irqs are still off, then enable them. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	do {
		/* Read the link before the entry is handed to the remote cpu. */
		struct softnet_data *next = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
		pending = next;
	} while (pending);
#else
	local_irq_enable();
#endif
}
4181
/*
 * Poll function of the per-CPU backlog napi: feed up to @quota queued skbs
 * to __netif_receive_skb().
 *
 * Packets are taken from sd->process_queue; when that runs dry the whole
 * sd->input_pkt_queue is spliced onto it under the rps lock.  Returns the
 * number of skbs processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* irqs are enabled only around the actual packet processing. */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		/* process_queue is empty: refill it from input_pkt_queue. */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * we can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/* Shrink quota so the outer loop ends after these qlen skbs. */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
4238
4239/**
4240 * __napi_schedule - schedule for receive
4241 * @n: entry to schedule
4242 *
4243 * The entry's receive function will be scheduled to run
4244 */
4245void __napi_schedule(struct napi_struct *n)
4246{
4247 unsigned long flags;
4248
4249 local_irq_save(flags);
4250 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4251 local_irq_restore(flags);
4252}
4253EXPORT_SYMBOL(__napi_schedule);
4254
/*
 * Take @n off its poll list and clear NAPI_STATE_SCHED.
 *
 * BUG()s if the instance is not currently scheduled or still holds GRO
 * packets.  napi_complete() in this file calls it with local irqs saved.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* Barrier first, so the list removal is ordered before the bit clear. */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
4265
/*
 * Mark polling of @n as finished: flush all held GRO packets, then run
 * __napi_complete() with local irqs saved.  Does nothing while
 * NAPI_STATE_NPSVC is set.
 */
void napi_complete(struct napi_struct *n)
{
	unsigned long flags;

	/*
	 * don't let napi dequeue from the cpu poll list
	 * just in case its running on a different cpu
	 */
	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
		return;

	/* flush_old == false: every held packet is completed. */
	napi_gro_flush(n, false);
	local_irq_save(flags);
	__napi_complete(n);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(napi_complete);
4283
/* Must be called under rcu_read_lock(), as we don't take a reference. */
4285struct napi_struct *napi_by_id(unsigned int napi_id)
4286{
4287 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4288 struct napi_struct *napi;
4289
4290 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4291 if (napi->napi_id == napi_id)
4292 return napi;
4293
4294 return NULL;
4295}
4296EXPORT_SYMBOL_GPL(napi_by_id);
4297
4298void napi_hash_add(struct napi_struct *napi)
4299{
4300 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4301
4302 spin_lock(&napi_hash_lock);
4303
4304 /* 0 is not a valid id, we also skip an id that is taken
4305 * we expect both events to be extremely rare
4306 */
4307 napi->napi_id = 0;
4308 while (!napi->napi_id) {
4309 napi->napi_id = ++napi_gen_id;
4310 if (napi_by_id(napi->napi_id))
4311 napi->napi_id = 0;
4312 }
4313
4314 hlist_add_head_rcu(&napi->napi_hash_node,
4315 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4316
4317 spin_unlock(&napi_hash_lock);
4318 }
4319}
4320EXPORT_SYMBOL_GPL(napi_hash_add);
4321
4322/* Warning : caller is responsible to make sure rcu grace period
4323 * is respected before freeing memory containing @napi
4324 */
void napi_hash_del(struct napi_struct *napi)
{
	spin_lock(&napi_hash_lock);

	/* Unhash only if the HASHED bit was set; otherwise this is a no-op. */
	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
		hlist_del_rcu(&napi->napi_hash_node);

	spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_del);
4335
/*
 * Initialise @napi and link it onto @dev->napi_list.
 *
 * @dev:    device the napi instance belongs to
 * @napi:   instance to initialise
 * @poll:   poll callback stored in napi->poll
 * @weight: stored in napi->weight; a value above NAPI_POLL_WEIGHT is
 *          reported once through pr_err_once() but still accepted
 */
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	napi->gro_count = 0;
	napi->gro_list = NULL;
	napi->skb = NULL;
	napi->poll = poll;
	if (weight > NAPI_POLL_WEIGHT)
		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
			    weight, dev->name);
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	spin_lock_init(&napi->poll_lock);
	napi->poll_owner = -1;
#endif
	/* The instance starts out with NAPI_STATE_SCHED set. */
	set_bit(NAPI_STATE_SCHED, &napi->state);
}
EXPORT_SYMBOL(netif_napi_add);
4357
/*
 * Unlink @napi from its device's napi list and release what it still
 * holds: the cached frags skb and every skb on the GRO list.
 */
void netif_napi_del(struct napi_struct *napi)
{
	list_del_init(&napi->dev_list);
	napi_free_frags(napi);

	/* Held GRO packets are freed, not delivered. */
	kfree_skb_list(napi->gro_list);
	napi->gro_list = NULL;
	napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
4368
/*
 * Poll the napi instances queued on this CPU's softnet_data poll_list until
 * the list is empty, the netdev_budget is used up, or two jiffies have
 * passed.  In the latter two cases time_squeeze is bumped and
 * NET_RX_SOFTIRQ is raised again.  The @h argument is not used in the body.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* A poll routine must not report more work than its weight. */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() is called with irqs enabled. */
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* flush too old packets
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				/* Still has work: rotate it to the list tail. */
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	/* Entered with irqs disabled; this call re-enables them. */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
4463
/* One entry of a device's adjacency list: a link to another net_device. */
struct netdev_adjacent {
	/* the adjacent device this entry refers to */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning adjacency list */
	struct list_head list;
	/* NOTE(review): presumably used for RCU-deferred freeing — confirm */
	struct rcu_head rcu;
};
4479
4480static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4481 struct net_device *adj_dev,
4482 struct list_head *adj_list)
4483{
4484 struct netdev_adjacent *adj;
4485
4486 list_for_each_entry(adj, adj_list, list) {
4487 if (adj->dev == adj_dev)
4488 return adj;
4489 }
4490 return NULL;
4491}
4492
4493/**
4494 * netdev_has_upper_dev - Check if device is linked to an upper device
4495 * @dev: device
4496 * @upper_dev: upper device to check
4497 *
4498 * Find out if a device is linked to specified upper device and return true
4499 * in case it is. Note that this checks only immediate upper device,
4500 * not through a complete stack of devices. The caller must hold the RTNL lock.
4501 */
4502bool netdev_has_upper_dev(struct net_device *dev,
4503 struct net_device *upper_dev)
4504{
4505 ASSERT_RTNL();
4506
4507 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4508}
4509EXPORT_SYMBOL(netdev_has_upper_dev);
4510
4511/**
4512 * netdev_has_any_upper_dev - Check if device is linked to some device
4513 * @dev: device
4514 *
4515 * Find out if a device is linked to an upper device and return true in case
4516 * it is. The caller must hold the RTNL lock.
4517 */
4518static bool netdev_has_any_upper_dev(struct net_device *dev)
4519{
4520 ASSERT_RTNL();
4521
4522 return !list_empty(&dev->all_adj_list.upper);
4523}
4524
4525/**
4526 * netdev_master_upper_dev_get - Get master upper device
4527 * @dev: device
4528 *
4529 * Find a master upper device and return pointer to it or NULL in case
4530 * it's not there. The caller must hold the RTNL lock.
4531 */
4532struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4533{
4534 struct netdev_adjacent *upper;
4535
4536 ASSERT_RTNL();
4537
4538 if (list_empty(&dev->adj_list.upper))
4539 return NULL;
4540
4541 upper = list_first_entry(&dev->adj_list.upper,
4542 struct netdev_adjacent, list);
4543 if (likely(upper->master))
4544 return upper->dev;
4545 return NULL;
4546}
4547EXPORT_SYMBOL(netdev_master_upper_dev_get);
4548
4549void *netdev_adjacent_get_private(struct list_head *adj_list)
4550{
4551 struct netdev_adjacent *adj;
4552
4553 adj = list_entry(adj_list, struct netdev_adjacent, list);
4554
4555 return adj->private;
4556}
4557EXPORT_SYMBOL(netdev_adjacent_get_private);
4558
4559/**
4560 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4561 * @dev: device
4562 * @iter: list_head ** of the current position
4563 *
4564 * Gets the next device from the dev's upper list, starting from iter
4565 * position. The caller must hold RCU read lock.
4566 */
4567struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4568 struct list_head **iter)
4569{
4570 struct netdev_adjacent *upper;
4571
4572 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4573
4574 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4575
4576 if (&upper->list == &dev->adj_list.upper)
4577 return NULL;
4578
4579 *iter = &upper->list;
4580
4581 return upper->dev;
4582}
4583EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4584
4585/**
4586 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4587 * @dev: device
4588 * @iter: list_head ** of the current position
4589 *
4590 * Gets the next device from the dev's upper list, starting from iter
4591 * position. The caller must hold RCU read lock.
4592 */
4593struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4594 struct list_head **iter)
4595{
4596 struct netdev_adjacent *upper;
4597
4598 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4599
4600 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4601
4602 if (&upper->list == &dev->all_adj_list.upper)
4603 return NULL;
4604
4605 *iter = &upper->list;
4606
4607 return upper->dev;
4608}
4609EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4610
4611/**
4612 * netdev_lower_get_next_private - Get the next ->private from the
4613 * lower neighbour list
4614 * @dev: device
4615 * @iter: list_head ** of the current position
4616 *
4617 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4618 * list, starting from iter position. The caller must hold either hold the
4619 * RTNL lock or its own locking that guarantees that the neighbour lower
4620 * list will remain unchainged.
4621 */
4622void *netdev_lower_get_next_private(struct net_device *dev,
4623 struct list_head **iter)
4624{
4625 struct netdev_adjacent *lower;
4626
4627 lower = list_entry(*iter, struct netdev_adjacent, list);
4628
4629 if (&lower->list == &dev->adj_list.lower)
4630 return NULL;
4631
4632 *iter = lower->list.next;
4633
4634 return lower->private;
4635}
4636EXPORT_SYMBOL(netdev_lower_get_next_private);
4637
4638/**
4639 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4640 * lower neighbour list, RCU
4641 * variant
4642 * @dev: device
4643 * @iter: list_head ** of the current position
4644 *
4645 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4646 * list, starting from iter position. The caller must hold RCU read lock.
4647 */
4648void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4649 struct list_head **iter)
4650{
4651 struct netdev_adjacent *lower;
4652
4653 WARN_ON_ONCE(!rcu_read_lock_held());
4654
4655 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4656
4657 if (&lower->list == &dev->adj_list.lower)
4658 return NULL;
4659
4660 *iter = &lower->list;
4661
4662 return lower->private;
4663}
4664EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4665
4666/**
4667 * netdev_lower_get_next - Get the next device from the lower neighbour
4668 * list
4669 * @dev: device
4670 * @iter: list_head ** of the current position
4671 *
4672 * Gets the next netdev_adjacent from the dev's lower neighbour
4673 * list, starting from iter position. The caller must hold RTNL lock or
4674 * its own locking that guarantees that the neighbour lower
4675 * list will remain unchainged.
4676 */
4677void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4678{
4679 struct netdev_adjacent *lower;
4680
4681 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4682
4683 if (&lower->list == &dev->adj_list.lower)
4684 return NULL;
4685
4686 *iter = &lower->list;
4687
4688 return lower->dev;
4689}
4690EXPORT_SYMBOL(netdev_lower_get_next);
4691
4692/**
4693 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4694 * lower neighbour list, RCU
4695 * variant
4696 * @dev: device
4697 *
4698 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4699 * list. The caller must hold RCU read lock.
4700 */
4701void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4702{
4703 struct netdev_adjacent *lower;
4704
4705 lower = list_first_or_null_rcu(&dev->adj_list.lower,
4706 struct netdev_adjacent, list);
4707 if (lower)
4708 return lower->private;
4709 return NULL;
4710}
4711EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4712
4713/**
4714 * netdev_master_upper_dev_get_rcu - Get master upper device
4715 * @dev: device
4716 *
4717 * Find a master upper device and return pointer to it or NULL in case
4718 * it's not there. The caller must hold the RCU read lock.
4719 */
4720struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4721{
4722 struct netdev_adjacent *upper;
4723
4724 upper = list_first_or_null_rcu(&dev->adj_list.upper,
4725 struct netdev_adjacent, list);
4726 if (upper && likely(upper->master))
4727 return upper->dev;
4728 return NULL;
4729}
4730EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4731
4732static int netdev_adjacent_sysfs_add(struct net_device *dev,
4733 struct net_device *adj_dev,
4734 struct list_head *dev_list)
4735{
4736 char linkname[IFNAMSIZ+7];
4737 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4738 "upper_%s" : "lower_%s", adj_dev->name);
4739 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4740 linkname);
4741}
4742static void netdev_adjacent_sysfs_del(struct net_device *dev,
4743 char *name,
4744 struct list_head *dev_list)
4745{
4746 char linkname[IFNAMSIZ+7];
4747 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4748 "upper_%s" : "lower_%s", name);
4749 sysfs_remove_link(&(dev->dev.kobj), linkname);
4750}
4751
/* True when @dev_list is one of @dev's neighbour lists (adj_list.upper or
 * adj_list.lower).  Both arguments are parenthesized so that arbitrary
 * expressions can be passed; note that each one is evaluated more than
 * once.
 */
#define netdev_adjacent_is_neigh_list(dev, dev_list) \
	((dev_list) == &(dev)->adj_list.upper || \
	 (dev_list) == &(dev)->adj_list.lower)
4755
4756static int __netdev_adjacent_dev_insert(struct net_device *dev,
4757 struct net_device *adj_dev,
4758 struct list_head *dev_list,
4759 void *private, bool master)
4760{
4761 struct netdev_adjacent *adj;
4762 int ret;
4763
4764 adj = __netdev_find_adj(dev, adj_dev, dev_list);
4765
4766 if (adj) {
4767 adj->ref_nr++;
4768 return 0;
4769 }
4770
4771 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4772 if (!adj)
4773 return -ENOMEM;
4774
4775 adj->dev = adj_dev;
4776 adj->master = master;
4777 adj->ref_nr = 1;
4778 adj->private = private;
4779 dev_hold(adj_dev);
4780
4781 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4782 adj_dev->name, dev->name, adj_dev->name);
4783
4784 if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4785 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4786 if (ret)
4787 goto free_adj;
4788 }
4789
4790 /* Ensure that master link is always the first item in list. */
4791 if (master) {
4792 ret = sysfs_create_link(&(dev->dev.kobj),
4793 &(adj_dev->dev.kobj), "master");
4794 if (ret)
4795 goto remove_symlinks;
4796
4797 list_add_rcu(&adj->list, dev_list);
4798 } else {
4799 list_add_tail_rcu(&adj->list, dev_list);
4800 }
4801
4802 return 0;
4803
4804remove_symlinks:
4805 if (netdev_adjacent_is_neigh_list(dev, dev_list))
4806 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4807free_adj:
4808 kfree(adj);
4809 dev_put(adj_dev);
4810
4811 return ret;
4812}
4813
4814static void __netdev_adjacent_dev_remove(struct net_device *dev,
4815 struct net_device *adj_dev,
4816 struct list_head *dev_list)
4817{
4818 struct netdev_adjacent *adj;
4819
4820 adj = __netdev_find_adj(dev, adj_dev, dev_list);
4821
4822 if (!adj) {
4823 pr_err("tried to remove device %s from %s\n",
4824 dev->name, adj_dev->name);
4825 BUG();
4826 }
4827
4828 if (adj->ref_nr > 1) {
4829 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4830 adj->ref_nr-1);
4831 adj->ref_nr--;
4832 return;
4833 }
4834
4835 if (adj->master)
4836 sysfs_remove_link(&(dev->dev.kobj), "master");
4837
4838 if (netdev_adjacent_is_neigh_list(dev, dev_list))
4839 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4840
4841 list_del_rcu(&adj->list);
4842 pr_debug("dev_put for %s, because link removed from %s to %s\n",
4843 adj_dev->name, dev->name, adj_dev->name);
4844 dev_put(adj_dev);
4845 kfree_rcu(adj, rcu);
4846}
4847
4848static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4849 struct net_device *upper_dev,
4850 struct list_head *up_list,
4851 struct list_head *down_list,
4852 void *private, bool master)
4853{
4854 int ret;
4855
4856 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4857 master);
4858 if (ret)
4859 return ret;
4860
4861 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4862 false);
4863 if (ret) {
4864 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4865 return ret;
4866 }
4867
4868 return 0;
4869}
4870
4871static int __netdev_adjacent_dev_link(struct net_device *dev,
4872 struct net_device *upper_dev)
4873{
4874 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4875 &dev->all_adj_list.upper,
4876 &upper_dev->all_adj_list.lower,
4877 NULL, false);
4878}
4879
/* Undo a two-way link: drop @upper_dev from @up_list (owned by @dev) and
 * then @dev from @down_list (owned by @upper_dev).
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction first ... */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* ... then the matching downward direction */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
4888
4889static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4890 struct net_device *upper_dev)
4891{
4892 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4893 &dev->all_adj_list.upper,
4894 &upper_dev->all_adj_list.lower);
4895}
4896
4897static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4898 struct net_device *upper_dev,
4899 void *private, bool master)
4900{
4901 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4902
4903 if (ret)
4904 return ret;
4905
4906 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4907 &dev->adj_list.upper,
4908 &upper_dev->adj_list.lower,
4909 private, master);
4910 if (ret) {
4911 __netdev_adjacent_dev_unlink(dev, upper_dev);
4912 return ret;
4913 }
4914
4915 return 0;
4916}
4917
4918static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4919 struct net_device *upper_dev)
4920{
4921 __netdev_adjacent_dev_unlink(dev, upper_dev);
4922 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4923 &dev->adj_list.upper,
4924 &upper_dev->adj_list.lower);
4925}
4926
/* Link @upper_dev as an upper device of @dev and propagate the new
 * relationship to every device below @dev and above @upper_dev.
 *
 * @master marks the link as the master link, @private is stored with the
 * neighbour entries.  The caller must hold the RTNL lock.
 *
 * Returns 0 on success, -EBUSY when @dev == @upper_dev, when the link
 * would create a loop or when a master is requested but @dev already has
 * one, -EEXIST when @upper_dev is already an upper device of @dev, or the
 * error reported while creating one of the links.  On failure every link
 * created so far is removed again.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices itself. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
	return 0;

	/* The rollback labels fall through into one another.  In each of
	 * them to_i (and to_j) remember the entry at which the forward pass
	 * failed, so only the links created before it are removed; they are
	 * reset to NULL before falling through so that the next stage, which
	 * completed in full, is undone entirely.
	 */
rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			/* stop at the exact pair whose linking failed */
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		if (i == to_i)
			break;
	}

	/* finally drop the neighbour link created at the very beginning */
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
5027
5028/**
5029 * netdev_upper_dev_link - Add a link to the upper device
5030 * @dev: device
5031 * @upper_dev: new upper device
5032 *
5033 * Adds a link to device which is upper to this one. The caller must hold
5034 * the RTNL lock. On a failure a negative errno code is returned.
5035 * On success the reference counts are adjusted and the function
5036 * returns zero.
5037 */
5038int netdev_upper_dev_link(struct net_device *dev,
5039 struct net_device *upper_dev)
5040{
5041 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5042}
5043EXPORT_SYMBOL(netdev_upper_dev_link);
5044
5045/**
5046 * netdev_master_upper_dev_link - Add a master link to the upper device
5047 * @dev: device
5048 * @upper_dev: new upper device
5049 *
5050 * Adds a link to device which is upper to this one. In this case, only
5051 * one master upper device can be linked, although other non-master devices
5052 * might be linked as well. The caller must hold the RTNL lock.
5053 * On a failure a negative errno code is returned. On success the reference
5054 * counts are adjusted and the function returns zero.
5055 */
5056int netdev_master_upper_dev_link(struct net_device *dev,
5057 struct net_device *upper_dev)
5058{
5059 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5060}
5061EXPORT_SYMBOL(netdev_master_upper_dev_link);
5062
5063int netdev_master_upper_dev_link_private(struct net_device *dev,
5064 struct net_device *upper_dev,
5065 void *private)
5066{
5067 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5068}
5069EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5070
5071/**
5072 * netdev_upper_dev_unlink - Removes a link to upper device
5073 * @dev: device
5074 * @upper_dev: new upper device
5075 *
5076 * Removes a link to device which is upper to this one. The caller must hold
5077 * the RTNL lock.
5078 */
5079void netdev_upper_dev_unlink(struct net_device *dev,
5080 struct net_device *upper_dev)
5081{
5082 struct netdev_adjacent *i, *j;
5083 ASSERT_RTNL();
5084
5085 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5086
5087 /* Here is the tricky part. We must remove all dev's lower
5088 * devices from all upper_dev's upper devices and vice
5089 * versa, to maintain the graph relationship.
5090 */
5091 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5092 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5093 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5094
5095 /* remove also the devices itself from lower/upper device
5096 * list
5097 */
5098 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5099 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5100
5101 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5102 __netdev_adjacent_dev_unlink(dev, i->dev);
5103
5104 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5105}
5106EXPORT_SYMBOL(netdev_upper_dev_unlink);
5107
5108void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5109{
5110 struct netdev_adjacent *iter;
5111
5112 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5113 netdev_adjacent_sysfs_del(iter->dev, oldname,
5114 &iter->dev->adj_list.lower);
5115 netdev_adjacent_sysfs_add(iter->dev, dev,
5116 &iter->dev->adj_list.lower);
5117 }
5118
5119 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5120 netdev_adjacent_sysfs_del(iter->dev, oldname,
5121 &iter->dev->adj_list.upper);
5122 netdev_adjacent_sysfs_add(iter->dev, dev,
5123 &iter->dev->adj_list.upper);
5124 }
5125}
5126
5127void *netdev_lower_dev_get_private(struct net_device *dev,
5128 struct net_device *lower_dev)
5129{
5130 struct netdev_adjacent *lower;
5131
5132 if (!lower_dev)
5133 return NULL;
5134 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5135 if (!lower)
5136 return NULL;
5137
5138 return lower->private;
5139}
5140EXPORT_SYMBOL(netdev_lower_dev_get_private);
5141
5142
5143int dev_get_nest_level(struct net_device *dev,
5144 bool (*type_check)(struct net_device *dev))
5145{
5146 struct net_device *lower = NULL;
5147 struct list_head *iter;
5148 int max_nest = -1;
5149 int nest;
5150
5151 ASSERT_RTNL();
5152
5153 netdev_for_each_lower_dev(dev, lower, iter) {
5154 nest = dev_get_nest_level(lower, type_check);
5155 if (max_nest < nest)
5156 max_nest = nest;
5157 }
5158
5159 if (type_check(dev))
5160 max_nest++;
5161
5162 return max_nest;
5163}
5164EXPORT_SYMBOL(dev_get_nest_level);
5165
5166static void dev_change_rx_flags(struct net_device *dev, int flags)
5167{
5168 const struct net_device_ops *ops = dev->netdev_ops;
5169
5170 if (ops->ndo_change_rx_flags)
5171 ops->ndo_change_rx_flags(dev, flags);
5172}
5173
5174static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5175{
5176 unsigned int old_flags = dev->flags;
5177 kuid_t uid;
5178 kgid_t gid;
5179
5180 ASSERT_RTNL();
5181
5182 dev->flags |= IFF_PROMISC;
5183 dev->promiscuity += inc;
5184 if (dev->promiscuity == 0) {
5185 /*
5186 * Avoid overflow.
5187 * If inc causes overflow, untouch promisc and return error.
5188 */
5189 if (inc < 0)
5190 dev->flags &= ~IFF_PROMISC;
5191 else {
5192 dev->promiscuity -= inc;
5193 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5194 dev->name);
5195 return -EOVERFLOW;
5196 }
5197 }
5198 if (dev->flags != old_flags) {
5199 pr_info("device %s %s promiscuous mode\n",
5200 dev->name,
5201 dev->flags & IFF_PROMISC ? "entered" : "left");
5202 if (audit_enabled) {
5203 current_uid_gid(&uid, &gid);
5204 audit_log(current->audit_context, GFP_ATOMIC,
5205 AUDIT_ANOM_PROMISCUOUS,
5206 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5207 dev->name, (dev->flags & IFF_PROMISC),
5208 (old_flags & IFF_PROMISC),
5209 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5210 from_kuid(&init_user_ns, uid),
5211 from_kgid(&init_user_ns, gid),
5212 audit_get_sessionid(current));
5213 }
5214
5215 dev_change_rx_flags(dev, IFF_PROMISC);
5216 }
5217 if (notify)
5218 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5219 return 0;
5220}
5221
5222/**
5223 * dev_set_promiscuity - update promiscuity count on a device
5224 * @dev: device
5225 * @inc: modifier
5226 *
5227 * Add or remove promiscuity from a device. While the count in the device
5228 * remains above zero the interface remains promiscuous. Once it hits zero
5229 * the device reverts back to normal filtering operation. A negative inc
5230 * value is used to drop promiscuity on the device.
5231 * Return 0 if successful or a negative errno code on error.
5232 */
5233int dev_set_promiscuity(struct net_device *dev, int inc)
5234{
5235 unsigned int old_flags = dev->flags;
5236 int err;
5237
5238 err = __dev_set_promiscuity(dev, inc, true);
5239 if (err < 0)
5240 return err;
5241 if (dev->flags != old_flags)
5242 dev_set_rx_mode(dev);
5243 return err;
5244}
5245EXPORT_SYMBOL(dev_set_promiscuity);
5246
5247static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5248{
5249 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5250
5251 ASSERT_RTNL();
5252
5253 dev->flags |= IFF_ALLMULTI;
5254 dev->allmulti += inc;
5255 if (dev->allmulti == 0) {
5256 /*
5257 * Avoid overflow.
5258 * If inc causes overflow, untouch allmulti and return error.
5259 */
5260 if (inc < 0)
5261 dev->flags &= ~IFF_ALLMULTI;
5262 else {
5263 dev->allmulti -= inc;
5264 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5265 dev->name);
5266 return -EOVERFLOW;
5267 }
5268 }
5269 if (dev->flags ^ old_flags) {
5270 dev_change_rx_flags(dev, IFF_ALLMULTI);
5271 dev_set_rx_mode(dev);
5272 if (notify)
5273 __dev_notify_flags(dev, old_flags,
5274 dev->gflags ^ old_gflags);
5275 }
5276 return 0;
5277}
5278
5279/**
5280 * dev_set_allmulti - update allmulti count on a device
5281 * @dev: device
5282 * @inc: modifier
5283 *
5284 * Add or remove reception of all multicast frames to a device. While the
5285 * count in the device remains above zero the interface remains listening
5286 * to all interfaces. Once it hits zero the device reverts back to normal
5287 * filtering operation. A negative @inc value is used to drop the counter
5288 * when releasing a resource needing all multicasts.
5289 * Return 0 if successful or a negative errno code on error.
5290 */
5291
5292int dev_set_allmulti(struct net_device *dev, int inc)
5293{
5294 return __dev_set_allmulti(dev, inc, true);
5295}
5296EXPORT_SYMBOL(dev_set_allmulti);
5297
5298/*
5299 * Upload unicast and multicast address lists to device and
5300 * configure RX filtering. When the device doesn't support unicast
5301 * filtering it is put in promiscuous mode while unicast addresses
5302 * are present.
5303 */
5304void __dev_set_rx_mode(struct net_device *dev)
5305{
5306 const struct net_device_ops *ops = dev->netdev_ops;
5307
5308 /* dev_open will call this function so the list will stay sane. */
5309 if (!(dev->flags&IFF_UP))
5310 return;
5311
5312 if (!netif_device_present(dev))
5313 return;
5314
5315 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5316 /* Unicast addresses changes may only happen under the rtnl,
5317 * therefore calling __dev_set_promiscuity here is safe.
5318 */
5319 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5320 __dev_set_promiscuity(dev, 1, false);
5321 dev->uc_promisc = true;
5322 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5323 __dev_set_promiscuity(dev, -1, false);
5324 dev->uc_promisc = false;
5325 }
5326 }
5327
5328 if (ops->ndo_set_rx_mode)
5329 ops->ndo_set_rx_mode(dev);
5330}
5331
/* Run __dev_set_rx_mode() for @dev with the device's address lock taken
 * (bottom halves disabled) around the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5338
5339/**
5340 * dev_get_flags - get flags reported to userspace
5341 * @dev: device
5342 *
5343 * Get the combination of flag bits exported through APIs to userspace.
5344 */
5345unsigned int dev_get_flags(const struct net_device *dev)
5346{
5347 unsigned int flags;
5348
5349 flags = (dev->flags & ~(IFF_PROMISC |
5350 IFF_ALLMULTI |
5351 IFF_RUNNING |
5352 IFF_LOWER_UP |
5353 IFF_DORMANT)) |
5354 (dev->gflags & (IFF_PROMISC |
5355 IFF_ALLMULTI));
5356
5357 if (netif_running(dev)) {
5358 if (netif_oper_up(dev))
5359 flags |= IFF_RUNNING;
5360 if (netif_carrier_ok(dev))
5361 flags |= IFF_LOWER_UP;
5362 if (netif_dormant(dev))
5363 flags |= IFF_DORMANT;
5364 }
5365
5366 return flags;
5367}
5368EXPORT_SYMBOL(dev_get_flags);
5369
5370int __dev_change_flags(struct net_device *dev, unsigned int flags)
5371{
5372 unsigned int old_flags = dev->flags;
5373 int ret;
5374
5375 ASSERT_RTNL();
5376
5377 /*
5378 * Set the flags on our device.
5379 */
5380
5381 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5382 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5383 IFF_AUTOMEDIA)) |
5384 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5385 IFF_ALLMULTI));
5386
5387 /*
5388 * Load in the correct multicast list now the flags have changed.
5389 */
5390
5391 if ((old_flags ^ flags) & IFF_MULTICAST)
5392 dev_change_rx_flags(dev, IFF_MULTICAST);
5393
5394 dev_set_rx_mode(dev);
5395
5396 /*
5397 * Have we downed the interface. We handle IFF_UP ourselves
5398 * according to user attempts to set it, rather than blindly
5399 * setting it.
5400 */
5401
5402 ret = 0;
5403 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
5404 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5405
5406 if (!ret)
5407 dev_set_rx_mode(dev);
5408 }
5409
5410 if ((flags ^ dev->gflags) & IFF_PROMISC) {
5411 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5412 unsigned int old_flags = dev->flags;
5413
5414 dev->gflags ^= IFF_PROMISC;
5415
5416 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5417 if (dev->flags != old_flags)
5418 dev_set_rx_mode(dev);
5419 }
5420
5421 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5422 is important. Some (broken) drivers set IFF_PROMISC, when
5423 IFF_ALLMULTI is requested not asking us and not reporting.
5424 */
5425 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5426 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5427
5428 dev->gflags ^= IFF_ALLMULTI;
5429 __dev_set_allmulti(dev, inc, false);
5430 }
5431
5432 return ret;
5433}
5434
5435void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5436 unsigned int gchanges)
5437{
5438 unsigned int changes = dev->flags ^ old_flags;
5439
5440 if (gchanges)
5441 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5442
5443 if (changes & IFF_UP) {
5444 if (dev->flags & IFF_UP)
5445 call_netdevice_notifiers(NETDEV_UP, dev);
5446 else
5447 call_netdevice_notifiers(NETDEV_DOWN, dev);
5448 }
5449
5450 if (dev->flags & IFF_UP &&
5451 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5452 struct netdev_notifier_change_info change_info;
5453
5454 change_info.flags_changed = changes;
5455 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5456 &change_info.info);
5457 }
5458}
5459
5460/**
5461 * dev_change_flags - change device settings
5462 * @dev: device
5463 * @flags: device state flags
5464 *
5465 * Change settings on device based state flags. The flags are
5466 * in the userspace exported format.
5467 */
5468int dev_change_flags(struct net_device *dev, unsigned int flags)
5469{
5470 int ret;
5471 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5472
5473 ret = __dev_change_flags(dev, flags);
5474 if (ret < 0)
5475 return ret;
5476
5477 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5478 __dev_notify_flags(dev, old_flags, changes);
5479 return ret;
5480}
5481EXPORT_SYMBOL(dev_change_flags);
5482
5483static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5484{
5485 const struct net_device_ops *ops = dev->netdev_ops;
5486
5487 if (ops->ndo_change_mtu)
5488 return ops->ndo_change_mtu(dev, new_mtu);
5489
5490 dev->mtu = new_mtu;
5491 return 0;
5492}
5493
5494/**
5495 * dev_set_mtu - Change maximum transfer unit
5496 * @dev: device
5497 * @new_mtu: new transfer unit
5498 *
5499 * Change the maximum transfer size of the network device.
5500 */
5501int dev_set_mtu(struct net_device *dev, int new_mtu)
5502{
5503 int err, orig_mtu;
5504
5505 if (new_mtu == dev->mtu)
5506 return 0;
5507
5508 /* MTU must be positive. */
5509 if (new_mtu < 0)
5510 return -EINVAL;
5511
5512 if (!netif_device_present(dev))
5513 return -ENODEV;
5514
5515 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5516 err = notifier_to_errno(err);
5517 if (err)
5518 return err;
5519
5520 orig_mtu = dev->mtu;
5521 err = __dev_set_mtu(dev, new_mtu);
5522
5523 if (!err) {
5524 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5525 err = notifier_to_errno(err);
5526 if (err) {
5527 /* setting mtu back and notifying everyone again,
5528 * so that they have a chance to revert changes.
5529 */
5530 __dev_set_mtu(dev, orig_mtu);
5531 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5532 }
5533 }
5534 return err;
5535}
5536EXPORT_SYMBOL(dev_set_mtu);
5537
5538/**
5539 * dev_set_group - Change group this device belongs to
5540 * @dev: device
5541 * @new_group: group this device should belong to
5542 */
5543void dev_set_group(struct net_device *dev, int new_group)
5544{
5545 dev->group = new_group;
5546}
5547EXPORT_SYMBOL(dev_set_group);
5548
5549/**
5550 * dev_set_mac_address - Change Media Access Control Address
5551 * @dev: device
5552 * @sa: new address
5553 *
5554 * Change the hardware (MAC) address of the device
5555 */
5556int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5557{
5558 const struct net_device_ops *ops = dev->netdev_ops;
5559 int err;
5560
5561 if (!ops->ndo_set_mac_address)
5562 return -EOPNOTSUPP;
5563 if (sa->sa_family != dev->type)
5564 return -EINVAL;
5565 if (!netif_device_present(dev))
5566 return -ENODEV;
5567 err = ops->ndo_set_mac_address(dev, sa);
5568 if (err)
5569 return err;
5570 dev->addr_assign_type = NET_ADDR_SET;
5571 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5572 add_device_randomness(dev->dev_addr, dev->addr_len);
5573 return 0;
5574}
5575EXPORT_SYMBOL(dev_set_mac_address);
5576
5577/**
5578 * dev_change_carrier - Change device carrier
5579 * @dev: device
5580 * @new_carrier: new value
5581 *
5582 * Change device carrier
5583 */
5584int dev_change_carrier(struct net_device *dev, bool new_carrier)
5585{
5586 const struct net_device_ops *ops = dev->netdev_ops;
5587
5588 if (!ops->ndo_change_carrier)
5589 return -EOPNOTSUPP;
5590 if (!netif_device_present(dev))
5591 return -ENODEV;
5592 return ops->ndo_change_carrier(dev, new_carrier);
5593}
5594EXPORT_SYMBOL(dev_change_carrier);
5595
5596/**
5597 * dev_get_phys_port_id - Get device physical port ID
5598 * @dev: device
5599 * @ppid: port ID
5600 *
5601 * Get device physical port ID
5602 */
5603int dev_get_phys_port_id(struct net_device *dev,
5604 struct netdev_phys_port_id *ppid)
5605{
5606 const struct net_device_ops *ops = dev->netdev_ops;
5607
5608 if (!ops->ndo_get_phys_port_id)
5609 return -EOPNOTSUPP;
5610 return ops->ndo_get_phys_port_id(dev, ppid);
5611}
5612EXPORT_SYMBOL(dev_get_phys_port_id);
5613
5614/**
5615 * dev_new_index - allocate an ifindex
5616 * @net: the applicable net namespace
5617 *
5618 * Returns a suitable unique value for a new device interface
5619 * number. The caller must hold the rtnl semaphore or the
5620 * dev_base_lock to be sure it remains unique.
5621 */
5622static int dev_new_index(struct net *net)
5623{
5624 int ifindex = net->ifindex;
5625 for (;;) {
5626 if (++ifindex <= 0)
5627 ifindex = 1;
5628 if (!__dev_get_by_index(net, ifindex))
5629 return net->ifindex = ifindex;
5630 }
5631}
5632
5633/* Delayed registration/unregisteration */
5634static LIST_HEAD(net_todo_list);
5635DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5636
5637static void net_set_todo(struct net_device *dev)
5638{
5639 list_add_tail(&dev->todo_list, &net_todo_list);
5640 dev_net(dev)->dev_unreg_count++;
5641}
5642
5643static void rollback_registered_many(struct list_head *head)
5644{
5645 struct net_device *dev, *tmp;
5646 LIST_HEAD(close_head);
5647
5648 BUG_ON(dev_boot_phase);
5649 ASSERT_RTNL();
5650
5651 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5652 /* Some devices call without registering
5653 * for initialization unwind. Remove those
5654 * devices and proceed with the remaining.
5655 */
5656 if (dev->reg_state == NETREG_UNINITIALIZED) {
5657 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5658 dev->name, dev);
5659
5660 WARN_ON(1);
5661 list_del(&dev->unreg_list);
5662 continue;
5663 }
5664 dev->dismantle = true;
5665 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5666 }
5667
5668 /* If device is running, close it first. */
5669 list_for_each_entry(dev, head, unreg_list)
5670 list_add_tail(&dev->close_list, &close_head);
5671 dev_close_many(&close_head);
5672
5673 list_for_each_entry(dev, head, unreg_list) {
5674 /* And unlink it from device chain. */
5675 unlist_netdevice(dev);
5676
5677 dev->reg_state = NETREG_UNREGISTERING;
5678 }
5679
5680 synchronize_net();
5681
5682 list_for_each_entry(dev, head, unreg_list) {
5683 /* Shutdown queueing discipline. */
5684 dev_shutdown(dev);
5685
5686
5687 /* Notify protocols, that we are about to destroy
5688 this device. They should clean all the things.
5689 */
5690 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5691
5692 if (!dev->rtnl_link_ops ||
5693 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5694 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5695
5696 /*
5697 * Flush the unicast and multicast chains
5698 */
5699 dev_uc_flush(dev);
5700 dev_mc_flush(dev);
5701
5702 if (dev->netdev_ops->ndo_uninit)
5703 dev->netdev_ops->ndo_uninit(dev);
5704
5705 /* Notifier chain MUST detach us all upper devices. */
5706 WARN_ON(netdev_has_any_upper_dev(dev));
5707
5708 /* Remove entries from kobject tree */
5709 netdev_unregister_kobject(dev);
5710#ifdef CONFIG_XPS
5711 /* Remove XPS queueing entries */
5712 netif_reset_xps_queues_gt(dev, 0);
5713#endif
5714 }
5715
5716 synchronize_net();
5717
5718 list_for_each_entry(dev, head, unreg_list)
5719 dev_put(dev);
5720}
5721
5722static void rollback_registered(struct net_device *dev)
5723{
5724 LIST_HEAD(single);
5725
5726 list_add(&dev->unreg_list, &single);
5727 rollback_registered_many(&single);
5728 list_del(&single);
5729}
5730
5731static netdev_features_t netdev_fix_features(struct net_device *dev,
5732 netdev_features_t features)
5733{
5734 /* Fix illegal checksum combinations */
5735 if ((features & NETIF_F_HW_CSUM) &&
5736 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5737 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5738 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5739 }
5740
5741 /* TSO requires that SG is present as well. */
5742 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5743 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5744 features &= ~NETIF_F_ALL_TSO;
5745 }
5746
5747 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5748 !(features & NETIF_F_IP_CSUM)) {
5749 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5750 features &= ~NETIF_F_TSO;
5751 features &= ~NETIF_F_TSO_ECN;
5752 }
5753
5754 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5755 !(features & NETIF_F_IPV6_CSUM)) {
5756 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5757 features &= ~NETIF_F_TSO6;
5758 }
5759
5760 /* TSO ECN requires that TSO is present as well. */
5761 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5762 features &= ~NETIF_F_TSO_ECN;
5763
5764 /* Software GSO depends on SG. */
5765 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5766 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5767 features &= ~NETIF_F_GSO;
5768 }
5769
5770 /* UFO needs SG and checksumming */
5771 if (features & NETIF_F_UFO) {
5772 /* maybe split UFO into V4 and V6? */
5773 if (!((features & NETIF_F_GEN_CSUM) ||
5774 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5775 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5776 netdev_dbg(dev,
5777 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5778 features &= ~NETIF_F_UFO;
5779 }
5780
5781 if (!(features & NETIF_F_SG)) {
5782 netdev_dbg(dev,
5783 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5784 features &= ~NETIF_F_UFO;
5785 }
5786 }
5787
5788#ifdef CONFIG_NET_RX_BUSY_POLL
5789 if (dev->netdev_ops->ndo_busy_poll)
5790 features |= NETIF_F_BUSY_POLL;
5791 else
5792#endif
5793 features &= ~NETIF_F_BUSY_POLL;
5794
5795 return features;
5796}
5797
5798int __netdev_update_features(struct net_device *dev)
5799{
5800 netdev_features_t features;
5801 int err = 0;
5802
5803 ASSERT_RTNL();
5804
5805 features = netdev_get_wanted_features(dev);
5806
5807 if (dev->netdev_ops->ndo_fix_features)
5808 features = dev->netdev_ops->ndo_fix_features(dev, features);
5809
5810 /* driver might be less strict about feature dependencies */
5811 features = netdev_fix_features(dev, features);
5812
5813 if (dev->features == features)
5814 return 0;
5815
5816 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5817 &dev->features, &features);
5818
5819 if (dev->netdev_ops->ndo_set_features)
5820 err = dev->netdev_ops->ndo_set_features(dev, features);
5821
5822 if (unlikely(err < 0)) {
5823 netdev_err(dev,
5824 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5825 err, &features, &dev->features);
5826 return -1;
5827 }
5828
5829 if (!err)
5830 dev->features = features;
5831
5832 return 1;
5833}
5834
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	Any non-zero result of __netdev_update_features() triggers the
 *	notification.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5849
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is intentionally ignored: notify unconditionally. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5866
5867/**
5868 * netif_stacked_transfer_operstate - transfer operstate
5869 * @rootdev: the root or lower level device to transfer state from
5870 * @dev: the device to transfer operstate to
5871 *
5872 * Transfer operational state from root to device. This is normally
5873 * called when a stacking relationship exists between the root
5874 * device and the device(a leaf device).
5875 */
5876void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5877 struct net_device *dev)
5878{
5879 if (rootdev->operstate == IF_OPER_DORMANT)
5880 netif_dormant_on(dev);
5881 else
5882 netif_dormant_off(dev);
5883
5884 if (netif_carrier_ok(rootdev)) {
5885 if (!netif_carrier_ok(dev))
5886 netif_carrier_on(dev);
5887 } else {
5888 if (netif_carrier_ok(dev))
5889 netif_carrier_off(dev);
5890 }
5891}
5892EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5893
#ifdef CONFIG_SYSFS
/* Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * point each entry back at @dev and publish the array in dev->_rx.
 * Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nr < 1);

	queues = kcalloc(nr, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	for (idx = 0; idx < nr; idx++)
		queues[idx].dev = dev;

	dev->_rx = queues;
	return 0;
}
#endif
5913
5914static void netdev_init_one_queue(struct net_device *dev,
5915 struct netdev_queue *queue, void *_unused)
5916{
5917 /* Initialize queue lock */
5918 spin_lock_init(&queue->_xmit_lock);
5919 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5920 queue->xmit_lock_owner = -1;
5921 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5922 queue->dev = dev;
5923#ifdef CONFIG_BQL
5924 dql_init(&queue->dql, HZ);
5925#endif
5926}
5927
5928static void netif_free_tx_queues(struct net_device *dev)
5929{
5930 if (is_vmalloc_addr(dev->_tx))
5931 vfree(dev->_tx);
5932 else
5933 kfree(dev->_tx);
5934}
5935
5936static int netif_alloc_netdev_queues(struct net_device *dev)
5937{
5938 unsigned int count = dev->num_tx_queues;
5939 struct netdev_queue *tx;
5940 size_t sz = count * sizeof(*tx);
5941
5942 BUG_ON(count < 1 || count > 0xffff);
5943
5944 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5945 if (!tx) {
5946 tx = vzalloc(sz);
5947 if (!tx)
5948 return -ENOMEM;
5949 }
5950 dev->_tx = tx;
5951
5952 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5953 spin_lock_init(&dev->tx_global_lock);
5954
5955 return 0;
5956}
5957
5958/**
5959 * register_netdevice - register a network device
5960 * @dev: device to register
5961 *
5962 * Take a completed network device structure and add it to the kernel
5963 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5964 * chain. 0 is returned on success. A negative errno code is returned
5965 * on a failure to set up the device, or if the name is a duplicate.
5966 *
5967 * Callers must hold the rtnl semaphore. You may want
5968 * register_netdev() instead of this.
5969 *
5970 * BUGS:
5971 * The locking appears insufficient to guarantee two parallel registers
5972 * will not get the same name.
5973 */
5974
5975int register_netdevice(struct net_device *dev)
5976{
5977 int ret;
5978 struct net *net = dev_net(dev);
5979
5980 BUG_ON(dev_boot_phase);
5981 ASSERT_RTNL();
5982
5983 might_sleep();
5984
5985 /* When net_device's are persistent, this will be fatal. */
5986 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5987 BUG_ON(!net);
5988
5989 spin_lock_init(&dev->addr_list_lock);
5990 netdev_set_addr_lockdep_class(dev);
5991
5992 dev->iflink = -1;
5993
5994 ret = dev_get_valid_name(net, dev, dev->name);
5995 if (ret < 0)
5996 goto out;
5997
5998 /* Init, if this function is available */
5999 if (dev->netdev_ops->ndo_init) {
6000 ret = dev->netdev_ops->ndo_init(dev);
6001 if (ret) {
6002 if (ret > 0)
6003 ret = -EIO;
6004 goto out;
6005 }
6006 }
6007
6008 if (((dev->hw_features | dev->features) &
6009 NETIF_F_HW_VLAN_CTAG_FILTER) &&
6010 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6011 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6012 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6013 ret = -EINVAL;
6014 goto err_uninit;
6015 }
6016
6017 ret = -EBUSY;
6018 if (!dev->ifindex)
6019 dev->ifindex = dev_new_index(net);
6020 else if (__dev_get_by_index(net, dev->ifindex))
6021 goto err_uninit;
6022
6023 if (dev->iflink == -1)
6024 dev->iflink = dev->ifindex;
6025
6026 /* Transfer changeable features to wanted_features and enable
6027 * software offloads (GSO and GRO).
6028 */
6029 dev->hw_features |= NETIF_F_SOFT_FEATURES;
6030 dev->features |= NETIF_F_SOFT_FEATURES;
6031 dev->wanted_features = dev->features & dev->hw_features;
6032
6033 if (!(dev->flags & IFF_LOOPBACK)) {
6034 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6035 }
6036
6037 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6038 */
6039 dev->vlan_features |= NETIF_F_HIGHDMA;
6040
6041 /* Make NETIF_F_SG inheritable to tunnel devices.
6042 */
6043 dev->hw_enc_features |= NETIF_F_SG;
6044
6045 /* Make NETIF_F_SG inheritable to MPLS.
6046 */
6047 dev->mpls_features |= NETIF_F_SG;
6048
6049 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6050 ret = notifier_to_errno(ret);
6051 if (ret)
6052 goto err_uninit;
6053
6054 ret = netdev_register_kobject(dev);
6055 if (ret)
6056 goto err_uninit;
6057 dev->reg_state = NETREG_REGISTERED;
6058
6059 __netdev_update_features(dev);
6060
6061 /*
6062 * Default initial state at registry is that the
6063 * device is present.
6064 */
6065
6066 set_bit(__LINK_STATE_PRESENT, &dev->state);
6067
6068 linkwatch_init_dev(dev);
6069
6070 dev_init_scheduler(dev);
6071 dev_hold(dev);
6072 list_netdevice(dev);
6073 add_device_randomness(dev->dev_addr, dev->addr_len);
6074
6075 /* If the device has permanent device address, driver should
6076 * set dev_addr and also addr_assign_type should be set to
6077 * NET_ADDR_PERM (default value).
6078 */
6079 if (dev->addr_assign_type == NET_ADDR_PERM)
6080 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6081
6082 /* Notify protocols, that a new device appeared. */
6083 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6084 ret = notifier_to_errno(ret);
6085 if (ret) {
6086 rollback_registered(dev);
6087 dev->reg_state = NETREG_UNREGISTERED;
6088 }
6089 /*
6090 * Prevent userspace races by waiting until the network
6091 * device is fully setup before sending notifications.
6092 */
6093 if (!dev->rtnl_link_ops ||
6094 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6095 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6096
6097out:
6098 return ret;
6099
6100err_uninit:
6101 if (dev->netdev_ops->ndo_uninit)
6102 dev->netdev_ops->ndo_uninit(dev);
6103 goto out;
6104}
6105EXPORT_SYMBOL(register_netdevice);
6106
6107/**
6108 * init_dummy_netdev - init a dummy network device for NAPI
6109 * @dev: device to init
6110 *
6111 * This takes a network device structure and initialize the minimum
6112 * amount of fields so it can be used to schedule NAPI polls without
6113 * registering a full blown interface. This is to be used by drivers
6114 * that need to tie several hardware interfaces to a single NAPI
6115 * poll scheduler due to HW limitations.
6116 */
6117int init_dummy_netdev(struct net_device *dev)
6118{
6119 /* Clear everything. Note we don't initialize spinlocks
6120 * are they aren't supposed to be taken by any of the
6121 * NAPI code and this dummy netdev is supposed to be
6122 * only ever used for NAPI polls
6123 */
6124 memset(dev, 0, sizeof(struct net_device));
6125
6126 /* make sure we BUG if trying to hit standard
6127 * register/unregister code path
6128 */
6129 dev->reg_state = NETREG_DUMMY;
6130
6131 /* NAPI wants this */
6132 INIT_LIST_HEAD(&dev->napi_list);
6133
6134 /* a dummy interface is started by default */
6135 set_bit(__LINK_STATE_PRESENT, &dev->state);
6136 set_bit(__LINK_STATE_START, &dev->state);
6137
6138 /* Note : We dont allocate pcpu_refcnt for dummy devices,
6139 * because users of this 'device' dont need to change
6140 * its refcount.
6141 */
6142
6143 return 0;
6144}
6145EXPORT_SYMBOL_GPL(init_dummy_netdev);
6146
6147
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
6171
6172int netdev_refcnt_read(const struct net_device *dev)
6173{
6174 int i, refcnt = 0;
6175
6176 for_each_possible_cpu(i)
6177 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6178 return refcnt;
6179}
6180EXPORT_SYMBOL(netdev_refcnt_read);
6181
6182/**
6183 * netdev_wait_allrefs - wait until all references are gone.
6184 * @dev: target net_device
6185 *
6186 * This is called when unregistering network devices.
6187 *
6188 * Any protocol or device that holds a reference should register
6189 * for netdevice notification, and cleanup and put back the
6190 * reference if they receive an UNREGISTER event.
6191 * We can get stuck here if buggy protocols don't correctly
6192 * call dev_put.
6193 */
6194static void netdev_wait_allrefs(struct net_device *dev)
6195{
6196 unsigned long rebroadcast_time, warning_time;
6197 int refcnt;
6198
6199 linkwatch_forget_dev(dev);
6200
6201 rebroadcast_time = warning_time = jiffies;
6202 refcnt = netdev_refcnt_read(dev);
6203
6204 while (refcnt != 0) {
6205 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6206 rtnl_lock();
6207
6208 /* Rebroadcast unregister notification */
6209 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6210
6211 __rtnl_unlock();
6212 rcu_barrier();
6213 rtnl_lock();
6214
6215 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6216 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6217 &dev->state)) {
6218 /* We must not have linkwatch events
6219 * pending on unregister. If this
6220 * happens, we simply run the queue
6221 * unscheduled, resulting in a noop
6222 * for this device.
6223 */
6224 linkwatch_run_queue();
6225 }
6226
6227 __rtnl_unlock();
6228
6229 rebroadcast_time = jiffies;
6230 }
6231
6232 msleep(250);
6233
6234 refcnt = netdev_refcnt_read(dev);
6235
6236 if (time_after(jiffies, warning_time + 10 * HZ)) {
6237 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6238 dev->name, refcnt);
6239 warning_time = jiffies;
6240 }
6241 }
6242}
6243
6244/* The sequence is:
6245 *
6246 * rtnl_lock();
6247 * ...
6248 * register_netdevice(x1);
6249 * register_netdevice(x2);
6250 * ...
6251 * unregister_netdevice(y1);
6252 * unregister_netdevice(y2);
6253 * ...
6254 * rtnl_unlock();
6255 * free_netdev(y1);
6256 * free_netdev(y2);
6257 *
6258 * We are invoked by rtnl_unlock().
6259 * This allows us to deal with problems:
6260 * 1) We can delete sysfs objects which invoke hotplug
6261 * without deadlocking with linkwatch via keventd.
6262 * 2) Since we run with the RTNL semaphore not held, we can sleep
6263 * safely in order to wait for the netdev refcnt to drop to zero.
6264 *
6265 * We must not return until all unregister events added during
6266 * the interval the lock was held have been completed.
6267 */
6268void netdev_run_todo(void)
6269{
6270 struct list_head list;
6271
6272 /* Snapshot list, allow later requests */
6273 list_replace_init(&net_todo_list, &list);
6274
6275 __rtnl_unlock();
6276
6277
6278 /* Wait for rcu callbacks to finish before next phase */
6279 if (!list_empty(&list))
6280 rcu_barrier();
6281
6282 while (!list_empty(&list)) {
6283 struct net_device *dev
6284 = list_first_entry(&list, struct net_device, todo_list);
6285 list_del(&dev->todo_list);
6286
6287 rtnl_lock();
6288 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6289 __rtnl_unlock();
6290
6291 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6292 pr_err("network todo '%s' but state %d\n",
6293 dev->name, dev->reg_state);
6294 dump_stack();
6295 continue;
6296 }
6297
6298 dev->reg_state = NETREG_UNREGISTERED;
6299
6300 on_each_cpu(flush_backlog, dev, 1);
6301
6302 netdev_wait_allrefs(dev);
6303
6304 /* paranoia */
6305 BUG_ON(netdev_refcnt_read(dev));
6306 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6307 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6308 WARN_ON(dev->dn_ptr);
6309
6310 if (dev->destructor)
6311 dev->destructor(dev);
6312
6313 /* Report a network device has been unregistered */
6314 rtnl_lock();
6315 dev_net(dev)->dev_unreg_count--;
6316 __rtnl_unlock();
6317 wake_up(&netdev_unregistering_wq);
6318
6319 /* Free network device */
6320 kobject_put(&dev->dev.kobj);
6321 }
6322}
6323
6324/* Convert net_device_stats to rtnl_link_stats64. They have the same
6325 * fields in the same order, with only the type differing.
6326 */
6327void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6328 const struct net_device_stats *netdev_stats)
6329{
6330#if BITS_PER_LONG == 64
6331 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6332 memcpy(stats64, netdev_stats, sizeof(*stats64));
6333#else
6334 size_t i, n = sizeof(*stats64) / sizeof(u64);
6335 const unsigned long *src = (const unsigned long *)netdev_stats;
6336 u64 *dst = (u64 *)stats64;
6337
6338 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6339 sizeof(*stats64) / sizeof(u64));
6340 for (i = 0; i < n; i++)
6341 dst[i] = src[i];
6342#endif
6343}
6344EXPORT_SYMBOL(netdev_stats_to_stats64);
6345
6346/**
6347 * dev_get_stats - get network device statistics
6348 * @dev: device to get statistics from
6349 * @storage: place to store stats
6350 *
6351 * Get network statistics from device. Return @storage.
6352 * The device driver may provide its own method by setting
6353 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6354 * otherwise the internal statistics structure is used.
6355 */
6356struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6357 struct rtnl_link_stats64 *storage)
6358{
6359 const struct net_device_ops *ops = dev->netdev_ops;
6360
6361 if (ops->ndo_get_stats64) {
6362 memset(storage, 0, sizeof(*storage));
6363 ops->ndo_get_stats64(dev, storage);
6364 } else if (ops->ndo_get_stats) {
6365 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6366 } else {
6367 netdev_stats_to_stats64(storage, &dev->stats);
6368 }
6369 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6370 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6371 return storage;
6372}
6373EXPORT_SYMBOL(dev_get_stats);
6374
/* Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6392
6393static const struct ethtool_ops default_ethtool_ops;
6394
6395void netdev_set_default_ethtool_ops(struct net_device *dev,
6396 const struct ethtool_ops *ops)
6397{
6398 if (dev->ethtool_ops == &default_ethtool_ops)
6399 dev->ethtool_ops = ops;
6400}
6401EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6402
6403void netdev_freemem(struct net_device *dev)
6404{
6405 char *addr = (char *)dev - dev->padded;
6406
6407 if (is_vmalloc_addr(addr))
6408 vfree(addr);
6409 else
6410 kfree(addr);
6411}
6412
6413/**
6414 * alloc_netdev_mqs - allocate network device
6415 * @sizeof_priv: size of private data to allocate space for
6416 * @name: device name format string
6417 * @setup: callback to initialize device
6418 * @txqs: the number of TX subqueues to allocate
6419 * @rxqs: the number of RX subqueues to allocate
6420 *
6421 * Allocates a struct net_device with private data area for driver use
6422 * and performs basic initialization. Also allocates subqueue structs
6423 * for each queue on the device.
6424 */
6425struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6426 void (*setup)(struct net_device *),
6427 unsigned int txqs, unsigned int rxqs)
6428{
6429 struct net_device *dev;
6430 size_t alloc_size;
6431 struct net_device *p;
6432
6433 BUG_ON(strlen(name) >= sizeof(dev->name));
6434
6435 if (txqs < 1) {
6436 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6437 return NULL;
6438 }
6439
6440#ifdef CONFIG_SYSFS
6441 if (rxqs < 1) {
6442 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6443 return NULL;
6444 }
6445#endif
6446
6447 alloc_size = sizeof(struct net_device);
6448 if (sizeof_priv) {
6449 /* ensure 32-byte alignment of private area */
6450 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6451 alloc_size += sizeof_priv;
6452 }
6453 /* ensure 32-byte alignment of whole construct */
6454 alloc_size += NETDEV_ALIGN - 1;
6455
6456 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6457 if (!p)
6458 p = vzalloc(alloc_size);
6459 if (!p)
6460 return NULL;
6461
6462 dev = PTR_ALIGN(p, NETDEV_ALIGN);
6463 dev->padded = (char *)dev - (char *)p;
6464
6465 dev->pcpu_refcnt = alloc_percpu(int);
6466 if (!dev->pcpu_refcnt)
6467 goto free_dev;
6468
6469 if (dev_addr_init(dev))
6470 goto free_pcpu;
6471
6472 dev_mc_init(dev);
6473 dev_uc_init(dev);
6474
6475 dev_net_set(dev, &init_net);
6476
6477 dev->gso_max_size = GSO_MAX_SIZE;
6478 dev->gso_max_segs = GSO_MAX_SEGS;
6479
6480 INIT_LIST_HEAD(&dev->napi_list);
6481 INIT_LIST_HEAD(&dev->unreg_list);
6482 INIT_LIST_HEAD(&dev->close_list);
6483 INIT_LIST_HEAD(&dev->link_watch_list);
6484 INIT_LIST_HEAD(&dev->adj_list.upper);
6485 INIT_LIST_HEAD(&dev->adj_list.lower);
6486 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6487 INIT_LIST_HEAD(&dev->all_adj_list.lower);
6488 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6489 setup(dev);
6490
6491 dev->num_tx_queues = txqs;
6492 dev->real_num_tx_queues = txqs;
6493 if (netif_alloc_netdev_queues(dev))
6494 goto free_all;
6495
6496#ifdef CONFIG_SYSFS
6497 dev->num_rx_queues = rxqs;
6498 dev->real_num_rx_queues = rxqs;
6499 if (netif_alloc_rx_queues(dev))
6500 goto free_all;
6501#endif
6502
6503 strcpy(dev->name, name);
6504 dev->group = INIT_NETDEV_GROUP;
6505 if (!dev->ethtool_ops)
6506 dev->ethtool_ops = &default_ethtool_ops;
6507 return dev;
6508
6509free_all:
6510 free_netdev(dev);
6511 return NULL;
6512
6513free_pcpu:
6514 free_percpu(dev->pcpu_refcnt);
6515 netif_free_tx_queues(dev);
6516#ifdef CONFIG_SYSFS
6517 kfree(dev->_rx);
6518#endif
6519
6520free_dev:
6521 netdev_freemem(dev);
6522 return NULL;
6523}
6524EXPORT_SYMBOL(alloc_netdev_mqs);
6525
6526/**
6527 * free_netdev - free network device
6528 * @dev: device
6529 *
6530 * This function does the last stage of destroying an allocated device
6531 * interface. The reference to the device object is released.
6532 * If this is the last reference then it will be freed.
6533 */
6534void free_netdev(struct net_device *dev)
6535{
6536 struct napi_struct *p, *n;
6537
6538 release_net(dev_net(dev));
6539
6540 netif_free_tx_queues(dev);
6541#ifdef CONFIG_SYSFS
6542 kfree(dev->_rx);
6543#endif
6544
6545 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6546
6547 /* Flush device addresses */
6548 dev_addr_flush(dev);
6549
6550 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6551 netif_napi_del(p);
6552
6553 free_percpu(dev->pcpu_refcnt);
6554 dev->pcpu_refcnt = NULL;
6555
6556 /* Compatibility with error handling in drivers */
6557 if (dev->reg_state == NETREG_UNINITIALIZED) {
6558 netdev_freemem(dev);
6559 return;
6560 }
6561
6562 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6563 dev->reg_state = NETREG_RELEASED;
6564
6565 /* will free via device release */
6566 put_device(&dev->dev);
6567}
6568EXPORT_SYMBOL(free_netdev);
6569
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used while the RTNL is locked;
 *	otherwise a normal grace period is waited for. May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}
	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6585
6586/**
6587 * unregister_netdevice_queue - remove device from the kernel
6588 * @dev: device
6589 * @head: list
6590 *
6591 * This function shuts down a device interface and removes it
6592 * from the kernel tables.
6593 * If head not NULL, device is queued to be unregistered later.
6594 *
6595 * Callers must hold the rtnl semaphore. You may want
6596 * unregister_netdev() instead of this.
6597 */
6598
6599void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6600{
6601 ASSERT_RTNL();
6602
6603 if (head) {
6604 list_move_tail(&dev->unreg_list, head);
6605 } else {
6606 rollback_registered(dev);
6607 /* Finish processing unregister after unlock */
6608 net_set_todo(dev);
6609 }
6610}
6611EXPORT_SYMBOL(unregister_netdevice_queue);
6612
6613/**
6614 * unregister_netdevice_many - unregister many devices
6615 * @head: list of devices
6616 */
6617void unregister_netdevice_many(struct list_head *head)
6618{
6619 struct net_device *dev;
6620
6621 if (!list_empty(head)) {
6622 rollback_registered_many(head);
6623 list_for_each_entry(dev, head, unreg_list)
6624 net_set_todo(dev);
6625 }
6626}
6627EXPORT_SYMBOL(unregister_netdevice_many);
6628
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6647
6648/**
6649 * dev_change_net_namespace - move device to different nethost namespace
6650 * @dev: device
6651 * @net: network namespace
6652 * @pat: If not NULL name pattern to try if the current device name
6653 * is already taken in the destination network namespace.
6654 *
6655 * This function shuts down a device interface and moves it
6656 * to a new network namespace. On success 0 is returned, on
6657 * a failure a netagive errno code is returned.
6658 *
6659 * Callers must hold the rtnl semaphore.
6660 */
6661
6662int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6663{
6664 int err;
6665
6666 ASSERT_RTNL();
6667
6668 /* Don't allow namespace local devices to be moved. */
6669 err = -EINVAL;
6670 if (dev->features & NETIF_F_NETNS_LOCAL)
6671 goto out;
6672
6673 /* Ensure the device has been registrered */
6674 if (dev->reg_state != NETREG_REGISTERED)
6675 goto out;
6676
6677 /* Get out if there is nothing todo */
6678 err = 0;
6679 if (net_eq(dev_net(dev), net))
6680 goto out;
6681
6682 /* Pick the destination device name, and ensure
6683 * we can use it in the destination network namespace.
6684 */
6685 err = -EEXIST;
6686 if (__dev_get_by_name(net, dev->name)) {
6687 /* We get here if we can't use the current device name */
6688 if (!pat)
6689 goto out;
6690 if (dev_get_valid_name(net, dev, pat) < 0)
6691 goto out;
6692 }
6693
6694 /*
6695 * And now a mini version of register_netdevice unregister_netdevice.
6696 */
6697
6698 /* If device is running close it first. */
6699 dev_close(dev);
6700
6701 /* And unlink it from device chain */
6702 err = -ENODEV;
6703 unlist_netdevice(dev);
6704
6705 synchronize_net();
6706
6707 /* Shutdown queueing discipline. */
6708 dev_shutdown(dev);
6709
6710 /* Notify protocols, that we are about to destroy
6711 this device. They should clean all the things.
6712
6713 Note that dev->reg_state stays at NETREG_REGISTERED.
6714 This is wanted because this way 8021q and macvlan know
6715 the device is just moving and can keep their slaves up.
6716 */
6717 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6718 rcu_barrier();
6719 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6720 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6721
6722 /*
6723 * Flush the unicast and multicast chains
6724 */
6725 dev_uc_flush(dev);
6726 dev_mc_flush(dev);
6727
6728 /* Send a netdev-removed uevent to the old namespace */
6729 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6730
6731 /* Actually switch the network namespace */
6732 dev_net_set(dev, net);
6733
6734 /* If there is an ifindex conflict assign a new one */
6735 if (__dev_get_by_index(net, dev->ifindex)) {
6736 int iflink = (dev->iflink == dev->ifindex);
6737 dev->ifindex = dev_new_index(net);
6738 if (iflink)
6739 dev->iflink = dev->ifindex;
6740 }
6741
6742 /* Send a netdev-add uevent to the new namespace */
6743 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6744
6745 /* Fixup kobjects */
6746 err = device_rename(&dev->dev, dev->name);
6747 WARN_ON(err);
6748
6749 /* Add the device back in the hashes */
6750 list_netdevice(dev);
6751
6752 /* Notify protocols, that a new device appeared. */
6753 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6754
6755 /*
6756 * Prevent userspace races by waiting until the network
6757 * device is fully setup before sending notifications.
6758 */
6759 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6760
6761 synchronize_net();
6762 err = 0;
6763out:
6764 return err;
6765}
6766EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6767
6768static int dev_cpu_callback(struct notifier_block *nfb,
6769 unsigned long action,
6770 void *ocpu)
6771{
6772 struct sk_buff **list_skb;
6773 struct sk_buff *skb;
6774 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6775 struct softnet_data *sd, *oldsd;
6776
6777 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6778 return NOTIFY_OK;
6779
6780 local_irq_disable();
6781 cpu = smp_processor_id();
6782 sd = &per_cpu(softnet_data, cpu);
6783 oldsd = &per_cpu(softnet_data, oldcpu);
6784
6785 /* Find end of our completion_queue. */
6786 list_skb = &sd->completion_queue;
6787 while (*list_skb)
6788 list_skb = &(*list_skb)->next;
6789 /* Append completion queue from offline CPU. */
6790 *list_skb = oldsd->completion_queue;
6791 oldsd->completion_queue = NULL;
6792
6793 /* Append output queue from offline CPU. */
6794 if (oldsd->output_queue) {
6795 *sd->output_queue_tailp = oldsd->output_queue;
6796 sd->output_queue_tailp = oldsd->output_queue_tailp;
6797 oldsd->output_queue = NULL;
6798 oldsd->output_queue_tailp = &oldsd->output_queue;
6799 }
6800 /* Append NAPI poll list from offline CPU. */
6801 if (!list_empty(&oldsd->poll_list)) {
6802 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6803 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6804 }
6805
6806 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6807 local_irq_enable();
6808
6809 /* Process offline CPU's input_pkt_queue */
6810 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6811 netif_rx_internal(skb);
6812 input_queue_head_incr(oldsd);
6813 }
6814 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6815 netif_rx_internal(skb);
6816 input_queue_head_incr(oldsd);
6817 }
6818
6819 return NOTIFY_OK;
6820}
6821
6822
6823/**
6824 * netdev_increment_features - increment feature set by one
6825 * @all: current feature set
6826 * @one: new feature set
6827 * @mask: mask feature set
6828 *
6829 * Computes a new feature set after adding a device with feature set
6830 * @one to the master device with current feature set @all. Will not
6831 * enable anything that is off in @mask. Returns the new feature set.
6832 */
6833netdev_features_t netdev_increment_features(netdev_features_t all,
6834 netdev_features_t one, netdev_features_t mask)
6835{
6836 if (mask & NETIF_F_GEN_CSUM)
6837 mask |= NETIF_F_ALL_CSUM;
6838 mask |= NETIF_F_VLAN_CHALLENGED;
6839
6840 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6841 all &= one | ~NETIF_F_ALL_FOR_ALL;
6842
6843 /* If one device supports hw checksumming, set for all. */
6844 if (all & NETIF_F_GEN_CSUM)
6845 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6846
6847 return all;
6848}
6849EXPORT_SYMBOL(netdev_increment_features);
6850
6851static struct hlist_head * __net_init netdev_create_hash(void)
6852{
6853 int i;
6854 struct hlist_head *hash;
6855
6856 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6857 if (hash != NULL)
6858 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6859 INIT_HLIST_HEAD(&hash[i]);
6860
6861 return hash;
6862}
6863
6864/* Initialize per network namespace state */
6865static int __net_init netdev_init(struct net *net)
6866{
6867 if (net != &init_net)
6868 INIT_LIST_HEAD(&net->dev_base_head);
6869
6870 net->dev_name_head = netdev_create_hash();
6871 if (net->dev_name_head == NULL)
6872 goto err_name;
6873
6874 net->dev_index_head = netdev_create_hash();
6875 if (net->dev_index_head == NULL)
6876 goto err_idx;
6877
6878 return 0;
6879
6880err_idx:
6881 kfree(net->dev_name_head);
6882err_name:
6883 return -ENOMEM;
6884}
6885
6886/**
6887 * netdev_drivername - network driver for the device
6888 * @dev: network device
6889 *
6890 * Determine network driver for device.
6891 */
6892const char *netdev_drivername(const struct net_device *dev)
6893{
6894 const struct device_driver *driver;
6895 const struct device *parent;
6896 const char *empty = "";
6897
6898 parent = dev->dev.parent;
6899 if (!parent)
6900 return empty;
6901
6902 driver = parent->driver;
6903 if (driver && driver->name)
6904 return driver->name;
6905 return empty;
6906}
6907
6908static int __netdev_printk(const char *level, const struct net_device *dev,
6909 struct va_format *vaf)
6910{
6911 int r;
6912
6913 if (dev && dev->dev.parent) {
6914 r = dev_printk_emit(level[1] - '0',
6915 dev->dev.parent,
6916 "%s %s %s: %pV",
6917 dev_driver_string(dev->dev.parent),
6918 dev_name(dev->dev.parent),
6919 netdev_name(dev), vaf);
6920 } else if (dev) {
6921 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6922 } else {
6923 r = printk("%s(NULL net_device): %pV", level, vaf);
6924 }
6925
6926 return r;
6927}
6928
6929int netdev_printk(const char *level, const struct net_device *dev,
6930 const char *format, ...)
6931{
6932 struct va_format vaf;
6933 va_list args;
6934 int r;
6935
6936 va_start(args, format);
6937
6938 vaf.fmt = format;
6939 vaf.va = &args;
6940
6941 r = __netdev_printk(level, dev, &vaf);
6942
6943 va_end(args);
6944
6945 return r;
6946}
6947EXPORT_SYMBOL(netdev_printk);
6948
6949#define define_netdev_printk_level(func, level) \
6950int func(const struct net_device *dev, const char *fmt, ...) \
6951{ \
6952 int r; \
6953 struct va_format vaf; \
6954 va_list args; \
6955 \
6956 va_start(args, fmt); \
6957 \
6958 vaf.fmt = fmt; \
6959 vaf.va = &args; \
6960 \
6961 r = __netdev_printk(level, dev, &vaf); \
6962 \
6963 va_end(args); \
6964 \
6965 return r; \
6966} \
6967EXPORT_SYMBOL(func);
6968
6969define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6970define_netdev_printk_level(netdev_alert, KERN_ALERT);
6971define_netdev_printk_level(netdev_crit, KERN_CRIT);
6972define_netdev_printk_level(netdev_err, KERN_ERR);
6973define_netdev_printk_level(netdev_warn, KERN_WARNING);
6974define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6975define_netdev_printk_level(netdev_info, KERN_INFO);
6976
6977static void __net_exit netdev_exit(struct net *net)
6978{
6979 kfree(net->dev_name_head);
6980 kfree(net->dev_index_head);
6981}
6982
/*
 * Per network namespace hooks for the core device state; registered from
 * net_dev_init() with register_pernet_subsys().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
6987
6988static void __net_exit default_device_exit(struct net *net)
6989{
6990 struct net_device *dev, *aux;
6991 /*
6992 * Push all migratable network devices back to the
6993 * initial network namespace
6994 */
6995 rtnl_lock();
6996 for_each_netdev_safe(net, dev, aux) {
6997 int err;
6998 char fb_name[IFNAMSIZ];
6999
7000 /* Ignore unmoveable devices (i.e. loopback) */
7001 if (dev->features & NETIF_F_NETNS_LOCAL)
7002 continue;
7003
7004 /* Leave virtual devices for the generic cleanup */
7005 if (dev->rtnl_link_ops)
7006 continue;
7007
7008 /* Push remaining network devices to init_net */
7009 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7010 err = dev_change_net_namespace(dev, &init_net, fb_name);
7011 if (err) {
7012 pr_emerg("%s: failed to move %s to init_net: %d\n",
7013 __func__, dev->name, err);
7014 BUG();
7015 }
7016 }
7017 rtnl_unlock();
7018}
7019
7020static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7021{
7022 /* Return with the rtnl_lock held when there are no network
7023 * devices unregistering in any network namespace in net_list.
7024 */
7025 struct net *net;
7026 bool unregistering;
7027 DEFINE_WAIT(wait);
7028
7029 for (;;) {
7030 prepare_to_wait(&netdev_unregistering_wq, &wait,
7031 TASK_UNINTERRUPTIBLE);
7032 unregistering = false;
7033 rtnl_lock();
7034 list_for_each_entry(net, net_list, exit_list) {
7035 if (net->dev_unreg_count > 0) {
7036 unregistering = true;
7037 break;
7038 }
7039 }
7040 if (!unregistering)
7041 break;
7042 __rtnl_unlock();
7043 schedule();
7044 }
7045 finish_wait(&netdev_unregistering_wq, &wait);
7046}
7047
7048static void __net_exit default_device_exit_batch(struct list_head *net_list)
7049{
7050 /* At exit all network devices most be removed from a network
7051 * namespace. Do this in the reverse order of registration.
7052 * Do this across as many network namespaces as possible to
7053 * improve batching efficiency.
7054 */
7055 struct net_device *dev;
7056 struct net *net;
7057 LIST_HEAD(dev_kill_list);
7058
7059 /* To prevent network device cleanup code from dereferencing
7060 * loopback devices or network devices that have been freed
7061 * wait here for all pending unregistrations to complete,
7062 * before unregistring the loopback device and allowing the
7063 * network namespace be freed.
7064 *
7065 * The netdev todo list containing all network devices
7066 * unregistrations that happen in default_device_exit_batch
7067 * will run in the rtnl_unlock() at the end of
7068 * default_device_exit_batch.
7069 */
7070 rtnl_lock_unregistering(net_list);
7071 list_for_each_entry(net, net_list, exit_list) {
7072 for_each_netdev_reverse(net, dev) {
7073 if (dev->rtnl_link_ops)
7074 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7075 else
7076 unregister_netdevice_queue(dev, &dev_kill_list);
7077 }
7078 }
7079 unregister_netdevice_many(&dev_kill_list);
7080 list_del(&dev_kill_list);
7081 rtnl_unlock();
7082}
7083
/*
 * Namespace exit hooks that move or unregister the devices of a dying
 * namespace; registered from net_dev_init() with register_pernet_device().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
7088
7089/*
7090 * Initialize the DEV module. At boot time this walks the device list and
7091 * unhooks any devices that fail to initialise (normally hardware not
7092 * present) and leaves us with a valid list of present and active devices.
7093 *
7094 */
7095
7096/*
7097 * This is called single threaded during boot, so no need
7098 * to take the rtnl semaphore.
7099 */
7100static int __init net_dev_init(void)
7101{
7102 int i, rc = -ENOMEM;
7103
7104 BUG_ON(!dev_boot_phase);
7105
7106 if (dev_proc_init())
7107 goto out;
7108
7109 if (netdev_kobject_init())
7110 goto out;
7111
7112 INIT_LIST_HEAD(&ptype_all);
7113 for (i = 0; i < PTYPE_HASH_SIZE; i++)
7114 INIT_LIST_HEAD(&ptype_base[i]);
7115
7116 INIT_LIST_HEAD(&offload_base);
7117
7118 if (register_pernet_subsys(&netdev_net_ops))
7119 goto out;
7120
7121 /*
7122 * Initialise the packet receive queues.
7123 */
7124
7125 for_each_possible_cpu(i) {
7126 struct softnet_data *sd = &per_cpu(softnet_data, i);
7127
7128 skb_queue_head_init(&sd->input_pkt_queue);
7129 skb_queue_head_init(&sd->process_queue);
7130 INIT_LIST_HEAD(&sd->poll_list);
7131 sd->output_queue_tailp = &sd->output_queue;
7132#ifdef CONFIG_RPS
7133 sd->csd.func = rps_trigger_softirq;
7134 sd->csd.info = sd;
7135 sd->cpu = i;
7136#endif
7137
7138 sd->backlog.poll = process_backlog;
7139 sd->backlog.weight = weight_p;
7140 }
7141
7142 dev_boot_phase = 0;
7143
7144 /* The loopback device is special if any other network devices
7145 * is present in a network namespace the loopback device must
7146 * be present. Since we now dynamically allocate and free the
7147 * loopback device ensure this invariant is maintained by
7148 * keeping the loopback device as the first device on the
7149 * list of network devices. Ensuring the loopback devices
7150 * is the first device that appears and the last network device
7151 * that disappears.
7152 */
7153 if (register_pernet_device(&loopback_net_ops))
7154 goto out;
7155
7156 if (register_pernet_device(&default_device_ops))
7157 goto out;
7158
7159 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7160 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7161
7162 hotcpu_notifier(dev_cpu_callback, 0);
7163 dst_init();
7164 rc = 0;
7165out:
7166 return rc;
7167}
7168
7169subsys_initcall(net_dev_init);
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
100#include <linux/proc_fs.h>
101#include <linux/seq_file.h>
102#include <linux/stat.h>
103#include <net/dst.h>
104#include <net/pkt_sched.h>
105#include <net/checksum.h>
106#include <net/xfrm.h>
107#include <linux/highmem.h>
108#include <linux/init.h>
109#include <linux/kmod.h>
110#include <linux/module.h>
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
114#include <net/wext.h>
115#include <net/iw_handler.h>
116#include <asm/current.h>
117#include <linux/audit.h>
118#include <linux/dmaengine.h>
119#include <linux/err.h>
120#include <linux/ctype.h>
121#include <linux/if_arp.h>
122#include <linux/if_vlan.h>
123#include <linux/ip.h>
124#include <net/ip.h>
125#include <linux/ipv6.h>
126#include <linux/in.h>
127#include <linux/jhash.h>
128#include <linux/random.h>
129#include <trace/events/napi.h>
130#include <trace/events/net.h>
131#include <trace/events/skb.h>
132#include <linux/pci.h>
133#include <linux/inetdevice.h>
134#include <linux/cpu_rmap.h>
135#include <linux/net_tstamp.h>
136#include <linux/static_key.h>
137#include <net/flow_keys.h>
138
139#include "net-sysfs.h"
140
141/* Instead of increasing this, you should create a hash table. */
142#define MAX_GRO_SKBS 8
143
144/* This should be increased if a protocol with a bigger head is added. */
145#define GRO_MAX_HEAD (MAX_HEADER + 128)
146
147/*
148 * The list of packet types we will receive (as opposed to discard)
149 * and the routines to invoke.
150 *
151 * Why 16. Because with 16 the only overlap we get on a hash of the
152 * low nibble of the protocol value is RARP/SNAP/X.25.
153 *
154 * NOTE: That is no longer true with the addition of VLAN tags. Not
155 * sure which should go first, but I bet it won't make much
156 * difference if we are running VLANs. The good news is that
157 * this protocol won't be in the list unless compiled in, so
158 * the average user (w/out VLANs) will not be adversely affected.
159 * --BLG
160 *
161 * 0800 IP
162 * 8100 802.1Q VLAN
163 * 0001 802.3
164 * 0002 AX.25
165 * 0004 802.2
166 * 8035 RARP
167 * 0005 SNAP
168 * 0805 X.25
169 * 0806 ARP
170 * 8137 IPX
171 * 0009 Localtalk
172 * 86DD IPv6
173 */
174
/* Number of ptype_base buckets; PTYPE_HASH_MASK selects one of them. */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Held by dev_add_pack() and __dev_remove_pack() while they edit the lists. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for one protocol each, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
static struct list_head ptype_all __read_mostly; /* Taps */
181
182/*
183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184 * semaphore.
185 *
186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 *
188 * Writers must hold the rtnl semaphore while they loop through the
189 * dev_base_head list, and hold dev_base_lock for writing when they do the
190 * actual updates. This allows pure readers to access the list even
191 * while a writer is preparing to update it.
192 *
193 * To put it another way, dev_base_lock is held for writing only to
194 * protect against pure readers; the rtnl semaphore provides the
195 * protection against other writers.
196 *
197 * See, for example usages, register_netdevice() and
198 * unregister_netdevice(), which must be called with the rtnl
199 * semaphore held.
200 */
201DEFINE_RWLOCK(dev_base_lock);
202EXPORT_SYMBOL(dev_base_lock);
203
204static inline void dev_base_seq_inc(struct net *net)
205{
206 while (++net->dev_base_seq == 0);
207}
208
209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210{
211 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212
213 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214}
215
216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217{
218 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219}
220
/*
 * Take the spinlock of @sd's input_pkt_queue.  Compiles to nothing unless
 * CONFIG_RPS is set.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

/*
 * Release the spinlock taken by rps_lock().  Compiles to nothing unless
 * CONFIG_RPS is set.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
234
235/* Device list insertion */
236static int list_netdevice(struct net_device *dev)
237{
238 struct net *net = dev_net(dev);
239
240 ASSERT_RTNL();
241
242 write_lock_bh(&dev_base_lock);
243 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
244 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
245 hlist_add_head_rcu(&dev->index_hlist,
246 dev_index_hash(net, dev->ifindex));
247 write_unlock_bh(&dev_base_lock);
248
249 dev_base_seq_inc(net);
250
251 return 0;
252}
253
254/* Device list removal
255 * caller must respect a RCU grace period before freeing/reusing dev
256 */
257static void unlist_netdevice(struct net_device *dev)
258{
259 ASSERT_RTNL();
260
261 /* Unlink dev from the device chain */
262 write_lock_bh(&dev_base_lock);
263 list_del_rcu(&dev->dev_list);
264 hlist_del_rcu(&dev->name_hlist);
265 hlist_del_rcu(&dev->index_hlist);
266 write_unlock_bh(&dev_base_lock);
267
268 dev_base_seq_inc(dev_net(dev));
269}
270
271/*
272 * Our notifier list
273 */
274
275static RAW_NOTIFIER_HEAD(netdev_chain);
276
277/*
278 * Device drivers call our routines to queue packets here. We empty the
279 * queue in the local softnet handler.
280 */
281
282DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
283EXPORT_PER_CPU_SYMBOL(softnet_data);
284
285#ifdef CONFIG_LOCKDEP
286/*
287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
288 * according to dev->type
289 */
/*
 * Device types that get a lockdep class of their own.  netdev_lock_pos()
 * maps a type to its index here; any type not listed ends up on the last
 * entry.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/*
 * Lock class names, indexed exactly like netdev_lock_type[]: entry i names
 * the class of the device type at netdev_lock_type[i], so the two tables
 * have to be kept in step.
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table entry, for the xmit and address list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
326
327static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328{
329 int i;
330
331 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332 if (netdev_lock_type[i] == dev_type)
333 return i;
334 /* the last key is used by default */
335 return ARRAY_SIZE(netdev_lock_type) - 1;
336}
337
338static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 unsigned short dev_type)
340{
341 int i;
342
343 i = netdev_lock_pos(dev_type);
344 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345 netdev_lock_name[i]);
346}
347
348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349{
350 int i;
351
352 i = netdev_lock_pos(dev->type);
353 lockdep_set_class_and_name(&dev->addr_list_lock,
354 &netdev_addr_lock_key[i],
355 netdev_lock_name[i]);
356}
357#else
/* Without CONFIG_LOCKDEP there are no lock classes to set: empty stubs. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
365#endif
366
367/*******************************************************************************
368
369 Protocol management and registration routines
370
371*******************************************************************************/
372
373/*
374 * Add a protocol ID to the list. Now that the input handler is
375 * smarter we can dispense with all the messy stuff that used to be
376 * here.
377 *
378 * BEWARE!!! Protocol handlers, mangling input packets,
379 * MUST BE last in hash buckets and checking protocol handlers
380 * MUST start from promiscuous ptype_all chain in net_bh.
381 * It is true now, do not change it.
382 * Explanation follows: if protocol handler, mangling packet, will
383 * be the first on list, it is not able to sense, that packet
384 * is cloned and should be copied-on-write, so that it will
385 * change it and subsequent readers will get broken packet.
386 * --ANK (980803)
387 */
388
389static inline struct list_head *ptype_head(const struct packet_type *pt)
390{
391 if (pt->type == htons(ETH_P_ALL))
392 return &ptype_all;
393 else
394 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395}
396
397/**
398 * dev_add_pack - add packet handler
399 * @pt: packet type declaration
400 *
401 * Add a protocol handler to the networking stack. The passed &packet_type
402 * is linked into kernel lists and may not be freed until it has been
403 * removed from the kernel lists.
404 *
405 * This call does not sleep therefore it can not
406 * guarantee all CPU's that are in middle of receiving packets
407 * will see the new packet type (until the next received packet).
408 */
409
410void dev_add_pack(struct packet_type *pt)
411{
412 struct list_head *head = ptype_head(pt);
413
414 spin_lock(&ptype_lock);
415 list_add_rcu(&pt->list, head);
416 spin_unlock(&ptype_lock);
417}
418EXPORT_SYMBOL(dev_add_pack);
419
420/**
421 * __dev_remove_pack - remove packet handler
422 * @pt: packet type declaration
423 *
424 * Remove a protocol handler that was previously added to the kernel
425 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
426 * from the kernel lists and can be freed or reused once this function
427 * returns.
428 *
429 * The packet type might still be in use by receivers
430 * and must not be freed until after all the CPU's have gone
431 * through a quiescent state.
432 */
433void __dev_remove_pack(struct packet_type *pt)
434{
435 struct list_head *head = ptype_head(pt);
436 struct packet_type *pt1;
437
438 spin_lock(&ptype_lock);
439
440 list_for_each_entry(pt1, head, list) {
441 if (pt == pt1) {
442 list_del_rcu(&pt->list);
443 goto out;
444 }
445 }
446
447 pr_warn("dev_remove_pack: %p not found\n", pt);
448out:
449 spin_unlock(&ptype_lock);
450}
451EXPORT_SYMBOL(__dev_remove_pack);
452
/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is
 *	unlinked by __dev_remove_pack() and can be freed or reused once
 *	this function returns.
 *
 *	This call sleeps in synchronize_net() to guarantee that no CPU is
 *	looking at the packet type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
472
473/******************************************************************************
474
475 Device Boot-time Settings Routines
476
477*******************************************************************************/
478
479/* Boot time configuration table */
480static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481
482/**
483 * netdev_boot_setup_add - add new setup entry
484 * @name: name of the device
485 * @map: configured settings for the device
486 *
487 * Adds new setup entry to the dev_boot_setup list. The function
488 * returns 0 on error and 1 on success. This is a generic routine to
489 * all netdevices.
490 */
491static int netdev_boot_setup_add(char *name, struct ifmap *map)
492{
493 struct netdev_boot_setup *s;
494 int i;
495
496 s = dev_boot_setup;
497 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499 memset(s[i].name, 0, sizeof(s[i].name));
500 strlcpy(s[i].name, name, IFNAMSIZ);
501 memcpy(&s[i].map, map, sizeof(s[i].map));
502 break;
503 }
504 }
505
506 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507}
508
509/**
510 * netdev_boot_setup_check - check boot time settings
511 * @dev: the netdevice
512 *
513 * Check boot time settings for the device.
514 * The found settings are set for the device to be used
515 * later in the device probing.
516 * Returns 0 if no settings found, 1 if they are.
517 */
518int netdev_boot_setup_check(struct net_device *dev)
519{
520 struct netdev_boot_setup *s = dev_boot_setup;
521 int i;
522
523 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
525 !strcmp(dev->name, s[i].name)) {
526 dev->irq = s[i].map.irq;
527 dev->base_addr = s[i].map.base_addr;
528 dev->mem_start = s[i].map.mem_start;
529 dev->mem_end = s[i].map.mem_end;
530 return 1;
531 }
532 }
533 return 0;
534}
535EXPORT_SYMBOL(netdev_boot_setup_check);
536
537
538/**
539 * netdev_boot_base - get address from boot time settings
540 * @prefix: prefix for network device
541 * @unit: id for network device
542 *
543 * Check boot time settings for the base address of device.
544 * The found settings are set for the device to be used
545 * later in the device probing.
546 * Returns 0 if no settings found.
547 */
548unsigned long netdev_boot_base(const char *prefix, int unit)
549{
550 const struct netdev_boot_setup *s = dev_boot_setup;
551 char name[IFNAMSIZ];
552 int i;
553
554 sprintf(name, "%s%d", prefix, unit);
555
556 /*
557 * If device already registered then return base of 1
558 * to indicate not to probe for this interface
559 */
560 if (__dev_get_by_name(&init_net, name))
561 return 1;
562
563 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564 if (!strcmp(name, s[i].name))
565 return s[i].map.base_addr;
566 return 0;
567}
568
569/*
570 * Saves at boot time configured settings for any netdevice.
571 */
572int __init netdev_boot_setup(char *str)
573{
574 int ints[5];
575 struct ifmap map;
576
577 str = get_options(str, ARRAY_SIZE(ints), ints);
578 if (!str || !*str)
579 return 0;
580
581 /* Save settings */
582 memset(&map, 0, sizeof(map));
583 if (ints[0] > 0)
584 map.irq = ints[1];
585 if (ints[0] > 1)
586 map.base_addr = ints[2];
587 if (ints[0] > 2)
588 map.mem_start = ints[3];
589 if (ints[0] > 3)
590 map.mem_end = ints[4];
591
592 /* Add new entry to the list */
593 return netdev_boot_setup_add(str, &map);
594}
595
596__setup("netdev=", netdev_boot_setup);
597
598/*******************************************************************************
599
600 Device Interface Subroutines
601
602*******************************************************************************/
603
604/**
605 * __dev_get_by_name - find a device by its name
606 * @net: the applicable net namespace
607 * @name: name to find
608 *
609 * Find an interface by name. Must be called under RTNL semaphore
610 * or @dev_base_lock. If the name is found a pointer to the device
611 * is returned. If the name is not found then %NULL is returned. The
612 * reference counters are not incremented so the caller must be
613 * careful with locks.
614 */
615
616struct net_device *__dev_get_by_name(struct net *net, const char *name)
617{
618 struct hlist_node *p;
619 struct net_device *dev;
620 struct hlist_head *head = dev_name_hash(net, name);
621
622 hlist_for_each_entry(dev, p, head, name_hlist)
623 if (!strncmp(dev->name, name, IFNAMSIZ))
624 return dev;
625
626 return NULL;
627}
628EXPORT_SYMBOL(__dev_get_by_name);
629
630/**
631 * dev_get_by_name_rcu - find a device by its name
632 * @net: the applicable net namespace
633 * @name: name to find
634 *
635 * Find an interface by name.
636 * If the name is found a pointer to the device is returned.
637 * If the name is not found then %NULL is returned.
638 * The reference counters are not incremented so the caller must be
639 * careful with locks. The caller must hold RCU lock.
640 */
641
642struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643{
644 struct hlist_node *p;
645 struct net_device *dev;
646 struct hlist_head *head = dev_name_hash(net, name);
647
648 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649 if (!strncmp(dev->name, name, IFNAMSIZ))
650 return dev;
651
652 return NULL;
653}
654EXPORT_SYMBOL(dev_get_by_name_rcu);
655
656/**
657 * dev_get_by_name - find a device by its name
658 * @net: the applicable net namespace
659 * @name: name to find
660 *
661 * Find an interface by name. This can be called from any
662 * context and does its own locking. The returned handle has
663 * the usage count incremented and the caller must use dev_put() to
664 * release it when it is no longer needed. %NULL is returned if no
665 * matching device is found.
666 */
667
668struct net_device *dev_get_by_name(struct net *net, const char *name)
669{
670 struct net_device *dev;
671
672 rcu_read_lock();
673 dev = dev_get_by_name_rcu(net, name);
674 if (dev)
675 dev_hold(dev);
676 rcu_read_unlock();
677 return dev;
678}
679EXPORT_SYMBOL(dev_get_by_name);
680
681/**
682 * __dev_get_by_index - find a device by its ifindex
683 * @net: the applicable net namespace
684 * @ifindex: index of device
685 *
686 * Search for an interface by index. Returns %NULL if the device
687 * is not found or a pointer to the device. The device has not
688 * had its reference counter increased so the caller must be careful
689 * about locking. The caller must hold either the RTNL semaphore
690 * or @dev_base_lock.
691 */
692
693struct net_device *__dev_get_by_index(struct net *net, int ifindex)
694{
695 struct hlist_node *p;
696 struct net_device *dev;
697 struct hlist_head *head = dev_index_hash(net, ifindex);
698
699 hlist_for_each_entry(dev, p, head, index_hlist)
700 if (dev->ifindex == ifindex)
701 return dev;
702
703 return NULL;
704}
705EXPORT_SYMBOL(__dev_get_by_index);
706
707/**
708 * dev_get_by_index_rcu - find a device by its ifindex
709 * @net: the applicable net namespace
710 * @ifindex: index of device
711 *
712 * Search for an interface by index. Returns %NULL if the device
713 * is not found or a pointer to the device. The device has not
714 * had its reference counter increased so the caller must be careful
715 * about locking. The caller must hold RCU lock.
716 */
717
718struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719{
720 struct hlist_node *p;
721 struct net_device *dev;
722 struct hlist_head *head = dev_index_hash(net, ifindex);
723
724 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725 if (dev->ifindex == ifindex)
726 return dev;
727
728 return NULL;
729}
730EXPORT_SYMBOL(dev_get_by_index_rcu);
731
732
733/**
734 * dev_get_by_index - find a device by its ifindex
735 * @net: the applicable net namespace
736 * @ifindex: index of device
737 *
738 * Search for an interface by index. Returns NULL if the device
739 * is not found or a pointer to the device. The device returned has
740 * had a reference added and the pointer is safe until the user calls
741 * dev_put to indicate they have finished with it.
742 */
743
744struct net_device *dev_get_by_index(struct net *net, int ifindex)
745{
746 struct net_device *dev;
747
748 rcu_read_lock();
749 dev = dev_get_by_index_rcu(net, ifindex);
750 if (dev)
751 dev_hold(dev);
752 rcu_read_unlock();
753 return dev;
754}
755EXPORT_SYMBOL(dev_get_by_index);
756
757/**
758 * dev_getbyhwaddr_rcu - find a device by its hardware address
759 * @net: the applicable net namespace
760 * @type: media type of device
761 * @ha: hardware address
762 *
763 * Search for an interface by MAC address. Returns NULL if the device
764 * is not found or a pointer to the device.
765 * The caller must hold RCU or RTNL.
766 * The returned device has not had its ref count increased
767 * and the caller must therefore be careful about locking
768 *
769 */
770
771struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772 const char *ha)
773{
774 struct net_device *dev;
775
776 for_each_netdev_rcu(net, dev)
777 if (dev->type == type &&
778 !memcmp(dev->dev_addr, ha, dev->addr_len))
779 return dev;
780
781 return NULL;
782}
783EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
784
785struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
786{
787 struct net_device *dev;
788
789 ASSERT_RTNL();
790 for_each_netdev(net, dev)
791 if (dev->type == type)
792 return dev;
793
794 return NULL;
795}
796EXPORT_SYMBOL(__dev_getfirstbyhwtype);
797
798struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
799{
800 struct net_device *dev, *ret = NULL;
801
802 rcu_read_lock();
803 for_each_netdev_rcu(net, dev)
804 if (dev->type == type) {
805 dev_hold(dev);
806 ret = dev;
807 break;
808 }
809 rcu_read_unlock();
810 return ret;
811}
812EXPORT_SYMBOL(dev_getfirstbyhwtype);
813
814/**
815 * dev_get_by_flags_rcu - find any device with given flags
816 * @net: the applicable net namespace
817 * @if_flags: IFF_* values
818 * @mask: bitmask of bits in if_flags to check
819 *
820 * Search for any interface with the given flags. Returns NULL if a device
821 * is not found or a pointer to the device. Must be called inside
822 * rcu_read_lock(), and result refcount is unchanged.
823 */
824
825struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
826 unsigned short mask)
827{
828 struct net_device *dev, *ret;
829
830 ret = NULL;
831 for_each_netdev_rcu(net, dev) {
832 if (((dev->flags ^ if_flags) & mask) == 0) {
833 ret = dev;
834 break;
835 }
836 }
837 return ret;
838}
839EXPORT_SYMBOL(dev_get_by_flags_rcu);
840
841/**
842 * dev_valid_name - check if name is okay for network device
843 * @name: name string
844 *
845 * Network device names need to be valid file names to
846 * to allow sysfs to work. We also disallow any kind of
847 * whitespace.
848 */
849bool dev_valid_name(const char *name)
850{
851 if (*name == '\0')
852 return false;
853 if (strlen(name) >= IFNAMSIZ)
854 return false;
855 if (!strcmp(name, ".") || !strcmp(name, ".."))
856 return false;
857
858 while (*name) {
859 if (*name == '/' || isspace(*name))
860 return false;
861 name++;
862 }
863 return true;
864}
865EXPORT_SYMBOL(dev_valid_name);
866
867/**
868 * __dev_alloc_name - allocate a name for a device
869 * @net: network namespace to allocate the device name in
870 * @name: name format string
871 * @buf: scratch buffer and result name string
872 *
873 * Passed a format string - eg "lt%d" it will try and find a suitable
874 * id. It scans list of devices to build up a free map, then chooses
875 * the first empty slot. The caller must hold the dev_base or rtnl lock
876 * while allocating the name and adding the device in order to avoid
877 * duplicates.
878 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
879 * Returns the number of the unit assigned or a negative errno code.
880 */
881
882static int __dev_alloc_name(struct net *net, const char *name, char *buf)
883{
884 int i = 0;
885 const char *p;
886 const int max_netdevices = 8*PAGE_SIZE;
887 unsigned long *inuse;
888 struct net_device *d;
889
890 p = strnchr(name, IFNAMSIZ-1, '%');
891 if (p) {
892 /*
893 * Verify the string as this thing may have come from
894 * the user. There must be either one "%d" and no other "%"
895 * characters.
896 */
897 if (p[1] != 'd' || strchr(p + 2, '%'))
898 return -EINVAL;
899
900 /* Use one page as a bit array of possible slots */
901 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
902 if (!inuse)
903 return -ENOMEM;
904
905 for_each_netdev(net, d) {
906 if (!sscanf(d->name, name, &i))
907 continue;
908 if (i < 0 || i >= max_netdevices)
909 continue;
910
911 /* avoid cases where sscanf is not exact inverse of printf */
912 snprintf(buf, IFNAMSIZ, name, i);
913 if (!strncmp(buf, d->name, IFNAMSIZ))
914 set_bit(i, inuse);
915 }
916
917 i = find_first_zero_bit(inuse, max_netdevices);
918 free_page((unsigned long) inuse);
919 }
920
921 if (buf != name)
922 snprintf(buf, IFNAMSIZ, name, i);
923 if (!__dev_get_by_name(net, buf))
924 return i;
925
926 /* It is possible to run out of possible slots
927 * when the name is long and there isn't enough space left
928 * for the digits, or if all bits are used.
929 */
930 return -ENFILE;
931}
932
933/**
934 * dev_alloc_name - allocate a name for a device
935 * @dev: device
936 * @name: name format string
937 *
938 * Passed a format string - eg "lt%d" it will try and find a suitable
939 * id. It scans list of devices to build up a free map, then chooses
940 * the first empty slot. The caller must hold the dev_base or rtnl lock
941 * while allocating the name and adding the device in order to avoid
942 * duplicates.
943 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
944 * Returns the number of the unit assigned or a negative errno code.
945 */
946
947int dev_alloc_name(struct net_device *dev, const char *name)
948{
949 char buf[IFNAMSIZ];
950 struct net *net;
951 int ret;
952
953 BUG_ON(!dev_net(dev));
954 net = dev_net(dev);
955 ret = __dev_alloc_name(net, name, buf);
956 if (ret >= 0)
957 strlcpy(dev->name, buf, IFNAMSIZ);
958 return ret;
959}
960EXPORT_SYMBOL(dev_alloc_name);
961
962static int dev_get_valid_name(struct net_device *dev, const char *name)
963{
964 struct net *net;
965
966 BUG_ON(!dev_net(dev));
967 net = dev_net(dev);
968
969 if (!dev_valid_name(name))
970 return -EINVAL;
971
972 if (strchr(name, '%'))
973 return dev_alloc_name(dev, name);
974 else if (__dev_get_by_name(net, name))
975 return -EEXIST;
976 else if (dev->name != name)
977 strlcpy(dev->name, name, IFNAMSIZ);
978
979 return 0;
980}
981
982/**
983 * dev_change_name - change name of a device
984 * @dev: device
985 * @newname: name (or format string) must be at least IFNAMSIZ
986 *
987 * Change name of a device, can pass format strings "eth%d".
988 * for wildcarding.
989 */
990int dev_change_name(struct net_device *dev, const char *newname)
991{
992 char oldname[IFNAMSIZ];
993 int err = 0;
994 int ret;
995 struct net *net;
996
997 ASSERT_RTNL();
998 BUG_ON(!dev_net(dev));
999
1000 net = dev_net(dev);
1001 if (dev->flags & IFF_UP)
1002 return -EBUSY;
1003
1004 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005 return 0;
1006
1007 memcpy(oldname, dev->name, IFNAMSIZ);
1008
1009 err = dev_get_valid_name(dev, newname);
1010 if (err < 0)
1011 return err;
1012
1013rollback:
1014 ret = device_rename(&dev->dev, dev->name);
1015 if (ret) {
1016 memcpy(dev->name, oldname, IFNAMSIZ);
1017 return ret;
1018 }
1019
1020 write_lock_bh(&dev_base_lock);
1021 hlist_del_rcu(&dev->name_hlist);
1022 write_unlock_bh(&dev_base_lock);
1023
1024 synchronize_rcu();
1025
1026 write_lock_bh(&dev_base_lock);
1027 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028 write_unlock_bh(&dev_base_lock);
1029
1030 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031 ret = notifier_to_errno(ret);
1032
1033 if (ret) {
1034 /* err >= 0 after dev_alloc_name() or stores the first errno */
1035 if (err >= 0) {
1036 err = ret;
1037 memcpy(dev->name, oldname, IFNAMSIZ);
1038 goto rollback;
1039 } else {
1040 pr_err("%s: name change rollback failed: %d\n",
1041 dev->name, ret);
1042 }
1043 }
1044
1045 return err;
1046}
1047
1048/**
1049 * dev_set_alias - change ifalias of a device
1050 * @dev: device
1051 * @alias: name up to IFALIASZ
1052 * @len: limit of bytes to copy from info
1053 *
1054 * Set ifalias for a device,
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058 char *new_ifalias;
1059
1060 ASSERT_RTNL();
1061
1062 if (len >= IFALIASZ)
1063 return -EINVAL;
1064
1065 if (!len) {
1066 if (dev->ifalias) {
1067 kfree(dev->ifalias);
1068 dev->ifalias = NULL;
1069 }
1070 return 0;
1071 }
1072
1073 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074 if (!new_ifalias)
1075 return -ENOMEM;
1076 dev->ifalias = new_ifalias;
1077
1078 strlcpy(dev->ifalias, alias, len+1);
1079 return len;
1080}
1081
1082
1083/**
1084 * netdev_features_change - device changes features
1085 * @dev: device to cause notification
1086 *
1087 * Called to indicate a device has changed features.
1088 */
1089void netdev_features_change(struct net_device *dev)
1090{
1091 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092}
1093EXPORT_SYMBOL(netdev_features_change);
1094
1095/**
1096 * netdev_state_change - device changes state
1097 * @dev: device to cause notification
1098 *
1099 * Called to indicate a device has changed state. This function calls
1100 * the notifier chains for netdev_chain and sends a NEWLINK message
1101 * to the routing socket.
1102 */
1103void netdev_state_change(struct net_device *dev)
1104{
1105 if (dev->flags & IFF_UP) {
1106 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1108 }
1109}
1110EXPORT_SYMBOL(netdev_state_change);
1111
/*
 * Send @event for @dev through the netdevice notifier chain and return
 * the value produced by call_netdevice_notifiers().
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int rc;

	rc = call_netdevice_notifiers(event, dev);
	return rc;
}
EXPORT_SYMBOL(netdev_bonding_change);
1117
1118/**
1119 * dev_load - load a network module
1120 * @net: the applicable net namespace
1121 * @name: name of interface
1122 *
1123 * If a network interface is not present and the process has suitable
1124 * privileges this function loads the module. If module loading is not
1125 * available in this kernel then it becomes a nop.
1126 */
1127
1128void dev_load(struct net *net, const char *name)
1129{
1130 struct net_device *dev;
1131 int no_module;
1132
1133 rcu_read_lock();
1134 dev = dev_get_by_name_rcu(net, name);
1135 rcu_read_unlock();
1136
1137 no_module = !dev;
1138 if (no_module && capable(CAP_NET_ADMIN))
1139 no_module = request_module("netdev-%s", name);
1140 if (no_module && capable(CAP_SYS_MODULE)) {
1141 if (!request_module("%s", name))
1142 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1143 name);
1144 }
1145}
1146EXPORT_SYMBOL(dev_load);
1147
1148static int __dev_open(struct net_device *dev)
1149{
1150 const struct net_device_ops *ops = dev->netdev_ops;
1151 int ret;
1152
1153 ASSERT_RTNL();
1154
1155 if (!netif_device_present(dev))
1156 return -ENODEV;
1157
1158 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1159 ret = notifier_to_errno(ret);
1160 if (ret)
1161 return ret;
1162
1163 set_bit(__LINK_STATE_START, &dev->state);
1164
1165 if (ops->ndo_validate_addr)
1166 ret = ops->ndo_validate_addr(dev);
1167
1168 if (!ret && ops->ndo_open)
1169 ret = ops->ndo_open(dev);
1170
1171 if (ret)
1172 clear_bit(__LINK_STATE_START, &dev->state);
1173 else {
1174 dev->flags |= IFF_UP;
1175 net_dmaengine_get();
1176 dev_set_rx_mode(dev);
1177 dev_activate(dev);
1178 add_device_randomness(dev->dev_addr, dev->addr_len);
1179 }
1180
1181 return ret;
1182}
1183
1184/**
1185 * dev_open - prepare an interface for use.
1186 * @dev: device to open
1187 *
1188 * Takes a device from down to up state. The device's private open
1189 * function is invoked and then the multicast lists are loaded. Finally
1190 * the device is moved into the up state and a %NETDEV_UP message is
1191 * sent to the netdev notifier chain.
1192 *
1193 * Calling this function on an active interface is a nop. On a failure
1194 * a negative errno code is returned.
1195 */
1196int dev_open(struct net_device *dev)
1197{
1198 int ret;
1199
1200 if (dev->flags & IFF_UP)
1201 return 0;
1202
1203 ret = __dev_open(dev);
1204 if (ret < 0)
1205 return ret;
1206
1207 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208 call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210 return ret;
1211}
1212EXPORT_SYMBOL(dev_open);
1213
1214static int __dev_close_many(struct list_head *head)
1215{
1216 struct net_device *dev;
1217
1218 ASSERT_RTNL();
1219 might_sleep();
1220
1221 list_for_each_entry(dev, head, unreg_list) {
1222 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1223
1224 clear_bit(__LINK_STATE_START, &dev->state);
1225
1226 /* Synchronize to scheduled poll. We cannot touch poll list, it
1227 * can be even on different cpu. So just clear netif_running().
1228 *
1229 * dev->stop() will invoke napi_disable() on all of it's
1230 * napi_struct instances on this device.
1231 */
1232 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1233 }
1234
1235 dev_deactivate_many(head);
1236
1237 list_for_each_entry(dev, head, unreg_list) {
1238 const struct net_device_ops *ops = dev->netdev_ops;
1239
1240 /*
1241 * Call the device specific close. This cannot fail.
1242 * Only if device is UP
1243 *
1244 * We allow it to be called even after a DETACH hot-plug
1245 * event.
1246 */
1247 if (ops->ndo_stop)
1248 ops->ndo_stop(dev);
1249
1250 dev->flags &= ~IFF_UP;
1251 net_dmaengine_put();
1252 }
1253
1254 return 0;
1255}
1256
1257static int __dev_close(struct net_device *dev)
1258{
1259 int retval;
1260 LIST_HEAD(single);
1261
1262 list_add(&dev->unreg_list, &single);
1263 retval = __dev_close_many(&single);
1264 list_del(&single);
1265 return retval;
1266}
1267
1268static int dev_close_many(struct list_head *head)
1269{
1270 struct net_device *dev, *tmp;
1271 LIST_HEAD(tmp_list);
1272
1273 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1274 if (!(dev->flags & IFF_UP))
1275 list_move(&dev->unreg_list, &tmp_list);
1276
1277 __dev_close_many(head);
1278
1279 list_for_each_entry(dev, head, unreg_list) {
1280 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281 call_netdevice_notifiers(NETDEV_DOWN, dev);
1282 }
1283
1284 /* rollback_registered_many needs the complete original list */
1285 list_splice(&tmp_list, head);
1286 return 0;
1287}
1288
1289/**
1290 * dev_close - shutdown an interface.
1291 * @dev: device to shutdown
1292 *
1293 * This function moves an active device into down state. A
1294 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296 * chain.
1297 */
1298int dev_close(struct net_device *dev)
1299{
1300 if (dev->flags & IFF_UP) {
1301 LIST_HEAD(single);
1302
1303 list_add(&dev->unreg_list, &single);
1304 dev_close_many(&single);
1305 list_del(&single);
1306 }
1307 return 0;
1308}
1309EXPORT_SYMBOL(dev_close);
1310
1311
1312/**
1313 * dev_disable_lro - disable Large Receive Offload on a device
1314 * @dev: device
1315 *
1316 * Disable Large Receive Offload (LRO) on a net device. Must be
1317 * called under RTNL. This is needed if received packets may be
1318 * forwarded to another interface.
1319 */
1320void dev_disable_lro(struct net_device *dev)
1321{
1322 /*
1323 * If we're trying to disable lro on a vlan device
1324 * use the underlying physical device instead
1325 */
1326 if (is_vlan_dev(dev))
1327 dev = vlan_dev_real_dev(dev);
1328
1329 dev->wanted_features &= ~NETIF_F_LRO;
1330 netdev_update_features(dev);
1331
1332 if (unlikely(dev->features & NETIF_F_LRO))
1333 netdev_WARN(dev, "failed to disable LRO!\n");
1334}
1335EXPORT_SYMBOL(dev_disable_lro);
1336
1337
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * does not replay register/up events to a new notifier.
 * NOTE(review): presumably cleared once boot-time init is done; the
 * code that does so is not in this part of the file - confirm.
 */
static int dev_boot_phase = 1;
1339
1340/**
1341 * register_netdevice_notifier - register a network notifier block
1342 * @nb: notifier
1343 *
1344 * Register a notifier to be called when network device events occur.
1345 * The notifier passed is linked into the kernel structures and must
1346 * not be reused until it has been unregistered. A negative errno code
1347 * is returned on a failure.
1348 *
1349 * When registered all registration and up events are replayed
1350 * to the new notifier to allow device to have a race free
1351 * view of the network device list.
1352 */
1353
1354int register_netdevice_notifier(struct notifier_block *nb)
1355{
1356 struct net_device *dev;
1357 struct net_device *last;
1358 struct net *net;
1359 int err;
1360
1361 rtnl_lock();
1362 err = raw_notifier_chain_register(&netdev_chain, nb);
1363 if (err)
1364 goto unlock;
1365 if (dev_boot_phase)
1366 goto unlock;
1367 for_each_net(net) {
1368 for_each_netdev(net, dev) {
1369 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370 err = notifier_to_errno(err);
1371 if (err)
1372 goto rollback;
1373
1374 if (!(dev->flags & IFF_UP))
1375 continue;
1376
1377 nb->notifier_call(nb, NETDEV_UP, dev);
1378 }
1379 }
1380
1381unlock:
1382 rtnl_unlock();
1383 return err;
1384
1385rollback:
1386 last = dev;
1387 for_each_net(net) {
1388 for_each_netdev(net, dev) {
1389 if (dev == last)
1390 goto outroll;
1391
1392 if (dev->flags & IFF_UP) {
1393 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394 nb->notifier_call(nb, NETDEV_DOWN, dev);
1395 }
1396 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1397 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1398 }
1399 }
1400
1401outroll:
1402 raw_notifier_chain_unregister(&netdev_chain, nb);
1403 goto unlock;
1404}
1405EXPORT_SYMBOL(register_netdevice_notifier);
1406
1407/**
1408 * unregister_netdevice_notifier - unregister a network notifier block
1409 * @nb: notifier
1410 *
1411 * Unregister a notifier previously registered by
1412 * register_netdevice_notifier(). The notifier is unlinked into the
1413 * kernel structures and may then be reused. A negative errno code
1414 * is returned on a failure.
1415 *
1416 * After unregistering unregister and down device events are synthesized
1417 * for all devices on the device list to the removed notifier to remove
1418 * the need for special case cleanup code.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423 struct net_device *dev;
1424 struct net *net;
1425 int err;
1426
1427 rtnl_lock();
1428 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1429 if (err)
1430 goto unlock;
1431
1432 for_each_net(net) {
1433 for_each_netdev(net, dev) {
1434 if (dev->flags & IFF_UP) {
1435 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1436 nb->notifier_call(nb, NETDEV_DOWN, dev);
1437 }
1438 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1439 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1440 }
1441 }
1442unlock:
1443 rtnl_unlock();
1444 return err;
1445}
1446EXPORT_SYMBOL(unregister_netdevice_notifier);
1447
1448/**
1449 * call_netdevice_notifiers - call all network notifier blocks
1450 * @val: value passed unmodified to notifier function
1451 * @dev: net_device pointer passed unmodified to notifier function
1452 *
1453 * Call all network notifier blocks. Parameters and return value
1454 * are as for raw_notifier_call_chain().
1455 */
1456
1457int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1458{
1459 ASSERT_RTNL();
1460 return raw_notifier_call_chain(&netdev_chain, val, dev);
1461}
1462EXPORT_SYMBOL(call_netdevice_notifiers);
1463
/*
 * Static key tested by the timestamping helpers below; incremented by
 * net_enable_timestamp() and decremented by net_disable_timestamp().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context.
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls: they are counted here and applied by the
 * next net_enable_timestamp().
 */
static atomic_t netstamp_needed_deferred;
#endif
1472
1473void net_enable_timestamp(void)
1474{
1475#ifdef HAVE_JUMP_LABEL
1476 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1477
1478 if (deferred) {
1479 while (--deferred)
1480 static_key_slow_dec(&netstamp_needed);
1481 return;
1482 }
1483#endif
1484 WARN_ON(in_interrupt());
1485 static_key_slow_inc(&netstamp_needed);
1486}
1487EXPORT_SYMBOL(net_enable_timestamp);
1488
1489void net_disable_timestamp(void)
1490{
1491#ifdef HAVE_JUMP_LABEL
1492 if (in_interrupt()) {
1493 atomic_inc(&netstamp_needed_deferred);
1494 return;
1495 }
1496#endif
1497 static_key_slow_dec(&netstamp_needed);
1498}
1499EXPORT_SYMBOL(net_disable_timestamp);
1500
1501static inline void net_timestamp_set(struct sk_buff *skb)
1502{
1503 skb->tstamp.tv64 = 0;
1504 if (static_key_false(&netstamp_needed))
1505 __net_timestamp(skb);
1506}
1507
/*
 * Stamp @SKB if the netstamp_needed static key is enabled, @COND holds
 * and the skb does not carry a timestamp yet.
 *
 * Wrapped in do { } while (0) so that the expansion is a single
 * statement: it can be used as the body of an unbraced if/else without
 * capturing a following "else". Invoke it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1513
1514static int net_hwtstamp_validate(struct ifreq *ifr)
1515{
1516 struct hwtstamp_config cfg;
1517 enum hwtstamp_tx_types tx_type;
1518 enum hwtstamp_rx_filters rx_filter;
1519 int tx_type_valid = 0;
1520 int rx_filter_valid = 0;
1521
1522 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1523 return -EFAULT;
1524
1525 if (cfg.flags) /* reserved for future extensions */
1526 return -EINVAL;
1527
1528 tx_type = cfg.tx_type;
1529 rx_filter = cfg.rx_filter;
1530
1531 switch (tx_type) {
1532 case HWTSTAMP_TX_OFF:
1533 case HWTSTAMP_TX_ON:
1534 case HWTSTAMP_TX_ONESTEP_SYNC:
1535 tx_type_valid = 1;
1536 break;
1537 }
1538
1539 switch (rx_filter) {
1540 case HWTSTAMP_FILTER_NONE:
1541 case HWTSTAMP_FILTER_ALL:
1542 case HWTSTAMP_FILTER_SOME:
1543 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1544 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1545 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1546 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1547 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1548 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1549 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1550 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1551 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1552 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1553 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1554 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1555 rx_filter_valid = 1;
1556 break;
1557 }
1558
1559 if (!tx_type_valid || !rx_filter_valid)
1560 return -ERANGE;
1561
1562 return 0;
1563}
1564
1565static inline bool is_skb_forwardable(struct net_device *dev,
1566 struct sk_buff *skb)
1567{
1568 unsigned int len;
1569
1570 if (!(dev->flags & IFF_UP))
1571 return false;
1572
1573 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1574 if (skb->len <= len)
1575 return true;
1576
1577 /* if TSO is enabled, we don't care about the length as the packet
1578 * could be forwarded without being segmented before
1579 */
1580 if (skb_is_gso(skb))
1581 return true;
1582
1583 return false;
1584}
1585
1586/**
1587 * dev_forward_skb - loopback an skb to another netif
1588 *
1589 * @dev: destination network device
1590 * @skb: buffer to forward
1591 *
1592 * return values:
1593 * NET_RX_SUCCESS (no congestion)
1594 * NET_RX_DROP (packet was dropped, but freed)
1595 *
1596 * dev_forward_skb can be used for injecting an skb from the
1597 * start_xmit function of one device into the receive queue
1598 * of another device.
1599 *
1600 * The receiving device may be in another namespace, so
1601 * we have to clear all information in the skb that could
1602 * impact namespace isolation.
1603 */
1604int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1605{
1606 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1607 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1608 atomic_long_inc(&dev->rx_dropped);
1609 kfree_skb(skb);
1610 return NET_RX_DROP;
1611 }
1612 }
1613
1614 skb_orphan(skb);
1615 nf_reset(skb);
1616
1617 if (unlikely(!is_skb_forwardable(dev, skb))) {
1618 atomic_long_inc(&dev->rx_dropped);
1619 kfree_skb(skb);
1620 return NET_RX_DROP;
1621 }
1622 skb->skb_iif = 0;
1623 skb->dev = dev;
1624 skb_dst_drop(skb);
1625 skb->tstamp.tv64 = 0;
1626 skb->pkt_type = PACKET_HOST;
1627 skb->protocol = eth_type_trans(skb, dev);
1628 skb->mark = 0;
1629 secpath_reset(skb);
1630 nf_reset(skb);
1631 return netif_rx(skb);
1632}
1633EXPORT_SYMBOL_GPL(dev_forward_skb);
1634
1635static inline int deliver_skb(struct sk_buff *skb,
1636 struct packet_type *pt_prev,
1637 struct net_device *orig_dev)
1638{
1639 atomic_inc(&skb->users);
1640 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1641}
1642
1643static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1644{
1645 if (ptype->af_packet_priv == NULL)
1646 return false;
1647
1648 if (ptype->id_match)
1649 return ptype->id_match(ptype, skb->sk);
1650 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1651 return true;
1652
1653 return false;
1654}
1655
1656/*
1657 * Support routine. Sends outgoing frames to any network
1658 * taps currently in use.
1659 */
1660
1661static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1662{
1663 struct packet_type *ptype;
1664 struct sk_buff *skb2 = NULL;
1665 struct packet_type *pt_prev = NULL;
1666
1667 rcu_read_lock();
1668 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1669 /* Never send packets back to the socket
1670 * they originated from - MvS (miquels@drinkel.ow.org)
1671 */
1672 if ((ptype->dev == dev || !ptype->dev) &&
1673 (!skb_loop_sk(ptype, skb))) {
1674 if (pt_prev) {
1675 deliver_skb(skb2, pt_prev, skb->dev);
1676 pt_prev = ptype;
1677 continue;
1678 }
1679
1680 skb2 = skb_clone(skb, GFP_ATOMIC);
1681 if (!skb2)
1682 break;
1683
1684 net_timestamp_set(skb2);
1685
1686 /* skb->nh should be correctly
1687 set by sender, so that the second statement is
1688 just protection against buggy protocols.
1689 */
1690 skb_reset_mac_header(skb2);
1691
1692 if (skb_network_header(skb2) < skb2->data ||
1693 skb2->network_header > skb2->tail) {
1694 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1695 ntohs(skb2->protocol),
1696 dev->name);
1697 skb_reset_network_header(skb2);
1698 }
1699
1700 skb2->transport_header = skb2->network_header;
1701 skb2->pkt_type = PACKET_OUTGOING;
1702 pt_prev = ptype;
1703 }
1704 }
1705 if (pt_prev)
1706 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1707 rcu_read_unlock();
1708}
1709
1710/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1711 * @dev: Network device
1712 * @txq: number of queues available
1713 *
1714 * If real_num_tx_queues is changed the tc mappings may no longer be
1715 * valid. To resolve this verify the tc mapping remains valid and if
1716 * not NULL the mapping. With no priorities mapping to this
1717 * offset/count pair it will no longer be used. In the worst case TC0
1718 * is invalid nothing can be done so disable priority mappings. If is
1719 * expected that drivers will fix this mapping if they can before
1720 * calling netif_set_real_num_tx_queues.
1721 */
1722static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1723{
1724 int i;
1725 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1726
1727 /* If TC0 is invalidated disable TC mapping */
1728 if (tc->offset + tc->count > txq) {
1729 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1730 dev->num_tc = 0;
1731 return;
1732 }
1733
1734 /* Invalidated prio to tc mappings set to TC0 */
1735 for (i = 1; i < TC_BITMASK + 1; i++) {
1736 int q = netdev_get_prio_tc_map(dev, i);
1737
1738 tc = &dev->tc_to_txq[q];
1739 if (tc->offset + tc->count > txq) {
1740 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1741 i, q);
1742 netdev_set_prio_tc_map(dev, i, 0);
1743 }
1744 }
1745}
1746
1747/*
1748 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1749 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1750 */
1751int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1752{
1753 int rc;
1754
1755 if (txq < 1 || txq > dev->num_tx_queues)
1756 return -EINVAL;
1757
1758 if (dev->reg_state == NETREG_REGISTERED ||
1759 dev->reg_state == NETREG_UNREGISTERING) {
1760 ASSERT_RTNL();
1761
1762 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1763 txq);
1764 if (rc)
1765 return rc;
1766
1767 if (dev->num_tc)
1768 netif_setup_tc(dev, txq);
1769
1770 if (txq < dev->real_num_tx_queues)
1771 qdisc_reset_all_tx_gt(dev, txq);
1772 }
1773
1774 dev->real_num_tx_queues = txq;
1775 return 0;
1776}
1777EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1778
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code: -EINVAL when @rxq is zero or larger than
 *	dev->num_rx_queues, or the error reported while updating the
 *	queue kobjects of a registered device.  If called before
 *	registration with a valid @rxq, it always succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (!rxq || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1811
1812static inline void __netif_reschedule(struct Qdisc *q)
1813{
1814 struct softnet_data *sd;
1815 unsigned long flags;
1816
1817 local_irq_save(flags);
1818 sd = &__get_cpu_var(softnet_data);
1819 q->next_sched = NULL;
1820 *sd->output_queue_tailp = q;
1821 sd->output_queue_tailp = &q->next_sched;
1822 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1823 local_irq_restore(flags);
1824}
1825
1826void __netif_schedule(struct Qdisc *q)
1827{
1828 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1829 __netif_reschedule(q);
1830}
1831EXPORT_SYMBOL(__netif_schedule);
1832
1833void dev_kfree_skb_irq(struct sk_buff *skb)
1834{
1835 if (atomic_dec_and_test(&skb->users)) {
1836 struct softnet_data *sd;
1837 unsigned long flags;
1838
1839 local_irq_save(flags);
1840 sd = &__get_cpu_var(softnet_data);
1841 skb->next = sd->completion_queue;
1842 sd->completion_queue = skb;
1843 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1844 local_irq_restore(flags);
1845 }
1846}
1847EXPORT_SYMBOL(dev_kfree_skb_irq);
1848
/*
 * Release @skb from any context: use dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, and dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1857
1858
1859/**
1860 * netif_device_detach - mark device as removed
1861 * @dev: network device
1862 *
1863 * Mark device as removed from system and therefore no longer available.
1864 */
1865void netif_device_detach(struct net_device *dev)
1866{
1867 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1868 netif_running(dev)) {
1869 netif_tx_stop_all_queues(dev);
1870 }
1871}
1872EXPORT_SYMBOL(netif_device_detach);
1873
1874/**
1875 * netif_device_attach - mark device as attached
1876 * @dev: network device
1877 *
1878 * Mark device as attached from system and restart if needed.
1879 */
1880void netif_device_attach(struct net_device *dev)
1881{
1882 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1883 netif_running(dev)) {
1884 netif_tx_wake_all_queues(dev);
1885 __netdev_watchdog_up(dev);
1886 }
1887}
1888EXPORT_SYMBOL(netif_device_attach);
1889
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892 static const netdev_features_t null_features = 0;
1893 struct net_device *dev = skb->dev;
1894 const char *driver = "";
1895
1896 if (dev && dev->dev.parent)
1897 driver = dev_driver_string(dev->dev.parent);
1898
1899 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900 "gso_type=%d ip_summed=%d\n",
1901 driver, dev ? &dev->features : &null_features,
1902 skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904 skb_shinfo(skb)->gso_type, skb->ip_summed);
1905}
1906
1907/*
1908 * Invalidate hardware checksum when packet is to be mangled, and
1909 * complete checksum manually on outgoing path.
1910 */
1911int skb_checksum_help(struct sk_buff *skb)
1912{
1913 __wsum csum;
1914 int ret = 0, offset;
1915
1916 if (skb->ip_summed == CHECKSUM_COMPLETE)
1917 goto out_set_summed;
1918
1919 if (unlikely(skb_shinfo(skb)->gso_size)) {
1920 skb_warn_bad_offload(skb);
1921 return -EINVAL;
1922 }
1923
1924 offset = skb_checksum_start_offset(skb);
1925 BUG_ON(offset >= skb_headlen(skb));
1926 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1927
1928 offset += skb->csum_offset;
1929 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1930
1931 if (skb_cloned(skb) &&
1932 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1933 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1934 if (ret)
1935 goto out;
1936 }
1937
1938 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1939out_set_summed:
1940 skb->ip_summed = CHECKSUM_NONE;
1941out:
1942 return ret;
1943}
1944EXPORT_SYMBOL(skb_checksum_help);
1945
1946/**
1947 * skb_gso_segment - Perform segmentation on skb.
1948 * @skb: buffer to segment
1949 * @features: features for the output path (see dev->features)
1950 *
1951 * This function segments the given skb and returns a list of segments.
1952 *
1953 * It may return NULL if the skb requires no segmentation. This is
1954 * only possible when GSO is used for verifying header integrity.
1955 */
1956struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1957 netdev_features_t features)
1958{
1959 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1960 struct packet_type *ptype;
1961 __be16 type = skb->protocol;
1962 int vlan_depth = ETH_HLEN;
1963 int err;
1964
1965 while (type == htons(ETH_P_8021Q)) {
1966 struct vlan_hdr *vh;
1967
1968 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1969 return ERR_PTR(-EINVAL);
1970
1971 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1972 type = vh->h_vlan_encapsulated_proto;
1973 vlan_depth += VLAN_HLEN;
1974 }
1975
1976 skb_reset_mac_header(skb);
1977 skb->mac_len = skb->network_header - skb->mac_header;
1978 __skb_pull(skb, skb->mac_len);
1979
1980 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1981 skb_warn_bad_offload(skb);
1982
1983 if (skb_header_cloned(skb) &&
1984 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1985 return ERR_PTR(err);
1986 }
1987
1988 rcu_read_lock();
1989 list_for_each_entry_rcu(ptype,
1990 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1991 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1992 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1993 err = ptype->gso_send_check(skb);
1994 segs = ERR_PTR(err);
1995 if (err || skb_gso_ok(skb, features))
1996 break;
1997 __skb_push(skb, (skb->data -
1998 skb_network_header(skb)));
1999 }
2000 segs = ptype->gso_segment(skb, features);
2001 break;
2002 }
2003 }
2004 rcu_read_unlock();
2005
2006 __skb_push(skb, skb->data - skb_mac_header(skb));
2007
2008 return segs;
2009}
2010EXPORT_SYMBOL(skb_gso_segment);
2011
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2023
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Returns 1 when, on a CONFIG_HIGHMEM kernel, one of the page fragments of
 * @skb cannot be handed to @dev: either it lives in high memory and the
 * device lacks NETIF_F_HIGHDMA, or (when PCI_DMA_BUS_IS_PHYS) the parent
 * device has no DMA mask or the page lies beyond that mask. Returns 0
 * otherwise, and always without CONFIG_HIGHMEM.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		/* Without a parent device there is no mask to check. */
		if (!parent)
			return 0;

		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2056
2057struct dev_gso_cb {
2058 void (*destructor)(struct sk_buff *skb);
2059};
2060
2061#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2062
2063static void dev_gso_skb_destructor(struct sk_buff *skb)
2064{
2065 struct dev_gso_cb *cb;
2066
2067 do {
2068 struct sk_buff *nskb = skb->next;
2069
2070 skb->next = nskb->next;
2071 nskb->next = NULL;
2072 kfree_skb(nskb);
2073 } while (skb->next);
2074
2075 cb = DEV_GSO_CB(skb);
2076 if (cb->destructor)
2077 cb->destructor(skb);
2078}
2079
2080/**
2081 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2082 * @skb: buffer to segment
2083 * @features: device features as applicable to this skb
2084 *
2085 * This function segments the given skb and stores the list of segments
2086 * in skb->next.
2087 */
2088static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2089{
2090 struct sk_buff *segs;
2091
2092 segs = skb_gso_segment(skb, features);
2093
2094 /* Verifying header integrity only. */
2095 if (!segs)
2096 return 0;
2097
2098 if (IS_ERR(segs))
2099 return PTR_ERR(segs);
2100
2101 skb->next = segs;
2102 DEV_GSO_CB(skb)->destructor = skb->destructor;
2103 skb->destructor = dev_gso_skb_destructor;
2104
2105 return 0;
2106}
2107
2108static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2109{
2110 return ((features & NETIF_F_GEN_CSUM) ||
2111 ((features & NETIF_F_V4_CSUM) &&
2112 protocol == htons(ETH_P_IP)) ||
2113 ((features & NETIF_F_V6_CSUM) &&
2114 protocol == htons(ETH_P_IPV6)) ||
2115 ((features & NETIF_F_FCOE_CRC) &&
2116 protocol == htons(ETH_P_FCOE)));
2117}
2118
2119static netdev_features_t harmonize_features(struct sk_buff *skb,
2120 __be16 protocol, netdev_features_t features)
2121{
2122 if (!can_checksum_protocol(features, protocol)) {
2123 features &= ~NETIF_F_ALL_CSUM;
2124 features &= ~NETIF_F_SG;
2125 } else if (illegal_highdma(skb->dev, skb)) {
2126 features &= ~NETIF_F_SG;
2127 }
2128
2129 return features;
2130}
2131
2132netdev_features_t netif_skb_features(struct sk_buff *skb)
2133{
2134 __be16 protocol = skb->protocol;
2135 netdev_features_t features = skb->dev->features;
2136
2137 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2138 features &= ~NETIF_F_GSO_MASK;
2139
2140 if (protocol == htons(ETH_P_8021Q)) {
2141 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142 protocol = veh->h_vlan_encapsulated_proto;
2143 } else if (!vlan_tx_tag_present(skb)) {
2144 return harmonize_features(skb, protocol, features);
2145 }
2146
2147 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2148
2149 if (protocol != htons(ETH_P_8021Q)) {
2150 return harmonize_features(skb, protocol, features);
2151 } else {
2152 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154 return harmonize_features(skb, protocol, features);
2155 }
2156}
2157EXPORT_SYMBOL(netif_skb_features);
2158
2159/*
2160 * Returns true if either:
2161 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2162 * 2. skb is fragmented and the device does not support SG, or if
2163 * at least one of fragments is in highmem and device does not
2164 * support DMA from it.
2165 */
2166static inline int skb_needs_linearize(struct sk_buff *skb,
2167 int features)
2168{
2169 return skb_is_nonlinear(skb) &&
2170 ((skb_has_frag_list(skb) &&
2171 !(features & NETIF_F_FRAGLIST)) ||
2172 (skb_shinfo(skb)->nr_frags &&
2173 !(features & NETIF_F_SG)));
2174}
2175
2176int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2177 struct netdev_queue *txq)
2178{
2179 const struct net_device_ops *ops = dev->netdev_ops;
2180 int rc = NETDEV_TX_OK;
2181 unsigned int skb_len;
2182
2183 if (likely(!skb->next)) {
2184 netdev_features_t features;
2185
2186 /*
2187 * If device doesn't need skb->dst, release it right now while
2188 * its hot in this cpu cache
2189 */
2190 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2191 skb_dst_drop(skb);
2192
2193 if (!list_empty(&ptype_all))
2194 dev_queue_xmit_nit(skb, dev);
2195
2196 features = netif_skb_features(skb);
2197
2198 if (vlan_tx_tag_present(skb) &&
2199 !(features & NETIF_F_HW_VLAN_TX)) {
2200 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2201 if (unlikely(!skb))
2202 goto out;
2203
2204 skb->vlan_tci = 0;
2205 }
2206
2207 if (netif_needs_gso(skb, features)) {
2208 if (unlikely(dev_gso_segment(skb, features)))
2209 goto out_kfree_skb;
2210 if (skb->next)
2211 goto gso;
2212 } else {
2213 if (skb_needs_linearize(skb, features) &&
2214 __skb_linearize(skb))
2215 goto out_kfree_skb;
2216
2217 /* If packet is not checksummed and device does not
2218 * support checksumming for this protocol, complete
2219 * checksumming here.
2220 */
2221 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2222 skb_set_transport_header(skb,
2223 skb_checksum_start_offset(skb));
2224 if (!(features & NETIF_F_ALL_CSUM) &&
2225 skb_checksum_help(skb))
2226 goto out_kfree_skb;
2227 }
2228 }
2229
2230 skb_len = skb->len;
2231 rc = ops->ndo_start_xmit(skb, dev);
2232 trace_net_dev_xmit(skb, rc, dev, skb_len);
2233 if (rc == NETDEV_TX_OK)
2234 txq_trans_update(txq);
2235 return rc;
2236 }
2237
2238gso:
2239 do {
2240 struct sk_buff *nskb = skb->next;
2241
2242 skb->next = nskb->next;
2243 nskb->next = NULL;
2244
2245 /*
2246 * If device doesn't need nskb->dst, release it right now while
2247 * its hot in this cpu cache
2248 */
2249 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2250 skb_dst_drop(nskb);
2251
2252 skb_len = nskb->len;
2253 rc = ops->ndo_start_xmit(nskb, dev);
2254 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2255 if (unlikely(rc != NETDEV_TX_OK)) {
2256 if (rc & ~NETDEV_TX_MASK)
2257 goto out_kfree_gso_skb;
2258 nskb->next = skb->next;
2259 skb->next = nskb;
2260 return rc;
2261 }
2262 txq_trans_update(txq);
2263 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2264 return NETDEV_TX_BUSY;
2265 } while (skb->next);
2266
2267out_kfree_gso_skb:
2268 if (likely(skb->next == NULL))
2269 skb->destructor = DEV_GSO_CB(skb)->destructor;
2270out_kfree_skb:
2271 kfree_skb(skb);
2272out:
2273 return rc;
2274}
2275
2276static u32 hashrnd __read_mostly;
2277
2278/*
2279 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2280 * to be used as a distribution range.
2281 */
2282u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2283 unsigned int num_tx_queues)
2284{
2285 u32 hash;
2286 u16 qoffset = 0;
2287 u16 qcount = num_tx_queues;
2288
2289 if (skb_rx_queue_recorded(skb)) {
2290 hash = skb_get_rx_queue(skb);
2291 while (unlikely(hash >= num_tx_queues))
2292 hash -= num_tx_queues;
2293 return hash;
2294 }
2295
2296 if (dev->num_tc) {
2297 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2298 qoffset = dev->tc_to_txq[tc].offset;
2299 qcount = dev->tc_to_txq[tc].count;
2300 }
2301
2302 if (skb->sk && skb->sk->sk_hash)
2303 hash = skb->sk->sk_hash;
2304 else
2305 hash = (__force u16) skb->protocol;
2306 hash = jhash_1word(hash, hashrnd);
2307
2308 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2309}
2310EXPORT_SYMBOL(__skb_tx_hash);
2311
2312static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2313{
2314 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2315 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2316 dev->name, queue_index,
2317 dev->real_num_tx_queues);
2318 return 0;
2319 }
2320 return queue_index;
2321}
2322
/*
 * Look up a Tx queue for @skb in the device's XPS map entry for the
 * current CPU. Returns the queue index, or -1 when XPS is not compiled
 * in, no map applies, or the chosen queue is not below
 * dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
	if (!map)
		goto unlock;

	if (map->len == 1) {
		queue_index = map->queues[0];
	} else {
		u32 hash;

		/* Several candidates: spread flows over them by hash. */
		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = map->queues[((u64)hash * map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2360
2361static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2362 struct sk_buff *skb)
2363{
2364 int queue_index;
2365 const struct net_device_ops *ops = dev->netdev_ops;
2366
2367 if (dev->real_num_tx_queues == 1)
2368 queue_index = 0;
2369 else if (ops->ndo_select_queue) {
2370 queue_index = ops->ndo_select_queue(dev, skb);
2371 queue_index = dev_cap_txqueue(dev, queue_index);
2372 } else {
2373 struct sock *sk = skb->sk;
2374 queue_index = sk_tx_queue_get(sk);
2375
2376 if (queue_index < 0 || skb->ooo_okay ||
2377 queue_index >= dev->real_num_tx_queues) {
2378 int old_index = queue_index;
2379
2380 queue_index = get_xps_queue(dev, skb);
2381 if (queue_index < 0)
2382 queue_index = skb_tx_hash(dev, skb);
2383
2384 if (queue_index != old_index && sk) {
2385 struct dst_entry *dst =
2386 rcu_dereference_check(sk->sk_dst_cache, 1);
2387
2388 if (dst && skb_dst(skb) == dst)
2389 sk_tx_queue_set(sk, queue_index);
2390 }
2391 }
2392 }
2393
2394 skb_set_queue_mapping(skb, queue_index);
2395 return netdev_get_tx_queue(dev, queue_index);
2396}
2397
2398static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2399 struct net_device *dev,
2400 struct netdev_queue *txq)
2401{
2402 spinlock_t *root_lock = qdisc_lock(q);
2403 bool contended;
2404 int rc;
2405
2406 qdisc_skb_cb(skb)->pkt_len = skb->len;
2407 qdisc_calculate_pkt_len(skb, q);
2408 /*
2409 * Heuristic to force contended enqueues to serialize on a
2410 * separate lock before trying to get qdisc main lock.
2411 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2412 * and dequeue packets faster.
2413 */
2414 contended = qdisc_is_running(q);
2415 if (unlikely(contended))
2416 spin_lock(&q->busylock);
2417
2418 spin_lock(root_lock);
2419 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2420 kfree_skb(skb);
2421 rc = NET_XMIT_DROP;
2422 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2423 qdisc_run_begin(q)) {
2424 /*
2425 * This is a work-conserving queue; there are no old skbs
2426 * waiting to be sent out; and the qdisc is not running -
2427 * xmit the skb directly.
2428 */
2429 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2430 skb_dst_force(skb);
2431
2432 qdisc_bstats_update(q, skb);
2433
2434 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2435 if (unlikely(contended)) {
2436 spin_unlock(&q->busylock);
2437 contended = false;
2438 }
2439 __qdisc_run(q);
2440 } else
2441 qdisc_run_end(q);
2442
2443 rc = NET_XMIT_SUCCESS;
2444 } else {
2445 skb_dst_force(skb);
2446 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2447 if (qdisc_run_begin(q)) {
2448 if (unlikely(contended)) {
2449 spin_unlock(&q->busylock);
2450 contended = false;
2451 }
2452 __qdisc_run(q);
2453 }
2454 }
2455 spin_unlock(root_lock);
2456 if (unlikely(contended))
2457 spin_unlock(&q->busylock);
2458 return rc;
2459}
2460
2461#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2462static void skb_update_prio(struct sk_buff *skb)
2463{
2464 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2465
2466 if (!skb->priority && skb->sk && map) {
2467 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2468
2469 if (prioidx < map->priomap_len)
2470 skb->priority = map->priomap[prioidx];
2471 }
2472}
2473#else
2474#define skb_update_prio(skb)
2475#endif
2476
2477static DEFINE_PER_CPU(int, xmit_recursion);
2478#define RECURSION_LIMIT 10
2479
2480/**
2481 * dev_queue_xmit - transmit a buffer
2482 * @skb: buffer to transmit
2483 *
2484 * Queue a buffer for transmission to a network device. The caller must
2485 * have set the device and priority and built the buffer before calling
2486 * this function. The function can be called from an interrupt.
2487 *
2488 * A negative errno code is returned on a failure. A success does not
2489 * guarantee the frame will be transmitted as it may be dropped due
2490 * to congestion or traffic shaping.
2491 *
2492 * -----------------------------------------------------------------------------------
2493 * I notice this method can also return errors from the queue disciplines,
2494 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2495 * be positive.
2496 *
2497 * Regardless of the return value, the skb is consumed, so it is currently
2498 * difficult to retry a send to this method. (You can bump the ref count
2499 * before sending to hold a reference for retry if you are careful.)
2500 *
2501 * When calling this method, interrupts MUST be enabled. This is because
2502 * the BH enable code must have IRQs enabled so that it will not deadlock.
2503 * --BLG
2504 */
2505int dev_queue_xmit(struct sk_buff *skb)
2506{
2507 struct net_device *dev = skb->dev;
2508 struct netdev_queue *txq;
2509 struct Qdisc *q;
2510 int rc = -ENOMEM;
2511
2512 /* Disable soft irqs for various locks below. Also
2513 * stops preemption for RCU.
2514 */
2515 rcu_read_lock_bh();
2516
2517 skb_update_prio(skb);
2518
2519 txq = dev_pick_tx(dev, skb);
2520 q = rcu_dereference_bh(txq->qdisc);
2521
2522#ifdef CONFIG_NET_CLS_ACT
2523 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2524#endif
2525 trace_net_dev_queue(skb);
2526 if (q->enqueue) {
2527 rc = __dev_xmit_skb(skb, q, dev, txq);
2528 goto out;
2529 }
2530
2531 /* The device has no queue. Common case for software devices:
2532 loopback, all the sorts of tunnels...
2533
2534 Really, it is unlikely that netif_tx_lock protection is necessary
2535 here. (f.e. loopback and IP tunnels are clean ignoring statistics
2536 counters.)
2537 However, it is possible, that they rely on protection
2538 made by us here.
2539
2540 Check this and shot the lock. It is not prone from deadlocks.
2541 Either shot noqueue qdisc, it is even simpler 8)
2542 */
2543 if (dev->flags & IFF_UP) {
2544 int cpu = smp_processor_id(); /* ok because BHs are off */
2545
2546 if (txq->xmit_lock_owner != cpu) {
2547
2548 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2549 goto recursion_alert;
2550
2551 HARD_TX_LOCK(dev, txq, cpu);
2552
2553 if (!netif_xmit_stopped(txq)) {
2554 __this_cpu_inc(xmit_recursion);
2555 rc = dev_hard_start_xmit(skb, dev, txq);
2556 __this_cpu_dec(xmit_recursion);
2557 if (dev_xmit_complete(rc)) {
2558 HARD_TX_UNLOCK(dev, txq);
2559 goto out;
2560 }
2561 }
2562 HARD_TX_UNLOCK(dev, txq);
2563 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2564 dev->name);
2565 } else {
2566 /* Recursion is detected! It is possible,
2567 * unfortunately
2568 */
2569recursion_alert:
2570 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2571 dev->name);
2572 }
2573 }
2574
2575 rc = -ENETDOWN;
2576 rcu_read_unlock_bh();
2577
2578 kfree_skb(skb);
2579 return rc;
2580out:
2581 rcu_read_unlock_bh();
2582 return rc;
2583}
2584EXPORT_SYMBOL(dev_queue_xmit);
2585
2586
2587/*=======================================================================
2588 Receiver routines
2589 =======================================================================*/
2590
2591int netdev_max_backlog __read_mostly = 1000;
2592int netdev_tstamp_prequeue __read_mostly = 1;
2593int netdev_budget __read_mostly = 300;
2594int weight_p __read_mostly = 64; /* old backlog weight */
2595
2596/* Called with irq disabled */
2597static inline void ____napi_schedule(struct softnet_data *sd,
2598 struct napi_struct *napi)
2599{
2600 list_add_tail(&napi->poll_list, &sd->poll_list);
2601 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2602}
2603
2604/*
2605 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2606 * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
2607 * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
2608 * if hash is a canonical 4-tuple hash over transport ports.
2609 */
2610void __skb_get_rxhash(struct sk_buff *skb)
2611{
2612 struct flow_keys keys;
2613 u32 hash;
2614
2615 if (!skb_flow_dissect(skb, &keys))
2616 return;
2617
2618 if (keys.ports) {
2619 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2620 swap(keys.port16[0], keys.port16[1]);
2621 skb->l4_rxhash = 1;
2622 }
2623
2624 /* get a consistent hash (same value on both flow directions) */
2625 if ((__force u32)keys.dst < (__force u32)keys.src)
2626 swap(keys.dst, keys.src);
2627
2628 hash = jhash_3words((__force u32)keys.dst,
2629 (__force u32)keys.src,
2630 (__force u32)keys.ports, hashrnd);
2631 if (!hash)
2632 hash = 1;
2633
2634 skb->rxhash = hash;
2635}
2636EXPORT_SYMBOL(__skb_get_rxhash);
2637
2638#ifdef CONFIG_RPS
2639
2640/* One global table that all flow-based protocols share. */
2641struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2642EXPORT_SYMBOL(rps_sock_flow_table);
2643
2644struct static_key rps_needed __read_mostly;
2645
2646static struct rps_dev_flow *
2647set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2648 struct rps_dev_flow *rflow, u16 next_cpu)
2649{
2650 if (next_cpu != RPS_NO_CPU) {
2651#ifdef CONFIG_RFS_ACCEL
2652 struct netdev_rx_queue *rxqueue;
2653 struct rps_dev_flow_table *flow_table;
2654 struct rps_dev_flow *old_rflow;
2655 u32 flow_id;
2656 u16 rxq_index;
2657 int rc;
2658
2659 /* Should we steer this flow to a different hardware queue? */
2660 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2661 !(dev->features & NETIF_F_NTUPLE))
2662 goto out;
2663 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2664 if (rxq_index == skb_get_rx_queue(skb))
2665 goto out;
2666
2667 rxqueue = dev->_rx + rxq_index;
2668 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669 if (!flow_table)
2670 goto out;
2671 flow_id = skb->rxhash & flow_table->mask;
2672 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2673 rxq_index, flow_id);
2674 if (rc < 0)
2675 goto out;
2676 old_rflow = rflow;
2677 rflow = &flow_table->flows[flow_id];
2678 rflow->filter = rc;
2679 if (old_rflow->filter == rflow->filter)
2680 old_rflow->filter = RPS_NO_FILTER;
2681 out:
2682#endif
2683 rflow->last_qtail =
2684 per_cpu(softnet_data, next_cpu).input_queue_head;
2685 }
2686
2687 rflow->cpu = next_cpu;
2688 return rflow;
2689}
2690
2691/*
2692 * get_rps_cpu is called from netif_receive_skb and returns the target
2693 * CPU from the RPS map of the receiving queue for a given skb.
2694 * rcu_read_lock must be held on entry.
2695 */
2696static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2697 struct rps_dev_flow **rflowp)
2698{
2699 struct netdev_rx_queue *rxqueue;
2700 struct rps_map *map;
2701 struct rps_dev_flow_table *flow_table;
2702 struct rps_sock_flow_table *sock_flow_table;
2703 int cpu = -1;
2704 u16 tcpu;
2705
2706 if (skb_rx_queue_recorded(skb)) {
2707 u16 index = skb_get_rx_queue(skb);
2708 if (unlikely(index >= dev->real_num_rx_queues)) {
2709 WARN_ONCE(dev->real_num_rx_queues > 1,
2710 "%s received packet on queue %u, but number "
2711 "of RX queues is %u\n",
2712 dev->name, index, dev->real_num_rx_queues);
2713 goto done;
2714 }
2715 rxqueue = dev->_rx + index;
2716 } else
2717 rxqueue = dev->_rx;
2718
2719 map = rcu_dereference(rxqueue->rps_map);
2720 if (map) {
2721 if (map->len == 1 &&
2722 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2723 tcpu = map->cpus[0];
2724 if (cpu_online(tcpu))
2725 cpu = tcpu;
2726 goto done;
2727 }
2728 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2729 goto done;
2730 }
2731
2732 skb_reset_network_header(skb);
2733 if (!skb_get_rxhash(skb))
2734 goto done;
2735
2736 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2738 if (flow_table && sock_flow_table) {
2739 u16 next_cpu;
2740 struct rps_dev_flow *rflow;
2741
2742 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2743 tcpu = rflow->cpu;
2744
2745 next_cpu = sock_flow_table->ents[skb->rxhash &
2746 sock_flow_table->mask];
2747
2748 /*
2749 * If the desired CPU (where last recvmsg was done) is
2750 * different from current CPU (one in the rx-queue flow
2751 * table entry), switch if one of the following holds:
2752 * - Current CPU is unset (equal to RPS_NO_CPU).
2753 * - Current CPU is offline.
2754 * - The current CPU's queue tail has advanced beyond the
2755 * last packet that was enqueued using this table entry.
2756 * This guarantees that all previous packets for the flow
2757 * have been dequeued, thus preserving in order delivery.
2758 */
2759 if (unlikely(tcpu != next_cpu) &&
2760 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2761 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2762 rflow->last_qtail)) >= 0))
2763 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2764
2765 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2766 *rflowp = rflow;
2767 cpu = tcpu;
2768 goto done;
2769 }
2770 }
2771
2772 if (map) {
2773 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2774
2775 if (cpu_online(tcpu)) {
2776 cpu = tcpu;
2777 goto done;
2778 }
2779 }
2780
2781done:
2782 return cpu;
2783}
2784
#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 *
 * The filter is kept (%false) only while the flow entry still carries
 * @filter_id, has a CPU assigned, and that CPU's input queue head has
 * advanced by less than ten times the flow table mask since the entry's
 * last_qtail was recorded.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	int cpu;

	rcu_read_lock();

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (!flow_table || flow_id > flow_table->mask)
		goto unlock;

	rflow = &flow_table->flows[flow_id];
	cpu = ACCESS_ONCE(rflow->cpu);
	if (rflow->filter != filter_id || cpu == RPS_NO_CPU)
		goto unlock;

	if ((int)(per_cpu(softnet_data, cpu).input_queue_head -
		  rflow->last_qtail) <
	    (int)(10 * flow_table->mask))
		expire = false;

unlock:
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */
2824
2825/* Called from hardirq (IPI) context */
2826static void rps_trigger_softirq(void *data)
2827{
2828 struct softnet_data *sd = data;
2829
2830 ____napi_schedule(sd, &sd->backlog);
2831 sd->received_rps++;
2832}
2833
2834#endif /* CONFIG_RPS */
2835
2836/*
 * Check whether this softnet_data structure belongs to another CPU.
 * If it does, queue it on our IPI list and return 1.
 * Otherwise return 0.
2840 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	/* Our own softnet_data never needs an IPI. */
	if (sd == local)
		return 0;

	/* Push @sd on the head of this CPU's singly linked IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2856
2857/*
2858 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2859 * queue (may be a remote CPU queue).
2860 */
2861static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2862 unsigned int *qtail)
2863{
2864 struct softnet_data *sd;
2865 unsigned long flags;
2866
2867 sd = &per_cpu(softnet_data, cpu);
2868
2869 local_irq_save(flags);
2870
2871 rps_lock(sd);
2872 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2873 if (skb_queue_len(&sd->input_pkt_queue)) {
2874enqueue:
2875 __skb_queue_tail(&sd->input_pkt_queue, skb);
2876 input_queue_tail_incr_save(sd, qtail);
2877 rps_unlock(sd);
2878 local_irq_restore(flags);
2879 return NET_RX_SUCCESS;
2880 }
2881
2882 /* Schedule NAPI for backlog device
2883 * We can use non atomic operation since we own the queue lock
2884 */
2885 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2886 if (!rps_ipi_queued(sd))
2887 ____napi_schedule(sd, &sd->backlog);
2888 }
2889 goto enqueue;
2890 }
2891
2892 sd->dropped++;
2893 rps_unlock(sd);
2894
2895 local_irq_restore(flags);
2896
2897 atomic_long_inc(&skb->dev->rx_dropped);
2898 kfree_skb(skb);
2899 return NET_RX_DROP;
2900}
2901
2902/**
2903 * netif_rx - post buffer to the network code
2904 * @skb: buffer to post
2905 *
2906 * This function receives a packet from a device driver and queues it for
2907 * the upper (protocol) levels to process. It always succeeds. The buffer
2908 * may be dropped during processing for congestion control or by the
2909 * protocol layers.
2910 *
2911 * return values:
2912 * NET_RX_SUCCESS (no congestion)
2913 * NET_RX_DROP (packet was dropped)
2914 *
2915 */
2916
2917int netif_rx(struct sk_buff *skb)
2918{
2919 int ret;
2920
2921 /* if netpoll wants it, pretend we never saw it */
2922 if (netpoll_rx(skb))
2923 return NET_RX_DROP;
2924
2925 net_timestamp_check(netdev_tstamp_prequeue, skb);
2926
2927 trace_netif_rx(skb);
2928#ifdef CONFIG_RPS
2929 if (static_key_false(&rps_needed)) {
2930 struct rps_dev_flow voidflow, *rflow = &voidflow;
2931 int cpu;
2932
2933 preempt_disable();
2934 rcu_read_lock();
2935
2936 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2937 if (cpu < 0)
2938 cpu = smp_processor_id();
2939
2940 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2941
2942 rcu_read_unlock();
2943 preempt_enable();
2944 } else
2945#endif
2946 {
2947 unsigned int qtail;
2948 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2949 put_cpu();
2950 }
2951 return ret;
2952}
2953EXPORT_SYMBOL(netif_rx);
2954
/*
 * Same as netif_rx(), but additionally runs any softirq that is pending
 * afterwards, all with preemption disabled.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	preempt_disable();

	rc = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2968
2969static void net_tx_action(struct softirq_action *h)
2970{
2971 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2972
2973 if (sd->completion_queue) {
2974 struct sk_buff *clist;
2975
2976 local_irq_disable();
2977 clist = sd->completion_queue;
2978 sd->completion_queue = NULL;
2979 local_irq_enable();
2980
2981 while (clist) {
2982 struct sk_buff *skb = clist;
2983 clist = clist->next;
2984
2985 WARN_ON(atomic_read(&skb->users));
2986 trace_kfree_skb(skb, net_tx_action);
2987 __kfree_skb(skb);
2988 }
2989 }
2990
2991 if (sd->output_queue) {
2992 struct Qdisc *head;
2993
2994 local_irq_disable();
2995 head = sd->output_queue;
2996 sd->output_queue = NULL;
2997 sd->output_queue_tailp = &sd->output_queue;
2998 local_irq_enable();
2999
3000 while (head) {
3001 struct Qdisc *q = head;
3002 spinlock_t *root_lock;
3003
3004 head = head->next_sched;
3005
3006 root_lock = qdisc_lock(q);
3007 if (spin_trylock(root_lock)) {
3008 smp_mb__before_clear_bit();
3009 clear_bit(__QDISC_STATE_SCHED,
3010 &q->state);
3011 qdisc_run(q);
3012 spin_unlock(root_lock);
3013 } else {
3014 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3015 &q->state)) {
3016 __netif_reschedule(q);
3017 } else {
3018 smp_mb__before_clear_bit();
3019 clear_bit(__QDISC_STATE_SCHED,
3020 &q->state);
3021 }
3022 }
3023 }
3024 }
3025}
3026
3027#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3028 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3029/* This hook is defined here for ATM LANE */
3030int (*br_fdb_test_addr_hook)(struct net_device *dev,
3031 unsigned char *addr) __read_mostly;
3032EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3033#endif
3034
3035#ifdef CONFIG_NET_CLS_ACT
3036/* TODO: Maybe we should just force sch_ingress to be compiled in
3037 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3038 * a compare and 2 stores extra right now if we dont have it on
3039 * but have CONFIG_NET_CLS_ACT
3040 * NOTE: This doesn't stop any functionality; if you dont have
3041 * the ingress scheduler, you just can't add policies on ingress.
3042 *
3043 */
3044static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3045{
3046 struct net_device *dev = skb->dev;
3047 u32 ttl = G_TC_RTTL(skb->tc_verd);
3048 int result = TC_ACT_OK;
3049 struct Qdisc *q;
3050
3051 if (unlikely(MAX_RED_LOOP < ttl++)) {
3052 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3053 skb->skb_iif, dev->ifindex);
3054 return TC_ACT_SHOT;
3055 }
3056
3057 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3058 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3059
3060 q = rxq->qdisc;
3061 if (q != &noop_qdisc) {
3062 spin_lock(qdisc_lock(q));
3063 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3064 result = qdisc_enqueue_root(skb, q);
3065 spin_unlock(qdisc_lock(q));
3066 }
3067
3068 return result;
3069}
3070
3071static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3072 struct packet_type **pt_prev,
3073 int *ret, struct net_device *orig_dev)
3074{
3075 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3076
3077 if (!rxq || rxq->qdisc == &noop_qdisc)
3078 goto out;
3079
3080 if (*pt_prev) {
3081 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3082 *pt_prev = NULL;
3083 }
3084
3085 switch (ing_filter(skb, rxq)) {
3086 case TC_ACT_SHOT:
3087 case TC_ACT_STOLEN:
3088 kfree_skb(skb);
3089 return NULL;
3090 }
3091
3092out:
3093 skb->tc_verd = 0;
3094 return skb;
3095}
3096#endif
3097
3098/**
3099 * netdev_rx_handler_register - register receive handler
3100 * @dev: device to register a handler for
3101 * @rx_handler: receive handler to register
3102 * @rx_handler_data: data pointer that is used by rx handler
3103 *
 * Register a receive handler for a device. This handler will then be
3105 * called from __netif_receive_skb. A negative errno code is returned
3106 * on a failure.
3107 *
3108 * The caller must hold the rtnl_mutex.
3109 *
3110 * For a general description of rx_handler, see enum rx_handler_result.
3111 */
3112int netdev_rx_handler_register(struct net_device *dev,
3113 rx_handler_func_t *rx_handler,
3114 void *rx_handler_data)
3115{
3116 ASSERT_RTNL();
3117
3118 if (dev->rx_handler)
3119 return -EBUSY;
3120
3121 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3122 rcu_assign_pointer(dev->rx_handler, rx_handler);
3123
3124 return 0;
3125}
3126EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3127
3128/**
3129 * netdev_rx_handler_unregister - unregister receive handler
3130 * @dev: device to unregister a handler from
3131 *
3132 * Unregister a receive hander from a device.
3133 *
3134 * The caller must hold the rtnl_mutex.
3135 */
3136void netdev_rx_handler_unregister(struct net_device *dev)
3137{
3138
3139 ASSERT_RTNL();
3140 RCU_INIT_POINTER(dev->rx_handler, NULL);
3141 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3142}
3143EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3144
3145static int __netif_receive_skb(struct sk_buff *skb)
3146{
3147 struct packet_type *ptype, *pt_prev;
3148 rx_handler_func_t *rx_handler;
3149 struct net_device *orig_dev;
3150 struct net_device *null_or_dev;
3151 bool deliver_exact = false;
3152 int ret = NET_RX_DROP;
3153 __be16 type;
3154
3155 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3156
3157 trace_netif_receive_skb(skb);
3158
3159 /* if we've gotten here through NAPI, check netpoll */
3160 if (netpoll_receive_skb(skb))
3161 return NET_RX_DROP;
3162
3163 if (!skb->skb_iif)
3164 skb->skb_iif = skb->dev->ifindex;
3165 orig_dev = skb->dev;
3166
3167 skb_reset_network_header(skb);
3168 skb_reset_transport_header(skb);
3169 skb_reset_mac_len(skb);
3170
3171 pt_prev = NULL;
3172
3173 rcu_read_lock();
3174
3175another_round:
3176
3177 __this_cpu_inc(softnet_data.processed);
3178
3179 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3180 skb = vlan_untag(skb);
3181 if (unlikely(!skb))
3182 goto out;
3183 }
3184
3185#ifdef CONFIG_NET_CLS_ACT
3186 if (skb->tc_verd & TC_NCLS) {
3187 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3188 goto ncls;
3189 }
3190#endif
3191
3192 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3193 if (!ptype->dev || ptype->dev == skb->dev) {
3194 if (pt_prev)
3195 ret = deliver_skb(skb, pt_prev, orig_dev);
3196 pt_prev = ptype;
3197 }
3198 }
3199
3200#ifdef CONFIG_NET_CLS_ACT
3201 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3202 if (!skb)
3203 goto out;
3204ncls:
3205#endif
3206
3207 rx_handler = rcu_dereference(skb->dev->rx_handler);
3208 if (vlan_tx_tag_present(skb)) {
3209 if (pt_prev) {
3210 ret = deliver_skb(skb, pt_prev, orig_dev);
3211 pt_prev = NULL;
3212 }
3213 if (vlan_do_receive(&skb, !rx_handler))
3214 goto another_round;
3215 else if (unlikely(!skb))
3216 goto out;
3217 }
3218
3219 if (rx_handler) {
3220 if (pt_prev) {
3221 ret = deliver_skb(skb, pt_prev, orig_dev);
3222 pt_prev = NULL;
3223 }
3224 switch (rx_handler(&skb)) {
3225 case RX_HANDLER_CONSUMED:
3226 goto out;
3227 case RX_HANDLER_ANOTHER:
3228 goto another_round;
3229 case RX_HANDLER_EXACT:
3230 deliver_exact = true;
3231 case RX_HANDLER_PASS:
3232 break;
3233 default:
3234 BUG();
3235 }
3236 }
3237
3238 /* deliver only exact match when indicated */
3239 null_or_dev = deliver_exact ? skb->dev : NULL;
3240
3241 type = skb->protocol;
3242 list_for_each_entry_rcu(ptype,
3243 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3244 if (ptype->type == type &&
3245 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3246 ptype->dev == orig_dev)) {
3247 if (pt_prev)
3248 ret = deliver_skb(skb, pt_prev, orig_dev);
3249 pt_prev = ptype;
3250 }
3251 }
3252
3253 if (pt_prev) {
3254 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3255 } else {
3256 atomic_long_inc(&skb->dev->rx_dropped);
3257 kfree_skb(skb);
3258 /* Jamal, now you will not able to escape explaining
3259 * me how you were going to use this. :-)
3260 */
3261 ret = NET_RX_DROP;
3262 }
3263
3264out:
3265 rcu_read_unlock();
3266 return ret;
3267}
3268
3269/**
3270 * netif_receive_skb - process receive buffer from network
3271 * @skb: buffer to process
3272 *
3273 * netif_receive_skb() is the main receive data processing function.
3274 * It always succeeds. The buffer may be dropped during processing
3275 * for congestion control or by the protocol layers.
3276 *
3277 * This function may only be called from softirq context and interrupts
3278 * should be enabled.
3279 *
3280 * Return values (usually ignored):
3281 * NET_RX_SUCCESS: no congestion
3282 * NET_RX_DROP: packet was dropped
3283 */
3284int netif_receive_skb(struct sk_buff *skb)
3285{
3286 net_timestamp_check(netdev_tstamp_prequeue, skb);
3287
3288 if (skb_defer_rx_timestamp(skb))
3289 return NET_RX_SUCCESS;
3290
3291#ifdef CONFIG_RPS
3292 if (static_key_false(&rps_needed)) {
3293 struct rps_dev_flow voidflow, *rflow = &voidflow;
3294 int cpu, ret;
3295
3296 rcu_read_lock();
3297
3298 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3299
3300 if (cpu >= 0) {
3301 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3302 rcu_read_unlock();
3303 return ret;
3304 }
3305 rcu_read_unlock();
3306 }
3307#endif
3308 return __netif_receive_skb(skb);
3309}
3310EXPORT_SYMBOL(netif_receive_skb);
3311
3312/* Network device is going away, flush any packets still pending
3313 * Called with irqs disabled.
3314 */
3315static void flush_backlog(void *arg)
3316{
3317 struct net_device *dev = arg;
3318 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3319 struct sk_buff *skb, *tmp;
3320
3321 rps_lock(sd);
3322 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3323 if (skb->dev == dev) {
3324 __skb_unlink(skb, &sd->input_pkt_queue);
3325 kfree_skb(skb);
3326 input_queue_head_incr(sd);
3327 }
3328 }
3329 rps_unlock(sd);
3330
3331 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3332 if (skb->dev == dev) {
3333 __skb_unlink(skb, &sd->process_queue);
3334 kfree_skb(skb);
3335 input_queue_head_incr(sd);
3336 }
3337 }
3338}
3339
3340static int napi_gro_complete(struct sk_buff *skb)
3341{
3342 struct packet_type *ptype;
3343 __be16 type = skb->protocol;
3344 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3345 int err = -ENOENT;
3346
3347 if (NAPI_GRO_CB(skb)->count == 1) {
3348 skb_shinfo(skb)->gso_size = 0;
3349 goto out;
3350 }
3351
3352 rcu_read_lock();
3353 list_for_each_entry_rcu(ptype, head, list) {
3354 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3355 continue;
3356
3357 err = ptype->gro_complete(skb);
3358 break;
3359 }
3360 rcu_read_unlock();
3361
3362 if (err) {
3363 WARN_ON(&ptype->list == head);
3364 kfree_skb(skb);
3365 return NET_RX_SUCCESS;
3366 }
3367
3368out:
3369 return netif_receive_skb(skb);
3370}
3371
3372inline void napi_gro_flush(struct napi_struct *napi)
3373{
3374 struct sk_buff *skb, *next;
3375
3376 for (skb = napi->gro_list; skb; skb = next) {
3377 next = skb->next;
3378 skb->next = NULL;
3379 napi_gro_complete(skb);
3380 }
3381
3382 napi->gro_count = 0;
3383 napi->gro_list = NULL;
3384}
3385EXPORT_SYMBOL(napi_gro_flush);
3386
3387enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3388{
3389 struct sk_buff **pp = NULL;
3390 struct packet_type *ptype;
3391 __be16 type = skb->protocol;
3392 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3393 int same_flow;
3394 int mac_len;
3395 enum gro_result ret;
3396
3397 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3398 goto normal;
3399
3400 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3401 goto normal;
3402
3403 rcu_read_lock();
3404 list_for_each_entry_rcu(ptype, head, list) {
3405 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3406 continue;
3407
3408 skb_set_network_header(skb, skb_gro_offset(skb));
3409 mac_len = skb->network_header - skb->mac_header;
3410 skb->mac_len = mac_len;
3411 NAPI_GRO_CB(skb)->same_flow = 0;
3412 NAPI_GRO_CB(skb)->flush = 0;
3413 NAPI_GRO_CB(skb)->free = 0;
3414
3415 pp = ptype->gro_receive(&napi->gro_list, skb);
3416 break;
3417 }
3418 rcu_read_unlock();
3419
3420 if (&ptype->list == head)
3421 goto normal;
3422
3423 same_flow = NAPI_GRO_CB(skb)->same_flow;
3424 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3425
3426 if (pp) {
3427 struct sk_buff *nskb = *pp;
3428
3429 *pp = nskb->next;
3430 nskb->next = NULL;
3431 napi_gro_complete(nskb);
3432 napi->gro_count--;
3433 }
3434
3435 if (same_flow)
3436 goto ok;
3437
3438 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3439 goto normal;
3440
3441 napi->gro_count++;
3442 NAPI_GRO_CB(skb)->count = 1;
3443 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3444 skb->next = napi->gro_list;
3445 napi->gro_list = skb;
3446 ret = GRO_HELD;
3447
3448pull:
3449 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3450 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3451
3452 BUG_ON(skb->end - skb->tail < grow);
3453
3454 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3455
3456 skb->tail += grow;
3457 skb->data_len -= grow;
3458
3459 skb_shinfo(skb)->frags[0].page_offset += grow;
3460 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3461
3462 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3463 skb_frag_unref(skb, 0);
3464 memmove(skb_shinfo(skb)->frags,
3465 skb_shinfo(skb)->frags + 1,
3466 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3467 }
3468 }
3469
3470ok:
3471 return ret;
3472
3473normal:
3474 ret = GRO_NORMAL;
3475 goto pull;
3476}
3477EXPORT_SYMBOL(dev_gro_receive);
3478
3479static inline gro_result_t
3480__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3481{
3482 struct sk_buff *p;
3483 unsigned int maclen = skb->dev->hard_header_len;
3484
3485 for (p = napi->gro_list; p; p = p->next) {
3486 unsigned long diffs;
3487
3488 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3489 diffs |= p->vlan_tci ^ skb->vlan_tci;
3490 if (maclen == ETH_HLEN)
3491 diffs |= compare_ether_header(skb_mac_header(p),
3492 skb_gro_mac_header(skb));
3493 else if (!diffs)
3494 diffs = memcmp(skb_mac_header(p),
3495 skb_gro_mac_header(skb),
3496 maclen);
3497 NAPI_GRO_CB(p)->same_flow = !diffs;
3498 NAPI_GRO_CB(p)->flush = 0;
3499 }
3500
3501 return dev_gro_receive(napi, skb);
3502}
3503
3504gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3505{
3506 switch (ret) {
3507 case GRO_NORMAL:
3508 if (netif_receive_skb(skb))
3509 ret = GRO_DROP;
3510 break;
3511
3512 case GRO_DROP:
3513 kfree_skb(skb);
3514 break;
3515
3516 case GRO_MERGED_FREE:
3517 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3518 kmem_cache_free(skbuff_head_cache, skb);
3519 else
3520 __kfree_skb(skb);
3521 break;
3522
3523 case GRO_HELD:
3524 case GRO_MERGED:
3525 break;
3526 }
3527
3528 return ret;
3529}
3530EXPORT_SYMBOL(napi_skb_finish);
3531
3532void skb_gro_reset_offset(struct sk_buff *skb)
3533{
3534 NAPI_GRO_CB(skb)->data_offset = 0;
3535 NAPI_GRO_CB(skb)->frag0 = NULL;
3536 NAPI_GRO_CB(skb)->frag0_len = 0;
3537
3538 if (skb->mac_header == skb->tail &&
3539 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3540 NAPI_GRO_CB(skb)->frag0 =
3541 skb_frag_address(&skb_shinfo(skb)->frags[0]);
3542 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3543 }
3544}
3545EXPORT_SYMBOL(skb_gro_reset_offset);
3546
3547gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3548{
3549 skb_gro_reset_offset(skb);
3550
3551 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3552}
3553EXPORT_SYMBOL(napi_gro_receive);
3554
3555static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3556{
3557 __skb_pull(skb, skb_headlen(skb));
3558 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3559 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3560 skb->vlan_tci = 0;
3561 skb->dev = napi->dev;
3562 skb->skb_iif = 0;
3563
3564 napi->skb = skb;
3565}
3566
3567struct sk_buff *napi_get_frags(struct napi_struct *napi)
3568{
3569 struct sk_buff *skb = napi->skb;
3570
3571 if (!skb) {
3572 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3573 if (skb)
3574 napi->skb = skb;
3575 }
3576 return skb;
3577}
3578EXPORT_SYMBOL(napi_get_frags);
3579
3580gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3581 gro_result_t ret)
3582{
3583 switch (ret) {
3584 case GRO_NORMAL:
3585 case GRO_HELD:
3586 skb->protocol = eth_type_trans(skb, skb->dev);
3587
3588 if (ret == GRO_HELD)
3589 skb_gro_pull(skb, -ETH_HLEN);
3590 else if (netif_receive_skb(skb))
3591 ret = GRO_DROP;
3592 break;
3593
3594 case GRO_DROP:
3595 case GRO_MERGED_FREE:
3596 napi_reuse_skb(napi, skb);
3597 break;
3598
3599 case GRO_MERGED:
3600 break;
3601 }
3602
3603 return ret;
3604}
3605EXPORT_SYMBOL(napi_frags_finish);
3606
3607static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3608{
3609 struct sk_buff *skb = napi->skb;
3610 struct ethhdr *eth;
3611 unsigned int hlen;
3612 unsigned int off;
3613
3614 napi->skb = NULL;
3615
3616 skb_reset_mac_header(skb);
3617 skb_gro_reset_offset(skb);
3618
3619 off = skb_gro_offset(skb);
3620 hlen = off + sizeof(*eth);
3621 eth = skb_gro_header_fast(skb, off);
3622 if (skb_gro_header_hard(skb, hlen)) {
3623 eth = skb_gro_header_slow(skb, hlen, off);
3624 if (unlikely(!eth)) {
3625 napi_reuse_skb(napi, skb);
3626 skb = NULL;
3627 goto out;
3628 }
3629 }
3630
3631 skb_gro_pull(skb, sizeof(*eth));
3632
3633 /*
3634 * This works because the only protocols we care about don't require
3635 * special handling. We'll fix it up properly at the end.
3636 */
3637 skb->protocol = eth->h_proto;
3638
3639out:
3640 return skb;
3641}
3642
3643gro_result_t napi_gro_frags(struct napi_struct *napi)
3644{
3645 struct sk_buff *skb = napi_frags_skb(napi);
3646
3647 if (!skb)
3648 return GRO_DROP;
3649
3650 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3651}
3652EXPORT_SYMBOL(napi_gro_frags);
3653
3654/*
3655 * net_rps_action sends any pending IPI's for rps.
3656 * Note: called with local irq disabled, but exits with local irq enabled.
3657 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remote = sd->rps_ipi_list;
	struct softnet_data *next;

	/* Detach the pending list while interrupts are still disabled. */
	if (remote)
		sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	for (; remote; remote = next) {
		next = remote->rps_ipi_next;

		if (cpu_online(remote->cpu))
			__smp_call_function_single(remote->cpu,
						   &remote->csd, 0);
	}
#else
	local_irq_enable();
#endif
}
3681
3682static int process_backlog(struct napi_struct *napi, int quota)
3683{
3684 int work = 0;
3685 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3686
3687#ifdef CONFIG_RPS
3688 /* Check if we have pending ipi, its better to send them now,
3689 * not waiting net_rx_action() end.
3690 */
3691 if (sd->rps_ipi_list) {
3692 local_irq_disable();
3693 net_rps_action_and_irq_enable(sd);
3694 }
3695#endif
3696 napi->weight = weight_p;
3697 local_irq_disable();
3698 while (work < quota) {
3699 struct sk_buff *skb;
3700 unsigned int qlen;
3701
3702 while ((skb = __skb_dequeue(&sd->process_queue))) {
3703 local_irq_enable();
3704 __netif_receive_skb(skb);
3705 local_irq_disable();
3706 input_queue_head_incr(sd);
3707 if (++work >= quota) {
3708 local_irq_enable();
3709 return work;
3710 }
3711 }
3712
3713 rps_lock(sd);
3714 qlen = skb_queue_len(&sd->input_pkt_queue);
3715 if (qlen)
3716 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3717 &sd->process_queue);
3718
3719 if (qlen < quota - work) {
3720 /*
3721 * Inline a custom version of __napi_complete().
3722 * only current cpu owns and manipulates this napi,
3723 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3724 * we can use a plain write instead of clear_bit(),
3725 * and we dont need an smp_mb() memory barrier.
3726 */
3727 list_del(&napi->poll_list);
3728 napi->state = 0;
3729
3730 quota = work + qlen;
3731 }
3732 rps_unlock(sd);
3733 }
3734 local_irq_enable();
3735
3736 return work;
3737}
3738
3739/**
3740 * __napi_schedule - schedule for receive
3741 * @n: entry to schedule
3742 *
3743 * The entry's receive function will be scheduled to run
3744 */
3745void __napi_schedule(struct napi_struct *n)
3746{
3747 unsigned long flags;
3748
3749 local_irq_save(flags);
3750 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3751 local_irq_restore(flags);
3752}
3753EXPORT_SYMBOL(__napi_schedule);
3754
3755void __napi_complete(struct napi_struct *n)
3756{
3757 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3758 BUG_ON(n->gro_list);
3759
3760 list_del(&n->poll_list);
3761 smp_mb__before_clear_bit();
3762 clear_bit(NAPI_STATE_SCHED, &n->state);
3763}
3764EXPORT_SYMBOL(__napi_complete);
3765
3766void napi_complete(struct napi_struct *n)
3767{
3768 unsigned long flags;
3769
3770 /*
3771 * don't let napi dequeue from the cpu poll list
3772 * just in case its running on a different cpu
3773 */
3774 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3775 return;
3776
3777 napi_gro_flush(n);
3778 local_irq_save(flags);
3779 __napi_complete(n);
3780 local_irq_restore(flags);
3781}
3782EXPORT_SYMBOL(napi_complete);
3783
3784void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3785 int (*poll)(struct napi_struct *, int), int weight)
3786{
3787 INIT_LIST_HEAD(&napi->poll_list);
3788 napi->gro_count = 0;
3789 napi->gro_list = NULL;
3790 napi->skb = NULL;
3791 napi->poll = poll;
3792 napi->weight = weight;
3793 list_add(&napi->dev_list, &dev->napi_list);
3794 napi->dev = dev;
3795#ifdef CONFIG_NETPOLL
3796 spin_lock_init(&napi->poll_lock);
3797 napi->poll_owner = -1;
3798#endif
3799 set_bit(NAPI_STATE_SCHED, &napi->state);
3800}
3801EXPORT_SYMBOL(netif_napi_add);
3802
3803void netif_napi_del(struct napi_struct *napi)
3804{
3805 struct sk_buff *skb, *next;
3806
3807 list_del_init(&napi->dev_list);
3808 napi_free_frags(napi);
3809
3810 for (skb = napi->gro_list; skb; skb = next) {
3811 next = skb->next;
3812 skb->next = NULL;
3813 kfree_skb(skb);
3814 }
3815
3816 napi->gro_list = NULL;
3817 napi->gro_count = 0;
3818}
3819EXPORT_SYMBOL(netif_napi_del);
3820
3821static void net_rx_action(struct softirq_action *h)
3822{
3823 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3824 unsigned long time_limit = jiffies + 2;
3825 int budget = netdev_budget;
3826 void *have;
3827
3828 local_irq_disable();
3829
3830 while (!list_empty(&sd->poll_list)) {
3831 struct napi_struct *n;
3832 int work, weight;
3833
3834 /* If softirq window is exhuasted then punt.
3835 * Allow this to run for 2 jiffies since which will allow
3836 * an average latency of 1.5/HZ.
3837 */
3838 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3839 goto softnet_break;
3840
3841 local_irq_enable();
3842
3843 /* Even though interrupts have been re-enabled, this
3844 * access is safe because interrupts can only add new
3845 * entries to the tail of this list, and only ->poll()
3846 * calls can remove this head entry from the list.
3847 */
3848 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3849
3850 have = netpoll_poll_lock(n);
3851
3852 weight = n->weight;
3853
3854 /* This NAPI_STATE_SCHED test is for avoiding a race
3855 * with netpoll's poll_napi(). Only the entity which
3856 * obtains the lock and sees NAPI_STATE_SCHED set will
3857 * actually make the ->poll() call. Therefore we avoid
3858 * accidentally calling ->poll() when NAPI is not scheduled.
3859 */
3860 work = 0;
3861 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3862 work = n->poll(n, weight);
3863 trace_napi_poll(n);
3864 }
3865
3866 WARN_ON_ONCE(work > weight);
3867
3868 budget -= work;
3869
3870 local_irq_disable();
3871
3872 /* Drivers must not modify the NAPI state if they
3873 * consume the entire weight. In such cases this code
3874 * still "owns" the NAPI instance and therefore can
3875 * move the instance around on the list at-will.
3876 */
3877 if (unlikely(work == weight)) {
3878 if (unlikely(napi_disable_pending(n))) {
3879 local_irq_enable();
3880 napi_complete(n);
3881 local_irq_disable();
3882 } else
3883 list_move_tail(&n->poll_list, &sd->poll_list);
3884 }
3885
3886 netpoll_poll_unlock(have);
3887 }
3888out:
3889 net_rps_action_and_irq_enable(sd);
3890
3891#ifdef CONFIG_NET_DMA
3892 /*
3893 * There may not be any more sk_buffs coming right now, so push
3894 * any pending DMA copies to hardware
3895 */
3896 dma_issue_pending_all();
3897#endif
3898
3899 return;
3900
3901softnet_break:
3902 sd->time_squeeze++;
3903 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3904 goto out;
3905}
3906
3907static gifconf_func_t *gifconf_list[NPROTO];
3908
3909/**
3910 * register_gifconf - register a SIOCGIF handler
3911 * @family: Address family
3912 * @gifconf: Function handler
3913 *
3914 * Register protocol dependent address dumping routines. The handler
3915 * that is passed must not be freed or reused until it has been replaced
3916 * by another handler.
3917 */
3918int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3919{
3920 if (family >= NPROTO)
3921 return -EINVAL;
3922 gifconf_list[family] = gifconf;
3923 return 0;
3924}
3925EXPORT_SYMBOL(register_gifconf);
3926
3927
3928/*
3929 * Map an interface index to its name (SIOCGIFNAME)
3930 */
3931
3932/*
3933 * We need this ioctl for efficient implementation of the
3934 * if_indextoname() function required by the IPv6 API. Without
3935 * it, we would have to search all the interfaces to find a
3936 * match. --pb
3937 */
3938
3939static int dev_ifname(struct net *net, struct ifreq __user *arg)
3940{
3941 struct net_device *dev;
3942 struct ifreq ifr;
3943
3944 /*
3945 * Fetch the caller's info block.
3946 */
3947
3948 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3949 return -EFAULT;
3950
3951 rcu_read_lock();
3952 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3953 if (!dev) {
3954 rcu_read_unlock();
3955 return -ENODEV;
3956 }
3957
3958 strcpy(ifr.ifr_name, dev->name);
3959 rcu_read_unlock();
3960
3961 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3962 return -EFAULT;
3963 return 0;
3964}
3965
3966/*
3967 * Perform a SIOCGIFCONF call. This structure will change
3968 * size eventually, and there is nothing I can do about it.
3969 * Thus we will need a 'compatibility mode'.
3970 */
3971
3972static int dev_ifconf(struct net *net, char __user *arg)
3973{
3974 struct ifconf ifc;
3975 struct net_device *dev;
3976 char __user *pos;
3977 int len;
3978 int total;
3979 int i;
3980
3981 /*
3982 * Fetch the caller's info block.
3983 */
3984
3985 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3986 return -EFAULT;
3987
3988 pos = ifc.ifc_buf;
3989 len = ifc.ifc_len;
3990
3991 /*
3992 * Loop over the interfaces, and write an info block for each.
3993 */
3994
3995 total = 0;
3996 for_each_netdev(net, dev) {
3997 for (i = 0; i < NPROTO; i++) {
3998 if (gifconf_list[i]) {
3999 int done;
4000 if (!pos)
4001 done = gifconf_list[i](dev, NULL, 0);
4002 else
4003 done = gifconf_list[i](dev, pos + total,
4004 len - total);
4005 if (done < 0)
4006 return -EFAULT;
4007 total += done;
4008 }
4009 }
4010 }
4011
4012 /*
4013 * All done. Write the updated control block back to the caller.
4014 */
4015 ifc.ifc_len = total;
4016
4017 /*
4018 * Both BSD and Solaris return 0 here, so we do too.
4019 */
4020 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4021}
4022
4023#ifdef CONFIG_PROC_FS
4024
/*
 * A seq_file position packs a name-hash bucket number into the bits
 * above BUCKET_SPACE and an offset within that bucket into the bits
 * below it.
 */
#define BUCKET_SPACE		(32 - NETDEV_HASHBITS - 1)

#define get_bucket(x)		((x) >> BUCKET_SPACE)
#define get_offset(x)		((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o)	(((b) << BUCKET_SPACE) | (o))
4030
4031static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4032{
4033 struct net *net = seq_file_net(seq);
4034 struct net_device *dev;
4035 struct hlist_node *p;
4036 struct hlist_head *h;
4037 unsigned int count = 0, offset = get_offset(*pos);
4038
4039 h = &net->dev_name_head[get_bucket(*pos)];
4040 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4041 if (++count == offset)
4042 return dev;
4043 }
4044
4045 return NULL;
4046}
4047
4048static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4049{
4050 struct net_device *dev;
4051 unsigned int bucket;
4052
4053 do {
4054 dev = dev_from_same_bucket(seq, pos);
4055 if (dev)
4056 return dev;
4057
4058 bucket = get_bucket(*pos) + 1;
4059 *pos = set_bucket_offset(bucket, 1);
4060 } while (bucket < NETDEV_HASHENTRIES);
4061
4062 return NULL;
4063}
4064
4065/*
4066 * This is invoked by the /proc filesystem handler to display a device
4067 * in detail.
4068 */
4069void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4070 __acquires(RCU)
4071{
4072 rcu_read_lock();
4073 if (!*pos)
4074 return SEQ_START_TOKEN;
4075
4076 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4077 return NULL;
4078
4079 return dev_from_bucket(seq, pos);
4080}
4081
4082void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4083{
4084 ++*pos;
4085 return dev_from_bucket(seq, pos);
4086}
4087
4088void dev_seq_stop(struct seq_file *seq, void *v)
4089 __releases(RCU)
4090{
4091 rcu_read_unlock();
4092}
4093
4094static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4095{
4096 struct rtnl_link_stats64 temp;
4097 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4098
4099 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4100 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4101 dev->name, stats->rx_bytes, stats->rx_packets,
4102 stats->rx_errors,
4103 stats->rx_dropped + stats->rx_missed_errors,
4104 stats->rx_fifo_errors,
4105 stats->rx_length_errors + stats->rx_over_errors +
4106 stats->rx_crc_errors + stats->rx_frame_errors,
4107 stats->rx_compressed, stats->multicast,
4108 stats->tx_bytes, stats->tx_packets,
4109 stats->tx_errors, stats->tx_dropped,
4110 stats->tx_fifo_errors, stats->collisions,
4111 stats->tx_carrier_errors +
4112 stats->tx_aborted_errors +
4113 stats->tx_window_errors +
4114 stats->tx_heartbeat_errors,
4115 stats->tx_compressed);
4116}
4117
4118/*
4119 * Called from the PROCfs module. This now uses the new arbitrary sized
4120 * /proc/net interface to create /proc/net/dev
4121 */
4122static int dev_seq_show(struct seq_file *seq, void *v)
4123{
4124 if (v == SEQ_START_TOKEN)
4125 seq_puts(seq, "Inter-| Receive "
4126 " | Transmit\n"
4127 " face |bytes packets errs drop fifo frame "
4128 "compressed multicast|bytes packets errs "
4129 "drop fifo colls carrier compressed\n");
4130 else
4131 dev_seq_printf_stats(seq, v);
4132 return 0;
4133}
4134
4135static struct softnet_data *softnet_get_online(loff_t *pos)
4136{
4137 struct softnet_data *sd = NULL;
4138
4139 while (*pos < nr_cpu_ids)
4140 if (cpu_online(*pos)) {
4141 sd = &per_cpu(softnet_data, *pos);
4142 break;
4143 } else
4144 ++*pos;
4145 return sd;
4146}
4147
4148static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4149{
4150 return softnet_get_online(pos);
4151}
4152
4153static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4154{
4155 ++*pos;
4156 return softnet_get_online(pos);
4157}
4158
/* seq_file stop handler: nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4162
4163static int softnet_seq_show(struct seq_file *seq, void *v)
4164{
4165 struct softnet_data *sd = v;
4166
4167 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4168 sd->processed, sd->dropped, sd->time_squeeze, 0,
4169 0, 0, 0, 0, /* was fastroute */
4170 sd->cpu_collision, sd->received_rps);
4171 return 0;
4172}
4173
4174static const struct seq_operations dev_seq_ops = {
4175 .start = dev_seq_start,
4176 .next = dev_seq_next,
4177 .stop = dev_seq_stop,
4178 .show = dev_seq_show,
4179};
4180
4181static int dev_seq_open(struct inode *inode, struct file *file)
4182{
4183 return seq_open_net(inode, file, &dev_seq_ops,
4184 sizeof(struct seq_net_private));
4185}
4186
4187static const struct file_operations dev_seq_fops = {
4188 .owner = THIS_MODULE,
4189 .open = dev_seq_open,
4190 .read = seq_read,
4191 .llseek = seq_lseek,
4192 .release = seq_release_net,
4193};
4194
4195static const struct seq_operations softnet_seq_ops = {
4196 .start = softnet_seq_start,
4197 .next = softnet_seq_next,
4198 .stop = softnet_seq_stop,
4199 .show = softnet_seq_show,
4200};
4201
4202static int softnet_seq_open(struct inode *inode, struct file *file)
4203{
4204 return seq_open(file, &softnet_seq_ops);
4205}
4206
4207static const struct file_operations softnet_seq_fops = {
4208 .owner = THIS_MODULE,
4209 .open = softnet_seq_open,
4210 .read = seq_read,
4211 .llseek = seq_lseek,
4212 .release = seq_release,
4213};
4214
4215static void *ptype_get_idx(loff_t pos)
4216{
4217 struct packet_type *pt = NULL;
4218 loff_t i = 0;
4219 int t;
4220
4221 list_for_each_entry_rcu(pt, &ptype_all, list) {
4222 if (i == pos)
4223 return pt;
4224 ++i;
4225 }
4226
4227 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4228 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4229 if (i == pos)
4230 return pt;
4231 ++i;
4232 }
4233 }
4234 return NULL;
4235}
4236
4237static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4238 __acquires(RCU)
4239{
4240 rcu_read_lock();
4241 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4242}
4243
4244static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4245{
4246 struct packet_type *pt;
4247 struct list_head *nxt;
4248 int hash;
4249
4250 ++*pos;
4251 if (v == SEQ_START_TOKEN)
4252 return ptype_get_idx(0);
4253
4254 pt = v;
4255 nxt = pt->list.next;
4256 if (pt->type == htons(ETH_P_ALL)) {
4257 if (nxt != &ptype_all)
4258 goto found;
4259 hash = 0;
4260 nxt = ptype_base[0].next;
4261 } else
4262 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4263
4264 while (nxt == &ptype_base[hash]) {
4265 if (++hash >= PTYPE_HASH_SIZE)
4266 return NULL;
4267 nxt = ptype_base[hash].next;
4268 }
4269found:
4270 return list_entry(nxt, struct packet_type, list);
4271}
4272
4273static void ptype_seq_stop(struct seq_file *seq, void *v)
4274 __releases(RCU)
4275{
4276 rcu_read_unlock();
4277}
4278
4279static int ptype_seq_show(struct seq_file *seq, void *v)
4280{
4281 struct packet_type *pt = v;
4282
4283 if (v == SEQ_START_TOKEN)
4284 seq_puts(seq, "Type Device Function\n");
4285 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4286 if (pt->type == htons(ETH_P_ALL))
4287 seq_puts(seq, "ALL ");
4288 else
4289 seq_printf(seq, "%04x", ntohs(pt->type));
4290
4291 seq_printf(seq, " %-8s %pF\n",
4292 pt->dev ? pt->dev->name : "", pt->func);
4293 }
4294
4295 return 0;
4296}
4297
4298static const struct seq_operations ptype_seq_ops = {
4299 .start = ptype_seq_start,
4300 .next = ptype_seq_next,
4301 .stop = ptype_seq_stop,
4302 .show = ptype_seq_show,
4303};
4304
4305static int ptype_seq_open(struct inode *inode, struct file *file)
4306{
4307 return seq_open_net(inode, file, &ptype_seq_ops,
4308 sizeof(struct seq_net_private));
4309}
4310
4311static const struct file_operations ptype_seq_fops = {
4312 .owner = THIS_MODULE,
4313 .open = ptype_seq_open,
4314 .read = seq_read,
4315 .llseek = seq_lseek,
4316 .release = seq_release_net,
4317};
4318
4319
4320static int __net_init dev_proc_net_init(struct net *net)
4321{
4322 int rc = -ENOMEM;
4323
4324 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4325 goto out;
4326 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4327 goto out_dev;
4328 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4329 goto out_softnet;
4330
4331 if (wext_proc_init(net))
4332 goto out_ptype;
4333 rc = 0;
4334out:
4335 return rc;
4336out_ptype:
4337 proc_net_remove(net, "ptype");
4338out_softnet:
4339 proc_net_remove(net, "softnet_stat");
4340out_dev:
4341 proc_net_remove(net, "dev");
4342 goto out;
4343}
4344
4345static void __net_exit dev_proc_net_exit(struct net *net)
4346{
4347 wext_proc_exit(net);
4348
4349 proc_net_remove(net, "ptype");
4350 proc_net_remove(net, "softnet_stat");
4351 proc_net_remove(net, "dev");
4352}
4353
4354static struct pernet_operations __net_initdata dev_proc_ops = {
4355 .init = dev_proc_net_init,
4356 .exit = dev_proc_net_exit,
4357};
4358
4359static int __init dev_proc_init(void)
4360{
4361 return register_pernet_subsys(&dev_proc_ops);
4362}
4363#else
4364#define dev_proc_init() 0
4365#endif /* CONFIG_PROC_FS */
4366
4367
4368/**
4369 * netdev_set_master - set up master pointer
4370 * @slave: slave device
4371 * @master: new master device
4372 *
4373 * Changes the master device of the slave. Pass %NULL to break the
4374 * bonding. The caller must hold the RTNL semaphore. On a failure
4375 * a negative errno code is returned. On success the reference counts
4376 * are adjusted and the function returns zero.
4377 */
4378int netdev_set_master(struct net_device *slave, struct net_device *master)
4379{
4380 struct net_device *old = slave->master;
4381
4382 ASSERT_RTNL();
4383
4384 if (master) {
4385 if (old)
4386 return -EBUSY;
4387 dev_hold(master);
4388 }
4389
4390 slave->master = master;
4391
4392 if (old)
4393 dev_put(old);
4394 return 0;
4395}
4396EXPORT_SYMBOL(netdev_set_master);
4397
4398/**
4399 * netdev_set_bond_master - set up bonding master/slave pair
4400 * @slave: slave device
4401 * @master: new master device
4402 *
4403 * Changes the master device of the slave. Pass %NULL to break the
4404 * bonding. The caller must hold the RTNL semaphore. On a failure
4405 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4406 * to the routing socket and the function returns zero.
4407 */
4408int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4409{
4410 int err;
4411
4412 ASSERT_RTNL();
4413
4414 err = netdev_set_master(slave, master);
4415 if (err)
4416 return err;
4417 if (master)
4418 slave->flags |= IFF_SLAVE;
4419 else
4420 slave->flags &= ~IFF_SLAVE;
4421
4422 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4423 return 0;
4424}
4425EXPORT_SYMBOL(netdev_set_bond_master);
4426
4427static void dev_change_rx_flags(struct net_device *dev, int flags)
4428{
4429 const struct net_device_ops *ops = dev->netdev_ops;
4430
4431 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4432 ops->ndo_change_rx_flags(dev, flags);
4433}
4434
4435static int __dev_set_promiscuity(struct net_device *dev, int inc)
4436{
4437 unsigned int old_flags = dev->flags;
4438 uid_t uid;
4439 gid_t gid;
4440
4441 ASSERT_RTNL();
4442
4443 dev->flags |= IFF_PROMISC;
4444 dev->promiscuity += inc;
4445 if (dev->promiscuity == 0) {
4446 /*
4447 * Avoid overflow.
4448 * If inc causes overflow, untouch promisc and return error.
4449 */
4450 if (inc < 0)
4451 dev->flags &= ~IFF_PROMISC;
4452 else {
4453 dev->promiscuity -= inc;
4454 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4455 dev->name);
4456 return -EOVERFLOW;
4457 }
4458 }
4459 if (dev->flags != old_flags) {
4460 pr_info("device %s %s promiscuous mode\n",
4461 dev->name,
4462 dev->flags & IFF_PROMISC ? "entered" : "left");
4463 if (audit_enabled) {
4464 current_uid_gid(&uid, &gid);
4465 audit_log(current->audit_context, GFP_ATOMIC,
4466 AUDIT_ANOM_PROMISCUOUS,
4467 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4468 dev->name, (dev->flags & IFF_PROMISC),
4469 (old_flags & IFF_PROMISC),
4470 audit_get_loginuid(current),
4471 uid, gid,
4472 audit_get_sessionid(current));
4473 }
4474
4475 dev_change_rx_flags(dev, IFF_PROMISC);
4476 }
4477 return 0;
4478}
4479
4480/**
4481 * dev_set_promiscuity - update promiscuity count on a device
4482 * @dev: device
4483 * @inc: modifier
4484 *
4485 * Add or remove promiscuity from a device. While the count in the device
4486 * remains above zero the interface remains promiscuous. Once it hits zero
4487 * the device reverts back to normal filtering operation. A negative inc
4488 * value is used to drop promiscuity on the device.
4489 * Return 0 if successful or a negative errno code on error.
4490 */
4491int dev_set_promiscuity(struct net_device *dev, int inc)
4492{
4493 unsigned int old_flags = dev->flags;
4494 int err;
4495
4496 err = __dev_set_promiscuity(dev, inc);
4497 if (err < 0)
4498 return err;
4499 if (dev->flags != old_flags)
4500 dev_set_rx_mode(dev);
4501 return err;
4502}
4503EXPORT_SYMBOL(dev_set_promiscuity);
4504
4505/**
4506 * dev_set_allmulti - update allmulti count on a device
4507 * @dev: device
4508 * @inc: modifier
4509 *
4510 * Add or remove reception of all multicast frames to a device. While the
4511 * count in the device remains above zero the interface remains listening
4512 * to all interfaces. Once it hits zero the device reverts back to normal
4513 * filtering operation. A negative @inc value is used to drop the counter
4514 * when releasing a resource needing all multicasts.
4515 * Return 0 if successful or a negative errno code on error.
4516 */
4517
4518int dev_set_allmulti(struct net_device *dev, int inc)
4519{
4520 unsigned int old_flags = dev->flags;
4521
4522 ASSERT_RTNL();
4523
4524 dev->flags |= IFF_ALLMULTI;
4525 dev->allmulti += inc;
4526 if (dev->allmulti == 0) {
4527 /*
4528 * Avoid overflow.
4529 * If inc causes overflow, untouch allmulti and return error.
4530 */
4531 if (inc < 0)
4532 dev->flags &= ~IFF_ALLMULTI;
4533 else {
4534 dev->allmulti -= inc;
4535 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4536 dev->name);
4537 return -EOVERFLOW;
4538 }
4539 }
4540 if (dev->flags ^ old_flags) {
4541 dev_change_rx_flags(dev, IFF_ALLMULTI);
4542 dev_set_rx_mode(dev);
4543 }
4544 return 0;
4545}
4546EXPORT_SYMBOL(dev_set_allmulti);
4547
4548/*
4549 * Upload unicast and multicast address lists to device and
4550 * configure RX filtering. When the device doesn't support unicast
4551 * filtering it is put in promiscuous mode while unicast addresses
4552 * are present.
4553 */
4554void __dev_set_rx_mode(struct net_device *dev)
4555{
4556 const struct net_device_ops *ops = dev->netdev_ops;
4557
4558 /* dev_open will call this function so the list will stay sane. */
4559 if (!(dev->flags&IFF_UP))
4560 return;
4561
4562 if (!netif_device_present(dev))
4563 return;
4564
4565 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4566 /* Unicast addresses changes may only happen under the rtnl,
4567 * therefore calling __dev_set_promiscuity here is safe.
4568 */
4569 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4570 __dev_set_promiscuity(dev, 1);
4571 dev->uc_promisc = true;
4572 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4573 __dev_set_promiscuity(dev, -1);
4574 dev->uc_promisc = false;
4575 }
4576 }
4577
4578 if (ops->ndo_set_rx_mode)
4579 ops->ndo_set_rx_mode(dev);
4580}
4581
/* Run __dev_set_rx_mode() with the device's address lock held (BH variant). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4588
4589/**
4590 * dev_get_flags - get flags reported to userspace
4591 * @dev: device
4592 *
4593 * Get the combination of flag bits exported through APIs to userspace.
4594 */
4595unsigned int dev_get_flags(const struct net_device *dev)
4596{
4597 unsigned int flags;
4598
4599 flags = (dev->flags & ~(IFF_PROMISC |
4600 IFF_ALLMULTI |
4601 IFF_RUNNING |
4602 IFF_LOWER_UP |
4603 IFF_DORMANT)) |
4604 (dev->gflags & (IFF_PROMISC |
4605 IFF_ALLMULTI));
4606
4607 if (netif_running(dev)) {
4608 if (netif_oper_up(dev))
4609 flags |= IFF_RUNNING;
4610 if (netif_carrier_ok(dev))
4611 flags |= IFF_LOWER_UP;
4612 if (netif_dormant(dev))
4613 flags |= IFF_DORMANT;
4614 }
4615
4616 return flags;
4617}
4618EXPORT_SYMBOL(dev_get_flags);
4619
4620int __dev_change_flags(struct net_device *dev, unsigned int flags)
4621{
4622 unsigned int old_flags = dev->flags;
4623 int ret;
4624
4625 ASSERT_RTNL();
4626
4627 /*
4628 * Set the flags on our device.
4629 */
4630
4631 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4632 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4633 IFF_AUTOMEDIA)) |
4634 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4635 IFF_ALLMULTI));
4636
4637 /*
4638 * Load in the correct multicast list now the flags have changed.
4639 */
4640
4641 if ((old_flags ^ flags) & IFF_MULTICAST)
4642 dev_change_rx_flags(dev, IFF_MULTICAST);
4643
4644 dev_set_rx_mode(dev);
4645
4646 /*
4647 * Have we downed the interface. We handle IFF_UP ourselves
4648 * according to user attempts to set it, rather than blindly
4649 * setting it.
4650 */
4651
4652 ret = 0;
4653 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4654 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4655
4656 if (!ret)
4657 dev_set_rx_mode(dev);
4658 }
4659
4660 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4661 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4662
4663 dev->gflags ^= IFF_PROMISC;
4664 dev_set_promiscuity(dev, inc);
4665 }
4666
4667 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4668 is important. Some (broken) drivers set IFF_PROMISC, when
4669 IFF_ALLMULTI is requested not asking us and not reporting.
4670 */
4671 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4672 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4673
4674 dev->gflags ^= IFF_ALLMULTI;
4675 dev_set_allmulti(dev, inc);
4676 }
4677
4678 return ret;
4679}
4680
4681void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4682{
4683 unsigned int changes = dev->flags ^ old_flags;
4684
4685 if (changes & IFF_UP) {
4686 if (dev->flags & IFF_UP)
4687 call_netdevice_notifiers(NETDEV_UP, dev);
4688 else
4689 call_netdevice_notifiers(NETDEV_DOWN, dev);
4690 }
4691
4692 if (dev->flags & IFF_UP &&
4693 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4694 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4695}
4696
4697/**
4698 * dev_change_flags - change device settings
4699 * @dev: device
4700 * @flags: device state flags
4701 *
4702 * Change settings on device based state flags. The flags are
4703 * in the userspace exported format.
4704 */
4705int dev_change_flags(struct net_device *dev, unsigned int flags)
4706{
4707 int ret;
4708 unsigned int changes, old_flags = dev->flags;
4709
4710 ret = __dev_change_flags(dev, flags);
4711 if (ret < 0)
4712 return ret;
4713
4714 changes = old_flags ^ dev->flags;
4715 if (changes)
4716 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4717
4718 __dev_notify_flags(dev, old_flags);
4719 return ret;
4720}
4721EXPORT_SYMBOL(dev_change_flags);
4722
4723/**
4724 * dev_set_mtu - Change maximum transfer unit
4725 * @dev: device
4726 * @new_mtu: new transfer unit
4727 *
4728 * Change the maximum transfer size of the network device.
4729 */
4730int dev_set_mtu(struct net_device *dev, int new_mtu)
4731{
4732 const struct net_device_ops *ops = dev->netdev_ops;
4733 int err;
4734
4735 if (new_mtu == dev->mtu)
4736 return 0;
4737
4738 /* MTU must be positive. */
4739 if (new_mtu < 0)
4740 return -EINVAL;
4741
4742 if (!netif_device_present(dev))
4743 return -ENODEV;
4744
4745 err = 0;
4746 if (ops->ndo_change_mtu)
4747 err = ops->ndo_change_mtu(dev, new_mtu);
4748 else
4749 dev->mtu = new_mtu;
4750
4751 if (!err && dev->flags & IFF_UP)
4752 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4753 return err;
4754}
4755EXPORT_SYMBOL(dev_set_mtu);
4756
4757/**
4758 * dev_set_group - Change group this device belongs to
4759 * @dev: device
4760 * @new_group: group this device should belong to
4761 */
4762void dev_set_group(struct net_device *dev, int new_group)
4763{
4764 dev->group = new_group;
4765}
4766EXPORT_SYMBOL(dev_set_group);
4767
4768/**
4769 * dev_set_mac_address - Change Media Access Control Address
4770 * @dev: device
4771 * @sa: new address
4772 *
4773 * Change the hardware (MAC) address of the device
4774 */
4775int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4776{
4777 const struct net_device_ops *ops = dev->netdev_ops;
4778 int err;
4779
4780 if (!ops->ndo_set_mac_address)
4781 return -EOPNOTSUPP;
4782 if (sa->sa_family != dev->type)
4783 return -EINVAL;
4784 if (!netif_device_present(dev))
4785 return -ENODEV;
4786 err = ops->ndo_set_mac_address(dev, sa);
4787 if (!err)
4788 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4789 add_device_randomness(dev->dev_addr, dev->addr_len);
4790 return err;
4791}
4792EXPORT_SYMBOL(dev_set_mac_address);
4793
4794/*
4795 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4796 */
4797static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4798{
4799 int err;
4800 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4801
4802 if (!dev)
4803 return -ENODEV;
4804
4805 switch (cmd) {
4806 case SIOCGIFFLAGS: /* Get interface flags */
4807 ifr->ifr_flags = (short) dev_get_flags(dev);
4808 return 0;
4809
4810 case SIOCGIFMETRIC: /* Get the metric on the interface
4811 (currently unused) */
4812 ifr->ifr_metric = 0;
4813 return 0;
4814
4815 case SIOCGIFMTU: /* Get the MTU of a device */
4816 ifr->ifr_mtu = dev->mtu;
4817 return 0;
4818
4819 case SIOCGIFHWADDR:
4820 if (!dev->addr_len)
4821 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4822 else
4823 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4824 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4825 ifr->ifr_hwaddr.sa_family = dev->type;
4826 return 0;
4827
4828 case SIOCGIFSLAVE:
4829 err = -EINVAL;
4830 break;
4831
4832 case SIOCGIFMAP:
4833 ifr->ifr_map.mem_start = dev->mem_start;
4834 ifr->ifr_map.mem_end = dev->mem_end;
4835 ifr->ifr_map.base_addr = dev->base_addr;
4836 ifr->ifr_map.irq = dev->irq;
4837 ifr->ifr_map.dma = dev->dma;
4838 ifr->ifr_map.port = dev->if_port;
4839 return 0;
4840
4841 case SIOCGIFINDEX:
4842 ifr->ifr_ifindex = dev->ifindex;
4843 return 0;
4844
4845 case SIOCGIFTXQLEN:
4846 ifr->ifr_qlen = dev->tx_queue_len;
4847 return 0;
4848
4849 default:
4850 /* dev_ioctl() should ensure this case
4851 * is never reached
4852 */
4853 WARN_ON(1);
4854 err = -ENOTTY;
4855 break;
4856
4857 }
4858 return err;
4859}
4860
4861/*
4862 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4863 */
4864static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4865{
4866 int err;
4867 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4868 const struct net_device_ops *ops;
4869
4870 if (!dev)
4871 return -ENODEV;
4872
4873 ops = dev->netdev_ops;
4874
4875 switch (cmd) {
4876 case SIOCSIFFLAGS: /* Set interface flags */
4877 return dev_change_flags(dev, ifr->ifr_flags);
4878
4879 case SIOCSIFMETRIC: /* Set the metric on the interface
4880 (currently unused) */
4881 return -EOPNOTSUPP;
4882
4883 case SIOCSIFMTU: /* Set the MTU of a device */
4884 return dev_set_mtu(dev, ifr->ifr_mtu);
4885
4886 case SIOCSIFHWADDR:
4887 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4888
4889 case SIOCSIFHWBROADCAST:
4890 if (ifr->ifr_hwaddr.sa_family != dev->type)
4891 return -EINVAL;
4892 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4893 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4894 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4895 return 0;
4896
4897 case SIOCSIFMAP:
4898 if (ops->ndo_set_config) {
4899 if (!netif_device_present(dev))
4900 return -ENODEV;
4901 return ops->ndo_set_config(dev, &ifr->ifr_map);
4902 }
4903 return -EOPNOTSUPP;
4904
4905 case SIOCADDMULTI:
4906 if (!ops->ndo_set_rx_mode ||
4907 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4908 return -EINVAL;
4909 if (!netif_device_present(dev))
4910 return -ENODEV;
4911 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4912
4913 case SIOCDELMULTI:
4914 if (!ops->ndo_set_rx_mode ||
4915 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4916 return -EINVAL;
4917 if (!netif_device_present(dev))
4918 return -ENODEV;
4919 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4920
4921 case SIOCSIFTXQLEN:
4922 if (ifr->ifr_qlen < 0)
4923 return -EINVAL;
4924 dev->tx_queue_len = ifr->ifr_qlen;
4925 return 0;
4926
4927 case SIOCSIFNAME:
4928 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4929 return dev_change_name(dev, ifr->ifr_newname);
4930
4931 case SIOCSHWTSTAMP:
4932 err = net_hwtstamp_validate(ifr);
4933 if (err)
4934 return err;
4935 /* fall through */
4936
4937 /*
4938 * Unknown or private ioctl
4939 */
4940 default:
4941 if ((cmd >= SIOCDEVPRIVATE &&
4942 cmd <= SIOCDEVPRIVATE + 15) ||
4943 cmd == SIOCBONDENSLAVE ||
4944 cmd == SIOCBONDRELEASE ||
4945 cmd == SIOCBONDSETHWADDR ||
4946 cmd == SIOCBONDSLAVEINFOQUERY ||
4947 cmd == SIOCBONDINFOQUERY ||
4948 cmd == SIOCBONDCHANGEACTIVE ||
4949 cmd == SIOCGMIIPHY ||
4950 cmd == SIOCGMIIREG ||
4951 cmd == SIOCSMIIREG ||
4952 cmd == SIOCBRADDIF ||
4953 cmd == SIOCBRDELIF ||
4954 cmd == SIOCSHWTSTAMP ||
4955 cmd == SIOCWANDEV) {
4956 err = -EOPNOTSUPP;
4957 if (ops->ndo_do_ioctl) {
4958 if (netif_device_present(dev))
4959 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4960 else
4961 err = -ENODEV;
4962 }
4963 } else
4964 err = -EINVAL;
4965
4966 }
4967 return err;
4968}
4969
4970/*
4971 * This function handles all "interface"-type I/O control requests. The actual
4972 * 'doing' part of this is dev_ifsioc above.
4973 */
4974
4975/**
4976 * dev_ioctl - network device ioctl
4977 * @net: the applicable net namespace
4978 * @cmd: command to issue
4979 * @arg: pointer to a struct ifreq in user space
4980 *
4981 * Issue ioctl functions to devices. This is normally called by the
4982 * user space syscall interfaces but can sometimes be useful for
4983 * other purposes. The return value is the return from the syscall if
4984 * positive or a negative errno code on error.
4985 */
4986
4987int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4988{
4989 struct ifreq ifr;
4990 int ret;
4991 char *colon;
4992
4993 /* One special case: SIOCGIFCONF takes ifconf argument
4994 and requires shared lock, because it sleeps writing
4995 to user space.
4996 */
4997
4998 if (cmd == SIOCGIFCONF) {
4999 rtnl_lock();
5000 ret = dev_ifconf(net, (char __user *) arg);
5001 rtnl_unlock();
5002 return ret;
5003 }
5004 if (cmd == SIOCGIFNAME)
5005 return dev_ifname(net, (struct ifreq __user *)arg);
5006
5007 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5008 return -EFAULT;
5009
5010 ifr.ifr_name[IFNAMSIZ-1] = 0;
5011
5012 colon = strchr(ifr.ifr_name, ':');
5013 if (colon)
5014 *colon = 0;
5015
5016 /*
5017 * See which interface the caller is talking about.
5018 */
5019
5020 switch (cmd) {
5021 /*
5022 * These ioctl calls:
5023 * - can be done by all.
5024 * - atomic and do not require locking.
5025 * - return a value
5026 */
5027 case SIOCGIFFLAGS:
5028 case SIOCGIFMETRIC:
5029 case SIOCGIFMTU:
5030 case SIOCGIFHWADDR:
5031 case SIOCGIFSLAVE:
5032 case SIOCGIFMAP:
5033 case SIOCGIFINDEX:
5034 case SIOCGIFTXQLEN:
5035 dev_load(net, ifr.ifr_name);
5036 rcu_read_lock();
5037 ret = dev_ifsioc_locked(net, &ifr, cmd);
5038 rcu_read_unlock();
5039 if (!ret) {
5040 if (colon)
5041 *colon = ':';
5042 if (copy_to_user(arg, &ifr,
5043 sizeof(struct ifreq)))
5044 ret = -EFAULT;
5045 }
5046 return ret;
5047
5048 case SIOCETHTOOL:
5049 dev_load(net, ifr.ifr_name);
5050 rtnl_lock();
5051 ret = dev_ethtool(net, &ifr);
5052 rtnl_unlock();
5053 if (!ret) {
5054 if (colon)
5055 *colon = ':';
5056 if (copy_to_user(arg, &ifr,
5057 sizeof(struct ifreq)))
5058 ret = -EFAULT;
5059 }
5060 return ret;
5061
5062 /*
5063 * These ioctl calls:
5064 * - require superuser power.
5065 * - require strict serialization.
5066 * - return a value
5067 */
5068 case SIOCGMIIPHY:
5069 case SIOCGMIIREG:
5070 case SIOCSIFNAME:
5071 if (!capable(CAP_NET_ADMIN))
5072 return -EPERM;
5073 dev_load(net, ifr.ifr_name);
5074 rtnl_lock();
5075 ret = dev_ifsioc(net, &ifr, cmd);
5076 rtnl_unlock();
5077 if (!ret) {
5078 if (colon)
5079 *colon = ':';
5080 if (copy_to_user(arg, &ifr,
5081 sizeof(struct ifreq)))
5082 ret = -EFAULT;
5083 }
5084 return ret;
5085
5086 /*
5087 * These ioctl calls:
5088 * - require superuser power.
5089 * - require strict serialization.
5090 * - do not return a value
5091 */
5092 case SIOCSIFFLAGS:
5093 case SIOCSIFMETRIC:
5094 case SIOCSIFMTU:
5095 case SIOCSIFMAP:
5096 case SIOCSIFHWADDR:
5097 case SIOCSIFSLAVE:
5098 case SIOCADDMULTI:
5099 case SIOCDELMULTI:
5100 case SIOCSIFHWBROADCAST:
5101 case SIOCSIFTXQLEN:
5102 case SIOCSMIIREG:
5103 case SIOCBONDENSLAVE:
5104 case SIOCBONDRELEASE:
5105 case SIOCBONDSETHWADDR:
5106 case SIOCBONDCHANGEACTIVE:
5107 case SIOCBRADDIF:
5108 case SIOCBRDELIF:
5109 case SIOCSHWTSTAMP:
5110 if (!capable(CAP_NET_ADMIN))
5111 return -EPERM;
5112 /* fall through */
5113 case SIOCBONDSLAVEINFOQUERY:
5114 case SIOCBONDINFOQUERY:
5115 dev_load(net, ifr.ifr_name);
5116 rtnl_lock();
5117 ret = dev_ifsioc(net, &ifr, cmd);
5118 rtnl_unlock();
5119 return ret;
5120
5121 case SIOCGIFMEM:
5122 /* Get the per device memory space. We can add this but
5123 * currently do not support it */
5124 case SIOCSIFMEM:
5125 /* Set the per device memory buffer space.
5126 * Not applicable in our case */
5127 case SIOCSIFLINK:
5128 return -ENOTTY;
5129
5130 /*
5131 * Unknown or private ioctl.
5132 */
5133 default:
5134 if (cmd == SIOCWANDEV ||
5135 (cmd >= SIOCDEVPRIVATE &&
5136 cmd <= SIOCDEVPRIVATE + 15)) {
5137 dev_load(net, ifr.ifr_name);
5138 rtnl_lock();
5139 ret = dev_ifsioc(net, &ifr, cmd);
5140 rtnl_unlock();
5141 if (!ret && copy_to_user(arg, &ifr,
5142 sizeof(struct ifreq)))
5143 ret = -EFAULT;
5144 return ret;
5145 }
5146 /* Take care of Wireless Extensions */
5147 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5148 return wext_handle_ioctl(net, &ifr, cmd, arg);
5149 return -ENOTTY;
5150 }
5151}
5152
5153
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns the next positive value of a static counter that is not in
 *	use by any device in @net, wrapping back to 1 when the counter would
 *	become non-positive.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5172
5173/* Delayed registration/unregisteration */
5174static LIST_HEAD(net_todo_list);
5175
5176static void net_set_todo(struct net_device *dev)
5177{
5178 list_add_tail(&dev->todo_list, &net_todo_list);
5179}
5180
5181static void rollback_registered_many(struct list_head *head)
5182{
5183 struct net_device *dev, *tmp;
5184
5185 BUG_ON(dev_boot_phase);
5186 ASSERT_RTNL();
5187
5188 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5189 /* Some devices call without registering
5190 * for initialization unwind. Remove those
5191 * devices and proceed with the remaining.
5192 */
5193 if (dev->reg_state == NETREG_UNINITIALIZED) {
5194 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5195 dev->name, dev);
5196
5197 WARN_ON(1);
5198 list_del(&dev->unreg_list);
5199 continue;
5200 }
5201 dev->dismantle = true;
5202 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5203 }
5204
5205 /* If device is running, close it first. */
5206 dev_close_many(head);
5207
5208 list_for_each_entry(dev, head, unreg_list) {
5209 /* And unlink it from device chain. */
5210 unlist_netdevice(dev);
5211
5212 dev->reg_state = NETREG_UNREGISTERING;
5213 }
5214
5215 synchronize_net();
5216
5217 list_for_each_entry(dev, head, unreg_list) {
5218 /* Shutdown queueing discipline. */
5219 dev_shutdown(dev);
5220
5221
5222 /* Notify protocols, that we are about to destroy
5223 this device. They should clean all the things.
5224 */
5225 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5226
5227 if (!dev->rtnl_link_ops ||
5228 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5229 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5230
5231 /*
5232 * Flush the unicast and multicast chains
5233 */
5234 dev_uc_flush(dev);
5235 dev_mc_flush(dev);
5236
5237 if (dev->netdev_ops->ndo_uninit)
5238 dev->netdev_ops->ndo_uninit(dev);
5239
5240 /* Notifier chain MUST detach us from master device. */
5241 WARN_ON(dev->master);
5242
5243 /* Remove entries from kobject tree */
5244 netdev_unregister_kobject(dev);
5245 }
5246
5247 /* Process any work delayed until the end of the batch */
5248 dev = list_first_entry(head, struct net_device, unreg_list);
5249 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5250
5251 synchronize_net();
5252
5253 list_for_each_entry(dev, head, unreg_list)
5254 dev_put(dev);
5255}
5256
5257static void rollback_registered(struct net_device *dev)
5258{
5259 LIST_HEAD(single);
5260
5261 list_add(&dev->unreg_list, &single);
5262 rollback_registered_many(&single);
5263 list_del(&single);
5264}
5265
5266static netdev_features_t netdev_fix_features(struct net_device *dev,
5267 netdev_features_t features)
5268{
5269 /* Fix illegal checksum combinations */
5270 if ((features & NETIF_F_HW_CSUM) &&
5271 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5272 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5273 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5274 }
5275
5276 /* Fix illegal SG+CSUM combinations. */
5277 if ((features & NETIF_F_SG) &&
5278 !(features & NETIF_F_ALL_CSUM)) {
5279 netdev_dbg(dev,
5280 "Dropping NETIF_F_SG since no checksum feature.\n");
5281 features &= ~NETIF_F_SG;
5282 }
5283
5284 /* TSO requires that SG is present as well. */
5285 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5286 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5287 features &= ~NETIF_F_ALL_TSO;
5288 }
5289
5290 /* TSO ECN requires that TSO is present as well. */
5291 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5292 features &= ~NETIF_F_TSO_ECN;
5293
5294 /* Software GSO depends on SG. */
5295 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5296 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5297 features &= ~NETIF_F_GSO;
5298 }
5299
5300 /* UFO needs SG and checksumming */
5301 if (features & NETIF_F_UFO) {
5302 /* maybe split UFO into V4 and V6? */
5303 if (!((features & NETIF_F_GEN_CSUM) ||
5304 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5305 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5306 netdev_dbg(dev,
5307 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5308 features &= ~NETIF_F_UFO;
5309 }
5310
5311 if (!(features & NETIF_F_SG)) {
5312 netdev_dbg(dev,
5313 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5314 features &= ~NETIF_F_UFO;
5315 }
5316 }
5317
5318 return features;
5319}
5320
5321int __netdev_update_features(struct net_device *dev)
5322{
5323 netdev_features_t features;
5324 int err = 0;
5325
5326 ASSERT_RTNL();
5327
5328 features = netdev_get_wanted_features(dev);
5329
5330 if (dev->netdev_ops->ndo_fix_features)
5331 features = dev->netdev_ops->ndo_fix_features(dev, features);
5332
5333 /* driver might be less strict about feature dependencies */
5334 features = netdev_fix_features(dev, features);
5335
5336 if (dev->features == features)
5337 return 0;
5338
5339 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5340 &dev->features, &features);
5341
5342 if (dev->netdev_ops->ndo_set_features)
5343 err = dev->netdev_ops->ndo_set_features(dev, features);
5344
5345 if (unlikely(err < 0)) {
5346 netdev_err(dev,
5347 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5348 err, &features, &dev->features);
5349 return -1;
5350 }
5351
5352 if (!err)
5353 dev->features = features;
5354
5355 return 1;
5356}
5357
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and, if
 *	__netdev_update_features() reports a non-zero result, send the
 *	feature-change notification.  Call this after driver or hardware
 *	dependent conditions that influence the features may have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5372
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and send the feature-change
 *	notification unconditionally, even when nothing changed.  Use
 *	this instead of netdev_update_features() when dev->vlan_features
 *	might also have changed, so that the change can be propagated to
 *	stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we always notify. */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5389
5390/**
5391 * netif_stacked_transfer_operstate - transfer operstate
5392 * @rootdev: the root or lower level device to transfer state from
5393 * @dev: the device to transfer operstate to
5394 *
5395 * Transfer operational state from root to device. This is normally
5396 * called when a stacking relationship exists between the root
5397 * device and the device(a leaf device).
5398 */
5399void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5400 struct net_device *dev)
5401{
5402 if (rootdev->operstate == IF_OPER_DORMANT)
5403 netif_dormant_on(dev);
5404 else
5405 netif_dormant_off(dev);
5406
5407 if (netif_carrier_ok(rootdev)) {
5408 if (!netif_carrier_ok(dev))
5409 netif_carrier_on(dev);
5410 } else {
5411 if (netif_carrier_ok(dev))
5412 netif_carrier_off(dev);
5413 }
5414}
5415EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5416
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues\n", nqueues);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
5437
5438static void netdev_init_one_queue(struct net_device *dev,
5439 struct netdev_queue *queue, void *_unused)
5440{
5441 /* Initialize queue lock */
5442 spin_lock_init(&queue->_xmit_lock);
5443 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5444 queue->xmit_lock_owner = -1;
5445 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5446 queue->dev = dev;
5447#ifdef CONFIG_BQL
5448 dql_init(&queue->dql, HZ);
5449#endif
5450}
5451
5452static int netif_alloc_netdev_queues(struct net_device *dev)
5453{
5454 unsigned int count = dev->num_tx_queues;
5455 struct netdev_queue *tx;
5456
5457 BUG_ON(count < 1);
5458
5459 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5460 if (!tx) {
5461 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5462 return -ENOMEM;
5463 }
5464 dev->_tx = tx;
5465
5466 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5467 spin_lock_init(&dev->tx_global_lock);
5468
5469 return 0;
5470}
5471
5472/**
5473 * register_netdevice - register a network device
5474 * @dev: device to register
5475 *
5476 * Take a completed network device structure and add it to the kernel
5477 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5478 * chain. 0 is returned on success. A negative errno code is returned
5479 * on a failure to set up the device, or if the name is a duplicate.
5480 *
5481 * Callers must hold the rtnl semaphore. You may want
5482 * register_netdev() instead of this.
5483 *
5484 * BUGS:
5485 * The locking appears insufficient to guarantee two parallel registers
5486 * will not get the same name.
5487 */
5488
5489int register_netdevice(struct net_device *dev)
5490{
5491 int ret;
5492 struct net *net = dev_net(dev);
5493
5494 BUG_ON(dev_boot_phase);
5495 ASSERT_RTNL();
5496
5497 might_sleep();
5498
5499 /* When net_device's are persistent, this will be fatal. */
5500 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5501 BUG_ON(!net);
5502
5503 spin_lock_init(&dev->addr_list_lock);
5504 netdev_set_addr_lockdep_class(dev);
5505
5506 dev->iflink = -1;
5507
5508 ret = dev_get_valid_name(dev, dev->name);
5509 if (ret < 0)
5510 goto out;
5511
5512 /* Init, if this function is available */
5513 if (dev->netdev_ops->ndo_init) {
5514 ret = dev->netdev_ops->ndo_init(dev);
5515 if (ret) {
5516 if (ret > 0)
5517 ret = -EIO;
5518 goto out;
5519 }
5520 }
5521
5522 dev->ifindex = dev_new_index(net);
5523 if (dev->iflink == -1)
5524 dev->iflink = dev->ifindex;
5525
5526 /* Transfer changeable features to wanted_features and enable
5527 * software offloads (GSO and GRO).
5528 */
5529 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5530 dev->features |= NETIF_F_SOFT_FEATURES;
5531 dev->wanted_features = dev->features & dev->hw_features;
5532
5533 /* Turn on no cache copy if HW is doing checksum */
5534 if (!(dev->flags & IFF_LOOPBACK)) {
5535 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5536 if (dev->features & NETIF_F_ALL_CSUM) {
5537 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5538 dev->features |= NETIF_F_NOCACHE_COPY;
5539 }
5540 }
5541
5542 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5543 */
5544 dev->vlan_features |= NETIF_F_HIGHDMA;
5545
5546 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5547 ret = notifier_to_errno(ret);
5548 if (ret)
5549 goto err_uninit;
5550
5551 ret = netdev_register_kobject(dev);
5552 if (ret)
5553 goto err_uninit;
5554 dev->reg_state = NETREG_REGISTERED;
5555
5556 __netdev_update_features(dev);
5557
5558 /*
5559 * Default initial state at registry is that the
5560 * device is present.
5561 */
5562
5563 set_bit(__LINK_STATE_PRESENT, &dev->state);
5564
5565 dev_init_scheduler(dev);
5566 dev_hold(dev);
5567 list_netdevice(dev);
5568 add_device_randomness(dev->dev_addr, dev->addr_len);
5569
5570 /* Notify protocols, that a new device appeared. */
5571 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5572 ret = notifier_to_errno(ret);
5573 if (ret) {
5574 rollback_registered(dev);
5575 dev->reg_state = NETREG_UNREGISTERED;
5576 }
5577 /*
5578 * Prevent userspace races by waiting until the network
5579 * device is fully setup before sending notifications.
5580 */
5581 if (!dev->rtnl_link_ops ||
5582 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5583 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5584
5585out:
5586 return ret;
5587
5588err_uninit:
5589 if (dev->netdev_ops->ndo_uninit)
5590 dev->netdev_ops->ndo_uninit(dev);
5591 goto out;
5592}
5593EXPORT_SYMBOL(register_netdevice);
5594
5595/**
5596 * init_dummy_netdev - init a dummy network device for NAPI
5597 * @dev: device to init
5598 *
5599 * This takes a network device structure and initialize the minimum
5600 * amount of fields so it can be used to schedule NAPI polls without
5601 * registering a full blown interface. This is to be used by drivers
5602 * that need to tie several hardware interfaces to a single NAPI
5603 * poll scheduler due to HW limitations.
5604 */
5605int init_dummy_netdev(struct net_device *dev)
5606{
5607 /* Clear everything. Note we don't initialize spinlocks
5608 * are they aren't supposed to be taken by any of the
5609 * NAPI code and this dummy netdev is supposed to be
5610 * only ever used for NAPI polls
5611 */
5612 memset(dev, 0, sizeof(struct net_device));
5613
5614 /* make sure we BUG if trying to hit standard
5615 * register/unregister code path
5616 */
5617 dev->reg_state = NETREG_DUMMY;
5618
5619 /* NAPI wants this */
5620 INIT_LIST_HEAD(&dev->napi_list);
5621
5622 /* a dummy interface is started by default */
5623 set_bit(__LINK_STATE_PRESENT, &dev->state);
5624 set_bit(__LINK_STATE_START, &dev->state);
5625
5626 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5627 * because users of this 'device' dont need to change
5628 * its refcount.
5629 */
5630
5631 return 0;
5632}
5633EXPORT_SYMBOL_GPL(init_dummy_netdev);
5634
5635
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call; the device name is
 *	expanded if a format string was passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5659
5660int netdev_refcnt_read(const struct net_device *dev)
5661{
5662 int i, refcnt = 0;
5663
5664 for_each_possible_cpu(i)
5665 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5666 return refcnt;
5667}
5668EXPORT_SYMBOL(netdev_refcnt_read);
5669
5670/*
5671 * netdev_wait_allrefs - wait until all references are gone.
5672 *
5673 * This is called when unregistering network devices.
5674 *
5675 * Any protocol or device that holds a reference should register
5676 * for netdevice notification, and cleanup and put back the
5677 * reference if they receive an UNREGISTER event.
5678 * We can get stuck here if buggy protocols don't correctly
5679 * call dev_put.
5680 */
5681static void netdev_wait_allrefs(struct net_device *dev)
5682{
5683 unsigned long rebroadcast_time, warning_time;
5684 int refcnt;
5685
5686 linkwatch_forget_dev(dev);
5687
5688 rebroadcast_time = warning_time = jiffies;
5689 refcnt = netdev_refcnt_read(dev);
5690
5691 while (refcnt != 0) {
5692 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5693 rtnl_lock();
5694
5695 /* Rebroadcast unregister notification */
5696 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5697 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5698 * should have already handle it the first time */
5699
5700 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5701 &dev->state)) {
5702 /* We must not have linkwatch events
5703 * pending on unregister. If this
5704 * happens, we simply run the queue
5705 * unscheduled, resulting in a noop
5706 * for this device.
5707 */
5708 linkwatch_run_queue();
5709 }
5710
5711 __rtnl_unlock();
5712
5713 rebroadcast_time = jiffies;
5714 }
5715
5716 msleep(250);
5717
5718 refcnt = netdev_refcnt_read(dev);
5719
5720 if (time_after(jiffies, warning_time + 10 * HZ)) {
5721 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5722 dev->name, refcnt);
5723 warning_time = jiffies;
5724 }
5725 }
5726}
5727
5728/* The sequence is:
5729 *
5730 * rtnl_lock();
5731 * ...
5732 * register_netdevice(x1);
5733 * register_netdevice(x2);
5734 * ...
5735 * unregister_netdevice(y1);
5736 * unregister_netdevice(y2);
5737 * ...
5738 * rtnl_unlock();
5739 * free_netdev(y1);
5740 * free_netdev(y2);
5741 *
5742 * We are invoked by rtnl_unlock().
5743 * This allows us to deal with problems:
5744 * 1) We can delete sysfs objects which invoke hotplug
5745 * without deadlocking with linkwatch via keventd.
5746 * 2) Since we run with the RTNL semaphore not held, we can sleep
5747 * safely in order to wait for the netdev refcnt to drop to zero.
5748 *
5749 * We must not return until all unregister events added during
5750 * the interval the lock was held have been completed.
5751 */
5752void netdev_run_todo(void)
5753{
5754 struct list_head list;
5755
5756 /* Snapshot list, allow later requests */
5757 list_replace_init(&net_todo_list, &list);
5758
5759 __rtnl_unlock();
5760
5761 /* Wait for rcu callbacks to finish before attempting to drain
5762 * the device list. This usually avoids a 250ms wait.
5763 */
5764 if (!list_empty(&list))
5765 rcu_barrier();
5766
5767 while (!list_empty(&list)) {
5768 struct net_device *dev
5769 = list_first_entry(&list, struct net_device, todo_list);
5770 list_del(&dev->todo_list);
5771
5772 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5773 pr_err("network todo '%s' but state %d\n",
5774 dev->name, dev->reg_state);
5775 dump_stack();
5776 continue;
5777 }
5778
5779 dev->reg_state = NETREG_UNREGISTERED;
5780
5781 on_each_cpu(flush_backlog, dev, 1);
5782
5783 netdev_wait_allrefs(dev);
5784
5785 /* paranoia */
5786 BUG_ON(netdev_refcnt_read(dev));
5787 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5788 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5789 WARN_ON(dev->dn_ptr);
5790
5791 if (dev->destructor)
5792 dev->destructor(dev);
5793
5794 /* Free network device */
5795 kobject_put(&dev->dev.kobj);
5796 }
5797}
5798
5799/* Convert net_device_stats to rtnl_link_stats64. They have the same
5800 * fields in the same order, with only the type differing.
5801 */
5802void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5803 const struct net_device_stats *netdev_stats)
5804{
5805#if BITS_PER_LONG == 64
5806 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5807 memcpy(stats64, netdev_stats, sizeof(*stats64));
5808#else
5809 size_t i, n = sizeof(*stats64) / sizeof(u64);
5810 const unsigned long *src = (const unsigned long *)netdev_stats;
5811 u64 *dst = (u64 *)stats64;
5812
5813 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5814 sizeof(*stats64) / sizeof(u64));
5815 for (i = 0; i < n; i++)
5816 dst[i] = src[i];
5817#endif
5818}
5819EXPORT_SYMBOL(netdev_stats_to_stats64);
5820
5821/**
5822 * dev_get_stats - get network device statistics
5823 * @dev: device to get statistics from
5824 * @storage: place to store stats
5825 *
5826 * Get network statistics from device. Return @storage.
5827 * The device driver may provide its own method by setting
5828 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5829 * otherwise the internal statistics structure is used.
5830 */
5831struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5832 struct rtnl_link_stats64 *storage)
5833{
5834 const struct net_device_ops *ops = dev->netdev_ops;
5835
5836 if (ops->ndo_get_stats64) {
5837 memset(storage, 0, sizeof(*storage));
5838 ops->ndo_get_stats64(dev, storage);
5839 } else if (ops->ndo_get_stats) {
5840 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5841 } else {
5842 netdev_stats_to_stats64(storage, &dev->stats);
5843 }
5844 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5845 return storage;
5846}
5847EXPORT_SYMBOL(dev_get_stats);
5848
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialized with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only after the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5866
5867/**
5868 * alloc_netdev_mqs - allocate network device
5869 * @sizeof_priv: size of private data to allocate space for
5870 * @name: device name format string
5871 * @setup: callback to initialize device
5872 * @txqs: the number of TX subqueues to allocate
5873 * @rxqs: the number of RX subqueues to allocate
5874 *
5875 * Allocates a struct net_device with private data area for driver use
5876 * and performs basic initialization. Also allocates subquue structs
5877 * for each queue on the device.
5878 */
5879struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5880 void (*setup)(struct net_device *),
5881 unsigned int txqs, unsigned int rxqs)
5882{
5883 struct net_device *dev;
5884 size_t alloc_size;
5885 struct net_device *p;
5886
5887 BUG_ON(strlen(name) >= sizeof(dev->name));
5888
5889 if (txqs < 1) {
5890 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5891 return NULL;
5892 }
5893
5894#ifdef CONFIG_RPS
5895 if (rxqs < 1) {
5896 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5897 return NULL;
5898 }
5899#endif
5900
5901 alloc_size = sizeof(struct net_device);
5902 if (sizeof_priv) {
5903 /* ensure 32-byte alignment of private area */
5904 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5905 alloc_size += sizeof_priv;
5906 }
5907 /* ensure 32-byte alignment of whole construct */
5908 alloc_size += NETDEV_ALIGN - 1;
5909
5910 p = kzalloc(alloc_size, GFP_KERNEL);
5911 if (!p) {
5912 pr_err("alloc_netdev: Unable to allocate device\n");
5913 return NULL;
5914 }
5915
5916 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5917 dev->padded = (char *)dev - (char *)p;
5918
5919 dev->pcpu_refcnt = alloc_percpu(int);
5920 if (!dev->pcpu_refcnt)
5921 goto free_p;
5922
5923 if (dev_addr_init(dev))
5924 goto free_pcpu;
5925
5926 dev_mc_init(dev);
5927 dev_uc_init(dev);
5928
5929 dev_net_set(dev, &init_net);
5930
5931 dev->gso_max_size = GSO_MAX_SIZE;
5932 dev->gso_max_segs = GSO_MAX_SEGS;
5933
5934 INIT_LIST_HEAD(&dev->napi_list);
5935 INIT_LIST_HEAD(&dev->unreg_list);
5936 INIT_LIST_HEAD(&dev->link_watch_list);
5937 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5938 setup(dev);
5939
5940 dev->num_tx_queues = txqs;
5941 dev->real_num_tx_queues = txqs;
5942 if (netif_alloc_netdev_queues(dev))
5943 goto free_all;
5944
5945#ifdef CONFIG_RPS
5946 dev->num_rx_queues = rxqs;
5947 dev->real_num_rx_queues = rxqs;
5948 if (netif_alloc_rx_queues(dev))
5949 goto free_all;
5950#endif
5951
5952 strcpy(dev->name, name);
5953 dev->group = INIT_NETDEV_GROUP;
5954 return dev;
5955
5956free_all:
5957 free_netdev(dev);
5958 return NULL;
5959
5960free_pcpu:
5961 free_percpu(dev->pcpu_refcnt);
5962 kfree(dev->_tx);
5963#ifdef CONFIG_RPS
5964 kfree(dev->_rx);
5965#endif
5966
5967free_p:
5968 kfree(p);
5969 return NULL;
5970}
5971EXPORT_SYMBOL(alloc_netdev_mqs);
5972
5973/**
5974 * free_netdev - free network device
5975 * @dev: device
5976 *
5977 * This function does the last stage of destroying an allocated device
5978 * interface. The reference to the device object is released.
5979 * If this is the last reference then it will be freed.
5980 */
5981void free_netdev(struct net_device *dev)
5982{
5983 struct napi_struct *p, *n;
5984
5985 release_net(dev_net(dev));
5986
5987 kfree(dev->_tx);
5988#ifdef CONFIG_RPS
5989 kfree(dev->_rx);
5990#endif
5991
5992 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5993
5994 /* Flush device addresses */
5995 dev_addr_flush(dev);
5996
5997 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5998 netif_napi_del(p);
5999
6000 free_percpu(dev->pcpu_refcnt);
6001 dev->pcpu_refcnt = NULL;
6002
6003 /* Compatibility with error handling in drivers */
6004 if (dev->reg_state == NETREG_UNINITIALIZED) {
6005 kfree((char *)dev - dev->padded);
6006 return;
6007 }
6008
6009 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6010 dev->reg_state = NETREG_RELEASED;
6011
6012 /* will free via device release */
6013 put_device(&dev->dev);
6014}
6015EXPORT_SYMBOL(free_netdev);
6016
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used while RTNL is locked,
 *	the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6032
6033/**
6034 * unregister_netdevice_queue - remove device from the kernel
6035 * @dev: device
6036 * @head: list
6037 *
6038 * This function shuts down a device interface and removes it
6039 * from the kernel tables.
6040 * If head not NULL, device is queued to be unregistered later.
6041 *
6042 * Callers must hold the rtnl semaphore. You may want
6043 * unregister_netdev() instead of this.
6044 */
6045
6046void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6047{
6048 ASSERT_RTNL();
6049
6050 if (head) {
6051 list_move_tail(&dev->unreg_list, head);
6052 } else {
6053 rollback_registered(dev);
6054 /* Finish processing unregister after unlock */
6055 net_set_todo(dev);
6056 }
6057}
6058EXPORT_SYMBOL(unregister_netdevice_queue);
6059
6060/**
6061 * unregister_netdevice_many - unregister many devices
6062 * @head: list of devices
6063 */
6064void unregister_netdevice_many(struct list_head *head)
6065{
6066 struct net_device *dev;
6067
6068 if (!list_empty(head)) {
6069 rollback_registered_many(head);
6070 list_for_each_entry(dev, head, unreg_list)
6071 net_set_todo(dev);
6072 }
6073}
6074EXPORT_SYMBOL(unregister_netdevice_many);
6075
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	This simply calls unregister_netdevice() with the rtnl semaphore
 *	held.  In general you want to use this function rather than
 *	unregister_netdevice() itself.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6094
6095/**
6096 * dev_change_net_namespace - move device to different nethost namespace
6097 * @dev: device
6098 * @net: network namespace
6099 * @pat: If not NULL name pattern to try if the current device name
6100 * is already taken in the destination network namespace.
6101 *
6102 * This function shuts down a device interface and moves it
6103 * to a new network namespace. On success 0 is returned, on
6104 * a failure a netagive errno code is returned.
6105 *
6106 * Callers must hold the rtnl semaphore.
6107 */
6108
6109int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6110{
6111 int err;
6112
6113 ASSERT_RTNL();
6114
6115 /* Don't allow namespace local devices to be moved. */
6116 err = -EINVAL;
6117 if (dev->features & NETIF_F_NETNS_LOCAL)
6118 goto out;
6119
6120 /* Ensure the device has been registrered */
6121 err = -EINVAL;
6122 if (dev->reg_state != NETREG_REGISTERED)
6123 goto out;
6124
6125 /* Get out if there is nothing todo */
6126 err = 0;
6127 if (net_eq(dev_net(dev), net))
6128 goto out;
6129
6130 /* Pick the destination device name, and ensure
6131 * we can use it in the destination network namespace.
6132 */
6133 err = -EEXIST;
6134 if (__dev_get_by_name(net, dev->name)) {
6135 /* We get here if we can't use the current device name */
6136 if (!pat)
6137 goto out;
6138 if (dev_get_valid_name(dev, pat) < 0)
6139 goto out;
6140 }
6141
6142 /*
6143 * And now a mini version of register_netdevice unregister_netdevice.
6144 */
6145
6146 /* If device is running close it first. */
6147 dev_close(dev);
6148
6149 /* And unlink it from device chain */
6150 err = -ENODEV;
6151 unlist_netdevice(dev);
6152
6153 synchronize_net();
6154
6155 /* Shutdown queueing discipline. */
6156 dev_shutdown(dev);
6157
6158 /* Notify protocols, that we are about to destroy
6159 this device. They should clean all the things.
6160
6161 Note that dev->reg_state stays at NETREG_REGISTERED.
6162 This is wanted because this way 8021q and macvlan know
6163 the device is just moving and can keep their slaves up.
6164 */
6165 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6166 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6167 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6168
6169 /*
6170 * Flush the unicast and multicast chains
6171 */
6172 dev_uc_flush(dev);
6173 dev_mc_flush(dev);
6174
6175 /* Actually switch the network namespace */
6176 dev_net_set(dev, net);
6177
6178 /* If there is an ifindex conflict assign a new one */
6179 if (__dev_get_by_index(net, dev->ifindex)) {
6180 int iflink = (dev->iflink == dev->ifindex);
6181 dev->ifindex = dev_new_index(net);
6182 if (iflink)
6183 dev->iflink = dev->ifindex;
6184 }
6185
6186 /* Fixup kobjects */
6187 err = device_rename(&dev->dev, dev->name);
6188 WARN_ON(err);
6189
6190 /* Add the device back in the hashes */
6191 list_netdevice(dev);
6192
6193 /* Notify protocols, that a new device appeared. */
6194 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6195
6196 /*
6197 * Prevent userspace races by waiting until the network
6198 * device is fully setup before sending notifications.
6199 */
6200 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6201
6202 synchronize_net();
6203 err = 0;
6204out:
6205 return err;
6206}
6207EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6208
6209static int dev_cpu_callback(struct notifier_block *nfb,
6210 unsigned long action,
6211 void *ocpu)
6212{
6213 struct sk_buff **list_skb;
6214 struct sk_buff *skb;
6215 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6216 struct softnet_data *sd, *oldsd;
6217
6218 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6219 return NOTIFY_OK;
6220
6221 local_irq_disable();
6222 cpu = smp_processor_id();
6223 sd = &per_cpu(softnet_data, cpu);
6224 oldsd = &per_cpu(softnet_data, oldcpu);
6225
6226 /* Find end of our completion_queue. */
6227 list_skb = &sd->completion_queue;
6228 while (*list_skb)
6229 list_skb = &(*list_skb)->next;
6230 /* Append completion queue from offline CPU. */
6231 *list_skb = oldsd->completion_queue;
6232 oldsd->completion_queue = NULL;
6233
6234 /* Append output queue from offline CPU. */
6235 if (oldsd->output_queue) {
6236 *sd->output_queue_tailp = oldsd->output_queue;
6237 sd->output_queue_tailp = oldsd->output_queue_tailp;
6238 oldsd->output_queue = NULL;
6239 oldsd->output_queue_tailp = &oldsd->output_queue;
6240 }
6241 /* Append NAPI poll list from offline CPU. */
6242 if (!list_empty(&oldsd->poll_list)) {
6243 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6244 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6245 }
6246
6247 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6248 local_irq_enable();
6249
6250 /* Process offline CPU's input_pkt_queue */
6251 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6252 netif_rx(skb);
6253 input_queue_head_incr(oldsd);
6254 }
6255 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6256 netif_rx(skb);
6257 input_queue_head_incr(oldsd);
6258 }
6259
6260 return NOTIFY_OK;
6261}
6262
6263
6264/**
6265 * netdev_increment_features - increment feature set by one
6266 * @all: current feature set
6267 * @one: new feature set
6268 * @mask: mask feature set
6269 *
6270 * Computes a new feature set after adding a device with feature set
6271 * @one to the master device with current feature set @all. Will not
6272 * enable anything that is off in @mask. Returns the new feature set.
6273 */
6274netdev_features_t netdev_increment_features(netdev_features_t all,
6275 netdev_features_t one, netdev_features_t mask)
6276{
6277 if (mask & NETIF_F_GEN_CSUM)
6278 mask |= NETIF_F_ALL_CSUM;
6279 mask |= NETIF_F_VLAN_CHALLENGED;
6280
6281 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6282 all &= one | ~NETIF_F_ALL_FOR_ALL;
6283
6284 /* If one device supports hw checksumming, set for all. */
6285 if (all & NETIF_F_GEN_CSUM)
6286 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6287
6288 return all;
6289}
6290EXPORT_SYMBOL(netdev_increment_features);
6291
6292static struct hlist_head *netdev_create_hash(void)
6293{
6294 int i;
6295 struct hlist_head *hash;
6296
6297 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6298 if (hash != NULL)
6299 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6300 INIT_HLIST_HEAD(&hash[i]);
6301
6302 return hash;
6303}
6304
6305/* Initialize per network namespace state */
6306static int __net_init netdev_init(struct net *net)
6307{
6308 if (net != &init_net)
6309 INIT_LIST_HEAD(&net->dev_base_head);
6310
6311 net->dev_name_head = netdev_create_hash();
6312 if (net->dev_name_head == NULL)
6313 goto err_name;
6314
6315 net->dev_index_head = netdev_create_hash();
6316 if (net->dev_index_head == NULL)
6317 goto err_idx;
6318
6319 return 0;
6320
6321err_idx:
6322 kfree(net->dev_name_head);
6323err_name:
6324 return -ENOMEM;
6325}
6326
6327/**
6328 * netdev_drivername - network driver for the device
6329 * @dev: network device
6330 *
6331 * Determine network driver for device.
6332 */
6333const char *netdev_drivername(const struct net_device *dev)
6334{
6335 const struct device_driver *driver;
6336 const struct device *parent;
6337 const char *empty = "";
6338
6339 parent = dev->dev.parent;
6340 if (!parent)
6341 return empty;
6342
6343 driver = parent->driver;
6344 if (driver && driver->name)
6345 return driver->name;
6346 return empty;
6347}
6348
6349int __netdev_printk(const char *level, const struct net_device *dev,
6350 struct va_format *vaf)
6351{
6352 int r;
6353
6354 if (dev && dev->dev.parent)
6355 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6356 netdev_name(dev), vaf);
6357 else if (dev)
6358 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6359 else
6360 r = printk("%s(NULL net_device): %pV", level, vaf);
6361
6362 return r;
6363}
6364EXPORT_SYMBOL(__netdev_printk);
6365
6366int netdev_printk(const char *level, const struct net_device *dev,
6367 const char *format, ...)
6368{
6369 struct va_format vaf;
6370 va_list args;
6371 int r;
6372
6373 va_start(args, format);
6374
6375 vaf.fmt = format;
6376 vaf.va = &args;
6377
6378 r = __netdev_printk(level, dev, &vaf);
6379 va_end(args);
6380
6381 return r;
6382}
6383EXPORT_SYMBOL(netdev_printk);
6384
6385#define define_netdev_printk_level(func, level) \
6386int func(const struct net_device *dev, const char *fmt, ...) \
6387{ \
6388 int r; \
6389 struct va_format vaf; \
6390 va_list args; \
6391 \
6392 va_start(args, fmt); \
6393 \
6394 vaf.fmt = fmt; \
6395 vaf.va = &args; \
6396 \
6397 r = __netdev_printk(level, dev, &vaf); \
6398 va_end(args); \
6399 \
6400 return r; \
6401} \
6402EXPORT_SYMBOL(func);
6403
6404define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6405define_netdev_printk_level(netdev_alert, KERN_ALERT);
6406define_netdev_printk_level(netdev_crit, KERN_CRIT);
6407define_netdev_printk_level(netdev_err, KERN_ERR);
6408define_netdev_printk_level(netdev_warn, KERN_WARNING);
6409define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6410define_netdev_printk_level(netdev_info, KERN_INFO);
6411
6412static void __net_exit netdev_exit(struct net *net)
6413{
6414 kfree(net->dev_name_head);
6415 kfree(net->dev_index_head);
6416}
6417
6418static struct pernet_operations __net_initdata netdev_net_ops = {
6419 .init = netdev_init,
6420 .exit = netdev_exit,
6421};
6422
6423static void __net_exit default_device_exit(struct net *net)
6424{
6425 struct net_device *dev, *aux;
6426 /*
6427 * Push all migratable network devices back to the
6428 * initial network namespace
6429 */
6430 rtnl_lock();
6431 for_each_netdev_safe(net, dev, aux) {
6432 int err;
6433 char fb_name[IFNAMSIZ];
6434
6435 /* Ignore unmoveable devices (i.e. loopback) */
6436 if (dev->features & NETIF_F_NETNS_LOCAL)
6437 continue;
6438
6439 /* Leave virtual devices for the generic cleanup */
6440 if (dev->rtnl_link_ops)
6441 continue;
6442
6443 /* Push remaining network devices to init_net */
6444 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6445 err = dev_change_net_namespace(dev, &init_net, fb_name);
6446 if (err) {
6447 pr_emerg("%s: failed to move %s to init_net: %d\n",
6448 __func__, dev->name, err);
6449 BUG();
6450 }
6451 }
6452 rtnl_unlock();
6453}
6454
6455static void __net_exit default_device_exit_batch(struct list_head *net_list)
6456{
6457 /* At exit all network devices most be removed from a network
6458 * namespace. Do this in the reverse order of registration.
6459 * Do this across as many network namespaces as possible to
6460 * improve batching efficiency.
6461 */
6462 struct net_device *dev;
6463 struct net *net;
6464 LIST_HEAD(dev_kill_list);
6465
6466 rtnl_lock();
6467 list_for_each_entry(net, net_list, exit_list) {
6468 for_each_netdev_reverse(net, dev) {
6469 if (dev->rtnl_link_ops)
6470 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6471 else
6472 unregister_netdevice_queue(dev, &dev_kill_list);
6473 }
6474 }
6475 unregister_netdevice_many(&dev_kill_list);
6476 list_del(&dev_kill_list);
6477 rtnl_unlock();
6478}
6479
6480static struct pernet_operations __net_initdata default_device_ops = {
6481 .exit = default_device_exit,
6482 .exit_batch = default_device_exit_batch,
6483};
6484
6485/*
6486 * Initialize the DEV module. At boot time this walks the device list and
6487 * unhooks any devices that fail to initialise (normally hardware not
6488 * present) and leaves us with a valid list of present and active devices.
6489 *
6490 */
6491
6492/*
6493 * This is called single threaded during boot, so no need
6494 * to take the rtnl semaphore.
6495 */
6496static int __init net_dev_init(void)
6497{
6498 int i, rc = -ENOMEM;
6499
6500 BUG_ON(!dev_boot_phase);
6501
6502 if (dev_proc_init())
6503 goto out;
6504
6505 if (netdev_kobject_init())
6506 goto out;
6507
6508 INIT_LIST_HEAD(&ptype_all);
6509 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6510 INIT_LIST_HEAD(&ptype_base[i]);
6511
6512 if (register_pernet_subsys(&netdev_net_ops))
6513 goto out;
6514
6515 /*
6516 * Initialise the packet receive queues.
6517 */
6518
6519 for_each_possible_cpu(i) {
6520 struct softnet_data *sd = &per_cpu(softnet_data, i);
6521
6522 memset(sd, 0, sizeof(*sd));
6523 skb_queue_head_init(&sd->input_pkt_queue);
6524 skb_queue_head_init(&sd->process_queue);
6525 sd->completion_queue = NULL;
6526 INIT_LIST_HEAD(&sd->poll_list);
6527 sd->output_queue = NULL;
6528 sd->output_queue_tailp = &sd->output_queue;
6529#ifdef CONFIG_RPS
6530 sd->csd.func = rps_trigger_softirq;
6531 sd->csd.info = sd;
6532 sd->csd.flags = 0;
6533 sd->cpu = i;
6534#endif
6535
6536 sd->backlog.poll = process_backlog;
6537 sd->backlog.weight = weight_p;
6538 sd->backlog.gro_list = NULL;
6539 sd->backlog.gro_count = 0;
6540 }
6541
6542 dev_boot_phase = 0;
6543
6544 /* The loopback device is special if any other network devices
6545 * is present in a network namespace the loopback device must
6546 * be present. Since we now dynamically allocate and free the
6547 * loopback device ensure this invariant is maintained by
6548 * keeping the loopback device as the first device on the
6549 * list of network devices. Ensuring the loopback devices
6550 * is the first device that appears and the last network device
6551 * that disappears.
6552 */
6553 if (register_pernet_device(&loopback_net_ops))
6554 goto out;
6555
6556 if (register_pernet_device(&default_device_ops))
6557 goto out;
6558
6559 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6560 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6561
6562 hotcpu_notifier(dev_cpu_callback, 0);
6563 dst_init();
6564 dev_mcast_init();
6565 rc = 0;
6566out:
6567 return rc;
6568}
6569
6570subsys_initcall(net_dev_init);
6571
6572static int __init initialize_hashrnd(void)
6573{
6574 get_random_bytes(&hashrnd, sizeof(hashrnd));
6575 return 0;
6576}
6577
6578late_initcall_sync(initialize_hashrnd);
6579